{"id":40031,"date":"2025-04-10T14:03:02","date_gmt":"2025-04-10T06:03:02","guid":{"rendered":"https:\/\/17aitech.com\/?p=40031"},"modified":"2025-04-10T16:06:21","modified_gmt":"2025-04-10T08:06:21","slug":"%e3%80%90%e6%a8%a1%e5%9e%8b%e6%b5%8b%e8%af%95%e3%80%91%e5%9f%ba%e4%ba%8eopencompass%e5%ae%9e%e7%8e%b0agent%e6%9c%80%e4%b8%ba%e8%8b%9b%e5%88%bb%e7%9a%84%e8%af%84%e6%b5%8b%e9%9b%86%ef%bc%9agaia","status":"publish","type":"post","link":"https:\/\/17aitech.com\/?p=40031","title":{"rendered":"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_78 ez-toc-wrap-left-text counter-hierarchy ez-toc-counter ez-toc-light-blue ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">\u6587\u7ae0\u76ee\u5f55<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 
0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E8%83%8C%E6%99%AF\" >\u80cc\u666f<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E7%9B%AE%E6%A0%87\" >\u76ee\u6807<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E5%88%86%E6%9E%90\" >\u5206\u6790<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/17aitech.com\/?p=40031\/#1_%E4%BA%86%E8%A7%A3GAIA%E5%9F%BA%E5%87%86%E6%B5%8B%E8%AF%95\" >1. \u4e86\u89e3GAIA\u57fa\u51c6\u6d4b\u8bd5<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/17aitech.com\/?p=40031\/#2_%E4%B8%8B%E8%BD%BDGAIA%E6%95%B0%E6%8D%AE%E9%9B%86\" >2. 
\u4e0b\u8f7dGAIA\u6570\u636e\u96c6<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/17aitech.com\/?p=40031\/#21_%E9%85%8D%E7%BD%AEHuggingFace%E9%95%9C%E5%83%8F\" >2.1 \u914d\u7f6eHuggingFace\u955c\u50cf<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/17aitech.com\/?p=40031\/#22_%E8%8E%B7%E5%8F%96HuggingFace%E7%9A%84Token\" >2.2 \u83b7\u53d6HuggingFace\u7684Token<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/17aitech.com\/?p=40031\/#23_%E9%80%9A%E8%BF%87huggingface-cli%E7%99%BB%E5%BD%95\" >2.3 \u901a\u8fc7huggingface-cli\u767b\u5f55<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-9\" href=\"https:\/\/17aitech.com\/?p=40031\/#24_%E4%B8%8B%E8%BD%BDGAIA%E6%95%B0%E6%8D%AE%E9%9B%86\" >2.4 \u4e0b\u8f7dGAIA\u6570\u636e\u96c6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/17aitech.com\/?p=40031\/#25_%E6%9F%A5%E7%9C%8B%E6%95%B0%E6%8D%AE%E9%9B%86%E5%86%85%E5%AE%B9\" >2.5 \u67e5\u770b\u6570\u636e\u96c6\u5185\u5bb9<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/17aitech.com\/?p=40031\/#3_%E5%88%86%E6%9E%90GAIA%E6%95%B0%E6%8D%AE%E9%9B%86\" >3. 
\u5206\u6790GAIA\u6570\u636e\u96c6<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-12\" href=\"https:\/\/17aitech.com\/?p=40031\/#31_%E6%A0%B7%E4%BE%8B1\" >3.1 \u6837\u4f8b1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-13\" href=\"https:\/\/17aitech.com\/?p=40031\/#32_%E6%A0%B7%E4%BE%8B2\" >3.2 \u6837\u4f8b2<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-14\" href=\"https:\/\/17aitech.com\/?p=40031\/#33_%E6%A0%B7%E4%BE%8B3\" >3.3 \u6837\u4f8b3<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-15\" href=\"https:\/\/17aitech.com\/?p=40031\/#34_%E6%A0%B7%E4%BE%8B4\" >3.4 \u6837\u4f8b4<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-16\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E5%AE%9E%E6%96%BD\" >\u5b9e\u65bd<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-17\" href=\"https:\/\/17aitech.com\/?p=40031\/#4_%E6%B7%BB%E5%8A%A0GAIA%E6%95%B0%E6%8D%AE%E9%9B%86\" >4. 
\u6dfb\u52a0GAIA\u6570\u636e\u96c6<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-18\" href=\"https:\/\/17aitech.com\/?p=40031\/#41_opencompassdatasets_%E5%A2%9E%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AE%9A%E4%B9%89\" >4.1 opencompass\/datasets \u589e\u52a0\u6570\u636e\u96c6\u5b9a\u4e49<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-19\" href=\"https:\/\/17aitech.com\/?p=40031\/#42_opencompassconfigs_%E5%A2%9E%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E9%85%8D%E7%BD%AE\" >4.2 opencompass\/configs \u589e\u52a0\u6570\u636e\u96c6\u914d\u7f6e<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-20\" href=\"https:\/\/17aitech.com\/?p=40031\/#43_opencompassutilsdatasets_infopy_%E6%B7%BB%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E6%98%A0%E5%B0%84\" >4.3 opencompass\/utils\/datasets_info.py \u6dfb\u52a0\u6570\u636e\u96c6\u6620\u5c04<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-21\" href=\"https:\/\/17aitech.com\/?p=40031\/#44_dataset-indexyml_%E6%B3%A8%E5%86%8C%E6%95%B0%E6%8D%AE%E9%9B%86\" >4.4 dataset-index.yml \u6ce8\u518c\u6570\u636e\u96c6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-22\" href=\"https:\/\/17aitech.com\/?p=40031\/#45_init_py_%E6%B7%BB%E5%8A%A0%E5%88%9D%E5%A7%8B%E5%8C%96%E4%BF%A1%E6%81%AF\" >4.5 __init__.py \u6dfb\u52a0\u521d\u59cb\u5316\u4fe1\u606f<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-23\" href=\"https:\/\/17aitech.com\/?p=40031\/#5_%E8%B0%83%E8%AF%95%E8%BF%90%E8%A1%8C_gaia_%E6%95%B0%E6%8D%AE%E9%9B%86\" >5. 
\u8c03\u8bd5\u8fd0\u884c gaia \u6570\u636e\u96c6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-24\" href=\"https:\/\/17aitech.com\/?p=40031\/#6_%E9%9B%86%E6%88%90%E8%87%B3ai-eval-system%E4%B8%AD%E8%BF%9B%E8%A1%8CDify%E5%B9%B3%E5%8F%B0%E5%BA%94%E7%94%A8%E8%AF%84%E6%B5%8B\" >6. \u96c6\u6210\u81f3ai-eval-system\u4e2d\u8fdb\u884cDify\u5e73\u53f0\u5e94\u7528\u8bc4\u6d4b<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-25\" href=\"https:\/\/17aitech.com\/?p=40031\/#61_%E9%85%8D%E7%BD%AE%E6%95%B0%E6%8D%AE%E9%9B%86\" >6.1 \u914d\u7f6e\u6570\u636e\u96c6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-26\" href=\"https:\/\/17aitech.com\/?p=40031\/#62_%E6%95%B0%E6%8D%AE%E5%BA%93%E6%B7%BB%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86\" >6.2 \u6570\u636e\u5e93\u6dfb\u52a0\u6570\u636e\u96c6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-27\" href=\"https:\/\/17aitech.com\/?p=40031\/#63_%E5%BC%80%E5%8F%91%E8%80%85%E6%96%B9%E5%BC%8F%E5%90%AF%E5%8A%A8ai-eval-system\" >6.3 \u5f00\u53d1\u8005\u65b9\u5f0f\u542f\u52a8ai-eval-system<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-28\" href=\"https:\/\/17aitech.com\/?p=40031\/#64_Dify%E5%B9%B3%E5%8F%B0%E5%88%9B%E5%BB%BAAgent%E5%BA%94%E7%94%A8\" >6.4 Dify\u5e73\u53f0\u521b\u5efaAgent\u5e94\u7528<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-29\" href=\"https:\/\/17aitech.com\/?p=40031\/#64_ai-eval-system%E5%B9%B3%E5%8F%B0%E5%88%9B%E5%BB%BA%E8%AF%84%E6%B5%8B%E4%BB%BB%E5%8A%A1\" >6.4 ai-eval-system\u5e73\u53f0\u521b\u5efa\u8bc4\u6d4b\u4efb\u52a1<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-30\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E6%80%BB%E7%BB%93\" 
>\u603b\u7ed3<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-31\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\" >\u53c2\u8003\u8d44\u6599<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-32\" href=\"https:\/\/17aitech.com\/?p=40031\/#%E9%99%84%E5%BD%95\" >\u9644\u5f55<\/a><\/li><\/ul><\/nav><\/div>\n<h2><span class=\"ez-toc-section\" id=\"%E8%83%8C%E6%99%AF\"><\/span>\u80cc\u666f<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u4e3a\u4e86\u80fd\u591f\u66f4\u597d\u5730\u8bc4\u4f30Agent\u7684\u80fd\u529b\uff0c\u6211\u4eec\u9700\u8981\u5728OpenCompass\u7684\u8bc4\u6d4b\u6846\u67b6\u57fa\u7840\u4e0a\uff0c\u5c1d\u8bd5\u5f15\u5165\u5f53\u524d\u6700\u4e3a\u4e25\u82db\u7684Agent\u80fd\u529b\u8bc4\u4f30\u57fa\u51c6\uff1aGAIA\uff08General AI Assistant Benchmark\uff09\uff0c\u672c\u7ae0\u662f\u5bf9GAIA\u57fa\u51c6\u6d4b\u8bd5\u7684\u8c03\u7814\u603b\u7ed3\u6587\u6863\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E7%9B%AE%E6%A0%87\"><\/span>\u76ee\u6807<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li>\u76ee\u68071\uff1a\u8c03\u7814GAIA\u57fa\u51c6\u6d4b\u8bd5\uff0c\u4e86\u89e3\u5176\u6570\u636e\u5185\u5bb9\u57fa\u672c\u6784\u6210\u3002<\/li>\n<li>\u76ee\u68072\uff1a\u8fd0\u884cGAIA\u57fa\u51c6\u6d4b\u8bd5\uff0c\u4e86\u89e3\u5176\u8fd0\u884c\u65b9\u5f0f\u3002<\/li>\n<li>\u76ee\u68073\uff1a\u5728OpenCompass\u6846\u67b6\u4e0b\uff0c\u5c1d\u8bd5\u5f15\u5165GAIA\u57fa\u51c6\u6d4b\u8bd5\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%88%86%E6%9E%90\"><\/span>\u5206\u6790<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<h3><span class=\"ez-toc-section\" id=\"1_%E4%BA%86%E8%A7%A3GAIA%E5%9F%BA%E5%87%86%E6%B5%8B%E8%AF%95\"><\/span>1. 
\u4e86\u89e3GAIA\u57fa\u51c6\u6d4b\u8bd5<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><code>GAIA\uff08A Benchmark for General AI Assistants\uff09<\/code> \u662f\u7531Meta\u3001HuggingFace\u7b49\u56e2\u961f\u63d0\u51fa\u7684\u901a\u7528AI\u52a9\u624b\u8bc4\u4f30\u57fa\u51c6\uff0c\u65e8\u5728\u6d4b\u8bd5AI\u7cfb\u7edf\u5728\u73b0\u5b9e\u4efb\u52a1\u4e2d\u7684\u63a8\u7406\u3001\u591a\u6a21\u6001\u5904\u7406\u3001\u5de5\u5177\u4f7f\u7528\u7b49\u57fa\u7840\u80fd\u529b\u3002GAIA\u6d4b\u8bd5\u91cd\u70b9\u8003\u5bdf\u6a21\u578b\u7684\u7f51\u7edc\u6d4f\u89c8\u3001\u591a\u6a21\u6001\u5904\u7406\u3001\u4ee3\u7801\u6267\u884c\u548c\u6587\u4ef6\u63a8\u7406\u80fd\u529b\uff0c\u5e76\u8bbe\u7f6e\u4e09\u4e2a\u96be\u5ea6\u7ea7\u522b\uff08\u57fa\u7840\u3001\u8fdb\u9636\u3001\u4e13\u5bb6\u7ea7\uff09\u3002\u4f8b\u5982\uff0c\u4efb\u52a1\u53ef\u80fd\u6d89\u53ca\u4ece\u52a8\u6001\u7f51\u9875\u4e2d\u63d0\u53d6\u6570\u636e\u3001\u89e3\u6790PDF\u56fe\u8868\uff0c\u6216\u7ed3\u5408\u56fe\u50cf\u4e0e\u6587\u672c\u8fdb\u884c\u7efc\u5408\u5206\u6790\u3002<\/p>\n<p><strong>\u8bba\u6587\u5730\u5740<\/strong>\uff1a<a href=\"https:\/\/arxiv.org\/pdf\/2311.12983\">https:\/\/arxiv.org\/pdf\/2311.12983<\/a><\/p>\n<p><strong>huggingface\u6392\u884c\u699c<\/strong>\uff1a<a href=\"https:\/\/huggingface.co\/spaces\/gaia-benchmark\/leaderboard\">https:\/\/huggingface.co\/spaces\/gaia-benchmark\/leaderboard<\/a><\/p>\n<p><strong>\u95ee\u9898\u89c4\u6a21<\/strong>\uff1a\u5171\u5305\u542b<code>466<\/code>\u4e2a\u95ee\u9898\uff0c\u5176\u4e2d<code>166<\/code>\u4e2a\u516c\u5f00\u5f00\u53d1\u96c6\u95ee\u9898\u548c\u7b54\u6848\uff0c<code>300<\/code>\u4e2a\u6d4b\u8bd5\u96c6\u95ee\u9898\u4fdd\u7559\u7b54\u6848\u7528\u4e8e\u6392\u884c\u699c\u7ade\u4e89\u3002<br 
\/>\n<strong>\u95ee\u9898\u7c7b\u578b<\/strong>\uff1a\u591a\u6570\u95ee\u9898\u4e3a\u6587\u672c\u5f62\u5f0f\uff0c\u90e8\u5206\u9644\u5e26\u56fe\u50cf\u3001\u7535\u5b50\u8868\u683c\u7b49\u591a\u6a21\u6001\u6587\u4ef6\uff08\u5982\u89e3\u6790\u8868\u683c\u6570\u636e\u6216\u8bc6\u522b\u56fe\u50cf\u4fe1\u606f\uff09\u3002<br \/>\n<strong>\u4efb\u52a1\u573a\u666f<\/strong>\uff1a\u6db5\u76d6\u65e5\u5e38\u4e2a\u4eba\u4efb\u52a1\uff08\u5982\u67e5\u627e\u7f51\u9875\u6ce8\u518c\u4fe1\u606f\uff09\u3001\u79d1\u5b66\u95ee\u9898\uff08\u5982\u6570\u636e\u5206\u6790\uff09\u53ca\u901a\u7528\u77e5\u8bc6\u67e5\u8be2\u3002<br \/>\n<strong>\u7b54\u6848\u683c\u5f0f<\/strong>\uff1a\u6bcf\u4e2a\u95ee\u9898\u5bf9\u5e94\u552f\u4e00\u3001\u7b80\u77ed\u7684\u4e8b\u5b9e\u6027\u7b54\u6848\uff08\u5982\u5b57\u7b26\u4e32\u3001\u6570\u5b57\u6216\u5217\u8868\uff09\uff0c\u4fbf\u4e8e\u81ea\u52a8\u5316\u8bc4\u4f30\u3002<\/p>\n<p><strong>\u96be\u5ea6\u5206\u7ea7<\/strong>\uff1a<\/p>\n<ul>\n<li>Level 1\uff1a\u7b80\u5355\u4efb\u52a1\uff0c\u901a\u5e38\u65e0\u9700\u5de5\u5177\u6216\u4ec5\u97001\u4e2a\u5de5\u5177\uff0c\u6b65\u9aa4\u4e0d\u8d85\u8fc75\u6b65\uff08\u4f8b\u5982\u67e5\u627e\u7f51\u9875\u4e2d\u7684\u7279\u5b9a\u4fe1\u606f\uff09\u3002<\/li>\n<li>Level 2\uff1a\u4e2d\u7b49\u4efb\u52a1\uff0c\u97005-10\u6b65\u64cd\u4f5c\uff0c\u7ed3\u5408\u591a\u79cd\u5de5\u5177\uff08\u5982\u7f51\u7edc\u641c\u7d22+\u8868\u683c\u89e3\u6790\uff09\u3002<\/li>\n<li>Level 3\uff1a\u590d\u6742\u4efb\u52a1\uff0c\u8981\u6c42\u8fd1\u4e4e\u5b8c\u7f8e\u7684\u901a\u7528\u52a9\u624b\u80fd\u529b\uff0c\u9700\u4efb\u610f\u957f\u64cd\u4f5c\u5e8f\u5217\u548c\u591a\u5de5\u5177\u534f\u540c\uff08\u5982\u8de8\u6a21\u6001\u4fe1\u606f\u6574\u5408\u4e0e\u63a8\u7406\uff09<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"2_%E4%B8%8B%E8%BD%BDGAIA%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>2. 
\u4e0b\u8f7dGAIA\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u5728Jupyter Notebook \u4e2d\uff0c\u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u4e0b\u8f7d\u5e76\u83b7\u53d6GAIA\u6570\u636e\u96c6\u3002<\/p>\n<h4><span class=\"ez-toc-section\" id=\"21_%E9%85%8D%E7%BD%AEHuggingFace%E9%95%9C%E5%83%8F\"><\/span>2.1 \u914d\u7f6eHuggingFace\u955c\u50cf<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<pre><code class=\"language-python\">import os\n# \u8bbe\u7f6e\u73af\u5883\u53d8\u91cf\uff08\u4ec5\u5728\u5f53\u524d\u4f1a\u8bdd\u6709\u6548\uff09\nos.environ[&quot;HF_ENDPOINT&quot;] = &quot;https:\/\/hf-mirror.com&quot;  <\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"22_%E8%8E%B7%E5%8F%96HuggingFace%E7%9A%84Token\"><\/span>2.2 \u83b7\u53d6HuggingFace\u7684Token<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ol>\n<li>\u8bbf\u95ee HuggingFace \u5b98\u7f51\uff0c\u6ce8\u518c\u8d26\u53f7\u5e76\u83b7\u53d6 Token\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u83b7\u53d6Token.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u83b7\u53d6Token.png\" alt=\"\" \/><\/a><\/li>\n<\/ol>\n<h4><span class=\"ez-toc-section\" id=\"23_%E9%80%9A%E8%BF%87huggingface-cli%E7%99%BB%E5%BD%95\"><\/span>2.3 \u901a\u8fc7huggingface-cli\u767b\u5f55<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728jupyter notebook\u4e2d\u6267\u884c\u4ee5\u4e0b\u547d\u4ee4 <\/p>\n<pre><code class=\"language-python\">!huggingface-cli login --token hf_HqxmRaSxadGZynzH*****<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ul>\n<li><code>hf_HqxmRaSxadGZynzH*****<\/code> \u662f\u4e0a\u8ff0\u7b2c2\u6b65\u9aa4\u83b7\u53d6\u7684HuggingFace\u7684Token\u3002<\/li>\n<\/ul>\n<h4><span class=\"ez-toc-section\" id=\"24_%E4%B8%8B%E8%BD%BDGAIA%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>2.4 \u4e0b\u8f7dGAIA\u6570\u636e\u96c6<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<pre><code class=\"language-python\">from datasets import load_dataset\n\nds = load_dataset(&quot;gaia-benchmark\/GAIA&quot;, &#039;2023_all&#039;, cache_dir=&quot;cache&quot;)<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ul>\n<li>GAIA\u6709\u4e09\u79cd\u7ea7\u522b\u6570\u636e\u96c6\uff0c\u5206\u522b\u4e3a <code>2023_level1<\/code>, <code>2023_level2<\/code>, <code>2023_level3<\/code>\u3002<\/li>\n<li>\u5982\u679c\u9009\u62e9 <code>2023_all<\/code>\uff0c\u5219\u9ed8\u8ba4\u52a0\u8f7d\u6240\u6709\u7ea7\u522b\u7684\u6570\u636e\u96c6\u3002<\/li>\n<\/ul>\n<h4><span class=\"ez-toc-section\" id=\"25_%E6%9F%A5%E7%9C%8B%E6%95%B0%E6%8D%AE%E9%9B%86%E5%86%85%E5%AE%B9\"><\/span>2.5 \u67e5\u770b\u6570\u636e\u96c6\u5185\u5bb9<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<pre><code class=\"language-python\"># \u67e5\u770b\u8bad\u7ec3\u96c6\u6837\u672c\u6570\nprint(&quot;Train samples:&quot;, len(ds[&#039;test&#039;]))\nprint(&quot;Validation samples:&quot;, len(ds[&#039;validation&#039;]))<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<\/p>\n<pre><code>Train samples: 301\nValidation samples: 165<\/code><\/pre>\n<p>\u901a\u8fc7\u4ee5\u4e0b\u4ee3\u7801\u8fdb\u4e00\u6b65\u67e5\u770b\u6570\u636e\u96c6\u7684\u5185\u5bb9<\/p>\n<pre><code class=\"language-python\">from pprint import pprint\n\n# \u67e5\u770b\u6570\u636e\u96c6\u7684\u7279\u5f81\npprint(ds[&#039;validation&#039;].features)\n\n# \u67e5\u770b\u7b2c\u4e00\u6761\u6570\u636e\u7684\u5143\u6570\u636e\nsample = ds[&#039;validation&#039;][0]\n\npprint(sample)<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u6570\u636e\u96c6\u5185\u5bb9.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u6570\u636e\u96c6\u5185\u5bb9.png\" alt=\"\" 
\/><\/a><\/p>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ul>\n<li><code>GAIA\u6570\u636e\u96c6<\/code>\u4e2d\u4e3b\u8981\u7684\u7ec4\u6210\u90e8\u5206\u5373\u4e3a\uff1a<code>Question<\/code>\u3001<code>Final Answer<\/code>\u3002<\/li>\n<li><code>Question<\/code>\u7684\u95ee\u9898\u4e00\u822c\u662f\u9700\u8981\u4f7f\u7528\u4e00\u5b9a\u5de5\u5177\u624d\u80fd\u83b7\u53d6\u5230\u7b54\u6848\u7684\u95ee\u9898\u3002<\/li>\n<li><code>Final Answer<\/code>\u662fQuestion\u5bf9\u5e94\u7684\u7b54\u6848\uff0c\u662f\u786e\u5b9a\u6027\u7684\u7b54\u6848\u3002<\/li>\n<li><code>GAIA\u6570\u636e\u96c6<\/code>\u4e3a\u4e86\u907f\u514d<code>\u6570\u636e\u6c61\u67d3<\/code>(\u5c06\u6d4b\u8bd5\u6570\u636e\u96c6\u62ff\u6765\u8fdb\u884c\u8bad\u7ec3\uff0c\u4ece\u800c\u63d0\u9ad8\u699c\u5355\u6392\u540d)\uff0c\u5176\u6570\u636e\u96c6\u4e2d\u53ea\u6709<code>Validation<\/code>\u6709\u7b54\u6848\uff0c\u800c<code>Test<\/code>\u6570\u636e\u96c6\u7684\u7b54\u6848\u4e3a\u7a7a\u3002<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"3_%E5%88%86%E6%9E%90GAIA%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>3. \u5206\u6790GAIA\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u4e3a\u4e86\u66f4\u52a0\u6df1\u5165\u7406\u89e3GAIA\u6570\u636e\u96c6\uff0c\u6211\u4eec\u6311\u9009\u90e8\u5206\u6570\u636e\u96c6\u66f4\u52a0\u76f4\u89c2\u5730\u4e86\u89e3\u5176\u5185\u5bb9\u3002<\/p>\n<h4><span class=\"ez-toc-section\" id=\"31_%E6%A0%B7%E4%BE%8B1\"><\/span>3.1 \u6837\u4f8b1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>task_id<\/strong>\uff1a<code>c61d22de-5f6c-4958-a7f6-5e9707bd3466<\/code><br \/>\n<strong>Question<\/strong>:<\/p>\n<pre><code class=\"language-bash\">A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. 
Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?\n\n\u7ffb\u8bd1\uff1a\u4e00\u7bc7\u6700\u521d\u4e8e2022\u5e746\u6708\u63d0\u4ea4\u5230arXiv.org\u7684\u5173\u4e8eAI\u76d1\u7ba1\u7684\u8bba\u6587\u5c55\u793a\u4e86\u4e00\u4e2a\u5305\u542b\u4e09\u4e2a\u5750\u6807\u8f74\u7684\u56fe\u8868\uff0c\u6bcf\u4e2a\u8f74\u7684\u4e24\u7aef\u90fd\u5e26\u6709\u6807\u7b7e\u8bcd\u3002\u57282016\u5e748\u670811\u65e5\u63d0\u4ea4\u7ed9arXiv.org\u7684\u300a\u7269\u7406\u4e0e\u793e\u4f1a\u300b\u6587\u7ae0\u4e2d\uff0c\u8fd9\u4e9b\u8bcd\u4e2d\u54ea\u4e2a\u88ab\u7528\u6765\u63cf\u8ff0\u4e00\u79cd\u793e\u4f1a\u7c7b\u578b<\/code><\/pre>\n<p><strong>Final Answer<\/strong>:<\/p>\n<pre><code class=\"language-bash\">egalitarian\n\u7ffb\u8bd1\uff1a\u5e73\u7b49\u4e3b\u4e49<\/code><\/pre>\n<p><strong>\u8bf4\u660e<\/strong>\uff1a<br \/>\n\u4e3a\u4e86\u56de\u7b54\u4e0a\u8ff0\u7684Question\uff0cAgent\u5fc5\u987b\u5177\u6709\u5916\u90e8\u5de5\u5177\u8bbf\u95ee\u7684\u80fd\u529b\uff0c\u5b83\u9700\u8981\u6267\u884c\u4ee5\u4e0b\u6b65\u9aa4\u624d\u80fd\u5f97\u5230\u7b54\u6848\uff1a<\/p>\n<ol>\n<li>\u8bbf\u95ee <code>arXiv.org<\/code> \u5e76\u8fdb\u5165<code>\u201c\u9ad8\u7ea7\u641c\u7d22\uff08Advanced Search\uff09\u201d<\/code>\u9875\u9762\u3002<\/li>\n<li>\u5728\u641c\u7d22\u6846\u4e2d\u8f93\u5165 <code>\u201cAI regulation\u201d<\/code>\uff0c\u5e76\u4ece\u4e0b\u62c9\u83dc\u5355\u4e2d\u9009\u62e9 <code>\u201cAll fields\u201d<\/code>\uff08\u6240\u6709\u5b57\u6bb5\uff09\uff0c\u63d0\u4ea4\u641c\u7d22 [4]\u3002<\/li>\n<li>\u5728\u65e5\u671f\u8f93\u5165\u680f\u4e2d\u586b\u5199 <code>2022-06-01<\/code> \u548c <code>2022-07-01<\/code>\uff0c\u9009\u62e9<code> \u201cSubmission date (original)\u201d<\/code>\uff08\u63d0\u4ea4\u65e5\u671f-\u539f\u59cb\uff09\uff0c\u63d0\u4ea4\u641c\u7d22\u3002<\/li>\n<li>\u5728\u641c\u7d22\u7ed3\u679c\u4e2d\u627e\u5230\u6807\u9898\u4e3a <code>\u300aFairness in Agreement With European Values: An 
Interdisciplinary Perspective on AI Regulation\u300b<\/code> \u7684\u8bba\u6587\uff0c\u786e\u8ba4\u5176\u56fe\u8868\u5305\u542b\u4e09\u4e2a\u5750\u6807\u8f74\u4e14\u8f74\u4e24\u7aef\u6709\u6807\u7b7e [1][4]\u3002<\/li>\n<li>\u8bb0\u5f55\u8be5\u56fe\u8868\u6807\u7b7e\u7684\u516d\u4e2a\u8bcd\u6c47\uff1a<code>deontological\uff08\u4e49\u52a1\u8bba\u7684\uff09<\/code>, <code>egalitarian\uff08\u5e73\u7b49\u4e3b\u4e49\u7684\uff09<\/code>, <code>localized\uff08\u672c\u5730\u5316\u7684\uff09<\/code>, <code>standardized\uff08\u6807\u51c6\u5316\u7684\uff09<\/code>, <code>utilitarian\uff08\u529f\u5229\u4e3b\u4e49\u7684\uff09<\/code>, <code>consequential\uff08\u7ed3\u679c\u4e3b\u4e49\u7684\uff09<\/code>\u3002<\/li>\n<li>\u8fd4\u56de<code> arXiv.org<\/code>\u3002<\/li>\n<li>\u5728\u5206\u7c7b\u5217\u8868\u4e2d\u627e\u5230\u5e76\u8fdb\u5165<code> \u201cPhysics and Society\u201d\uff08\u7269\u7406\u4e0e\u793e\u4f1a\uff09<\/code> \u7c7b\u522b\u9875\u9762 [4]\u3002<\/li>\n<li>\u8bb0\u5f55\u8be5\u5206\u7c7b\u7684\u6807\u7b7e\u4e3a<code> \u201cphysics.soc-ph\u201d<\/code>\u3002<\/li>\n<li>\u518d\u6b21\u8fdb\u5165<code>\u201c\u9ad8\u7ea7\u641c\u7d22\u201d<\/code>\u9875\u9762\u3002<\/li>\n<li>\u5728\u641c\u7d22\u6846\u8f93\u5165 <code>\u201cphysics.soc-ph\u201d<\/code>\uff0c\u9009\u62e9 <code>\u201cAll fields\u201d\uff08\u6240\u6709\u5b57\u6bb5\uff09<\/code>[4]\u3002<\/li>\n<li>\u5728\u65e5\u671f\u680f\u8f93\u5165<code> 2016-08-11 <\/code>\u548c <code>2016-08-12<\/code>\uff0c\u9009\u62e9 <code>\u201cSubmission date (original)\u201d<\/code>\uff08\u63d0\u4ea4\u65e5\u671f-\u539f\u59cb\uff09\uff0c\u63d0\u4ea4\u641c\u7d22\u3002<\/li>\n<li>\u5728\u7ed3\u679c\u4e2d\u641c\u7d22\u8fd9\u516d\u4e2a\u8bcd\uff0c\u627e\u5230\u6807\u9898\u4e3a <code>\u300aPhase transition from egalitarian to hierarchical societies driven by competition between cognitive and social constraints\u300b<\/code> \u7684\u8bba\u6587\uff0c\u786e\u8ba4 <code>\u201cegalitarian\u201d<\/code> \u662f\u6b63\u786e\u7b54\u6848 
[4]\u3002<\/li>\n<\/ol>\n<blockquote>\n<p>\u5907\u6ce8\uff1a<br \/>\n\u4ee5\u4e0a\u6b65\u9aa4\u5728\u6570\u636e\u96c6\u7684\u6807\u6ce8\u5185\u5bb9<code>Annotator Metadata<\/code>\u4e2d\u6709\u8bf4\u660e\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"32_%E6%A0%B7%E4%BE%8B2\"><\/span>3.2 \u6837\u4f8b2<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>task_id<\/strong>\uff1a<code>17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc<\/code><br \/>\n<strong>Question<\/strong>:<\/p>\n<pre><code class=\"language-bash\">I\u2019m researching species that became invasive after people who kept them as pets released them. There\u2019s a certain species of fish that was popularized as a pet by being the main character of the movie Finding Nemo. According to the USGS, where was this fish found as a nonnative species, before the year 2020? I need the answer formatted as the five-digit zip codes of the places the species was found, separated by commas if there is more than one place.\n\n\u8c03\u67e5\u56e0\u5ba0\u7269\u653e\u751f\u5bfc\u81f4\u5165\u4fb5\u7684\u751f\u7269\u3002\u67d0\u9c7c\u7c7b\u56e0\u300a\u6d77\u5e95\u603b\u52a8\u5458\u300b\u4e3b\u89d2\uff08\u5c3c\u83ab\uff09\u8d70\u7ea2\uff0c\u968f\u540e\u88ab\u653e\u751f\u81f3\u975e\u539f\u751f\u5730\u3002\u6839\u636eUSGS\u6570\u636e\uff0c2020\u5e74\u524d\u6b64\u9c7c\u4f5c\u4e3a\u5916\u6765\u7269\u79cd\u88ab\u53d1\u73b0\u4e8e\u4f55\u5904\uff1f\u9700\u4ee55\u4f4d\u90ae\u7f16\uff08\u591a\u4e2a\u5219\u9017\u53f7\u5206\u9694\uff09\u5448\u73b0\u7b54\u6848\u3002<\/code><\/pre>\n<p><strong>Final Answer<\/strong>:<\/p>\n<pre><code class=\"language-bash\">34689<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<br \/>\n\u4e3a\u4e86\u56de\u7b54\u4e0a\u8ff0\u7684Question\uff0cAgent\u9700\u8981\u6267\u884c\u4ee5\u4e0b\u6b65\u9aa4\u624d\u80fd\u5f97\u5230\u7b54\u6848\uff1a<\/p>\n<ol>\n<li>\u67e5\u8bc1\u7535\u5f71\u539f\u578b\u2014\u2014\u641c\u7d22&quot;finding nemo main 
character&quot;\u5e76\u786e\u8ba4\u4e3b\u89d2\u4e3a\u5c0f\u4e11\u9c7c\uff08clownfish\uff09[\u6765\u6e90\uff1a\u641c\u7d22\u5f15\u64ce]\u3002<\/li>\n<li>\u5b9a\u4f4d\u6570\u636e\u5e93\u2014\u2014\u8bbf\u95eeUSGS\u975e\u672c\u5730\u6c34\u751f\u7269\u79cd\u5e93\uff08Nonindigenous Aquatic Species\uff09\uff0c\u5728\u6d77\u6d0b\u9c7c\u7c7b\u5206\u7c7b\u4e0b\u627e\u5230\u5c0f\u4e11\u9c7c\u6761\u76ee\u300cClown anemonefish\u300d<\/li>\n<li>\u63d0\u53d6\u4f4d\u7f6e\u4fe1\u606f\u2014\u2014\u7b5b\u90092020\u5e74\u524d\u8bb0\u5f55[2]\uff08\u6848\u4f8b\uff1a\u4ec5\u4f5b\u7f57\u91cc\u8fbe\u5ddeFred Howard Park\u4e00\u5904[5]\uff09\u3002<\/li>\n<li>\u68c0\u7d22\u90ae\u7f16\uff08\u5916\u90e8\u5de5\u5177\uff09\u2014\u2014\u901a\u8fc7\u5730\u540d\u786e\u8ba4\u90ae\u7f16\u4e3a34689[7]\u3002<\/li>\n<li>\u8f93\u51fa\u7ed3\u679c \u2014\u2014 \u56e0\u5355\u6761\u8bb0\u5f55\u76f4\u63a5\u8fd4\u56de\uff1a34689<\/li>\n<\/ol>\n<h4><span class=\"ez-toc-section\" id=\"33_%E6%A0%B7%E4%BE%8B3\"><\/span>3.3 \u6837\u4f8b3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>task_id<\/strong>: <code>32102e3e-d12a-4209-9163-7b3a104efe5d<\/code><br \/>\n<strong>Question<\/strong>:<\/p>\n<pre><code class=\"language-bash\">The attached spreadsheet shows the inventory for a movie and video game rental store in Seattle, Washington. What is the title of the oldest Blu-Ray recorded in this spreadsheet? 
Return it as appearing in the spreadsheet.\n\n\u6240\u9644\u7535\u5b50\u8868\u683c\u662f\u7f8e\u56fd\u534e\u76db\u987f\u5dde\u897f\u96c5\u56fe\u5e02\u67d0\u5f71\u789f\u6e38\u620f\u79df\u8d41\u5e97\u7684\u5e93\u5b58\u6e05\u5355\uff0c\u9700\u56de\u7b54\uff1a\u8868\u683c\u4e2d\u8bb0\u5f55\u7684\u5e74\u4ee3\u6700\u65e9\u7684\u84dd\u5149\u5149\u789f\uff08Blu-Ray\uff09\u7684\u6807\u9898\u662f\u4ec0\u4e48\uff1f\u7b54\u6848\u5fc5\u987b\u4e25\u683c\u6309\u8868\u683c\u5185\u539f\u6587\u683c\u5f0f\u8fd4\u56de\u3002<\/code><\/pre>\n<p><strong>Final Answer<\/strong>:<\/p>\n<pre><code class=\"language-bash\">Time-Parking 2: Parallel Universe<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<br \/>\n\u4e3a\u4e86\u56de\u7b54\u4e0a\u8ff0\u7684Question\uff0cAgent\u9700\u8981\u6267\u884c\u4ee5\u4e0b\u6b65\u9aa4\u624d\u80fd\u5f97\u5230\u7b54\u6848\uff1a<\/p>\n<ol>\n<li>\u6253\u5f00\u6587\u4ef6\uff1a\u8f7d\u5165\u63d0\u4f9b\u7684\u7535\u5b50\u8868\u683c\u3002<\/li>\n<li>\u7b5b\u9009\u6570\u636e\uff1a\u627e\u5230\u201cBlu-Ray\u201d\u5206\u7c7b\u5217\uff0c\u6bd4\u5bf9\u5e74\u4efd\u5b57\u6bb5\uff0c\u786e\u8ba4\u6700\u65e9\u7684\u5e74\u4efd\u4e3a 2009\u5e74\u3002<\/li>\n<li>\u5b9a\u4f4d\u76ee\u6807\uff1a\u9501\u5b9a2009\u5e74\u4efd\u5bf9\u5e94\u7684\u84dd\u5149\u5149\u789f\u6807\u9898\uff1a<br \/>\n\u300aTime-Parking 2: Parallel Universe\u300b\uff08\u4fdd\u7559\u539f\u6587\u62fc\u5199\u548c\u5927\u5c0f\u5199\uff09<\/li>\n<\/ol>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u84dd\u5149\u5149\u789f\u6570\u636e\u96c6.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u84dd\u5149\u5149\u789f\u6570\u636e\u96c6.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"34_%E6%A0%B7%E4%BE%8B4\"><\/span>3.4 \u6837\u4f8b4<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>task_id: <code>cca530fc-4052-43b2-b130-b30968d8aa44<\/code><br \/>\nQuestion:<\/p>\n<pre><code 
class=\"language-bash\">Review the chess position provided in the image. It is black&#039;s turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.\n\n\u67e5\u770b\u56fe\u7247\u4e2d\u7ed9\u51fa\u7684\u68cb\u5c40\u60c5\u51b5\u3002\u8f6e\u5230\u9ed1\u65b9\u8d70\u68cb\u4e86\u3002\u7ed9\u51fa\u9ed1\u65b9\u80fd\u786e\u4fdd\u83b7\u80dc\u7684\u6b63\u786e\u4e0b\u4e00\u6b65\u8d70\u6cd5\u3002\u8bf7\u4ee5\u56fd\u9645\u8c61\u68cb\u7684\u4ee3\u6570\u8868\u793a\u6cd5\u7ed9\u51fa\u60a8\u7684\u56de\u7b54\u3002<\/code><\/pre>\n<p><strong>Final Answer<\/strong>:<\/p>\n<pre><code class=\"language-bash\">Rd5<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<br \/>\n\u4e3a\u4e86\u56de\u7b54\u4e0a\u8ff0\u7684Question\uff0cAgent\u9700\u8981\u6267\u884c\u4ee5\u4e0b\u6b65\u9aa4\u624d\u80fd\u5f97\u5230\u7b54\u6848\uff1a<\/p>\n<ol>\n<li>\n<p>\u8bfb\u53d6\u56fe\u7247<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u56fd\u9645\u8c61\u68cb\u68cb\u5c40\u56fe\u7247.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u56fd\u9645\u8c61\u68cb\u68cb\u5c40\u56fe\u7247.png\" alt=\"\" \/><\/a><\/p>\n<\/li>\n<li>\n<p>\u68cb\u76d8\u5c40\u9762\u8bc4\u4f30\uff08\u6240\u6709\u5b50\u529b\u4f4d\u7f6e\u4e0e\u6001\u52bf\u5206\u6790\uff09<\/p>\n<\/li>\n<li>\n<p>\u751f\u6210\u9ed1\u65b9\u6700\u4f18\u7740\u6cd5 \u2192 \u6700\u7ec8\u8f93\u51fa\uff1aRd5 [\u7cbe\u786e\u4ee3\u6570\u8bb0\u8c31\u6cd5\u683c\u5f0f]<\/p>\n<\/li>\n<\/ol>\n<p>\u901a\u8fc7\u4ee5\u4e0a\u7684\u6837\u4f8b\u5206\u6790\uff0c\u521d\u6b65\u4e86\u89e3\u5230\uff1a<\/p>\n<ul>\n<li>GAIA\u6570\u636e\u96c6\u662f\u4ee5<code>Question + 
Answer<\/code>\u7684\u5f62\u5f0f\u5b58\u5728\u7684\u3002<\/li>\n<li>GAIA\u6570\u636e\u96c6\u7684\u95ee\u9898\u4e00\u5b9a\u662f\u9700\u8981<code>Agent<\/code>\u501f\u52a9\u5916\u90e8\u5de5\u5177(\u5982\uff1a\u641c\u7d22\u5f15\u64ce\u3001\u6570\u636e\u5e93\u3001\u6587\u4ef6\u89e3\u6790\u5668\u7b49)\u624d\u80fd\u5f97\u5230\u6b63\u786e\u7b54\u6848\u3002<\/li>\n<li>GAIA\u6570\u636e\u96c6\u4e2d\u7684\u90e8\u5206\u95ee\u9898\u4f1a\u6d89\u53ca\u5230\u9644\u4ef6\u6587\u6863\uff0c\u6587\u6863\u7c7b\u578b\u5305\u542b\uff1axlsx\u3001pdf\u3001docx\u3001png\u7b49\uff0cagent\u9700\u8981\u80fd\u591f\u89e3\u6790\u5e76\u63d0\u53d6\u5176\u4e2d\u7684\u4fe1\u606f\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%AE%9E%E6%96%BD\"><\/span>\u5b9e\u65bd<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<h3><span class=\"ez-toc-section\" id=\"4_%E6%B7%BB%E5%8A%A0GAIA%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>4. \u6dfb\u52a0GAIA\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"41_opencompassdatasets_%E5%A2%9E%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AE%9A%E4%B9%89\"><\/span>4.1 <code>opencompass\/datasets<\/code> \u589e\u52a0\u6570\u636e\u96c6\u5b9a\u4e49<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u4ee3\u7801\u6587\u4ef6\uff1a<code>libs\/OpenCompass\/opencompass\/datasets\/gaia.py<\/code><br \/>\n\u4ee3\u7801\u5185\u5bb9\uff1a<\/p>\n<pre><code class=\"language-python\">import json\nfrom os import environ\nimport os\n\nfrom datasets import Dataset\n\nfrom opencompass.registry import LOAD_DATASET\nfrom opencompass.utils import get_data_path\n\nfrom .base import BaseDataset\n\n@LOAD_DATASET.register_module()\nclass GAIADataset(BaseDataset):\n\n    @staticmethod\n    def load(path, local_mode: bool = False):        \n        from datasets import load_dataset\n        try:\n            # 
\u56e0\u4e3aModelScope\u7684GAIA\u6570\u636e\u96c6\u8bfb\u53d6\u5b58\u5728\u95ee\u9898\uff0c\u6240\u4ee5\u4ecehuggingface\u8bfb\u53d6\n            ds = load_dataset(&quot;gaia-benchmark\/GAIA&quot;, &#039;2023_all&#039;, split=&#039;validation&#039;)\n            rows = []\n            for item in ds:\n                rows.append({\n                    &#039;question&#039;: item[&#039;Question&#039;],\n                    &#039;answerKey&#039;: item[&#039;Final answer&#039;],\n                    &#039;file_path&#039;: item[&#039;file_path&#039;],\n                    &#039;file_name&#039;: item[&#039;file_name&#039;],\n                    &#039;level&#039;: item[&#039;Level&#039;]\n                })\n        except Exception as e:\n            print(f&quot;Error loading local file: {e}&quot;)\n\n        return Dataset.from_list(rows)<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"42_opencompassconfigs_%E5%A2%9E%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E9%85%8D%E7%BD%AE\"><\/span>4.2 <code>opencompass\/configs<\/code> \u589e\u52a0\u6570\u636e\u96c6\u914d\u7f6e<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u4ee3\u7801\u6587\u4ef6\uff1a<code>libs\/OpenCompass\/opencompass\/configs\/datasets\/GAIA\/gaia_gen.py<\/code><br \/>\n\u4ee3\u7801\u5185\u5bb9\uff1a<\/p>\n<pre><code class=\"language-python\">from opencompass.openicl.icl_prompt_template import PromptTemplate\nfrom opencompass.openicl.icl_retriever import ZeroRetriever\nfrom opencompass.openicl.icl_inferencer import GenInferencer\nfrom opencompass.openicl.icl_evaluator import AccEvaluator\nfrom opencompass.datasets import GAIADataset\nfrom opencompass.utils.text_postprocessors import first_capital_postprocess\n\ngaia_reader_cfg = dict(\n    input_columns=&#039;question&#039;,\n    output_column=&#039;answerKey&#039;,\n    test_split=&#039;test&#039;)\n\ngaia_infer_cfg = dict(\n    prompt_template=dict(\n        type=PromptTemplate,\n        template=dict(round=[\n            dict(\n                
role=&#039;HUMAN&#039;,\n                prompt=\n                &#039;\u8bf7\u6839\u636e\u95ee\u9898\uff1a{question}\\n\u7ed9\u51fa\u7b54\u6848\u3002\u7b54\uff1a&#039;\n            ),\n        ]),\n    ),\n    retriever=dict(type=ZeroRetriever),\n    inferencer=dict(type=GenInferencer),\n)\ngaia_eval_cfg = dict(\n    evaluator=dict(type=AccEvaluator),\n    pred_role=&#039;BOT&#039;,\n    pred_postprocessor=dict(type=first_capital_postprocess),\n)\n\ngaia_datasets = [\n    dict(\n        abbr=&#039;gaia-validation&#039;,\n        type=GAIADataset,\n        path=&#039;opencompass\/gaia&#039;,\n        local_mode=False,\n        reader_cfg=gaia_reader_cfg,\n        infer_cfg=gaia_infer_cfg,\n        eval_cfg=gaia_eval_cfg,\n    )\n]\n<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"43_opencompassutilsdatasets_infopy_%E6%B7%BB%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86%E6%98%A0%E5%B0%84\"><\/span>4.3 <code>opencompass\/utils\/datasets_info.py<\/code> \u6dfb\u52a0\u6570\u636e\u96c6\u6620\u5c04<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u4ee3\u7801\u6587\u4ef6\uff1a<code>libs\/OpenCompass\/opencompass\/utils\/datasets_info.py<\/code><br \/>\n\u4ee3\u7801\u5185\u5bb9\uff1a<\/p>\n<pre><code class=\"language-python\">DATASETS_MAPPING = {\n    # GAIA Datasets\n    &quot;opencompass\/gaia&quot;: {\n        &quot;ms_id&quot;: None,\n        &quot;hf_id&quot;: &quot;gaia-benchmark\/GAIA&quot;,\n        &quot;local&quot;: &quot;.\/data\/gaia\/&quot;,\n    },\n    # \u4ee5\u4e0b\u5185\u5bb9\u7701\u7565<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"44_dataset-indexyml_%E6%B3%A8%E5%86%8C%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>4.4 <code>dataset-index.yml<\/code> \u6ce8\u518c\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u4ee3\u7801\u6587\u4ef6\uff1a<code>libs\/OpenCompass\/opencompass\/dataset-index.yml<\/code><br \/>\n\u4ee3\u7801\u5185\u5bb9\uff1a<\/p>\n<pre><code class=\"language-yaml\">- gaia:\n    name: GAIA\n    category: 
Reasoning\n    paper: https:\/\/arxiv.org\/abs\/2311.12983\n    configpath: opencompass\/configs\/datasets\/GAIA\/gaia_gen.py\n    configpath_llmjudge: &#039;&#039;<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"45_init_py_%E6%B7%BB%E5%8A%A0%E5%88%9D%E5%A7%8B%E5%8C%96%E4%BF%A1%E6%81%AF\"><\/span>4.5 <code>__init__.py<\/code> \u6dfb\u52a0\u521d\u59cb\u5316\u4fe1\u606f<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u4ee3\u7801\u6587\u4ef6\uff1a<code>libs\/OpenCompass\/opencompass\/datasets\/__init__.py<\/code><br \/>\n\u4ee3\u7801\u5185\u5bb9\uff1a<\/p>\n<pre><code class=\"language-python\">from .gaia import *  # noqa: F401, F403<\/code><\/pre>\n<p>\u81f3\u6b64\uff0c\u6211\u4eec\u5b8c\u6210\u4e86\u5728OpenCompass\u4e2d\u6dfb\u52a0GAIA\u6570\u636e\u96c6\u7684\u914d\u7f6e\u3002<\/p>\n<h3><span class=\"ez-toc-section\" id=\"5_%E8%B0%83%E8%AF%95%E8%BF%90%E8%A1%8C_gaia_%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>5. \u8c03\u8bd5\u8fd0\u884c <code>gaia<\/code> \u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u5728VsCode\u7684<code>launch.json<\/code>\u4e2d\uff0c\u589e\u52a0\u5982\u4e0b\u8c03\u8bd5\u914d\u7f6e\uff1a<\/p>\n<pre><code class=\"language-json\">{\n    &quot;version&quot;: &quot;0.2.0&quot;,\n    &quot;configurations&quot;: [\n        {\n            &quot;name&quot;: &quot;OpenCompass&quot;,\n            &quot;type&quot;: &quot;python&quot;,\n            &quot;request&quot;: &quot;launch&quot;,\n            &quot;module&quot;: &quot;opencompass.cli.main&quot;,\n            &quot;cwd&quot;: &quot;${workspaceFolder}\/libs\/OpenCompass&quot;,\n            &quot;python&quot;: &quot;${command:python.interpreterPath}&quot;,\n            &quot;env&quot;: {\n                &quot;MODEL&quot;: &quot;deepseek-ai\/DeepSeek-V3&quot;,\n                &quot;API_KEY&quot;: &quot;sk-pboelsoxvgeapocquovvdkvv******&quot;,\n                &quot;API_URL&quot;: &quot;https:\/\/api.siliconflow.cn\/v1\/&quot;  \n            },\n            
&quot;args&quot;: [\n                &quot;--models&quot;, &quot;custom_api&quot;, \n                &quot;--datasets&quot;, &quot;gaia_gen&quot;, \n                &quot;--debug&quot;, &quot;-m&quot;, &quot;all&quot;]\n        }\n    ]\n} <\/code><\/pre>\n<p>\u8fd0\u884cOpenCompass\u8c03\u8bd5\u914d\u7f6e\uff0c\u8fd0\u884c\u7ed3\u679c\u5982\u4e0b\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/vscode\u8c03\u8bd5\u7ed3\u679c.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/vscode\u8c03\u8bd5\u7ed3\u679c.png\" alt=\"\" \/><\/a><\/p>\n<p>\u4ece\u56fe\u4e2d\u53ef\u4ee5\u770b\u5230\uff0c\u65b0\u6dfb\u52a0\u7684gaia\u6570\u636e\u96c6\u8fd0\u884c\u6210\u529f\uff0c\u4e0b\u4e00\u6b65\u5c06\u6570\u636e\u96c6\u6dfb\u52a0\u81f3ai-eval-system\u4e2d\uff0c\u4ee5\u4fbf\u8fdb\u884cDify\u5e73\u53f0\u4e0aAgent\u7684\u8bc4\u6d4b\u3002<\/p>\n<h3><span class=\"ez-toc-section\" id=\"6_%E9%9B%86%E6%88%90%E8%87%B3ai-eval-system%E4%B8%AD%E8%BF%9B%E8%A1%8CDify%E5%B9%B3%E5%8F%B0%E5%BA%94%E7%94%A8%E8%AF%84%E6%B5%8B\"><\/span>6. 
\u96c6\u6210\u81f3ai-eval-system\u4e2d\u8fdb\u884cDify\u5e73\u53f0\u5e94\u7528\u8bc4\u6d4b<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"61_%E9%85%8D%E7%BD%AE%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>6.1 \u914d\u7f6e\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u786e\u4fdd\u5728<code>ai-eval-system<\/code>\u4e0b\u7684<code>libs\/OpenCompass<\/code>\u7684\u76ee\u5f55\u4e0b\uff0c\u5df2\u5b8c\u6210\u4e0a\u8ff0<code>4.1<\/code>~<code>4.5<\/code>\u6b65\u9aa4\u3002<\/p>\n<h4><span class=\"ez-toc-section\" id=\"62_%E6%95%B0%E6%8D%AE%E5%BA%93%E6%B7%BB%E5%8A%A0%E6%95%B0%E6%8D%AE%E9%9B%86\"><\/span>6.2 \u6570\u636e\u5e93\u6dfb\u52a0\u6570\u636e\u96c6<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u901a\u8fc7\u4ee5\u4e0bSQL\u547d\u4ee4\u5411\u6570\u636e\u5e93\u6dfb\u52a0\u6570\u636e\u96c6\u4fe1\u606f\uff1a<\/p>\n<pre><code class=\"language-SQL\">INSERT INTO datasets (\n    name, \n    description, \n    category,\n    type,\n    file_path, \n    configuration, \n    user_id, \n    is_active\n) VALUES \n(\n    &#039;gaia_gen&#039;, \n    &#039;GAIA\u6570\u636e\u96c6\uff0c\u8fd9\u662f\u4e00\u79cd\u4e25\u82db\u7684\u8bc4\u4f30Agent\u901a\u7528\u80fd\u529b\u8bc4\u6d4b\u7684\u6570\u636e\u96c6\uff0c\u5176\u4e2d\u5305\u542b165\u4e2a\u4efb\u52a1\uff0c\u6bcf\u4e2a\u4efb\u52a1\u90fd\u9700\u8981agent\u501f\u52a9\u5916\u90e8\u5de5\u5177\u6765\u5b8c\u6210\u3002&#039;, \n    &#039;\u667a\u80fd\u4f53&#039;, \n    &#039;benchmark&#039;, \n    &#039;\/data\/gaia&#039;, \n    &#039;{&quot;format&quot;: &quot;chat&quot;}&#039;, \n    1, \n    1\n);<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"63_%E5%BC%80%E5%8F%91%E8%80%85%E6%96%B9%E5%BC%8F%E5%90%AF%E5%8A%A8ai-eval-system\"><\/span>6.3 \u5f00\u53d1\u8005\u65b9\u5f0f\u542f\u52a8ai-eval-system<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5177\u4f53\u65b9\u6cd5\u8bf7\u53c2\u8003<a 
href=\"https:\/\/github.com\/domonic18\/ai-eval-system?tab=readme-ov-file#%E5%BC%80%E5%8F%91%E6%A8%A1%E5%BC%8F%E8%BF%90%E8%A1%8C\">Readme\u6587\u6863<\/a>\uff0c\u6b64\u5904\u4e0d\u518d\u8d58\u8ff0\u3002<\/p>\n<h4><span class=\"ez-toc-section\" id=\"64_Dify%E5%B9%B3%E5%8F%B0%E5%88%9B%E5%BB%BAAgent%E5%BA%94%E7%94%A8\"><\/span>6.4 Dify\u5e73\u53f0\u521b\u5efaAgent\u5e94\u7528<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728Dify\u5e73\u53f0\u4e0a\u521b\u5efa\u4e00\u4e2aAgent\u5e76\u4e14\u4e3aAgent\u6dfb\u52a0arxiv_search\u5de5\u5177\uff0c\u5982\u4e0b\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u521b\u5efa\u5e94\u7528.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u521b\u5efa\u5e94\u7528.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"64_ai-eval-system%E5%B9%B3%E5%8F%B0%E5%88%9B%E5%BB%BA%E8%AF%84%E6%B5%8B%E4%BB%BB%E5%8A%A1\"><\/span>6.5 ai-eval-system\u5e73\u53f0\u521b\u5efa\u8bc4\u6d4b\u4efb\u52a1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728ai-eval-system\u5e73\u53f0\u521b\u5efa\u8bc4\u6d4b\u4efb\u52a1\uff0c\u914d\u7f6e\u76f8\u5e94\u7684DIFY_URL\u548cDIFY_API_KEY\u3002<\/p>\n<blockquote>\n<p>\u5907\u6ce8\uff1a<br \/>\nDIFY_URL\u548cDIFY_API_KEY\u7684\u83b7\u53d6\u65b9\u6cd5\uff0c\u672c\u7ae0\u4e0d\u518d\u8d58\u8ff0\uff0c\u5177\u4f53\u8bf7\u53c2\u8003<a href=\"https:\/\/github.com\/domonic18\/ai-eval-system\">Readme\u7684\u4f7f\u7528\u8bf4\u660e<\/a>\u3002<\/p>\n<\/blockquote>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u521b\u5efa\u8bc4\u6d4b\u4efb\u52a1.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u521b\u5efa\u8bc4\u6d4b\u4efb\u52a1.png\" alt=\"\" \/><\/a><br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u914d\u7f6e\u8bc4\u6d4b\u6570\u636e.png\" 
data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u914d\u7f6e\u8bc4\u6d4b\u6570\u636e.png\" alt=\"\" \/><\/a><\/p>\n<p>\u521b\u5efa\u5e76\u8fd0\u884c\u8bc4\u6d4b\u540e\uff0c\u5728Dify\u5e73\u53f0\u5bf9\u5e94Agent\u7684\u65e5\u5fd7\u4e0e\u6807\u6ce8\u4e2d\u53ef\u4ee5\u770b\u5230\u5b9e\u65f6\u65e5\u5fd7\u60c5\u51b5\u3002<\/p>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/Dify\u5e73\u53f0.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/Dify\u5e73\u53f0.png\" alt=\"\" \/><\/a><\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E6%80%BB%E7%BB%93\"><\/span>\u603b\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u81f3\u6b64\uff0c\u6211\u4eec\u5df2\u5b8c\u6210\u5728opencompass\u4e2d\u6dfb\u52a0GAIA\u6570\u636e\u96c6\uff0c\u5e76\u96c6\u6210\u81f3ai-eval-system\u4e2d\u8fdb\u884cDify\u5e73\u53f0\u5e94\u7528\u8bc4\u6d4b\u3002<\/p>\n<p>\u4f46\u662f\u5982\u679c\u6570\u636e\u96c6\u662f\u5e26\u6709\u9644\u4ef6\u7684\u6837\u4f8b(\u5982\uff1a<code>3.3 \u6837\u4f8b3<\/code> \u548c <code>3.4 \u6837\u4f8b4<\/code>)\uff0c\u76ee\u524d\u8fd8\u4e0d\u652f\u6301\uff0c\u8fd8\u9700\u8981\u7ee7\u7eed\u8c03\u7814\u5b9e\u73b0\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\"><\/span>\u53c2\u8003\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li><a href=\"https:\/\/arxiv.org\/abs\/2311.12983\">arxiv\uff1aGAIA: a benchmark for General AI Assistants<\/a><\/li>\n<li><a href=\"https:\/\/opencompass.readthedocs.io\/zh-cn\/latest\/advanced_guides\/new_dataset.html\">OpenCompass\u5b98\u7f51\uff1a\u6dfb\u52a0\u65b0\u6570\u636e\u96c6<\/a><\/li>\n<li><a href=\"https:\/\/huggingface.co\/datasets\/gaia-benchmark\/GAIA\">Huggingface\uff1aGAIA 
\u901a\u7528\u4eba\u5de5\u667a\u80fd\u52a9\u624b\u7684\u57fa\u51c6\u6570\u636e\u96c6<\/a><\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E9%99%84%E5%BD%95\"><\/span>\u9644\u5f55<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u9664<code>GAIA<\/code>\u4e4b\u5916\uff0c\u4ee5\u4e0b\u7684\u6570\u636e\u96c6\u65b9\u6848\u4e5f\u5e38\u7528\u4e8e\u8bc4\u4f30 <code>Agent<\/code> \u80fd\u529b\uff1a<\/p>\n<ul>\n<li><strong>ToolQA<\/strong>\uff1a\u5c06\u73b0\u6709\u6570\u636e\u96c6\u4e0e\u4eba\u7c7b\u6ce8\u91ca\uff08\u5982 MMLU\u3001MATH \u7b49\uff09\u7ed3\u5408\u8d77\u6765\uff0c\u4f46\u5b58\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u6570\u636e\u6c61\u67d3\u7684\u98ce\u9669\uff0c\u5e76\u4e14\u65e0\u6cd5\u786e\u4fdd\u5b9e\u9645\u6d4b\u8bd5\u4e86\u5de5\u5177\u7684\u4f7f\u7528\u60c5\u51b5\u3002<\/li>\n<li><strong>APIBench<\/strong>\uff1a\u7528\u4e8e\u6d4b\u8bd5\u7c7b\u4f3c\u4ee3\u7406\u7684\u7cfb\u7edf\u8c03\u7528\u5176\u7279\u5b9a <code>API<\/code> \u7684\u80fd\u529b\uff0c\u7c7b\u4f3c\u4e8e <code>API-Bank<\/code>\uff0c\u540e\u8005\u63d0\u4f9b\u4e00\u4e2a <code>API<\/code> \u6c60\u4ee5\u5728\u8bc4\u4f30\u8fc7\u7a0b\u4e2d\u5e2e\u52a9\u5927\u8bed\u8a00\u6a21\u578b\u3002<\/li>\n<li><strong>AgentBench<\/strong>\uff1a\u63d0\u4f9b\u4e86\u8bb8\u591a\u5c01\u95ed\u73af\u5883\uff0c\u5728\u8fd9\u4e9b\u73af\u5883\u4e2d\u53ef\u4ee5\u90e8\u7f72\u4f5c\u4e3a\u52a9\u624b\u7684\u5927\u8bed\u8a00\u6a21\u578b\u6765\u56de\u7b54\u7528\u6237\u67e5\u8be2\uff08\u4ece Unix shell \u5230\u7f51\u7edc\u8d2d\u7269 API\uff09\u3002\u4f46\u662f\u7531\u4e8e\u8fd9\u7c7b\u8bc4\u4f30\u4f9d\u8d56\u4e8e\u5c01\u95ed\u73af\u5883\uff0c\u5b83\u4eec\u53ef\u80fd\u8bc4\u4f30\u7684\u662f\u52a9\u624b\u5bf9\u7279\u5b9a <code>API<\/code> 
\u7684\u5b66\u4e60\u4f7f\u7528\u7a0b\u5ea6\uff0c\u800c\u4e0d\u662f\u57fa\u4e8e\u73b0\u5b9e\u4e16\u754c\u4ea4\u4e92\u7684\u66f4\u666e\u904d\u7ed3\u679c\u3002<\/li>\n<li><strong>OpenAGI<\/strong>\uff1a\u63a8\u51fa\u4e86\u4e00\u4e2a\u5e73\u53f0\u548c\u57fa\u51c6\u6d4b\u8bd5\uff0c\u7531\u591a\u4e2a\u8de8\u6a21\u6001\u548c\u80fd\u529b\u7684\u591a\u6b65\u9aa4\u4efb\u52a1\u7ec4\u6210\uff0c\u4e0e <code>GAIA<\/code> \u66f4\u4e3a\u63a5\u8fd1\u3002\u4e0e GAIA \u7684\u6838\u5fc3\u533a\u522b\u5728\u4e8e\uff0c\u4ed6\u4eec\u7684\u4efb\u52a1\u4fa7\u91cd\u4e8e\u5f53\u524d\u6a21\u578b\u7684\u80fd\u529b\uff0c\u800c\u4e0d\u662f\u672a\u6765\u7684\u8fdb\u6b65\u3002<\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u80cc\u666f \u4e3a\u4e86\u80fd\u591f\u66f4\u597d\u5730\u8bc4\u4f30Agent\u7684\u80fd\u529b [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":40041,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"aside","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"default","adv-header-id-meta":"","stick-header-meta":"default","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"categories":[69],"tags":[83],"class_list":["post-40031","post","type-post","status-publish","format-aside","has-post-thumbnail","hentry","category-report","tag-83","post_format-post-format-aside"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v26.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA - \u4e00\u8d77AI\u6280\u672f<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/17aitech.com\/?p=40031\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/17aitech.com\/?p=40031\",\"url\":\"https:\/\/17aitech.com\/?p=40031\",\"name\":\"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA - 
\u4e00\u8d77AI\u6280\u672f\",\"isPartOf\":{\"@id\":\"https:\/\/17aitech.com\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/17aitech.com\/?p=40031#primaryimage\"},\"image\":{\"@id\":\"https:\/\/17aitech.com\/?p=40031#primaryimage\"},\"thumbnailUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg\",\"datePublished\":\"2025-04-10T06:03:02+00:00\",\"dateModified\":\"2025-04-10T08:06:21+00:00\",\"author\":{\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\"},\"breadcrumb\":{\"@id\":\"https:\/\/17aitech.com\/?p=40031#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/17aitech.com\/?p=40031\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/?p=40031#primaryimage\",\"url\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg\",\"contentUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg\",\"width\":1200,\"height\":675},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/17aitech.com\/?p=40031#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/17aitech.com\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/17aitech.com\/#website\",\"url\":\"https:\/\/17aitech.com\/\",\"name\":\"\u4e00\u8d77AI\u6280\u672f\",\"description\":\"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca\",\"alternateName\":\"\u4e00\u8d77AI\u6280\u672f\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/17aitech.com\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLangu
age\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\",\"name\":\"Dongming\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/image\/\",\"url\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"contentUrl\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"caption\":\"Dongming\"},\"description\":\"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002\",\"sameAs\":[\"http:\/\/17aitech.com\"],\"url\":\"https:\/\/17aitech.com\/?page_id=33738&user=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA - \u4e00\u8d77AI\u6280\u672f","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/17aitech.com\/?p=40031","schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/17aitech.com\/?p=40031","url":"https:\/\/17aitech.com\/?p=40031","name":"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA - 
\u4e00\u8d77AI\u6280\u672f","isPartOf":{"@id":"https:\/\/17aitech.com\/#website"},"primaryImageOfPage":{"@id":"https:\/\/17aitech.com\/?p=40031#primaryimage"},"image":{"@id":"https:\/\/17aitech.com\/?p=40031#primaryimage"},"thumbnailUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg","datePublished":"2025-04-10T06:03:02+00:00","dateModified":"2025-04-10T08:06:21+00:00","author":{"@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739"},"breadcrumb":{"@id":"https:\/\/17aitech.com\/?p=40031#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/17aitech.com\/?p=40031"]}]},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/?p=40031#primaryimage","url":"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg","contentUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2025\/04\/\u5c01\u9762.jpg","width":1200,"height":675},{"@type":"BreadcrumbList","@id":"https:\/\/17aitech.com\/?p=40031#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/17aitech.com\/"},{"@type":"ListItem","position":2,"name":"\u3010\u6a21\u578b\u6d4b\u8bd5\u3011\u57fa\u4e8eOpenCompass\u5b9e\u73b0Agent\u6700\u4e3a\u82db\u523b\u7684\u57fa\u51c6\u8bc4\u6d4b\uff1aGAIA"}]},{"@type":"WebSite","@id":"https:\/\/17aitech.com\/#website","url":"https:\/\/17aitech.com\/","name":"\u4e00\u8d77AI\u6280\u672f","description":"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca","alternateName":"\u4e00\u8d77AI\u6280\u672f","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/17aitech.com\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739","name":"Dongming","image":{"@type":"ImageObject","
inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/#\/schema\/person\/image\/","url":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","contentUrl":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","caption":"Dongming"},"description":"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002","sameAs":["http:\/\/17aitech.com"],"url":"https:\/\/17aitech.com\/?page_id=33738&user=1"}]}},"_links":{"self":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/40031","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=40031"}],"version-history":[{"count":2,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/40031\/revisions"}],"predecessor-version":[{"id":40044,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/40031\/revisions\/40044"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/media\/40041"}],"wp:attachment":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=40031"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=40031"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=40031"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}