{"id":13440,"date":"2024-08-14T03:10:26","date_gmt":"2024-08-13T19:10:26","guid":{"rendered":"https:\/\/17aitech.com\/?p=13440"},"modified":"2024-10-08T15:10:41","modified_gmt":"2024-10-08T07:10:41","slug":"%e3%80%90%e8%af%be%e7%a8%8b%e6%80%bb%e7%bb%93%e3%80%91day23%ef%bc%9a%e5%a4%a7%e6%a8%a1%e5%9e%8b%e8%ae%ad%e7%bb%83%e7%ad%96%e7%95%a5%ef%bc%88bert%e6%a8%a1%e5%9e%8b%e4%b8%8eglm%e6%a8%a1%e5%9e%8b","status":"publish","type":"post","link":"https:\/\/17aitech.com\/?p=13440","title":{"rendered":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_78 ez-toc-wrap-left-text counter-hierarchy ez-toc-counter ez-toc-light-blue ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">\u6587\u7ae0\u76ee\u5f55<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 
0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E5%89%8D%E8%A8%80\" >\u524d\u8a00<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%B5%84%E6%96%99\" >\u8d44\u6599<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/17aitech.com\/?p=13440\/#BERT_%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5\" >BERT \u6a21\u578b\u8bad\u7ec3\u7b56\u7565<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%83%8C%E6%99%AF%E4%BB%8B%E7%BB%8D\" >\u80cc\u666f\u4ecb\u7ecd<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\" >\u8bba\u6587\u9605\u8bfb\u7406\u89e3<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E6%91%98%E8%A6%81Abstract\" >\u6458\u8981(Abstract)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E5%BC%95%E8%A8%80Introduction\" >\u5f15\u8a00(Introduction)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E7%9B%B8%E5%85%B3%E5%B7%A5%E4%BD%9CRelated_Work\" >\u76f8\u5173\u5de5\u4f5c(Related Work)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-9\" 
href=\"https:\/\/17aitech.com\/?p=13440\/#BERT%E9%83%A8%E5%88%86BERT\" >BERT\u90e8\u5206(BERT)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E6%A8%A1%E5%9E%8B%E6%9E%B6%E6%9E%84Model_Architecture\" >\u6a21\u578b\u67b6\u6784(Model Architecture)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%BE%93%E5%85%A5%E8%BE%93%E5%87%BA%E8%A1%A8%E5%BE%81InputOutput_Representations\" >\u8f93\u5165\/\u8f93\u51fa\u8868\u5f81(Input\/Output Representations)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-12\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E4%BB%BB%E5%8A%A11_Masked_LM\" >\u4efb\u52a11: Masked LM<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-13\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E4%BB%BB%E5%8A%A12%EF%BC%9A%E4%B8%8B%E4%B8%80%E4%B8%AA%E5%8F%A5%E5%AD%90%E9%A2%84%E6%B5%8B%EF%BC%88NSP%EF%BC%89\" >\u4efb\u52a12\uff1a\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b\uff08NSP\uff09<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-14\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E9%A2%84%E8%AE%AD%E7%BB%83%E6%95%B0%E6%8D%AEPre-training_data\" >\u9884\u8bad\u7ec3\u6570\u636e(Pre-training data)<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-15\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%AE%BA%E6%96%87%E5%86%85%E5%AE%B9%E6%80%BB%E7%BB%93\" >\u8bba\u6587\u5185\u5bb9\u603b\u7ed3<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-16\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E9%A2%84%E8%AE%AD%E7%BB%83%E9%98%B6%E6%AE%B5\" >\u9884\u8bad\u7ec3\u9636\u6bb5<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a 
class=\"ez-toc-link ez-toc-heading-17\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E5%BE%AE%E8%B0%83%E9%98%B6%E6%AE%B5\" >\u5fae\u8c03\u9636\u6bb5<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-18\" href=\"https:\/\/17aitech.com\/?p=13440\/#GLM_%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5\" >GLM \u6a21\u578b\u8bad\u7ec3\u7b56\u7565<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-19\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%83%8C%E6%99%AF%E4%BB%8B%E7%BB%8D-2\" >\u80cc\u666f\u4ecb\u7ecd<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-20\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3-2\" >\u8bba\u6587\u9605\u8bfb\u7406\u89e3<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-21\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E6%91%98%E8%A6%81Abstract-2\" >\u6458\u8981(Abstract)<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-22\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E5%BC%95%E8%A8%80Introduction-2\" >\u5f15\u8a00(Introduction)<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-23\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%AE%BA%E6%96%87%E5%86%85%E5%AE%B9%E6%80%BB%E7%BB%93-2\" >\u8bba\u6587\u5185\u5bb9\u603b\u7ed3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-24\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E6%80%BB%E7%BB%93\" >\u603b\u7ed3<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-25\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E8%AF%BB%E5%90%8E%E6%84%9F\" >\u8bfb\u540e\u611f<\/a><\/li><li class='ez-toc-page-1 
ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-26\" href=\"https:\/\/17aitech.com\/?p=13440\/#%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\" >\u53c2\u8003\u8d44\u6599<\/a><\/li><\/ul><\/nav><\/div>\n<h2><span class=\"ez-toc-section\" id=\"%E5%89%8D%E8%A8%80\"><\/span>\u524d\u8a00<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5728\u524d\u4e24\u7ae0\u7684\u5b66\u4e60\u4e2d\uff0c\u6211\u4eec\u4e86\u89e3\u5230<a href=\"https:\/\/17aitech.com\/?p=12389#toc-6\">\u5927\u6a21\u578b\u7684\u8bad\u7ec3\u8fc7\u7a0b<\/a>\uff0c\u5176\u4e2dBase model(\u5e95\u5ea7\u5927\u6a21\u578b)\u7684\u8bad\u7ec3\u91c7\u7528\u4e86<code>\u6316\u7a7a\u586b\u7a7a<\/code>\u7684\u7b56\u7565\u3002\u672c\u7ae0\u6211\u4eec\u5c06\u7ed3\u5408 <code>BERT<\/code> \u548c <code>GLM<\/code> \u4e24\u4e2a\u6a21\u578b\u7684\u8bba\u6587\uff0c\u6df1\u5165\u4e86\u89e3\u5728\u9884\u8bad\u7ec3\u65f6\u4e24\u8005\u7684\u5177\u4f53\u8bad\u7ec3\u7b56\u7565\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E8%B5%84%E6%96%99\"><\/span>\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li>BERT\u8bba\u6587\uff1a<a href=\"https:\/\/arxiv.org\/pdf\/1810.04805\">https:\/\/arxiv.org\/pdf\/1810.04805<\/a><\/li>\n<li>GLM\u8bba\u6587\uff1a<a href=\"https:\/\/arxiv.org\/pdf\/2103.10360\">https:\/\/arxiv.org\/pdf\/2103.10360<\/a><\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"BERT_%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5\"><\/span>BERT \u6a21\u578b\u8bad\u7ec3\u7b56\u7565<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<h3><span class=\"ez-toc-section\" id=\"%E8%83%8C%E6%99%AF%E4%BB%8B%E7%BB%8D\"><\/span>\u80cc\u666f\u4ecb\u7ecd<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><strong>BERT<\/strong>\uff08<strong>B<\/strong>idirectional <strong>E<\/strong>ncoder <strong>R<\/strong>epresentations from 
<strong>T<\/strong>ransformers\uff09\u662f\u7531Google\u57282018\u5e74\u63d0\u51fa\u7684\u4e00\u79cd\u9884\u8bad\u7ec3\u8bed\u8a00\u6a21\u578b\u3002<\/p>\n<p><strong>\u8d21\u732e<\/strong>\uff1a<br \/>\n<code>BERT<\/code> \u5f00\u521b\u4e86<code>\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u7684\u8303\u5f0f<\/code>\uff0c\u4f7f\u5f97\u5728NLP\u4efb\u52a1\u4e2d\uff0c\u7814\u7a76\u8005\u53ef\u4ee5\u5229\u7528\u5927\u89c4\u6a21\u65e0\u6807\u6ce8\u6570\u636e\u8fdb\u884c\u9884\u8bad\u7ec3\uff0c\u4ece\u800c\u51cf\u5c11\u5bf9\u6807\u6ce8\u6570\u636e\u7684\u4f9d\u8d56\u3002<\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\"><\/span>\u8bba\u6587\u9605\u8bfb\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E6%91%98%E8%A6%81Abstract\"><\/span>\u6458\u8981(Abstract)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.1)<\/p>\n<blockquote>\n<p>We introduce a new language representation model called <strong>BERT<\/strong>, which stands for <strong>B<\/strong>idirectional <strong>E<\/strong>ncoder <strong>R<\/strong>epresentations from <strong>T<\/strong>ransformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. 
<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.1)<\/p>\n<blockquote>\n<p>\u6211\u4eec\u4ecb\u7ecd\u4e00\u79cd\u65b0\u7684\u8bed\u8a00\u8868\u793a\u6a21\u578b\uff0c\u79f0\u4e3a BERT\uff08Bidirectional Encoder Representations from Transformers\uff09\uff0c\u5373\uff1a\u57fa\u4e8e<code>Transformers<\/code> \u7684\u53cc\u5411\u7f16\u7801\u8868\u5f81\u6a21\u578b\u3002\u4e0e\u6700\u8fd1\u7684\u4e00\u4e9b\u8bed\u8a00\u8868\u5f81\u6a21\u578b\u6709\u6240\u4e0d\u540c\uff0c<code>BERT<\/code> \u901a\u8fc7\u5728\u6240\u6709\u7684\u5c42\u4e2d\u5171\u540c\u8003\u8651\u4e0a\u4e0b\u6587\u7684\u65b9\u5f0f\uff0c\u5e76\u4f7f\u7528\u4e0d\u9700\u8981\u6807\u6ce8\u7684\u6587\u672c\u4e2d\u8fdb\u884c\u53cc\u5411\u7684\u3001\u6df1\u5ea6\u7684\u8bed\u8a00\u8868\u5f81\u6a21\u578b\u9884\u8bad\u7ec3\u3002<br \/>\n\u7ed3\u679c\uff0c\u9884\u8bad\u7ec3\u7684 <code>BERT<\/code> \u6a21\u578b\u53ea\u9700\u6dfb\u52a0\u4e00\u4e2a\u989d\u5916\u7684\u8f93\u51fa\u5c42\u5373\u53ef\u8fdb\u884c\u5fae\u8c03\uff0c\u4ece\u800c\u4e3a\u5e7f\u6cdb\u7684\u4efb\u52a1\uff08\u5982\u95ee\u7b54\u548c\u8bed\u8a00\u63a8\u7406\uff09\u521b\u5efa\u6700\u5148\u8fdb\u7684\u6a21\u578b\uff0c\u800c\u65e0\u9700\u5bf9\u7279\u5b9a\u4efb\u52a1\u7684\u67b6\u6784\u8fdb\u884c\u91cd\u5927\u4fee\u6539\u3002<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7406\u89e3<\/strong><\/p>\n<ul>\n<li>\n<p><strong>\u53cc\u5411\u7684\u4e0a\u4e0b\u6587\u65b9\u5f0f<\/strong>\uff1a\u662f\u6307\u4e00\u6bb5\u8bcd\u5411\u91cf\u4e2d\uff0c\u5176\u4e2d\u7684\u67d0\u4e2a\u8bcd\u53ef\u4ee5\u4e0e\u5de6\u53f3\u7684\u4e0a\u4e0b\u6587\u540c\u65f6\u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\uff0c\u800c\u4e0d\u662f\u81ea\u56de\u5f52\u5f0f\u7684\u53ea\u80fd\u8fdb\u884c\u5355\u5411\u8ba1\u7b97\u3002<\/p>\n<pre><code class=\"language-python\"># \u4f8b\u5982\uff1a\u4e00\u6bb5\u6587\u5b57\nA B C D E F G\n# 
1.\u5047\u8bbe\u5728\u8bcdD\u5904\uff0c\u5982\u679c\u662f\u5355\u5411\u81ea\u56de\u5f52\u8ba1\u7b97\uff0c\u90a3\u4e48D\u53ea\u80fd\u4e0eA\u3001B\u3001C\u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\uff1b\n# 2.\u4f46\u662f\u5982\u679c\u662f\u53cc\u5411\u7684\u4e0a\u4e0b\u6587\uff0c\u90a3\u4e48D\u53ef\u4ee5\u540c\u65f6\u4e0eA\u3001B\u3001C\u3001D\u3001E\u3001F\u3001G\u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\u3002<\/code><\/pre>\n<\/li>\n<li>\n<p><strong>\u5fae\u8c03<\/strong>\uff1a\u4e0a\u8ff0\u5173\u4e8e\u5fae\u8c03\u7684\u8868\u8ff0\uff0c\u9610\u660e\u4e86<strong>\u5f53\u524d\u5927\u6a21\u578b\u7684\u8bad\u7ec3\u601d\u8def<\/strong>\uff1a\u4e0d\u9488\u5bf9\u4efb\u4f55\u4efb\u52a1\uff0c\u5148\u8bad\u7ec3\u4e00\u4e2a\u901a\u7528\u6a21\u578b\uff0c\u7136\u540e\u518d\u7b80\u5355\u7684\u6dfb\u52a0\u4e00\u4e2alayer\u53bb\u8fdb\u884c\u5fae\u8c03\u3002<\/p>\n<\/li>\n<\/ul>\n<h4><span class=\"ez-toc-section\" id=\"%E5%BC%95%E8%A8%80Introduction\"><\/span>\u5f15\u8a00(Introduction)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.2)<\/p>\n<blockquote>\n<p>There are two existing strategies for applying pre-trained language representations to downstream tasks: feature-based and fine-tuning. The feature-based approach, such as ELMo (Peters et al., 2018a), uses task-specific architectures that include the pre-trained representations as additional features. The fine-tuning approach, such as the Generative Pre-trained Transformer (OpenAI GPT) (Radford et al., 2018), introduces minimal task-specific parameters, and is trained on the downstream tasks by simply fine-tuning all pretrained parameters. 
The two approaches share the same objective function during pre-training, where they use unidirectional language models to learn general language representations.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.2)<\/p>\n<blockquote>\n<p>\u73b0\u5728\u6709\u4e24\u79cd\u5e94\u7528\u4e8e\u4e0b\u6e38\u4efb\u52a1\u8bed\u8a00\u8868\u5f81\u6a21\u578b\u7684\u9884\u8bad\u7ec3\u7b56\u7565\uff1a<strong>\u57fa\u4e8e\u7279\u5f81\u7684<\/strong> \u548c <strong>\u57fa\u4e8e\u5fae\u8c03\u7684<\/strong>\u3002\u57fa\u4e8e\u7279\u5f81\u7684\u65b9\u6cd5\uff0c\u5982ELMo\uff08Peters et al., 2018a\uff09\uff0c\u4f7f\u7528\u7279\u5b9a\u4e8e\u4efb\u52a1\u7684\u67b6\u6784\uff0c\u5c06\u9884\u8bad\u7ec3\u7684\u8868\u793a\u4f5c\u4e3a\u9644\u52a0\u7279\u5f81\u3002\u57fa\u4e8e\u5fae\u8c03\u7684\u65b9\u6cd5\uff0c\u5982\u751f\u6210\u9884\u8bad\u7ec3\u7684Transformer\uff08OpenAI GPT\uff09\uff08Radford et al., 2018\uff09\uff0c\u5f15\u5165\u4e86\u6700\u5c0f\u7684\u4efb\u52a1\u7279\u5b9a\u53c2\u6570\uff0c\u5e76\u901a\u8fc7\u7b80\u5355\u5730\u5fae\u8c03\u6240\u6709\u9884\u8bad\u7ec3\u53c2\u6570\u8fdb\u884c\u8bad\u7ec3\u3002\u8fd9\u4e24\u4e2a\u7b56\u7565\u5728\u9884\u8bad\u7ec3\u671f\u95f4\u5171\u4eab\u76f8\u540c\u7684\u76ee\u6807\u51fd\u6570\uff0c\u5e76\u4e14\u4f7f\u7528\u5355\u5411\u8bed\u8a00\u6a21\u578b\u6765\u5b66\u4e60\u901a\u7528\u7684\u8bed\u8a00\u8868\u5f81\u3002<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7406\u89e3<\/strong>\uff1a<\/p>\n<ul>\n<li>\u6709\u4e24\u79cd\u8bad\u7ec3\u7b56\u7565\uff1a\u57fa\u4e8e\u7279\u5f81\u7684 \u548c 
\u57fa\u4e8e\u5fae\u8c03\u7684\u3002<\/li>\n<li>\u57fa\u4e8e\u7279\u5f81\u7684\u8bad\u7ec3\u7b56\u7565\uff0c\u662f\u9488\u5bf9\u7279\u5b9a\u4efb\u52a1\u7684\uff0c\u9700\u8981\u4fee\u6539\u6a21\u578b\u7ed3\u6784\u3002<\/li>\n<li>\u57fa\u4e8e\u5fae\u8c03\u7684\u8bad\u7ec3\u7b56\u7565\uff0c\u53ef\u4ee5\u505a\u5230\u6539\u6570\u636e\u4e0d\u6539\u6a21\u578b\u7ed3\u6784\u3002<\/li>\n<\/ul>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.3)<\/p>\n<blockquote>\n<p>We argue that current techniques restrict the power of the pre-trained representations, especially for the fine-tuning approaches. The major limitation is that standard language models are unidirectional, and this limits the choice of architectures that can be used during pre-training. For example, in OpenAI GPT, the authors use a left-toright architecture, where every token can only attend to previous tokens in the self-attention layers of the Transformer (Vaswani et al., 2017). Such restrictions are sub-optimal for sentence-level tasks, and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering, where it is crucial to incorporate context from both directions.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.3)<\/p>\n<blockquote>\n<p>\u6211\u4eec\u8ba4\u4e3a\u5f53\u524d\u7684\u6280\u672f\u9650\u5236\u4e86\u9884\u8bad\u7ec3\u8868\u5f81\u7684\u80fd\u529b\uff0c\u5c24\u5176\u662f\u5bf9\u4e8e\u5fae\u8c03\u65b9\u6cd5\u3002\u4e3b\u8981\u7684\u9650\u5236\u5728\u4e8e\u6807\u51c6\u8bed\u8a00\u6a21\u578b\u662f\u5355\u5411\u7684\uff0c\u8fd9\u9650\u5236\u4e86\u5728\u9884\u8bad\u7ec3\u671f\u95f4\u53ef\u4ee5\u4f7f\u7528\u7684\u67b6\u6784\u9009\u62e9\u3002\u4f8b\u5982\uff0c\u5728 <code>OpenAI GPT<\/code> \u4e2d\uff0c\u4f5c\u8005\u4f7f\u7528\u4e86 <code>\u5355\u5411\u4ece\u5de6\u5230\u53f3<\/code> \u7684\u67b6\u6784\uff0c\u8fd9\u4f7f\u5f97\u5728<code>Transformer<\/code> \u7684\u81ea\u6ce8\u610f\u529b\u5c42\u4e2d\uff08Vaswani 
\u7b49\uff0c2017\uff09\uff0c\u6bcf\u4e2a <code>token<\/code> \u53ea\u80fd\u4e0e\u4e4b\u524d\u7684 <code>token<\/code> \u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\u3002\u8fd9\u6837\u7684\u9650\u5236\u5bf9\u4e8e\u53e5\u5b50\u7ea7\u4efb\u52a1\u6765\u8bf4\u662f\u6b21\u4f18\u7684\uff0c\u800c\u5728\u5c06\u57fa\u4e8e\u5fae\u8c03\u7684\u65b9\u6cd5\u5e94\u7528\u4e8e<code>token<\/code>\u7ea7\u4efb\u52a1\uff08\u5982\u95ee\u7b54\uff09\u65f6\uff0c\u8fd9\u53ef\u80fd\u975e\u5e38\u6709\u5bb3\uff0c\u56e0\u4e3a\u5728\u8fd9\u79cd\u60c5\u51b5\u4e0b\uff0c\u4ece\u4e24\u4e2a\u65b9\u5411\u6574\u5408\u4e0a\u4e0b\u6587\u81f3\u5173\u91cd\u8981\u3002<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.4)<\/p>\n<blockquote>\n<p>In this paper, we improve the fine-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers.<br \/>\nBERT alleviates the previously mentioned unidirectionality constraint by using a \u201cmasked language model\u201d (MLM) pre-training objective, inspired by the Cloze task (Taylor, 1953). The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. Unlike left-to-right language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pretrain a deep bidirectional Transformer. In addition to the masked language model, we also use a \u201cnext sentence prediction\u201d task that jointly pretrains text-pair representations. The contributions of our paper are as follows:<\/p>\n<ul>\n<li>We demonstrate the importance of bidirectional pre-training for language representations. Unlike Radford et al. (2018), which uses unidirectional language models for pre-training, BERT uses masked language models to enable pretrained deep bidirectional representations. 
This<br \/>\nis also in contrast to Peters et al. (2018a), which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs.<\/li>\n<li>We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures. BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks, outperforming many task-specific architectures.<\/li>\n<\/ul>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.4)<\/p>\n<blockquote>\n<p>\u5728\u8fd9\u7bc7\u8bba\u6587\u4e2d\uff0c\u6211\u4eec\u901a\u8fc7\u63d0\u51fa <code>BERT<\/code> \u6765\u6539\u8fdb\u57fa\u4e8e\u5fae\u8c03\u7684\u65b9\u6cd5\uff0c\u4ee5\u89e3\u51b3\u4e4b\u524d\u63d0\u5230\u7684\u5355\u5411\u9650\u5236\u3002\u901a\u8fc7\u4f7f\u7528\u201c<strong>\u63a9\u7801\u8bed\u8a00\u6a21\u578b<\/strong>\u201d\uff08MLM\uff09\u4f5c\u4e3a\u9884\u8bad\u7ec3\u76ee\u6807\uff0c\u8fd9\u79cd\u601d\u60f3\u6e90\u4e8e\u6587\u5b66\u4e2d\u7684<strong>\u5b8c\u5f62\u586b\u7a7a<\/strong> (Taylor, 1953)\u3002\u63a9\u7801\u8bed\u8a00\u6a21\u578b<strong>\u968f\u673a\u5c4f\u853d\u8f93\u5165<\/strong>\u4e2d\u7684\u67d0\u4e9b <code>token<\/code> \uff0c\u5e76\u4f7f\u76ee\u6807\u4ec5\u4f9d\u8d56\u4e8e\u5176\u4e0a\u4e0b\u6587\u3002\u4e0e\u5355\u5411\u8bed\u8a00\u6a21\u578b\u9884\u8bad\u7ec3\u76f8\u6bd4\uff0cMLM \u76ee\u6807\u4f7f\u8868\u5f81\u80fd\u591f\u878d\u5408\u5de6\u53f3\u4e0a\u4e0b\u6587\uff0c\u8fd9\u5141\u8bb8\u6211\u4eec\u9884\u8bad\u7ec3\u4e00\u4e2a\u6df1\u5ea6\u53cc\u5411 Transformer\u3002\u9664\u4e86 MLM \u5916\uff0c\u6211\u4eec\u8fd8\u4f7f\u7528\u201c\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b\u201d\u4efb\u52a1\uff0c\u8054\u5408\u9884\u8bad\u7ec3\u6587\u672c\u8868\u5f81\u3002\u6211\u4eec\u8bba\u6587\u7684\u8d21\u732e\u5982\u4e0b\uff1a<\/p>\n<ul>\n<li>\u6211\u4eec\u5c55\u793a\u4e86\u53cc\u5411\u9884\u8bad\u7ec3\u5bf9\u8bed\u8a00\u8868\u793a\u7684\u91cd\u8981\u6027\u3002\u4e0e 
Radford \u7b49\uff082018\uff09\u4f7f\u7528\u5355\u5411\u8bed\u8a00\u6a21\u578b\u8fdb\u884c\u9884\u8bad\u7ec3\u4e0d\u540c\uff0cBERT \u4f7f\u7528\u63a9\u7801\u8bed\u8a00\u6a21\u578b\u6765\u5b9e\u73b0\u9884\u8bad\u7ec3\u7684\u6df1\u5ea6\u53cc\u5411\u8868\u5f81\u3002\u8fd9\u4e5f\u4e0e Peters \u7b49\uff082018a\uff09\u5f62\u6210\u5bf9\u6bd4\uff0c\u540e\u8005\u4f7f\u7528\u72ec\u7acb\u8bad\u7ec3\u7684\u4ece\u5de6\u5230\u53f3\u548c\u4ece\u53f3\u5230\u5de6\u7684\u8bed\u8a00\u6a21\u578b\u7684\u6d45\u5c42\u62fc\u63a5\u3002  <\/li>\n<li>\u6211\u4eec\u5c55\u793a\u4e86\uff0c\u9884\u8bad\u7ec3\u8868\u5f81\u51cf\u5c11\u4e86\u5bf9\u8bb8\u591a\u91cd\u5de5\u7a0b\u5316\u7684\u7279\u5b9a\u4efb\u52a1\u67b6\u6784\u7684\u9700\u6c42\u3002<code>BERT<\/code> \u662f<strong>\u7b2c\u4e00\u4e2a<\/strong>\u57fa\u4e8e\u5fae\u8c03\u7684\u8868\u5f81\u6a21\u578b\uff0c\u5728\u5927\u91cf\u53e5\u5b50\u7ea7\u548c <code>token<\/code> \u7ea7\u4efb\u52a1\u4e0a\u5b9e\u73b0\u4e86\u6700\u5148\u8fdb\u7684\u6027\u80fd\uff0c\u8d85\u8d8a\u4e86\u8bb8\u591a\u7279\u5b9a\u4efb\u52a1\u7684\u67b6\u6784\u3002<\/li>\n<\/ul>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E7%9B%B8%E5%85%B3%E5%B7%A5%E4%BD%9CRelated_Work\"><\/span>\u76f8\u5173\u5de5\u4f5c(Related Work)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png\" data-fancybox=\"images\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png\" alt=\"\" \/><\/a><\/p>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.5)<\/p>\n<blockquote>\n<p>Figure 1: Overall pre-training and fine-tuning procedures for BERT. Apart from output layers, the same architectures are used in both pre-training and fine-tuning. The same pre-trained model parameters are used to initialize models for different down-stream tasks. 
During fine-tuning, all parameters are fine-tuned. [CLS] is a special symbol added in front of every input example, and [SEP] is a special separator token (e.g. separating questions\/answers).<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.5)<\/p>\n<blockquote>\n<p>\u56fe1\uff1aBERT\u7684\u6574\u4f53\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u8fc7\u7a0b\u3002\u9664\u4e86\u8f93\u51fa\u5c42\u5916\uff0c\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u4e2d\u4f7f\u7528\u76f8\u540c\u7684\u67b6\u6784\u3002\u76f8\u540c\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u53c2\u6570\u7528\u4e8e\u521d\u59cb\u5316\u4e0d\u540c\u4e0b\u6e38\u4efb\u52a1\u7684\u6a21\u578b\u3002\u5728\u5fae\u8c03\u8fc7\u7a0b\u4e2d\uff0c\u6240\u6709\u53c2\u6570\u90fd\u8fdb\u884c\u5fae\u8c03\u3002[CLS] \u662f\u5728\u6bcf\u4e2a\u8f93\u5165\u793a\u4f8b\u524d\u6dfb\u52a0\u7684\u7279\u6b8a\u7b26\u53f7\uff0c[SEP] \u662f\u4e00\u4e2a\u7279\u6b8a\u7684\u5206\u9694\u7b26\u6807\u8bb0\uff08\u4f8b\u5982\uff0c\u7528\u4e8e\u5206\u9694\u95ee\u9898\/\u7b54\u6848\uff09\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"BERT%E9%83%A8%E5%88%86BERT\"><\/span>BERT\u90e8\u5206(BERT)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.6)<\/p>\n<blockquote>\n<p>We introduce BERT and its detailed implementation in this section. There are two steps in our framework: pre-training and fine-tuning. During pre-training, the model is trained on unlabeled data over different pre-training tasks. For finetuning, the BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. Each downstream task has separate fine-tuned models, even though they are initialized with the same pre-trained parameters. The question-answering example in Figure 1 will serve as a running example for this section. A distinctive feature of BERT is its unified architecture across different tasks. 
There is minimal difference between the pre-trained architecture and the final downstream architecture.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.6)<\/p>\n<blockquote>\n<p>\u5728\u672c\u8282\u4e2d\uff0c\u6211\u4eec\u4ecb\u7ecdBERT\u53ca\u5176\u8be6\u7ec6\u5b9e\u73b0\u3002\u6211\u4eec\u7684\u6846\u67b6\u5305\u542b\u4e24\u4e2a\u6b65\u9aa4\uff1a<strong>\u9884\u8bad\u7ec3<\/strong> \u548c <strong>\u5fae\u8c03<\/strong>\u3002\u5728<strong>\u9884\u8bad\u7ec3\u9636\u6bb5<\/strong>\uff0c\u6a21\u578b\u5728\u4e0d\u540c\u7684\u9884\u8bad\u7ec3\u4efb\u52a1\u4e0a\u4f7f\u7528<strong>\u65e0\u6807\u6ce8\u6570\u636e<\/strong>\u8fdb\u884c\u8bad\u7ec3\u3002\u5bf9\u4e8e<strong>\u5fae\u8c03<\/strong>\uff0cBERT\u6a21\u578b\u9996\u5148\u4f7f\u7528\u9884\u8bad\u7ec3\u53c2\u6570\u8fdb\u884c\u521d\u59cb\u5316\uff0c\u7136\u540e\u4f7f\u7528\u6765\u81ea\u4e0b\u6e38\u4efb\u52a1\u7684<strong>\u6807\u6ce8\u6570\u636e<\/strong>\u5bf9\u6240\u6709\u53c2\u6570\u8fdb\u884c\u5fae\u8c03\u3002\u6bcf\u4e2a\u4e0b\u6e38\u4efb\u52a1\u90fd\u6709\u5355\u72ec\u7684\u5fae\u8c03\u6a21\u578b\uff0c\u5c3d\u7ba1\u5b83\u4eec\u90fd\u662f\u4f7f\u7528\u76f8\u540c\u7684\u9884\u8bad\u7ec3\u53c2\u6570\u521d\u59cb\u5316\u7684\u3002\u56fe1\u4e2d\u7684\u95ee\u7b54\u793a\u4f8b\u5c06\u4f5c\u4e3a\u8d2f\u7a7f\u672c\u8282\u7684\u793a\u4f8b\u3002<code>BERT<\/code> \u7684\u4e00\u4e2a\u663e\u8457\u7279\u70b9\u662f\u5176\u5728\u4e0d\u540c\u4efb\u52a1\u4e4b\u95f4\u7684\u7edf\u4e00\u67b6\u6784\u3002\u9884\u8bad\u7ec3\u67b6\u6784\u4e0e\u6700\u7ec8\u4e0b\u6e38\u67b6\u6784\u4e4b\u95f4\u7684\u5dee\u5f02\u6700\u5c0f\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E6%9E%B6%E6%9E%84Model_Architecture\"><\/span>\u6a21\u578b\u67b6\u6784(Model Architecture)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.7)<\/p>\n<blockquote>\n<p>BERT\u2019s model architecture is a multi-layer bidirectional Transformer encoder based on the original 
implementation described in Vaswani et al. (2017) and released in the tensor2tensor library.1 Because the use of Transformers has become common and our implementation is almost identical to the original, we will omit an exhaustive background description of the model architecture and refer readers to Vaswani et al. (2017) as well as excellent guides such as \u201cThe Annotated Transformer.\u201d<br \/>\nIn this work, we denote the number of layers (i.e., Transformer blocks) as L, the hidden size as H, and the number of self-attention heads as A. We primarily report results on two model sizes: BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024,A=16, Total Parameters=340M).<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.7)<\/p>\n<blockquote>\n<p>BERT\u7684\u6a21\u578b\u67b6\u6784\u662f\u57fa\u4e8eVaswani\u7b49\u4eba\uff082017\uff09\u63cf\u8ff0\u7684\u539f\u59cb\u5b9e\u73b0\uff0c\u5e76\u5728tensor2tensor\u5e93\u4e2d\u53d1\u5e03\u7684\u591a\u5c42\u53cc\u5411\u53d8\u6362\u5668\u7f16\u7801\u5668\u3002\u7531\u4e8e <code>Transformers<\/code> \u7684\u4f7f\u7528\u5df2\u53d8\u5f97\u666e\u904d\uff0c\u5e76\u4e14\u6211\u4eec\u7684\u5b9e\u73b0\u4e0e\u539f\u59cb\u7248\u672c\u51e0\u4e4e\u76f8\u540c\uff0c\u56e0\u6b64\u6211\u4eec\u5c06\u7701\u7565\u5bf9\u6a21\u578b\u67b6\u6784\u7684\u8be6\u7ec6\u80cc\u666f\u63cf\u8ff0\uff0c\u5e76\u5efa\u8bae\u8bfb\u8005\u53c2\u8003Vaswani\u7b49\u4eba\uff082017\uff09\u4ee5\u53ca\u4f18\u79c0\u7684\u6307\u5357\uff0c\u5982\u300aThe Annotated 
Transformer\u300b\u3002<\/p>\n<p>\u5728\u672c\u5de5\u4f5c\u4e2d\uff0c\u6211\u4eec\u5c06\u5c42\u6570\uff08\u5373\u53d8\u6362\u5668\u5757\uff09\u8bb0\u4e3aL\uff0c\u9690\u85cf\u5c42\u5927\u5c0f\u8bb0\u4e3aH\uff0c\u81ea\u6ce8\u610f\u529b\u5934\u7684\u6570\u91cf\u8bb0\u4e3aA\u3002\u6211\u4eec\u4e3b\u8981\u62a5\u544a\u4e24\u79cd\u6a21\u578b\u5927\u5c0f\u7684\u7ed3\u679c\uff1aBERTBASE\uff08L=12\uff0cH=768\uff0cA=12\uff0c\u603b\u53c2\u6570=1.1\u4ebf\uff09\u548cBERTLARGE\uff08L=24\uff0cH=1024\uff0cA=16\uff0c\u603b\u53c2\u6570=3.4\u4ebf\uff09\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%BE%93%E5%85%A5%E8%BE%93%E5%87%BA%E8%A1%A8%E5%BE%81InputOutput_Representations\"><\/span>\u8f93\u5165\/\u8f93\u51fa\u8868\u5f81(Input\/Output Representations)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.8)<\/p>\n<blockquote>\n<p>To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., &lt;Question, Answer&gt;) in one token sequence. Throughout this work, a \u201csentence\u201d can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A \u201csequence\u201d refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. We use WordPiece embeddings (Wu et al., 2016) with a 30,000 token vocabulary. The first<br \/>\ntoken of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence. We differentiate the sentences in two ways. First, we separate them with a special token ([SEP]). Second, we add a learned embedding to every token indicating whether it belongs to sentence A or sentence B. 
<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.8)<\/p>\n<blockquote>\n<p>\u4e3a\u4e86\u4f7fBERT\u80fd\u591f\u5904\u7406\u5404\u79cd\u4e0b\u6e38\u4efb\u52a1\uff0c\u6211\u4eec\u7684\u8f93\u5165\u8868\u5f81\u80fd\u591f\u660e\u786e\u5730\u8868\u793a<strong>\u5355\u4e2a\u53e5\u5b50<\/strong>\u548c<strong>\u4e00\u5bf9\u53e5\u5b50<\/strong>\uff08\u4f8b\u5982\uff0c&lt;\u95ee\u9898, \u7b54\u6848&gt;\uff09\u5728\u4e00\u4e2a <code>token<\/code> \u5e8f\u5217\u4e2d\u3002\u5728\u6211\u4eec\u7684\u901a\u7bc7\u6587\u7ae0\u4e2d\uff0c\u4e00<strong>\u4e2a\u201csentence\uff08\u53e5\u5b50\uff09\u201d\u53ef\u4ee5\u662f\u4efb\u610f\u8fde\u7eed\u7684\u6587\u672c\uff0c\u800c\u4e0d\u5fc5\u662f\u5b9e\u9645\u7684\u8bed\u8a00\u53e5\u5b50<\/strong>\u3002\u201csequence(\u5e8f\u5217)\u201d\u6307\u7684\u662f\u8f93\u5165\u5230BERT\u7684 <code>token<\/code> \u5e8f\u5217\uff0c\u8fd9\u53ef\u4ee5\u662f\u4e00\u4e2a\u5355\u72ec\u7684\u53e5\u5b50\u6216\u4e24\u4e2a\u53e5\u5b50\u7ec4\u5408\u5728\u4e00\u8d77\u3002<\/p>\n<p>\u6211\u4eec\u4f7f\u7528WordPiece\u5d4c\u5165\uff08Wu\u7b49\uff0c2016\uff09\uff0c\u8bcd\u6c47\u91cf\u4e3a30,000\u4e2a\u6807\u8bb0\u3002\u6bcf\u4e2a\u5e8f\u5217\u7684\u7b2c\u4e00\u4e2a <code>token<\/code> \u59cb\u7ec8\u662f\u4e00\u4e2a\u7279\u6b8a\u7684\u5206\u7c7b\u6807\u8bb0\uff08[CLS]\uff09\u3002\u4e0e\u8be5 <code>token<\/code> \u5bf9\u5e94\u7684\u6700\u7ec8\u9690\u85cf\u72b6\u6001\u7528\u4e8e\u5206\u7c7b\u4efb\u52a1\u7684\u805a\u5408\u5e8f\u5217\u8868\u793a\u3002\u5982\u679c\u662f\u53e5\u5b50\u5bf9\uff0c\u5219\u88ab\u6253\u5305\u6210\u4e00\u4e2a\u5355\u4e00\u5e8f\u5217\u3002\u6211\u4eec\u901a\u8fc7\u4e24\u79cd\u65b9\u5f0f\u6765\u533a\u5206\u53e5\u5b50\u3002\u9996\u5148\uff0c\u6211\u4eec<strong>\u7528\u4e00\u4e2a\u7279\u6b8atoken\uff08[SEP]\uff09\u5c06\u5b83\u4eec\u5206\u5f00<\/strong>\u3002\u5176\u6b21\uff0c\u6211\u4eec\u4e3a\u6bcf\u4e2a <code>token<\/code> 
<strong>\u6dfb\u52a0\u4e00\u4e2a\u5df2\u7ecf\u5b66\u8fc7\u7684embedding\u8bcd\u5411\u91cf<\/strong>\uff0c\u6307\u793a\u5b83\u5c5e\u4e8e\u53e5\u5b50A\u8fd8\u662f\u53e5\u5b50B\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8f93\u5165\u8f93\u51fa\u793a\u4f8b.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8f93\u5165\u8f93\u51fa\u793a\u4f8b.png\" alt=\"\" \/><\/a><\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E4%BB%BB%E5%8A%A11_Masked_LM\"><\/span>\u4efb\u52a11: Masked LM<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.9)<\/p>\n<blockquote>\n<p>Intuitively, it is reasonable to believe that a deep bidirectional model is strictly more powerful than either a left-to-right model or the shallow concatenation of a left-to-right and a right-to-left model. Unfortunately, standard conditional language models can only be trained left-to-right or right-to-left, since bidirectional conditioning would allow each word to indirectly \u201csee itself\u201d, and the model could trivially predict the target word in a multi-layered context. In order to train a deep bidirectional representation, we simply mask some percentage of the input tokens at random, and then predict those masked tokens. We refer to this procedure as a \u201cmasked LM\u201d (MLM), although it is often referred to as a Cloze task in the literature (Taylor, 1953). In this case, the final hidden vectors corresponding to the mask tokens are fed into an output softmax over the vocabulary, as in a standard LM. In all of our experiments, we mask 15% of all WordPiece tokens in each sequence at random. In contrast to denoising auto-encoders (Vincent et al., 2008), we only predict the masked words rather than reconstructing the entire input. 
Although this allows us to obtain a bidirectional pre-trained model, a downside is that we are creating a mismatch between pre-training and fine-tuning, since the [MASK] token does not appear during fine-tuning. To mitigate this, we do not always replace \u201cmasked\u201d words with the actual [MASK] token. The training data generator chooses 15% of the token positions at random for prediction. If the i-th token is chosen, we replace the i-th token with (1) the [MASK] token 80% of the time (2) a random token 10% of the time (3) the unchanged i-th token 10% of the time. Then, Ti will be used to predict the original token with cross entropy loss. <\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.9)<\/p>\n<blockquote>\n<p>\u76f4\u89c2\u4e0a\uff0c<strong>\u6df1\u5ea6\u53cc\u5411\u6a21\u578b\u7684\u80fd\u529b<\/strong>\u663e\u7136\u5f3a\u4e8e<strong>\u5355\u5411\u4ece\u5de6\u5230\u53f3<\/strong>\u7684\u6a21\u578b\u6216<strong>\u4ece\u5de6\u5230\u53f3<\/strong>\u4e0e<strong>\u4ece\u53f3\u5230\u5de6<\/strong>\u6a21\u578b\u7684\u6d45\u5c42\u62fc\u63a5\u3002\u4e0d\u5e78\u7684\u662f\uff0c\u6807\u51c6\u7684\u6761\u4ef6\u8bed\u8a00\u6a21\u578b\u53ea\u80fd\u4ece\u5de6\u5230\u53f3\u6216\u4ece\u53f3\u5230\u5de6\u8fdb\u884c\u8bad\u7ec3\uff0c\u56e0\u4e3a\u53cc\u5411\u6761\u4ef6\u4f1a\u4f7f\u6bcf\u4e2a\u8bcd\u95f4\u63a5\u5730\u201c\u770b\u5230\u81ea\u5df1\u201d\uff0c\u4ece\u800c\u4f7f\u6a21\u578b\u80fd\u591f\u5728\u591a\u5c42\u4e0a\u4e0b\u6587\u4e2d\u8f7b\u677e\u9884\u6d4b\u76ee\u6807\u8bcd\u3002<\/p>\n<p>\u4e3a\u4e86\u8bad\u7ec3\u6df1\u5ea6\u53cc\u5411\u8868\u5f81\uff0c\u6211\u4eec<strong>\u968f\u673a\u63a9\u76d6\u8f93\u5165<\/strong> <code>token<\/code> \u7684\u4e00\u5b9a\u767e\u5206\u6bd4\uff0c\u7136\u540e<strong>\u9884\u6d4b\u8fd9\u4e9b\u88ab\u63a9\u76d6\u7684<\/strong> <code>token<\/code> \u3002\u6211\u4eec\u5c06\u8fd9\u4e00\u8fc7\u7a0b\u79f0\u4e3a\u201c<strong>\u63a9\u7801\u8bed\u8a00\u6a21\u578b<\/strong>\u201d\uff08Masked 
LM\uff0cMLM\uff09\uff0c\u5c3d\u7ba1\u5728\u6587\u732e\u4e2d\u901a\u5e38\u79f0\u4e3a\u5b8c\u578b\u586b\u7a7a\u4efb\u52a1\uff08Taylor\uff0c1953\uff09\u3002\u5728\u8fd9\u79cd\u60c5\u51b5\u4e0b\uff0c\u4e0e\u63a9\u7801\u6807\u8bb0\u5bf9\u5e94\u7684\u6700\u7ec8\u9690\u85cf\u5411\u91cf\u88ab\u8f93\u5165\u5230\u4e00\u4e2a\u8f93\u51fasoftmax\u4e2d\uff0c\u7c7b\u4f3c\u4e8e\u6807\u51c6\u7684\u8bed\u8a00\u6a21\u578b\u3002\u5728\u6211\u4eec\u7684\u6240\u6709\u5b9e\u9a8c\u4e2d\uff0c\u6211\u4eec\u968f\u673a\u63a9\u76d6\u6bcf\u4e2a\u5e8f\u5217\u4e2d15%\u7684WordPiece <code>token<\/code> \u3002\u4e0e\u53bb\u566a\u81ea\u7f16\u7801\u5668\uff08Vincent\u7b49\uff0c2008\uff09\u4e0d\u540c\uff0c<strong>\u6211\u4eec\u53ea\u9884\u6d4b\u88ab\u63a9\u76d6\u7684\u8bcd\uff0c\u800c\u4e0d\u662f\u91cd\u5efa\u6574\u4e2a\u8f93\u5165<\/strong>\u3002<\/p>\n<p>\u5c3d\u7ba1\u8fd9\u4f7f\u6211\u4eec\u80fd\u591f\u83b7\u5f97\u4e00\u4e2a\u53cc\u5411\u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u4f46\u4e00\u4e2a\u7f3a\u70b9\u662f\u6211\u4eec\u5728\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u4e4b\u95f4\u4ea7\u751f\u4e86\u4e0d\u5339\u914d\uff0c\u56e0\u4e3a[MASK]\u6807\u8bb0\u5728\u5fae\u8c03\u8fc7\u7a0b\u4e2d\u5e76\u4e0d\u5b58\u5728\u3002\u4e3a\u4e86\u7f13\u89e3\u8fd9\u4e00\u95ee\u9898\uff0c\u6211\u4eec\u5e76\u4e0d\u603b\u662f\u7528\u5b9e\u9645\u7684[MASK]\u6807\u8bb0\u66ff\u6362\u201c\u88ab\u63a9\u76d6\u201d\u7684\u8bcd\u3002\u8bad\u7ec3\u6570\u636e\u751f\u6210\u5668\u968f\u673a\u9009\u62e915%\u7684 <code>token<\/code> \u4f4d\u7f6e\u8fdb\u884c\u9884\u6d4b\u3002\u5982\u679c\u9009\u62e9\u4e86\u7b2ci\u4e2a <code>token<\/code> \uff0c\u6211\u4eec\u4f1a\u5728\u4ee5\u4e0b\u60c5\u51b5\u4e0b\u66ff\u6362\u7b2ci\u4e2a <code>token<\/code> \uff1a(1) 80%\u7684\u65f6\u95f4\u7528[MASK]<code>token<\/code>\uff0c(2) 10%\u7684\u65f6\u95f4\u7528\u968f\u673a<code>token<\/code>\uff0c(3) 10%\u7684\u65f6\u95f4\u4fdd\u6301\u7b2ci\u4e2a <code>token<\/code> 
\u4e0d\u53d8\u3002\u7136\u540e\uff0c<code>Ti<\/code>\u5c06\u7528\u4e8e\u901a\u8fc7\u4ea4\u53c9\u71b5\u635f\u5931\u9884\u6d4b\u539f\u59cb\u6807\u8bb0\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E4%BB%BB%E5%8A%A12%EF%BC%9A%E4%B8%8B%E4%B8%80%E4%B8%AA%E5%8F%A5%E5%AD%90%E9%A2%84%E6%B5%8B%EF%BC%88NSP%EF%BC%89\"><\/span>\u4efb\u52a12\uff1a\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b\uff08NSP\uff09<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.10)<\/p>\n<blockquote>\n<p>Many important downstream tasks such as Question Answering (QA) and Natural Language Inference (NLI) are based on understanding the relationship between two sentences, which is not directly captured by language modeling. In order to train a model that understands sentence relationships, we pre-train for a binarized next sentence prediction task that can be trivially generated from any monolingual corpus. Specifically, when choosing the sentences A and B for each pretraining example, 50% of the time B is the actual next sentence that follows A (labeled as IsNext),<br \/>\nand 50% of the time it is a random sentence from the corpus (labeled as NotNext). As we show in Figure 1, C is used for next sentence prediction (NSP). 
Despite its simplicity, we demonstrate in Section 5.1 that pre-training towards this task is very beneficial to both QA and NLI.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.10)<\/p>\n<blockquote>\n<p>\u8bb8\u591a\u91cd\u8981\u7684\u4e0b\u6e38\u4efb\u52a1\uff0c\u5982\u95ee\u7b54\uff08QA\uff09\u548c\u81ea\u7136\u8bed\u8a00\u63a8\u7406\uff08NLI\uff09\uff0c\u90fd\u57fa\u4e8e\u7406\u89e3\u4e24\u4e2a\u53e5\u5b50\u4e4b\u95f4\u7684\u5173\u7cfb\uff0c\u800c\u8fd9\u4e00\u70b9\u5e76\u6ca1\u6709\u88ab\u8bed\u8a00\u5efa\u6a21\u76f4\u63a5\u6355\u6349\u3002\u4e3a\u4e86\u8bad\u7ec3\u4e00\u4e2a\u7406\u89e3\u53e5\u5b50\u5173\u7cfb\u7684\u6a21\u578b\uff0c\u6211\u4eec\u9884\u8bad\u7ec3\u4e86\u4e00\u4e2a<strong>\u4e8c\u5143\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b<\/strong>\u4efb\u52a1\uff0c\u8be5\u4efb\u52a1\u53ef\u4ee5\u4ece\u4efb\u4f55\u5355\u8bed\u8bed\u6599\u5e93\u4e2d\u7b80\u5355\u751f\u6210\u3002<\/p>\n<p>\u5177\u4f53\u800c\u8a00\uff0c\u5728\u4e3a\u6bcf\u4e2a\u9884\u8bad\u7ec3\u793a\u4f8b\u9009\u62e9\u53e5\u5b50A\u548cB\u65f6\uff0c<strong>50%<\/strong>\u7684\u65f6\u95f4 <strong>B \u662f\u5b9e\u9645\u8ddf\u968f A \u7684\u4e0b\u4e00\u4e2a\u53e5\u5b50<\/strong>\uff08\u6807\u8bb0\u4e3aIsNext\uff09\uff0c\u800c <strong>50%<\/strong> \u7684\u65f6\u95f4\u5219\u662f<strong>\u6765\u81ea\u8bed\u6599\u5e93\u7684\u968f\u673a\u53e5\u5b50<\/strong>\uff08\u6807\u8bb0\u4e3aNotNext\uff09\u3002\u6b63\u5982\u56fe1\u6240\u793a\uff0c C \u7528\u4e8e\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b\uff08NSP\uff09\u3002\u5c3d\u7ba1\u8fd9\u4e2a\u4efb\u52a1\u5f88\u7b80\u5355\uff0c\u4f46\u6211\u4eec\u5728\u7b2c5.1\u8282\u4e2d\u8bc1\u660e\u4e86\uff0c\u9488\u5bf9\u8fd9\u4e00\u4efb\u52a1\u7684\u9884\u8bad\u7ec3\u5bf9\u95ee\u7b54\u548c\u81ea\u7136\u8bed\u8a00\u63a8\u7406\u90fd\u6709\u5f88\u5927\u5e2e\u52a9\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E9%A2%84%E8%AE%AD%E7%BB%83%E6%95%B0%E6%8D%AEPre-training_data\"><\/span>\u9884\u8bad\u7ec3\u6570\u636e(Pre-training 
data)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.11)<\/p>\n<blockquote>\n<p>The pre-training procedure largely follows the existing literature on language model pre-training. For the pre-training corpus we use the BooksCorpus (800M words) (Zhu et al., 2015) and English Wikipedia (2,500M words). For Wikipedia we extract only the text passages and ignore lists, tables, and headers. It is critical to use a document-level corpus rather than a shuffled sentence-level corpus such as the Billion Word Benchmark (Chelba et al., 2013) in order to extract long contiguous sequences.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.11)<\/p>\n<blockquote>\n<p>\u9884\u8bad\u7ec3\u8fc7\u7a0b\u57fa\u672c\u4e0a\u9075\u5faa\u73b0\u6709\u7684\u8bed\u8a00\u6a21\u578b\u9884\u8bad\u7ec3\u6587\u732e\u3002\u6211\u4eec\u4f7f\u7528\u7684\u9884\u8bad\u7ec3\u8bed\u6599\u5e93\u5305\u62ec<strong>BooksCorpus<\/strong>\uff088\u4ebf\u4e2a\u8bcd\uff09\uff08Zhu\u7b49\uff0c2015\uff09\u548c<strong>\u82f1\u8bed\u7ef4\u57fa\u767e\u79d1<\/strong>\uff0825\u4ebf\u4e2a\u8bcd\uff09\u3002\u5bf9\u4e8e\u7ef4\u57fa\u767e\u79d1\uff0c\u6211\u4eec<strong>\u4ec5\u63d0\u53d6\u6587\u672c\u6bb5\u843d<\/strong>\uff0c\u5ffd\u7565\u5217\u8868\u3001\u8868\u683c\u548c\u6807\u9898\u3002\u4f7f\u7528\u6587\u6863\u7ea7\u8bed\u6599\u5e93\u800c\u975e\u50cf\u4ebf\u8bcd\u57fa\u51c6\uff08Billion Word Benchmark\uff0cChelba\u7b49\uff0c2013\uff09\u8fd9\u6837\u7684\u6253\u4e71\u53e5\u5b50\u7ea7\u8bed\u6599\u5e93\u662f\u81f3\u5173\u91cd\u8981\u7684\uff0c\u4ee5\u4fbf\u63d0\u53d6\u957f\u7684\u8fde\u7eed\u5e8f\u5217\u3002<\/p>\n<\/blockquote>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%86%85%E5%AE%B9%E6%80%BB%E7%BB%93\"><\/span>\u8bba\u6587\u5185\u5bb9\u603b\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E9%A2%84%E8%AE%AD%E7%BB%83%E9%98%B6%E6%AE%B5\"><\/span>\u9884\u8bad\u7ec3\u9636\u6bb5<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728\u9884\u8bad\u7ec3\u9636\u6bb5\uff0c<code>BERT<\/code> \u4f7f\u7528<strong>\u5927\u91cf\u65e0\u6807\u6ce8\u7684\u6587\u672c\u6570\u636e<\/strong>\u8fdb\u884c\u8bad\u7ec3\uff0c\u4e3b\u8981\u5305\u62ec\u4ee5\u4e0b\u4e24\u4e2a\u4efb\u52a1\uff1a<\/p>\n<p><strong>\u7b2c\u4e00\u4e2a\u4efb\u52a1<\/strong>\uff1a\u63a9\u853d\u8bed\u8a00\u6a21\u578b\uff08Masked Language Model, MLM\uff09<\/p>\n<ul>\n<li><strong>\u8fc7\u7a0b<\/strong>\uff1a\u968f\u673a\u9009\u62e9\u8f93\u5165\u53e5\u5b50\u4e2d\u7684\u4e00\u4e9b\u8bcd\uff0c\u5e76\u5c06\u5b83\u4eec\u66ff\u6362\u4e3a\u4e00\u4e2a\u7279\u6b8a\u7684[MASK]\u6807\u8bb0\u3002\u6a21\u578b\u7684\u76ee\u6807\u662f\u6839\u636e\u4e0a\u4e0b\u6587\u9884\u6d4b\u88ab\u63a9\u853d\u7684\u8bcd\uff0c\u5373\uff1a\u6316\u7a7a\u586b\u7a7a\u3002<\/li>\n<\/ul>\n<pre><code class=\"language-python\"># \u7406\u89e3\u793a\u4f8b\uff1a\n\n# \u8f93\u5165\u53e5\u5b50\uff1a\u201c\u6211\u559c\u6b22\u5403\u82f9\u679c\u3002\u201d\n# \u968f\u673a\u66ff\u6362\uff1a\u201c\u6211\u559c\u6b22\u5403[MASK]\u3002\u201d\n# \u6a21\u578b\u9884\u6d4b\uff1a\u6a21\u578b\u9700\u8981\u9884\u6d4b\u88ab\u63a9\u853d\u7684\u8bcd\uff0c\u4f8b\u5982\u201c\u82f9\u679c\u201d\u3002<\/code><\/pre>\n<p><strong>\u7b2c\u4e8c\u4e2a\u4efb\u52a1<\/strong>\uff1a\u4e0b\u4e00\u4e2a\u53e5\u5b50\u9884\u6d4b\uff08Next Sentence Prediction, NSP\uff09<\/p>\n<ul>\n<li><strong>\u8fc7\u7a0b<\/strong>\uff1a\u6a21\u578b\u63a5\u6536\u4e00\u5bf9\u53e5\u5b50\uff0c\u5224\u65ad\u53e5\u5b50A\u548c\u53e5\u5b50B\u662f\u5426\u6709\u76f8\u5173\u6027\uff0c\u5176\u4e2d[NSP]\u662f\u53e5\u5b50\u662f\u5426\u6709\u76f8\u5173\u6027\u7684\u5206\u7c7b\u6807\u7b7e\u3002<\/li>\n<\/ul>\n<pre><code class=\"language-python\"># \u7406\u89e3\u793a\u4f8b\uff1a\n# \u8f93\u5165\u53e5\u5b50\u5bf9\uff1a\n#   \u53e5\u5b50A\uff1a\u201c\u6211\u559c\u6b22\u770b\u7535\u5f71\u3002\u201d\n#   
\u53e5\u5b50B1\uff1a\u201c\u8fd9\u90e8\u7535\u5f71\u975e\u5e38\u597d\u3002\u201d\uff08\u662f\u4e0b\u4e00\u4e2a\u53e5\u5b50\uff09\n#   \u53e5\u5b50B2\uff1a\u201c\u5929\u6c14\u5f88\u597d\u3002\u201d\uff08\u4e0d\u662f\u4e0b\u4e00\u4e2a\u53e5\u5b50\uff09\n# \u6a21\u578b\u9884\u6d4b\uff1a\u6a21\u578b\u9700\u8981\u9884\u6d4b\u53e5\u5b50A\u548c\u53e5\u5b50B\u662f\u5426\u6709\u76f8\u5173\u6027\uff0c\u5373\uff1a\u5224\u65ad\u53e5\u5b50B1\u662f\u5426\u662f\u53e5\u5b50A\u7684\u4e0b\u4e00\u4e2a\u53e5\u5b50\u3002<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"%E5%BE%AE%E8%B0%83%E9%98%B6%E6%AE%B5\"><\/span>\u5fae\u8c03\u9636\u6bb5<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728\u5fae\u8c03\u9636\u6bb5\uff0c<code>BERT<\/code> \u5728\u7279\u5b9a\u7684\u4e0b\u6e38\u4efb\u52a1\u4e0a\u8fdb\u884c\u8bad\u7ec3\uff0c\u901a\u5e38\u53ea\u9700\u6dfb\u52a0\u4e00\u4e2a\u8f93\u51fa\u5c42\u3002\u5fae\u8c03\u7684\u8fc7\u7a0b\u5982\u4e0b\uff1a<\/p>\n<ul>\n<li><strong>\u8fc7\u7a0b<\/strong>\uff1a\u5c06\u9884\u8bad\u7ec3\u7684 <code>BERT<\/code> \u6a21\u578b\u52a0\u8f7d\u5230\u7279\u5b9a\u4efb\u52a1\u4e2d\uff08\u5982\u95ee\u7b54\u3001\u60c5\u611f\u5206\u6790\u7b49\uff09\uff0c\u5e76\u5728\u6807\u6ce8\u6570\u636e\u4e0a\u8fdb\u884c\u8bad\u7ec3\u3002\n<pre><code class=\"language-python\"># \u7406\u89e3\u793a\u4f8b\uff1a\n# \u8f93\u5165\u95ee\u7b54\u5bf9\uff1a(\u5bf9\u4e8e\u95ee\u7b54\u5bf9\u4efb\u52a1)\n#   \u95ee\u9898\uff1a\u201cBERT\u6a21\u578b\u662f\u4ec0\u4e48\uff1f\u201d\n#   \u4e0a\u4e0b\u6587\uff1a\u201cBERT\u6a21\u578b\u662f\u4e00\u4e2a\u5f3a\u5927\u7684\u8bed\u8a00\u6a21\u578b\u3002\u201d\n# \u6a21\u578b\u8f93\u51fa\uff1a\u7b54\u6848\u7684\u5f00\u59cb\u548c\u7ed3\u675f\u4f4d\u7f6e\u3002<\/code><\/pre>\n<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"GLM_%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5\"><\/span>GLM \u6a21\u578b\u8bad\u7ec3\u7b56\u7565<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<h3><span class=\"ez-toc-section\" 
id=\"%E8%83%8C%E6%99%AF%E4%BB%8B%E7%BB%8D-2\"><\/span>\u80cc\u666f\u4ecb\u7ecd<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>GLM\uff08General Language Model\uff09\u662f\u56fd\u5185\u667a\u666e\u63d0\u51fa\u7684\u4e00\u79cd\u901a\u7528\u7684\u8bed\u8a00\u6a21\u578b\uff0c\u65e8\u5728\u901a\u8fc7\u66f4\u5f3a\u7684\u4e0a\u4e0b\u6587\u7406\u89e3\u548c\u7075\u6d3b\u7684\u4efb\u52a1\u9002\u5e94\u80fd\u529b\u6765\u63d0\u5347\u81ea\u7136\u8bed\u8a00\u5904\u7406\u7684\u6548\u679c\u3002<\/p>\n<p><strong>\u8d21\u732e<\/strong>\uff1a<br \/>\n<code>GLM<\/code> \u521b\u65b0\u7684\u4f7f\u7528\u4e86\u4e00\u79cd <code>\u57fa\u4e8e\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145\u7684\u901a\u7528\u8bed\u8a00\u6a21\u578b<\/code> \u3002<\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3-2\"><\/span>\u8bba\u6587\u9605\u8bfb\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E6%91%98%E8%A6%81Abstract-2\"><\/span>\u6458\u8981(Abstract)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.1)<\/p>\n<blockquote>\n<p>There have been various types of pretraining architectures including autoencoding models (e.g., BERT), autoregressive models (e.g., GPT), and encoder-decoder models (e.g., T5). However, none of the pretraining frameworks performs the best for all tasks of three main categories including natural language understanding (NLU), unconditional generation, and conditional generation. We propose a General Language Model (GLM) based on autoregressive blank infilling to address this challenge. GLM improves blank filling pretraining by adding 2D positional encodings and allowing an arbitrary order to predict spans, which results in performance gains over BERT and T5 on NLU tasks. Meanwhile, GLM can be pretrained for different types of tasks by varying the number and lengths of blanks. 
On a wide range of tasks across NLU, conditional and unconditional generation, GLM outperforms BERT, T5, and GPT given the same model sizes and data, and achieves the best performance from a single pretrained model with 1.25\u00d7 parameters of BERTLarge, demonstrating its generalizability to different downstream tasks.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.1)<\/p>\n<blockquote>\n<p>\u9884\u8bad\u7ec3\u67b6\u6784\u6709\u591a\u79cd\u7c7b\u578b\uff0c\u5305\u62ec\u81ea\u7f16\u7801\u6a21\u578b\uff08\u5982<code>BERT<\/code>\uff09\u3001\u81ea\u56de\u5f52\u6a21\u578b\uff08\u5982<code>GPT<\/code>\uff09\u548c\u7f16\u7801-\u89e3\u7801\u6a21\u578b\uff08\u5982<code>T5<\/code>\uff09\u3002\u7136\u800c\uff0c\u6ca1\u6709\u4efb\u4f55\u9884\u8bad\u7ec3\u6846\u67b6\u5728\u81ea\u7136\u8bed\u8a00\u7406\u89e3\uff08NLU\uff09\u3001\u65e0\u6761\u4ef6\u751f\u6210\u548c\u6709\u6761\u4ef6\u751f\u6210\u8fd9\u4e09\u5927\u7c7b\u4efb\u52a1\u4e2d\u8868\u73b0\u6700\u4f73\u3002\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u6311\u6218\uff0c\u6211\u4eec\u63d0\u51fa\u4e86\u4e00\u79cd<strong>\u57fa\u4e8e\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145<\/strong>\u7684\u901a\u7528\u8bed\u8a00\u6a21\u578b\uff08GLM\uff09\u3002<\/p>\n<p><code>GLM<\/code> \u901a\u8fc7\u6dfb\u52a0<strong>\u4e8c\u7ef4\u4f4d\u7f6e\u7f16\u7801<\/strong>\u5e76\u5141\u8bb8<strong>\u4ee5\u4efb\u610f\u987a\u5e8f\u9884\u6d4b\u8de8\u5ea6<\/strong>\u6765\u6539\u8fdb<strong>\u7a7a\u767d\u586b\u5145\u9884\u8bad\u7ec3<\/strong>\uff0c\u8fd9\u4f7f\u5f97\u5176\u5728NLU\u4efb\u52a1\u4e0a\u76f8\u8f83\u4e8e <code>BERT<\/code> \u548c <code>T5<\/code> \u53d6\u5f97\u4e86\u6027\u80fd\u63d0\u5347\u3002\u540c\u65f6\uff0c<code>GLM<\/code> 
\u53ef\u4ee5\u901a\u8fc7\u6539\u53d8\u7a7a\u767d\u7684\u6570\u91cf\u548c\u957f\u5ea6\u6765\u9488\u5bf9\u4e0d\u540c\u7c7b\u578b\u7684\u4efb\u52a1\u8fdb\u884c\u9884\u8bad\u7ec3\u3002\u5728\u81ea\u7136\u8bed\u8a00\u7406\u89e3\u3001\u6709\u6761\u4ef6\u548c\u65e0\u6761\u4ef6\u751f\u6210\u7b49\u5e7f\u6cdb\u4efb\u52a1\u4e2d\uff0c<code>GLM<\/code> \u5728\u76f8\u540c\u6a21\u578b\u5927\u5c0f\u548c\u6570\u636e\u7684\u60c5\u51b5\u4e0b\u8d85\u8d8a\u4e86BERT\u3001T5\u548cGPT\uff0c\u5e76\u4e14\u5728\u53c2\u6570\u4e3aBERTLarge\u76841.25\u500d\u7684\u5355\u4e00\u9884\u8bad\u7ec3\u6a21\u578b\u4e2d\u8fbe\u5230\u4e86\u6700\u4f73\u6027\u80fd\uff0c\u5c55\u793a\u4e86\u5176\u5bf9\u4e0d\u540c\u4e0b\u6e38\u4efb\u52a1\u7684\u6cdb\u5316\u80fd\u529b\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E5%BC%95%E8%A8%80Introduction-2\"><\/span>\u5f15\u8a00(Introduction)<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.2)<\/p>\n<blockquote>\n<p>In this paper, we propose a pretraining framework named GLM (General Language Model), based on autoregressive blank infilling. We randomly blank out continuous spans of tokens from the input text, following the idea of autoencoding, and train the model to sequentially reconstruct the spans, following the idea of autoregressive pretraining (see Figure 1). While blanking filling has been used in T5 (Raffel et al., 2020) for text-to-text pretraining, we propose two improvements, namely span shuffling and 2D positional encoding. Empirically, we show that with the same amount of parameters and computational cost, GLM significantly outperforms BERT on the SuperGLUE benchmark by a large margin of 4.6% \u2013 5.0% and outperforms RoBERTa and BART when pretrained on a corpus of similar size (158GB). 
GLM also significantly outperforms T5 on NLU and generation tasks with fewer parameters and data.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.2)<\/p>\n<blockquote>\n<p>\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u63d0\u51fa\u4e86\u4e00\u79cd\u540d\u4e3aGLM\uff08\u901a\u7528\u8bed\u8a00\u6a21\u578b\uff09\u7684\u9884\u8bad\u7ec3\u6846\u67b6\uff0c\u57fa\u4e8e<strong>\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145<\/strong>\u3002\u6211\u4eec<strong>\u968f\u673a<\/strong>\u5c06\u8f93\u5165\u6587\u672c\u4e2d\u7684<strong>\u8fde\u7eed\u8de8\u5ea6<\/strong>\u7684 <code>token<\/code> \u8fdb\u884c\u906e\u76d6\uff0c\u9075\u5faa\u81ea\u7f16\u7801\u7684\u601d\u60f3\uff0c\u5e76\u8bad\u7ec3\u6a21\u578b\u6309\u987a\u5e8f\u91cd\u5efa\u8fd9\u4e9b\u8de8\u5ea6\uff0c\u9075\u5faa\u81ea\u56de\u5f52\u9884\u8bad\u7ec3\u7684\u7406\u5ff5\uff08\u89c1\u56fe1\uff09\u3002\u867d\u7136\u7a7a\u767d\u586b\u5145\u5df2\u5728 <code>T5<\/code>\uff08Raffel\u7b49\uff0c2020\uff09\u4e2d\u7528\u4e8e\u6587\u672c\u5230\u6587\u672c\u7684\u9884\u8bad\u7ec3\uff0c\u4f46\u6211\u4eec\u63d0\u51fa\u4e86<strong>\u4e24\u4e2a\u6539\u8fdb<\/strong>\uff0c\u5373<strong>\u8de8\u5ea6\u6d17\u724c\uff08span shuffling\uff09<\/strong>\u548c<strong>\u4e8c\u7ef4\u4f4d\u7f6e\u7f16\u7801<\/strong>\u3002\u901a\u8fc7\u5b9e\u8bc1\u7814\u7a76\uff0c\u6211\u4eec\u8868\u660e\uff0c\u5728\u76f8\u540c\u53c2\u6570\u548c\u8ba1\u7b97\u6210\u672c\u7684\u60c5\u51b5\u4e0b\uff0c<code>GLM<\/code> \u5728SuperGLUE\u57fa\u51c6\u6d4b\u8bd5\u4e2d\u663e\u8457\u8d85\u8d8a <code>BERT <\/code>\uff0c\u63d0\u5347\u5e45\u5ea6\u8fbe\u52304.6%\u81f35.0%\uff0c\u5e76\u5728\u4e0eRoBERTa\u548cBART\u7684\u9884\u8bad\u7ec3\u4e2d\u8868\u73b0\u66f4\u4f73\uff0c\u4f7f\u7528\u7684\u8bed\u6599\u5e93\u89c4\u6a21\u76f8\u4f3c\uff08158GB\uff09\u3002<code>GLM<\/code> \u5728\u81ea\u7136\u8bed\u8a00\u7406\u89e3\u548c\u751f\u6210\u4efb\u52a1\u4e2d\u4e5f\u663e\u8457\u8d85\u8d8a 
<code>T5<\/code>\uff0c\u4e14\u6240\u9700\u53c2\u6570\u548c\u6570\u636e\u66f4\u5c11\u3002<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.3)<\/p>\n<blockquote>\n<p>Inspired by Pattern-Exploiting Training (PET) (Schick and Sch\u00fctze, 2020a), we reformulate NLU tasks as manually-crafted cloze questions that mimic human language. Different from the BERT-based models used by PET, GLM can naturally handle multi-token answers to the cloze question via autoregressive blank filling.<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.3)<\/p>\n<blockquote>\n<p>\u53d7\u5230\u6a21\u5f0f\u5229\u7528\u8bad\u7ec3\uff08PET\uff09\uff08Schick\u548cSch\u00fctze\uff0c2020a\uff09\u7684\u542f\u53d1\uff0c\u6211\u4eec\u5c06\u81ea\u7136\u8bed\u8a00\u7406\u89e3\uff08NLU\uff09\u4efb\u52a1\u91cd\u65b0\u8868\u8ff0\u4e3a\u624b\u5de5\u5236\u4f5c\u7684\u586b\u7a7a\u95ee\u9898\uff0c\u4ee5\u6a21\u62df\u4eba\u7c7b\u8bed\u8a00\u3002\u4e0ePET\u4f7f\u7528\u7684\u57fa\u4e8eBERT\u7684\u6a21\u578b\u4e0d\u540c\uff0cGLM\u53ef\u4ee5\u901a\u8fc7\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145\u81ea\u7136\u5730\u5904\u7406\u586b\u7a7a\u95ee\u9898\u7684\u591a<code>token<\/code>\u7b54\u6848\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/GLM\u6a21\u578b\u7ed3\u6784\u793a\u610f\u56fe.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/GLM\u6a21\u578b\u7ed3\u6784\u793a\u610f\u56fe.png\" alt=\"\" \/><\/a><\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong>(NO.4)<\/p>\n<blockquote>\n<p>Figure 2: GLM pretraining. (a) The original text is [x1, x2, x3, x4, x5, x6]. Two spans [x3] and [x5, x6] are sampled. (b) Replace the sampled spans with [M] in Part A, and shuffle the spans in Part B. (c) GLM autoregressively generates Part B. Each span is prepended with [S] as input and appended with [E] as output. 
2D positional encoding represents inter- and intra-span positions. (d) Self-attention mask. Grey areas are masked out. Part A tokens can attend to themselves (blue frame) but not B. Part B tokens can attend to A and their antecedents in B (yellow and green frames correspond to the two spans). [M] := [MASK], [S] := [START], and [E] := [END].<\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong>(NO.4)<\/p>\n<blockquote>\n<p>\u56fe2\uff1aGLM\u9884\u8bad\u7ec3\u3002<\/p>\n<p>(a) \u539f\u59cb\u6587\u672c\u4e3a[x1, x2, x3, x4, x5, x6]\u3002\u4ece\u4e2d\u62bd\u53d6\u4e24\u4e2a\u8de8\u5ea6[x3]\u548c[x5, x6]\u3002<br \/>\n(b) \u5728A\u90e8\u5206\u7528[M]\u66ff\u6362\u62bd\u53d6\u7684\u8de8\u5ea6\uff0c\u5e76\u5728B\u90e8\u5206\u5bf9\u8de8\u5ea6\u8fdb\u884c\u6d17\u724c\u3002<br \/>\n(c) GLM\u81ea\u56de\u5f52\u751f\u6210B\u90e8\u5206\u3002\u6bcf\u4e2a\u8de8\u5ea6\u4ee5[S]\u4f5c\u4e3a\u8f93\u5165\uff0c\u8f93\u51fa\u4ee5[E]\u7ed3\u675f\u3002\u4e8c\u7ef4\u4f4d\u7f6e\u7f16\u7801\u8868\u793a\u8de8\u5ea6\u5185\u5916\u7684\u4f4d\u7f6e\u3002<br \/>\n(d) \u81ea\u6ce8\u610f\u529b\u63a9\u7801\u3002\u7070\u8272\u533a\u57df\u88ab\u5c4f\u853d\u3002A\u90e8\u5206\u7684\u6807\u8bb0\u53ef\u4ee5\u76f8\u4e92\u5173\u6ce8\uff08\u84dd\u6846\uff09\uff0c\u4f46\u4e0d\u80fd\u5173\u6ce8B\u90e8\u5206\u3002B\u90e8\u5206\u7684\u6807\u8bb0\u53ef\u4ee5\u5173\u6ce8A\u90e8\u5206\u53ca\u5176\u5728B\u4e2d\u7684\u524d\u9a71\uff08\u9ec4\u8272\u548c\u7eff\u8272\u6846\u5bf9\u5e94\u4e8e\u4e24\u4e2a\u8de8\u5ea6\uff09\u3002[M] := [MASK]\uff0c[S] := [START]\uff0c\u548c[E] := [END]\u3002<\/p>\n<\/blockquote>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%86%85%E5%AE%B9%E6%80%BB%E7%BB%93-2\"><\/span>\u8bba\u6587\u5185\u5bb9\u603b\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>GLM\u8bad\u7ec3\u8fc7\u7a0b\uff1a<br 
\/>\n<strong>\u7b2c\u4e00\u6b65<\/strong>\uff1a\u51c6\u5907\u6570\u636e\uff0c\u4ece\u53e5\u5b50\u4e2d\u968f\u673a\u9009\u62e9\u4e00\u4e9b<code>token<\/code>\u8fdb\u884c\u906e\u853d\u3002<\/p>\n<pre><code class=\"language-python\"># \u793a\u4f8b\u7406\u89e3\uff1a\n\n# \u539f\u59cb\u6587\u672c\uff1a[\u6211, \u559c\u6b22, \u5b66\u4e60, \u4eba\u5de5\u667a\u80fd, \u548c, \u673a\u5668\u5b66\u4e60]\n#         [x1, x2,   x3,   x4,    x5,     x6]<\/code><\/pre>\n<p><strong>\u7b2c\u4e8c\u6b65<\/strong>\uff1a\u968f\u673a\u906e\u853d\uff0c\u5206\u4e3a\u4e24\u4e2a\u90e8\u5206<\/p>\n<pre><code class=\"language-python\"># \u968f\u673a\u906e\u853d\u8bcd\u4e3a\uff1a\u5373\u201c\u5b66\u4e60\u201d\u548c\u201c\u548c\u673a\u5668\u5b66\u4e60\u201d\n# PartA: [\u6211, \u559c\u6b22, [M], \u4eba\u5de5\u667a\u80fd, [M] ]\n# PartB: [[\u548c, \u673a\u5668\u5b66\u4e60], [\u5b66\u4e60]]<\/code><\/pre>\n<p><strong>\u7b2c\u4e09\u6b65<\/strong>\uff1a\u4f4d\u7f6e\u7f16\u7801<\/p>\n<ul>\n<li>\u8fc7\u7a0b\uff1a<code>GLM<\/code> \u4f7f\u7528\u4e8c\u7ef4\u4f4d\u7f6e\u7f16\u7801\u6765\u8868\u793a\u6bcf\u4e2a\u6807\u8bb0\u5728\u5e8f\u5217\u4e2d\u7684\u4f4d\u7f6e\u3002\u8fd9\u79cd\u7f16\u7801\u65b9\u5f0f\u80fd\u591f\u5e2e\u52a9\u6a21\u578b\u7406\u89e3\u6807\u8bb0\u4e4b\u95f4\u7684\u76f8\u5bf9\u4f4d\u7f6e\u5173\u7cfb\u3002<\/li>\n<\/ul>\n<pre><code class=\"language-python\">#       \u6211, \u559c\u6b22, [M], \u4eba\u5de5\u667a\u80fd, [M]  [S]  \u548c  \u673a\u5668\u5b66\u4e60 [S]  \u5b66\u4e60\n# P1    0   1     2     3      4    4    4    4      2    2    \n# P2    0   0     0     0      0    1    2    3      1    2<\/code><\/pre>\n<p><strong>\u7b2c\u56db\u6b65<\/strong>\uff1a\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145<\/p>\n<pre><code class=\"language-python\">#   \u521d\u59cb\u8f93\u5165\uff1a \u6211, \u559c\u6b22\n\n#   \u7b2c\u4e00\u8f6e\u81ea\u56de\u5f52\n#   \u8f93\u5165\uff1a \u6211, \u559c\u6b22 ,[S]\n#   \u8fd4\u56de\uff1a \u6211, \u559c\u6b22 ,[S], \u548c\n\n#   
\u7b2c\u4e8c\u8f6e\u81ea\u56de\u5f52\n#   \u8f93\u5165\uff1a \u6211, \u559c\u6b22 ,[S], \u548c\n#   \u8fd4\u56de\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60\n\n#   \u7b2c\u4e09\u8f6e\u81ea\u56de\u5f52\n#   \u8f93\u5165\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60\n#   \u8fd4\u56de\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E]\n\n#   \u7b2c\u56db\u8f6e\u81ea\u56de\u5f52\n#   \u8f93\u5165\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E]\n#   \u8fd4\u56de\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E], [S]\n\n#   \u7b2c\u4e94\u8f6e\u81ea\u56de\u5f52\n#   \u8f93\u5165\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E], [S]\n#   \u8fd4\u56de\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E], [S], \u5b66\u4e60\n\n#   \u6700\u540e\u8f93\u51fa\uff1a \u6211, \u559c\u6b22 ,[S], \u548c, \u673a\u5668\u5b66\u4e60, [E], [S], \u5b66\u4e60, [E]\n<\/code><\/pre>\n<h2><span class=\"ez-toc-section\" id=\"%E6%80%BB%E7%BB%93\"><\/span>\u603b\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li><code>BERT<\/code> \u548c <code>GLM<\/code> \u6a21\u578b\u7684\u5171\u540c\u70b9\uff1a\u9884\u8bad\u7ec3\u65f6\u90fd\u662f\u4f7f\u7528\u6d77\u91cf\u7684\u65e0\u6807\u6ce8\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\u3002<\/li>\n<li><code>BERT<\/code> \u548c <code>GLM<\/code> \u6a21\u578b\u7684\u4e0d\u540c\u70b9\uff1a\n<ul>\n<li><code>BERT<\/code>\u6a21\u578b\u5728\u9884\u8bad\u7ec3\u65f6\u6316\u7a7a\u586b\u7a7a\u65f6\uff0c\u662f\u968f\u673a\u6316<code>\u5355\u4e2atoken<\/code>\uff0c\u800c<code>GLM<\/code>\u6a21\u578b\u662f\u968f\u673a\u6316<code>\u8fde\u7eed\u7684token<\/code>\u3002<\/li>\n<li><code>BERT<\/code>\u6a21\u578b\u7ed3\u6784\u662f<code>Encoder-Only\u67b6\u6784<\/code>\uff0c<code>GLM<\/code>\u6a21\u578b\u7ed3\u6784\u662f<code>Decoder-Only\u67b6\u6784<\/code>\u3002<\/li>\n<\/ul>\n<\/li>\n<li><code>GLM<\/code> 
\u6a21\u578b\u5b83\u901a\u8fc7mask\u63a9\u7801\u5c06encoder\u4e0edecoder\u8fde\u63a5\u8d77\u6765\uff0c\u5b9e\u73b0\u81ea\u56de\u5f52\u7a7a\u767d\u586b\u5145\u3002<\/li>\n<li>\u9884\u8bad\u7ec3(PT)\u662f\u4e00\u4e2a\u6f2b\u957f\u7684\u3001\u67af\u71e5\u7684\u8bad\u7ec3\u8fc7\u7a0b\uff0c\u4f46\u4e5f\u662f\u6253\u57fa\u7840\u3001\u4fee\u70bc\u5185\u529f\u7684\u8fc7\u7a0b\u3002<\/li>\n<li>\u5fae\u8c03\u8bad\u7ec3(SFT)\u662f\u5efa\u7acb\u5728\u9884\u8bad\u7ec3\u57fa\u7840\u4e0a\u7684\u8fdb\u9636\u8bad\u7ec3\uff0c\u8fd9\u4e2a\u8fc7\u7a0b\u9700\u8981\u6839\u636e\u5b9e\u6218\u60c5\u51b5\u8c03\u6574\u8bad\u7ec3\u5185\u5bb9\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E8%AF%BB%E5%90%8E%E6%84%9F\"><\/span>\u8bfb\u540e\u611f<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5927\u6a21\u578b\u7684\u8bad\u7ec3\u8fc7\u7a0b\u662f\u5982\u6b64\u5730\u5177\u6709\u666e\u4e16\u7684\u610f\u4e49\u3002\u8ba9\u6211\u4e0d\u7981\u60f3\u8d77\u300a\u4e03\u9f99\u73e0\u300b\u4e2d\u6b66\u5929\u8001\u5e08\u8bad\u7ec3\u609f\u7a7a\u548c\u514b\u6797\u7684\u8fc7\u7a0b\uff1a<\/p>\n<ul>\n<li>\n<p>\u57fa\u7840\u7684\u8bad\u7ec3\u662f\u6f2b\u957f\u7684\u3001\u91cd\u590d\u7684\u3001\u67af\u71e5\u7684\uff0c\u4f46\u4e5f\u662f\u6253\u57fa\u7840\u7684\u8fc7\u7a0b\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/\u4e03\u9f99\u73e0\u8bad\u7ec3\u793a\u610f\u56fe.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/\u4e03\u9f99\u73e0\u8bad\u7ec3\u793a\u610f\u56fe.png\" alt=\"\" \/><\/a><\/p>\n<\/li>\n<li>\n<p>\u53ea\u6709\u57fa\u7840\u6253\u7262\u9760\u4e86\uff0c\u624d\u80fd\u5728\u6218\u6597\u4e2d&quot;\u9488\u5bf9\u6027\u7684\u4fee\u70bc&quot;\uff0c\u79ef\u7d2f\u7ecf\u9a8c\uff0c\u6700\u7ec8\u8d85\u8d8a\u81ea\u6211\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/\u4e03\u9f99\u73e0\u8bad\u7ec3\u793a\u610f\u56fe2.png\" data-fancybox=\"images\" 
><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/\u4e03\u9f99\u73e0\u8bad\u7ec3\u793a\u610f\u56fe2.png\" alt=\"\" \/><\/a><\/p>\n<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\"><\/span>\u53c2\u8003\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/600204350\">\u77e5\u4e4e\uff1aBERT\u7cfb\u5217\u8bba\u6587\u7b14\u8bb0<\/a><\/p>\n<p align=\"center\">\u6b22\u8fce\u5173\u6ce8\u516c\u4f17\u53f7\u4ee5\u83b7\u5f97\u6700\u65b0\u7684\u6587\u7ae0\u548c\u65b0\u95fb<\/p>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/\u626b\u7801_\u641c\u7d22\u8054\u5408\u4f20\u64ad\u6837\u5f0f-\u767d\u8272\u7248.bmp\" data-fancybox=\"images\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/\u626b\u7801_\u641c\u7d22\u8054\u5408\u4f20\u64ad\u6837\u5f0f-\u767d\u8272\u7248.bmp\" alt=\"\" \/><\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u524d\u8a00 \u5728\u524d\u4e24\u7ae0\u7684\u5b66\u4e60\u4e2d\uff0c\u6211\u4eec\u4e86\u89e3\u5230\u5927\u6a21\u578b 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":13444,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"aside","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"default","adv-header-id-meta":"","stick-header-meta":"default","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"categories":[28],"tags":[68],"class_list":["post-13440","post","type-post","status-publish","format-aside","has-post-thumbnail","hentry","category-blog","tag-68","post_format-post-format-aside"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v26.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09 - \u4e00\u8d77AI\u6280\u672f<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/17aitech.com\/?p=13440\" \/>\n<script type=\"application\/ld+json\" 
class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/17aitech.com\/?p=13440\",\"url\":\"https:\/\/17aitech.com\/?p=13440\",\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09 - \u4e00\u8d77AI\u6280\u672f\",\"isPartOf\":{\"@id\":\"https:\/\/17aitech.com\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/17aitech.com\/?p=13440#primaryimage\"},\"image\":{\"@id\":\"https:\/\/17aitech.com\/?p=13440#primaryimage\"},\"thumbnailUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png\",\"datePublished\":\"2024-08-13T19:10:26+00:00\",\"dateModified\":\"2024-10-08T07:10:41+00:00\",\"author\":{\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\"},\"breadcrumb\":{\"@id\":\"https:\/\/17aitech.com\/?p=13440#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/17aitech.com\/?p=13440\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/?p=13440#primaryimage\",\"url\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png\",\"contentUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png\",\"width\":2402,\"height\":956},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/17aitech.com\/?p=13440#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/17aitech.com\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/17aitech.com\/#website\",\"url\":\"https:\/\/17aitech.com\/\",\"name\":\"\
u4e00\u8d77AI\u6280\u672f\",\"description\":\"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca\",\"alternateName\":\"\u4e00\u8d77AI\u6280\u672f\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/17aitech.com\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\",\"name\":\"Dongming\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/image\/\",\"url\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"contentUrl\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"caption\":\"Dongming\"},\"description\":\"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002\",\"sameAs\":[\"http:\/\/17aitech.com\"],\"url\":\"https:\/\/17aitech.com\/?page_id=33738&user=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09 - \u4e00\u8d77AI\u6280\u672f","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/17aitech.com\/?p=13440","schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/17aitech.com\/?p=13440","url":"https:\/\/17aitech.com\/?p=13440","name":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09 - \u4e00\u8d77AI\u6280\u672f","isPartOf":{"@id":"https:\/\/17aitech.com\/#website"},"primaryImageOfPage":{"@id":"https:\/\/17aitech.com\/?p=13440#primaryimage"},"image":{"@id":"https:\/\/17aitech.com\/?p=13440#primaryimage"},"thumbnailUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png","datePublished":"2024-08-13T19:10:26+00:00","dateModified":"2024-10-08T07:10:41+00:00","author":{"@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739"},"breadcrumb":{"@id":"https:\/\/17aitech.com\/?p=13440#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/17aitech.com\/?p=13440"]}]},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/?p=13440#primaryimage","url":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png","contentUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/08\/BERT\u8bad\u7ec3\u8fc7\u7a0b\u56fe\u793a.png","width":2402,"height":956},{"@type":"BreadcrumbList","@id":"https:\/\/17aitech.com\/?p=13440#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/17aitech.com\/"},{"@type":"ListItem","position":2,"name":"\u
3010\u8bfe\u7a0b\u603b\u7ed3\u3011day23\uff1a\u5927\u6a21\u578b\u8bad\u7ec3\u7b56\u7565\uff08BERT\u6a21\u578b\u4e0eGLM\u6a21\u578b\uff09"}]},{"@type":"WebSite","@id":"https:\/\/17aitech.com\/#website","url":"https:\/\/17aitech.com\/","name":"\u4e00\u8d77AI\u6280\u672f","description":"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca","alternateName":"\u4e00\u8d77AI\u6280\u672f","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/17aitech.com\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739","name":"Dongming","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/#\/schema\/person\/image\/","url":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","contentUrl":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","caption":"Dongming"},"description":"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002","sameAs":["http:\/\/17aitech.com"],"url":"https:\/\/17aitech.com\/?page_id=33738&user=1"}]}},"_links":{"self":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/13440","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=13440"}],"version-history":[{"count":2,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/13440\/revisions"}],"predecessor-version":[{"id":33222,"href":"https:\/\/17aitech.com\/i
ndex.php?rest_route=\/wp\/v2\/posts\/13440\/revisions\/33222"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/media\/13444"}],"wp:attachment":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=13440"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=13440"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=13440"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}