{"id":33308,"date":"2024-10-14T13:05:20","date_gmt":"2024-10-14T05:05:20","guid":{"rendered":"https:\/\/17aitech.com\/?p=33308"},"modified":"2024-10-14T13:59:55","modified_gmt":"2024-10-14T05:59:55","slug":"%e3%80%90%e8%af%be%e7%a8%8b%e6%80%bb%e7%bb%93%e3%80%91day34%ef%bc%9avit%e6%a8%a1%e5%9e%8b%e3%80%81clip%e6%a8%a1%e5%9e%8b%e8%ae%ba%e6%96%87%e9%98%85%e8%af%bb%e7%90%86%e8%a7%a3","status":"publish","type":"post","link":"https:\/\/17aitech.com\/?p=33308","title":{"rendered":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_78 ez-toc-wrap-left-text counter-hierarchy ez-toc-counter ez-toc-light-blue ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">\u6587\u7ae0\u76ee\u5f55<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 
0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E5%89%8D%E8%A8%80\" >\u524d\u8a00<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/17aitech.com\/?p=33308\/#ViT_%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\" >ViT \u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/17aitech.com\/?p=33308\/#ABSTRACT%E9%83%A8%E5%88%86\" >ABSTRACT\u90e8\u5206<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/17aitech.com\/?p=33308\/#INTRODUCTION%E9%83%A8%E5%88%86\" >INTRODUCTION\u90e8\u5206<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-2\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-9\" 
href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-2\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-2\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84\" >\u6a21\u578b\u7ed3\u6784<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-12\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-3\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-13\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-3\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-14\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-3\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-15\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E5%9B%9E%E9%A1%BE_BERT\" >\u56de\u987e BERT<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-16\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-2\" >\u6a21\u578b\u7ed3\u6784<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-17\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-4\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-18\" 
href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-4\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-19\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-4\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-20\" href=\"https:\/\/17aitech.com\/?p=33308\/#CLIP_%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\" >CLIP \u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-21\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-5\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-22\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-5\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-23\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-3\" >\u6a21\u578b\u7ed3\u6784<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-24\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-6\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-25\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-6\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-26\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-5\" 
>\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-27\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E5%AF%B9%E6%AF%94%E5%AD%A6%E4%B9%A0\" >\u5bf9\u6bd4\u5b66\u4e60<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-28\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%BF%81%E7%A7%BB%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B%E5%AE%9E%E7%8E%B0zero-shot\" >\u8fc1\u79fb\u9884\u8bad\u7ec3\u6a21\u578b\u5b9e\u73b0zero-shot<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-29\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-7\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-30\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-7\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-31\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-6\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-32\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E5%86%85%E5%AE%B9%E5%B0%8F%E7%BB%93\" >\u5185\u5bb9\u5c0f\u7ed3<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-33\" href=\"https:\/\/17aitech.com\/?p=33308\/#%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\" >\u53c2\u8003\u8d44\u6599<\/a><\/li><\/ul><\/nav><\/div>\n<h2><span class=\"ez-toc-section\" id=\"%E5%89%8D%E8%A8%80\"><\/span>\u524d\u8a00<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5728<a 
href=\"https:\/\/17aitech.com\/?p=32899\">\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day31\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u521d\u6b65\u4e86\u89e3<\/a>\u4e00\u6587\u4e2d\uff0c\u6211\u4eec\u5bf9\u591a\u6a21\u6001\u5927\u6a21\u578b\u7684\u57fa\u672c\u539f\u7406\u6709\u4e86\u521d\u6b65\u4e86\u89e3\uff0c\u672c\u7ae0\u5185\u5bb9\u5c06\u901a\u8fc7\u8bba\u6587\u9605\u8bfb\u7406\u89e3\uff0c\u66f4\u8fdb\u4e00\u6b65\u7406\u89e3\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e2d\u6240\u6d89\u53ca\u7684 ViT \u67b6\u6784\u3001Transformer\u5728\u89c6\u89c9\u5e94\u7528\u7684\u7406\u5ff5\u4ee5\u53ca CLIP \u56fe\u50cf\u4e0e\u6587\u672c\u5339\u914d\u7684\u5e94\u7528\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"ViT_%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\"><\/span><code>ViT<\/code> \u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e2d\u6240\u6d89\u53ca\u7684\u6700\u4e3a\u7ecf\u5178\u7684\u6a21\u578b\u5c31\u662f <code>ViT<\/code>\uff0c\u6240\u4ee5\u6211\u4eec\u5148\u4e86\u89e3\u8be5\u8bba\u6587\u7684\u6838\u5fc3\u8981\u70b9\u3002<\/p>\n<p>\u8bba\u6587\u6807\u9898\uff1aAn Image Is Worth 16&#215;16 Words: Transformers For Image Recognition At Scale<br \/>\n\u8bba\u6587\u5730\u5740\uff1a<a href=\"https:\/\/arxiv.org\/abs\/2010.11929\">https:\/\/arxiv.org\/abs\/2010.11929<\/a><\/p>\n<h3><span class=\"ez-toc-section\" id=\"ABSTRACT%E9%83%A8%E5%88%86\"><\/span>ABSTRACT\u90e8\u5206<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. 
In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. <\/p>\n<p>We show that this reliance on CNNs is not necessary and <strong>a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks<\/strong>. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), <strong>Vision Transformer (ViT) attains<\/strong> excellent results compared to state-of-the-art convolutional networks while requiring substantially <strong>fewer computational<\/strong> resources to train.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u5c3d\u7ba1\u53d8\u6362\u5668\u67b6\u6784\u5df2\u6210\u4e3a\u81ea\u7136\u8bed\u8a00\u5904\u7406\u4efb\u52a1\u7684\u6807\u51c6\uff0c\u4f46\u5176\u5728\u8ba1\u7b97\u673a\u89c6\u89c9\u4e2d\u7684\u5e94\u7528\u4ecd\u7136\u6709\u9650\u3002\u5728\u89c6\u89c9\u9886\u57df\uff0c\u6ce8\u610f\u529b\u7684\u4f5c\u7528\u8981\u4e48\u4e0e\u5377\u79ef\u7f51\u7edc\u7ed3\u5408\u4f7f\u7528\uff0c\u8981\u4e48\u7528\u4e8e\u66ff\u6362\u5377\u79ef\u7f51\u7edc\u7684\u67d0\u4e9b\u7ec4\u4ef6\uff0c\u540c\u65f6\u4fdd\u6301\u5176\u6574\u4f53\u7ed3\u6784\u4e0d\u53d8\u3002<\/p>\n<p>\u6211\u4eec\u8868\u660e\uff0c\u8fd9\u79cd\u5bf9\u5377\u79ef\u795e\u7ecf\u7f51\u7edc\u7684\u4f9d\u8d56\u5e76\u4e0d\u662f\u5fc5\u9700\u7684\uff0c\u76f4\u63a5\u5c06<strong>\u7eaf\u53d8\u6362\u5668\u5e94\u7528\u4e8e\u56fe\u50cf\u5757\u7684\u5e8f\u5217\u53ef\u4ee5\u5728\u56fe\u50cf\u5206\u7c7b\u4efb\u52a1\u4e2d\u8868\u73b0\u5f97\u975e\u5e38\u597d<\/strong>\u3002\u5f53\u5728\u5927\u91cf\u6570\u636e\u4e0a\u8fdb\u884c\u9884\u8bad\u7ec3\u5e76\u8f6c\u79fb\u5230\u591a\u4e2a\u4e2d\u578b\u6
216\u5c0f\u578b\u56fe\u50cf\u8bc6\u522b\u57fa\u51c6\uff08\u5982 ImageNet\u3001CIFAR-100\u3001VTAB \u7b49\uff09\u65f6\uff0c<strong>\u89c6\u89c9\u53d8\u6362\u5668 (ViT) \u7684\u8868\u73b0<\/strong>\u4e0e\u6700\u5148\u8fdb\u7684\u5377\u79ef\u7f51\u7edc\u76f8\u6bd4\uff0c<strong>\u53d6\u5f97\u4e86\u4f18\u79c0\u7684\u7ed3\u679c<\/strong>\uff0c\u540c\u65f6\u8bad\u7ec3\u6240\u9700\u7684<strong>\u8ba1\u7b97\u8d44\u6e90\u663e\u8457\u51cf\u5c11<\/strong>\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ul>\n<li>\u8be5\u8bba\u6587\u63d0\u51fa\u4e86\u4e00\u4e2a\u89e3\u51b3\u95ee\u9898\u601d\u60f3\uff0c\u4f7f\u7528 <code>Transformer<\/code> \u7ed3\u6784\u6765\u5904\u7406\u56fe\u50cf\u3002<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"INTRODUCTION%E9%83%A8%E5%88%86\"><\/span>INTRODUCTION\u90e8\u5206<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-2\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>Self-attention-based architectures, in particular Transformers (Vaswani et al., 2017), have become the model of choice in natural language processing (NLP). The dominant approach is to pre-train on a large text corpus and then fine-tune on a smaller task-specific dataset (Devlin et al., 2019). Thanks to Transformers\u2019 <strong>computational efficiency and scalability<\/strong>, it has become possible to train models of unprecedented size, with over 100B parameters (Brown et al., 2020; Lepikhin et al., 2020). With the models and datasets growing, there is still no sign of saturating performance.<\/p>\n<p>In computer vision, however, convolutional architectures remain dominant (LeCun et al., 1989; Krizhevsky et al., 2012; He et al., 2016). 
Inspired by NLP successes, multiple works try combining CNN-like architectures with self-attention (Wang et al., 2018; Carion et al., 2020), some replacing the convolutions entirely (Ramachandran et al., 2019; Wang et al., 2020a). The latter models, while theoretically efficient, have not yet been scaled effectively on modern hardware accelerators due to the use of specialized attention patterns. Therefore, in large-scale image recognition, classic ResNet-like architectures are still state of the art (Mahajan et al., 2018; Xie et al., 2020; Kolesnikov et al., 2020).<\/p>\n<p>Inspired by the Transformer scaling successes in NLP, we experiment with applying <strong>a standard Transformer directly to images<\/strong>, with the fewest possible modifications. To do so, we <strong>split an image into patches and provide the sequence of linear embeddings of these patches as an input to a Transformer<\/strong>. Image patches are treated the same way <strong>as tokens (words)<\/strong> in an NLP application. 
We train the model on image classification in a supervised fashion.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-2\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u57fa\u4e8e\u81ea\u6ce8\u610f\u529b\u7684\u67b6\u6784\uff0c\u5c24\u5176\u662f <code>Transformer<\/code>\uff08Vaswani \u7b49\uff0c2017\uff09\uff0c\u5df2\u6210\u4e3a\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u4e2d\u7684\u9996\u9009\u6a21\u578b\u3002\u4e3b\u6d41\u7684\u65b9\u6cd5\u662f\u5728\u5927\u578b\u6587\u672c\u8bed\u6599\u5e93\u4e0a\u8fdb\u884c\u9884\u8bad\u7ec3\uff0c\u7136\u540e\u5728\u8f83\u5c0f\u7684\u7279\u5b9a\u4efb\u52a1\u6570\u636e\u96c6\u4e0a\u8fdb\u884c\u5fae\u8c03\uff08Devlin \u7b49\uff0c2019\uff09\u3002\u5f97\u76ca\u4e8e<code>Transformer<\/code> \u7684<strong>\u8ba1\u7b97\u6548\u7387\u548c\u53ef\u6269\u5c55\u6027<\/strong>\uff0c\u8bad\u7ec3\u8d85\u8fc7 1000 \u4ebf\u53c2\u6570\u7684\u524d\u6240\u672a\u6709\u89c4\u6a21\u7684\u6a21\u578b\u6210\u4e3a\u53ef\u80fd\uff08Brown \u7b49\uff0c2020\uff1bLepikhin \u7b49\uff0c2020\uff09\u3002\u968f\u7740\u6a21\u578b\u548c\u6570\u636e\u96c6\u7684\u589e\u957f\uff0c\u6027\u80fd\u4ecd\u6ca1\u6709\u9971\u548c\u7684\u8ff9\u8c61\u3002<\/p>\n<p>\u7136\u800c\uff0c\u5728\u8ba1\u7b97\u673a\u89c6\u89c9\u4e2d\uff0c\u5377\u79ef\u67b6\u6784\u4ecd\u7136\u5360\u4e3b\u5bfc\u5730\u4f4d\uff08LeCun \u7b49\uff0c1989\uff1bKrizhevsky \u7b49\uff0c2012\uff1bHe \u7b49\uff0c2016\uff09\u3002\u53d7\u5230 NLP \u6210\u529f\u7684\u542f\u53d1\uff0c\u591a\u4e2a\u7814\u7a76\u5c1d\u8bd5\u5c06\u7c7b\u4f3c CNN \u7684\u67b6\u6784\u4e0e\u81ea\u6ce8\u610f\u529b\u7ed3\u5408\uff08Wang \u7b49\uff0c2018\uff1bCarion \u7b49\uff0c2020\uff09\uff0c\u5176\u4e2d\u4e00\u4e9b\u5b8c\u5168\u66ff\u4ee3\u5377\u79ef\uff08Ramachandran \u7b49\uff0c2019\uff1bWang 
\u7b49\uff0c2020a\uff09\u3002\u540e\u8005\u6a21\u578b\u867d\u7136\u5728\u7406\u8bba\u4e0a\u9ad8\u6548\uff0c\u4f46\u7531\u4e8e\u4f7f\u7528\u4e86\u4e13\u95e8\u7684\u6ce8\u610f\u529b\u6a21\u5f0f\uff0c\u5c1a\u672a\u5728\u73b0\u4ee3\u786c\u4ef6\u52a0\u901f\u5668\u4e0a\u6709\u6548\u6269\u5c55\u3002\u56e0\u6b64\uff0c\u5728\u5927\u89c4\u6a21\u56fe\u50cf\u8bc6\u522b\u4e2d\uff0c\u7ecf\u5178\u7684 ResNet \u7c7b\u67b6\u6784\u4ecd\u7136\u662f\u6700\u5148\u8fdb\u7684\uff08Mahajan \u7b49\uff0c2018\uff1bXie \u7b49\uff0c2020\uff1bKolesnikov \u7b49\uff0c2020\uff09\u3002<\/p>\n<p>\u53d7\u5230 NLP \u4e2d\u53d8\u6362\u5668\u6269\u5c55\u6210\u529f\u7684\u542f\u53d1\uff0c\u6211\u4eec<strong>\u5c1d\u8bd5\u5c06\u6807\u51c6 <code>Transformer<\/code> \u76f4\u63a5\u5e94\u7528\u4e8e\u56fe\u50cf<\/strong>\uff0c\u5c3d\u53ef\u80fd\u5c11\u5730\u8fdb\u884c\u4fee\u6539\u3002\u4e3a\u6b64\uff0c\u6211\u4eec<strong>\u5c06\u56fe\u50cf\u5206\u5272\u6210\u5c0f\u5757\uff0c\u5e76\u5c06\u8fd9\u4e9b\u5c0f\u5757\u7684\u7ebf\u6027\u5d4c\u5165\u5e8f\u5217\u4f5c\u4e3a\u8f93\u5165\u63d0\u4f9b\u7ed9 <code>Transformer<\/code><\/strong>\u3002\u56fe\u50cf\u5c0f\u5757\u7684\u5904\u7406\u65b9\u5f0f\u4e0e NLP \u5e94\u7528\u4e2d\u7684\u6807\u8bb0\uff08\u5355\u8bcd\uff09\u76f8\u540c\u3002\u6211\u4eec\u5728\u76d1\u7763\u65b9\u5f0f\u4e0b\u5bf9\u56fe\u50cf\u5206\u7c7b\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-2\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ul>\n<li><code>Transformer<\/code> \u662f\u4e00\u4e2a\u975e\u5e38\u5f3a\u5927\u7684\u6a21\u578b\uff0c\u53ef\u4ee5\u5904\u7406\u65f6\u5e8f\u4fe1\u606f\u5e76\u4e14\u5728 <code>NLP<\/code> \u81ea\u7136\u8bed\u8a00\u4efb\u52a1\u4e2d\u8868\u73b0\u4f18\u79c0\u3002<\/li>\n<li>\u8ba1\u7b97\u673a\u89c6\u89c9\u65b9\u9762\u76ee\u524d\u4ecd\u7136\u662f <strong>\u5377\u79ef\u7f51\u7edc<\/strong> \u4e3b\u5bfc\u7684\uff0c\u800c Transformer 
\u6709\u975e\u5e38\u5f3a\u5927\u7684\u5e76\u884c\u8ba1\u7b97\u6027\u80fd\u3002<\/li>\n<li>\u5982\u4f55\u4f7f\u7528Transformer \u6765\u66ff\u4ee3\u5377\u79ef\u5462\uff0c\u8bba\u6587\u4e2d\u7ed9\u51fa\u4e86\u4e00\u79cd\u601d\u60f3\uff1a\n<ul>\n<li>\u5c06\u56fe\u50cf\u5206\u5272\u6210\u5c0f\u5757\uff0c\u7136\u540e\u5bf9\u6bcf\u4e2a\u5c0f\u5757\u8fdb\u884c <code>embedding<\/code>\uff0c\u7136\u540e\u5c06\u8fd9\u4e9b <code>embedding<\/code> \u5e8f\u5217(\u56fe\u50cf\u53d8\u4e3a\u4e86\u65f6\u5e8f\u7684\u5e8f\u5217)\u4f5c\u4e3a\u8f93\u5165\u63d0\u4f9b\u7ed9 <code>Transformer<\/code>\uff0c\u4ee5\u6b64\u53d1\u6325 Transformer \u7684\u5e76\u884c\u8ba1\u7b97\u4f18\u52bf\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84\"><\/span>\u6a21\u578b\u7ed3\u6784<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-3\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>An overview of the model is depicted in Figure 1. The standard Transformer receives as input a 1D sequence of token embeddings. 
To handle 2D images, we <strong>reshape the image<\/strong>  <code class=\"katex-inline\">x \\in \\mathbb{R}^{H \\times W \\times C}<\/code> <strong>into a sequence of flattened 2D patches<\/strong> <code class=\"katex-inline\">x_p \\in \\mathbb{R}^{N \\times (P^2 \\cdot C)}<\/code>, where <code class=\"katex-inline\">(H, W)<\/code> is the resolution of the <strong>original image<\/strong>, <code class=\"katex-inline\">C<\/code> is the <strong>number of channels<\/strong>, <code class=\"katex-inline\">(P, P)<\/code> is the resolution of <strong>each image patch<\/strong>, and <code class=\"katex-inline\">N = \\frac{HW}{P^2}<\/code> is the <strong>resulting number of patches<\/strong>, which also serves as the effective input sequence length for the Transformer. <\/p>\n<p>The Transformer uses a constant latent vector size <code class=\"katex-inline\">D<\/code> through all of its layers, so we <strong>flatten the patches<\/strong> and map to <code class=\"katex-inline\">D<\/code> dimensions with <strong>a trainable linear<\/strong> projection (Eq. 1). We refer to the output of this projection as the patch embeddings.<\/p>\n<p><strong>Similar to BERT\u2019s<\/strong> [class] token, we <strong>prepend a learnable embedding<\/strong> to the sequence of embedded patches <code class=\"katex-inline\">(z_0^0 = x_{\\text{class}})<\/code>, whose state at the output of the Transformer encoder <code class=\"katex-inline\">(z_L^0)<\/code> serves as <strong>the image representation<\/strong> <code class=\"katex-inline\">y<\/code> (Eq. 4). Both during pre-training and fine-tuning, a classification head is attached to <code class=\"katex-inline\">z_L^0<\/code>. The classification head is implemented by a MLP with one hidden layer at pre-training time and by a single linear layer at fine-tuning time.<\/p>\n<p><strong>Position embeddings<\/strong> are added to the patch embeddings to <strong>retain<\/strong> positional information. 
We use standard learnable 1D position embeddings, since we have not observed significant performance gains from using more advanced 2D-aware position embeddings (Appendix D.4). The resulting sequence of embedding vectors serves as input to the encoder.<\/p>\n<p>The Transformer encoder (Vaswani et al., 2017) consists of alternating layers of <strong>multiheaded self-attention<\/strong> (MSA, see Appendix A) and <strong>MLP<\/strong> blocks (Eq. 2, 3). <strong>Layer normalization<\/strong> (LN) is applied <strong>before<\/strong> every block, and <strong>residual connections<\/strong> <strong>after<\/strong> every block (Wang et al., 2019; Baevski &amp; Auli, 2019). The MLP contains two layers with a GELU non-linearity.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-3\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u6a21\u578b\u7684\u6982\u8ff0\u5982\u56fe 1 \u6240\u793a\u3002\u6807\u51c6 <code>Transformer<\/code> \u63a5\u6536 <code>1D<\/code> \u7684\u6807\u8bb0\u5d4c\u5165\u5e8f\u5217\u4f5c\u4e3a\u8f93\u5165\u3002\u4e3a\u4e86\u5904\u7406 <code>2D<\/code> \u56fe\u50cf\uff0c\u6211\u4eec\u5c06\u56fe\u50cf <code class=\"katex-inline\">x \\in \\mathbb{R}^{H \\times W \\times C}<\/code> <strong>\u91cd\u5851\u4e3a\u4e00\u7cfb\u5217\u5c55\u5e73\u7684 <code>2D<\/code> \u5c0f\u5757<\/strong> <code class=\"katex-inline\">x_p \\in \\mathbb{R}^{N \\times (P^2 \\cdot C)}<\/code>\uff0c\u5176\u4e2d <code class=\"katex-inline\">(H, W)<\/code> \u662f<strong>\u539f\u59cb\u56fe\u50cf\u7684\u5206\u8fa8\u7387<\/strong>\uff0c<code class=\"katex-inline\">C<\/code> \u662f<strong>\u901a\u9053\u6570<\/strong>\uff0c<code class=\"katex-inline\">(P, P)<\/code> \u662f\u6bcf\u4e2a\u56fe\u50cf<strong>\u5c0f\u5757\u7684\u5206\u8fa8\u7387<\/strong>\uff0c<code class=\"katex-inline\">N = \\frac{HW}{P^2}<\/code> 
\u662f\u5f97\u5230\u7684<strong>\u5c0f\u5757\u6570\u91cf<\/strong>\uff0c\u8fd9\u4e5f\u4f5c\u4e3a <code>Transformer<\/code> \u7684\u6709\u6548\u8f93\u5165\u5e8f\u5217\u957f\u5ea6\u3002<\/p>\n<p><code>Transformer<\/code> \u5728\u5176\u6240\u6709\u5c42\u4e2d\u4f7f\u7528\u6052\u5b9a\u7684\u6f5c\u5728\u5411\u91cf\u5927\u5c0f <code class=\"katex-inline\">D<\/code>\uff0c\u56e0\u6b64\u6211\u4eec\u5c06\u5c0f\u5757\u5c55\u5e73\u5e76\u901a\u8fc7\u53ef\u8bad\u7ec3\u7684\u7ebf\u6027\u6295\u5f71\u6620\u5c04\u5230 <code class=\"katex-inline\">D<\/code> \u7ef4\uff08\u516c\u5f0f 1\uff09\u3002\u6211\u4eec\u5c06\u6b64\u6295\u5f71\u7684\u8f93\u51fa\u79f0\u4e3a\u5c0f\u5757\u5d4c\u5165\u3002<\/p>\n<p>\u7c7b\u4f3c\u4e8e <code>BERT<\/code> \u7684 [class] \u6807\u8bb0\uff0c\u6211\u4eec\u5728\u5d4c\u5165\u5c0f\u5757\u7684\u5e8f\u5217\u524d\u6dfb\u52a0\u4e00\u4e2a<strong>\u53ef\u5b66\u4e60\u7684\u5d4c\u5165<\/strong> <code class=\"katex-inline\">(z_0^0 = x_{\\text{class}})<\/code>\uff0c\u5176\u5728 <code>Transformer<\/code> \u7684\u7f16\u7801\u5668\u8f93\u51fa\u65f6\u7684\u72b6\u6001 <code class=\"katex-inline\">(z_L^0)<\/code> \u4f5c\u4e3a\u56fe\u50cf\u8868\u793a <code class=\"katex-inline\">y<\/code>\uff08\u516c\u5f0f 4\uff09\u3002\u5728\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u8fc7\u7a0b\u4e2d\uff0c\u5206\u7c7b\u5934\u90fd\u9644\u52a0\u5728 <code class=\"katex-inline\">z_L^0<\/code> \u4e0a\u3002\u5206\u7c7b\u5934\u5728\u9884\u8bad\u7ec3\u65f6\u901a\u8fc7\u4e00\u4e2a\u9690\u85cf\u5c42\u7684 <code>MLP<\/code> \u5b9e\u73b0\uff0c\u5728\u5fae\u8c03\u65f6\u901a\u8fc7\u4e00\u4e2a\u7ebf\u6027\u5c42\u5b9e\u73b0\u3002<\/p>\n<p>\u4f4d\u7f6e\u5d4c\u5165\u88ab\u6dfb\u52a0\u5230\u5c0f\u5757\u5d4c\u5165\u4e2d\u4ee5\u4fdd\u7559\u4f4d\u7f6e\u4fe1\u606f\u3002\u6211\u4eec\u4f7f\u7528\u6807\u51c6\u7684\u53ef\u5b66\u4e60 <code>1D<\/code> \u4f4d\u7f6e\u5d4c\u5165\uff0c\u56e0\u4e3a\u6211\u4eec\u6ca1\u6709\u89c2\u5bdf\u5230\u4f7f\u7528\u66f4\u5148\u8fdb\u7684 <code>2D<\/code> 
\u4f4d\u7f6e\u5d4c\u5165\u4f1a\u663e\u8457\u63d0\u5347\u6027\u80fd\uff08\u9644\u5f55 D.4\uff09\u3002\u751f\u6210\u7684\u5d4c\u5165\u5411\u91cf\u5e8f\u5217\u4f5c\u4e3a\u7f16\u7801\u5668\u7684\u8f93\u5165\u3002<\/p>\n<p><code>Transformer<\/code> \u7f16\u7801\u5668\uff08Vaswani \u7b49\uff0c2017\uff09\u7531<strong>\u591a\u5934\u81ea\u6ce8\u610f\u529b<\/strong>\uff08MSA\uff0c\u89c1\u9644\u5f55 A\uff09\u548c <strong>MLP<\/strong> \u5757\uff08\u516c\u5f0f 2\u30013\uff09<strong>\u4ea4\u66ff\u5c42<\/strong>\u7ec4\u6210\u3002\u5728\u6bcf\u4e2a\u5757<strong>\u4e4b\u524d\u5e94\u7528\u5c42\u5f52\u4e00\u5316<\/strong>\uff08LN\uff09\uff0c\u5728\u6bcf\u4e2a\u5757<strong>\u4e4b\u540e\u5e94\u7528\u6b8b\u5dee\u8fde\u63a5<\/strong>\uff08Wang \u7b49\uff0c2019\uff1bBaevski &amp; Auli\uff0c2019\uff09\u3002MLP \u5305\u542b\u4e24\u4e2a\u5c42\uff0c\u5e76\u4f7f\u7528 GELU \u975e\u7ebf\u6027\u6fc0\u6d3b\u51fd\u6570\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-3\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ul>\n<li>\u4e3a\u4e86\u8ba9Transformer \u80fd\u591f\u5904\u7406\u56fe\u50cf\uff0c\u6211\u4eec<strong>\u5c06\u56fe\u50cf\u5206\u5272\u6210\u5c0f\u5757\uff0c\u5176\u4e2dH,W \u4e3a\u56fe\u50cf\u7684\u5206\u8fa8\u7387\uff0cC \u4e3a\u901a\u9053\u6570\uff0cP \u4e3a\u5c0f\u5757\u7684\u5206\u8fa8\u7387<\/strong>\u3002<\/li>\n<li>\u4e3a\u4e86\u6ee1\u8db3\u56fe\u7247\u540e\u7eed\u7684\u5206\u7c7b\u80fd\u529b\uff0c\u91c7\u7528\u4e86 BERT \u7684 [CLS] \u6807\u8bb0\uff0c\u5728Embedding\u5c42\u7684\u7b2c\u4e00\u4e2a\u4f4d\u7f6e\u6dfb\u52a0\u4e86\u4e00\u4e2a [CLS] \u6807\u8bb0\u3002<\/li>\n<li>\u4e3a\u4e86\u4fdd\u7559\u56fe\u50cf\u5c0f\u5757\u7684\u4f4d\u7f6e\u4fe1\u606f\uff0c\u91c7\u7528\u4e86\u53ef\u5b66\u4e60\u7684 1D position embedding\uff0c\u6700\u540e\u5c06\u751f\u6210\u7684\u5d4c\u5165\u5e8f\u5217\u4f5c\u4e3a\u7f16\u7801\u5668\u7684\u8f93\u5165\u3002<\/li>\n<li>Transformer 
\u53ea\u4f7f\u7528\u4e86encoder\u90e8\u5206\uff1b\u4e0e\u539f\u59cb\u7684 Transformer \u4e0d\u540c\u7684\u662f\uff0cLayer Normalization \u88ab\u653e\u5728\u6bcf\u4e2a\u5757\uff08MSA\u3001MLP\uff09\u4e4b\u524d\uff0cResidual Connection \u5219\u52a0\u5728\u6bcf\u4e2a\u5757\u4e4b\u540e\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%9B%9E%E9%A1%BE_BERT\"><\/span>\u56de\u987e <code>BERT<\/code><span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u7531\u4e8e <code>ViT<\/code> \u8bba\u6587\u4e2d\u63d0\u5230\u56fe\u50cf\u5206\u7c7b\u501f\u9274\u4e86 <code>BERT<\/code> \u7684 [class] \u6807\u8bb0\uff0c\u6240\u4ee5\u6211\u4eec\u56de\u987e\u4e0e\u6b64\u76f8\u5173\u7684\u90e8\u5206\u3002<\/p>\n<p>\u8bba\u6587\u5730\u5740\uff1a<a href=\"https:\/\/arxiv.org\/abs\/1810.04805\">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding<\/a><\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-2\"><\/span>\u6a21\u578b\u7ed3\u6784<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/Bert\u6a21\u578b\u7ed3\u6784.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/Bert\u6a21\u578b\u7ed3\u6784.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-4\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>Figure 1: Overall pre-training and fine-tuning procedures for BERT. Apart from output layers, the same architectures are used in both pre-training and fine-tuning. The same pre-trained model parameters are used to initialize models for different down-stream tasks. During fine-tuning, all parameters are fine-tuned. [CLS] is a special symbol added in front of every input example, and [SEP] is a special separator token (e.g. 
separating questions\/answers).<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-4\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u56fe1\uff1aBERT\u7684\u6574\u4f53\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u8fc7\u7a0b\u3002\u9664\u4e86\u8f93\u51fa\u5c42\u5916\uff0c\u9884\u8bad\u7ec3\u548c\u5fae\u8c03\u4e2d\u4f7f\u7528\u76f8\u540c\u7684\u67b6\u6784\u3002\u76f8\u540c\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u53c2\u6570\u7528\u4e8e\u521d\u59cb\u5316\u4e0d\u540c\u4e0b\u6e38\u4efb\u52a1\u7684\u6a21\u578b\u3002\u5728\u5fae\u8c03\u8fc7\u7a0b\u4e2d\uff0c\u6240\u6709\u53c2\u6570\u90fd\u8fdb\u884c\u5fae\u8c03\u3002[CLS] \u662f\u5728\u6bcf\u4e2a\u8f93\u5165\u793a\u4f8b\u524d\u6dfb\u52a0\u7684\u7279\u6b8a\u7b26\u53f7\uff0c[SEP] \u662f\u4e00\u4e2a\u7279\u6b8a\u7684\u5206\u9694\u7b26\u6807\u8bb0\uff08\u4f8b\u5982\uff0c\u7528\u4e8e\u5206\u9694\u95ee\u9898\/\u7b54\u6848\uff09\u3002<\/p>\n<\/blockquote>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/Bert\u6a21\u578b\u7ed3\u67842.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/Bert\u6a21\u578b\u7ed3\u67842.png\" alt=\"\" \/><\/a><\/p>\n<p><strong>\u8bba\u6587\u539f\u6587<\/strong><\/p>\n<blockquote>\n<p>To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., \u27e8Question, Answer\u27e9) in one token sequence. Throughout this work, a \u201csentence\u201d can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A \u201csequence\u201d refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. We use WordPiece embeddings (Wu et al., 2016) with a 30,000 token vocabulary. 
The first<br \/>\ntoken of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence. We differentiate the sentences in two ways. First, we separate them with a special token ([SEP]). Second, we add a learned embedding to every token indicating whether it belongs to sentence A or sentence B. <\/p>\n<\/blockquote>\n<p><strong>\u8bba\u6587\u7ffb\u8bd1<\/strong><\/p>\n<blockquote>\n<p>\u4e3a\u4e86\u4f7fBERT\u80fd\u591f\u5904\u7406\u5404\u79cd\u4e0b\u6e38\u4efb\u52a1\uff0c\u6211\u4eec\u7684\u8f93\u5165\u8868\u5f81\u80fd\u591f\u660e\u786e\u5730\u8868\u793a<strong>\u5355\u4e2a\u53e5\u5b50<\/strong>\u548c<strong>\u4e00\u5bf9\u53e5\u5b50<\/strong>\uff08\u4f8b\u5982\uff0c&lt;\u95ee\u9898, \u7b54\u6848&gt;\uff09\u5728\u4e00\u4e2a <code>token<\/code> \u5e8f\u5217\u4e2d\u3002\u5728\u6211\u4eec\u7684\u901a\u7bc7\u6587\u7ae0\u4e2d\uff0c\u4e00<strong>\u4e2a\u201csentence\uff08\u53e5\u5b50\uff09\u201d\u53ef\u4ee5\u662f\u4efb\u610f\u8fde\u7eed\u7684\u6587\u672c\uff0c\u800c\u4e0d\u5fc5\u662f\u5b9e\u9645\u7684\u8bed\u8a00\u53e5\u5b50<\/strong>\u3002\u201csequence(\u5e8f\u5217)\u201d\u6307\u7684\u662f\u8f93\u5165\u5230BERT\u7684 <code>token<\/code> \u5e8f\u5217\uff0c\u8fd9\u53ef\u4ee5\u662f\u4e00\u4e2a\u5355\u72ec\u7684\u53e5\u5b50\u6216\u4e24\u4e2a\u53e5\u5b50\u7ec4\u5408\u5728\u4e00\u8d77\u3002<\/p>\n<p>\u6211\u4eec\u4f7f\u7528WordPiece\u5d4c\u5165\uff08Wu\u7b49\uff0c2016\uff09\uff0c\u8bcd\u6c47\u91cf\u4e3a30,000\u4e2a\u6807\u8bb0\u3002\u6bcf\u4e2a\u5e8f\u5217\u7684\u7b2c\u4e00\u4e2a <code>token<\/code> \u59cb\u7ec8\u662f\u4e00\u4e2a\u7279\u6b8a\u7684\u5206\u7c7b\u6807\u8bb0\uff08[CLS]\uff09\u3002\u4e0e\u8be5 <code>token<\/code> 
\u5bf9\u5e94\u7684\u6700\u7ec8\u9690\u85cf\u72b6\u6001\u7528\u4e8e\u5206\u7c7b\u4efb\u52a1\u7684\u805a\u5408\u5e8f\u5217\u8868\u793a\u3002\u5982\u679c\u662f\u53e5\u5b50\u5bf9\uff0c\u5219\u88ab\u6253\u5305\u6210\u4e00\u4e2a\u5355\u4e00\u5e8f\u5217\u3002\u6211\u4eec\u901a\u8fc7\u4e24\u79cd\u65b9\u5f0f\u6765\u533a\u5206\u53e5\u5b50\u3002\u9996\u5148\uff0c\u6211\u4eec<strong>\u7528\u4e00\u4e2a\u7279\u6b8atoken\uff08[SEP]\uff09\u5c06\u5b83\u4eec\u5206\u5f00<\/strong>\u3002\u5176\u6b21\uff0c\u6211\u4eec\u4e3a\u6bcf\u4e2a <code>token<\/code> <strong>\u6dfb\u52a0\u4e00\u4e2a\u5df2\u7ecf\u5b66\u8fc7\u7684embedding\u8bcd\u5411\u91cf<\/strong>\uff0c\u6307\u793a\u5b83\u5c5e\u4e8e\u53e5\u5b50A\u8fd8\u662f\u53e5\u5b50B\u3002<\/p>\n<\/blockquote>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-4\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u901a\u8fc7\u5bf9\u8bba\u6587\u7684\u56de\u987e\uff0c\u6211\u4eec\u53ef\u4ee5\u53d1\u73b0\uff0c<code>Vit<\/code> \u67b6\u6784\u4e2d\u91c7\u7528\u4e86\u4e0e <code>BERT<\/code> \u76f8\u4f3c\u7684\u67b6\u6784\uff0c\u5e76\u4e14\u4e5f\u4f7f\u7528\u4e86 <code>BERT<\/code> \u7684 <code>CLS<\/code> \u6807\u8bb0\u3002 <\/p>\n<h2><span class=\"ez-toc-section\" id=\"CLIP_%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\"><\/span><code>CLIP<\/code> \u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u591a\u6a21\u6001\u5927\u6a21\u578b\u7684\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u9700\u8981\u5c06\u6587\u672c\u548c\u56fe\u50cf\u8fdb\u884c\u5339\u914d\uff0c\u6240\u4ee5 <code>CLIP<\/code> \u5373\u627f\u62c5\u6b64\u9879\u4efb\u52a1\u3002<\/p>\n<p>\u8bba\u6587\u6807\u9898\uff1aCLIP: Learning Transferable Visual Models From Natural Language Supervision<br \/>\n\u8bba\u6587\u5730\u5740\uff1a<a href=\"https:\/\/arxiv.org\/abs\/2103.00020\">https:\/\/arxiv.org\/abs\/2103.00020<\/a><\/p>\n<h4><span class=\"ez-toc-section\" 
id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-5\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability <strong>since additional labeled data<\/strong> is needed to specify any other visual concept. Learning <strong>directly from raw text about images<\/strong> is a promising alternative which leverages a much broader source of supervision.<\/p>\n<p>We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling <code>zero-shot<\/code> transfer of the model to downstream tasks.<\/p>\n<p>We study the performance of this approach by benchmarking on over <code>30<\/code> different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset specific training. 
<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-5\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u6700\u5148\u8fdb\u7684\u8ba1\u7b97\u673a\u89c6\u89c9\u7cfb\u7edf\u88ab\u8bad\u7ec3\u4ee5\u9884\u6d4b\u4e00\u7ec4\u56fa\u5b9a\u7684\u9884\u5b9a\u5bf9\u8c61\u7c7b\u522b\u3002\u8fd9\u79cd\u9650\u5236\u6027\u7684\u76d1\u7763\u5f62\u5f0f\u9650\u5236\u4e86\u5b83\u4eec\u7684\u901a\u7528\u6027\u548c\u53ef\u7528\u6027\uff0c\u56e0\u4e3a<strong>\u9700\u8981\u989d\u5916\u7684\u6807\u8bb0\u6570\u636e<\/strong>\u6765\u6307\u5b9a\u4efb\u4f55\u5176\u4ed6\u89c6\u89c9\u6982\u5ff5\u3002<strong>\u76f4\u63a5\u4ece\u5173\u4e8e\u56fe\u50cf\u7684\u539f\u59cb\u6587\u672c\u5b66\u4e60<\/strong>\u662f\u4e00\u79cd\u6709\u524d\u666f\u7684\u66ff\u4ee3\u65b9\u6848\uff0c\u5b83\u5229\u7528\u4e86\u66f4\u5e7f\u6cdb\u7684\u76d1\u7763\u6765\u6e90\u3002<\/p>\n<p>\u6211\u4eec\u5c55\u793a\u4e86\u7b80\u5355\u7684\u9884\u8bad\u7ec3\u4efb\u52a1\uff0c\u5373\u9884\u6d4b\u54ea\u4e2a\u6807\u9898\u4e0e\u54ea\u4e2a\u56fe\u50cf\u76f8\u5339\u914d\uff0c\u662f\u4e00\u79cd\u9ad8\u6548\u4e14\u53ef\u6269\u5c55\u7684\u65b9\u6cd5\uff0c\u53ef\u4ee5\u5728\u4ece\u4e92\u8054\u7f51\u4e0a\u6536\u96c6\u7684 4 \u4ebf\u5bf9\uff08\u56fe\u50cf\uff0c\u6587\u672c\uff09\u6570\u636e\u96c6\u4e0a\u4ece\u96f6\u5f00\u59cb\u5b66\u4e60\u6700\u5148\u8fdb\u7684\u56fe\u50cf\u8868\u793a\u3002\u5728\u9884\u8bad\u7ec3\u4e4b\u540e\uff0c\u81ea\u7136\u8bed\u8a00\u88ab\u7528\u6765\u5f15\u7528\u5b66\u4e60\u5230\u7684\u89c6\u89c9\u6982\u5ff5\uff08\u6216\u63cf\u8ff0\u65b0\u7684\u6982\u5ff5\uff09\uff0c\u4ece\u800c\u4f7f\u6a21\u578b\u80fd\u591f <code>zero-shot<\/code> \u8f6c\u79fb\u5230\u4e0b\u6e38\u4efb\u52a1\u3002<\/p>\n<p>\u6211\u4eec\u901a\u8fc7\u5728\u8d85\u8fc7 <code>30<\/code> 
\u4e2a\u4e0d\u540c\u7684\u73b0\u6709\u8ba1\u7b97\u673a\u89c6\u89c9\u6570\u636e\u96c6\u4e0a\u8fdb\u884c\u57fa\u51c6\u6d4b\u8bd5\u6765\u7814\u7a76\u8fd9\u79cd\u65b9\u6cd5\u7684\u6027\u80fd\uff0c\u6db5\u76d6 OCR\u3001\u89c6\u9891\u4e2d\u7684\u52a8\u4f5c\u8bc6\u522b\u3001\u5730\u7406\u5b9a\u4f4d\u4ee5\u53ca\u591a\u79cd\u7c7b\u578b\u7684\u7ec6\u7c92\u5ea6\u5bf9\u8c61\u5206\u7c7b\u7b49\u4efb\u52a1\u3002\u8be5\u6a21\u578b\u5728\u5927\u591a\u6570\u4efb\u52a1\u4e2d\u975e\u5e73\u51e1\u5730\u8f6c\u79fb\uff0c\u5e76\u4e14\u901a\u5e38\u4e0e\u5b8c\u5168\u76d1\u7763\u7684\u57fa\u7ebf\u7ade\u4e89\uff0c\u800c\u65e0\u9700\u4efb\u4f55\u7279\u5b9a\u4e8e\u6570\u636e\u96c6\u7684\u8bad\u7ec3\u3002<\/p>\n<\/blockquote>\n<h3><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-3\"><\/span>\u6a21\u578b\u7ed3\u6784<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/CLIP\u6a21\u578b\u7ed3\u6784.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/CLIP\u6a21\u578b\u7ed3\u6784.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-6\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>Figure 1. <strong>Summary of Our Approach<\/strong> While standard image models jointly train an image feature extractor and a linear classifier to predict some label, CLIP jointly trains <strong>an image encoder<\/strong> and <strong>a text encoder<\/strong> to predict the <strong>correct pairings of a batch of (image, text)<\/strong> training examples. 
At test time, the learned text encoder synthesizes a zero-shot linear classifier by embedding the names or descriptions of the target dataset\u2019s classes.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-6\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>\u56fe 1. <strong>\u6211\u4eec\u65b9\u6cd5\u7684\u603b\u7ed3<\/strong> \u6807\u51c6\u7684\u56fe\u50cf\u6a21\u578b\u5171\u540c\u8bad\u7ec3\u56fe\u50cf\u7279\u5f81\u63d0\u53d6\u5668\u548c\u7ebf\u6027\u5206\u7c7b\u5668\u4ee5\u9884\u6d4b\u67d0\u4e2a\u6807\u7b7e\uff0c\u800c <code>CLIP<\/code> \u5171\u540c\u8bad\u7ec3 <strong>\u56fe\u50cf\u7f16\u7801\u5668<\/strong> \u548c <strong>\u6587\u672c\u7f16\u7801\u5668<\/strong> \u4ee5<strong>\u9884\u6d4b\u4e00\u6279\uff08\u56fe\u50cf\uff0c\u6587\u672c\uff09<\/strong>\u8bad\u7ec3\u793a\u4f8b\u7684\u6b63\u786e\u914d\u5bf9\u3002\u5728\u6d4b\u8bd5\u65f6\uff0c\u5b66\u4e60\u5230\u7684\u6587\u672c\u7f16\u7801\u5668\u901a\u8fc7\u5d4c\u5165\u76ee\u6807\u6570\u636e\u96c6\u7c7b\u522b\u7684\u540d\u79f0\u6216\u63cf\u8ff0\u6765\u5408\u6210\u4e00\u4e2a zero-shot \u7ebf\u6027\u5206\u7c7b\u5668\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-5\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ul>\n<li>CLIP\u7684\u6a21\u578b\u5305\u62ec\u4e24\u4e2a\u90e8\u5206\uff0c\u5373<strong>\u6587\u672c\u7f16\u7801\u5668<\/strong>\uff08Text Encoder\uff09\u548c<strong>\u56fe\u50cf\u7f16\u7801\u5668<\/strong>\uff08Image Encoder\uff09\u3002Text Encoder\u9009\u62e9\u7684\u662f<code>Text Transformer<\/code>\u6a21\u578b\uff1bImage Encoder\u9009\u62e9\u4e86\u4e24\u79cd\u6a21\u578b\uff0c\u4e00\u662f\u57fa\u4e8eCNN\u7684<code>ResNet<\/code>\uff08\u5bf9\u6bd4\u4e86\u4e0d\u540c\u5c42\u6570\u7684ResNet\uff09\uff0c\u4e8c\u662f\u57fa\u4e8e 
<code>Transformer\u7684ViT<\/code>\u3002<\/li>\n<li>CLIP\u5c06\u56fe\u7247-\u6587\u672cpair\u5bf9\u8fdb\u884c\u5bf9\u6bd4\u5b66\u4e60\u8f93\u5165\u5230\u540c\u4e00\u4e2a\u795e\u7ecf\u7f51\u7edc\uff0c\u5c06\u5b83\u4eec\u6620\u5c04\u5230\u540c\u4e00\u4e2a\u5d4c\u5165\u7a7a\u95f4\uff0c\u4ece\u800c\u5b9e\u73b0\u4e86\u56fe\u50cf\u548c\u6587\u672c\u8de8\u6a21\u6001\u7684\u8bed\u4e49\u5bf9\u9f50\u3002<\/li>\n<li>\u7528\u56fe\u7247\u9884\u6d4b\u5bf9\u5e94\u7684\u6587\u672c\uff0c\u7ed3\u679c\u4f1a\u975e\u5e38\u591a\u6837\uff0c\u8bad\u7ec3\u8d77\u6765\u4f1a\u975e\u5e38\u6162\u3002\u800c\u4f7f\u7528\u5bf9\u6bd4\u5b66\u4e60\uff0c\u5224\u65ad\u56fe\u7247\u6587\u672c\u662f\u5426\u662f\u4e00\u5bf9\uff0c\u5c31\u7b80\u5316\u4e86\u4efb\u52a1\u3002<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"%E5%AF%B9%E6%AF%94%E5%AD%A6%E4%B9%A0\"><\/span>\u5bf9\u6bd4\u5b66\u4e60<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\"># image_encoder - ResNet or Vision Transformer\n# text_encoder - CBOW or Text Transformer\n# I[n, h, w, c] - minibatch of aligned images\n# T[n, l] - minibatch of aligned texts\n# W_i[d_i, d_e] - learned proj of image to embed\n# W_t[d_t, d_e] - learned proj of text to embed\n# t - learned temperature parameter\n\n# \u5206\u522b\u63d0\u53d6\u56fe\u50cf\u7279\u5f81\u548c\u6587\u672c\u7279\u5f81\nI_f = image_encoder(I) #[n, d_i]\nT_f = text_encoder(T) #[n, d_t]\n\n# \u5bf9\u4e24\u4e2a\u7279\u5f81\u8fdb\u884c\u7ebf\u6027\u6295\u5c04\uff0c\u5f97\u5230\u76f8\u540c\u7ef4\u5ea6\u7684\u7279\u5f81\uff0c\u5e76\u8fdb\u884cl2\u5f52\u4e00\u5316\nI_e = l2_normalize(np.dot(I_f, W_i), axis=1)\nT_e = l2_normalize(np.dot(T_f, W_t), axis=1)\n\n# \u8ba1\u7b97\u7f29\u653e\u7684\u4f59\u5f26\u76f8\u4f3c\u5ea6\uff1a[n, n]\nlogits = np.dot(I_e, T_e.T) * np.exp(t)\n\n# \u5bf9\u79f0\u7684\u5bf9\u6bd4\u5b66\u4e60\u635f\u5931\uff1a\u7b49\u4ef7\u4e8eN\u4e2a\u7c7b\u522b\u7684cross_entropy_loss\nlabels = np.arange(n) # 
\u5bf9\u89d2\u7ebf\u5143\u7d20\u7684labels\nloss_i = cross_entropy_loss(logits, labels, axis=0)\nloss_t = cross_entropy_loss(logits, labels, axis=1)\nloss = (loss_i + loss_t)\/2<\/code><\/pre>\n<p>CLIP \u5728\u6587\u672c-\u56fe\u50cf\u5bf9\u6570\u636e\u96c6\u4e0a\u7684\u5bf9\u6bd4\u5b66\u4e60\u8bad\u7ec3\u8fc7\u7a0b\u5982\u4e0b\uff1a<\/p>\n<ol>\n<li>\u5bf9\u4e8e\u4e00\u4e2a\u5305\u542b N \u4e2a &lt;\u6587\u672c-\u56fe\u50cf&gt; \u5bf9\u7684\u8bad\u7ec3 <code>batch<\/code>\uff0c\u4f7f\u7528 <code>Text Encoder<\/code> \u548c <code>Image Encoder<\/code> \u63d0\u53d6 <code>N<\/code> \u4e2a\u6587\u672c\u7279\u5f81\u548c <code>N<\/code> \u4e2a\u56fe\u50cf\u7279\u5f81\u3002<br \/>\n<blockquote>\n<p>\u8fd9\u91cc\u5171\u6709 <code>N<\/code> \u4e2a\u6b63\u6837\u672c\uff0c\u5373\u771f\u6b63\u5c5e\u4e8e\u4e00\u5bf9\u7684\u6587\u672c\u548c\u56fe\u50cf\uff08\u77e9\u9635\u4e2d\u7684\u5bf9\u89d2\u7ebf\u5143\u7d20\uff09\uff0c\u800c\u5269\u4f59\u7684 (N^2 &#8211; N) \u4e2a\u6587\u672c-\u56fe\u50cf\u5bf9\u4e3a\u8d1f\u6837\u672c\u3002<\/p>\n<\/blockquote>\n<\/li>\n<li>\u5c06 <code>N<\/code> \u4e2a\u6587\u672c\u7279\u5f81\u548c <code>N<\/code> \u4e2a\u56fe\u50cf\u7279\u5f81\u4e24\u4e24\u7ec4\u5408\uff0c<code>CLIP<\/code> \u6a21\u578b\u4f1a\u9884\u6d4b\u51fa (N^2) \u4e2a\u53ef\u80fd\u7684\u6587\u672c-\u56fe\u50cf\u5bf9\u7684\u76f8\u4f3c\u5ea6\uff0c\u8fd9\u91cc\u7684\u76f8\u4f3c\u5ea6\u76f4\u63a5\u8ba1\u7b97\u6587\u672c\u7279\u5f81\u548c\u56fe\u50cf\u7279\u5f81\u7684 <code>\u4f59\u5f26\u76f8\u4f3c\u6027\uff08cosine similarity\uff09<\/code>\uff0c\u5373\u4e0a\u56fe\u6240\u793a\u7684\u77e9\u9635\u3002<\/li>\n<li>\u90a3\u4e48 <code>CLIP<\/code> \u7684\u8bad\u7ec3\u76ee\u6807\u5c31\u662f\u6700\u5927\u5316 <code>N<\/code> \u4e2a\u6b63\u6837\u672c\u7684\u76f8\u4f3c\u5ea6\uff0c\u540c\u65f6\u6700\u5c0f\u5316 (N^2 &#8211; N) 
\u4e2a\u8d1f\u6837\u672c\u7684\u76f8\u4f3c\u5ea6\uff0c\u5373\u6700\u5927\u5316\u5bf9\u89d2\u7ebf\u4e2d\u84dd\u8272\u7684\u6570\u503c\uff0c\u6700\u5c0f\u5316\u5176\u5b83\u975e\u5bf9\u89d2\u7ebf\u7684\u6570\u503c\uff1a<\/li>\n<\/ol>\n<pre><code class=\"language-katex\">\\text{min}\\left(\\sum_{i=1}^{N}\\sum_{j=1}^{N}(I_i \\cdot T_j)_{i \\neq j} - \\sum_{i=1}^{N}(I_i \\cdot T_i)\\right)<\/code><\/pre>\n<h3><span class=\"ez-toc-section\" id=\"%E8%BF%81%E7%A7%BB%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B%E5%AE%9E%E7%8E%B0zero-shot\"><\/span>\u8fc1\u79fb\u9884\u8bad\u7ec3\u6a21\u578b\u5b9e\u73b0zero-shot<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/zero-shot.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/zero-shot.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-7\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>CLIP is pre-trained to predict if an image and a text snippet are paired together in its dataset. To perform <code>zero-shot<\/code> classification, we reuse this capability. For each dataset, we use the names of all the classes in the dataset as the set of potential text pairings and <strong>predict the most probable (image, text) pair<\/strong> according to CLIP. <\/p>\n<p>In a bit more detail, we <strong>first compute the feature embedding<\/strong> of the image and the feature embedding of the set of possible texts by their respective encoders. <strong>The cosine similarity of these embeddings is then calculated<\/strong>, scaled by a temperature parameter \u03c4, and <strong>normalized into a probability distribution<\/strong> via a softmax. 
Note that this prediction layer is a multinomial logistic regression classifier with L2-normalized inputs, L2-normalized weights, no bias, and temperature scaling. <\/p>\n<p>When interpreted this way, the image encoder is the computer vision backbone which computes a feature representation for the image and the text encoder is a hypernetwork (Ha et al., 2016) which generates the weights of a linear classifier based on the text specifying the visual concepts that the classes represent. Lei Ba et al. (2015) first introduced a zero-shot image classifier of this form while the idea of generating a classifier from natural language dates back to at least Elhoseiny et al. (2013). <\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-7\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p>CLIP \u88ab\u9884\u8bad\u7ec3\u4ee5\u9884\u6d4b\u56fe\u50cf\u548c\u6587\u672c\u7247\u6bb5\u662f\u5426\u5728\u5176\u6570\u636e\u96c6\u4e2d\u914d\u5bf9\u3002\u4e3a\u4e86\u6267\u884c <code>zero-shot<\/code> \u5206\u7c7b\uff0c\u6211\u4eec\u91cd\u7528\u8fd9\u4e00\u80fd\u529b\u3002\u5bf9\u4e8e\u6bcf\u4e2a\u6570\u636e\u96c6\uff0c\u6211\u4eec\u4f7f\u7528\u6570\u636e\u96c6\u4e2d\u6240\u6709\u7c7b\u522b\u7684\u540d\u79f0\u4f5c\u4e3a\u6f5c\u5728\u6587\u672c\u914d\u5bf9\u7684\u96c6\u5408\uff0c\u5e76\u6839\u636e CLIP \u9884\u6d4b\u6700\u53ef\u80fd\u7684\uff08\u56fe\u50cf\uff0c\u6587\u672c\uff09\u5bf9\u3002<\/p>\n<p>\u66f4\u8be6\u7ec6\u5730\u8bf4\uff0c\u6211\u4eec\u9996\u5148<strong>\u8ba1\u7b97\u56fe\u50cf\u7684\u7279\u5f81\u5d4c\u5165<\/strong>\u548c<strong>\u53ef\u80fd\u6587\u672c\u96c6\u5408\u7684\u7279\u5f81\u5d4c\u5165<\/strong>\u3002\u7136\u540e\u8ba1\u7b97\u8fd9\u4e9b\u5d4c\u5165\u7684<strong>\u4f59\u5f26\u76f8\u4f3c\u5ea6<\/strong>\uff0c\u901a\u8fc7\u6e29\u5ea6\u53c2\u6570 \u03c4 \u8fdb\u884c\u7f29\u653e\uff0c\u5e76\u901a\u8fc7 <code>softmax<\/code> 
\u6b63\u89c4\u5316\u4e3a\u6982\u7387\u5206\u5e03\u3002\u8bf7\u6ce8\u610f\uff0c\u8fd9\u4e2a\u9884\u6d4b\u5c42\u662f\u4e00\u4e2a\u591a\u9879\u5f0f\u903b\u8f91\u56de\u5f52\u5206\u7c7b\u5668\uff0c\u5177\u6709 L2 \u89c4\u8303\u5316\u7684\u8f93\u5165\u3001L2 \u89c4\u8303\u5316\u7684\u6743\u91cd\u3001\u6ca1\u6709\u504f\u7f6e\u548c\u6e29\u5ea6\u7f29\u653e\u3002<\/p>\n<p>\u4ece\u8fd9\u4e2a\u89d2\u5ea6\u89e3\u91ca\u65f6\uff0c\u56fe\u50cf\u7f16\u7801\u5668\u662f\u8ba1\u7b97\u673a\u89c6\u89c9\u7684\u9aa8\u5e72\uff0c\u8ba1\u7b97\u56fe\u50cf\u7684\u7279\u5f81\u8868\u793a\uff0c\u800c\u6587\u672c\u7f16\u7801\u5668\u662f\u4e00\u4e2a\u8d85\u7f51\u7edc\uff08Ha \u7b49\uff0c2016\uff09\uff0c\u6839\u636e\u6307\u5b9a\u7c7b\u522b\u6240\u4ee3\u8868\u7684\u89c6\u89c9\u6982\u5ff5\u751f\u6210\u7ebf\u6027\u5206\u7c7b\u5668\u7684\u6743\u91cd\u3002Lei Ba \u7b49\uff082015\uff09\u9996\u6b21\u5f15\u5165\u4e86\u8fd9\u79cd\u5f62\u5f0f\u7684 <code>zero-shot<\/code> \u56fe\u50cf\u5206\u7c7b\u5668\uff0c\u800c\u4ece\u81ea\u7136\u8bed\u8a00\u751f\u6210\u5206\u7c7b\u5668\u7684\u60f3\u6cd5\u81f3\u5c11\u53ef\u4ee5\u8ffd\u6eaf\u5230 Elhoseiny \u7b49\uff082013\uff09\u3002 <\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-6\"><\/span>\u8bba\u6587\u7406\u89e3<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<ul>\n<li>\u8bad\u7ec3\u540e\u7684CLIP\u5176\u5b9e\u662f\u4e24\u4e2a\u6a21\u578b\uff1a\u89c6\u89c9\u6a21\u578b+\u6587\u672c\u6a21\u578b\uff0c\u4e0eCV\u4e2d\u5e38\u7528\u7684\u5148\u9884\u8bad\u7ec3\u7136\u540e\u5fae\u8c03\u4e0d\u540c\uff0cCLIP\u53ef\u4ee5\u76f4\u63a5\u5b9e\u73b0zero-shot\u7684\u56fe\u50cf\u5206\u7c7b\uff0c\u5373\u4e0d\u9700\u8981\u4efb\u4f55\u8bad\u7ec3\u6570\u636e\uff0c\u5c31\u80fd\u5728\u67d0\u4e2a\u5177\u4f53\u4e0b\u6e38\u4efb\u52a1\u4e0a\u5b9e\u73b0\u5206\u7c7b\u3002<\/li>\n<li>\u7ecf\u8fc7\u5728\u6587\u672c-\u56fe\u50cf\u5bf9\u6570\u636e\u4e0a\u8bad\u7ec3\u7684\u6a21\u578b\uff0cCLIP\u6709\u80fd\u529b\u5224\u65ad\u7ed9\u5b9a\u7684\u6587\u672c\u548c\u56fe\u50cf\u662f\u5426\u5339\u914d\uff0c\u5373\u53ef\u4ee5\u76f4\u63a5\u505a\u56fe\u50cf\u5206\u7c7b\u3002<\/li>\n<li>CLIP\u7684zero-shot\u5206\u7c7b\u8fc7\u7a0b\u5982\u4e0b\uff1a\n<ul>\n<li>\u6839\u636e\u4efb\u52a1\u7684\u5206\u7c7b\u6807\u7b7e\u6784\u5efa\u6bcf\u4e2a\u7c7b\u522b\u7684\u63cf\u8ff0\u6587\u672c(\u4ee5Imagenet\u6709N=1000\u7c7b\u4e3a\u4f8b)\uff1aA photo of {label}\uff0c\u7136\u540e\u5c06\u8fd9\u4e9b\u6587\u672c\u9001\u5165Text Encoder\u5f97\u5230\u5bf9\u5e94\u7684\u6587\u672c\u7279\u5f81\uff0c\u5982\u679c\u7c7b\u522b\u6570\u76ee\u4e3aN\uff0c\u90a3\u4e48\u5c06\u5f97\u5230N\u4e2a\u6587\u672c\u7279\u5f81\uff1b<\/li>\n<li>\u5c06\u8981\u9884\u6d4b\u7684\u56fe\u50cf\u9001\u5165Image 
Encoder\u5f97\u5230\u56fe\u50cf\u7279\u5f81\uff0c\u7136\u540e\u4e0eN\u4e2a\u6587\u672c\u7279\u5f81\u8ba1\u7b97\u7f29\u653e\u7684\u4f59\u5f26\u76f8\u4f3c\u5ea6\uff08\u548c\u8bad\u7ec3\u8fc7\u7a0b\u4e00\u81f4\uff09\uff0c\u7136\u540e\u9009\u62e9\u76f8\u4f3c\u5ea6\u6700\u5927\u7684\u6587\u672c\u5bf9\u5e94\u7684\u7c7b\u522b\u4f5c\u4e3a\u56fe\u50cf\u5206\u7c7b\u9884\u6d4b\u7ed3\u679c\uff0c\u8fdb\u4e00\u6b65\u5730\uff0c\u53ef\u4ee5\u5c06\u8fd9\u4e9b\u76f8\u4f3c\u5ea6\u770b\u6210logits\uff0c\u9001\u5165softmax\u540e\u53ef\u4ee5\u5f97\u5230\u6bcf\u4e2a\u7c7b\u522b\u7684\u9884\u6d4b\u6982\u7387\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%86%85%E5%AE%B9%E5%B0%8F%E7%BB%93\"><\/span>\u5185\u5bb9\u5c0f\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5173\u4e8e <code>Vit<\/code> \u6a21\u578b<\/p>\n<ul>\n<li>\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e2d\u6240\u6d89\u53ca\u7684 <code>ViT<\/code> \u67b6\u6784\uff0c\u63d0\u51fa\u4e86\u4e00\u4e2a\u4f7f\u7528 <code>Transformer<\/code> \u7ed3\u6784\u6765\u5904\u7406\u56fe\u50cf\u7684\u601d\u60f3\u3002<\/li>\n<li>Vit\u67b6\u6784\u7684\u5904\u7406\u8fc7\u7a0b\uff1a\u5c06\u56fe\u50cf\u5206\u5272\u6210\u5c0f\u5757\uff0c\u7136\u540e\u5bf9\u6bcf\u4e2a\u5c0f\u5757\u8fdb\u884c <code>embedding<\/code>\u5e76\u4f5c\u4e3a\u8f93\u5165\u63d0\u4f9b\u7ed9 <code>Transformer<\/code>\uff0c\u4ee5\u6b64\u53d1\u6325 Transformer \u7684\u5e76\u884c\u8ba1\u7b97\u4f18\u52bf\u3002<\/li>\n<li>\u4e3a\u4e86\u6ee1\u8db3\u56fe\u7247\u540e\u7eed\u7684\u5206\u7c7b\u80fd\u529b\uff0c\u91c7\u7528\u4e86 BERT \u7684 [CLS] \u6807\u8bb0\uff0c\u5728Embedding\u5c42\u7684\u7b2c\u4e00\u4e2a\u4f4d\u7f6e\u6dfb\u52a0\u4e86\u4e00\u4e2a [CLS] \u6807\u8bb0\u3002<\/li>\n<\/ul>\n<p>\u5173\u4e8e <code>CLIP<\/code> \u6a21\u578b<\/p>\n<ul>\n<li>CLIP\u7684\u6a21\u578b\u5305\u62ec\u4e24\u4e2a\u90e8\u5206\uff0c\u5373<strong>\u6587\u672c\u7f16\u7801\u5668<\/strong>\uff08Text Encoder\uff09\u548c<strong>\u56fe\u50cf\u7f16\u7801\u5668<\/strong>\uff08Image 
Encoder\uff09\u3002<\/li>\n<li>CLIP\u6a21\u578b\u901a\u8fc7 <code>\u56fe\u7247-\u6587\u672cpair\u5bf9<\/code> \u8fdb\u884c\u5bf9\u6bd4\u5b66\u4e60\uff0c\u5c06\u5b83\u4eec\u6620\u5c04\u5230\u540c\u4e00\u4e2a\u5d4c\u5165\u7a7a\u95f4\uff0c\u4ece\u800c\u5b9e\u73b0\u4e86\u56fe\u50cf\u548c\u6587\u672c\u8de8\u6a21\u6001\u7684\u8bed\u4e49\u5bf9\u9f50\u3002<\/li>\n<li>CLIP\u6a21\u578b\u7684\u6700\u5927\u4eae\u70b9\u662f\uff1a<strong>\u4e0d\u9700\u8981\u4efb\u4f55\u4e0b\u6e38\u4efb\u52a1\u7684\u8bad\u7ec3\u6570\u636e<\/strong>\uff0c<strong>\u76f4\u63a5\u5728\u56fe\u50cf\u548c\u6587\u672c\u7684pair\u5bf9\u4e0a\u8fdb\u884c\u8bad\u7ec3<\/strong>\uff0c\u4ece\u800c\u5b9e\u73b0\u4e86 <code>zero-shot<\/code> \u7684\u56fe\u50cf\u5206\u7c7b\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\"><\/span>\u53c2\u8003\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li><a href=\"https:\/\/blog.csdn.net\/weixin_54338498\/article\/details\/132419567\">CSDN:\u57fa\u7840\u8bba\u6587\u5b66\u4e60\uff084\uff09\u2014\u2014CLIP<\/a><\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u524d\u8a00 \u5728\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day31\uff1a\u591a\u6a21\u6001\u5927 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":33309,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"aside","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"default","adv-header-id-meta":"","stick-header-meta":"default","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"categories":[28],"tags":[73,68],"class_list":["post-33308","post","type-post","status-publish","format-aside","has-post-thumbnail","hentry","category-blog","tag-73","tag-68","post_format-post-format-aside"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v26.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3 - \u4e00\u8d77AI\u6280\u672f<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/17aitech.com\/?p=33308\" \/>\n<script type=\"application\/ld+json\" 
class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/17aitech.com\/?p=33308\",\"url\":\"https:\/\/17aitech.com\/?p=33308\",\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3 - \u4e00\u8d77AI\u6280\u672f\",\"isPartOf\":{\"@id\":\"https:\/\/17aitech.com\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/17aitech.com\/?p=33308#primaryimage\"},\"image\":{\"@id\":\"https:\/\/17aitech.com\/?p=33308#primaryimage\"},\"thumbnailUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png\",\"datePublished\":\"2024-10-14T05:05:20+00:00\",\"dateModified\":\"2024-10-14T05:59:55+00:00\",\"author\":{\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\"},\"breadcrumb\":{\"@id\":\"https:\/\/17aitech.com\/?p=33308#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/17aitech.com\/?p=33308\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/?p=33308#primaryimage\",\"url\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png\",\"contentUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png\",\"width\":992,\"height\":528},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/17aitech.com\/?p=33308#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/17aitech.com\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/17aitech.com\/#website\",\"url\":\"https:\/\/17aitech.com\/\",\"na
me\":\"\u4e00\u8d77AI\u6280\u672f\",\"description\":\"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca\",\"alternateName\":\"\u4e00\u8d77AI\u6280\u672f\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/17aitech.com\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\",\"name\":\"Dongming\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/image\/\",\"url\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"contentUrl\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"caption\":\"Dongming\"},\"description\":\"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002\",\"sameAs\":[\"http:\/\/17aitech.com\"],\"url\":\"https:\/\/17aitech.com\/?page_id=33738&user=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3 - \u4e00\u8d77AI\u6280\u672f","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/17aitech.com\/?p=33308","schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/17aitech.com\/?p=33308","url":"https:\/\/17aitech.com\/?p=33308","name":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3 - \u4e00\u8d77AI\u6280\u672f","isPartOf":{"@id":"https:\/\/17aitech.com\/#website"},"primaryImageOfPage":{"@id":"https:\/\/17aitech.com\/?p=33308#primaryimage"},"image":{"@id":"https:\/\/17aitech.com\/?p=33308#primaryimage"},"thumbnailUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png","datePublished":"2024-10-14T05:05:20+00:00","dateModified":"2024-10-14T05:59:55+00:00","author":{"@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739"},"breadcrumb":{"@id":"https:\/\/17aitech.com\/?p=33308#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/17aitech.com\/?p=33308"]}]},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/?p=33308#primaryimage","url":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png","contentUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/ViT\u6a21\u578b\u7ed3\u6784.png","width":992,"height":528},{"@type":"BreadcrumbList","@id":"https:\/\/17aitech.com\/?p=33308#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/17aitech.com\/"},{"@type":"ListItem","position":2,"n
ame":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day34\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u4e4bViT\u6a21\u578b\u3001CLIP\u6a21\u578b\u8bba\u6587\u9605\u8bfb\u7406\u89e3"}]},{"@type":"WebSite","@id":"https:\/\/17aitech.com\/#website","url":"https:\/\/17aitech.com\/","name":"\u4e00\u8d77AI\u6280\u672f","description":"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca","alternateName":"\u4e00\u8d77AI\u6280\u672f","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/17aitech.com\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739","name":"Dongming","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/#\/schema\/person\/image\/","url":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","contentUrl":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","caption":"Dongming"},"description":"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002","sameAs":["http:\/\/17aitech.com"],"url":"https:\/\/17aitech.com\/?page_id=33738&user=1"}]}},"_links":{"self":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33308","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=33308"}],"version-history":[{"count":8,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33308\/revisions"}],"predecessor-version":[{"id":33324,"
href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33308\/revisions\/33324"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/media\/33309"}],"wp:attachment":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=33308"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=33308"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=33308"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}