{"id":33269,"date":"2024-10-09T16:18:54","date_gmt":"2024-10-09T08:18:54","guid":{"rendered":"https:\/\/17aitech.com\/?p=33269"},"modified":"2024-10-09T16:20:48","modified_gmt":"2024-10-09T08:20:48","slug":"%e3%80%90%e8%af%be%e7%a8%8b%e6%80%bb%e7%bb%93%e3%80%91day32%e4%b8%8a%ef%bc%9a%e5%a4%9a%e6%a8%a1%e6%80%81%e5%a4%a7%e6%a8%a1%e5%9e%8bqwen2%e7%9a%84%e6%b7%b1%e5%85%a5%e4%ba%86%e8%a7%a3","status":"publish","type":"post","link":"https:\/\/17aitech.com\/?p=33269","title":{"rendered":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_78 ez-toc-wrap-left-text counter-hierarchy ez-toc-counter ez-toc-light-blue ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">\u6587\u7ae0\u76ee\u5f55<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 
0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%89%8D%E8%A8%80\" >\u524d\u8a00<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%B5%84%E6%96%99\" >\u8d44\u6599<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\" >\u8bba\u6587\u9605\u8bfb\u7406\u89e3<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E6%A0%B8%E5%BF%83%E8%A6%81%E7%82%B9\" >\u8bba\u6587\u6838\u5fc3\u8981\u70b9<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8D%87%E7%BA%A7%E7%82%B91%EF%BC%9A%E5%8E%9F%E5%A7%8B%E5%8A%A8%E6%80%81%E5%88%86%E8%BE%A8%E7%8E%87\" >\u5347\u7ea7\u70b91\uff1a\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84\" >\u6a21\u578b\u7ed3\u6784<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a 
class=\"ez-toc-link ez-toc-heading-9\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8D%87%E7%BA%A7%E7%82%B92%EF%BC%9A%E5%A4%9A%E6%A8%A1%E6%80%81%E6%97%8B%E8%BD%AC%E4%BD%8D%E7%BD%AE%E5%B5%8C%E5%85%A5\" >\u5347\u7ea7\u70b92\uff1a\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-2\" >\u6a21\u578b\u7ed3\u6784<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-12\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-2\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-13\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-2\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-14\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-2\" >\u8bba\u6587\u7406\u89e3<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-15\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8D%87%E7%BA%A7%E7%82%B93%EF%BC%9A%E7%BB%9F%E4%B8%80%E5%9B%BE%E5%83%8F%E5%92%8C%E8%A7%86%E9%A2%91%E7%9A%84%E7%90%86%E8%A7%A3\" >\u5347\u7ea7\u70b93\uff1a\u7edf\u4e00\u56fe\u50cf\u548c\u89c6\u9891\u7684\u7406\u89e3<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-16\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-3\" >\u8bba\u6587\u539f\u6587<\/a><\/li><li class='ez-toc-page-1 
ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-17\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-3\" >\u8bba\u6587\u7ffb\u8bd1<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-18\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2%E4%BD%BF%E7%94%A8flash_attention\" >\u6a21\u578b\u90e8\u7f72(\u4f7f\u7528flash_attention)<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-19\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%87%86%E5%A4%87%E7%8E%AF%E5%A2%83\" >\u51c6\u5907\u73af\u5883<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-20\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%8B%89%E5%8F%96%E4%BB%A3%E7%A0%81\" >\u62c9\u53d6\u4ee3\u7801<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-21\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%AE%89%E8%A3%85flash_attention\" >\u5b89\u88c5flash_attention<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-22\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%BC%95%E5%85%A5%E7%9B%B8%E5%85%B3%E5%BA%93\" >\u5f15\u5165\u76f8\u5173\u5e93<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-23\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8A%A0%E8%BD%BD%E6%A8%A1%E5%9E%8B\" >\u52a0\u8f7d\u6a21\u578b<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-24\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%A8%A1%E5%9E%8B%E5%BD%A2%E7%8A%B6\" >\u6a21\u578b\u5f62\u72b6<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-25\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8A%A0%E8%BD%BDprocessor\" >\u52a0\u8f7dprocessor<\/a><ul 
class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-26\" href=\"https:\/\/17aitech.com\/?p=33269\/#processor%E9%85%8D%E7%BD%AE\" >processor\u914d\u7f6e<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-27\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%9E%84%E5%BB%BA%E5%AF%B9%E8%AF%9D%E6%A8%A1%E6%9D%BF\" >\u6784\u5efa\u5bf9\u8bdd\u6a21\u677f<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-28\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\" >\u6570\u636e\u9884\u5904\u7406<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-29\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86\" >\u6a21\u578b\u63a8\u7406<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-30\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AF%86%E5%88%ABGif%E5%8A%A8%E5%9B%BE\" >\u8bc6\u522bGif\u52a8\u56fe<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-31\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E8%AF%86%E5%88%AB%E8%A7%86%E9%A2%91\" >\u8bc6\u522b\u89c6\u9891<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-32\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%86%85%E5%AE%B9%E5%B0%8F%E7%BB%93\" >\u5185\u5bb9\u5c0f\u7ed3<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-33\" href=\"https:\/\/17aitech.com\/?p=33269\/#%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\" >\u53c2\u8003\u8d44\u6599<\/a><\/li><\/ul><\/nav><\/div>\n<h2><span class=\"ez-toc-section\" id=\"%E5%89%8D%E8%A8%80\"><\/span>\u524d\u8a00<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5728\u4e0a\u4e00\u7ae0<a 
href=\"https:\/\/17aitech.com\/?p=32899\">\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day31\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u521d\u6b65\u4e86\u89e3<\/a>\u4e2d\uff0c\u6211\u4eec\u5728\u4e91\u670d\u52a1\u5668\u4e0a\u90e8\u7f72\u4e86Qwen2-VL-2B\u6a21\u578b\uff0c\u521d\u6b65\u4f53\u9a8c\u4e86Qwen2\u7684\u591a\u6a21\u6001\u80fd\u529b\uff0c\u672c\u7ae0\u6211\u4eec\u5c06\u6df1\u5165\u4e86\u89e3Qwen2-VL\u5e76\u4f7f\u7528\u591a\u6a21\u6001\u5bf9\u4e8e\u89c6\u9891\u7684\u5904\u7406\u80fd\u529b\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E8%B5%84%E6%96%99\"><\/span>\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p><strong>\u8bba\u6587\u6807\u9898<\/strong>\uff1a\u300aQwen2-VL: Enhancing Vision-Language Model&#8217;s Perception of the World at Any Resolution\u300b<br \/>\n<strong>\u8bba\u6587\u5730\u5740<\/strong>\uff1a<a href=\"https:\/\/arxiv.org\/pdf\/2409.12191\">https:\/\/arxiv.org\/pdf\/2409.12191<\/a><\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E7%90%86%E8%A7%A3\"><\/span>\u8bba\u6587\u9605\u8bfb\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<h3><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E6%A0%B8%E5%BF%83%E8%A6%81%E7%82%B9\"><\/span>\u8bba\u6587\u6838\u5fc3\u8981\u70b9<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u636eQwen2-VL\u7684\u8bba\u6587\u4e2d\u4ecb\u7ecd\uff0c\u8be5\u6a21\u578b\u4e3a\u4e86\u8fdb\u4e00\u6b65\u589e\u5f3a\u6a21\u578b\u5bf9\u89c6\u9891\u4e2d\u89c6\u89c9\u4fe1\u606f\u7684\u6709\u6548\u611f\u77e5\u548c\u7406\u89e3\u80fd\u529b\uff0c\u5f15\u5165\u4e86\u4e09\u4e2a\u5173\u952e\u7684\u521b\u65b0\u5347\u7ea7\uff1a<\/p>\n<ol>\n<li><strong>\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387<\/strong>\uff1a\u8be5\u529f\u80fd\u5141\u8bb8\u6a21\u578b\u5904\u7406\u4efb\u610f\u5206\u8fa8\u7387\u7684\u56fe\u50cf\uff0c\u800c\u4e0d\u9700\u8981\u8c03\u6574\u6a21\u578b\u7ed3\u6784\u3002<\/li>\n<li><strong>\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165<\/strong>\uff1a\u8be5\u529f\u80fd\u901a\u8fc7\u65f6\u95f4\u3001\u9ad8\u5ea6\u3001\u5bbd\u5ea6\u4e09\u4e2a\u7ef4\u5ea6\u6765\u5bf9\u4f4d\u7f6e\u8fdb\u884cembedding\uff0c\u4ece\u800c\u5efa\u6a21\u4e86\u591a\u6a21\u6001\u8f93\u5165\u7684\u4f4d\u7f6e\u4fe1\u606f\u3002<\/li>\n<li><strong>\u7edf\u4e00\u56fe\u50cf\u548c\u89c6\u9891\u7684\u7406\u89e3<\/strong>\uff1a\u901a\u8fc7\u6df7\u5408\u8bad\u7ec3\u65b9\u6cd5\u7684\u65b9\u5f0f\uff0c\u7ed3\u5408\u56fe\u50cf\u548c\u89c6\u9891\u6570\u636e\uff0c\u786e\u4fdd\u5728\u56fe\u50cf\u7406\u89e3\u548c\u89c6\u9891\u7406\u89e3\u65b9\u9762\u5177\u6709\u4e13\u4e1a\u6c34\u5e73\u3002<\/li>\n<\/ol>\n<h3><span class=\"ez-toc-section\" id=\"%E5%8D%87%E7%BA%A7%E7%82%B91%EF%BC%9A%E5%8E%9F%E5%A7%8B%E5%8A%A8%E6%80%81%E5%88%86%E8%BE%A8%E7%8E%87\"><\/span>\u5347\u7ea7\u70b91\uff1a\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84\"><\/span>\u6a21\u578b\u7ed3\u6784<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" 
src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>Naive Dynamic Resolution<\/strong>  A key architectural improvement in Qwen2-VL is the introduction of naive dynamic resolution support (Dehghani et al., 2024). Unlike Qwen-VL, Qwen2-VL can now process images of any resolution, dynamically converting them into a variable number of visual tokens. To support this feature, we modified ViT by removing the original absolute position embeddings and introducing 2D-RoPE (Su et al., 2024; Su, 2021) to <strong>capture the two-dimensional positional information of images.<\/strong> At the inference stage, images of varying resolutions are <strong>packed into a single sequence<\/strong>, with the packed length controlled to limit GPU memory usage. Furthermore, to reduce the visual tokens of each image, a simple MLP layer is employed after the ViT to compress adjacent 2 \u00d7 2 tokens into a single token, with the special &lt;|vision_start|&gt; and &lt;|vision_end|&gt; tokens placed at the beginning and end of the compressed visual tokens. 
As a result, an image with a resolution of 224 \u00d7 224, encoded with a ViT using patch_size=14, will be compressed to 66 tokens before entering LLM.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387(Naive Dynamic Resolution)<\/strong>\uff1a<code>Qwen2-VL<\/code> \u67b6\u6784\u6539\u8fdb\u7684\u5173\u952e\u4e4b\u4e00\u3002\u4e0e\u5b83\u7684\u524d\u8eab\u4e0d\u540c\uff0cQwen2-VL\u73b0\u5728\u53ef\u4ee5\u5904\u7406\u4efb\u4f55\u5206\u8fa8\u7387\u7684\u56fe\u50cf\uff0c\u5e76\u4e14\u80fd\u591f\u5c06\u5b83\u4eec\u52a8\u6001\u8f6c\u6362\u4e3a\u53ef\u53d8\u6570\u91cf\u7684\u89c6\u89c9\u4ee4\u724c\u3002\u4e3a\u4e86\u652f\u6301\u8fd9\u4e00\u529f\u80fd\uff0c\u6211\u4eec\u4fee\u6539\u4e86 <code>ViT<\/code>\uff0c\u5220\u9664\u4e86\u539f\u59cb\u7edd\u5bf9\u4f4d\u7f6e\u5d4c\u5165\uff0c\u5e76<strong>\u5f15\u51652D-RoPE\u6765\u6355\u83b7\u56fe\u50cf\u7684\u4e8c\u7ef4\u4f4d\u7f6e\u4fe1\u606f<\/strong>\u3002\u5728\u63a8\u7406\u9636\u6bb5\uff0c\u5404\u79cd\u5206\u8fa8\u7387\u7684\u56fe\u50cf\u88ab<strong>\u5305\u88c5\u6210\u5355\u4e2a\u5e8f\u5217<\/strong>\uff0c\u5305\u88c5\u957f\u5ea6\u53d7\u63a7\u4ee5\u9650\u5236GPU\u5185\u5b58\u4f7f\u7528\u91cf\u3002\u6b64\u5916\uff0c\u4e3a\u4e86\u51cf\u5c11\u6bcf\u4e2a\u56fe\u50cf\u7684\u89c6\u89c9\u4ee4\u724c\u6570\uff0c\u5728ViT\u4e4b\u540e\u91c7\u7528\u4e00\u4e2a\u7b80\u5355\u7684<code>MLP<\/code>\u5c42\uff0c\u5c06\u76f8\u90bb\u76842\u00d72\u4ee4\u724c\u538b\u7f29\u5230\u4e00\u4e2a\u4ee4\u724c\u4e2d\uff0c\u5176\u4e2d\u7279\u6b8a\u7684 <strong>&lt;|vision_start|&gt;<\/strong> \u548c <strong>&lt;|vision_end|&gt;<\/strong> \u4ee4\u724c\u653e\u7f6e\u5728\u538b\u7f29\u7684\u89c6\u89c9\u4ee4\u724c\u7684\u5f00\u59cb\u548c\u7ed3\u675f\u5904\u3002\u56e0\u6b64\uff0c\u4f7f\u7528 <code>patch_size = 14<\/code> \u7f16\u7801\u7684\u5206\u8fa8\u7387 
<code>224\u00d7224<\/code> \u7684\u56fe\u50cf\u5c06\u5728\u8fdb\u5165LLM\u4e4b\u524d\u88ab\u538b\u7f29\u4e3a <code>66<\/code> \u4e2a\u4ee4\u724c\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ol>\n<li><strong>\u56fe\u50cf\u5206\u5757\uff08Patch\uff09<\/strong>\uff1a<br \/>\n\u5728\u89c6\u89c9 Transformer\uff08ViT\uff09\u4e2d\uff0c\u56fe\u50cf\u4f1a\u88ab\u5212\u5206\u4e3a\u591a\u4e2a\u5c0f\u5757\uff08patches\uff09\u3002<code>patch_size = 14<\/code> \u610f\u5473\u7740\u6bcf\u4e2a\u5c0f\u5757\u7684\u5c3a\u5bf8\u4e3a <code>14x14<\/code> \u50cf\u7d20\u3002<\/li>\n<\/ol>\n<ul>\n<li>\u56fe\u50cf\u5206\u8fa8\u7387\uff1a\u5047\u5982\u8f93\u5165\u7684\u56fe\u50cf\u5206\u8fa8\u7387\u4e3a <code>224\u00d7224<\/code> \u50cf\u7d20\u3002<\/li>\n<li>\u5c0f\u5757\u6570\u91cf\uff1a\n<ul>\n<li>\u6c34\u5e73\u65b9\u5411\uff1a<code>224 \/ 14<\/code> = 16<\/li>\n<li>\u5782\u76f4\u65b9\u5411\uff1a<code>224 \/ 14<\/code> = 16<br \/>\n\u56e0\u6b64\uff0c\u603b\u7684\u5c0f\u5757\u6570\u91cf\u4e3a 16 \u00d7 16 = 256 \u4e2a\u5c0f\u5757\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<ol start=\"2\">\n<li>\n<p><strong>\u538b\u7f29\u89c6\u89c9\u4ee4\u724c<\/strong>\uff1a<br \/>\n\u4e3a\u4e86\u51cf\u5c11\u8f93\u5165\u5230\u6a21\u578b\u4e2d\u7684\u89c6\u89c9\u4ee4\u724c\u6570\u91cf\uff0c<code>Qwen2-VL<\/code> \u4f7f\u7528\u4e86\u4e00\u4e2a\u7b80\u5355\u7684 <code>MLP<\/code> \u5c42\uff0c\u5c06\u76f8\u90bb\u7684 <code>2x2<\/code> \u4e2a\u5c0f\u5757\u538b\u7f29\u4e3a\u4e00\u4e2a\u89c6\u89c9\u4ee4\u724c\u3002<br \/>\n\u7531\u4e8e\u6bcf\u4e2a <code>2x2<\/code> \u7684\u5c0f\u5757\u5305\u542b <code>4<\/code> \u4e2a\u5c0f\u5757\uff0c\u56e0\u6b64 <code>256<\/code> \u4e2a\u5c0f\u5757\u88ab\u538b\u7f29\u4e3a <code>256 \/ 4<\/code> = 64 \u4e2a\u89c6\u89c9\u4ee4\u724c\u3002<\/p>\n<\/li>\n<li>\n<p><strong>\u7279\u6b8a\u4ee4\u724c<\/strong>\uff1a<br 
\/>\n\u5728\u538b\u7f29\u540e\u7684\u89c6\u89c9\u4ee4\u724c\u5e8f\u5217\u4e2d\uff0c\u6dfb\u52a0\u4e86\u4e24\u4e2a\u7279\u6b8a\u7684\u4ee4\u724c\uff1a<code>&lt;|vision_start|&gt;<\/code> \u548c <code>&lt;|vision_end|&gt;<\/code>\uff0c\u7528\u4e8e\u6807\u8bc6\u89c6\u89c9\u4fe1\u606f\u7684\u5f00\u59cb\u548c\u7ed3\u675f\u3002<br \/>\n\u56e0\u6b64\uff0c\u6700\u7ec8\u7684\u89c6\u89c9\u4ee4\u724c\u6570\u91cf\u4e3a <code>64 + 2<\/code> = 66 \u4e2a\u3002<\/p>\n<\/li>\n<\/ol>\n<h3><span class=\"ez-toc-section\" id=\"%E5%8D%87%E7%BA%A7%E7%82%B92%EF%BC%9A%E5%A4%9A%E6%A8%A1%E6%80%81%E6%97%8B%E8%BD%AC%E4%BD%8D%E7%BD%AE%E5%B5%8C%E5%85%A5\"><\/span>\u5347\u7ea7\u70b92\uff1a\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E7%BB%93%E6%9E%84-2\"><\/span>\u6a21\u578b\u7ed3\u6784<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u591a\u6a21\u6001\u65cb\u8f6c\u5d4c\u5165.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u591a\u6a21\u6001\u65cb\u8f6c\u5d4c\u5165.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-2\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>Multimodal Rotary Position Embedding (M-RoPE)<\/strong> Another key architectural enhancement is the innovation of Multimodal Rotary Position Embedding <code>(M-RoPE)<\/code>. Unlike the traditional <code>1D-RoPE<\/code> in LLMs, which is limited to encoding one-dimensional positional information, M-RoPE effectively models the positional information of multimodal inputs. 
This is achieved by deconstructing the original rotary embedding into three components: <code>temporal<\/code>, <code>height<\/code>, and <code>width<\/code>.<br \/>\n<strong>For text inputs<\/strong>, these components utilize identical position IDs, making M-RoPE functionally equivalent to 1D-RoPE (Su, 2024).<br \/>\n<strong>When processing images<\/strong>, the temporal IDs of each visual token remain constant, while distinct IDs are assigned to the height and width components based on the token\u2019s position in the image.<br \/>\n<strong>For videos<\/strong>, which are treated as sequences of frames, the temporal ID increments for each frame, while the height and width components follow the same ID assignment pattern as images. In scenarios where the model\u2019s input encompasses multiple modalities, position numbering for each modality is initialized by incrementing the maximum position ID of the preceding modality by one. An illustration of M-RoPE is shown in Figure 3. M-RoPE not only enhances the modeling of positional information but also reduces the value of position IDs for images and videos, enabling the model to extrapolate to longer sequences during inference.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-2\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165\uff08M-RoPE\uff09<\/strong>\uff1a\u53e6\u4e00\u4e2a\u5173\u952e\u7684\u67b6\u6784\u589e\u5f3a\u662f\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165 (M-RoPE) \u7684\u521b\u65b0\u3002\u4e0e\u5927\u578b\u8bed\u8a00\u6a21\u578b\u4e2d\u7684\u4f20\u7edf 1D-RoPE \u4e0d\u540c\uff0c\u5b83\u4ec5\u9650\u4e8e\u7f16\u7801\u4e00\u7ef4\u4f4d\u7f6e\u4fe1\u606f\uff0cM-RoPE 
\u6709\u6548\u5730\u5efa\u6a21\u4e86\u591a\u6a21\u6001\u8f93\u5165\u7684\u4f4d\u7f6e\u4fe1\u606f\u3002\u8fd9\u901a\u8fc7\u5c06\u539f\u59cb\u65cb\u8f6c\u5d4c\u5165\u5206\u89e3\u4e3a\u4e09\u4e2a\u7ec4\u4ef6\uff1a<code>\u65f6\u95f4<\/code>\u3001<code>\u9ad8\u5ea6<\/code> \u548c <code>\u5bbd\u5ea6<\/code> \u6765\u5b9e\u73b0\u3002<br \/>\n<strong>\u5bf9\u4e8e\u6587\u672c\u8f93\u5165<\/strong>\uff0c\u8fd9\u4e9b\u7ec4\u4ef6\u4f7f\u7528\u76f8\u540c\u7684\u4f4d\u7f6eID\uff0c\u4f7fM-RoPE\u529f\u80fd\u4e0a\u7b49\u540c\u4e8e1D-RoPE\u3002<br \/>\n<strong>\u5728\u5904\u7406\u56fe\u50cf\u65f6<\/strong>\uff0c\u6bcf\u4e2a\u89c6\u89c9\u4ee4\u724c\u7684<strong>\u65f6\u95f4ID\u4fdd\u6301\u4e0d\u53d8<\/strong>\uff0c\u800c\u9ad8\u5ea6\u548c\u5bbd\u5ea6\u7ec4\u4ef6\u6839\u636e\u4ee4\u724c\u5728\u56fe\u50cf\u4e2d\u7684\u4f4d\u7f6e\u5206\u914d\u4e0d\u540c\u7684ID\u3002<br \/>\n<strong>\u5bf9\u4e8e\u89c6\u9891<\/strong>\uff0c\u8fd9\u4e9b\u88ab\u5f53\u4f5c\u5e27\u5e8f\u5217\u6765\u5904\u7406\u7684\u89c6\u9891\uff0c\u6bcf\u5e27\u7684<strong>\u65f6\u95f4ID\u9012\u589e<\/strong>\uff0c\u800c\u9ad8\u5ea6\u548c\u5bbd\u5ea6\u7ec4\u4ef6\u9075\u5faa\u4e0e\u56fe\u50cf\u76f8\u540c\u7684ID\u5206\u914d\u6a21\u5f0f\u3002\u5728\u6a21\u578b\u8f93\u5165\u5305\u542b\u591a\u4e2a\u6a21\u6001\u7684\u60c5\u51b5\u4e0b\uff0c\u6bcf\u4e2a\u6a21\u6001\u7684\u4f4d\u7f6e\u7f16\u53f7\u901a\u8fc7\u5c06\u524d\u4e00\u6a21\u6001\u7684\u6700\u5927\u4f4d\u7f6eID\u52a0\u4e00\u8fdb\u884c\u521d\u59cb\u5316\u3002\u56fe3\u663e\u793a\u4e86M-RoPE\u7684\u793a\u4f8b\u3002M-RoPE\u4e0d\u4ec5\u589e\u5f3a\u4e86\u5bf9\u4f4d\u7f6e\u4fe1\u606f\u7684\u5efa\u6a21\u80fd\u529b\uff0c\u800c\u4e14\u964d\u4f4e\u4e86\u56fe\u50cf\u548c\u89c6\u9891\u4e2d\u4f4d\u7f6eID\u7684\u6570\u503c\uff0c\u4f7f\u5f97\u6a21\u578b\u80fd\u591f\u5728\u63a8\u7406\u671f\u95f4\u6269\u5c55\u5230\u66f4\u957f\u7684\u5e8f\u5217\u3002<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" 
id=\"%E8%AE%BA%E6%96%87%E7%90%86%E8%A7%A3-2\"><\/span>\u8bba\u6587\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<ol>\n<li>\n<p><strong>Position Embedding<\/strong>\uff1a\u4f4d\u7f6e\u5d4c\u5165\u662f\u7528\u6765\u544a\u8bc9\u6a21\u578b\u8f93\u5165\u6570\u636e\u4e2d\u6bcf\u4e2a\u5143\u7d20\u7684\u4f4d\u7f6e\u3002\u6bd4\u5982\uff0c\u5728\u5904\u7406\u6587\u672c\u65f6\uff0c\u6a21\u578b\u9700\u8981\u77e5\u9053\u201c\u6211\u7231\u4f60\u201d\u4e2d\u7684\u201c\u6211\u201d\u662f\u7b2c\u4e00\u4e2a\u8bcd\uff0c\u201c\u7231\u201d\u662f\u7b2c\u4e8c\u4e2a\u8bcd\u3002<\/p>\n<\/li>\n<li>\n<p><strong>M-RoPE<\/strong>\uff1aQwen2-VL \u5f15\u5165\u7684 M-RoPE \u5219\u662f\u4e00\u4e2a\u66f4\u590d\u6742\u7684\u7cfb\u7edf\uff0c\u5b83\u4e0d\u4ec5\u80fd\u5904\u7406\u6587\u672c\uff0c\u8fd8\u80fd\u5904\u7406\u56fe\u50cf\u548c\u89c6\u9891\u3002M-RoPE \u5c06\u4f4d\u7f6e\u5d4c\u5165\u5206\u4e3a\u4e09\u4e2a\u90e8\u5206\uff1a<\/p>\n<ul>\n<li><strong>\u65f6\u95f4<\/strong>\uff1a\u9002\u7528\u4e8e\u89c6\u9891\u6216\u5e8f\u5217\u6570\u636e\uff0c\u8868\u793a\u5e27\u7684\u987a\u5e8f\u3002<\/li>\n<li><strong>\u9ad8\u5ea6\u548c\u5bbd\u5ea6<\/strong>\uff1a\u9002\u7528\u4e8e\u56fe\u50cf\uff0c\u8868\u793a\u56fe\u50cf\u4e2d\u6bcf\u4e2a\u89c6\u89c9\u4ee4\u724c\u7684\u4f4d\u7f6e\uff08\u884c\u548c\u5217\uff09\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p><strong>\u4e0d\u540c\u6570\u636e\u7c7b\u578b\u7684\u5904\u7406<\/strong>\uff1a<\/p>\n<ul>\n<li><strong>\u5bf9\u4e8e\u6587\u672c\u8f93\u5165<\/strong>:<\/li>\n<li>\u76f8\u540c\u7684\u4f4d\u7f6eID\uff1a\u6587\u672c\u4e2d\u6bcf\u4e2a\u8bcd\u7684\u65f6\u95f4\u3001\u9ad8\u5ea6\u3001\u5bbd\u5ea6\u4e09\u4e2a\u5206\u91cf\u4f7f\u7528\u76f8\u540c\u7684\u4f4d\u7f6eID\u3002\u4f8b\u5982\uff0c\u53e5\u5b50\u4e2d\u7684\u8bcd\u6309\u987a\u5e8f\u7f16\u53f7\u3002<\/li>\n<li><strong>\u5bf9\u4e8e\u56fe\u50cf\u8f93\u5165<\/strong><\/li>\n<li><code>\u56fa\u5b9a\u7684\u65f6\u95f4ID<\/code>\uff1a\u56fe\u50cf\u4e2d\u7684\u6bcf\u4e2a\u89c6\u89c9\u4ee4\u724c\uff08\u5c0f\u5757\uff09\u4fdd\u6301\u76f8\u540c\u7684\u65f6\u95f4ID\uff0c\u4f46\u9ad8\u5ea6\
u548c\u5bbd\u5ea6\u7684ID\u4f1a\u6839\u636e\u5b83\u4eec\u5728\u56fe\u50cf\u4e2d\u7684\u4f4d\u7f6e\u4e0d\u540c\u800c\u53d8\u5316\u3002\u4f8b\u5982\uff0c\u5de6\u4e0a\u89d2\u7684\u5c0f\u5757\u53ef\u80fd\u662f\uff081,1\uff09\uff0c\u800c\u53f3\u4e0b\u89d2\u7684\u5c0f\u5757\u53ef\u80fd\u662f\uff0816,16\uff09\u3002<\/li>\n<li><strong>\u5bf9\u4e8e\u89c6\u9891\u8f93\u5165<\/strong><\/li>\n<li><code>\u9012\u589e\u7684\u65f6\u95f4ID<\/code>\uff1a\u89c6\u9891\u4e2d\u7684\u6bcf\u4e00\u5e27\u90fd\u6709\u4e0d\u540c\u7684\u65f6\u95f4ID\uff0c\u8868\u793a\u5b83\u4eec\u5728\u5e8f\u5217\u4e2d\u7684\u987a\u5e8f\u3002\u540c\u65f6\uff0c\u6bcf\u5e27\u7684\u9ad8\u5ea6\u548c\u5bbd\u5ea6\u7ec4\u4ef6\u4ecd\u7136\u6839\u636e\u56fe\u50cf\u7684\u4f4d\u7f6e\u5206\u914dID\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p><strong>\u6a21\u6001\u4e4b\u95f4\u7684ID\u521d\u59cb\u5316<\/strong>:<br \/>\n\u5f53\u6a21\u578b\u5904\u7406\u591a\u4e2a\u6a21\u6001\u65f6\uff0c\u6bd4\u5982\u540c\u65f6\u5904\u7406<strong>\u6587\u672c<\/strong>\u548c<strong>\u56fe\u50cf<\/strong>\uff0c<code>M-RoPE<\/code> \u4f1a\u4e3a\u6bcf\u4e2a\u6a21\u6001\u5206\u914d<strong>\u4e0d\u540c\u7684\u8d77\u59cb\u4f4d\u7f6eID<\/strong>\u3002\u4f8b\u5982\uff0c\u5904\u7406\u56fe\u50cf\u65f6\uff0c\u56fe\u50cf\u7684\u6700\u5927ID\u4f1a\u5728\u5904\u7406\u6587\u672c\u65f6\u88ab\u589e\u52a0\uff0c\u4ee5\u907f\u514d\u51b2\u7a81\u3002<\/p>\n<\/li>\n<\/ol>\n<h3><span class=\"ez-toc-section\" id=\"%E5%8D%87%E7%BA%A7%E7%82%B93%EF%BC%9A%E7%BB%9F%E4%B8%80%E5%9B%BE%E5%83%8F%E5%92%8C%E8%A7%86%E9%A2%91%E7%9A%84%E7%90%86%E8%A7%A3\"><\/span>\u5347\u7ea7\u70b93\uff1a\u7edf\u4e00\u56fe\u50cf\u548c\u89c6\u9891\u7684\u7406\u89e3<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E5%8E%9F%E6%96%87-3\"><\/span>\u8bba\u6587\u539f\u6587<span class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>Unified Image and Video Understanding<\/strong> Qwen2-VL employs a <strong>mixed training<\/strong> regimen 
incorporating both image and video data, ensuring proficiency in image understanding and video comprehension. To preserve video information as completely as possible, we sampled each video at two frames per second. Additionally, we integrated <code>3D convolutions<\/code> (Carreira and Zisserman, 2017) with a depth of two to process video inputs, allowing the model to handle 3D tubes instead of 2D patches, thus enabling it to process more video frames without increasing the sequence length (Arnab et al., 2021). For consistency, each image is treated as two identical frames. To balance the computational demands of long video processing with overall training efficiency, we dynamically adjust the resolution of each video frame, limiting the total number of tokens per video to 16384. This training approach strikes a balance between the model\u2019s ability to comprehend long videos and training efficiency.<\/p>\n<\/blockquote>\n<h4><span class=\"ez-toc-section\" id=\"%E8%AE%BA%E6%96%87%E7%BF%BB%E8%AF%91-3\"><\/span>\u8bba\u6587\u7ffb\u8bd1<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<blockquote>\n<p><strong>\u7edf\u4e00\u56fe\u50cf\u548c\u89c6\u9891\u7406\u89e3<\/strong>\uff1a\u91c7\u7528\u6df7\u5408\u8bad\u7ec3\u65b9\u6cd5\uff0c\u7ed3\u5408\u56fe\u50cf\u548c\u89c6\u9891\u6570\u636e\uff0c\u786e\u4fdd\u5728\u56fe\u50cf\u7406\u89e3\u548c\u89c6\u9891\u7406\u89e3\u65b9\u9762\u5177\u6709\u4e13\u4e1a\u6c34\u5e73\u3002\u4e3a\u4e86\u5c3d\u53ef\u80fd\u5b8c\u6574\u5730\u4fdd\u7559\u89c6\u9891\u4fe1\u606f\uff0c\u6211\u4eec\u6bcf\u79d2\u5bf9\u6bcf\u4e2a\u89c6\u9891\u8fdb\u884c\u4e24\u6b21\u91c7\u6837\u3002\u6b64\u5916\uff0c\u6211\u4eec\u8fd8\u96c6\u6210\u6df1\u5ea6\u4e3a\u4e24\u5c42\u7684<code>\u4e09\u7ef4\u5377\u79ef<\/code>\u6765\u5904\u7406\u89c6\u9891\u8f93\u5165\uff0c\u5141\u8bb8\u6a21\u578b\u5904\u7406\u4e09\u7ef4\u7ba1\u72b6\u7ed3\u6784\u800c\u4e0d\u662f\u4e8c\u7ef4\u5757\uff0c\u4ece\u800c\u4f7f\u5176\u80fd\u591f\u5904\u7406\u66f4\u591a\u89c6\u9891\u5e27\u800c\u65e0\u9700\u589e\u52a0\u5e8f\u5217\u957f\u5ea6\u3002\u4e3a\u4e86\u4fdd\u6301\u4e00\u81f4\uff0c\u6bcf\u5f20\u56fe\u7247\u90fd\u88ab\u89c6\u4e3a\u4e24\u5f20\u76f8\u540c\u7684\u5e27\u3002\u4e3a\u4e86\u5e73\u8861\u957f\u89c6\u9891\u5904\u7406\u6240\u9700\u7684\u8ba1\u7b97\u9700\u6c42\u4e0e\u6574\u4f53\u8bad\u7ec3\u6548\u7387\uff0c\u6211\u4eec\u52a8\u6001\u8c03\u6574\u6bcf\u4e2a\u89c6\u9891\u5e27\u7684\u5206\u8fa8\u7387\uff0c\u9650\u5236\u6bcf\u4e2a\u89c6\u9891\u4e2d\u7684\u603b\u4ee4\u724c\u6570\u91cf\u4e0d\u8d85\u8fc7 16384\u3002\u8fd9\u79cd\u8bad\u7ec3\u65b9\u6cd5\u5728\u6a21\u578b\u7406\u89e3\u957f\u89c6\u9891\u7684\u80fd\u529b\u548c\u8bad\u7ec3\u6548\u7387\u4e4b\u95f4\u53d6\u5f97\u4e86\u5e73\u8861\u3002<\/p>\n<\/blockquote>\n<h2><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2%E4%BD%BF%E7%94%A8flash_attention\"><\/span>\u6a21\u578b\u90e8\u7f72(\u4f7f\u7528flash_attention)<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>\u5728\u4e0a\u4e00\u7ae0<a 
href=\"https:\/\/17aitech.com\/?p=32899#toc-29\">\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day31\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u521d\u6b65\u4e86\u89e3<\/a>\uff0c\u6211\u4eec\u90e8\u7f72\u4e86Qwen2-VL\u6a21\u578b\u3002<br \/>\n\u7531\u4e8e\u591a\u6a21\u6001\u5927\u6a21\u578b\u6bd4\u8f83\u5360\u7528GPU\u663e\u5b58\uff0c\u6211\u4eec\u4f7f\u7528<code>flash_attention<\/code>\u6765\u52a0\u901f\u63a8\u7406\uff0c\u4ee5\u51cf\u5c11\u663e\u5b58\u5360\u7528\u3002<\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E5%87%86%E5%A4%87%E7%8E%AF%E5%A2%83\"><\/span>\u51c6\u5907\u73af\u5883<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u7b2c\u4e00\u6b65\uff1a\u542f\u52a8ModelScope\u5e73\u53f0\u7684PAI-DSW\u7684GPU\u73af\u5883<\/p>\n<pre><code class=\"language-bash\"># \u68c0\u67e5CUDA\u7684\u7248\u672c\nnvcc --version\n\n# \u68c0\u67e5pytorch\u7248\u672c\nimport torch\nprint(torch.__version__)\nprint(torch.cuda.is_available())<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u7cfb\u7edf\u7248\u672c.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u7cfb\u7edf\u7248\u672c.png\" alt=\"\" \/><\/a><\/p>\n<p>\u7cfb\u7edf\u7248\u672c\u4e3a CUDA 12.1 \u548c PyTorch 2.3.1<\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E6%8B%89%E5%8F%96%E4%BB%A3%E7%A0%81\"><\/span>\u62c9\u53d6\u4ee3\u7801<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u7b2c\u4e8c\u6b65\uff1a\u4e0b\u8f7d\u901a\u4e49\u5343\u95ee2-VL-2B-Instruct\u6a21\u578b<\/p>\n<pre><code class=\"language-bash\"># \u786e\u4fdd git lfs \u5df2\u5b89\u88c5\ngit lfs install\n\n# \u4e0b\u8f7d\u6a21\u578b\ngit clone https:\/\/www.modelscope.cn\/Qwen\/Qwen2-VL-2B-Instruct.git<\/code><\/pre>\n<h3><span class=\"ez-toc-section\" id=\"%E5%AE%89%E8%A3%85flash_attention\"><\/span>\u5b89\u88c5flash_attention<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n<p>\u7b2c\u4e09\u6b65\uff1a\u5b89\u88c5flash_attention<\/p>\n<pre><code class=\"language-bash\">pip install flash-attn<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/flash-atten\u5b89\u88c5.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/flash-atten\u5b89\u88c5.png\" alt=\"\" \/><\/a><\/p>\n<h3><span class=\"ez-toc-section\" id=\"%E5%BC%95%E5%85%A5%E7%9B%B8%E5%85%B3%E5%BA%93\"><\/span>\u5f15\u5165\u76f8\u5173\u5e93<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\">from transformers import Qwen2VLForConditionalGeneration\nfrom transformers import AutoTokenizer\nfrom transformers import AutoProcessor\nimport torch\nfrom qwen_vl_utils import process_vision_info<\/code><\/pre>\n<h3><span class=\"ez-toc-section\" id=\"%E5%8A%A0%E8%BD%BD%E6%A8%A1%E5%9E%8B\"><\/span>\u52a0\u8f7d\u6a21\u578b<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\"># \u8bbe\u7f6e\u6a21\u578b\u8def\u5f84\nmodel_dir = &quot;Qwen2-VL-2B-Instruct&quot;\n\n# \u4f7f\u7528flash-attention\u52a0\u8f7d\u6a21\u578b\nmodel = Qwen2VLForConditionalGeneration.from_pretrained(\n    model_dir,\n    torch_dtype=torch.bfloat16,\n    attn_implementation=&quot;flash_attention_2&quot;,\n    device_map=&quot;auto&quot;,\n)<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/flash\u52a0\u8f7d\u7ed3\u679c.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/flash\u52a0\u8f7d\u7ed3\u679c.png\" alt=\"\" \/><\/a><\/p>\n<h4><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E5%BD%A2%E7%8A%B6\"><\/span>\u6a21\u578b\u5f62\u72b6<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u5728\u52a0\u8f7d\u6a21\u578b\u540e\uff0c\u5982\u679c\u8f93\u51fa <code>model<\/code>\uff0c\u53ef\u4ee5\u770b\u5230Qwen2\u7684\u6a21\u578b\u7ed3\u6784\u4e3a\uff1a<\/p>\n<pre><code class=\"language-python\">Qwen2VLForConditionalGeneration(\n  (visual): Qwen2VisionTransformerPretrainedModel(\n    (patch_embed): PatchEmbed(\n      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)\n    )\n    (rotary_pos_emb): VisionRotaryEmbedding()\n    (blocks): ModuleList(\n      (0-31): 32 x Qwen2VLVisionBlock(\n        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)\n        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)\n        (attn): VisionFlashAttention2(\n          (qkv): Linear(in_features=1280, out_features=3840, bias=True)\n          (proj): Linear(in_features=1280, out_features=1280, bias=True)\n        )\n        (mlp): VisionMlp(\n          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n          (act): QuickGELUActivation()\n          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n        )\n      )\n    )\n    (merger): PatchMerger(\n      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)\n      (mlp): Sequential(\n        (0): Linear(in_features=5120, out_features=5120, bias=True)\n        (1): GELU(approximate=&#039;none&#039;)\n        (2): Linear(in_features=5120, out_features=1536, bias=True)\n      )\n    )\n  )\n  (model): Qwen2VLModel(\n    (embed_tokens): Embedding(151936, 1536)\n    (layers): ModuleList(\n      (0-27): 28 x Qwen2VLDecoderLayer(\n        (self_attn): Qwen2VLFlashAttention2(\n          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n          
(rotary_emb): Qwen2RotaryEmbedding()\n        )\n        (mlp): Qwen2MLP(\n          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n          (act_fn): SiLU()\n        )\n        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n      )\n    )\n    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n  )\n  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n)<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ul>\n<li>Qwen2-VL \u6a21\u578b\u4e3b\u8981\u7531\u4e24\u4e2a\u90e8\u5206\u7ec4\u6210\uff1a<strong>\u89c6\u89c9\u7f16\u7801\u5668<\/strong> \u548c <strong>\u8bed\u8a00\u6a21\u578b<\/strong>\u3002<\/li>\n<li><strong>\u89c6\u89c9\u7f16\u7801\u5668<\/strong> (Qwen2VisionTransformerPretrainedModel)\uff1a\n<ul>\n<li><strong>Patch Embedding<\/strong>\uff1a\u4f7f\u7528 <code>Conv3d<\/code> \u8fdb\u884c\u56fe\u50cf\u7684embedding\uff0c\u5207\u5206\u4e3a\u591a\u4e2a\u5c0f\u5757\u5e76\u63d0\u53d6\u7279\u5f81\u3002\u5176\u4e2d\u5377\u79ef\u6838\u5927\u5c0f\u4e3a (2, 14, 14)\uff0c\u6b65\u5e45\u4e5f\u4e3a (2, 14, 14)\u3002<\/li>\n<li><strong>Rotary Positional Embedding<\/strong>\uff1a\u5982\u8bba\u6587\u6240\u8ff0\uff0c\u8fdb\u884c\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165\u4ee5\u589e\u5f3a\u89c6\u89c9\u6a21\u578b\u7684\u611f\u77e5\u80fd\u529b\u3002<\/li>\n<li><strong>Transformer Blocks<\/strong>\uff1a\u5305\u542b 32 \u4e2a <code>Qwen2VLVisionBlock<\/code>\uff0c\u6bcf\u4e2a\u5757\u90fd\u6709\u4e24\u4e2a <code>Layer Normalization<\/code> \u5c42\u548c\u4e00\u4e2a <code>\u6ce8\u610f\u529b\u673a\u5236<\/code>\uff0c\u6ce8\u610f\u529b\u673a\u5236\u91c7\u7528 <code>Linear<\/code> \u5c42\u8fdb\u884c <code>QKV\uff08\u67e5\u8be2\u3001\u952e\u3001\u503c\uff09<\/code>\u6620\u5c04\u3002<\/li>\n<li><strong>Patch 
Merger<\/strong>\uff1a\u5bf9\u63d0\u53d6\u7684\u7279\u5f81\u8fdb\u884c\u5408\u5e76\uff0c\u4f7f\u7528 <code>LayerNorm<\/code> \u548c <code>MLP(\u591a\u5c42\u611f\u77e5\u673a)<\/code> \u5904\u7406\u3002<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u8bed\u8a00\u6a21\u578b<\/strong> (Qwen2VLModel)\uff1a\n<ul>\n<li><strong>Token Embedding<\/strong>\uff1a\u4f7f\u7528 <code>Embedding<\/code> \u5c42\u5c06\u8f93\u5165\u7684\u6587\u672c <code>token<\/code> \u8f6c\u6362\u4e3a\u7a20\u5bc6\u5411\u91cf\uff0c\u7ef4\u5ea6\u4e3a 1536\u3002<\/li>\n<li><strong>Decoder Layers<\/strong>\uff1a\u5305\u542b 28 \u4e2a <code>Qwen2VLDecoderLayer<\/code>\uff0c\u6bcf\u5c42\u5177\u6709\u81ea\u6ce8\u610f\u529b\u673a\u5236\u548c MLP\uff1b\u81ea\u6ce8\u610f\u529b\u673a\u5236\uff08<code>Qwen2VLFlashAttention2<\/code>\uff09\u901a\u8fc7 Q\u3001K\u3001V \u7684\u7ebf\u6027\u6620\u5c04\u8fdb\u884c\u6ce8\u610f\u529b\u8ba1\u7b97\uff0c\u91c7\u7528\u65cb\u8f6c\u5d4c\u5165\u589e\u5f3a\u5e8f\u5217\u4fe1\u606f\u3002<\/li>\n<li><strong>Norm Layer<\/strong>:\u4f7f\u7528 <code>Qwen2RMSNorm<\/code> \u8fdb\u884c\u5f52\u4e00\u5316\uff0c\u5e2e\u52a9\u6a21\u578b\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u4fdd\u6301\u7a33\u5b9a\u6027\u3002<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u8f93\u51fa\u5c42<\/strong> (lm_head)\uff1a\n<ul>\n<li>\u6700\u540e\u901a\u8fc7\u4e00\u4e2a\u7ebf\u6027\u5c42\u5c06\u6a21\u578b\u7684\u8f93\u51fa\u6620\u5c04\u56de\u8bcd\u6c47\u8868\u5927\u5c0f\uff08151936\uff09\uff0c\u7528\u4e8e\u751f\u6210\u6587\u672c\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"%E5%8A%A0%E8%BD%BDprocessor\"><\/span>\u52a0\u8f7dprocessor<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\">processor = AutoProcessor.from_pretrained(model_dir)<\/code><\/pre>\n<h4><span class=\"ez-toc-section\" id=\"processor%E9%85%8D%E7%BD%AE\"><\/span>processor\u914d\u7f6e<span 
class=\"ez-toc-section-end\"><\/span><\/h4>\n<p>\u6253\u5370processor\u53ef\u4ee5\u5f97\u5230\u5982\u4e0b\u4fe1\u606f\uff1a<\/p>\n<pre><code class=\"language-python\">Qwen2VLProcessor:\n- image_processor: Qwen2VLImageProcessor {\n  &quot;do_convert_rgb&quot;: true,\n  &quot;do_normalize&quot;: true,\n  &quot;do_rescale&quot;: true,\n  &quot;do_resize&quot;: true,\n  &quot;image_mean&quot;: [\n    0.48145466,\n    0.4578275,\n    0.40821073\n  ],\n  &quot;image_processor_type&quot;: &quot;Qwen2VLImageProcessor&quot;,\n  &quot;image_std&quot;: [\n    0.26862954,\n    0.26130258,\n    0.27577711\n  ],\n  &quot;max_pixels&quot;: 12845056,\n  &quot;merge_size&quot;: 2,\n  &quot;min_pixels&quot;: 3136,\n  &quot;patch_size&quot;: 14,\n  &quot;processor_class&quot;: &quot;Qwen2VLProcessor&quot;,\n  &quot;resample&quot;: 3,\n  &quot;rescale_factor&quot;: 0.00392156862745098,\n  &quot;size&quot;: {\n    &quot;max_pixels&quot;: 12845056,\n    &quot;min_pixels&quot;: 3136\n  },\n  &quot;temporal_patch_size&quot;: 2\n}\n\n- tokenizer: Qwen2TokenizerFast(name_or_path=&#039;Qwen2-VL-2B-Instruct&#039;, vocab_size=151643, model_max_length=32768, is_fast=True, padding_side=&#039;left&#039;, truncation_side=&#039;right&#039;, special_tokens={&#039;eos_token&#039;: &#039;&lt;|im_end|&gt;&#039;, &#039;pad_token&#039;: &#039;&lt;|endoftext|&gt;&#039;, &#039;additional_special_tokens&#039;: [&#039;&lt;|im_start|&gt;&#039;, &#039;&lt;|im_end|&gt;&#039;, &#039;&lt;|object_ref_start|&gt;&#039;, &#039;&lt;|object_ref_end|&gt;&#039;, &#039;&lt;|box_start|&gt;&#039;, &#039;&lt;|box_end|&gt;&#039;, &#039;&lt;|quad_start|&gt;&#039;, &#039;&lt;|quad_end|&gt;&#039;, &#039;&lt;|vision_start|&gt;&#039;, &#039;&lt;|vision_end|&gt;&#039;, &#039;&lt;|vision_pad|&gt;&#039;, &#039;&lt;|image_pad|&gt;&#039;, &#039;&lt;|video_pad|&gt;&#039;]}, clean_up_tokenization_spaces=False),  added_tokens_decoder={\n    151643: AddedToken(&quot;&lt;|endoftext|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, 
normalized=False, special=True),\n    151644: AddedToken(&quot;&lt;|im_start|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151645: AddedToken(&quot;&lt;|im_end|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151646: AddedToken(&quot;&lt;|object_ref_start|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151647: AddedToken(&quot;&lt;|object_ref_end|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151648: AddedToken(&quot;&lt;|box_start|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151649: AddedToken(&quot;&lt;|box_end|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151650: AddedToken(&quot;&lt;|quad_start|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151651: AddedToken(&quot;&lt;|quad_end|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151652: AddedToken(&quot;&lt;|vision_start|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151653: AddedToken(&quot;&lt;|vision_end|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151654: AddedToken(&quot;&lt;|vision_pad|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151655: AddedToken(&quot;&lt;|image_pad|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n    151656: AddedToken(&quot;&lt;|video_pad|&gt;&quot;, rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n}\n\n{\n  &quot;chat_template&quot;: &quot;{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first 
and message[&#039;role&#039;] != &#039;system&#039; %}&lt;|im_start|&gt;system\\nYou are a helpful assistant.&lt;|im_end|&gt;\\n{% endif %}&lt;|im_start|&gt;{{ message[&#039;role&#039;] }}\\n{% if message[&#039;content&#039;] is string %}{{ message[&#039;content&#039;] }}&lt;|im_end|&gt;\\n{% else %}{% for content in message[&#039;content&#039;] %}{% if content[&#039;type&#039;] == &#039;image&#039; or &#039;image&#039; in content or &#039;image_url&#039; in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}&lt;|vision_start|&gt;&lt;|image_pad|&gt;&lt;|vision_end|&gt;{% elif content[&#039;type&#039;] == &#039;video&#039; or &#039;video&#039; in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}&lt;|vision_start|&gt;&lt;|video_pad|&gt;&lt;|vision_end|&gt;{% elif &#039;text&#039; in content %}{{ content[&#039;text&#039;] }}{% endif %}{% endfor %}&lt;|im_end|&gt;\\n{% endif %}{% endfor %}{% if add_generation_prompt %}&lt;|im_start|&gt;assistant\\n{% endif %}&quot;,\n  &quot;processor_class&quot;: &quot;Qwen2VLProcessor&quot;\n}<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ol>\n<li><strong>\u56fe\u50cf\u5904\u7406\u5668 (Qwen2VLImageProcessor)<\/strong><\/li>\n<\/ol>\n<ul>\n<li>\u8f6c\u6362 RGB &#8211; <code>do_convert_rgb<\/code>: \u8bbe\u7f6e\u4e3a true\uff0c\u8868\u793a\u5c06\u8f93\u5165\u56fe\u50cf\u8f6c\u6362\u4e3a RGB \u683c\u5f0f\uff0c\u786e\u4fdd\u989c\u8272\u901a\u9053\u7684\u4e00\u81f4\u6027\u3002<\/li>\n<li>\u5f52\u4e00\u5316 &#8211; <code>do_normalize<\/code>: \u8bbe\u7f6e\u4e3a true\uff0c\u8868\u793a\u5bf9\u56fe\u50cf\u8fdb\u884c\u6807\u51c6\u5316\u5904\u7406\uff0c\u4ee5\u4fbf\u4f7f\u56fe\u50cf\u7279\u5f81\u7684\u5747\u503c\u548c\u65b9\u5dee\u7b26\u5408\u6a21\u578b\u7684\u9884\u671f\u3002<\/li>\n<li>\u91cd\u7f29\u653e &#8211; <code>do_rescale<\/code>: \u8bbe\u7f6e\u4e3a 
true\uff0c\u8868\u793a\u5c06\u56fe\u50cf\u50cf\u7d20\u503c\u7f29\u653e\u5230 [0, 1] \u7684\u8303\u56f4\u3002<\/li>\n<li>\u8c03\u6574\u5927\u5c0f &#8211; <code>do_resize<\/code>: \u8bbe\u7f6e\u4e3a true\uff0c\u8868\u793a\u5c06\u56fe\u50cf\u8c03\u6574\u4e3a\u6a21\u578b\u6240\u9700\u7684\u8f93\u5165\u5c3a\u5bf8\u3002<\/li>\n<li>\u5747\u503c\u548c\u6807\u51c6\u5dee:<br \/>\n<code>image_mean<\/code>: [0.48145466, 0.4578275, 0.40821073]\uff0c\u7528\u4e8e\u56fe\u50cf\u5f52\u4e00\u5316\u7684\u5747\u503c\u3002<br \/>\n<code>image_std<\/code>: [0.26862954, 0.26130258, 0.27577711]\uff0c\u7528\u4e8e\u56fe\u50cf\u5f52\u4e00\u5316\u7684\u6807\u51c6\u5dee\u3002<\/li>\n<li>\u50cf\u7d20\u9650\u5236:<br \/>\n<code>max_pixels<\/code>: 12845056\uff0c\u8868\u793a\u5904\u7406\u7684\u56fe\u50cf\u6700\u5927\u50cf\u7d20\u6570\u3002<br \/>\n<code>min_pixels<\/code>: 3136\uff0c\u8868\u793a\u5904\u7406\u7684\u56fe\u50cf\u6700\u5c0f\u50cf\u7d20\u6570\u3002<\/li>\n<li>\u8865\u4e01\u5927\u5c0f &#8211; <code>patch_size<\/code>: 14\uff0c\u8868\u793a\u5c06\u56fe\u50cf\u5212\u5206\u4e3a\u8865\u4e01\u7684\u5927\u5c0f\u3002<\/li>\n<\/ul>\n<ol start=\"2\">\n<li><strong>\u5206\u8bcd\u5668 (Qwen2TokenizerFast)<\/strong><\/li>\n<\/ol>\n<ul>\n<li>\u8bcd\u6c47\u8868\u5927\u5c0f &#8211; <code>vocab_size<\/code>: 151643\uff0c\u8868\u793a\u5206\u8bcd\u5668\u652f\u6301\u7684\u8bcd\u6c47\u6570\u91cf\u3002<\/li>\n<li>\u6700\u5927\u957f\u5ea6 &#8211; <code>model_max_length<\/code>: 32768\uff0c\u8868\u793a\u6a21\u578b\u80fd\u591f\u5904\u7406\u7684\u6700\u5927\u6587\u672c\u957f\u5ea6\u3002<\/li>\n<li>\u5feb\u901f\u6a21\u5f0f &#8211; <code>is_fast<\/code>: \u8bbe\u7f6e\u4e3a True\uff0c\u8868\u793a\u4f7f\u7528\u5feb\u901f\u5206\u8bcd\u5668\uff0c\u4ee5\u63d0\u9ad8\u5904\u7406\u6548\u7387\u3002<\/li>\n<li>\u586b\u5145\u548c\u622a\u65ad:\n<ul>\n<li><code>padding_side<\/code>: &#8216;left&#8217;\uff0c\u8868\u793a\u5728\u6587\u672c\u5de6\u4fa7\u586b\u5145\u3002<\/li>\n<li><code>truncation_side<\/code>: 
&#8216;right&#8217;\uff0c\u8868\u793a\u5728\u6587\u672c\u53f3\u4fa7\u622a\u65ad\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u7279\u6b8a\u6807\u8bb0 &#8211; <code>special_tokens<\/code>: \u5305\u542b\u591a\u4e2a\u7279\u6b8a\u6807\u8bb0\uff0c\u4f8b\u5982\uff1a\n<ul>\n<li><code> &lt;|vision_start|&gt;<\/code> \u548c <code>&lt;|vision_end|&gt;<\/code>\uff0c\u7528\u4e8e\u6807\u8bc6\u56fe\u50cf\u7684\u5f00\u59cb\u548c\u7ed3\u675f\u3002<\/li>\n<li><code>&lt;|vision_pad|&gt;<\/code>\u3001<code>&lt;|image_pad|&gt;<\/code> \u548c <code>&lt;|video_pad|&gt;<\/code> \u8868\u793a\u56fe\u50cf\u8865\u4e01\u7684\u586b\u5145\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"%E6%9E%84%E5%BB%BA%E5%AF%B9%E8%AF%9D%E6%A8%A1%E6%9D%BF\"><\/span>\u6784\u5efa\u5bf9\u8bdd\u6a21\u677f<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\">messages = [\n    {\n        &quot;role&quot;: &quot;user&quot;,\n        &quot;content&quot;: [\n            {\n                &quot;type&quot;: &quot;image&quot;,\n                &quot;image&quot;: &quot;https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/missile.jpeg&quot;,\n            },\n            {&quot;type&quot;: &quot;text&quot;, &quot;text&quot;: &quot;\u63cf\u8ff0\u4e00\u4e0b\u8fd9\u5f20\u56fe\u7247\uff0c\u53ef\u4ee5\u7684\u8bdd\u7ed9\u51fa\u5177\u4f53\u53c2\u6570\u578b\u53f7.&quot;},\n        ],\n    }\n]<\/code><\/pre>\n<p>\u5907\u6ce8\uff1a<\/p>\n<ul>\n<li>\u56fe\u7247\u8def\u5f84\u4e3ahttps:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/missile.jpeg<\/li>\n<li>qwen_vl_utils\u4f1a\u81ea\u52a8\u4ece\u4ee5\u4e0a\u5730\u5740\u4e0b\u8f7d\u56fe\u7247<\/li>\n<li>\u56fe\u7247\u5185\u5bb9\u5982\u4e0b\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/missile.jpeg\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/missile.jpeg\" alt=\"\u5bfc\u5f39\" \/><\/a><\/li>\n<\/ul>\n<h3><span 
class=\"ez-toc-section\" id=\"%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\"><\/span>\u6570\u636e\u9884\u5904\u7406<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\">text = processor.apply_chat_template(\n    messages, tokenize=False, add_generation_prompt=True\n)\nimage_inputs, video_inputs = process_vision_info(messages)\ninputs = processor(\n    text=[text],\n    images=image_inputs,\n    videos=video_inputs,\n    padding=True,\n    return_tensors=&quot;pt&quot;,\n)\ninputs = inputs.to(&quot;cuda&quot;)<\/code><\/pre>\n<p>\u8bf4\u660e\uff1a<\/p>\n<ul>\n<li>\u67e5\u770btext\u5185\u5bb9\uff0c\u5176\u6784\u6210\u7684\u5bf9\u8bdd\u6a21\u677f\u5185\u5bb9\u4e3a\uff1a\n<pre><code>&#039;&lt;|im_start|&gt;system\\nYou are a helpful assistant.&lt;|im_end|&gt;\\n&lt;|im_start|&gt;user\\n&lt;|vision_start|&gt;&lt;|image_pad|&gt;&lt;|vision_end|&gt;\u63cf\u8ff0\u4e00\u4e0b\u8fd9\u5f20\u56fe\u7247\uff0c\u53ef\u4ee5\u7684\u8bdd\u7ed9\u51fa\u5177\u4f53\u53c2\u6570\u578b\u53f7.&lt;|im_end|&gt;\\n&lt;|im_start|&gt;assistant\\n&#039;<\/code><\/pre>\n<\/li>\n<li>\u5176\u4e2d <code>&lt;|image_pad|&gt;<\/code> \u4e3a\u56fe\u7247\u7684\u586b\u5145\u7b26\uff0c\u7528\u4e8e\u5bf9\u9f50\u3002<\/li>\n<\/ul>\n<h3><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86\"><\/span>\u6a21\u578b\u63a8\u7406<span class=\"ez-toc-section-end\"><\/span><\/h3>\n<pre><code class=\"language-python\">generated_ids = model.generate(**inputs, max_new_tokens=128)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\noutput_text = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)\nprint(output_text)<\/code><\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u56fe\u7247\u8bc6\u522b\u7ed3\u679c.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img 
decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u56fe\u7247\u8bc6\u522b\u7ed3\u679c.png\" alt=\"\" \/><\/a><\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E8%AF%86%E5%88%ABGif%E5%8A%A8%E5%9B%BE\"><\/span>\u8bc6\u522bGif\u52a8\u56fe<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code class=\"language-python\">messages = [\n    {\n        &quot;role&quot;: &quot;user&quot;,\n        &quot;content&quot;: [\n            {\n                &quot;type&quot;: &quot;image&quot;,\n                &quot;image&quot;: &quot;https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/%E6%A3%80%E7%B4%A2%E5%88%B0%E7%AD%94%E6%A1%88.gif&quot;,\n            },\n            {&quot;type&quot;: &quot;text&quot;, &quot;text&quot;: &quot;\u63cf\u8ff0\u4e00\u4e0b\u8fd9\u5f20\u56fe\u7247.&quot;},\n        ],\n    }\n]<\/code><\/pre>\n<p>\u539f\u59cb\u52a8\u56fe\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/%E6%A3%80%E7%B4%A2%E5%88%B0%E7%AD%94%E6%A1%88.gif\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/%E6%A3%80%E7%B4%A2%E5%88%B0%E7%AD%94%E6%A1%88.gif\" alt=\"\" \/><\/a><br \/>\n\u8bc6\u522b\u7ed3\u679c\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u52a8\u56fe\u8bc6\u522b\u7ed3\u679c.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u52a8\u56fe\u8bc6\u522b\u7ed3\u679c.png\" alt=\"\" \/><\/a><\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E8%AF%86%E5%88%AB%E8%A7%86%E9%A2%91\"><\/span>\u8bc6\u522b\u89c6\u9891<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p><strong>\u9996\u5148<\/strong>\uff0c\u6211\u4eec\u4e0b\u8f7d\u4e00\u6bb5.mp4\u89c6\u9891\u5230\u672c\u5730\uff0c\u4e0b\u8f7d\u7684\u89c6\u9891\u5730\u5740\u4e3a<a 
href=\"https:\/\/haokan.baidu.com\/v?pd=wisenatural&amp;vid=7617265203486639345\">\u597d\u770b\u89c6\u9891<\/a><\/p>\n<blockquote>\n<p>\u5907\u6ce8\uff1a\u6211\u4ee5\u524d\u66fe\u7ecf\u505a\u8fc7\u4e00\u4e2a\u9879\u76ee\uff0c\u901a\u8fc7\u89c6\u9891\u7684\u5e27\u6570\u6765\u5ea6\u91cf\u8f6f\u4ef6\u7684\u542f\u52a8\u901f\u5ea6\uff0c\u6211\u4eec\u770b\u770b\u5927\u6a21\u578b\u662f\u5426\u53ef\u4ee5\u5f88\u5bb9\u6613\u5730\u7ed9\u51fa\u7ed3\u679c\u3002<\/p>\n<\/blockquote>\n<p><strong>\u5176\u6b21<\/strong>\uff0c\u6211\u4eec\u5c06\u89c6\u9891\u4e0a\u4f20\u5230\u670d\u52a1\u5668\u4e0a\u3002<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u89c6\u9891\u4e0a\u4f20\u5230\u670d\u52a1\u5668.png\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u89c6\u9891\u4e0a\u4f20\u5230\u670d\u52a1\u5668.png\" alt=\"\" \/><\/a><\/p>\n<p><strong>\u7136\u540e<\/strong>\uff0c\u4fee\u6539\u6d88\u606f\u5185\u5bb9\u5982\u4e0b\uff1a<\/p>\n<pre><code class=\"language-python\">\nmessages = [\n    {\n        &quot;role&quot;: &quot;user&quot;,\n        &quot;content&quot;: [\n            {\n                &quot;type&quot;: &quot;video&quot;,\n                &quot;video&quot;: &quot;file:\/\/start_speed.mp4&quot;,\n                &quot;max_pixels&quot;: 360 * 420,\n                &quot;fps&quot;: 1.0,\n            },\n            {&quot;type&quot;: &quot;text&quot;, &quot;text&quot;: &quot;\u8bf7\u63cf\u8ff0\u8fd9\u6bb5\u89c6\u9891\uff0c\u540c\u65f6\u8ba1\u7b97\u4e24\u4e2a\u624b\u673a\u5404\u81ea\u4ece\u542f\u52a8\u5230\u663e\u793a\u5404\u81ea\u7684\u5e27\u6570\u5e76\u8f93\u51fa\u7ed3\u679c.&quot;},\n        ],\n    }\n]<\/code><\/pre>\n<p>\u5176\u4ed6\u90e8\u5206\u4ee3\u7801\u4fdd\u6301\u4e0d\u53d8\u540e\u8fd0\u884c\uff0c\u8fd0\u884c\u7ed3\u679c\u5982\u4e0b\uff1a<br \/>\n<a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u89c6\u9891\u8bc6\u522b\u7ed3\u679c.png\" 
data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u89c6\u9891\u8bc6\u522b\u7ed3\u679c.png\" alt=\"\" \/><\/a><\/p>\n<p>\u53ef\u4ee5\u770b\u5230\uff0cQwen2-VL\u53ef\u4ee5\u8bc6\u522b\u51fa\u89c6\u9891\u4e2d\u7684\u5185\u5bb9\uff0c\u867d\u7136\u6ca1\u6709\u7ed9\u51fa\u5404\u81ea\u7684\u5e27\u6570\uff0c\u4f46\u662f\u53ef\u4ee5\u8bc6\u522b\u51fa\u4e24\u4e2a\u624b\u673a\u7684\u54c1\u724c\u5e76\u4e14\u7ed9\u51fa\u54ea\u4e2a\u66f4\u5feb\u3002<\/p>\n<h2><span class=\"ez-toc-section\" id=\"%E5%86%85%E5%AE%B9%E5%B0%8F%E7%BB%93\"><\/span>\u5185\u5bb9\u5c0f\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<ul>\n<li>Qwen2-VL\u4e3a\u4e86\u589e\u5f3a\u6a21\u578b\u80fd\u529b\uff0c\u4e3b\u8981\u8fdb\u884c\u4e863\u4e2a\u6539\u8fdb\uff1a\n<ul>\n<li><strong>\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387<\/strong>\uff1a\u8be5\u529f\u80fd\u5141\u8bb8\u6a21\u578b\u5904\u7406\u4efb\u610f\u5206\u8fa8\u7387\u7684\u56fe\u50cf\uff0c\u800c\u4e0d\u9700\u8981\u8c03\u6574\u6a21\u578b\u7ed3\u6784\u3002<\/li>\n<li><strong>\u591a\u6a21\u6001\u65cb\u8f6c\u4f4d\u7f6e\u5d4c\u5165<\/strong>\uff1a\u8be5\u529f\u80fd\u901a\u8fc7\u65f6\u95f4\u3001\u9ad8\u5ea6\u3001\u5bbd\u5ea6\u4e09\u4e2a\u7ef4\u5ea6\u6765\u5bf9\u4f4d\u7f6e\u8fdb\u884cembedding\uff0c\u4ece\u800c\u5efa\u6a21\u4e86\u591a\u6a21\u6001\u8f93\u5165\u7684\u4f4d\u7f6e\u4fe1\u606f\u3002<\/li>\n<li><strong>\u7edf\u4e00\u56fe\u50cf\u548c\u89c6\u9891\u7684\u7406\u89e3<\/strong>\uff1a\u901a\u8fc7\u6df7\u5408\u8bad\u7ec3\u65b9\u6cd5\u7684\u65b9\u5f0f\uff0c\u7ed3\u5408\u56fe\u50cf\u548c\u89c6\u9891\u6570\u636e\uff0c\u786e\u4fdd\u5728\u56fe\u50cf\u7406\u89e3\u548c\u89c6\u9891\u7406\u89e3\u65b9\u9762\u5177\u6709\u4e13\u4e1a\u6c34\u5e73\u3002<\/li>\n<\/ul>\n<\/li>\n<li>Qwen2-VL\u7684\u6a21\u578b\u7ed3\u6784\u4e3b\u8981\u7531 <strong>\u89c6\u89c9\u7f16\u7801\u5668<\/strong> \u548c <strong>\u8bed\u8a00\u6a21\u578b<\/strong> 
\u4e24\u90e8\u5206\u7ec4\u6210\u3002<\/li>\n<li>Qwen2-VL\u53ef\u4ee5\u4f7f\u7528flashAttention\u8fdb\u884c\u52a0\u901f\uff0c\u4f7f\u7528\u65f6\u9700\u8981\u68c0\u67e5CUDA\u3001torch\u7248\u672c\u7b49\u3002<\/li>\n<li>Qwen2-VL\u9664\u4e86\u53ef\u4ee5\u8bc6\u522b\u56fe\u7247\u4e4b\u5916\uff0c\u4e5f\u53ef\u4ee5\u8bc6\u522bGif\u52a8\u56fe\u548c\u89c6\u9891\uff0c\u5176\u80fd\u529b\u975e\u5e38\u5f3a\u5927\u3002<\/li>\n<\/ul>\n<h2><span class=\"ez-toc-section\" id=\"%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99\"><\/span>\u53c2\u8003\u8d44\u6599<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/720996637\">\u77e5\u4e4e\uff1a\u3010\u7cbe\u8bfb\u3011Qwen2-VL: Enhancing Vision-Language Model&#8217;s Perception of the World at Any Resolution<\/a><\/p>\n<p align=\"center\">\u6b22\u8fce\u5173\u6ce8\u516c\u4f17\u53f7\u4ee5\u83b7\u5f97\u6700\u65b0\u7684\u6587\u7ae0\u548c\u65b0\u95fb<\/p>\n<p><a href=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/\u626b\u7801_\u641c\u7d22\u8054\u5408\u4f20\u64ad\u6837\u5f0f-\u767d\u8272\u7248.bmp\" data-fancybox=\"images\" data-fancybox=\"gallery\"><img decoding=\"async\" src=\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/09\/\u626b\u7801_\u641c\u7d22\u8054\u5408\u4f20\u64ad\u6837\u5f0f-\u767d\u8272\u7248.bmp\" alt=\"\" \/><\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u524d\u8a00 \u5728\u4e0a\u4e00\u7ae0\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day31\uff1a\u591a 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":33270,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"aside","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"default","adv-header-id-meta":"","stick-header-meta":"default","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"categories":[28],"tags":[73,68],"class_list":["post-33269","post","type-post","status-publish","format-aside","has-post-thumbnail","hentry","category-blog","tag-73","tag-68","post_format-post-format-aside"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v26.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3 - \u4e00\u8d77AI\u6280\u672f<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/17aitech.com\/?p=33269\" \/>\n<script type=\"application\/ld+json\" 
class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/17aitech.com\/?p=33269\",\"url\":\"https:\/\/17aitech.com\/?p=33269\",\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3 - \u4e00\u8d77AI\u6280\u672f\",\"isPartOf\":{\"@id\":\"https:\/\/17aitech.com\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/17aitech.com\/?p=33269#primaryimage\"},\"image\":{\"@id\":\"https:\/\/17aitech.com\/?p=33269#primaryimage\"},\"thumbnailUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png\",\"datePublished\":\"2024-10-09T08:18:54+00:00\",\"dateModified\":\"2024-10-09T08:20:48+00:00\",\"author\":{\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\"},\"breadcrumb\":{\"@id\":\"https:\/\/17aitech.com\/?p=33269#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/17aitech.com\/?p=33269\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/?p=33269#primaryimage\",\"url\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png\",\"contentUrl\":\"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png\",\"width\":1286,\"height\":928},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/17aitech.com\/?p=33269#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/17aitech.com\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/17aitech.com\/#websi
te\",\"url\":\"https:\/\/17aitech.com\/\",\"name\":\"\u4e00\u8d77AI\u6280\u672f\",\"description\":\"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca\",\"alternateName\":\"\u4e00\u8d77AI\u6280\u672f\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/17aitech.com\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739\",\"name\":\"Dongming\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/17aitech.com\/#\/schema\/person\/image\/\",\"url\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"contentUrl\":\"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg\",\"caption\":\"Dongming\"},\"description\":\"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002\",\"sameAs\":[\"http:\/\/17aitech.com\"],\"url\":\"https:\/\/17aitech.com\/?page_id=33738&user=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3 - \u4e00\u8d77AI\u6280\u672f","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/17aitech.com\/?p=33269","schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/17aitech.com\/?p=33269","url":"https:\/\/17aitech.com\/?p=33269","name":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3 - \u4e00\u8d77AI\u6280\u672f","isPartOf":{"@id":"https:\/\/17aitech.com\/#website"},"primaryImageOfPage":{"@id":"https:\/\/17aitech.com\/?p=33269#primaryimage"},"image":{"@id":"https:\/\/17aitech.com\/?p=33269#primaryimage"},"thumbnailUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png","datePublished":"2024-10-09T08:18:54+00:00","dateModified":"2024-10-09T08:20:48+00:00","author":{"@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739"},"breadcrumb":{"@id":"https:\/\/17aitech.com\/?p=33269#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/17aitech.com\/?p=33269"]}]},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/?p=33269#primaryimage","url":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png","contentUrl":"https:\/\/17aitech.com\/wp-content\/uploads\/2024\/10\/\u539f\u59cb\u52a8\u6001\u5206\u8fa8\u7387\u6a21\u578b\u7ed3\u6784.png","width":1286,"height":928},{"@type":"BreadcrumbList","@id":"https:\/\/17aitech.com\/?p=33269#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/17aite
ch.com\/"},{"@type":"ListItem","position":2,"name":"\u3010\u8bfe\u7a0b\u603b\u7ed3\u3011day32(\u4e0a)\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578bQwen2\u7684\u6df1\u5165\u4e86\u89e3"}]},{"@type":"WebSite","@id":"https:\/\/17aitech.com\/#website","url":"https:\/\/17aitech.com\/","name":"\u4e00\u8d77AI\u6280\u672f","description":"\u8ba9AI\u77e5\u8bc6\u89e6\u624b\u53ef\u53ca","alternateName":"\u4e00\u8d77AI\u6280\u672f","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/17aitech.com\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/17aitech.com\/#\/schema\/person\/3d23bb6f7f115fcefc9ae7803a691739","name":"Dongming","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/17aitech.com\/#\/schema\/person\/image\/","url":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","contentUrl":"\/\/17aitech.com\/wp-content\/uploads\/member\/avatars\/238a0b923820dcc5.1732798681.jpg","caption":"Dongming"},"description":"\u89c1\u5929\u5730\uff0c\u89c1\u4f17\u751f\uff0c\u89c1\u81ea\u5df1\u3002","sameAs":["http:\/\/17aitech.com"],"url":"https:\/\/17aitech.com\/?page_id=33738&user=1"}]}},"_links":{"self":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33269","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=33269"}],"version-history":[{"count":2,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33269\/revisions"}],"predecessor-version":[{"i
d":33280,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/posts\/33269\/revisions\/33280"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=\/wp\/v2\/media\/33270"}],"wp:attachment":[{"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=33269"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=33269"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/17aitech.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=33269"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}