alibaba · jxt1234 · Jun 18, 2026 · Jun 18, 2026
diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md
@@ -179,6 +179,39 @@ python llmexport.py \
   cmake .. -DMNN_BUILD_CONVERTER=ON && make -j16
   ```
   编译完成后 `build/` 目录下会生成 `MNNConvert` 可执行文件，`llmexport.py` 默认会在 `../../../build/` 下查找该工具；也可以通过 `--mnnconvert` 选项显式指定 MNNConvert 路径。若未提供本地 MNNConvert，脚本会回退到 pymnn（需先安装 `pip install MNN`）。此方案目前支持导出4bit和8bit模型。
+- 导出 segment 形式的 MNN LLM，使用 `--export mnn --segment`。该模式从 safetensors 权重和 workflow JSON 直接生成多个 MNN 子图，跳过 ONNX 中间文件，适合在 Metal 等后端上复用 decoder、logit、embedding 等 segment 模型。默认会在 `resource/*.json` 中查找匹配的 workflow，也可以通过 `--workflow /path/to/workflow.json` 显式指定。
+
+  ```bash
+  cd transformers/llm/export
+  python3 llmexport.py \
+      --path /path/to/Qwen3-0.6B \
+      --export mnn \
+      --segment \
+      --dst_path ./model
+  ```
+
+  segment 导出目录包含：
+
+  ```text
+  model/
+  ├── config.json              # llm_demo 入口配置，包含 "mnn_llm_version": "segment"
+  ├── llm_config.json          # 模型结构和模板配置
+  ├── tokenizer.mtok
+  ├── embed.mnn
+  ├── decoder.mnn
+  ├── decoder.mnn.weight
+  ├── logit.mnn
+  ├── logit.mnn.weight
+  └── logit_topkv_1.mnn
+  ```
+
+  运行 segment 模型时需要使用生成的 `config.json`：
+
+  ```bash
+  ./llm_demo transformers/llm/export/model/config.json /path/to/prompt.txt
+  ```
+
+  C++ 运行时需启用 `MNN_BUILD_LLM=ON`，并打开 `MNN_LLM_SUPPORT_SEGMENT`（默认开启）。segment 导出模式当前仅支持 `--export mnn`，不支持 `--export onnx`。
 - 如果直接转为mnn模型遇到问题，或者需要其他bits数的量化（如5bit/6bit），可以先将模型先转为onnx模型，使用`--export onnx`，然后使用./MNNConvert工具将onnx模型转为mnn模型:
 
 ```
@@ -197,7 +230,7 @@ usage: llmexport.py [-h] --path PATH [--type TYPE] [--tokenizer_path TOKENIZER_P
                     [--gptq_path GPTQ_PATH] [--dst_path DST_PATH] [--verbose] [--test TEST] [--export EXPORT]
                     [--onnx_slim] [--quant_bit QUANT_BIT] [--quant_block QUANT_BLOCK]
                     [--lm_quant_bit LM_QUANT_BIT] [--mnnconvert MNNCONVERT] [--ppl] [--awq] [--omni] [--sym] [--seperate_embed]
-                    [--lora_split]
+                    [--lora_split] [--segment] [--workflow WORKFLOW]
 
 llm_exporter
 
@@ -219,6 +252,8 @@ optional arguments:
   --verbose             Whether or not to print verbose.
   --test TEST           test model inference with query `TEST`.
   --export EXPORT       export model to an onnx/mnn model.
+  --segment             export segment MNN LLM from safetensors workflow directly, without ONNX export.
+  --workflow WORKFLOW   workflow json for --segment safetensors conversion. If absent, search resource/*.json.
   --onnx_slim           Whether or not to use onnx-slim.
   --quant_bit QUANT_BIT
                         mnn quant bit, 4 or 8, default is 4.
@@ -1158,4 +1193,4 @@ adb push model /data/local/tmp/MNN/model
 ```
 cd ${MNN_ROOT}
 project/android/testCommon.sh ./llm_demo model/config_mlda.json
-```
+```
diff --git a/express/MathOp.cpp b/express/MathOp.cpp
@@ -584,6 +584,10 @@ VARP _Multiply(VARP x, VARP y) {
     return _Binary(x, y, BinaryOpOperation_MUL);
 }
 
+VARP _MulSilu(VARP x, VARP y) { // Returns the binary expression of x and y built with BinaryOpOperation_MUL_SILU.
+    return _Binary(x, y, BinaryOpOperation_MUL_SILU); // NOTE(review): which operand SiLU is applied to is not visible here - confirm before relying on argument order.
+}
+
 /*Computes Python style division of x by y.
 Args:
 x: A variable. Must be one of the following types:

diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp
diff --git a/include/MNN/expr/MathOp.hpp b/include/MNN/expr/MathOp.hpp
@@ -15,6 +15,7 @@ namespace Express {
 MNN_PUBLIC VARP _Add(VARP x, VARP y);
 MNN_PUBLIC VARP _Subtract(VARP x, VARP y);
 MNN_PUBLIC VARP _Multiply(VARP x, VARP y);
+MNN_PUBLIC VARP _MulSilu(VARP x, VARP y);
 MNN_PUBLIC VARP _Divide(VARP x, VARP y);
 MNN_PUBLIC VARP _Pow(VARP x, VARP y);
 MNN_PUBLIC VARP _Minimum(VARP x, VARP y);

diff --git a/resource/qwen3_hf_0.6b.json b/resource/qwen3_hf_0.6b.json
@@ -0,0 +1,34 @@
+{
+  "models": [
+    {
+      "name": "hf_decoder",
+      "blocks": [
+        {
+          "type": "QwenTransformer",
+          "hiddenSize": 1024,
+          "headDim": 128,
+          "numHead": 16,
+          "kvNumHead": 8,
+          "number": 28,
+          "max_position_embeddings": 40960
+        }
+      ]
+    },
+    {
+      "name": "logit",
+      "blocks": [
+        {
+          "type": "InnerProduct",
+          "prefix": "lm_head"
+        },
+        {
+          "type": "TieEmbedding"
+        },
+        {
+          "type": "TopKV",
+          "K": [1, 5]
+        }
+      ]
+    }
+  ]
+}
diff --git a/schema/current/MNN_generated.h b/schema/current/MNN_generated.h
@@ -289,6 +289,7 @@ enum OpType {
   OpType_SplitGeLU = 303,
   OpType_GroupNorm = 304,
   OpType_LinearAttention = 305,
+  OpType_RoPE = 306,
   OpType_Extra = 512,
   OpType_ConvInt8 = 513,
   OpType_Int8ToFloat = 514,
@@ -303,7 +304,7 @@ enum OpType {
   OpType_MAX = OpType_GridSample
 };
 
-inline const OpType (&EnumValuesOpType())[183] {
+inline const OpType (&EnumValuesOpType())[184] {
   static const OpType values[] = {
     OpType_AbsVal,
     OpType_QuantizedAdd,
@@ -478,6 +479,7 @@ inline const OpType (&EnumValuesOpType())[183] {
     OpType_SplitGeLU,
     OpType_GroupNorm,
     OpType_LinearAttention,
+    OpType_RoPE,
     OpType_Extra,
     OpType_ConvInt8,
     OpType_Int8ToFloat,
@@ -800,7 +802,7 @@ inline const char * const *EnumNamesOpType() {
     "SplitGeLU",
     "GroupNorm",
     "LinearAttention",
-    "",
+    "RoPE",
     "",
     "",
     "",
@@ -2999,10 +3001,15 @@ struct AttentionParamT : public flatbuffers::NativeTable {
   std::string kv_shared_layer;
   int32_t layer_index;
   int32_t kv_shared_layer_index;
+  std::vector<std::unique_ptr<TensorQuantInfoT>> mhq_quant;
+  bool output_c4;
+  float attnScale;
   AttentionParamT()
       : kv_cache(true),
         layer_index(-1),
-        kv_shared_layer_index(-1) {
+        kv_shared_layer_index(-1),
+        output_c4(false),
+        attnScale(0.0f) {
   }
 };
 
@@ -3023,13 +3030,27 @@ struct AttentionParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   int32_t kv_shared_layer_index() const {
     return GetField<int32_t>(10, -1);
   }
+  const flatbuffers::Vector<flatbuffers::Offset<TensorQuantInfo>> *mhq_quant() const {
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<TensorQuantInfo>> *>(12);
+  }
+  bool output_c4() const {
+    return GetField<uint8_t>(14, 0) != 0;
+  }
+  float attnScale() const {
+    return GetField<float>(16, 0.0f);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<uint8_t>(verifier, 4) &&
            VerifyOffset(verifier, 6) &&
            verifier.VerifyString(kv_shared_layer()) &&
            VerifyField<int32_t>(verifier, 8) &&
            VerifyField<int32_t>(verifier, 10) &&
+           VerifyOffset(verifier, 12) &&
+           verifier.VerifyVector(mhq_quant()) &&
+           verifier.VerifyVectorOfTables(mhq_quant()) &&
+           VerifyField<uint8_t>(verifier, 14) &&
+           VerifyField<float>(verifier, 16) &&
            verifier.EndTable();
   }
   AttentionParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3052,6 +3073,15 @@ struct AttentionParamBuilder {
   void add_kv_shared_layer_index(int32_t kv_shared_layer_index) {
     fbb_.AddElement<int32_t>(10, kv_shared_layer_index, -1);
   }
+  void add_mhq_quant(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<TensorQuantInfo>>> mhq_quant) {
+    fbb_.AddOffset(12, mhq_quant);
+  }
+  void add_output_c4(bool output_c4) {
+    fbb_.AddElement<uint8_t>(14, static_cast<uint8_t>(output_c4), 0);
+  }
+  void add_attnScale(float attnScale) {
+    fbb_.AddElement<float>(16, attnScale, 0.0f);
+  }
   explicit AttentionParamBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3069,11 +3099,17 @@ inline flatbuffers::Offset<AttentionParam> CreateAttentionParam(
     bool kv_cache = true,
     flatbuffers::Offset<flatbuffers::String> kv_shared_layer = 0,
     int32_t layer_index = -1,
-    int32_t kv_shared_layer_index = -1) {
+    int32_t kv_shared_layer_index = -1,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<TensorQuantInfo>>> mhq_quant = 0,
+    bool output_c4 = false,
+    float attnScale = 0.0f) {
   AttentionParamBuilder builder_(_fbb);
+  builder_.add_attnScale(attnScale);
+  builder_.add_mhq_quant(mhq_quant);
   builder_.add_kv_shared_layer_index(kv_shared_layer_index);
   builder_.add_layer_index(layer_index);
   builder_.add_kv_shared_layer(kv_shared_layer);
+  builder_.add_output_c4(output_c4);
   builder_.add_kv_cache(kv_cache);
   return builder_.Finish();
 }
@@ -5499,6 +5535,9 @@ inline void AttentionParam::UnPackTo(AttentionParamT *_o, const flatbuffers::res
   { auto _e = kv_shared_layer(); if (_e) _o->kv_shared_layer = _e->str(); };
   { auto _e = layer_index(); _o->layer_index = _e; };
   { auto _e = kv_shared_layer_index(); _o->kv_shared_layer_index = _e; };
+  { auto _e = mhq_quant(); if (_e) { _o->mhq_quant.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->mhq_quant[_i] = std::unique_ptr<TensorQuantInfoT>(_e->Get(_i)->UnPack(_resolver)); } } };
+  { auto _e = output_c4(); _o->output_c4 = _e; };
+  { auto _e = attnScale(); _o->attnScale = _e; };
 }
 
 inline flatbuffers::Offset<AttentionParam> AttentionParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AttentionParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -5513,12 +5552,18 @@ inline flatbuffers::Offset<AttentionParam> CreateAttentionParam(flatbuffers::Fla
   auto _kv_shared_layer = _o->kv_shared_layer.empty() ? 0 : _fbb.CreateString(_o->kv_shared_layer);
   auto _layer_index = _o->layer_index;
   auto _kv_shared_layer_index = _o->kv_shared_layer_index;
+  auto _mhq_quant = _o->mhq_quant.size() ? _fbb.CreateVector<flatbuffers::Offset<TensorQuantInfo>> (_o->mhq_quant.size(), [](size_t i, _VectorArgs *__va) { return CreateTensorQuantInfo(*__va->__fbb, __va->__o->mhq_quant[i].get(), __va->__rehasher); }, &_va ) : 0;
+  auto _output_c4 = _o->output_c4;
+  auto _attnScale = _o->attnScale;
   return MNN::CreateAttentionParam(
       _fbb,
       _kv_cache,
       _kv_shared_layer,
       _layer_index,
-      _kv_shared_layer_index);
+      _kv_shared_layer_index,
+      _mhq_quant,
+      _output_c4,
+      _attnScale);
 }
 
 inline LinearAttentionParamT *LinearAttentionParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -7768,7 +7813,7 @@ inline OpParameterUnion::OpParameterUnion(const OpParameterUnion &u) FLATBUFFERS
       break;
     }
     case OpParameter_AttentionParam: {
-      value = new AttentionParamT(*reinterpret_cast<AttentionParamT *>(u.value));
+      FLATBUFFERS_ASSERT(false);  // AttentionParamT not copyable.
       break;
     }
     case OpParameter_StftParam: {
@@ -8485,12 +8530,13 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
     { flatbuffers::ET_INT, 0, 0 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
     OpTypeTypeTable
   };
-  static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 299, 300, 301, 302, 303, 304, 305, 512, 513, 514, 515, 517, 518, 600, 601, 603, 604 };
+  static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 299, 300, 301, 302, 303, 304, 305, 306, 512, 513, 514, 515, 517, 518, 600, 601, 603, 604 };
   static const char * const names[] = {
     "AbsVal",
     "QuantizedAdd",
@@ -8665,6 +8711,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     "SplitGeLU",
     "GroupNorm",
     "LinearAttention",
+    "RoPE",
     "Extra",
     "ConvInt8",
     "Int8ToFloat",
@@ -8677,7 +8724,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
     "GridSample"
   };
   static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_ENUM, 183, type_codes, type_refs, values, names
+    flatbuffers::ST_ENUM, 184, type_codes, type_refs, values, names
   };
   return &tt;
 }
@@ -9110,16 +9157,25 @@ inline const flatbuffers::TypeTable *AttentionParamTypeTable() {
     { flatbuffers::ET_BOOL, 0, -1 },
     { flatbuffers::ET_STRING, 0, -1 },
     { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_INT, 0, -1 }
+    { flatbuffers::ET_INT, 0, -1 },
+    { flatbuffers::ET_SEQUENCE, 1, 0 },
+    { flatbuffers::ET_BOOL, 0, -1 },
+    { flatbuffers::ET_FLOAT, 0, -1 }
+  };
+  static const flatbuffers::TypeFunction type_refs[] = {
+    TensorQuantInfoTypeTable
   };
   static const char * const names[] = {
     "kv_cache",
     "kv_shared_layer",
     "layer_index",
-    "kv_shared_layer_index"
+    "kv_shared_layer_index",
+    "mhq_quant",
+    "output_c4",
+    "attnScale"
   };
   static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 4, type_codes, nullptr, nullptr, names
+    flatbuffers::ST_TABLE, 7, type_codes, type_refs, nullptr, names
   };
   return &tt;
 }