From fa305158f57452d5e2367b8e60d1d159e3bbe881 Mon Sep 17 00:00:00 2001
From: Damen Knight <damen@knightspeed.com>
Date: Mon, 29 Jun 2026 12:12:58 -0700
Subject: [PATCH] qwen35moe : fix MTP graph abort with sliding-window attention

The qwen35moe MTP (nextn) sub-graph builds its attention input with the
non-iSWA build_attn_inp_kv(), which asserts hparams.swa_type == NONE. The
main graph already routes through build_inp_mem_hybrid_iswa() /
build_attn_inp_kv_iswa() when swa_type is set, but graph_mtp does not, so
running this architecture with sliding-window attention *and* MTP /
speculative decoding (--spec-type draft-mtp) aborts at graph build:

  GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE
              && "Use llama_kv_cache_iswa for SWA") failed
  (src/llama-graph.cpp, via llama_model_qwen35moe::build_arch_graph)

Route graph_mtp through build_attn_inp_kv_iswa() when swa_type != NONE,
mirroring the main graph. The MTP layer index (n_layer()) is not marked
SWA, so it routes to the full attention sub-cache. Relates to #23322
(SWA + MTP on Qwen3.6).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/models/qwen35moe.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 7b0876cbb..14590018f 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -601,7 +601,13 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    auto * inp_attn = build_attn_inp_kv();
+    // MTP + sliding-window attention: when the model runs on the iSWA hybrid cache (swa_type != NONE),
+    // the MTP sub-graph must build its attention input via the iSWA builder, exactly as the main graph
+    // does; the non-iSWA build_attn_inp_kv() asserts swa_type == NONE and would abort at graph build. The
+    // MTP layer index (n_layer()) is not marked SWA, so it routes to the full sub-cache (attends to the full KV).
+    const bool mtp_use_iswa = hparams.swa_type != LLAMA_SWA_TYPE_NONE;
+    llm_graph_input_attn_kv      * inp_attn      = mtp_use_iswa ? nullptr : build_attn_inp_kv();
+    llm_graph_input_attn_kv_iswa * inp_attn_iswa = mtp_use_iswa ? build_attn_inp_kv_iswa() : nullptr;
 
     ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -658,9 +664,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     const float kq_scale = hparams.f_attention_scale == 0.0f
             ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
-    cur = build_attn(inp_attn,
-            nullptr, nullptr, nullptr,
-            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cur = mtp_use_iswa
+        ? build_attn(inp_attn_iswa, nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il)
+        : build_attn(inp_attn,      nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
     cb(cur, "mtp_attn_pregate", il);
 
     cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
-- 
2.47.3