From 4b72bb9d70e8a2d6351c4ab6b27dfdbf2d4461d1 Mon Sep 17 00:00:00 2001
From: Damen Knight <damen@knightspeed.com>
Date: Mon, 29 Jun 2026 11:39:19 -0700
Subject: [PATCH] 395aimax fork: re-port patches onto upstream master + fix
 MTP+SWA

Modernized the fork onto current upstream (6f4f53f2b) and re-ported the full
395aimax patch set through the API migration (swa_layers->is_swa_impl,
n_layer field->n_layer() excluding nextn, recurrent_layer_arr->is_recr,
nextn_predict_layers->n_layer_nextn):

- SWA recipe: apply_dynsparse_swa() marks the hybrid full-attn layers as iSWA
  (--dynsparse-swa W / --dynsparse-swa-full / --dynsparse-swa-keepfull, env
  fallback) for qwen35moe, qwen3next, granite-hybrid; iSWA hybrid-cache routing.
- KV eviction: H2O/SnapKV obs scorer (eval-callback) + kv-evict tool + C-API.
- CLI flags in arg.cpp/common/llama.h.

Fix: route graph_mtp through build_attn_inp_kv_iswa() when swa_type!=NONE.
The MTP sub-graph used the non-iswa build_attn_inp_kv() which asserts
swa_type==NONE, so MTP (--spec-type draft-mtp) + the SWA recipe aborted at
load (llama-graph.cpp:2704). MTP layer routes to the full sub-cache. MTP+SWA
now runs and is synergistic at depth (draft acceptance 0.80->0.96 @123k,
decode +46% vs dense baseline). Obsolete patches dropped: the old
LLAMA_SPARSE_* mask hack (superseded by the SWA recipe) and the prior
graph_mtp routing (upstream refactored MTP via #23643).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 common/arg.cpp                           |  46 ++++
 common/common.cpp                        |   3 +
 common/common.h                          |   4 +
 examples/CMakeLists.txt                  |   1 +
 examples/eval-callback/eval-callback.cpp | 294 ++++++++++++++++++----
 examples/kv-evict/CMakeLists.txt         |   5 +
 examples/kv-evict/amass_faon.h           | 137 +++++++++++
 examples/kv-evict/kv-evict.cpp           | 297 +++++++++++++++++++++++
 ggml/src/ggml-vulkan/ggml-vulkan.cpp     |   7 +-
 include/llama.h                          |  15 ++
 src/llama-context.cpp                    |  37 +++
 src/llama-graph.cpp                      | 119 ++++++++-
 src/llama-graph.h                        |   3 +-
 src/llama-hparams.h                      |   5 +
 src/llama-kv-cache.cpp                   |  94 ++++++-
 src/llama-kv-cache.h                     |   6 +
 src/llama-model.cpp                      |   3 +
 src/models/granite-hybrid.cpp            |  13 +-
 src/models/models.h                      |  54 ++++-
 src/models/qwen35moe.cpp                 |  47 +++-
 src/models/qwen3next.cpp                 |  12 +-
 21 files changed, 1125 insertions(+), 77 deletions(-)
 create mode 100644 examples/kv-evict/CMakeLists.txt
 create mode 100644 examples/kv-evict/amass_faon.h
 create mode 100644 examples/kv-evict/kv-evict.cpp

diff --git a/common/arg.cpp b/common/arg.cpp
index 0fc94e553..267259dc2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -10,6 +10,8 @@
 #include "speculative.h"
 #include "preset.h"
 
+#include <cstdlib> // setenv, for sparse-attn POC flags
+
 // fix problem with std::min and std::max
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -1435,6 +1437,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--sparse-attn-sink"}, "N",
+        "sparse-attn prefill POC: keep first N KV positions (attention sink); needs --sparse-attn-window (default: 0)",
+        [](common_params & params, int value) {
+            (void) params; setenv("LLAMA_SPARSE_SINK", std::to_string(value).c_str(), 1);
+        }
+    ));
+    add_opt(common_arg(
+        {"--sparse-attn-window"}, "N",
+        "sparse-attn prefill POC: keep last N KV positions per query (local window); >=0 enables, drops the middle (fast but lossy at depth; full-attn layers only)",
+        [](common_params & params, int value) {
+            (void) params; setenv("LLAMA_SPARSE_WINDOW", std::to_string(value).c_str(), 1);
+        }
+    ));
+    add_opt(common_arg(
+        {"--sparse-attn-stride"}, "N",
+        "sparse-attn prefill POC: also keep every Nth 256-token KV block globally (dilated long-range coverage; 0=off)",
+        [](common_params & params, int value) {
+            (void) params; setenv("LLAMA_SPARSE_STRIDE", std::to_string(value).c_str(), 1);
+        }
+    ));
     add_opt(common_arg(
         {"--swa-full"},
         string_format("use full-size SWA cache (default: %s)\n"
@@ -2494,6 +2517,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"--dynsparse-swa"}, "N",
+        "hybrid sliding-window attention: window the attention layers of an attention+SSM hybrid (Qwen3.6, "
+        "Qwen3-Next, Granite-H) to W=N tokens (0=off). Bounds attention KV + flattens prefill at long context. "
+        "Pair with --dynsparse-swa-keepfull / --dynsparse-swa-full to backstop far recall.",
+        [](common_params & params, const std::string & value) {
+            params.dynsparse_swa = std::stoi(value);
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA"));
+    add_opt(common_arg(
+        {"--dynsparse-swa-keepfull"}, "K",
+        "with --dynsparse-swa: keep the last K attention layers FULL (un-windowed) as a far-recall backstop (default: 0)",
+        [](common_params & params, const std::string & value) {
+            params.dynsparse_swa_keepfull = std::stoi(value);
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA_KEEPFULL"));
+    add_opt(common_arg(
+        {"--dynsparse-swa-full"}, "i,j,k",
+        "with --dynsparse-swa: keep an explicit comma-separated set of attention-layer indices FULL (e.g. 27,31,35,39)",
+        [](common_params & params, const std::string & value) {
+            params.dynsparse_swa_full = value;
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA_FULL"));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/common/common.cpp b/common/common.cpp
index 0dd9ede5e..93cc3c628 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1544,6 +1544,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.dynsparse_swa          = params.dynsparse_swa;
+    mparams.dynsparse_swa_keepfull = params.dynsparse_swa_keepfull;
+    mparams.dynsparse_swa_full     = params.dynsparse_swa_full.empty() ? nullptr : params.dynsparse_swa_full.c_str();
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 2adb310b8..3d14fc00c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -476,6 +476,10 @@ struct common_params {
     bool    fit_params_print   = false; // print the estimated required memory to run the model
     int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
 
+    int32_t     dynsparse_swa          = 0;  // hybrid attention-layer sliding-window size W (0 = off)
+    int32_t     dynsparse_swa_keepfull = 0;  // keep last-K attention layers full (far-recall backstop)
+    std::string dynsparse_swa_full;          // explicit comma-separated attn-layer indices to keep full
+
     // margin per device in bytes for fitting parameters to free memory:
     std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 39f802d25..b34eeac47 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -18,6 +18,7 @@ else()
     add_subdirectory(debug)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
+    add_subdirectory(kv-evict)
 
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf)
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 4ce8d600b..939dad7ed 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,88 +1,280 @@
+// Obs-attention scorer for content-aware KV eviction (SnapKV-style obs window). Two modes:
+//
+//  (default, FA-OFF) two-phase post-softmax capture, memory-feasible at long context:
+//     phase 1: prefill [0, n-W) FA-off, NO capture (softmax on GPU, never copied to host)
+//     phase 2: process the last W=OBS queries FA-off WITH capture -> obs softmax [n_kv x W x heads]
+//     Slow at depth (FA-off brute prefill, ~20-40 min @128k+). The science/ground-truth path.
+//
+//  (AMASS_FAON set, run with -fa on) FAST production scorer. Run a NORMAL fast FA-on prefill and capture the
+//     post-RoPE Qcur/Kcur tensors (named "Qcur-<il>"/"Kcur-<il>" in qwen35moe.cpp, emitted AFTER rope+q/k-norm)
+//     for the full-attn layers via cb_eval. The FA kernel hides the softmax, but Q and K are still materialized,
+//     so reconstruct obs = sum_{w in last W queries, h in heads} softmax(kq_scale * Q_w . K_key)[key] host-side.
+//     Output-gating in this arch modulates the attention OUTPUT, not the softmax weights, so per-key obs mass is
+//     unaffected. Same /tmp/amass.csv output, ~10-20x faster (no FA-off prefill).
+//
+// Output per-key obs score (sum over the W queries + heads), per full-attn layer -> /tmp/amass.csv
+// (layer,key,recv_cum,recv_obs) with recv_cum=0 so keep_from_amass.py (SCORE=obs) consumes it directly.
+// env: AMASS_W (obs window, default 64); FA-on extra: AMASS_FAON, AMASS_FULL ("27,31,35,39"),
+//      AMASS_SCALE (override kq_scale; default 1/sqrt(head_dim)), AMASS_THREADS (default hw concurrency)
 #include "arg.h"
 #include "common.h"
-#include "debug.h"
 #include "log.h"
 #include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
 
 #include <clocale>
 #include <string>
 #include <vector>
+#include <map>
+#include <set>
+#include <thread>
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <algorithm>
 
-static bool run(llama_context * ctx, const common_params & params) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
+struct amass {
+    std::map<int, std::vector<double>> obs;      // il -> per-key obs score (summed over heads)
+    std::map<int, std::vector<double>> obs_head; // il -> [n_head * n] per-head obs (AMASS_PERHEAD)
+    bool perhead = false;
+    bool capturing = false;
+    int  captures = 0;
 
-    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    // FA-on (Qcur/Kcur) capture state
+    bool faon = false;
+    int  W = 64;
+    int  n_total = 0;                       // total prompt tokens (set in main)
+    float scale = 0.0f;                     // kq_scale (0 => default 1/sqrt(head_dim))
+    int  threads = 0;
+    int  head_dim = 0, n_head = 0, n_head_kv = 0;
+    std::set<int> full;                     // full-attn layers to score
+    std::map<int, std::vector<float>> Kall; // il -> [n_total * n_head_kv * head_dim] (token-major)
+    std::map<int, std::vector<float>> Qtail;// il -> [W * n_head * head_dim] (last W queries)
+    std::map<int, int> kcur, qcur;          // per-layer write cursors (global token index)
+};
 
-    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, true);
+static float elem_to_f32(const ggml_tensor * t, const uint8_t * data, size_t i) {
+    if (t->type == GGML_TYPE_F32) return ((const float *) data)[i];
+    if (t->type == GGML_TYPE_F16) return ggml_fp16_to_fp32(((const ggml_fp16_t *) data)[i]);
+    return 0.0f;
+}
+static int parse_il(const char * name) { const char * d = strrchr(name, '-'); return d ? atoi(d + 1) : -1; }
+static bool base_is(const char * name, const char * want) {
+    const char * d = strrchr(name, '-'); if (!d) return false;
+    size_t len = (size_t)(d - name); return strlen(want) == len && strncmp(name, want, len) == 0;
+}
 
-    if (tokens.empty()) {
-        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
-        return false;
-    }
+// ---- FA-off post-softmax capture (original path) ----
+static bool faoff_cb(struct ggml_tensor * t, bool ask, amass * acc) {
+    if (ask) return acc->capturing && (t->op == GGML_OP_SOFT_MAX); // only request data during phase 2
+    const bool match = (t->op == GGML_OP_SOFT_MAX) && (t->ne[0] > 256);
+    if (!match || !ggml_is_contiguous(t)) return true;
+    const int il = parse_il(t->name);
+    if (il < 0) return true;
+    const int64_t nk = t->ne[0], nq = t->ne[1], nh = t->ne[2];
+    std::vector<uint8_t> buf(ggml_nbytes(t));
+    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
+    auto & vo = acc->obs[il];
+    if ((int64_t) vo.size() < nk) vo.resize(nk, 0.0);
+    acc->captures++;
+    for (int64_t h = 0; h < nh; ++h)
+        for (int64_t q = 0; q < nq; ++q) {
+            const int64_t base = h*nk*nq + q*nk;
+            for (int64_t k = 0; k < nk; ++k) vo[k] += (double) elem_to_f32(t, buf.data(), (size_t)(base + k));
+        }
+    return true;
+}
 
-    LOG_INF("number of input tokens = %zu\n", tokens.size());
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        LOG_INF("  %d\n", tokens[i]);
+// ---- FA-on Qcur/Kcur capture (fast path) ----
+static bool faon_cb(struct ggml_tensor * t, bool ask, amass * acc) {
+    // capture ONLY the final post-RoPE Q/K (op == ROPE). Both "Qcur" and "Kcur" names are reused in
+    // qwen35moe.cpp for the pre-norm 2D projection ([n_embd_kv, n_tokens], op MUL_MAT) AND the post-rope
+    // 3D tensor ([head_dim, n_head_kv, n_tokens]); grabbing the 2D one misreads n_tokens as the head count
+    // and allocates tens of GB. The ROPE filter also excludes the permuted views ("... (view) (permuted)").
+    if (t->op != GGML_OP_ROPE) return ask ? false : true;
+    if (strpbrk(t->name, " (")) return ask ? false : true;
+    const bool isQ = base_is(t->name, "Qcur");
+    const bool isK = base_is(t->name, "Kcur");
+    if (!isQ && !isK) return ask ? false : true;
+    const int il = parse_il(t->name);
+    if (acc->full.count(il) == 0) return ask ? false : true;
+    if (ask) return true; // request the data for this Qcur/Kcur of a scored layer
+    if (!ggml_is_contiguous(t)) { LOG_ERR("non-contiguous %s; cannot capture\n", t->name); return true; }
+    const int hd = (int) t->ne[0], heads = (int) t->ne[1], nt = (int) t->ne[2];
+    std::vector<uint8_t> buf(ggml_nbytes(t));
+    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
+    const int blk = heads * hd; // floats per token
+    if (isK) {
+        acc->head_dim = hd; acc->n_head_kv = heads;
+        auto & Kv = acc->Kall[il];
+        if (Kv.empty()) Kv.assign((size_t) acc->n_total * blk, 0.0f);
+        int base = acc->kcur[il];
+        for (int j = 0; j < nt; ++j) {
+            const int gp = base + j; if (gp >= acc->n_total) break;
+            for (int c = 0; c < blk; ++c) Kv[(size_t) gp * blk + c] = elem_to_f32(t, buf.data(), (size_t) j * blk + c);
+        }
+        acc->kcur[il] += nt;
+    } else {
+        acc->head_dim = hd; acc->n_head = heads;
+        auto & Qv = acc->Qtail[il];
+        if (Qv.empty()) Qv.assign((size_t) acc->W * blk, 0.0f);
+        const int split = acc->n_total - acc->W;
+        int base = acc->qcur[il];
+        for (int j = 0; j < nt; ++j) {
+            const int gp = base + j; if (gp < split || gp >= acc->n_total) continue;
+            const int w = gp - split;
+            for (int c = 0; c < blk; ++c) Qv[(size_t) w * blk + c] = elem_to_f32(t, buf.data(), (size_t) j * blk + c);
+        }
+        acc->qcur[il] += nt;
     }
+    return true;
+}
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
-        LOG_ERR("%s : failed to eval\n", __func__);
-        return false;
+static bool amass_cb(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * acc = (amass *) user_data;
+    return acc->faon ? faon_cb(t, ask, acc) : faoff_cb(t, ask, acc);
+}
+
+// reconstruct per-key obs from captured Qtail/Kall (host, multi-threaded over the W*n_head query/head units)
+static void compute_obs_faon(amass & acc) {
+    const int n = acc.n_total, W = acc.W;
+    const int hd = acc.head_dim, nh = acc.n_head, nkv = acc.n_head_kv;
+    const int split = n - W, grp = nh / nkv;
+    const float scale = acc.scale > 0.0f ? acc.scale : 1.0f / sqrtf((float) hd);
+    unsigned T = acc.threads > 0 ? (unsigned) acc.threads : std::thread::hardware_concurrency();
+    if (T == 0) T = 1;
+    LOG_INF("obs: n=%d W=%d hd=%d nh=%d nkv=%d scale=%.6f threads=%u\n", n, W, hd, nh, nkv, scale, T);
+
+    for (int il : acc.full) {
+        if (acc.Kall.find(il) == acc.Kall.end() || acc.Qtail.find(il) == acc.Qtail.end()) continue;
+        const std::vector<float> & K = acc.Kall[il];
+        const std::vector<float> & Q = acc.Qtail[il];
+        std::vector<double> & obs = acc.obs[il];
+        obs.assign(n, 0.0);
+        const size_t pstride = acc.perhead ? (size_t) nh * n : (size_t) n;
+        std::vector<std::vector<double>> partial(T, std::vector<double>(pstride, 0.0));
+        auto worker = [&](unsigned tid) {
+            std::vector<double> logit(n, 0.0);
+            std::vector<double> & pacc = partial[tid];
+            for (int w = (int) tid; w < W; w += (int) T) {
+                const int qpos = split + w; // global position of this query
+                for (int h = 0; h < nh; ++h) {
+                    const int kv = h / grp;
+                    const float * qp = &Q[((size_t) w * nh + h) * hd];
+                    double maxl = -1e300;
+                    for (int k = 0; k <= qpos; ++k) {
+                        const float * kp = &K[((size_t) k * nkv + kv) * hd];
+                        double d = 0.0;
+                        for (int e = 0; e < hd; ++e) d += (double) qp[e] * (double) kp[e];
+                        d *= scale; logit[k] = d; if (d > maxl) maxl = d;
+                    }
+                    double sum = 0.0;
+                    for (int k = 0; k <= qpos; ++k) { double e = exp(logit[k] - maxl); logit[k] = e; sum += e; }
+                    const double inv = sum > 0.0 ? 1.0 / sum : 0.0;
+                    const size_t off = acc.perhead ? (size_t) h * n : 0;
+                    for (int k = 0; k <= qpos; ++k) pacc[off + k] += logit[k] * inv;
+                }
+            }
+        };
+        std::vector<std::thread> pool;
+        for (unsigned t = 0; t < T; ++t) pool.emplace_back(worker, t);
+        for (auto & th : pool) th.join();
+        if (acc.perhead) {
+            std::vector<double> & oh = acc.obs_head[il]; oh.assign((size_t) nh * n, 0.0);
+            for (unsigned t = 0; t < T; ++t) for (size_t i = 0; i < pstride; ++i) oh[i] += partial[t][i];
+            for (int h = 0; h < nh; ++h) for (int k = 0; k < n; ++k) obs[k] += oh[(size_t) h * n + k];
+        } else {
+            for (unsigned t = 0; t < T; ++t) for (int k = 0; k < n; ++k) obs[k] += partial[t][k];
+        }
     }
+}
 
+static bool decode_range(llama_context * ctx, const std::vector<llama_token> & toks, int lo, int hi, int nb) {
+    llama_batch b = llama_batch_init(nb, 0, 1);
+    for (int i = lo; i < hi; i += nb) {
+        const int cn = std::min(nb, hi - i);
+        b.n_tokens = cn;
+        for (int j = 0; j < cn; ++j) {
+            b.token[j] = toks[i + j]; b.pos[j] = i + j; b.n_seq_id[j] = 1; b.seq_id[j][0] = 0;
+            b.logits[j] = (i + j == hi - 1) ? 1 : 0;
+        }
+        if (llama_decode(ctx, b)) { LOG_ERR("decode failed at %d\n", i); llama_batch_free(b); return false; }
+    }
+    llama_batch_free(b);
     return true;
 }
 
 int main(int argc, char ** argv) {
     std::setlocale(LC_NUMERIC, "C");
-
-    common_debug_cb_user_data cb_data;
-
+    amass acc;
+    acc.W       = getenv("AMASS_W") ? atoi(getenv("AMASS_W")) : 64;
+    acc.faon    = getenv("AMASS_FAON") != nullptr;
+    acc.perhead = getenv("AMASS_PERHEAD") != nullptr;
+    acc.scale   = getenv("AMASS_SCALE") ? (float) atof(getenv("AMASS_SCALE")) : 0.0f;
+    acc.threads = getenv("AMASS_THREADS") ? atoi(getenv("AMASS_THREADS")) : 0;
+    {
+        const char * fl = getenv("AMASS_FULL"); std::string s = fl ? fl : "27,31,35,39";
+        size_t p = 0; while (p < s.size()) { size_t c = s.find(',', p); int v = atoi(s.substr(p, c == std::string::npos ? c : c - p).c_str()); acc.full.insert(v); if (c == std::string::npos) break; p = c + 1; }
+    }
     common_params params;
-
     common_init();
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
-        return 1;
-    }
-
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) return 1;
     llama_backend_init();
     llama_numa_init(params.numa);
-
-    // pass the callback to the backend scheduler
-    // it will be executed for each node during the graph computation
-    params.cb_eval = common_debug_cb_eval;
-    params.cb_eval_user_data = &cb_data;
+    params.cb_eval = amass_cb;
+    params.cb_eval_user_data = &acc;
     params.warmup = false;
+    auto li = common_init_from_params(params);
+    auto * ctx = li->context();
+    if (li->model() == nullptr || ctx == nullptr) { LOG_ERR("init failed\n"); return 1; }
+    const llama_vocab * vocab = llama_model_get_vocab(li->model());
 
-    // init
-    auto llama_init = common_init_from_params(params);
-
-    auto * model = llama_init->model();
-    auto * ctx   = llama_init->context();
+    std::vector<llama_token> toks = common_tokenize(ctx, params.prompt, llama_vocab_get_add_bos(vocab), true);
+    const int n = (int) toks.size();
+    if (n == 0) { LOG_ERR("no tokens\n"); return 1; }
+    acc.n_total = n;
+    const int split = n > acc.W ? n - acc.W : 0;
+    LOG_INF("scorer: mode=%s n=%d W=%d split=%d nb=%d\n", acc.faon ? "FA-ON(QK)" : "FA-OFF(softmax)", n, acc.W, split, params.n_batch);
 
-    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
-        return 1;
+    if (acc.faon) {
+        acc.capturing = true;
+        if (!decode_range(ctx, toks, 0, n, params.n_batch)) return 1; // single fast prefill, capture Qcur/Kcur
+        compute_obs_faon(acc);
+    } else {
+        acc.capturing = false;
+        if (!decode_range(ctx, toks, 0, split, params.n_batch)) return 1;   // phase 1: no capture
+        acc.capturing = true;
+        if (!decode_range(ctx, toks, split, n, params.n_batch)) return 1;   // phase 2: capture obs
     }
 
-    // print system information
-    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-        LOG_INF("\n");
+    FILE * f = fopen("/tmp/amass.csv", "w");
+    fprintf(f, "layer,key,recv_cum,recv_obs\n");
+    for (auto & kv : acc.obs) {
+        const int il = kv.first;
+        for (size_t k = 0; k < kv.second.size(); ++k) fprintf(f, "%d,%zu,0.0,%.6f\n", il, k, kv.second[k]);
     }
+    fclose(f);
+    LOG_INF("\nscorer -> /tmp/amass.csv (%zu layers, mode=%s)\n", acc.obs.size(), acc.faon ? "FA-ON" : "FA-OFF");
 
-    bool OK = run(ctx, params);
-    if (!OK) {
-        return 1;
+    if (acc.perhead) {
+        const int topN = getenv("AMASS_TOPN") ? atoi(getenv("AMASS_TOPN")) : 20;
+        FILE * g = fopen("/tmp/amass_perhead.csv", "w");
+        fprintf(g, "layer,head,rank,key,obs\n");
+        for (auto & kv : acc.obs_head) {
+            const int il = kv.first; const std::vector<double> & oh = kv.second;
+            for (int h = 0; h < acc.n_head; ++h) {
+                std::vector<int> idx(n); for (int i = 0; i < n; ++i) idx[i] = i;
+                const double * o = &oh[(size_t) h * n];
+                const int kk = std::min(topN, n);
+                std::partial_sort(idx.begin(), idx.begin() + kk, idx.end(), [&](int a, int b) { return o[a] > o[b]; });
+                for (int r = 0; r < kk; ++r) fprintf(g, "%d,%d,%d,%d,%.6f\n", il, h, r, idx[r], o[idx[r]]);
+            }
+        }
+        fclose(g);
+        LOG_INF("perhead -> /tmp/amass_perhead.csv (top-%d/head, %d heads x %zu layers)\n", topN, acc.n_head, acc.obs_head.size());
     }
-
-    LOG("\n");
-    llama_perf_context_print(ctx);
-
     llama_backend_free();
-
     return 0;
 }
diff --git a/examples/kv-evict/CMakeLists.txt b/examples/kv-evict/CMakeLists.txt
new file mode 100644
index 000000000..ebb304df6
--- /dev/null
+++ b/examples/kv-evict/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-kv-evict)
+add_executable(${TARGET} kv-evict.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/kv-evict/amass_faon.h b/examples/kv-evict/amass_faon.h
new file mode 100644
index 000000000..52156a2db
--- /dev/null
+++ b/examples/kv-evict/amass_faon.h
@@ -0,0 +1,137 @@
+#pragma once
+// Fast FA-on obs scorer (reusable). During a NORMAL flash-attention prefill, capture the post-RoPE Qcur/Kcur
+// of the chosen full-attn layers via an eval callback, then reconstruct SnapKV-style obs scores host-side:
+//   obs[key] = sum over the last W queries and all heads of softmax(kq_scale * Q_w . K_key)[key]
+// The FA kernel hides the softmax, but Q and K are materialized, so this gives the same per-key attention
+// mass the slow FA-off softmax-capture scorer produces, at FA-on speed. Used by llama-kv-evict (EVICT_AUTO)
+// and the standalone llama-eval-callback scorer. NOTE: capture ONLY op==ROPE tensors named exactly
+// "Qcur"/"Kcur" -- those names are reused in qwen35moe.cpp for a 2D pre-norm projection too, and grabbing
+// that misreads n_tokens as the head count and allocates tens of GB.
+#include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include <map>
+#include <set>
+#include <vector>
+#include <thread>
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#include <algorithm>
+
+struct faon_scorer {
+    int   W = 64, n_total = 0, threads = 0;
+    float scale = 0.0f;                       // 0 => default 1/sqrt(head_dim)
+    int   head_dim = 0, n_head = 0, n_head_kv = 0;
+    std::set<int> full;                       // full-attn layers to score
+    std::map<int, std::vector<float>>  Kall;  // il -> [n_total * n_head_kv * head_dim]
+    std::map<int, std::vector<float>>  Qtail; // il -> [W * n_head * head_dim]
+    std::map<int, int> kcur, qcur;
+    std::map<int, std::vector<double>> obs;   // il -> per-key obs
+
+    static float to_f32(const ggml_tensor * t, const uint8_t * d, size_t i) {
+        if (t->type == GGML_TYPE_F32) return ((const float *) d)[i];
+        if (t->type == GGML_TYPE_F16) return ggml_fp16_to_fp32(((const ggml_fp16_t *) d)[i]);
+        return 0.0f;
+    }
+    static int  parse_il(const char * n) { const char * d = strrchr(n, '-'); return d ? atoi(d + 1) : -1; }
+    static bool base_is(const char * n, const char * w) {
+        const char * d = strrchr(n, '-'); if (!d) return false;
+        size_t l = (size_t)(d - n); return strlen(w) == l && strncmp(n, w, l) == 0;
+    }
+
+    void parse_full(const char * env_default) {
+        const char * fl = getenv("AMASS_FULL"); std::string s = fl ? fl : env_default;
+        size_t p = 0; while (p < s.size()) {
+            size_t c = s.find(',', p);
+            full.insert(atoi(s.substr(p, c == std::string::npos ? c : c - p).c_str()));
+            if (c == std::string::npos) break; p = c + 1;
+        }
+    }
+
+    bool cb(ggml_tensor * t, bool ask) {
+        if (t->op != GGML_OP_ROPE)   return ask ? false : true;
+        if (strpbrk(t->name, " ("))  return ask ? false : true; // skip views/permutes
+        const bool isQ = base_is(t->name, "Qcur"), isK = base_is(t->name, "Kcur");
+        if (!isQ && !isK)            return ask ? false : true;
+        const int il = parse_il(t->name);
+        if (full.count(il) == 0)     return ask ? false : true;
+        if (ask) return true;
+        if (!ggml_is_contiguous(t)) return true;
+        const int hd = (int) t->ne[0], heads = (int) t->ne[1], nt = (int) t->ne[2], blk = heads * hd;
+        std::vector<uint8_t> buf(ggml_nbytes(t));
+        ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
+        if (isK) {
+            head_dim = hd; n_head_kv = heads;
+            auto & K = Kall[il]; if (K.empty()) K.assign((size_t) n_total * blk, 0.0f);
+            const int b = kcur[il];
+            for (int j = 0; j < nt; ++j) { const int gp = b + j; if (gp >= n_total) break;
+                for (int c = 0; c < blk; ++c) K[(size_t) gp * blk + c] = to_f32(t, buf.data(), (size_t) j * blk + c); }
+            kcur[il] += nt;
+        } else {
+            head_dim = hd; n_head = heads;
+            auto & Q = Qtail[il]; if (Q.empty()) Q.assign((size_t) W * blk, 0.0f);
+            const int split = n_total - W, b = qcur[il];
+            for (int j = 0; j < nt; ++j) { const int gp = b + j; if (gp < split || gp >= n_total) continue;
+                const int w = gp - split;
+                for (int c = 0; c < blk; ++c) Q[(size_t) w * blk + c] = to_f32(t, buf.data(), (size_t) j * blk + c); }
+            qcur[il] += nt;
+        }
+        return true;
+    }
+
+    void compute() {
+        const int n = n_total, hd = head_dim, nh = n_head, nkv = n_head_kv, split = n - W, grp = nh / nkv;
+        const float sc = scale > 0.0f ? scale : 1.0f / sqrtf((float) hd);
+        unsigned T = threads > 0 ? (unsigned) threads : std::thread::hardware_concurrency(); if (T == 0) T = 1;
+        for (int il : full) {
+            if (!Kall.count(il) || !Qtail.count(il)) continue;
+            const auto & K = Kall[il]; const auto & Q = Qtail[il];
+            auto & o = obs[il]; o.assign(n, 0.0);
+            std::vector<std::vector<double>> part(T, std::vector<double>(n, 0.0));
+            auto work = [&](unsigned tid) {
+                std::vector<double> lg(n, 0.0); auto & pa = part[tid];
+                for (int w = (int) tid; w < W; w += (int) T) {
+                    const int qp = split + w;
+                    for (int h = 0; h < nh; ++h) {
+                        const int kv = h / grp; const float * qv = &Q[((size_t) w * nh + h) * hd];
+                        double mx = -1e300;
+                        for (int k = 0; k <= qp; ++k) { const float * kp = &K[((size_t) k * nkv + kv) * hd];
+                            double d = 0.0; for (int e = 0; e < hd; ++e) d += (double) qv[e] * (double) kp[e];
+                            d *= sc; lg[k] = d; if (d > mx) mx = d; }
+                        double s = 0.0; for (int k = 0; k <= qp; ++k) { double e = exp(lg[k] - mx); lg[k] = e; s += e; }
+                        const double inv = s > 0.0 ? 1.0 / s : 0.0;
+                        for (int k = 0; k <= qp; ++k) pa[k] += lg[k] * inv;
+                    }
+                }
+            };
+            std::vector<std::thread> pool;
+            for (unsigned t = 0; t < T; ++t) pool.emplace_back(work, t);
+            for (auto & th : pool) th.join();
+            for (unsigned t = 0; t < T; ++t) for (int k = 0; k < n; ++k) o[k] += part[t][k];
+        }
+    }
+
+    // aggregate obs across the full layers (like keep_from_amass.py), return the top-K key positions.
+    // pool>1 applies SnapKV-style max-pool smoothing (kernel=pool) to the aggregated scores BEFORE ranking, so a
+    // token adjacent to a salient peak inherits its score and gets kept too -- keeps whole spans (a needle is
+    // several tokens), not isolated peaks. The kept positions are still the original indices.
+    std::vector<llama_pos> topk(int K, int pool = 0) {
+        std::vector<double> agg(n_total, 0.0);
+        for (auto & kv : obs) for (size_t k = 0; k < kv.second.size() && (int) k < n_total; ++k) agg[k] += kv.second[k];
+        std::vector<double> score = agg;
+        if (pool > 1) {
+            const int r = pool / 2;
+            for (int i = 0; i < n_total; ++i) {
+                double m = agg[i];
+                for (int j = std::max(0, i - r); j <= std::min(n_total - 1, i + r); ++j) if (agg[j] > m) m = agg[j];
+                score[i] = m;
+            }
+        }
+        std::vector<int> idx(n_total); for (int i = 0; i < n_total; ++i) idx[i] = i;
+        const int kk = std::min(K, n_total);
+        std::partial_sort(idx.begin(), idx.begin() + kk, idx.end(), [&](int a, int b) { return score[a] > score[b]; });
+        std::vector<llama_pos> out(idx.begin(), idx.begin() + kk);
+        return out;
+    }
+};
diff --git a/examples/kv-evict/kv-evict.cpp b/examples/kv-evict/kv-evict.cpp
new file mode 100644
index 000000000..727998e73
--- /dev/null
+++ b/examples/kv-evict/kv-evict.cpp
@@ -0,0 +1,297 @@
+// KV eviction harness for the hybrid-iSWA config: prefill a long prompt, evict low-value positions from the
+// BASE (full-attn, layers 27/31/35/39) cache, COMPACT via state save/clear/restore (shrinks n_kv), then
+// decode + measure tok/s + check needle recall. Quantifies the decode win and the recall cost of eviction.
+//
+// MEASURE_DUAL=1: prefill ONCE, then measure baseline decode -> restore post-prefill state -> evict+compact ->
+//   measure evicted decode. Apples-to-apples decode-t/s delta from a single prefill. EOG disabled (fixed ngen).
+//
+// EVICT_AUTO=1: fully self-contained content-aware eviction. Capture post-RoPE Qcur/Kcur during THIS prefill
+//   (fast FA-on obs scorer, amass_faon.h), score, and use the top-KEEP_MID obs keys as the scored-middle
+//   keep-set -- no separate scorer pass, no known needle position, no keepfile. This is the usable pipeline.
+//
+// env: EVICT(0/1) EVICT_SINK EVICT_RECENT EVICT_STRIDE(0=none) EVICT_KEEPFILE EVICT_NGEN NEEDLE_STR MEASURE_DUAL
+//      EVICT_AUTO KEEP_MID(512) AMASS_W(64) AMASS_FULL(27,31,35,39) AMASS_THREADS AMASS_SCALE
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "amass_faon.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <chrono>
+#include <random>
+#include <cmath>
+#include <algorithm>
+
+static int envi(const char * k, int dflt) { const char * v = getenv(k); return v ? atoi(v) : dflt; }
+
+static faon_scorer g_faon;
+static bool kvevict_cb(ggml_tensor * t, bool ask, void * ud) { return ((faon_scorer *) ud)->cb(t, ask); }
+
+// DECODE_TEMP=0 (default) -> greedy argmax; >0 -> temperature sampling (fixed seed for reproducible recall tests).
+// Probes whether the temp-0 greedy degeneracy attractor (bare "!"/ "3." at far+deep) is what fails deep recall.
+static double g_temp = 0.0;
+static bool g_stop_eog = false; // RECALL mode: stop at EOG for a CLEAN answer (vs fixed-NGEN rambling for timing)
+static std::mt19937 g_rng(1234567u);
+static llama_token greedy(llama_context * ctx, const llama_vocab * vocab) {
+    const float * logits = llama_get_logits_ith(ctx, -1);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+    if (g_temp <= 0.0) {
+        llama_token best = 0; float bestv = -1e30f;
+        for (int i = 0; i < n_vocab; ++i) if (logits[i] > bestv) { bestv = logits[i]; best = i; }
+        return best;
+    }
+    float mx = -1e30f; for (int i = 0; i < n_vocab; ++i) if (logits[i] > mx) mx = logits[i];
+    double sum = 0.0; std::vector<double> p(n_vocab);
+    for (int i = 0; i < n_vocab; ++i) { double e = exp((logits[i] - mx) / g_temp); p[i] = e; sum += e; }
+    std::uniform_real_distribution<double> U(0.0, sum); double r = U(g_rng), c = 0.0;
+    for (int i = 0; i < n_vocab; ++i) { c += p[i]; if (r <= c) return i; }
+    return n_vocab - 1;
+}
+
+static bool prefill(llama_context * ctx, const std::vector<llama_token> & toks, int n_batch) {
+    const int n = (int) toks.size();
+    llama_batch batch = llama_batch_init(n_batch, 0, 1);
+    for (int i = 0; i < n; i += n_batch) {
+        const int cn = std::min(n_batch, n - i);
+        batch.n_tokens = cn;
+        for (int j = 0; j < cn; ++j) {
+            batch.token[j]      = toks[i + j];
+            batch.pos[j]        = i + j;
+            batch.n_seq_id[j]   = 1;
+            batch.seq_id[j][0]  = 0;
+            batch.logits[j]     = (i + j == n - 1) ? 1 : 0;
+        }
+        if (llama_decode(ctx, batch)) { LOG_ERR("prefill decode failed at %d\n", i); llama_batch_free(batch); return false; }
+    }
+    llama_batch_free(batch);
+    return true;
+}
+
+static void decode_measure(llama_context * ctx, const llama_vocab * vocab, int start_pos, int ngen,
+                           const char * needle, double & tps, int & hit, std::string & gen, int & produced) {
+    gen.clear(); produced = 0;
+    llama_batch b = llama_batch_init(1, 0, 1);
+    auto d0 = std::chrono::steady_clock::now();
+    int pos = start_pos;
+    for (int g = 0; g < ngen; ++g) {
+        if (g == 1) d0 = std::chrono::steady_clock::now();
+        llama_token t = greedy(ctx, vocab);
+        if (g_stop_eog && llama_vocab_is_eog(vocab, t)) break;
+        gen += common_token_to_piece(ctx, t);
+        b.n_tokens = 1; b.token[0] = t; b.pos[0] = pos++; b.n_seq_id[0] = 1; b.seq_id[0][0] = 0; b.logits[0] = 1;
+        if (llama_decode(ctx, b)) { LOG_ERR("decode failed at gen %d\n", g); break; }
+        if (g >= 1) produced++;
+    }
+    auto d1 = std::chrono::steady_clock::now();
+    llama_batch_free(b);
+    const double s = std::chrono::duration<double>(d1 - d0).count();
+    tps = produced ? produced / s : 0.0;
+    hit = (needle && gen.find(needle) != std::string::npos) ? 1 : 0;
+}
+
+// build the keep-set (sinks + recent always; scored middle from in-process obs (AUTO), keepfile, or strided),
+// evict base-only, compact
+static int evict_and_compact(llama_context * ctx, llama_memory_t mem, int n_prompt,
+                             int sink, int recent, int stride, const std::vector<llama_pos> & scored,
+                             size_t * state_after) {
+    std::vector<llama_pos> keep;
+    for (int p = 0; p < sink && p < n_prompt; ++p) keep.push_back(p);
+    for (int p = std::max(0, n_prompt - recent); p < n_prompt; ++p) keep.push_back(p);
+    if (!scored.empty()) {
+        for (llama_pos p : scored) if (p >= 0 && p < n_prompt) keep.push_back(p);
+    } else if (const char * keepfile = getenv("EVICT_KEEPFILE")) {
+        FILE * kf = fopen(keepfile, "r");
+        if (!kf) { LOG_ERR("cannot open EVICT_KEEPFILE %s\n", keepfile); }
+        else { int p; while (fscanf(kf, "%d", &p) == 1) if (p >= 0 && p < n_prompt) keep.push_back(p); fclose(kf); }
+    } else if (stride > 0) {
+        for (int p = sink; p < n_prompt - recent; p += stride) keep.push_back(p);
+    }
+    std::sort(keep.begin(), keep.end());
+    keep.erase(std::unique(keep.begin(), keep.end()), keep.end());
+
+    if (!llama_memory_evict_base(mem, 0, keep.data(), (int) keep.size())) {
+        LOG_ERR("evict_base returned false (memory not hybrid-iSWA?)\n");
+    }
+    size_t sz = llama_state_seq_get_size(ctx, 0);
+    std::vector<uint8_t> buf(sz);
+    size_t got = llama_state_seq_get_data(ctx, buf.data(), sz, 0);
+    llama_memory_seq_rm(mem, 0, -1, -1);
+    size_t set = llama_state_seq_set_data(ctx, buf.data(), got, 0);
+    if (set == 0) LOG_ERR("compact restore failed (set=0)\n");
+    *state_after = llama_state_seq_get_size(ctx, 0);
+    return (int) keep.size();
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+    common_init();
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) return 1;
+
+    const int do_evict = envi("EVICT", 0);
+    const int dual     = envi("MEASURE_DUAL", 0);
+    const int sink     = envi("EVICT_SINK", 4);
+    const int recent   = envi("EVICT_RECENT", 4096);
+    const int stride   = envi("EVICT_STRIDE", 0);
+    const int ngen     = envi("EVICT_NGEN", 64);
+    const int autoscore= envi("EVICT_AUTO", 0);
+    const int keep_mid = envi("KEEP_MID", 512);
+    const char * needle = getenv("NEEDLE_STR");
+    g_temp = getenv("DECODE_TEMP") ? atof(getenv("DECODE_TEMP")) : 0.0;
+    if (getenv("DECODE_SEED")) g_rng.seed((unsigned) atoi(getenv("DECODE_SEED"))); // vary for hit-rate reliability
+    g_stop_eog = getenv("STOP_EOG") != nullptr; // clean answer (no post-answer rambling) for recall measurement
+
+    if (autoscore) {
+        g_faon.W       = envi("AMASS_W", 64);
+        g_faon.threads = envi("AMASS_THREADS", 0);
+        g_faon.scale   = getenv("AMASS_SCALE") ? (float) atof(getenv("AMASS_SCALE")) : 0.0f;
+        g_faon.parse_full("27,31,35,39");
+        params.cb_eval = kvevict_cb;
+        params.cb_eval_user_data = &g_faon;
+        params.warmup = false;
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    auto init = common_init_from_params(params);
+    auto * ctx   = init->context();
+    auto * model = init->model();
+    if (!ctx || !model) { LOG_ERR("init failed\n"); return 1; }
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    llama_memory_t mem = llama_get_memory(ctx);
+
+    std::vector<llama_token> toks = common_tokenize(ctx, params.prompt, llama_vocab_get_add_bos(vocab), true);
+    const int n_prompt = (int) toks.size();
+    if (n_prompt == 0) { LOG_ERR("no tokens\n"); return 1; }
+    if (autoscore) g_faon.n_total = n_prompt;
+
+    LOG_INF("n_prompt=%d EVICT=%d DUAL=%d AUTO=%d sink=%d recent=%d stride=%d keep_mid=%d ngen=%d\n",
+            n_prompt, do_evict, dual, autoscore, sink, recent, stride, keep_mid, ngen);
+
+    auto t0 = std::chrono::steady_clock::now();
+    if (!prefill(ctx, toks, params.n_batch)) return 1;
+    auto t1 = std::chrono::steady_clock::now();
+    const double prefill_s = std::chrono::duration<double>(t1 - t0).count();
+    const size_t state_before = llama_state_seq_get_size(ctx, 0);
+
+    // AUTO: reconstruct obs from the captured Qcur/Kcur, pick the top-KEEP_MID scored keys
+    std::vector<llama_pos> scored;
+    double obs_s = 0.0;
+    if (autoscore) {
+        auto o0 = std::chrono::steady_clock::now();
+        g_faon.compute();
+        scored = g_faon.topk(keep_mid, envi("POOL_KERNEL", 0));
+        auto o1 = std::chrono::steady_clock::now();
+        obs_s = std::chrono::duration<double>(o1 - o0).count();
+        llama_pos lo = scored.empty() ? -1 : *std::min_element(scored.begin(), scored.end());
+        llama_pos hi = scored.empty() ? -1 : *std::max_element(scored.begin(), scored.end());
+        LOG_INF("AUTO obs: scored=%zu keys (range %d..%d) obs_compute_s=%.2f\n", scored.size(), lo, hi, obs_s);
+    }
+
+    // SEED_SWEEP: from ONE prefill, decode baseline AND evicted N times with different sampling seeds -> hit-RATE
+    // (temp sampling makes single-sample recall noisy; this is the reliability instrument). Prints full GENs for
+    // external multi-code parsing. Requires EVICT_AUTO + DECODE_TEMP>0.
+    if (autoscore && getenv("SEED_SWEEP")) {
+        std::vector<int> seeds;
+        { std::string s = getenv("SEED_SWEEP"); size_t p = 0;
+          while (p < s.size()) { size_t c = s.find(',', p); seeds.push_back(atoi(s.substr(p, c==std::string::npos?c:c-p).c_str())); if (c==std::string::npos) break; p = c+1; } }
+        std::vector<uint8_t> snap(state_before);
+        size_t got = llama_state_seq_get_data(ctx, snap.data(), state_before, 0);
+        printf("\n==== SEED_SWEEP (n_prompt=%d temp=%.2f keep_mid=%d) ====\n", n_prompt, g_temp, keep_mid);
+        for (int sd : seeds) {
+            double tb, te; int hb, he, pb, pe; std::string gb, ge;
+            g_rng.seed((unsigned) sd);
+            llama_memory_seq_rm(mem, 0, -1, -1); llama_state_seq_set_data(ctx, snap.data(), got, 0);
+            decode_measure(ctx, vocab, n_prompt, ngen, needle, tb, hb, gb, pb);
+            g_rng.seed((unsigned) sd);
+            llama_memory_seq_rm(mem, 0, -1, -1); llama_state_seq_set_data(ctx, snap.data(), got, 0);
+            size_t st = state_before; evict_and_compact(ctx, mem, n_prompt, sink, recent, stride, scored, &st);
+            decode_measure(ctx, vocab, n_prompt, ngen, needle, te, he, ge, pe);
+            printf("SEED=%d BASE<<<%s>>> EVICT<<<%s>>>\n", sd, gb.c_str(), ge.c_str());
+            fflush(stdout);
+        }
+        llama_backend_free();
+        return 0;
+    }
+
+    // KEEP_SWEEP: from ONE prefill, measure recall+speed across many keep budgets (amortizes the expensive
+    // long-context prefill -> the keep-budget/recall curve in a single run). Requires EVICT_AUTO (obs scores).
+    if (autoscore && getenv("KEEP_SWEEP")) {
+        std::vector<int> budgets;
+        { std::string s = getenv("KEEP_SWEEP"); size_t p = 0;
+          while (p < s.size()) { size_t c = s.find(',', p); budgets.push_back(atoi(s.substr(p, c==std::string::npos?c:c-p).c_str())); if (c==std::string::npos) break; p = c+1; } }
+        const int pool = envi("POOL_KERNEL", 0);
+        std::vector<uint8_t> snap(state_before);
+        size_t got = llama_state_seq_get_data(ctx, snap.data(), state_before, 0);
+        double tps_b; int hit_b, prod_b; std::string gb;
+        decode_measure(ctx, vocab, n_prompt, ngen, needle, tps_b, hit_b, gb, prod_b);
+        printf("\n==== KEEP_SWEEP (n_prompt=%d obs_s=%.2f) ====\n", n_prompt, obs_s);
+        printf("BASELINE n_kv=%d hit=%d tps=%.2f\n", n_prompt, hit_b, tps_b);
+        for (int K : budgets) {
+            llama_memory_seq_rm(mem, 0, -1, -1);
+            llama_state_seq_set_data(ctx, snap.data(), got, 0);
+            std::vector<llama_pos> sc = g_faon.topk(K, pool);
+            size_t st_after = state_before;
+            int nk = evict_and_compact(ctx, mem, n_prompt, sink, recent, stride, sc, &st_after);
+            double tps_e; int hit_e, prod_e; std::string ge;
+            decode_measure(ctx, vocab, n_prompt, ngen, needle, tps_e, hit_e, ge, prod_e);
+            std::string g40 = ge.substr(0, 44);
+            printf("K=%-6d n_keep=%-7d (%.2f%%) shrink=%.2fx hit=%d tps=%.2f GEN<<<%s>>>\n",
+                   K, nk, 100.0*nk/n_prompt, st_after ? (double)state_before/st_after : 0.0, hit_e, tps_e, g40.c_str());
+            fflush(stdout);
+        }
+        llama_backend_free();
+        return 0;
+    }
+
+    if (dual) {
+        std::vector<uint8_t> snap(state_before);
+        size_t got = llama_state_seq_get_data(ctx, snap.data(), state_before, 0);
+
+        double tps_b, tps_e; int hit_b, hit_e, prod_b, prod_e; std::string gb, ge;
+        decode_measure(ctx, vocab, n_prompt, ngen, needle, tps_b, hit_b, gb, prod_b);
+
+        llama_memory_seq_rm(mem, 0, -1, -1);
+        llama_state_seq_set_data(ctx, snap.data(), got, 0);
+
+        size_t state_after = state_before;
+        int n_keep = evict_and_compact(ctx, mem, n_prompt, sink, recent, stride, scored, &state_after);
+        decode_measure(ctx, vocab, n_prompt, ngen, needle, tps_e, hit_e, ge, prod_e);
+
+        printf("\n==== KV-EVICT DUAL RESULT ====\n");
+        printf("n_prompt=%d prefill_s=%.2f auto=%d obs_compute_s=%.2f\n", n_prompt, prefill_s, autoscore, obs_s);
+        printf("BASELINE: n_kv=%d state=%zu decode_tps=%.2f (tok=%d) hit=%d\n",
+               n_prompt, state_before, tps_b, prod_b, hit_b);
+        printf("EVICTED : n_keep=%d (%.2f%%) state=%zu (%.2fx smaller) decode_tps=%.2f (tok=%d) hit=%d\n",
+               n_keep, 100.0 * n_keep / n_prompt, state_after,
+               state_after ? (double) state_before / state_after : 0.0, tps_e, prod_e, hit_e);
+        printf("DECODE SPEEDUP: %.2fx\n", tps_b ? tps_e / tps_b : 0.0);
+        printf("GEN_base<<<%s>>>\nGEN_evict<<<%s>>>\n", gb.c_str(), ge.c_str());
+        fflush(stdout);
+        llama_backend_free();
+        return 0;
+    }
+
+    size_t state_after = state_before;
+    int n_keep = n_prompt;
+    if (do_evict) n_keep = evict_and_compact(ctx, mem, n_prompt, sink, recent, stride, scored, &state_after);
+
+    double tps; int hit, produced; std::string gen;
+    decode_measure(ctx, vocab, n_prompt, ngen, needle, tps, hit, gen, produced);
+
+    printf("\n==== KV-EVICT RESULT ====\n");
+    printf("n_prompt=%d evict=%d auto=%d n_keep=%d (%.1f%% kept)\n", n_prompt, do_evict, autoscore, n_keep, 100.0 * n_keep / n_prompt);
+    printf("state_bytes_before=%zu after=%zu (%.2fx smaller)\n", state_before, state_after,
+           state_after ? (double) state_before / state_after : 0.0);
+    printf("prefill_s=%.2f obs_compute_s=%.2f decode_tok=%d decode_tps=%.2f\n", prefill_s, obs_s, produced, tps);
+    printf("needle=%s hit=%d\n", needle ? needle : "(none)", hit);
+    printf("GEN<<<%s>>>\n", gen.c_str());
+    fflush(stdout);
+    llama_backend_free();
+    return 0;
+}
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ac021c7a6..bc7dcf253 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -3377,7 +3377,7 @@ static vk_fa_tuning_params get_fa_tuning_params_coopmat1(const vk_device& device
     const uint32_t coopmat_block_rows = 16;
     const uint32_t coopmat_block_cols = 16;
 
-    const uint32_t num_subgroups = 4;
+    const uint32_t num_subgroups = getenv("GGML_VK_FA_NSG") ? (uint32_t)atoi(getenv("GGML_VK_FA_NSG")) : 4;
 
     result.block_rows = coopmat_block_rows;
     result.block_cols = coopmat_block_cols * num_subgroups;
@@ -3859,7 +3859,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
         if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
             m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
             m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
-        } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) {
+        } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary && !getenv("GGML_VK_NOAMDTILE")) {
             // This is intentionally using tx_m values, slight performance increase
             l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
             l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
@@ -10215,7 +10215,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 
     // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
-    bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16;
+    static const bool no_maskopt = getenv("LLAMA_FA_NO_MASKOPT") != nullptr;
+    bool use_mask_opt = !no_maskopt && mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16;
     vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
                                                                    mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type);
 
diff --git a/include/llama.h b/include/llama.h
index f723c9f60..b80a6da72 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -315,6 +315,11 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        // dynsparse-SWA (hybrid attention-layer windowing); 0/NULL = disabled (falls back to LLAMA_DYNSPARSE_SWA* env)
+        int32_t      dynsparse_swa;          // sliding-window size W for the hybrid's attention layers (0 = off)
+        int32_t      dynsparse_swa_keepfull; // keep the last-K attention layers full (far-recall backstop)
+        const char * dynsparse_swa_full;     // explicit comma-separated attn-layer indices to keep full (NULL = none)
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
@@ -772,6 +777,16 @@ extern "C" {
               llama_seq_id seq_id);
 
     // Check if the memory supports shifting
+    // 395aimax: evict from the BASE (non-SWA / full-attn) KV cache of a hybrid-iSWA memory.
+    // Removes every base position for `seq_id` NOT listed in keep[0..n_keep); leaves the SWA
+    // window and the recurrent (DeltaNet) state untouched. Returns false if `mem` is not
+    // hybrid-iSWA. Follow with a state save/clear/restore to physically compact (shrink n_kv).
+    LLAMA_API bool llama_memory_evict_base(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+         const llama_pos * keep,
+                   int32_t n_keep);
+
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
     //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0465430df..7e5694721 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -7,6 +7,10 @@
 #include "llama-batch.h"
 #include "llama-io.h"
 #include "llama-memory.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache.h"
+#include <algorithm>
 #include "llama-mmap.h"
 #include "llama-model.h"
 #include "llama-ext.h"
@@ -3848,6 +3852,39 @@ bool llama_memory_seq_rm(
     return mem->seq_rm(seq_id, p0, p1);
 }
 
+bool llama_memory_evict_base(
+        llama_memory_t mem,
+          llama_seq_id seq_id,
+     const llama_pos * keep,
+               int32_t n_keep) {
+    if (!mem) {
+        return false;
+    }
+
+    // The full-attn (non-SWA) KV lives in the iSWA base cache for the hybrid-iSWA memory.
+    auto * hyb = dynamic_cast<llama_memory_hybrid_iswa *>(mem);
+    if (!hyb) {
+        return false; // only meaningful for the SWA-on hybrid config
+    }
+
+    llama_kv_cache * base = hyb->get_mem_attn()->get_base();
+
+    std::vector<llama_pos> ks(keep, keep + n_keep);
+    std::sort(ks.begin(), ks.end());
+    ks.erase(std::unique(ks.begin(), ks.end()), ks.end());
+
+    // remove the gaps between kept positions, then everything past the last kept position
+    llama_pos prev = -1;
+    for (llama_pos p : ks) {
+        if (p > prev + 1) {
+            base->seq_rm(seq_id, prev + 1, p);
+        }
+        prev = p;
+    }
+    base->seq_rm(seq_id, prev + 1, -1);
+    return true;
+}
+
 void llama_memory_seq_cp(
         llama_memory_t mem,
           llama_seq_id seq_id_src,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 4c86e43c1..3d510a0ad 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2381,7 +2381,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
          ggml_tensor * sinks,
          ggml_tensor * v_mla,
                float   kq_scale,
-                 int   il) const {
+                 int   il,
+         ggml_tensor * k_reps) const {
     const bool v_trans = v->nb[1] > v->nb[2];
 
     // split the batch into streams if needed
@@ -2412,7 +2413,113 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             v = ggml_cast(ctx0, v, GGML_TYPE_F16);
         }
 
-        cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
+        // --- Route B: dynamic content-aware block-sparse mask (env-gated, prefill only) ---
+        // Pooled-QK block scores -> softmax over key-blocks -> threshold -> additive -inf block mask
+        // -> upscale (block-repeat) to [n_kv,n_q] -> add to kq_mask -> reuse the FA tile-skip.
+        // Inert unless LLAMA_DYNSPARSE_TAU is set. (q:[hd,nq,nh,ns], k:[hd,nkv,nhkv,ns])
+        ggml_tensor * fa_mask = kq_mask;
+        // DIAGNOSTIC: force an all-(-inf) COMPUTED mask via a broadcast add (bypasses all machinery).
+        // If FA then skips (fast), computed masks CAN skip and our badd construction is the bug.
+        static const bool dbg_force_inf = getenv("LLAMA_DYNSPARSE_FORCEINF") != nullptr;
+        if (dbg_force_inf && kq_mask != nullptr) {
+            ggml_tensor * ninf = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), -3.4e38f);
+            ninf = ggml_repeat_4d(ctx0, ninf, kq_mask->ne[0], kq_mask->ne[1], kq_mask->ne[2], kq_mask->ne[3]);
+            ninf = ggml_cont(ctx0, ninf);
+            ninf = ggml_cast(ctx0, ninf, kq_mask->type);
+            fa_mask = ggml_add(ctx0, kq_mask, ninf);
+        }
+        static const float dyn_tau = []{ const char * e = getenv("LLAMA_DYNSPARSE_TAU"); return e ? (float) atof(e) : -1.0f; }();
+        static const int   dyn_blk = []{ const char * e = getenv("LLAMA_DYNSPARSE_BLK"); return e ? atoi(e) : 128; }();
+        static const int   dyn_win  = []{ const char * e = getenv("LLAMA_DYNSPARSE_WIN");  return e ? atoi(e) : 4096; }();
+        static const int   dyn_sink = []{ const char * e = getenv("LLAMA_DYNSPARSE_SINK"); return e ? atoi(e) : 256;  }();
+        static const int   dyn_sdim = []{ const char * e = getenv("LLAMA_DYNSPARSE_SCOREDIM"); return e ? atoi(e) : 0; }();
+        static const enum ggml_op_pool dyn_pool = getenv("LLAMA_DYNSPARSE_POOLMAX") ? GGML_OP_POOL_MAX : GGML_OP_POOL_AVG;
+        static const bool dyn_noscore = getenv("LLAMA_DYNSPARSE_NOSCORE") != nullptr;
+        if (dyn_tau >= 0.0f && kq_mask != nullptr && q->ne[1] >= 256 && k->ne[1] >= 256 &&
+            q->ne[1] % dyn_blk == 0 && k->ne[1] % dyn_blk == 0) {
+            const int64_t hd = q->ne[0], nq = q->ne[1], ns = q->ne[3];
+            const int64_t nkv = k->ne[1], B = dyn_blk;
+            const int64_t nqb = nq  / B;
+            const int64_t nkb = nkv / B;
+            // v2: MEAN-pool each B-block so a needle token actually counts toward its block's score
+            // (v1 used the strided FIRST token of each block -> missed needles -> erratic retrieval).
+            // mean-pool = permute token->dim0, ggml_pool_1d AVG (kernel=stride=B), permute back.
+            // v4: block-MEAN via ggml_pool_2d (Vulkan-native). ROOT CAUSE of the v2.1 slowdown:
+            // ggml_pool_1d has NO Vulkan impl -> CPU fallback -> a GPU<->CPU copy+sync EVERY ubatch
+            // (fixed ~535 t/s tax; measured: Phase-1 input-mask 936 vs in-graph 401 at the same kept
+            // fraction, and insensitive to scoredim -> not bytes, a sync). pool_2d pools tokens (dim1)
+            // by a 1xB kernel -> no permute/cont needed; it needs F32 src so cast the (sliced) q/k first.
+            // dyn_sdim>0 also slices the head dim (v3a) to shrink the cast+pool further.
+            ggml_tensor * qb;
+            ggml_tensor * kb;
+            if (k_reps != nullptr) {
+                // v5: AMORTIZED - K block reps come from the persistent rep cache (built incrementally on KV
+                // write), so there is NO per-ubatch O(N^2) re-sweep of K. Only Q (current ubatch, small) is
+                // pooled here. k_reps: [n_embd_head_k, n_head_kv, nkb, ns] -> permute to [hd, nkb, nhkv, ns].
+                ggml_tensor * qf = q->type == GGML_TYPE_F32 ? ggml_cont(ctx0, q) : ggml_cast(ctx0, q, GGML_TYPE_F32);
+                qb = ggml_pool_2d(ctx0, qf, dyn_pool, 1, (int) B, 1, (int) B, 0, 0);
+                kb = ggml_cont(ctx0, ggml_permute(ctx0, k_reps, 0, 2, 1, 3));
+            } else {
+                ggml_tensor * qs = q, * ks = k;
+                if (dyn_sdim > 0 && dyn_sdim < hd) {
+                    qs = ggml_view_4d(ctx0, q, dyn_sdim, q->ne[1], q->ne[2], q->ne[3], q->nb[1], q->nb[2], q->nb[3], 0);
+                    ks = ggml_view_4d(ctx0, k, dyn_sdim, k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
+                }
+                ggml_tensor * qf = qs->type == GGML_TYPE_F32 ? ggml_cont(ctx0, qs) : ggml_cast(ctx0, qs, GGML_TYPE_F32);
+                ggml_tensor * kf = ks->type == GGML_TYPE_F32 ? ggml_cont(ctx0, ks) : ggml_cast(ctx0, ks, GGML_TYPE_F32);
+                qb = ggml_pool_2d(ctx0, qf, dyn_pool, 1, (int) B, 1, (int) B, 0, 0);
+                kb = ggml_pool_2d(ctx0, kf, dyn_pool, 1, (int) B, 1, (int) B, 0, 0);
+            }
+            // per-head block scores (GQA broadcast nhkv->nh): [nkb, nqb, nh, ns]
+            ggml_tensor * sc = ggml_mul_mat(ctx0, kb, qb);
+            // sum over heads -> [nkb, nqb, ns]
+            sc = ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, sc, 1, 2, 0, 3)));
+            sc = ggml_reshape_3d(ctx0, sc, nkb, nqb, ns);
+            // softmax over key-blocks, threshold to keep(0/1)
+            ggml_tensor * prob   = ggml_soft_max(ctx0, sc);
+            ggml_tensor * ntau   = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), -dyn_tau);
+            ggml_tensor * keep   = ggml_step(ctx0, ggml_add(ctx0, prob, ntau));   // content keep [nkb,nqb]
+            // v2.1: OR in always-keep sink (first Sb blocks) + recent window (last Wb blocks)
+            const int64_t Sb = (dyn_sink + B - 1) / B;
+            const int64_t Wb = (dyn_win  + B - 1) / B;
+            ggml_tensor * pos      = ggml_arange(ctx0, 0.0f, (float) nkb, 1.0f);    // [nkb] block indices
+            ggml_tensor * sink_ind = ggml_step(ctx0, ggml_scale(ctx0, ggml_sub(ctx0, pos, ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), (float) Sb - 0.5f)), -1.0f));
+            ggml_tensor * rec_ind  = ggml_step(ctx0, ggml_sub(ctx0, pos, ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), (float) (nkb - Wb) - 0.5f)));
+            // NOTE: Vulkan step() is (x >= 0 ? 1 : 0) -> step(0)=1, unlike CPU (x > 0). The OR of 0/1
+            // indicators needs a -0.5 bias so the "neither" case (sum==0) DROPS (step(-0.5)=0) instead of
+            // being kept (step(0)=1). Without this, NOTHING was ever dropped and FA never tile-skipped.
+            ggml_tensor * nhalf    = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), -0.5f);
+            ggml_tensor * keeppos  = ggml_step(ctx0, ggml_add(ctx0, ggml_add(ctx0, sink_ind, rec_ind), nhalf)); // 1 if sink|recent
+            keep = dyn_noscore ? ggml_repeat_4d(ctx0, ggml_reshape_4d(ctx0, keeppos, nkb, 1, 1, 1), nkb, nqb, 1, 1) : ggml_step(ctx0, ggml_add(ctx0, ggml_add(ctx0, keep, keeppos), nhalf));                       // OR content|pos
+            // additive block mask: 0 if kept, -3.4e38 (=> F16 -inf => FA all-neg-inf skip) if dropped
+            ggml_tensor * none   = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1,1,1,1), -1.0f);
+            static const bool dbg_cleankeep = getenv("LLAMA_DYNSPARSE_CLEANKEEP") != nullptr;
+            if (getenv("LLAMA_DYNSPARSE_CONTKEEP")) { keep = ggml_cont(ctx0, keep); }
+            if (dbg_cleankeep) { keep = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, keep->ne[0], keep->ne[1], keep->ne[2], keep->ne[3]), 0.0f); }
+            ggml_tensor * badd   = ggml_cont(ctx0, ggml_scale(ctx0, ggml_add(ctx0, keep, none), 3.4e38f));
+            static const bool dbg_dropall = getenv("LLAMA_DYNSPARSE_DROPALL") != nullptr;
+            if (dbg_dropall) { badd = ggml_fill(ctx0, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, badd->ne[0], badd->ne[1], badd->ne[2], badd->ne[3]), -3.4e38f); }
+            // block -> token via nearest upscale (dims 0,1 each x B), crop to [n_kv, n_q]
+            // expand block mask [nkb,nqb] -> token mask [nkv,nq] via a LAYOUT-EXACT block-repeat
+            // (key = kb*B+i, query = qb*B+j). ggml_upscale/NEAREST produced a mask the FA tile-skip could
+            // NOT read as -inf (FORCEINF broadcast-add proved clean computed masks DO skip @1020 t/s); this
+            // reshape->repeat_4d->reshape is provably aligned to K's block structure. (ns==1 prefill path.)
+            badd = ggml_reshape_4d(ctx0, badd, 1, nkb, 1, nqb);
+            badd = ggml_repeat_4d(ctx0, badd, (int64_t) B, nkb, (int64_t) B, nqb);
+            badd = ggml_cont(ctx0, badd);
+            badd = ggml_reshape_3d(ctx0, badd, nkv, nq, 1);
+            badd = ggml_cast(ctx0, badd, kq_mask->type);
+            badd = ggml_reshape_4d(ctx0, badd, nkv, nq, 1, ns);
+            fa_mask = ggml_cont(ctx0, ggml_add(ctx0, kq_mask, badd));
+        }
+
+        if (getenv("LLAMA_DYNSPARSE_DBG") && fa_mask) {
+            fprintf(stderr, "FAMASK il=%d ne=[%lld,%lld,%lld,%lld] nb=[%zu,%zu,%zu,%zu] type=%d cont=%d\n",
+                il, (long long)fa_mask->ne[0], (long long)fa_mask->ne[1], (long long)fa_mask->ne[2], (long long)fa_mask->ne[3],
+                (size_t)fa_mask->nb[0], (size_t)fa_mask->nb[1], (size_t)fa_mask->nb[2], (size_t)fa_mask->nb[3],
+                (int)fa_mask->type, ggml_is_contiguous(fa_mask));
+        }
+        cur = ggml_flash_attn_ext(ctx0, q, k, v, fa_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
         cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
 
@@ -2657,6 +2764,10 @@ ggml_tensor * llm_graph_context::build_attn(
 
         ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il));
+        ggml_tensor * kr_upd = mctx_cur->cpy_k_reps(ctx0, k_cur, il);
+        if (kr_upd != nullptr) {
+            ggml_build_forward_expand(gf, kr_upd);
+        }
     }
 
     const auto & kq_mask = inp->get_kq_mask();
@@ -2665,7 +2776,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+    ggml_tensor * k_reps = mctx_cur->get_k_reps(ctx0, il);
+
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il, k_reps);
     cb(cur, "kqv_out", il);
 
     if (inp->self_v_rot) {
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 4b5b75c63..65e645bba 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -1053,7 +1053,8 @@ struct llm_graph_context {
             ggml_tensor * sinks,   // [n_head_q]
             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
-                    int   il) const;
+                    int   il,
+            ggml_tensor * k_reps = nullptr) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 8be5f28f3..69d0cf18b 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <cstdlib>
+
 #include "llama.h"
 
 #include <array>
@@ -370,6 +372,9 @@ struct llama_hparams {
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
         assert(p0 >= 0 && p1 >= 0);
+        // OURS: env-gated attention SINK (StreamingLLM) - always keep the first K key positions attendable.
+        { static const int swa_sink = []{ const char * e = getenv("LLAMA_SWA_SINK"); return e ? atoi(e) : 0; }();
+          if (swa_sink > 0 && p0 < (llama_pos) swa_sink) { return false; } }
 
         switch (swa_type) {
             case LLAMA_SWA_TYPE_NONE:
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 12bf5c379..1620c08ba 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1,5 +1,7 @@
 #include "llama-kv-cache.h"
 
+#include <string>
+
 #include "llama-impl.h"
 #include "llama-io.h"
 #include "llama-model.h"
@@ -9,6 +11,7 @@
 #include <cassert>
 #include <cmath>
 #include <cstring>
+#include <cstdlib>
 #include <limits>
 #include <map>
 #include <stdexcept>
@@ -244,8 +247,13 @@ llama_kv_cache::llama_kv_cache(
         const bool has_k = true;
         const bool has_v = !is_mla;
 
-        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
-        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
+        // OURS: env-gated PER-LAYER KV quant. LLAMA_KVQUANT_LAYERS="3,7" -> those layers q4_0, rest type_k/v.
+        ggml_type lk = type_k, lv = type_v;
+        { static const char * kvq = getenv("LLAMA_KVQUANT_LAYERS");
+          if (kvq) { std::string hay = std::string(",") + kvq + ",";
+            if (hay.find("," + std::to_string(il) + ",") != std::string::npos) { lk = GGML_TYPE_Q4_0; lv = GGML_TYPE_Q4_0; } } }
+        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, lk, n_embd_k_gqa, kv_size, n_stream) : nullptr;
+        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, lv, n_embd_v_gqa, kv_size, n_stream) : nullptr;
 
         has_k && ggml_format_name(k, "cache_k_l%d", il);
         has_v && ggml_format_name(v, "cache_v_l%d", il);
@@ -258,9 +266,19 @@ llama_kv_cache::llama_kv_cache(
             v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
         }
 
+        // sparse-attn rep cache: block-mean K reps, [n_embd_k_gqa, nkb, n_stream] (env-gated, prefill amortization)
+        static const bool repcache_enabled = getenv("LLAMA_DYNSPARSE_REPCACHE") != nullptr;
+        ggml_tensor * k_reps = nullptr;
+        if (repcache_enabled && has_k) {
+            const uint32_t B   = 128;
+            const uint32_t nkb = (kv_size + B - 1) / B;
+            k_reps = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, nkb, n_stream);
+            ggml_format_name(k_reps, "cache_kreps_l%d", il);
+        }
+
         map_layer_ids[il] = layers.size();
 
-        layers.push_back({ il, k, v, k_stream, v_stream, });
+        layers.push_back({ il, k, v, k_stream, v_stream, k_reps });
     }
 
     if (reuse) {
@@ -1276,6 +1294,27 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
             ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
 }
 
+ggml_tensor * llama_kv_cache::get_k_reps(ggml_context * ctx, int32_t il, uint32_t n_kb, const slot_info & sinfo) const {
+    const int32_t ikv = map_layer_ids.at(il);
+
+    auto * kr = layers[ikv].k_reps;
+    if (kr == nullptr) {
+        return nullptr;
+    }
+
+    const uint64_t nkb_size     = kr->ne[1];
+    const uint64_t n_embd_k_gqa = kr->ne[0];
+
+    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
+
+    return ggml_view_4d(ctx, kr,
+            hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kb, ns,
+            ggml_row_size(kr->type, hparams.n_embd_head_k(il)),
+            ggml_row_size(kr->type, n_embd_k_gqa),
+            ggml_row_size(kr->type, n_embd_k_gqa*nkb_size),
+            ggml_row_size(kr->type, n_embd_k_gqa*nkb_size)*sinfo.s0);
+}
+
 ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1343,6 +1382,46 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
     return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
+ggml_tensor * llama_kv_cache::cpy_k_reps(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, const slot_info & sinfo, uint32_t n_kv) const {
+    GGML_UNUSED(sinfo);
+
+    const int32_t ikv = map_layer_ids.at(il);
+
+    ggml_tensor * kr = layers[ikv].k_reps;
+    if (kr == nullptr) {
+        return nullptr;
+    }
+
+    const uint32_t B = 128;
+    const int64_t n_embd_head = k_cur->ne[0];
+    const int64_t n_head      = k_cur->ne[1];
+    const int64_t n_tokens    = k_cur->ne[2];
+    const int64_t n_embd_gqa  = n_embd_head*n_head;
+
+    // amortized rep update: only on block-aligned contiguous writes (prefill).
+    // pos0 = first cache cell of this ubatch (assumes contiguous fill).
+    const int64_t pos0 = (int64_t) n_kv - n_tokens;
+    if (n_tokens % B != 0 || pos0 < 0 || pos0 % B != 0) {
+        return nullptr;
+    }
+
+    const int64_t n_new_blocks = n_tokens / B;
+    const int64_t blk0         = pos0 / B;
+
+    // pool ONLY the new tokens into block means: [n_embd_head, n_head, n_tokens] -> [n_embd_gqa, n_tokens]
+    ggml_tensor * kc = ggml_cont(ctx, k_cur);
+    kc = ggml_reshape_2d(ctx, kc, n_embd_gqa, n_tokens);
+    if (kc->type != GGML_TYPE_F32) {
+        kc = ggml_cast(ctx, kc, GGML_TYPE_F32);
+    }
+    // pool dim1 (tokens) by B -> [n_embd_gqa, n_new_blocks]
+    ggml_tensor * reps = ggml_pool_2d(ctx, kc, GGML_OP_POOL_AVG, 1, (int) B, 1, (int) B, 0, 0);
+
+    // write into kr at block offset blk0 (stream 0)
+    ggml_tensor * kr_view = ggml_view_2d(ctx, kr, n_embd_gqa, n_new_blocks, kr->nb[1], blk0*kr->nb[1]);
+    return ggml_cpy(ctx, reps, kr_view);
+}
+
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
     GGML_UNUSED(sinfo);
 
@@ -2595,6 +2674,11 @@ ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) cons
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
 
+ggml_tensor * llama_kv_cache_context::get_k_reps(ggml_context * ctx, int32_t il) const {
+    const uint32_t n_kb = (n_kv + 127) / 128;
+    return kv->get_k_reps(ctx, il, n_kb, sinfos[i_cur]);
+}
+
 ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
     return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
 }
@@ -2603,6 +2687,10 @@ ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_
     return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
 }
 
+ggml_tensor * llama_kv_cache_context::cpy_k_reps(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
+    return kv->cpy_k_reps(ctx, k_cur, il, sinfos[i_cur], n_kv);
+}
+
 ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
     return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
 }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 531d99dbd..c4e4fa596 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -173,9 +173,11 @@ public:
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+    ggml_tensor * get_k_reps(ggml_context * ctx, int32_t il, uint32_t n_kb, const slot_info & sinfo) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
+    ggml_tensor * cpy_k_reps(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, const slot_info & sinfo, uint32_t n_kv) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
 
     //
@@ -231,6 +233,8 @@ private:
 
         std::vector<ggml_tensor *> k_stream;
         std::vector<ggml_tensor *> v_stream;
+
+        ggml_tensor * k_reps = nullptr; // block-mean K reps for sparse-attn cache (env LLAMA_DYNSPARSE_REPCACHE)
     };
 
     bool v_trans = true;  // the value tensor is transposed
@@ -370,6 +374,7 @@ public:
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
+    ggml_tensor * get_k_reps(ggml_context * ctx, int32_t il) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
     // note: the heads in k_cur and v_cur should be laid out contiguously in memory
@@ -378,6 +383,7 @@ public:
     //   - v_cur  [n_embd_head_v, n_head_v, n_tokens]
     //   - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
+    ggml_tensor * cpy_k_reps(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
 
     // create destination indices for each head of the current batch for where it would be written in the KV cache
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d58ebac28..2dec13d94 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2291,6 +2291,9 @@ llama_model_params llama_model_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.dynsparse_swa               =*/ 0,
+        /*.dynsparse_swa_keepfull      =*/ 0,
+        /*.dynsparse_swa_full          =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_direct_io               =*/ false,
diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp
index eb23095ae..fcef8a558 100644
--- a/src/models/granite-hybrid.cpp
+++ b/src/models/granite-hybrid.cpp
@@ -23,6 +23,9 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
     }
 
+    // Generalization: env-gated SWA windowing via the shared helper (one call -- the abstraction).
+    apply_dynsparse_swa(hparams, this->params);
+
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     switch (hparams.n_embd) {
@@ -141,8 +144,6 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
 
     inpL = build_inp_embd(model.tok_embd);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // Positional embeddings populated if rope enabled
@@ -151,6 +152,8 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
         inp_pos = build_inp_pos();
     }
 
+    // Generic lambda so the loop runs with either the plain hybrid input or the hybrid-iswa input (SWA).
+    auto run_layers = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
 
@@ -177,6 +180,9 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
         // input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { run_layers(build_inp_mem_hybrid_iswa()); }
+    else { run_layers(build_inp_mem_hybrid()); }
 
     cur = inpL;
 
@@ -198,9 +204,10 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
+template <typename TAttn>
 ggml_tensor * llama_model_granite_hybrid::graph::build_attention_layer(ggml_tensor *             cur,
                                                               ggml_tensor *             inp_pos,
-                                                              llm_graph_input_attn_kv * inp_attn,
+                                                              TAttn *                   inp_attn,
                                                               const llama_model &       model,
                                                               const int64_t             n_embd_head,
                                                               const int                 il) {
diff --git a/src/models/models.h b/src/models/models.h
index 7a52e7bc1..f2fad5d3a 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -7,6 +7,52 @@
 // note: almost all graphs require at least sqrtf, so include cmath globally
 #include <cmath>
 
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+
+// Shared env-gated dynamic-sparse SWA marking (used by hybrid arches: qwen35moe, qwen3next, granite-hybrid, ...).
+// LLAMA_DYNSPARSE_SWA=W marks the full-attention (non-recurrent) layers sliding-window(W) so the hybrid routes
+// through hybrid-iswa -> rolling KV bounded to W. LLAMA_DYNSPARSE_SWA_FULL=comma-idx keeps listed attn layers FULL
+// (far-recall backstop). ARCH-AGNOSTIC: reads only hparams.recurrent_layer_arr. No-op unless the env var is set.
+static inline void apply_dynsparse_swa(llama_hparams & hparams, const llama_model_params & mp) {
+    // Resolve from CLI params (precedence) or LLAMA_DYNSPARSE_SWA* env (fallback / back-compat).
+    int32_t swa_w = mp.dynsparse_swa;
+    if (swa_w <= 0) { const char * e = getenv("LLAMA_DYNSPARSE_SWA"); swa_w = e ? atoi(e) : 0; }
+    if (swa_w <= 0) return;
+    int32_t keepfull = mp.dynsparse_swa_keepfull;
+    if (keepfull <= 0) { const char * e = getenv("LLAMA_DYNSPARSE_SWA_KEEPFULL"); keepfull = e ? atoi(e) : 0; }
+    const char * full_list = mp.dynsparse_swa_full;
+    if (!full_list || !*full_list) { full_list = getenv("LLAMA_DYNSPARSE_SWA_FULL"); }
+
+    const uint32_t n_main = hparams.n_layer(); // transformer layers (excludes nextn)
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    hparams.n_swa = (uint32_t) swa_w;
+    for (uint32_t i = 0; i < n_main; ++i) {
+        hparams.is_swa_impl[i] = hparams.is_recr(i) ? 0u : 1u; // window the attn (non-recurrent) layers
+    }
+    if (keepfull > 0) {
+        int keep = keepfull;
+        for (int i = (int) n_main - 1; i >= 0 && keep > 0; --i) {
+            if (hparams.is_swa_impl[i]) { hparams.is_swa_impl[i] = 0u; keep--; }
+        }
+    }
+    if (full_list && *full_list) {
+        std::string fs(full_list); size_t p = 0;
+        while (p <= fs.size()) {
+            size_t c = fs.find(',', p);
+            std::string tok = fs.substr(p, c == std::string::npos ? std::string::npos : c - p);
+            if (!tok.empty()) { int idx = atoi(tok.c_str()); if (idx >= 0 && idx < (int) n_main) hparams.is_swa_impl[idx] = 0u; }
+            if (c == std::string::npos) break; p = c + 1;
+        }
+    }
+    fprintf(stderr, "[dynsparse-swa] W=%u n_main=%u windowed_attn=[", (unsigned) swa_w, n_main);
+    for (uint32_t i = 0; i < n_main; ++i) if (hparams.is_swa_impl[i]) fprintf(stderr, "%u,", i);
+    fprintf(stderr, "] kept_full_attn=[");
+    for (uint32_t i = 0; i < n_main; ++i) if (!hparams.is_recr(i) && !hparams.is_swa_impl[i]) fprintf(stderr, "%u,", i);
+    fprintf(stderr, "]\n");
+}
+
 //
 // base classes
 //
@@ -1555,7 +1601,7 @@ struct llama_model_granite_hybrid : public llama_model_base {
     struct graph : public llm_build_mamba_base {
         graph(const llama_model & model, const llm_graph_params & params);
         ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
-        ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
+        template <typename TAttn> ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, TAttn * inp_attn,
             const llama_model & model,const int64_t n_embd_head, const int il);
     };
 
@@ -1905,8 +1951,9 @@ struct llama_model_qwen3next : public llama_model_base {
     struct graph : public llm_build_delta_net_base {
         graph(const llama_model & model, const llm_graph_params & params);
     private:
+        template <typename TAttn>
         ggml_tensor * build_layer_attn(
-        llm_graph_input_attn_kv * inp_attn,
+        TAttn * inp_attn,
                     ggml_tensor * cur,
                     ggml_tensor * inp_pos,
                             int   il);
@@ -1992,8 +2039,9 @@ struct llama_model_qwen35moe : public llama_model_base {
     struct graph : public llm_build_delta_net_base {
         graph(const llama_model & model, const llm_graph_params & params);
     private:
+        template <class InpAttn>
         ggml_tensor * build_layer_attn(
-        llm_graph_input_attn_kv * inp_attn,
+        InpAttn * inp_attn,
                     ggml_tensor * cur,
                     ggml_tensor * inp_pos,
                             int * sections,
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 7b0876cbb..068d233b6 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -1,4 +1,6 @@
 #include "models.h"
+
+#include <string>
 #include "llama-memory-recurrent.h"
 
 void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
@@ -29,6 +31,8 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
         }
     }
 
+    apply_dynsparse_swa(hparams, this->params);
+
     switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_35B_A3B; break;
         case 48: type = LLM_TYPE_122B_A10B; break;
@@ -172,12 +176,13 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
 
     cb(inpL, "model.input_embed", -1);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
+    // SWA recipe (apply_dynsparse_swa): when swa_type!=NONE the model is hybrid-iSWA; build the iswa hybrid
+    // input so the windowed attn layers attend a rolling KV. Generic lambda handles both input types.
+    auto run_transformer = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         res->t_layer_inp[il] = inpL;
 
@@ -227,6 +232,12 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
         // Input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        run_transformer(build_inp_mem_hybrid_iswa());
+    } else {
+        run_transformer(build_inp_mem_hybrid());
+    }
     cur = inpL;
 
     // post-norm hidden state feeds both the LM head and the MTP seed below
@@ -278,8 +289,9 @@ ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
+template <class InpAttn>
 ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn(
-        llm_graph_input_attn_kv * inp,
+        InpAttn * inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
         int *                     sections,
@@ -346,6 +358,22 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn(
                 nullptr, nullptr, nullptr,
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
     cb(cur, "attn_pregate", il);
+    // OURS: env-gated HEAD ABLATION. LLAMA_ABLATE_HEAD_RANGE="a,b" zeros attn-output heads [a,b) (all attn layers).
+    { static const char * abl_env = getenv("LLAMA_ABLATE_HEAD_RANGE");
+      if (abl_env) {
+        std::string as(abl_env); size_t cpos = as.find(',');
+        int a = atoi(as.substr(0, cpos).c_str());
+        int b = (cpos == std::string::npos) ? a : atoi(as.substr(cpos + 1).c_str());
+        const int64_t N = (int64_t) n_embd_head * n_head;
+        const float fa = (float) ((int64_t) a * n_embd_head);
+        const float fb = (float) ((int64_t) b * n_embd_head);
+        // ascending aranges only; +0.5 dodges step(0). ge_a[i]=1 if i>=a*ehd, ge_b[i]=1 if i>=b*ehd.
+        ggml_tensor * ge_a = ggml_step(ctx0, ggml_arange(ctx0, 0.5f - fa, (float) N + 0.5f - fa, 1.0f));
+        ggml_tensor * ge_b = ggml_step(ctx0, ggml_arange(ctx0, 0.5f - fb, (float) N + 0.5f - fb, 1.0f));
+        ggml_tensor * in_range = ggml_sub(ctx0, ge_a, ge_b);   // 1 inside [a,b), 0 outside
+        cur = ggml_sub(ctx0, cur, ggml_mul(ctx0, cur, in_range));   // zero the ablated heads
+        cb(cur, "attn_head_ablated", il);
+      } }
 
     ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
     cb(gate_sigmoid, "gate_sigmoid", il);
@@ -601,7 +629,12 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    auto * inp_attn = build_attn_inp_kv();
+    // MTP+SWA: when the SWA recipe sets swa_type, the model runs on the iSWA hybrid cache, so the MTP
+    // attention input must also be iSWA (the non-iswa build_attn_inp_kv asserts swa_type==NONE). The MTP
+    // layer index (n_layer()) is not marked SWA, so it routes to the full sub-cache (attends full KV).
+    const bool mtp_use_iswa = hparams.swa_type != LLAMA_SWA_TYPE_NONE;
+    llm_graph_input_attn_kv      * inp_attn      = mtp_use_iswa ? nullptr : build_attn_inp_kv();
+    llm_graph_input_attn_kv_iswa * inp_attn_iswa = mtp_use_iswa ? build_attn_inp_kv_iswa() : nullptr;
 
     ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -658,9 +691,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     const float kq_scale = hparams.f_attention_scale == 0.0f
             ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
-    cur = build_attn(inp_attn,
-            nullptr, nullptr, nullptr,
-            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cur = mtp_use_iswa
+        ? build_attn(inp_attn_iswa, nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il)
+        : build_attn(inp_attn,      nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
     cb(cur, "mtp_attn_pregate", il);
 
     cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index 97200a440..ad5b4df12 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -22,6 +22,8 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) {
         }
     }
 
+    apply_dynsparse_swa(hparams, this->params);
+
     switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_80B_A3B; break;
         default: type = LLM_TYPE_UNKNOWN;
@@ -115,11 +117,11 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
     inpL = build_inp_embd(model.tok_embd);
     cb(inpL, "model.embed_tokens", -1);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
+    // Generic lambda so the same loop runs with either the plain hybrid input or the hybrid-iswa input (SWA).
+    auto run_layers = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -167,6 +169,9 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
         // Input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { run_layers(build_inp_mem_hybrid_iswa()); }
+    else { run_layers(build_inp_mem_hybrid()); }
     cur = inpL;
 
     // Final norm
@@ -203,8 +208,9 @@ ggml_tensor * llama_model_qwen3next::graph::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
+template <typename TAttn>
 ggml_tensor * llama_model_qwen3next::graph::build_layer_attn(
-        llm_graph_input_attn_kv * inp,
+        TAttn *                   inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
         int                       il) {
-- 
2.47.3

