From 17df27edeece77d0ceeb48030fea85ff52d38fdd Mon Sep 17 00:00:00 2001 From: Adrian Gunnar Lauterer Date: Wed, 1 Apr 2026 17:20:11 +0200 Subject: [PATCH] updates --- hosts/galadriel/hardware-configuration.nix | 2 +- modules/llama-swap.nix | 36 ++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/hosts/galadriel/hardware-configuration.nix b/hosts/galadriel/hardware-configuration.nix index aecdb59..4cc3387 100644 --- a/hosts/galadriel/hardware-configuration.nix +++ b/hosts/galadriel/hardware-configuration.nix @@ -22,7 +22,7 @@ "usbhid" "sd_mod" ]; - boot.kernelPackages = pkgs.linuxPackages_6_18; + boot.kernelPackages = pkgs.linuxPackages_6_19; #might need to bump down if zfs compat breaks. boot.zfs.package = pkgs.zfs_2_4; boot.initrd.kernelModules = [ ]; boot.kernelModules = [ "kvm-amd" ]; diff --git a/modules/llama-swap.nix b/modules/llama-swap.nix index 7241e71..1f07e2b 100644 --- a/modules/llama-swap.nix +++ b/modules/llama-swap.nix @@ -38,23 +38,30 @@ logLevel = "info"; macros = { - ctx = 64000; + ctx = 32000; ngl = 99; - kv_cache = "-ctk iq4_nl -ctv iq4_nl -fa on"; + kv_cache = "-ctk q4_0 -ctv q4_0 -fa 1"; + batch = "-b 1024 -ub 1024"; hf_repo = ""; + image-tokens = "--image-min-tokens 256 --image-max-tokens 1536"; + qwen35-thinking = "--chat-template-kwargs '{\"enable_thinking\":true}'"; + qwen35-no-thinking = "--chat-template-kwargs '{\"enable_thinking\":false}'"; }; models = { + + "qwen3.5-35b-a3b" = { - cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} --hf-repo $\{hf_repo\} --image-max-tokens 1024 --chat-template-kwargs '{\"enable_thinking\":false}'"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} $\{batch\} --hf-repo $\{hf_repo\} $\{image-tokens\} $\{qwen35-no-thinking\} "; aliases = [ "qwen3.5" ]; ttl = 1800; macros = { + ctx = 49152; hf_repo = "unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ3_XXS"; }; }; "qwen3.5-9b" = { - 
cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --no-mmap --image-max-tokens 1024 --image-min-tokens 512 --chat-template-kwargs '{\"enable_thinking\":true}'"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} $\{batch\} $\{image-tokens\} $\{qwen35-thinking\} "; ttl = 900; macros = { hf_repo = "unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL"; @@ -62,13 +69,24 @@ }; }; "qwen3.5-2b" = { - cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --image-max-tokens 1024"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} $\{batch\} $\{image-tokens\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} "; ttl = 900; macros = { hf_repo = "unsloth/Qwen3.5-2B-GGUF:UD-Q8_K_XL"; - ctx = 128000; + ctx = 200000; }; }; + + "glm4.7-flash" = { + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}"; + ttl = 900; + aliases = [ "coder" ]; + macros = { + hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; #Reap should allow more context in gpu mem + ctx = 64000; #supports 200k. + }; + }; + "ministal-3-8b-reasonning" = { cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}"; aliases = [ "ministral3" ]; @@ -81,6 +99,9 @@ ttl = 900; macros.hf_repo = "mistralai/Ministral-3-3B-Instruct-2512-GGUF"; }; + + + "minicpm-o-4_5" = { cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --mmproj-url https://huggingface.co/openbmb/MiniCPM-o-4_5-gguf/resolve/main/vision/MiniCPM-o-4_5-vision-F16.gguf"; aliases = [ @@ -142,9 +163,6 @@ checkEndpoint = "/v1/audio/voices"; proxy = "http://127.0.0.1:$\{PORT\}"; ttl = 900; - aliases = [ - "tts-1" - ]; }; };