From 17df27edeece77d0ceeb48030fea85ff52d38fdd Mon Sep 17 00:00:00 2001 From: Adrian Gunnar Lauterer Date: Wed, 1 Apr 2026 17:20:11 +0200 Subject: [PATCH] updates --- hosts/galadriel/hardware-configuration.nix | 2 +- modules/llama-swap.nix | 36 ++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/hosts/galadriel/hardware-configuration.nix b/hosts/galadriel/hardware-configuration.nix index aecdb59..4cc3387 100644 --- a/hosts/galadriel/hardware-configuration.nix +++ b/hosts/galadriel/hardware-configuration.nix @@ -22,7 +22,7 @@ "usbhid" "sd_mod" ]; - boot.kernelPackages = pkgs.linuxPackages_6_18; + boot.kernelPackages = pkgs.linuxPackages_6_19; #might need to bump down if zfs compat breaks. boot.zfs.package = pkgs.zfs_2_4; boot.initrd.kernelModules = [ ]; boot.kernelModules = [ "kvm-amd" ]; diff --git a/modules/llama-swap.nix b/modules/llama-swap.nix index 7241e71..1f07e2b 100644 --- a/modules/llama-swap.nix +++ b/modules/llama-swap.nix @@ -38,23 +38,30 @@ logLevel = "info"; macros = { - ctx = 64000; + ctx = 32000; ngl = 99; - kv_cache = "-ctk iq4_nl -ctv iq4_nl -fa on"; + kv_cache = "-ctk q4_0 -ctv q4_0 -fa 1"; + batch = "-b 1024 -ub 1024"; hf_repo = ""; + image-tokens = "--image-min-tokens 256 --image-max-tokens 1536"; + qwen35-thinking = "--chat-template-kwargs '{\"enable_thinking\":true}'"; + qwen35-no-thinking = "--chat-template-kwargs '{\"enable_thinking\":false}'"; }; models = { + + "qwen3.5-35b-a3b" = { - cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} --hf-repo $\{hf_repo\} --image-max-tokens 1024 --chat-template-kwargs '{\"enable_thinking\":false}'"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} $\{batch\} --hf-repo $\{hf_repo\} $\{image-tokens\} $\{qwen35-no-thinking\} "; aliases = [ "qwen3.5" ]; ttl = 1800; macros = { + ctx = 49152; hf_repo = "unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ3_XXS"; }; }; "qwen3.5-9b" = { - 
cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --no-mmap --image-max-tokens 1024 --image-min-tokens 512 --chat-template-kwargs '{\"enable_thinking\":true}'"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} $\{batch\} $\{image-tokens\} $\{qwen35-thinking\} "; ttl = 900; macros = { hf_repo = "unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL"; @@ -62,13 +69,24 @@ }; }; "qwen3.5-2b" = { - cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --image-max-tokens 1024"; + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} $\{batch\} $\{image-tokens\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} "; ttl = 900; macros = { hf_repo = "unsloth/Qwen3.5-2B-GGUF:UD-Q8_K_XL"; - ctx = 128000; + ctx = 200000; }; }; + + "glm4.7-flash" = { + cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} $\{kv_cache\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}"; + ttl = 900; + aliases = [ "coder" ]; + macros = { + hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; #Reap should allow more context in gpu mem + ctx = 64000; #supports 200k. + }; + }; + "ministal-3-8b-reasonning" = { cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}"; aliases = [ "ministral3" ]; @@ -81,6 +99,9 @@ ttl = 900; macros.hf_repo = "mistralai/Ministral-3-3B-Instruct-2512-GGUF"; }; + + + "minicpm-o-4_5" = { cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --mmproj-url https://huggingface.co/openbmb/MiniCPM-o-4_5-gguf/resolve/main/vision/MiniCPM-o-4_5-vision-F16.gguf"; aliases = [ @@ -142,9 +163,6 @@ checkEndpoint = "/v1/audio/voices"; proxy = "http://127.0.0.1:$\{PORT\}"; ttl = 900; - aliases = [ - "tts-1" - ]; }; };