From ea5c79367cbe3d2000e4228742eca902a27f1f06 Mon Sep 17 00:00:00 2001
From: Adrian Gunnar Lauterer
Date: Mon, 6 Apr 2026 09:45:55 +0200
Subject: [PATCH] llama swap oci edition

---
Review notes (free text between "---" and the diffstat; not part of the commit):

* The line structure of this patch had been collapsed. Indentation and the
  position of blank context lines were rebuilt from the hunk line counts, and
  the two whitespace-only "-" lines (old "ctx" entries) are approximations.
  Verify against the target tree, or regenerate with `git format-patch`,
  before applying.
* The "index" blob hashes are carried over unchanged and no longer describe
  the edited "+" lines listed below.
* Changes relative to the collapsed original, all on "+" lines only:
  - "migth" -> "might" in the kernelPackages comment.
  - Dropped `ProtectKernelNamespaces = false;`: no directive of that name is
    listed in systemd.exec(5). Hunk header and diffstat adjusted accordingly.
  - The new "omnivoice" model entry and the oci-containers block are indented
    to match their surrounding attribute sets.
* NOTE(review): the removed omnivoice command passed `--gpus all`; the new
  oci-containers definition requests no GPU device. Confirm this is intended.
* NOTE(review): the unit now runs as root with several sandboxing options
  forced off, presumably so llama-swap can drive systemctl/podman. Confirm
  this trade-off is acceptable for a service listening on 0.0.0.0.

 hosts/galadriel/hardware-configuration.nix |  2 +-
 modules/llama-swap.nix                     | 66 ++++++++++++++--------
 2 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/hosts/galadriel/hardware-configuration.nix b/hosts/galadriel/hardware-configuration.nix
index 4cc3387..390f4c1 100644
--- a/hosts/galadriel/hardware-configuration.nix
+++ b/hosts/galadriel/hardware-configuration.nix
@@ -22,7 +22,7 @@
     "usbhid"
     "sd_mod"
   ];
-  boot.kernelPackages = pkgs.linuxPackages_6_19; #migth need to bump down if zfs compat breaks.
+  boot.kernelPackages = pkgs.linuxPackages_6_19; # might need to bump down if zfs compat breaks.
   boot.zfs.package = pkgs.zfs_2_4;
   boot.initrd.kernelModules = [ ];
   boot.kernelModules = [ "kvm-amd" ];
diff --git a/modules/llama-swap.nix b/modules/llama-swap.nix
index a340c1b..66d42a7 100644
--- a/modules/llama-swap.nix
+++ b/modules/llama-swap.nix
@@ -42,22 +42,21 @@
         ctx = 32000;
         ngl = 99;
         kv_cache = "-ctk q4_0 -ctv q4_0 -fa 1";
-        batch = "-b 1024 -ub 1024";
+        batch = "-b 1024 -ub 1024"; # default 512 512
         hf_repo = "";
-        image-tokens = "--image-min-tokens 256 --image-max-tokens 1536";
+        image-tokens = "--image-min-tokens 256 --image-max-tokens 1024";
         qwen35-thinking = "--chat-template-kwargs '{\"enable_thinking\":true}'";
         qwen35-no-thinking = "--chat-template-kwargs '{\"enable_thinking\":false}'";
       };
 
       models = {
-
 
         "qwen3.5-35b-a3b" = {
           cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} $\{batch\} --hf-repo $\{hf_repo\} $\{image-tokens\} $\{qwen35-no-thinking\} ";
           aliases = [ "qwen3.5" ];
           ttl = 1800;
           macros = {
-            ctx  = 49152;
+            ctx = 49152;
             hf_repo = "unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ3_XXS";
           };
         };
@@ -83,8 +82,8 @@
           ttl = 900;
           aliases = [ "coder" ];
           macros = {
-            hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; #Reap should allow more context in gpu mem
-            ctx  = 48000;
+            hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; # Reap should allow more context in gpu mem
+            ctx = 48000;
           };
         };
 
@@ -106,7 +105,6 @@
           };
         };
 
-
         "ministal-3-8b-reasonning" = {
           cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}";
           aliases = [ "ministral3" ];
@@ -120,8 +118,6 @@
           macros.hf_repo = "mistralai/Ministral-3-3B-Instruct-2512-GGUF";
         };
 
-
-
         "minicpm-o-4_5" = {
           cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --mmproj-url https://huggingface.co/openbmb/MiniCPM-o-4_5-gguf/resolve/main/vision/MiniCPM-o-4_5-vision-F16.gguf";
           aliases = [
@@ -134,7 +130,6 @@
         "z-image-turbo" = {
           cmd = "${sd-server} --listen-port $\{PORT\} --diffusion-model $\{diffusion_model\} --vae $\{vae\} --llm $\{llm\} --offload-to-cpu --cfg-scale 1.0 --height 1024 --width 1024 --steps 4";
           checkEndpoint = "/";
-          proxy = "http://127.0.0.1:$\{PORT\}";
           ttl = 300;
           aliases = [
             "gpt-image-1"
@@ -149,7 +144,6 @@
         "distil-whisper-v3.5" = {
           cmd = "${whisper-server} --host 127.0.0.1 --port $\{PORT\} -m $\{model\} --request-path /v1/audio/transcriptions --inference-path \"\"";
           checkEndpoint = "/v1/audio/transcriptions/";
-          proxy = "http://127.0.0.1:$\{PORT\}";
           ttl = 0;
           aliases = [
             "whisper"
@@ -160,7 +154,6 @@
         "nb-whisper-small" = {
           cmd = "${whisper-server} --host 127.0.0.1 --port $\{PORT\} -m $\{model\} --request-path /v1/audio/transcriptions --inference-path \"\" --language no";
           checkEndpoint = "/v1/audio/transcriptions/";
-          proxy = "http://127.0.0.1:$\{PORT\}";
           ttl = 0;
           aliases = [
             "whisper-no"
@@ -168,20 +161,21 @@
           ];
           macros.model = "${whisper-models}/models/nb-whisper-small-q5_0.bin";
         };
-
-        "omnivoice" = {
-          cmd = "docker run --gpus all --rm --name omnivoice-$\{port\} -p $\{port\}:8091 vllm/vllm-openai:nightly vllm serve k2-fsa/OmniVoice --omni --port 8091 --trust-remote-code";
-          cmdStop = "${podman} stop omnivoice-$\{PORT\}";
-          checkEndpoint = "/v1/audio/voices";
-          proxy = "http://127.0.0.1:$\{PORT\}";
-          ttl = 900;
-        };
+
+
+        "omnivoice" = {
+          # Starts the service, then tails the journal to stay in the foreground for llama-swap
+          cmd = "${pkgs.bash}/bin/bash -c '${pkgs.systemd}/bin/systemctl start podman-omnivoice.service && exec ${pkgs.systemd}/bin/journalctl -u podman-omnivoice.service -f'";
+          cmdStop = "${pkgs.systemd}/bin/systemctl stop podman-omnivoice.service";
+          checkEndpoint = "/v1/audio/voices";
+          proxy = "http://127.0.0.1:8091";
+          ttl = 900;
+        };
 
         "kokoro" = {
-          cmd = "${podman} run --rm --name kokoro-tts-$\{PORT\} -p $\{PORT\}:8880 ghcr.io/remsky/kokoro-fastapi-cpu:latest";
-          cmdStop = "${podman} stop kokoro-tts-$\{PORT\}";
+          cmd = "${podman} run --rm --name kokoro-tts -p $\{PORT\}:8880 ghcr.io/remsky/kokoro-fastapi-cpu:latest";
+          cmdStop = "${podman} stop kokoro-tts";
           checkEndpoint = "/v1/audio/voices";
-          proxy = "http://127.0.0.1:$\{PORT\}";
           ttl = 900;
         };
 
@@ -189,6 +183,15 @@
     };
   };
 
+  virtualisation.oci-containers.containers.omnivoice = {
+    image = "vllm/vllm-openai:nightly";
+    ports = [ "8091:8091" ];
+    cmd = [ "vllm" "serve" "k2-fsa/OmniVoice" "--omni" "--port" "8091" "--trust-remote-code" ];
+    extraOptions = [ "--rm" ];
+    autoStart = false;
+  };
+
+
   systemd.services.llama-swap = {
     serviceConfig = {
       StateDirectory = "llama-swap";
@@ -199,6 +202,23 @@
         "XDG_CACHE_HOME=/var/cache/llama-swap"
         "MESA_SHADER_CACHE_DIR=/var/cache/llama-swap/mesa"
       ];
+
+      DynamicUser = lib.mkForce false;
+      User = "root";
+      Group = "root";
+
+      PrivateUsers = lib.mkForce false;
+      RestrictNamespaces = lib.mkForce false;
+      PrivateMounts = lib.mkForce false;
+      ProtectSystem = lib.mkForce false;
+      NoNewPrivileges = lib.mkForce false;
+      RestrictAddressFamilies = lib.mkForce [
+        "AF_INET"
+        "AF_INET6"
+        "AF_UNIX"
+        "AF_NETLINK"
+      ];
+      SystemCallFilter = lib.mkForce [ "@system-service" ];
     };
   };
 }