diff --git a/hosts/galadriel/configuration.nix b/hosts/galadriel/configuration.nix
index 73631f4..9dfe296 100644
--- a/hosts/galadriel/configuration.nix
+++ b/hosts/galadriel/configuration.nix
@@ -36,8 +36,10 @@
     ../../modules/qbittorrent.nix
     ../../modules/mealie.nix
     ../../modules/miniflux.nix
-    ../../modules/ollama.nix
-    ../../modules/openwebui.nix
+    #../../modules/ollama.nix # replaced by llama-cpp + llama-swap
+    #../../modules/openwebui.nix # using llama-cpp built-in UI instead
+    ../../modules/llama-cpp.nix
+    ../../modules/llama-swap.nix
     ../../modules/librechat.nix
     ../../modules/immich.nix
diff --git a/modules/llama-cpp.nix b/modules/llama-cpp.nix
new file mode 100644
index 0000000..0ae2ddd
--- /dev/null
+++ b/modules/llama-cpp.nix
@@ -0,0 +1,91 @@
+{
+  config,
+  pkgs,
+  lib,
+  ...
+}:
+
+let
+  modelDir = "/var/lib/llama-cpp/models";
+
+  # llama-cpp with Vulkan support for Intel Arc
+  llama-cpp-vulkan = pkgs.llama-cpp.override {
+    vulkanSupport = true;
+    cudaSupport = false;
+    rocmSupport = false;
+    openclSupport = false;
+    blasSupport = true;
+  };
+
+  llama-server = lib.getExe' llama-cpp-vulkan "llama-server";
+
+  # Model definitions: name -> { url, filename }
+  models = {
+    "Ministral-3-8B-Reasoning-Q4_K_M" = {
+      url = "https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512-GGUF/resolve/main/Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf";
+      filename = "Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf";
+    };
+    "Qwen3.5-35B-A3B-UD-Q3_K_M" = {
+      url = "https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/Qwen3.5-35B-A3B-UD-Q3_K_M.gguf";
+      filename = "Qwen3.5-35B-A3B-UD-Q3_K_M.gguf";
+    };
+    "Qwen3.5-27B-UD-Q4_K_XL" = {
+      url = "https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/Qwen3.5-27B-UD-Q4_K_XL.gguf";
+      filename = "Qwen3.5-27B-UD-Q4_K_XL.gguf";
+    };
+    "LFM2-24B-A2B-Q4_K_M" = {
+      url = "https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF/resolve/main/LFM2-24B-A2B-Q4_K_M.gguf";
+      filename = "LFM2-24B-A2B-Q4_K_M.gguf";
+    };
+    "Nanbeige4-3B-Thinking-Q4_K_M" = {
+      url = "https://huggingface.co/bartowski/Nanbeige_Nanbeige4-3B-Thinking-2511-GGUF/resolve/main/Nanbeige_Nanbeige4-3B-Thinking-2511-Q4_K_M.gguf";
+      filename = "Nanbeige_Nanbeige4-3B-Thinking-2511-Q4_K_M.gguf";
+    };
+  };
+
+  # Generate a systemd oneshot service per model that downloads it if missing
+  downloadServices = lib.mapAttrs' (
+    name: model:
+    lib.nameValuePair "llama-cpp-download-${name}" {
+      description = "Download GGUF model: ${name}";
+      after = [ "network-online.target" ];
+      wants = [ "network-online.target" ];
+      wantedBy = [ "multi-user.target" ];
+
+      serviceConfig = {
+        Type = "oneshot";
+        RemainAfterExit = true;
+        User = "llama-cpp";
+        Group = "llama-cpp";
+        ExecStart = pkgs.writeShellScript "download-${name}" ''
+          MODEL_PATH="${modelDir}/${model.filename}"
+          if [ ! -f "$MODEL_PATH" ]; then
+            echo "Downloading ${name}..."
+            ${lib.getExe pkgs.curl} -L --fail -o "$MODEL_PATH.tmp" "${model.url}"
+            mv "$MODEL_PATH.tmp" "$MODEL_PATH"
+            echo "Download complete: ${name}"
+          else
+            echo "Model already exists: ${name}"
+          fi
+        '';
+      };
+    }
+  ) models;
+in
+{
+  environment.systemPackages = [ llama-cpp-vulkan ];
+
+  users.users.llama-cpp = {
+    isSystemUser = true;
+    group = "llama-cpp";
+    home = "/var/lib/llama-cpp";
+    description = "llama-cpp service user";
+  };
+  users.groups.llama-cpp = { };
+
+  systemd.tmpfiles.rules = [
+    "d ${modelDir} 0755 llama-cpp llama-cpp - -"
+  ];
+
+  systemd.services = downloadServices;
+}
diff --git a/modules/llama-swap.nix b/modules/llama-swap.nix
new file mode 100644
index 0000000..f995ac1
--- /dev/null
+++ b/modules/llama-swap.nix
@@ -0,0 +1,74 @@
+{
+  config,
+  pkgs,
+  lib,
+  ...
+}:
+
+let
+  modelDir = "/var/lib/llama-cpp/models";
+
+  # llama-cpp with Vulkan for Intel Arc
+  llama-cpp-vulkan = pkgs.llama-cpp.override {
+    vulkanSupport = true;
+    cudaSupport = false;
+    rocmSupport = false;
+    openclSupport = false;
+    blasSupport = true;
+  };
+
+  llama-server = lib.getExe' llama-cpp-vulkan "llama-server";
+
+  # Common flags for all models
+  # --no-webui is NOT set so the llama-cpp UI is available per-model
+  commonFlags = "-ngl 99 --port \${PORT} --host 127.0.0.1";
+in
+{
+  services.llama-swap = {
+    enable = true;
+    port = 11111;
+    openFirewall = true;
+
+    settings = {
+      healthCheckTimeout = 120;
+
+      models = {
+        "ministral-3-8b-reasoning" = {
+          cmd = "${llama-server} ${commonFlags} -m ${modelDir}/Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf -c 32768";
+          aliases = [
+            "ministral-3"
+            "ministral"
+          ];
+        };
+        "qwen3.5-35b-a3b" = {
+          cmd = "${llama-server} ${commonFlags} -m ${modelDir}/Qwen3.5-35B-A3B-UD-Q3_K_M.gguf -c 32768";
+          aliases = [
+            "qwen3.5-35b"
+            "qwen-moe"
+          ];
+        };
+        "qwen3.5-27b" = {
+          cmd = "${llama-server} ${commonFlags} -m ${modelDir}/Qwen3.5-27B-UD-Q4_K_XL.gguf -c 16384";
+          aliases = [
+            "qwen3.5"
+            "qwen-27b"
+          ];
+        };
+        "lfm2-24b-a2b" = {
+          cmd = "${llama-server} ${commonFlags} -m ${modelDir}/LFM2-24B-A2B-Q4_K_M.gguf -c 32768";
+          aliases = [
+            "lfm2"
+            "liquid"
+          ];
+        };
+        "nanbeige4-3b-thinking" = {
+          cmd = "${llama-server} ${commonFlags} -m ${modelDir}/Nanbeige_Nanbeige4-3B-Thinking-2511-Q4_K_M.gguf -c 32768";
+          aliases = [
+            "nanbeige"
+            "nanbeige4"
+          ];
+        };
+      };
+    };
+  };
+}
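
Post-deploy smoke test (a sketch, not part of the change set): llama-swap exposes an OpenAI-compatible API on port 11111 and starts the matching llama-server on demand based on the requested model name or alias. The hostname below is an assumption taken from the hosts/galadriel directory; substitute the machine's real address.

    # list the configured models
    curl http://galadriel:11111/v1/models

    # chat completion; llama-swap launches (or swaps to) the matching llama-server first
    curl http://galadriel:11111/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "qwen3.5-27b", "messages": [{"role": "user", "content": "Hello"}]}'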