feat: add llama-cpp-nightly package (b8667) and replace unstable llama-cpp-vulkan

This commit is contained in:
2026-04-06 09:39:45 +02:00
parent ea5c79367c
commit dbdb16fde2
4 changed files with 187 additions and 22 deletions
+4
View File
@@ -113,6 +113,8 @@
s2cpp
s2-model
vllm-omni
llama-cpp-nightly
llama-cpp-nightly-vulkan
;
};
@@ -124,6 +126,8 @@
s2cpp = final.callPackage ./packages/s2cpp { };
s2-model = final.callPackage ./packages/s2-model { };
vllm-omni = final.python3Packages.callPackage ./packages/vllm-omni { };
llama-cpp-nightly = final.callPackage ./packages/llama-cpp-nightly { };
llama-cpp-nightly-vulkan = final.callPackage ./packages/llama-cpp-nightly { vulkanSupport = true; };
};
# legolas
+3 -1
View File
@@ -3,6 +3,8 @@
pkgs,
lib,
unstable,
inputs,
system,
...
}:
@@ -16,7 +18,7 @@ in
enable = true;
host = "0.0.0.0";
port = 11111;
package = pkgs.unstable.llama-cpp-vulkan;
package = inputs.self.packages.${system}.llama-cpp-nightly-vulkan;
openFirewall = true;
model = "/var/lib/llama/models/Qwen3.5-35B-A3B-UD-Q2_K_XL.gguf";
extraFlags = [
+27 -21
View File
@@ -9,7 +9,7 @@
{
environment.systemPackages = [
pkgs.unstable.llama-cpp-vulkan
inputs.self.packages.${system}.llama-cpp-nightly-vulkan
pkgs.unstable.stable-diffusion-cpp-vulkan
pkgs.unstable.whisper-cpp-vulkan
inputs.self.packages.${system}.z-image-models
@@ -25,7 +25,7 @@
settings =
let
llama-server = lib.getExe' pkgs.unstable.llama-cpp-vulkan "llama-server";
llama-server = lib.getExe' inputs.self.packages.${system}.llama-cpp-nightly-vulkan "llama-server";
sd-server = lib.getExe' pkgs.unstable.stable-diffusion-cpp-vulkan "sd-server";
whisper-server = lib.getExe' pkgs.unstable.whisper-cpp-vulkan "whisper-server";
podman = lib.getExe pkgs.podman;
@@ -44,7 +44,7 @@
kv_cache = "-ctk q4_0 -ctv q4_0 -fa 1";
batch = "-b 1024 -ub 1024"; # default 512 512
hf_repo = "";
image-tokens = "--image-min-tokens 256 --image-max-tokens 1024";
image-tokens = "--image-min-tokens 256 --image-max-tokens 1536";
qwen35-thinking = "--chat-template-kwargs '{\"enable_thinking\":true}'";
qwen35-no-thinking = "--chat-template-kwargs '{\"enable_thinking\":false}'";
};
@@ -162,15 +162,13 @@
macros.model = "${whisper-models}/models/nb-whisper-small-q5_0.bin";
};
"omnivoice" = {
# Starts the service, then tails the journal to stay in the foreground for llama-swap
cmd = "${pkgs.bash}/bin/bash -c '${pkgs.systemd}/bin/systemctl start podman-omnivoice.service && exec ${pkgs.systemd}/bin/journalctl -u podman-omnivoice.service -f'";
cmdStop = "${pkgs.systemd}/bin/systemctl stop podman-omnivoice.service";
checkEndpoint = "/v1/audio/voices";
proxy = "http://127.0.0.1:8091";
ttl = 900;
};
"omnivoice" = {
cmd = "${pkgs.bash}/bin/bash -c '${pkgs.systemd}/bin/systemctl start podman-omnivoice.service && exec ${pkgs.systemd}/bin/journalctl -u podman-omnivoice.service -f'";
cmdStop = "${pkgs.systemd}/bin/systemctl stop podman-omnivoice.service";
checkEndpoint = "/v1/audio/voices";
proxy = "http://127.0.0.1:8091";
ttl = 900;
};
"kokoro" = {
cmd = "${podman} run --rm --name kokoro-tts -p $\{PORT\}:8880 ghcr.io/remsky/kokoro-fastapi-cpu:latest";
@@ -183,14 +181,21 @@
};
};
virtualisation.oci-containers.containers.omnivoice = {
image = "vllm/vllm-openai:nightly";
ports = [ "8091:8091" ];
cmd = [ "vllm" "serve" "k2-fsa/OmniVoice" "--omni" "--port" "8091" "--trust-remote-code" ];
extraOptions = ["--rm" ];
autoStart = false;
};
virtualisation.oci-containers.containers.omnivoice = {
image = "vllm/vllm-openai:nightly";
ports = [ "8091:8091" ];
cmd = [
"vllm"
"serve"
"k2-fsa/OmniVoice"
"--omni"
"--port"
"8091"
"--trust-remote-code"
];
extraOptions = [ "--rm" ];
autoStart = false;
};
systemd.services.llama-swap = {
serviceConfig = {
@@ -209,7 +214,8 @@ virtualisation.oci-containers.containers.omnivoice = {
PrivateUsers = lib.mkForce false;
RestrictNamespaces = lib.mkForce false;
ProtectKernelNamespaces = false;
ProtectKernelNamespaces = lib.mkForce false;
ProtectKernelTunables = lib.mkForce false;
PrivateMounts = lib.mkForce false;
ProtectSystem = lib.mkForce false;
NoNewPrivileges = lib.mkForce false;
+153
View File
@@ -0,0 +1,153 @@
{
  # Nightly build of llama.cpp pinned to upstream build b8667, with optional
  # CUDA / ROCm / OpenCL / BLAS / Metal / Vulkan / RPC backends.
  lib,
  autoAddDriverRunpath,
  cmake,
  fetchFromGitHub,
  stdenv,
  ninja,
  pkg-config,
  curl,
  config,
  # GPU backend toggles; CUDA/ROCm defaults follow the global nixpkgs config.
  cudaSupport ? config.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? config.rocmSupport,
  rocmPackages ? { },
  # Prefer locally-detected GPU targets when nixpkgs provides them, otherwise
  # fall back to the full supported-target list.
  rocmGpuTargets ? rocmPackages.clr.localGpuTargets or rocmPackages.clr.gpuTargets,
  openclSupport ? false,
  clblast,
  # BLAS is only enabled as a CPU fallback when no accelerated backend is on.
  blasSupport ? builtins.all (x: !x) [
    cudaSupport
    metalSupport
    openclSupport
    rocmSupport
    vulkanSupport
  ],
  blas,
  # Metal: Darwin/aarch64 only, and treated as mutually exclusive with OpenCL.
  metalSupport ? stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isAarch64 && !openclSupport,
  vulkanSupport ? false,
  rpcSupport ? false,
  shaderc,
  vulkan-headers,
  vulkan-loader,
}:
let
  # CUDA requires the matching compiler stdenv; all other configurations use
  # the default stdenv.
  effectiveStdenv = if cudaSupport then cudaPackages.backendStdenv else stdenv;
  inherit (lib)
    cmakeBool
    cmakeFeature
    optionals
    optionalString
    ;
  cudaBuildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
  ];
  rocmBuildInputs = with rocmPackages; [
    clr
    hipblas
    rocblas
  ];
  vulkanBuildInputs = [
    shaderc
    vulkan-headers
    vulkan-loader
  ];
in
effectiveStdenv.mkDerivation (finalAttrs: {
  pname = "llama-cpp-nightly";
  # Upstream build number; the corresponding git tag is "b<version>".
  version = "8667";
  src = fetchFromGitHub {
    owner = "ggml-org";
    repo = "llama.cpp";
    tag = "b${finalAttrs.version}";
    hash = "sha256-bDI7a7OMCbuZyaJX4o22fmQIyrGdzYkoIeVvxBYlnRI=";
    # Keep .git just long enough to record the short commit hash into COMMIT,
    # then strip it so the fixed-output hash stays reproducible.
    # NOTE(review): leaveDotGit is documented as non-reproducible in general;
    # the explicit .git removal in postFetch is what keeps this stable.
    leaveDotGit = true;
    postFetch = ''
      git -C "$out" rev-parse --short HEAD > $out/COMMIT
      find "$out" -name .git -print0 | xargs -0 rm -rf
    '';
  };
  nativeBuildInputs = [
    cmake
    ninja
    pkg-config
  ]
  # nvcc to compile CUDA kernels; autoAddDriverRunpath so the binaries find
  # the host driver's libcuda at runtime.
  ++ optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ];
  buildInputs =
    optionals cudaSupport cudaBuildInputs
    ++ optionals openclSupport [ clblast ]
    ++ optionals rocmSupport rocmBuildInputs
    ++ optionals blasSupport [ blas ]
    ++ optionals vulkanSupport vulkanBuildInputs
    # curl is unconditional because LLAMA_CURL is always enabled below
    # (needed for e.g. --hf-repo model downloads in llama-server).
    ++ [ curl ];
  preConfigure = ''
    prependToVar cmakeFlags "-DLLAMA_BUILD_COMMIT:STRING=$(cat COMMIT)"
  '';
  # COMMIT was written into the source root by postFetch above, so the build
  # reports the real upstream commit instead of "unknown".
  cmakeFlags = [
    # Disable -march=native so the binary is reproducible and portable.
    (cmakeBool "GGML_NATIVE" false)
    (cmakeBool "LLAMA_BUILD_EXAMPLES" false)
    (cmakeBool "LLAMA_BUILD_SERVER" true)
    (cmakeBool "LLAMA_BUILD_TESTS" false)
    (cmakeBool "LLAMA_CURL" true)
    (cmakeBool "BUILD_SHARED_LIBS" true)
    (cmakeBool "GGML_BLAS" blasSupport)
    # NOTE(review): upstream removed the CLBlast backend at some point; confirm
    # GGML_CLBLAST is still recognized at tag b8667 before relying on openclSupport.
    (cmakeBool "GGML_CLBLAST" openclSupport)
    (cmakeBool "GGML_CUDA" cudaSupport)
    (cmakeBool "GGML_HIP" rocmSupport)
    (cmakeBool "GGML_METAL" metalSupport)
    (cmakeBool "GGML_RPC" rpcSupport)
    (cmakeBool "GGML_VULKAN" vulkanSupport)
    (cmakeFeature "LLAMA_BUILD_NUMBER" finalAttrs.version)
  ]
  ++ optionals cudaSupport [
    (cmakeFeature "CMAKE_CUDA_ARCHITECTURES" cudaPackages.flags.cmakeCudaArchitecturesString)
  ]
  ++ optionals rocmSupport [
    (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.clr.hipClangPath}/clang++")
    (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmGpuTargets))
  ]
  ++ optionals metalSupport [
    (cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
    # Embed the Metal shader library in the binary so it works outside $out.
    (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" true)
  ]
  ++ optionals rpcSupport [
    (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
  ];
  postInstall = ''
    ln -sf $out/bin/llama-cli $out/bin/llama
    mkdir -p $out/include
    cp $src/include/llama.h $out/include/
  ''
  # rpc-server is built but not installed by upstream's install target,
  # so copy it from the build tree under a namespaced name.
  + optionalString rpcSupport "cp bin/rpc-server $out/bin/llama-rpc-server";
  doCheck = false;
  meta = {
    description = "Inference of Meta's LLaMA model (and others) in pure C/C++ (nightly b8667)";
    homepage = "https://github.com/ggml-org/llama.cpp";
    license = lib.licenses.mit;
    mainProgram = "llama";
    platforms = lib.platforms.unix;
    # CUDA and OpenCL builds are not supported on Darwin.
    badPlatforms = optionals (cudaSupport || openclSupport) lib.platforms.darwin;
    # Metal only exists on Darwin; any non-Darwin Metal build is broken.
    broken = metalSupport && !effectiveStdenv.hostPlatform.isDarwin;
  };
})