llama swap oci edition
This commit is contained in:
@@ -22,7 +22,7 @@
|
||||
"usbhid"
|
||||
"sd_mod"
|
||||
];
|
||||
boot.kernelPackages = pkgs.linuxPackages_6_19; #might need to bump down if zfs compat breaks.
|
||||
boot.kernelPackages = pkgs.linuxPackages_6_19; # might need to bump down if zfs compat breaks.
|
||||
boot.zfs.package = pkgs.zfs_2_4;
|
||||
boot.initrd.kernelModules = [ ];
|
||||
boot.kernelModules = [ "kvm-amd" ];
|
||||
|
||||
@@ -42,22 +42,21 @@
|
||||
ctx = 32000;
|
||||
ngl = 99;
|
||||
kv_cache = "-ctk q4_0 -ctv q4_0 -fa 1";
|
||||
batch = "-b 1024 -ub 1024";
|
||||
batch = "-b 1024 -ub 1024"; # default 512 512
|
||||
hf_repo = "";
|
||||
image-tokens = "--image-min-tokens 256 --image-max-tokens 1536";
|
||||
image-tokens = "--image-min-tokens 256 --image-max-tokens 1024";
|
||||
qwen35-thinking = "--chat-template-kwargs '{\"enable_thinking\":true}'";
|
||||
qwen35-no-thinking = "--chat-template-kwargs '{\"enable_thinking\":false}'";
|
||||
};
|
||||
|
||||
models = {
|
||||
|
||||
|
||||
"qwen3.5-35b-a3b" = {
|
||||
cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} $\{kv_cache\} $\{batch\} --hf-repo $\{hf_repo\} $\{image-tokens\} $\{qwen35-no-thinking\} ";
|
||||
aliases = [ "qwen3.5" ];
|
||||
ttl = 1800;
|
||||
macros = {
|
||||
ctx = 49152;
|
||||
ctx = 49152;
|
||||
hf_repo = "unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ3_XXS";
|
||||
};
|
||||
};
|
||||
@@ -83,8 +82,8 @@
|
||||
ttl = 900;
|
||||
aliases = [ "coder" ];
|
||||
macros = {
|
||||
hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; #Reap should allow more context in gpu mem
|
||||
ctx = 48000;
|
||||
hf_repo = "unsloth/GLM-4.7-Flash-REAP-23B-A3B-GGUF"; # Reap should allow more context in gpu mem
|
||||
ctx = 48000;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -106,7 +105,6 @@
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
"ministal-3-8b-reasonning" = {
|
||||
cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\}";
|
||||
aliases = [ "ministral3" ];
|
||||
@@ -120,8 +118,6 @@
|
||||
macros.hf_repo = "mistralai/Ministral-3-3B-Instruct-2512-GGUF";
|
||||
};
|
||||
|
||||
|
||||
|
||||
"minicpm-o-4_5" = {
|
||||
cmd = "${llama-server} --port $\{PORT\} --host 0.0.0.0 --ctx-size $\{ctx\} -ngl $\{ngl\} --hf-repo $\{hf_repo\} --mmproj-url https://huggingface.co/openbmb/MiniCPM-o-4_5-gguf/resolve/main/vision/MiniCPM-o-4_5-vision-F16.gguf";
|
||||
aliases = [
|
||||
@@ -134,7 +130,6 @@
|
||||
"z-image-turbo" = {
|
||||
cmd = "${sd-server} --listen-port $\{PORT\} --diffusion-model $\{diffusion_model\} --vae $\{vae\} --llm $\{llm\} --offload-to-cpu --cfg-scale 1.0 --height 1024 --width 1024 --steps 4";
|
||||
checkEndpoint = "/";
|
||||
proxy = "http://127.0.0.1:$\{PORT\}";
|
||||
ttl = 300;
|
||||
aliases = [
|
||||
"gpt-image-1"
|
||||
@@ -149,7 +144,6 @@
|
||||
"distil-whisper-v3.5" = {
|
||||
cmd = "${whisper-server} --host 127.0.0.1 --port $\{PORT\} -m $\{model\} --request-path /v1/audio/transcriptions --inference-path \"\"";
|
||||
checkEndpoint = "/v1/audio/transcriptions/";
|
||||
proxy = "http://127.0.0.1:$\{PORT\}";
|
||||
ttl = 0;
|
||||
aliases = [
|
||||
"whisper"
|
||||
@@ -160,7 +154,6 @@
|
||||
"nb-whisper-small" = {
|
||||
cmd = "${whisper-server} --host 127.0.0.1 --port $\{PORT\} -m $\{model\} --request-path /v1/audio/transcriptions --inference-path \"\" --language no";
|
||||
checkEndpoint = "/v1/audio/transcriptions/";
|
||||
proxy = "http://127.0.0.1:$\{PORT\}";
|
||||
ttl = 0;
|
||||
aliases = [
|
||||
"whisper-no"
|
||||
@@ -168,20 +161,21 @@
|
||||
];
|
||||
macros.model = "${whisper-models}/models/nb-whisper-small-q5_0.bin";
|
||||
};
|
||||
|
||||
"omnivoice" = {
|
||||
cmd = "docker run --gpus all --rm --name omnivoice-$\{port\} -p $\{port\}:8091 vllm/vllm-openai:nightly vllm serve k2-fsa/OmniVoice --omni --port 8091 --trust-remote-code";
|
||||
cmdStop = "${podman} stop omnivoice-$\{PORT\}";
|
||||
checkEndpoint = "/v1/audio/voices";
|
||||
proxy = "http://127.0.0.1:$\{PORT\}";
|
||||
ttl = 900;
|
||||
};
|
||||
|
||||
|
||||
"omnivoice" = {
|
||||
# Starts the service, then tails the journal to stay in the foreground for llama-swap
|
||||
cmd = "${pkgs.bash}/bin/bash -c '${pkgs.systemd}/bin/systemctl start podman-omnivoice.service && exec ${pkgs.systemd}/bin/journalctl -u podman-omnivoice.service -f'";
|
||||
cmdStop = "${pkgs.systemd}/bin/systemctl stop podman-omnivoice.service";
|
||||
checkEndpoint = "/v1/audio/voices";
|
||||
proxy = "http://127.0.0.1:8091";
|
||||
ttl = 900;
|
||||
};
|
||||
|
||||
"kokoro" = {
|
||||
cmd = "${podman} run --rm --name kokoro-tts-$\{PORT\} -p $\{PORT\}:8880 ghcr.io/remsky/kokoro-fastapi-cpu:latest";
|
||||
cmdStop = "${podman} stop kokoro-tts-$\{PORT\}";
|
||||
cmd = "${podman} run --rm --name kokoro-tts -p $\{PORT\}:8880 ghcr.io/remsky/kokoro-fastapi-cpu:latest";
|
||||
cmdStop = "${podman} stop kokoro-tts";
|
||||
checkEndpoint = "/v1/audio/voices";
|
||||
proxy = "http://127.0.0.1:$\{PORT\}";
|
||||
ttl = 900;
|
||||
};
|
||||
|
||||
@@ -189,6 +183,15 @@
|
||||
};
|
||||
};
|
||||
|
||||
virtualisation.oci-containers.containers.omnivoice = {
|
||||
image = "vllm/vllm-openai:nightly";
|
||||
ports = [ "8091:8091" ];
|
||||
cmd = [ "vllm" "serve" "k2-fsa/OmniVoice" "--omni" "--port" "8091" "--trust-remote-code" ];
|
||||
extraOptions = ["--rm" ];
|
||||
autoStart = false;
|
||||
};
|
||||
|
||||
|
||||
systemd.services.llama-swap = {
|
||||
serviceConfig = {
|
||||
StateDirectory = "llama-swap";
|
||||
@@ -199,6 +202,24 @@
|
||||
"XDG_CACHE_HOME=/var/cache/llama-swap"
|
||||
"MESA_SHADER_CACHE_DIR=/var/cache/llama-swap/mesa"
|
||||
];
|
||||
|
||||
DynamicUser = lib.mkForce false;
|
||||
User = "root";
|
||||
Group = "root";
|
||||
|
||||
PrivateUsers = lib.mkForce false;
|
||||
RestrictNamespaces = lib.mkForce false;
|
||||
ProtectKernelNamespaces = false;
|
||||
PrivateMounts = lib.mkForce false;
|
||||
ProtectSystem = lib.mkForce false;
|
||||
NoNewPrivileges = lib.mkForce false;
|
||||
RestrictAddressFamilies = lib.mkForce [
|
||||
"AF_INET"
|
||||
"AF_INET6"
|
||||
"AF_UNIX"
|
||||
"AF_NETLINK"
|
||||
];
|
||||
SystemCallFilter = lib.mkForce [ "@system-service" ];
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user