diff --git a/developmentlog.md b/developmentlog.md new file mode 100644 index 0000000..a347db7 --- /dev/null +++ b/developmentlog.md @@ -0,0 +1,18 @@ +# Development log + +## legolas: NVMe (Corsair MP600 ELITE / Phison E21) suspend/resume crashes + +The drive hung on resume from S3 (deep) suspend — boot logs ended at `PM: suspend entry (deep)` with no resume, followed by a hard reboot. The Phison E21 can't recover from the D3 transition that S3 forces. + +Investigation also found that two existing workarounds were doing nothing: + +- The udev rule `ATTR{d3cold_allowed}="0"` was a no-op — that sysfs attribute doesn't exist on this Phison endpoint or its parent root port (`0000:00:1d.0`). +- The `nvme-resume-fix` service was broken: `ExecStart` used `>` redirection without a shell, so `echo` just printed `1 > /sys/class/nvme/nvme0/device/rescan` as literal text and never wrote to the `rescan` sysfs file. It never actually rescanned. + +Changes (`hosts/legolas/hardware-configuration.nix`): + +- Switched `mem_sleep_default` from `deep` (S3) to `s2idle` (modern standby). s2idle avoids the deep D3 path that hangs the drive. +- Removed the no-op `d3cold_allowed` rule and the broken rescan service. +- Added udev rules that set `power/control` to `on` (i.e. runtime PM off) on both the NVMe endpoint (`0000:6e:00.0`) and its parent root port (`0000:00:1d.0`), while keeping the existing rule that disables wakeup on PCIe ports. + +Note: runtime PM was already off on the endpoint (`power/control` already read `on`, which means the device is never runtime-suspended); that alone never fixed the crash because system suspend uses a separate code path. Verify after rebuild that the kernel command line (`/proc/cmdline`) has `mem_sleep_default=s2idle` (the old running generation showed a stale duplicate `deep`).
diff --git a/hosts/legolas/hardware-configuration.nix b/hosts/legolas/hardware-configuration.nix index eb730b1..a83a55f 100644 --- a/hosts/legolas/hardware-configuration.nix +++ b/hosts/legolas/hardware-configuration.nix @@ -13,7 +13,10 @@ boot.kernelPackages = pkgs.linuxPackages_latest; boot.kernelParams = [ - "mem_sleep_default=deep" + # s2idle (modern standby) instead of deep (S3): the Phison E21 NVMe + # controller (Corsair MP600 ELITE) hangs on resume from the D3 transition + # that S3 forces. s2idle avoids that path. + "mem_sleep_default=s2idle" "nvme_core.default_ps_max_latency_us=0" "pcie_aspm=off" ]; @@ -67,22 +70,15 @@ }; }; - # Disable PCIe wakeups and prevent NVMe D3cold (fixes resume from S3) + # Keep runtime PM off (power/control set to "on") on the NVMe controller and its + # parent root port, and disable wakeup on PCIe ports. (The previous d3cold_allowed + # rule was a no-op: that sysfs attribute doesn't exist on this Phison endpoint or its root port.) services.udev.extraRules = '' ACTION=="add", SUBSYSTEM=="pci", DRIVER=="pcieport", ATTR{power/wakeup}="disabled" - ACTION=="add", SUBSYSTEM=="pci", KERNEL=="0000:6e:00.0", ATTR{d3cold_allowed}="0" + ACTION=="add", SUBSYSTEM=="pci", KERNEL=="0000:6e:00.0", ATTR{power/control}="on" + ACTION=="add", SUBSYSTEM=="pci", KERNEL=="0000:00:1d.0", ATTR{power/control}="on" ''; - systemd.services.nvme-resume-fix = { - description = "NVMe resume workaround"; - after = [ "suspend.target" ]; - wantedBy = [ "suspend.target" ]; - serviceConfig = { - Type = "oneshot"; - ExecStart = "${pkgs.coreutils}/bin/echo 1 > /sys/class/nvme/nvme0/device/rescan"; - }; - }; - hardware.graphics = { enable = true; extraPackages = with pkgs; [ vpl-gpu-rt ];