diff --git a/modules/nixos/tools/telegraf/default.nix b/modules/nixos/tools/telegraf/default.nix
index 91e58e4..d3d6e28 100644
--- a/modules/nixos/tools/telegraf/default.nix
+++ b/modules/nixos/tools/telegraf/default.nix
@@ -2,6 +2,38 @@
 let
   cfg = config.syscfg.monitoring.telegraf;
   hasCollector = name: builtins.elem name cfg.collectors;
+  # Groups assigned to the telegraf user and service. A group is listed only
+  # when telegraf is enabled, the "docker" collector is selected, and the
+  # matching container runtime (podman / docker) is enabled on this host.
+  dockerGroups =
+    lib.optionals (cfg.enable && hasCollector "docker" && config.virtualisation.podman.enable) [ "podman" ]
+    ++ lib.optionals (cfg.enable && hasCollector "docker" && config.virtualisation.docker.enable) [ "docker" ];
+  # Pipes amdgpu_top JSON through jq and prints one "amdgpu,card=<key> ..."
+  # record per entry of .devices. Fields whose source value is null are
+  # omitted; an entry with no fields at all produces no output line.
+  amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
+    set -euo pipefail
+    ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
+      def maybe_int($name; $value):
+        if $value == null then empty else "\($name)=\(($value | floor))i" end;
+      def maybe_float($name; $value):
+        if $value == null then empty else "\($name)=\($value)" end;
+      .devices
+      | to_entries[]
+      | [
+          maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
+          maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
+          maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
+          maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
+          maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
+        ] as $fields
+      # Filter $fields explicitly: a bare map(...) at this point would run on
+      # the current {key, value} entry instead of the list of field strings.
+      | ($fields | map(select(length > 0))) as $nonempty
+      | select(($nonempty | length) > 0)
+      | "amdgpu,card=\(.key) " + ($nonempty | join(","))
+    '
+  '';
   baseConfig = {
     agent = {
       interval = "10s";
@@ -40,7 +72,9 @@ let
       };
     })
     (lib.mkIf (hasCollector "diskio") {
-      inputs.diskio = { };
+      inputs.diskio = {
+        skip_serial_number = true;
+      };
     })
     (lib.mkIf (hasCollector "kernel") {
       inputs.kernel = { };
@@ -70,8 +104,8 @@ let
       inputs.docker = {
         endpoint = "unix:///var/run/docker.sock";
         timeout = "5s";
-        perdevice_include = ["blkio" "cpu" "network"];
-        total_include = [];
+        perdevice_include = [ ];
+        total_include = [ ];
       };
     })
     (lib.mkIf (hasCollector "ping") {
@@ -83,6 +117,13 @@ let
         binary = "${pkgs.iputils}/bin/ping";
       };
     })
+    (lib.mkIf (hasCollector "gpu") {
+      inputs.exec = [{
+        commands = [ amdgpuMetricsScript ];
+        timeout = "5s";
+        data_format = "influx";
+      }];
+    })
   ];
   outputsConfig = lib.mkMerge [{
     outputs.influxdb_v3 = {
@@ -105,11 +146,13 @@ in {
       ];
     };
 
-    users.users.telegraf.extraGroups = ["podman"];
+    users.users.telegraf.extraGroups = dockerGroups;
 
     systemd.services.telegraf = {
-      path = lib.optionals (hasCollector "smart") [ pkgs.smartmontools ];
-      serviceConfig.SupplementaryGroups = ["podman"];
+      path =
+        lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli ]
+        ++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];
+      serviceConfig.SupplementaryGroups = dockerGroups;
     };
 
     security.sudo.extraRules = lib.optionals (hasCollector "smart") [{
diff --git a/modules/shared/syscfg/monitoring.nix b/modules/shared/syscfg/monitoring.nix
index bef5dd5..4c6b30d 100644
--- a/modules/shared/syscfg/monitoring.nix
+++ b/modules/shared/syscfg/monitoring.nix
@@ -23,6 +23,7 @@ with lib; {
         "smart"
         "docker"
         "ping"
+        "gpu"
       ]);
       default = [ ];
     };
diff --git a/systems/sandbox/cfg.nix b/systems/sandbox/cfg.nix
index 190433d..d7a9667 100644
--- a/systems/sandbox/cfg.nix
+++ b/systems/sandbox/cfg.nix
@@ -93,14 +93,14 @@
       collectors = [
         "cpu"
         "mem"
-        "swap"
+        #"swap"
         "system"
         "disk"
         "diskio"
-        "kernel"
+        #"kernel"
         "net"
-        "netstat"
-        "processes"
+        #"netstat"
+        #"processes"
         "docker"
         "ping"
       ];