Files
nixconfig/modules/nixos/tools/telegraf/default.nix
2026-06-06 01:45:59 +02:00

159 lines
4.6 KiB
Nix

{ config, lib, pkgs, ... }:
let
  # Option values for this module, read from syscfg.monitoring.telegraf.
  cfg = config.syscfg.monitoring.telegraf;

  # True when `collector` appears in the cfg.collectors list.
  hasCollector = collector: lib.elem collector cfg.collectors;
# Supplementary groups for the telegraf user/service. Non-empty only when the
# module is enabled and the "docker" collector is selected; "podman" and
# "docker" are each added when the matching virtualisation backend is enabled.
dockerGroups =
  let
    wantsContainerMetrics = cfg.enable && hasCollector "docker";
  in
  lib.optional (wantsContainerMetrics && config.virtualisation.podman.enable) "podman"
  ++ lib.optional (wantsContainerMetrics && config.virtualisation.docker.enable) "docker";
# Shell script used as the command of the exec input for the "gpu" collector.
# It pipes the JSON printed by `amdgpu_top -J -n 1` through jq and prints one
# record per entry of `.devices`, shaped as
#   amdgpu,card=<key> <field>=<value>[,<field>=<value>...]
# to match the exec input's `data_format = "influx"`.
#
# Fix: the list of rendered fields is bound to $fields, but the original then
# ran `map(select(length > 0))` on `.`, which is still the device entry
# ({key, value}) because `... as $x | ...` does not change the input. The
# filter is now applied to $fields, so the final join operates on the field
# strings instead of the device object.
amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
  set -euo pipefail
  ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
    # Integer field with the "i" suffix; emits nothing when the value is null.
    def maybe_int($name; $value):
      if $value == null then empty else "\($name)=\(($value | floor))i" end;
    # Plain numeric field; emits nothing when the value is null.
    def maybe_float($name; $value):
      if $value == null then empty else "\($name)=\($value)" end;
    .devices
    | to_entries[]
    | [
        # GFX activity, falling back to the GRBM2 counter, then to 0.
        maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
        maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
        maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
        maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
        maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
      ] as $fields
    # Filter the rendered fields, not the device entry that "." still refers to.
    | ($fields | map(select(length > 0))) as $nonempty
    # Skip devices for which no field could be rendered.
    | select(($nonempty | length) > 0)
    | "amdgpu,card=\(.key) " + ($nonempty | join(","))
  '
'';
# Settings that are always emitted: the Telegraf `agent` section and the
# global tags attached to every metric.
baseConfig = {
  agent = {
    # Host name comes from the system-wide syscfg.hostname option.
    inherit (config.syscfg) hostname;
    omit_hostname = false;

    # Collection and flush cadence.
    interval = "10s";
    round_interval = true;
    flush_interval = "10s";

    # Batching and buffering limits.
    metric_batch_size = 1000;
    metric_buffer_limit = 10000;
  };

  # Same host name again, as an explicit `host` tag.
  global_tags.host = config.syscfg.hostname;
};
# Telegraf input plugins, one fragment per collector name. A fragment is only
# merged in when its name is listed in cfg.collectors.
inputsConfig =
  let
    # Place `plugins` under `inputs`, guarded by `hasCollector collector`.
    collect = collector: plugins: lib.mkIf (hasCollector collector) { inputs = plugins; };
  in
  lib.mkMerge [
    (collect "cpu" {
      cpu = {
        percpu = true;
        totalcpu = true;
        collect_cpu_time = false;
        report_active = false;
      };
    })
    (collect "mem" { mem = { }; })
    (collect "swap" { swap = { }; })
    (collect "system" { system = { }; })
    (collect "disk" {
      # Pseudo and overlay filesystems are excluded from disk metrics.
      disk.ignore_fs = [ "tmpfs" "devtmpfs" "devfs" "overlay" "squashfs" ];
    })
    (collect "diskio" {
      diskio.skip_serial_number = true;
    })
    (collect "kernel" { kernel = { }; })
    (collect "net" { net = { }; })
    (collect "netstat" { netstat = { }; })
    (collect "processes" { processes = { }; })
    (collect "temp" { temp = { }; })
    (collect "mdstat" { mdstat = { }; })
    (collect "smart" {
      # use_sudo pairs with the sudo rule for smartctl declared in the module body.
      smart = {
        use_sudo = true;
        attributes = true;
      };
    })
    (collect "docker" {
      # NOTE(review): the endpoint is always the Docker socket path, although
      # dockerGroups also grants the "podman" group when podman is enabled.
      # Confirm a Docker-compatible socket exists at this path on podman hosts.
      docker = {
        endpoint = "unix:///var/run/docker.sock";
        timeout = "5s";
        perdevice_include = [ ];
        total_include = [ ];
      };
    })
    (collect "ping" {
      ping = {
        urls = [ "1.1.1.1" ];
        count = 4;
        interval = "60s";
        timeout = 5.0;
        # Absolute path to ping from the iputils package.
        binary = "${pkgs.iputils}/bin/ping";
      };
    })
    (collect "gpu" {
      # GPU metrics come from the helper script, parsed as influx line protocol.
      exec = [
        {
          commands = [ amdgpuMetricsScript ];
          timeout = "5s";
          data_format = "influx";
        }
      ];
    })
  ];
# Telegraf output section: a single influxdb_v3 output writing to the
# "telegraf" database at the URLs given in cfg.outputs.
outputsConfig = lib.mkMerge [
  {
    outputs = {
      influxdb_v3 = {
        database = "telegraf";
        urls = cfg.outputs;
        # Literal text "$INFLUX_TOKEN", not a Nix interpolation. Presumably
        # Telegraf expands it from the environment file that the module body
        # passes via services.telegraf.environmentFiles. TODO confirm.
        token = "$INFLUX_TOKEN";
      };
    };
  }
];
in {
  # Everything below is applied only when cfg.enable is true.
  config = lib.mkIf cfg.enable {
    services.telegraf = {
      enable = true;
      # Environment file taken from the sops secret named "telegraf".
      environmentFiles = [ config.sops.secrets.telegraf.path ];
      # Later entries are merged with earlier ones; cfg.extraConfig carries
      # whatever extra settings the option provides.
      extraConfig = lib.mkMerge [
        baseConfig
        inputsConfig
        outputsConfig
        cfg.extraConfig
      ];
    };

    # Group membership for container metrics, set on both the user account
    # and the systemd unit.
    users.users.telegraf.extraGroups = dockerGroups;

    systemd.services.telegraf = {
      path =
        # The smart input is configured with use_sudo = true, so the unit must
        # be able to find `sudo` in addition to smartctl/nvme. On NixOS the
        # setuid sudo wrapper lives in /run/wrappers/bin, which is not part of
        # a service's default PATH; "/run/wrappers" adds it.
        lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli "/run/wrappers" ]
        ++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];
      serviceConfig.SupplementaryGroups = dockerGroups;
    };

    # Let the telegraf user run smartctl through sudo without a password,
    # but only when the smart collector is enabled.
    security.sudo.extraRules = lib.optionals (hasCollector "smart") [
      {
        users = [ "telegraf" ];
        commands = [
          {
            command = "${pkgs.smartmontools}/bin/smartctl";
            options = [ "NOPASSWD" ];
          }
        ];
      }
    ];
  };
}