monitoring

This commit is contained in:
soraefir
2026-06-06 01:45:59 +02:00
parent f2a68f34cb
commit 70fdf33f07
3 changed files with 46 additions and 10 deletions

View File

@@ -2,6 +2,30 @@
let
cfg = config.syscfg.monitoring.telegraf;
hasCollector = name: builtins.elem name cfg.collectors;
# Group names for each enabled container runtime ("podman" first, then
# "docker"). The list is empty unless telegraf itself is enabled and the
# "docker" collector has been selected.
dockerGroups =
  let
    # Shared precondition for both runtimes.
    dockerCollectorActive = cfg.enable && hasCollector "docker";
    # Yields [ group ] only when the collector is active and the runtime is on.
    groupWhen = runtimeEnabled: group:
      lib.optionals (dockerCollectorActive && runtimeEnabled) [ group ];
  in
  groupWhen config.virtualisation.podman.enable "podman"
  ++ groupWhen config.virtualisation.docker.enable "docker";
# Shell script that runs amdgpu_top with `-J -n 1`, pipes its JSON output
# through jq and prints one influx line-protocol record per entry of
# `.devices`:
#
#   amdgpu,card=<key> utilization_gpu=..i,utilization_media=..i,...
#
# Fields whose source value is null are left out. A device with no fields
# at all produces no line. utilization_gpu falls back to the GRBM2
# "Command Processor - Graphics" value and finally to 0, so it is always
# present.
amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
  set -euo pipefail
  ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
    # Integer field ("<name>=<n>i"), or nothing when the value is null.
    def maybe_int($name; $value):
      if $value == null then empty else "\($name)=\(($value | floor))i" end;
    # Float field ("<name>=<v>"), or nothing when the value is null.
    def maybe_float($name; $value):
      if $value == null then empty else "\($name)=\($value)" end;
    .devices
    | to_entries[]
    | [
        maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
        maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
        maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
        maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
        maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
      ] as $fields
    # Filter $fields itself. The current input here is still the device
    # entry, so a bare map(...) would iterate over the entry and not over
    # the collected field strings. The parentheses keep .key usable below.
    | ($fields | map(select(length > 0))) as $nonempty
    | select(($nonempty | length) > 0)
    | "amdgpu,card=\(.key) " + ($nonempty | join(","))
  '
'';
baseConfig = {
agent = {
interval = "10s";
@@ -40,7 +64,9 @@ let
};
})
(lib.mkIf (hasCollector "diskio") {
inputs.diskio = { };
inputs.diskio = {
skip_serial_number = true;
};
})
(lib.mkIf (hasCollector "kernel") {
inputs.kernel = { };
@@ -70,8 +96,8 @@ let
inputs.docker = {
endpoint = "unix:///var/run/docker.sock";
timeout = "5s";
perdevice_include = ["blkio" "cpu" "network"];
total_include = [];
perdevice_include = [ ];
total_include = [ ];
};
})
(lib.mkIf (hasCollector "ping") {
@@ -83,6 +109,13 @@ let
binary = "${pkgs.iputils}/bin/ping";
};
})
# "gpu" collector: run the amdgpu metrics script through an exec input and
# read its stdout as influx line protocol.
(lib.mkIf (hasCollector "gpu") {
  inputs = {
    exec = [
      {
        data_format = "influx";
        timeout = "5s";
        commands = [ amdgpuMetricsScript ];
      }
    ];
  };
})
];
outputsConfig = lib.mkMerge [{
outputs.influxdb_v3 = {
@@ -105,11 +138,13 @@ in {
];
};
users.users.telegraf.extraGroups = ["podman"];
users.users.telegraf.extraGroups = dockerGroups;
systemd.services.telegraf = {
path = lib.optionals (hasCollector "smart") [ pkgs.smartmontools ];
serviceConfig.SupplementaryGroups = ["podman"];
path =
lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli ]
++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];
serviceConfig.SupplementaryGroups = dockerGroups;
};
security.sudo.extraRules = lib.optionals (hasCollector "smart") [{