159 lines
4.6 KiB
Nix
159 lines
4.6 KiB
Nix
{ config, lib, pkgs, ... }:
|
|
let
|
|
# Shorthand for this module's option values (config.syscfg.monitoring.telegraf).
cfg = config.syscfg.monitoring.telegraf;
|
|
# Predicate: true when `collector` appears in the cfg.collectors list.
hasCollector = collector: builtins.any (enabled: enabled == collector) cfg.collectors;
|
|
# Groups handed to the telegraf user/service for the "docker" collector:
# "podman" and/or "docker", depending on which virtualisation backend is
# enabled. Empty when the module is disabled or the collector is not selected.
dockerGroups =
  let
    wantsDocker = cfg.enable && hasCollector "docker";
  in
  lib.optional (wantsDocker && config.virtualisation.podman.enable) "podman"
  ++ lib.optional (wantsDocker && config.virtualisation.docker.enable) "docker";
|
|
# Shell script used as a Telegraf exec command: runs amdgpu_top with
# `-J -n 1`, pipes the result through jq, and prints one line per entry of
# `.devices` in the form `amdgpu,card=<key> <field>=<value>,...`.
# Fields whose source value is null are left out; a device with no fields
# at all produces no line.
amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
  set -euo pipefail
  ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
    # Integer field (floored, "i" suffix); emits nothing for null.
    def maybe_int($name; $value):
      if $value == null then empty else "\($name)=\(($value | floor))i" end;

    # Float field; emits nothing for null.
    def maybe_float($name; $value):
      if $value == null then empty else "\($name)=\($value)" end;

    .devices
    | to_entries[]
    | [
        maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
        maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
        maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
        maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
        maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
      ] as $fields
    # Filter $fields explicitly: after "as" the input is still the
    # to_entries item (which .key below relies on), not the array.
    | ($fields | map(select(length > 0))) as $nonempty
    | select(($nonempty | length) > 0)
    | "amdgpu,card=\(.key) " + ($nonempty | join(","))
  '
'';
|
|
# Telegraf `agent` section plus `global_tags`; both take the host name from
# config.syscfg.hostname.
baseConfig =
  let
    inherit (config.syscfg) hostname;
  in
  {
    agent = {
      inherit hostname;
      omit_hostname = false;

      # Collection and flush cadence.
      interval = "10s";
      flush_interval = "10s";
      round_interval = true;

      # Batching / buffering limits.
      metric_batch_size = 1000;
      metric_buffer_limit = 10000;
    };

    global_tags.host = hostname;
  };
|
|
# Telegraf input plugins, one conditional fragment per collector name.
# Each fragment only takes effect when its name is listed in cfg.collectors.
inputsConfig =
  let
    # Guard an `inputs` attrset behind the given collector name.
    whenCollector = collector: inputs: lib.mkIf (hasCollector collector) { inherit inputs; };

    # Common case: collector name and plugin name coincide (inputs.<name>).
    plugin = name: settings: whenCollector name { ${name} = settings; };
  in
  lib.mkMerge [
    (plugin "cpu" {
      percpu = true;
      totalcpu = true;
      collect_cpu_time = false;
      report_active = false;
    })
    (plugin "mem" { })
    (plugin "swap" { })
    (plugin "system" { })
    (plugin "disk" {
      ignore_fs = [ "tmpfs" "devtmpfs" "devfs" "overlay" "squashfs" ];
    })
    (plugin "diskio" {
      skip_serial_number = true;
    })
    (plugin "kernel" { })
    (plugin "net" { })
    (plugin "netstat" { })
    (plugin "processes" { })
    (plugin "temp" { })
    (plugin "mdstat" { })
    (plugin "smart" {
      use_sudo = true;
      attributes = true;
    })
    (plugin "docker" {
      endpoint = "unix:///var/run/docker.sock";
      timeout = "5s";
      perdevice_include = [ ];
      total_include = [ ];
    })
    (plugin "ping" {
      urls = [ "1.1.1.1" ];
      count = 4;
      interval = "60s";
      timeout = 5.0;
      binary = "${pkgs.iputils}/bin/ping";
    })
    # The "gpu" collector maps onto the generic exec plugin, which runs the
    # amdgpu metrics script and parses its output as influx line protocol.
    (whenCollector "gpu" {
      exec = [
        {
          commands = [ amdgpuMetricsScript ];
          timeout = "5s";
          data_format = "influx";
        }
      ];
    })
  ];
|
|
# Output section: a single influxdb_v3 output writing to the "telegraf"
# database at the URLs listed in cfg.outputs.
outputsConfig = lib.mkMerge [
  {
    outputs = {
      influxdb_v3 = {
        urls = cfg.outputs;
        database = "telegraf";
        # Literal "$INFLUX_TOKEN" placeholder, not a Nix interpolation.
        # NOTE(review): presumably resolved by Telegraf from the file(s) in
        # services.telegraf.environmentFiles -- confirm.
        token = "$INFLUX_TOKEN";
      };
    };
  }
];
|
|
in {
  # Everything below only applies when syscfg.monitoring.telegraf.enable is set.
  config = lib.mkIf cfg.enable {
    services.telegraf = {
      enable = true;
      # Environment file (sops secret) passed to the telegraf service.
      environmentFiles = [ config.sops.secrets.telegraf.path ];
      # Module-generated sections first, then user-supplied overrides/additions.
      extraConfig = lib.mkMerge [
        baseConfig
        inputsConfig
        outputsConfig
        cfg.extraConfig
      ];
    };

    # Container-runtime group membership for the docker collector, applied to
    # both the user account and the systemd unit.
    users.users.telegraf.extraGroups = dockerGroups;

    systemd.services.telegraf = {
      path =
        # The smart input is configured with use_sudo = true, so the unit needs
        # `sudo` on PATH in addition to smartctl/nvme. On NixOS the sudo wrapper
        # lives in /run/wrappers/bin, which is not part of a unit's default PATH.
        lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli "/run/wrappers" ]
        ++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];
      serviceConfig.SupplementaryGroups = dockerGroups;
    };

    # Allow the telegraf user to run smartctl through sudo without a password
    # when the smart collector is enabled.
    security.sudo.extraRules = lib.optionals (hasCollector "smart") [
      {
        users = [ "telegraf" ];
        commands = [
          {
            command = "${pkgs.smartmontools}/bin/smartctl";
            options = [ "NOPASSWD" ];
          }
        ];
      }
    ];
  };
}
|