Files
nixconfig/modules/nixos/tools/telegraf/default.nix
2026-06-06 13:57:16 +02:00

328 lines
8.5 KiB
Nix

{ config, lib, pkgs, ... }:
let
# Shorthand for this module's own option subtree.
cfg = config.syscfg.monitoring.telegraf;
# True when `name` appears in the configured list of collectors.
hasCollector = name: builtins.elem name cfg.collectors;
# Supplementary groups handed to the telegraf user and unit: one entry per
# container runtime enabled on the host ("podman" first, then "docker"), and
# only while the module is enabled and the "docker" collector is selected.
dockerGroups =
  let
    dockerCollectorActive = cfg.enable && hasCollector "docker";
    # [ group ] when both the collector and the given runtime are enabled,
    # otherwise the empty list.
    groupWhen = runtimeEnabled: group:
      lib.optionals (dockerCollectorActive && runtimeEnabled) [ group ];
  in
  groupWhen config.virtualisation.podman.enable "podman"
  ++ groupWhen config.virtualisation.docker.enable "docker";
# Shell script used by the telegraf exec input for GPU metrics.
#
# It pipes the output of `amdgpu_top -J -n 1` (presumably a single JSON
# sample; confirm against the amdgpu_top CLI) through jq and prints one
# InfluxDB line-protocol record per entry of `.devices`:
#
#   amdgpu,card=<entry key> <field>=<value>[,<field>=<value>...]
#
# maybe_int / maybe_float emit nothing for a null source value, so missing
# readings are simply left out of the record; an entry with no fields at all
# produces no line. utilization_gpu falls back to the GRBM2 "Command
# Processor - Graphics" reading and finally to 0, so it is always present.
amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
  set -euo pipefail
  ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
    def maybe_int($name; $value):
      if $value == null then empty else "\($name)=\(($value | floor))i" end;
    def maybe_float($name; $value):
      if $value == null then empty else "\($name)=\($value)" end;
    .devices
    | to_entries[]
    | [
        maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
        maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
        maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
        maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
        maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
      ] as $fields
    # A variable binding leaves the pipeline input (the device entry) as it
    # was, so the filter must be applied to $fields explicitly.
    | ($fields | map(select(length > 0))) as $nonempty
    | select(($nonempty | length) > 0)
    | "amdgpu,card=\(.key) " + ($nonempty | join(","))
  '
'';
# Agent-level telegraf settings plus the global tags attached to every metric.
# The host name from config.syscfg.hostname is used both as the agent hostname
# and as the `host` global tag.
baseConfig =
  let
    hostName = config.syscfg.hostname;
  in
  {
    agent = {
      hostname = hostName;
      omit_hostname = false;
      # Collection and flush cadence.
      interval = "10s";
      round_interval = true;
      flush_interval = "10s";
      # Batching / buffering limits.
      metric_batch_size = 1000;
      metric_buffer_limit = 10000;
    };
    global_tags.host = hostName;
  };
# Input plugin configuration. Every entry is guarded by hasCollector, so a
# plugin only ends up in the generated configuration when its collector name
# is listed in cfg.collectors. Most entries trim the emitted fields with
# `fielddrop`.
inputsConfig =
  let
    # inputs.<name> = settings, but only when collector <name> is selected.
    collector = name: settings:
      lib.mkIf (hasCollector name) { inputs.${name} = settings; };

    # Shorthand for plugins whose only setting is a list of fields to drop.
    dropFields = name: fields: collector name { fielddrop = fields; };

    # Settings shared by both docker input instances.
    dockerCommon = {
      endpoint = "unix:///var/run/docker.sock";
      timeout = "5s";
      perdevice_include = [ ];
      total_include = [ ];
      docker_label_exclude = [ "*" ];
      tagexclude = [
        "container_image"
        "container_status"
        "container_version"
        "engine_host"
        "server_version"
      ];
    };
  in
  lib.mkMerge [
    (collector "cpu" {
      percpu = true;
      totalcpu = true;
      collect_cpu_time = false;
      report_active = false;
      fielddrop = [
        "usage_guest" "usage_guest_nice"
        "usage_irq" "usage_nice"
        "usage_softirq" "usage_steal"
      ];
    })

    (dropFields "mem" [
      "available_percent"
      "commit_limit" "committed_as"
      "high_free" "high_total"
      "huge_page_size" "huge_pages_free" "huge_pages_total"
      "low_free" "low_total"
      "mapped" "page_tables"
      "slab" "sreclaimable" "sunreclaim"
      "swap_cached" "swap_free" "swap_total"
      "vmalloc_chunk" "vmalloc_total" "vmalloc_used"
      "write_back" "write_back_tmp"
    ])

    (dropFields "swap" [ "free" ])

    (dropFields "system" [ "n_physical_cpus" "n_unique_users" "uptime_format" ])

    (collector "disk" {
      ignore_fs = [ "tmpfs" "devtmpfs" "devfs" "overlay" "squashfs" ];
      fielddrop = [
        "free"
        "inodes_free" "inodes_total" "inodes_used" "inodes_used_percent"
      ];
    })

    (collector "diskio" {
      skip_serial_number = true;
      fielddrop = [
        "io_svctm" "iops_in_progress"
        "merged_reads" "merged_writes"
        "weighted_io_time"
      ];
    })

    (dropFields "kernel" [ "boot_time" ])

    (dropFields "net" [ "bytes_recv" "bytes_sent" "speed" ])

    (dropFields "netstat" [
      "tcp_close" "tcp_close_wait" "tcp_closing"
      "tcp_fin_wait1" "tcp_fin_wait2"
      "tcp_last_ack" "tcp_none"
      "tcp_syn_recv" "tcp_syn_sent"
    ])

    (dropFields "processes" [
      "dead" "idle" "paging" "stopped" "unknown" "zombies"
    ])

    # Plugins enabled with their default settings.
    (collector "temp" { })
    (collector "mdstat" { })

    (collector "smart" {
      use_sudo = true;
      attributes = true;
    })

    # Two docker input instances against the same socket: the first drops the
    # docker_container_health/_mem/_status measurements, the second passes
    # only docker_container_mem and strips most of its fields.
    (collector "docker" [
      (dockerCommon // {
        namedrop = [
          "docker_container_health"
          "docker_container_mem"
          "docker_container_status"
        ];
        fielddrop = [
          "memory_total"
          "n_cpus" "n_goroutines" "n_listener_events" "n_used_file_descriptors"
          "server_version"
        ];
      })
      (dockerCommon // {
        namepass = [ "docker_container_mem" ];
        fielddrop = [
          "active_anon" "active_file"
          "container_id"
          "hierarchical_memory_limit"
          "inactive_anon" "inactive_file"
          "mapped_file" "max_usage"
          "pgfault" "pgmajfault" "pgpgin" "pgpgout"
          "rss_huge"
          "total_active_anon" "total_active_file"
          "total_cache"
          "total_inactive_anon" "total_inactive_file"
          "total_mapped_file"
          "total_pgfault" "total_pgmajfault" "total_pgpgin" "total_pgpgout"
          "total_rss" "total_rss_huge"
          "total_unevictable" "total_writeback"
          "unevictable" "writeback"
        ];
      })
    ])

    (collector "ping" {
      urls = [ "1.1.1.1" ];
      count = 4;
      interval = "60s";
      timeout = 5.0;
      binary = "${pkgs.iputils}/bin/ping";
      fielddrop = [ "packets_received" "packets_transmitted" ];
    })

    (collector "internet_speed" {
      interval = "30m";
      cache = true;
      memory_saving_mode = true;
    })

    # The "gpu" collector has no plugin of its own: it is an exec input that
    # runs amdgpuMetricsScript and parses its output as influx line protocol.
    (lib.mkIf (hasCollector "gpu") {
      inputs.exec = [
        {
          commands = [ amdgpuMetricsScript ];
          timeout = "5s";
          data_format = "influx";
        }
      ];
    })
  ];
# Output plugin configuration: a single influxdb_v3 output that writes to the
# "telegraf" database at the URLs given in cfg.outputs.
outputsConfig =
  let
    influxOutput = {
      database = "telegraf";
      urls = cfg.outputs;
      # Literal "$INFLUX_TOKEN" placeholder (no Nix interpolation), so the
      # token itself never lands in the generated configuration. Presumably it
      # is filled in from the environment file passed through
      # services.telegraf.environmentFiles; confirm that the sops secret
      # defines INFLUX_TOKEN.
      token = "$INFLUX_TOKEN";
    };
  in
  lib.mkMerge [
    { outputs.influxdb_v3 = influxOutput; }
  ];
in {
  # Everything below applies only when cfg.enable is set.
  config = lib.mkIf cfg.enable {
    services.telegraf = {
      enable = true;
      # Environment file taken from the sops "telegraf" secret.
      environmentFiles = [ config.sops.secrets.telegraf.path ];
      # Module defaults first, user-supplied cfg.extraConfig merged last.
      extraConfig = lib.mkMerge [
        baseConfig
        inputsConfig
        outputsConfig
        cfg.extraConfig
      ];
    };

    # Container-runtime groups are granted both to the user account and to
    # the running unit.
    users.users.telegraf.extraGroups = dockerGroups;
    systemd.services.telegraf.serviceConfig.SupplementaryGroups = dockerGroups;

    # Extra tools on the unit's PATH, depending on the selected collectors.
    systemd.services.telegraf.path =
      lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli ]
      ++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];

    # Password-less sudo for smartctl, matching `use_sudo = true` on the
    # smart input.
    # NOTE(review): presumably telegraf invokes `sudo` by name; confirm that
    # the unit's PATH can resolve the sudo wrapper (it is not added above).
    security.sudo.extraRules = lib.optional (hasCollector "smart") {
      users = [ "telegraf" ];
      commands = [
        {
          command = "${pkgs.smartmontools}/bin/smartctl";
          options = [ "NOPASSWD" ];
        }
      ];
    };
  };
}