monitoring
This commit is contained in:
@@ -2,6 +2,30 @@
|
||||
let
|
||||
# Shorthand for this module's option subtree (config.syscfg.monitoring.telegraf).
cfg = config.syscfg.monitoring.telegraf;
|
||||
# hasCollector name: true when `name` is an element of cfg.collectors.
hasCollector = name: builtins.elem name cfg.collectors;
|
||||
# Supplementary groups for the telegraf user/service: "podman" and/or "docker",
# each included only when telegraf is enabled, the "docker" collector is
# selected, and the matching virtualisation backend is enabled.
dockerGroups =
  let
    wantsDocker = cfg.enable && hasCollector "docker";
  in
  lib.optionals (wantsDocker && config.virtualisation.podman.enable) [ "podman" ]
  ++ lib.optionals (wantsDocker && config.virtualisation.docker.enable) [ "docker" ];
|
||||
# Shell script that runs `amdgpu_top -J -n 1` once and converts its JSON output
# into InfluxDB line-protocol records (one "amdgpu,card=<key> ..." line per
# entry of `.devices`). Fields whose source value is null are omitted; a device
# with no fields at all produces no line.
amdgpuMetricsScript = pkgs.writeShellScript "telegraf-amdgpu-metrics" ''
  set -euo pipefail
  ${lib.getExe pkgs.custom.amdgpu_top} -J -n 1 | ${lib.getExe pkgs.jq} -r '
    # Integer field in line-protocol form (name=42i); emits nothing for null.
    def maybe_int($name; $value):
      if $value == null then empty else "\($name)=\(($value | floor))i" end;
    # Float field in line-protocol form (name=4.2); emits nothing for null.
    def maybe_float($name; $value):
      if $value == null then empty else "\($name)=\($value)" end;
    .devices
    | to_entries[]
    | [
        # Falls back to the GRBM2 counter and finally to 0, so this field is always present.
        maybe_int("utilization_gpu"; (.value.gpu_activity.GFX.value // .value.GRBM2["Command Processor - Graphics"].value // 0)),
        maybe_int("utilization_media"; .value.gpu_activity.MediaEngine.value),
        maybe_int("utilization_memory"; .value.gpu_activity.Memory.value),
        maybe_float("temperature_edge"; .value.Sensors["Edge Temperature"].value),
        maybe_float("power_draw"; .value.gpu_metrics.average_socket_power.value)
      ] as $fields
    # Filter the collected field strings, not the current entry: a bare map(...)
    # here would iterate over the {key, value} entry and make join fail.
    # The parentheses keep "." bound to the entry so .key is still usable below.
    | ($fields | map(select(length > 0))) as $nonempty
    | select(($nonempty | length) > 0)
    | "amdgpu,card=\(.key) " + ($nonempty | join(","))
  '
'';
|
||||
baseConfig = {
|
||||
agent = {
|
||||
interval = "10s";
|
||||
@@ -40,7 +64,9 @@ let
|
||||
};
|
||||
})
|
||||
(lib.mkIf (hasCollector "diskio") {
|
||||
inputs.diskio = { };
|
||||
inputs.diskio = {
|
||||
skip_serial_number = true;
|
||||
};
|
||||
})
|
||||
(lib.mkIf (hasCollector "kernel") {
|
||||
inputs.kernel = { };
|
||||
@@ -70,8 +96,8 @@ let
|
||||
inputs.docker = {
|
||||
endpoint = "unix:///var/run/docker.sock";
|
||||
timeout = "5s";
|
||||
perdevice_include = ["blkio" "cpu" "network"];
|
||||
total_include = [];
|
||||
perdevice_include = [ ];
|
||||
total_include = [ ];
|
||||
};
|
||||
})
|
||||
(lib.mkIf (hasCollector "ping") {
|
||||
@@ -83,6 +109,13 @@ let
|
||||
binary = "${pkgs.iputils}/bin/ping";
|
||||
};
|
||||
})
|
||||
(lib.mkIf (hasCollector "gpu") {
|
||||
inputs.exec = [{
|
||||
commands = [ amdgpuMetricsScript ];
|
||||
timeout = "5s";
|
||||
data_format = "influx";
|
||||
}];
|
||||
})
|
||||
];
|
||||
outputsConfig = lib.mkMerge [{
|
||||
outputs.influxdb_v3 = {
|
||||
@@ -105,11 +138,13 @@ in {
|
||||
];
|
||||
};
|
||||
|
||||
users.users.telegraf.extraGroups = ["podman"];
|
||||
users.users.telegraf.extraGroups = dockerGroups;
|
||||
|
||||
systemd.services.telegraf = {
|
||||
path = lib.optionals (hasCollector "smart") [ pkgs.smartmontools ];
|
||||
serviceConfig.SupplementaryGroups = ["podman"];
|
||||
path =
|
||||
lib.optionals (hasCollector "smart") [ pkgs.smartmontools pkgs.nvme-cli ]
|
||||
++ lib.optionals (hasCollector "gpu") [ pkgs.custom.amdgpu_top pkgs.jq ];
|
||||
serviceConfig.SupplementaryGroups = dockerGroups;
|
||||
};
|
||||
|
||||
security.sudo.extraRules = lib.optionals (hasCollector "smart") [{
|
||||
|
||||
@@ -23,6 +23,7 @@ with lib; {
|
||||
"smart"
|
||||
"docker"
|
||||
"ping"
|
||||
"gpu"
|
||||
]);
|
||||
default = [ ];
|
||||
};
|
||||
|
||||
@@ -93,14 +93,14 @@
|
||||
collectors = [
|
||||
"cpu"
|
||||
"mem"
|
||||
"swap"
|
||||
#"swap"
|
||||
"system"
|
||||
"disk"
|
||||
"diskio"
|
||||
"kernel"
|
||||
#"kernel"
|
||||
"net"
|
||||
"netstat"
|
||||
"processes"
|
||||
#"netstat"
|
||||
#"processes"
|
||||
"docker"
|
||||
"ping"
|
||||
];
|
||||
|
||||
Reference in New Issue
Block a user