Files
clan-core/clanServices/monitoring/telegraf.nix
Jörg Thalheim 98136142b4 monitoring: extend telegraf certificate expiration to 100 years
The default 30-day expiration was causing certificates to expire
frequently, breaking monitoring. Setting to 100 years provides a
temporary solution until automated certificate rotation is implemented.

Fixes #5605
2025-10-21 15:28:41 +02:00

170 lines
6.1 KiB
Nix

# Per-instance configuration for the `telegraf` role of the monitoring service.
{
roles.telegraf.perInstance =
# `settings` holds the instance settings. The module below only inspects the
# deprecated `allowAllInterfaces` and `interfaces` fields to emit warnings.
{ settings, ... }:
{
# NixOS module contributed for each instance of this role.
nixosModule =
{
config,
pkgs,
lib,
...
}:
let
# Basic-auth user name. It is written into the miniserve auth file and used
# as `basic_username` of the telegraf prometheus_client output.
auth_user = "prometheus";
in
{
# Deprecation notices for settings that are still accepted but ignored.
# Every deprecated setting that is non-null yields its own notice; the shared
# hint about the listening behaviour is emitted once, even when both are set.
warnings =
  let
    allowAllInterfacesSet = settings.allowAllInterfaces != null;
    interfacesSet = settings.interfaces != null;
    # Build the notice for one ignored setting (fixes the former "and and" typo).
    deprecationNotice =
      option:
      "monitoring.settings.${option} is deprecated and has no effect. Please remove it from your inventory.";
  in
  lib.optional allowAllInterfacesSet (deprecationNotice "allowAllInterfaces")
  ++ lib.optional interfacesSet (deprecationNotice "interfaces")
  ++ lib.optional (
    allowAllInterfacesSet || interfacesSet
  ) "The monitoring service will now always listen on all interfaces over https.";
# Open the two TCP ports this module serves on.
networking.firewall.allowedTCPPorts =
  let
    # The telegraf prometheus_client output is configured with listen = ":9273".
    prometheusPort = 9273;
    # The telegraf-json service starts miniserve with `-p 9990`.
    jsonPort = 9990;
  in
  [
    prometheusPort
    jsonPort
  ];
# Generates the self-signed TLS certificate and private key that both the
# telegraf prometheus endpoint and the miniserve JSON endpoint load.
clan.core.vars.generators."telegraf-certs" = {
  runtimeInputs = [ pkgs.openssl ];
  files = {
    # Certificate: marked as non-secret and deployed.
    crt = {
      secret = false;
      deploy = true;
      restartUnits = [ "telegraf.service" ];
    };
    # Private key: restricted file mode.
    key = {
      mode = "0600";
      restartUnits = [ "telegraf.service" ];
    };
  };
  # TODO: Implement automated certificate rotation instead of using a 100-year expiration
  # The script creates an unencrypted (-nodes) 4096-bit RSA key and an x509
  # certificate valid for 36500 days. The subject is a fixed placeholder.
  script = ''
    openssl req -x509 -nodes -newkey rsa:4096 \
    -days 36500 \
    -keyout "$out"/key \
    -out "$out"/crt \
    -subj "/C=US/ST=CA/L=San Francisco/O=Example Corp/OU=IT/CN=example.com"
  '';
};
# Generates the basic-auth password and the three files derived from it.
clan.core.vars.generators."telegraf" =
  let
    # Every generated file triggers a restart of the telegraf unit when it changes.
    restartTelegraf = [ "telegraf.service" ];
  in
  {
    dependencies = [ "telegraf-certs" ];
    # NOTE(review): mkpasswd is listed here but the script below never invokes it.
    runtimeInputs = [
      pkgs.coreutils
      pkgs.xkcdpass
      pkgs.mkpasswd
    ];
    files = {
      password.restartUnits = restartTelegraf;
      password-env.restartUnits = restartTelegraf;
      miniserve-auth.restartUnits = restartTelegraf;
    };
    # The password is four xkcdpass words joined by "-". It is written as
    #   password-env   : BASIC_AUTH_PWD=<password>  (environment-file form)
    #   miniserve-auth : <auth_user>:<password>
    #   password       : <password> without a trailing newline
    script = ''
      PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
      echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env
      echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth
      echo "$PASSWORD" | tr -d "\n" > "$out"/password
    '';
  };
# Serves /run/telegraf-www (where telegraf's file output writes telegraf.json)
# over HTTPS with basic auth, using miniserve on port 9990.
systemd.services.telegraf-json =
  let
    authFiles = config.clan.core.vars.generators.telegraf.files;
    certFiles = config.clan.core.vars.generators.telegraf-certs.files;
  in
  {
    enable = true;
    wantedBy = [ "multi-user.target" ];
    # NOTE(review): telegraf writes into /run/telegraf-www, yet that directory
    # comes from this unit's RuntimeDirectory and this unit is ordered after
    # telegraf.service. Confirm the startup ordering is intended.
    requires = [ "telegraf.service" ];
    after = [ "telegraf.service" ];
    serviceConfig = {
      User = "telegraf";
      Group = "telegraf";
      Restart = "on-failure";
      RuntimeDirectory = "telegraf-www";
      # Hand the auth file and TLS material to the unit as systemd credentials.
      LoadCredential = [
        "auth_file_path:${authFiles.miniserve-auth.path}"
        "telegraf_crt_path:${certFiles.crt.path}"
        "telegraf_key_path:${certFiles.key.path}"
      ];
      # %d is systemd's specifier for the unit's credentials directory.
      Environment = [
        "AUTH_FILE_PATH=%d/auth_file_path"
        "CRT_PATH=%d/telegraf_crt_path"
        "KEY_PATH=%d/telegraf_key_path"
      ];
    };
    script = "${pkgs.miniserve}/bin/miniserve -p 9990 /run/telegraf-www --auth-file \"$AUTH_FILE_PATH\" --tls-cert \"$CRT_PATH\" --tls-key \"$KEY_PATH\"";
  };
# Extra unit settings for telegraf itself: load the TLS certificate and key as
# systemd credentials and export their locations as CRT_PATH / KEY_PATH, which
# the prometheus_client output configuration refers to.
systemd.services.telegraf.serviceConfig =
  let
    certFiles = config.clan.core.vars.generators.telegraf-certs.files;
  in
  {
    LoadCredential = [
      "telegraf_crt_path:${certFiles.crt.path}"
      "telegraf_key_path:${certFiles.key.path}"
    ];
    # %d is systemd's specifier for the unit's credentials directory.
    Environment = [
      "CRT_PATH=%d/telegraf_crt_path"
      "KEY_PATH=%d/telegraf_key_path"
    ];
  };
# Telegraf agent configuration: collected inputs plus two outputs
# (a prometheus endpoint and a JSON file).
services.telegraf =
  let
    # Prints one influx line-protocol record named `nixos_systems`. Its tags
    # carry the targets of /run/current-system and /run/booted-system and the
    # kernel module directory names of both; the only field is `present=0`.
    nixosSystems = pkgs.writeShellScript "current-system" ''
      printf "nixos_systems,current_system=%s,booted_system=%s,current_kernel=%s,booted_kernel=%s present=0\n" \
      "$(readlink /run/current-system)" "$(readlink /run/booted-system)" \
      "$(basename $(echo /run/current-system/kernel-modules/lib/modules/*))" \
      "$(basename $(echo /run/booted-system/kernel-modules/lib/modules/*))"
    '';
  in
  {
    enable = true;
    # Provides BASIC_AUTH_PWD, which the "telegraf" vars generator writes
    # into the password-env file.
    environmentFiles = [
      (builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path)
    ];
    extraConfig = {
      agent = {
        interval = "60s";
      };
      inputs = {
        system = { };
        mem = { };
        swap = { };
        diskio = { };
        kernel_vmstat = { };
        systemd_units = { };
        exec = [
          {
            # Expose the path to current-system as a metric. We use
            # this to check if the machine is up-to-date.
            commands = [ nixosSystems ];
            data_format = "influx";
          }
        ];
      };
      outputs = {
        prometheus_client = {
          listen = ":9273";
          metric_version = 2;
          basic_username = auth_user;
          # The `$${...}` strings are kept verbatim by Nix (no interpolation).
          # Presumably telegraf resolves them from its environment at runtime.
          # TODO confirm against the telegraf configuration docs.
          basic_password = "$${BASIC_AUTH_PWD}";
          tls_cert = "$${CRT_PATH}";
          tls_key = "$${KEY_PATH}";
        };
        # Sadly there doesn't seem to exist a telegraf http_client output
        # plugin, so the metrics are written as JSON to a file under
        # /run/telegraf-www, which the telegraf-json service serves.
        file = {
          files = [ "/run/telegraf-www/telegraf.json" ];
          data_format = "json";
          json_timestamp_units = "1s";
        };
      };
    };
  };
};
};
}