From decb91a529cc768a913f50d7e2c123d080478ec5 Mon Sep 17 00:00:00 2001 From: pinpox Date: Sun, 19 Oct 2025 17:10:39 +0200 Subject: [PATCH] clanServices/monitoring: add prometheus role --- clanServices/monitoring/default.nix | 45 +++++++++- clanServices/monitoring/prometheus.nix | 65 ++++++++++++++ clanServices/monitoring/telegraf.nix | 119 ++----------------------- 3 files changed, 114 insertions(+), 115 deletions(-) create mode 100644 clanServices/monitoring/prometheus.nix diff --git a/clanServices/monitoring/default.nix b/clanServices/monitoring/default.nix index dd61ead8f..6cdf195a0 100644 --- a/clanServices/monitoring/default.nix +++ b/clanServices/monitoring/default.nix @@ -24,5 +24,48 @@ }; }; - imports = [ ./telegraf.nix ]; + roles.prometheus = { + description = "Prometheus monitoring daemon. Will collect metrics from all hosts with the telegraf role"; + interface = + { lib, ... }: + { + options.webExternalUrl = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + example = "https://prometheus.tld"; + description = "The URL under which Prometheus is externally reachable"; + }; + }; + }; + + imports = [ + ./telegraf.nix + ./prometheus.nix + ]; + + perMachine.nixosModule = + { pkgs, ... }: + { + clan.core.vars.generators."prometheus" = { + + share = true; + + files.password.restartUnits = [ + "telegraf.service" + "prometheus.service" + ]; + + files.password-env.restartUnits = [ "telegraf.service" ]; + + runtimeInputs = [ + pkgs.coreutils + pkgs.xkcdpass + ]; + + script = '' + xkcdpass --numwords 6 --delimiter - --count 1 | tr -d "\n" > $out/password + printf 'BASIC_AUTH_PWD=%s\n' "$(cat $out/password)" > $out/password-env + ''; + }; + }; } diff --git a/clanServices/monitoring/prometheus.nix b/clanServices/monitoring/prometheus.nix new file mode 100644 index 000000000..780d436ae --- /dev/null +++ b/clanServices/monitoring/prometheus.nix @@ -0,0 +1,65 @@ +{ + roles.prometheus.perInstance = + { + settings, + instanceName, + roles, + ... 
+ }: + { + + nixosModule = + { + config, + lib, + # pkgs, + ... + }: + { + + systemd.services.prometheus = { + serviceConfig = { + LoadCredential = "password:${config.clan.core.vars.generators.prometheus.files.password.path}"; + BindReadOnlyPaths = "%d/password:/etc/prometheus/password"; + }; + }; + + services.prometheus = { + + enable = true; + + # TODO what do we set here? do we even need something? + # TODO this should be an export + # "https://prometheus.${config.clan.core.settings.tld}"; + webExternalUrl = settings.webExternalUrl; + + extraFlags = [ "--storage.tsdb.retention.time=30d" ]; + + scrapeConfigs = [ + { + job_name = "telegraf"; + scrape_interval = "60s"; + metrics_path = "/metrics"; + basic_auth.username = "prometheus"; + basic_auth.password_file = "/etc/prometheus/password"; + + static_configs = [ + { + # Scrape all machines with the `telegraf` role + # https://prometheus:<password>@<host>.<tld>:9273/metrics + + # scheme = "https"; + # scheme = "http"; + + targets = map (m: "${m}.${config.clan.core.settings.tld}:9273") ( + lib.attrNames roles.telegraf.machines + ); + labels.type = instanceName; + } + ]; + } + ]; + }; + }; + }; +} diff --git a/clanServices/monitoring/telegraf.nix b/clanServices/monitoring/telegraf.nix index 825c46a5a..df36dd411 100644 --- a/clanServices/monitoring/telegraf.nix +++ b/clanServices/monitoring/telegraf.nix @@ -1,122 +1,21 @@ { roles.telegraf.perInstance = - { settings, ... }: + { ... }: { nixosModule = { config, pkgs, - lib, ... }: - let - auth_user = "prometheus"; - in { - warnings = - lib.optionals (settings.allowAllInterfaces != null) [ - "monitoring.settings.allowAllInterfaces is deprecated and and has no effect. Please remove it from your inventory." - "The monitoring service will now always listen on all interfaces over https." - ] - ++ (lib.optionals (settings.interfaces != null) [ - "monitoring.settings.interfaces is deprecated and and has no effect. Please remove it from your inventory." 
- "The monitoring service will now always listen on all interfaces over https." - ]); - networking.firewall.allowedTCPPorts = [ - 9273 - 9990 - ]; - - clan.core.vars.generators."telegraf-certs" = { - files.crt = { - restartUnits = [ "telegraf.service" ]; - deploy = true; - secret = false; - }; - files.key = { - mode = "0600"; - restartUnits = [ "telegraf.service" ]; - }; - - runtimeInputs = [ - pkgs.openssl - ]; - - # TODO: Implement automated certificate rotation instead of using a 100-year expiration - script = '' - openssl req -x509 -nodes -newkey rsa:4096 \ - -days 36500 \ - -keyout "$out"/key \ - -out "$out"/crt \ - -subj "/C=US/ST=CA/L=San Francisco/O=Example Corp/OU=IT/CN=example.com" - ''; - }; - - clan.core.vars.generators."telegraf" = { - files.password.restartUnits = [ "telegraf.service" ]; - files.password-env.restartUnits = [ "telegraf.service" ]; - files.miniserve-auth.restartUnits = [ "telegraf.service" ]; - - dependencies = [ "telegraf-certs" ]; - - runtimeInputs = [ - pkgs.coreutils - pkgs.xkcdpass - pkgs.mkpasswd - ]; - - script = '' - PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n") - echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env - echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth - echo "$PASSWORD" | tr -d "\n" > "$out"/password - ''; - }; - - systemd.services.telegraf-json = { - enable = true; - wantedBy = [ "multi-user.target" ]; - after = [ "telegraf.service" ]; - requires = [ "telegraf.service" ]; - serviceConfig = { - LoadCredential = [ - "auth_file_path:${config.clan.core.vars.generators.telegraf.files.miniserve-auth.path}" - "telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}" - "telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}" - ]; - Environment = [ - "AUTH_FILE_PATH=%d/auth_file_path" - "CRT_PATH=%d/telegraf_crt_path" - "KEY_PATH=%d/telegraf_key_path" - ]; - Restart = "on-failure"; - User = "telegraf"; - Group = "telegraf"; - 
RuntimeDirectory = "telegraf-www"; - }; - script = "${pkgs.miniserve}/bin/miniserve -p 9990 /run/telegraf-www --auth-file \"$AUTH_FILE_PATH\" --tls-cert \"$CRT_PATH\" --tls-key \"$KEY_PATH\""; - }; - - systemd.services.telegraf = { - serviceConfig = { - LoadCredential = [ - "telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}" - "telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}" - ]; - Environment = [ - "CRT_PATH=%d/telegraf_crt_path" - "KEY_PATH=%d/telegraf_key_path" - ]; - }; - }; + networking.firewall.allowedTCPPorts = [ 9273 ]; services.telegraf = { enable = true; - environmentFiles = [ - (builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path) - ]; + environmentFiles = [ config.clan.core.vars.generators.prometheus.files.password-env.path ]; extraConfig = { agent.interval = "60s"; @@ -147,20 +46,12 @@ } ]; }; - # sadly there doesn'T seem to exist a telegraf http_client output plugin + # sadly there doesn't seem to exist a telegraf http_client output plugin outputs.prometheus_client = { listen = ":9273"; metric_version = 2; - basic_username = "${auth_user}"; + basic_username = "prometheus"; basic_password = "$${BASIC_AUTH_PWD}"; - tls_cert = "$${CRT_PATH}"; - tls_key = "$${KEY_PATH}"; - }; - - outputs.file = { - files = [ "/run/telegraf-www/telegraf.json" ]; - data_format = "json"; - json_timestamp_units = "1s"; }; }; };