clanServices/monitoring: add prometheus role
This commit is contained in:
@@ -24,5 +24,48 @@
|
||||
};
|
||||
};
|
||||
|
||||
imports = [ ./telegraf.nix ];
|
||||
# The `prometheus` role of the monitoring service.
roles.prometheus = {
  description = "Prometheus monitoring daemon. Will collect metrics from all hosts with the telegraf role";

  # Role settings: a single optional `webExternalUrl` (null when unset).
  interface =
    { lib, ... }:
    let
      inherit (lib) mkOption types;
    in
    {
      options = {
        webExternalUrl = mkOption {
          description = "The URL under which Prometheus is externally reachable";
          type = types.nullOr types.str;
          default = null;
          example = "https://prometheus.tld";
        };
      };
    };
};
|
||||
|
||||
imports = [
|
||||
./telegraf.nix
|
||||
./prometheus.nix
|
||||
];
|
||||
|
||||
# Per-machine NixOS module: declares the "prometheus" vars generator, which
# produces one basic-auth password in two formats (raw file and env file).
perMachine.nixosModule =
  { pkgs, ... }:
  {
    clan.core.vars.generators."prometheus" = {

      # NOTE(review): presumably makes the generated values common to all
      # machines instead of per-machine; confirm against the clan vars docs.
      share = true;

      # Raw password file; a change restarts both listed units.
      files.password.restartUnits = [
        "telegraf.service"
        "prometheus.service"
      ];

      # Same password as a `BASIC_AUTH_PWD=<password>` line.
      files.password-env.restartUnits = [ "telegraf.service" ];

      runtimeInputs = [
        pkgs.coreutils
        pkgs.xkcdpass
      ];

      # Every expansion of $out is quoted so that an output directory whose
      # path contains whitespace or glob characters cannot be word-split.
      # `tr -d "\n"` strips the trailing newline, leaving exactly the password.
      script = ''
        xkcdpass --numwords 6 --delimiter - --count 1 | tr -d "\n" > "$out"/password
        printf 'BASIC_AUTH_PWD=%s\n' "$(cat "$out"/password)" > "$out"/password-env
      '';
    };
  };
|
||||
}
|
||||
|
||||
65
clanServices/monitoring/prometheus.nix
Normal file
65
clanServices/monitoring/prometheus.nix
Normal file
@@ -0,0 +1,65 @@
|
||||
# Implementation of the `prometheus` role: enables the Prometheus service and
# scrapes every machine that carries the `telegraf` role.
{
  roles.prometheus.perInstance =
    {
      settings,
      instanceName,
      roles,
      ...
    }:
    {
      nixosModule =
        {
          config,
          lib,
          ...
        }:
        let
          # Password file produced by the "prometheus" vars generator.
          passwordSecret = config.clan.core.vars.generators.prometheus.files.password.path;

          # Location at which the credential is bind-mounted for the unit.
          passwordMount = "/etc/prometheus/password";

          # Names of all machines that have the `telegraf` role.
          telegrafHosts = lib.attrNames roles.telegraf.machines;

          # Scrape address of one machine: <machine>.<tld>:9273
          toTarget = host: "${host}.${config.clan.core.settings.tld}:9273";
        in
        {
          # Pass the password to the unit as the systemd credential `password`
          # (%d is the unit's credentials directory) and expose it read-only
          # at the fixed path referenced by the scrape config below.
          systemd.services.prometheus.serviceConfig = {
            LoadCredential = "password:${passwordSecret}";
            BindReadOnlyPaths = "%d/password:${passwordMount}";
          };

          services.prometheus = {
            enable = true;

            # TODO: decide whether a value is needed here at all.
            # TODO: this should be an export, e.g.
            #   "https://prometheus.${config.clan.core.settings.tld}"
            webExternalUrl = settings.webExternalUrl;

            # Keep 30 days of time-series data.
            extraFlags = [ "--storage.tsdb.retention.time=30d" ];

            scrapeConfigs = [
              {
                job_name = "telegraf";
                scrape_interval = "60s";
                metrics_path = "/metrics";

                basic_auth = {
                  username = "prometheus";
                  password_file = passwordMount;
                };

                static_configs = [
                  {
                    # One target per telegraf machine. No `scheme` is set
                    # explicitly; whether it should be "https" or "http" is
                    # still an open question.
                    targets = map toTarget telegrafHosts;
                    labels.type = instanceName;
                  }
                ];
              }
            ];
          };
        };
    };
}
|
||||
@@ -1,122 +1,21 @@
|
||||
{
|
||||
roles.telegraf.perInstance =
|
||||
{ settings, ... }:
|
||||
{ ... }:
|
||||
{
|
||||
|
||||
nixosModule =
|
||||
{
|
||||
config,
|
||||
pkgs,
|
||||
lib,
|
||||
...
|
||||
}:
|
||||
let
|
||||
auth_user = "prometheus";
|
||||
in
|
||||
{
|
||||
warnings =
|
||||
lib.optionals (settings.allowAllInterfaces != null) [
|
||||
"monitoring.settings.allowAllInterfaces is deprecated and and has no effect. Please remove it from your inventory."
|
||||
"The monitoring service will now always listen on all interfaces over https."
|
||||
]
|
||||
++ (lib.optionals (settings.interfaces != null) [
|
||||
"monitoring.settings.interfaces is deprecated and and has no effect. Please remove it from your inventory."
|
||||
"The monitoring service will now always listen on all interfaces over https."
|
||||
]);
|
||||
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
9273
|
||||
9990
|
||||
];
|
||||
|
||||
clan.core.vars.generators."telegraf-certs" = {
|
||||
files.crt = {
|
||||
restartUnits = [ "telegraf.service" ];
|
||||
deploy = true;
|
||||
secret = false;
|
||||
};
|
||||
files.key = {
|
||||
mode = "0600";
|
||||
restartUnits = [ "telegraf.service" ];
|
||||
};
|
||||
|
||||
runtimeInputs = [
|
||||
pkgs.openssl
|
||||
];
|
||||
|
||||
# TODO: Implement automated certificate rotation instead of using a 100-year expiration
|
||||
script = ''
|
||||
openssl req -x509 -nodes -newkey rsa:4096 \
|
||||
-days 36500 \
|
||||
-keyout "$out"/key \
|
||||
-out "$out"/crt \
|
||||
-subj "/C=US/ST=CA/L=San Francisco/O=Example Corp/OU=IT/CN=example.com"
|
||||
'';
|
||||
};
|
||||
|
||||
clan.core.vars.generators."telegraf" = {
|
||||
files.password.restartUnits = [ "telegraf.service" ];
|
||||
files.password-env.restartUnits = [ "telegraf.service" ];
|
||||
files.miniserve-auth.restartUnits = [ "telegraf.service" ];
|
||||
|
||||
dependencies = [ "telegraf-certs" ];
|
||||
|
||||
runtimeInputs = [
|
||||
pkgs.coreutils
|
||||
pkgs.xkcdpass
|
||||
pkgs.mkpasswd
|
||||
];
|
||||
|
||||
script = ''
|
||||
PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
|
||||
echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env
|
||||
echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth
|
||||
echo "$PASSWORD" | tr -d "\n" > "$out"/password
|
||||
'';
|
||||
};
|
||||
|
||||
systemd.services.telegraf-json = {
|
||||
enable = true;
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "telegraf.service" ];
|
||||
requires = [ "telegraf.service" ];
|
||||
serviceConfig = {
|
||||
LoadCredential = [
|
||||
"auth_file_path:${config.clan.core.vars.generators.telegraf.files.miniserve-auth.path}"
|
||||
"telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}"
|
||||
"telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}"
|
||||
];
|
||||
Environment = [
|
||||
"AUTH_FILE_PATH=%d/auth_file_path"
|
||||
"CRT_PATH=%d/telegraf_crt_path"
|
||||
"KEY_PATH=%d/telegraf_key_path"
|
||||
];
|
||||
Restart = "on-failure";
|
||||
User = "telegraf";
|
||||
Group = "telegraf";
|
||||
RuntimeDirectory = "telegraf-www";
|
||||
};
|
||||
script = "${pkgs.miniserve}/bin/miniserve -p 9990 /run/telegraf-www --auth-file \"$AUTH_FILE_PATH\" --tls-cert \"$CRT_PATH\" --tls-key \"$KEY_PATH\"";
|
||||
};
|
||||
|
||||
systemd.services.telegraf = {
|
||||
serviceConfig = {
|
||||
LoadCredential = [
|
||||
"telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}"
|
||||
"telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}"
|
||||
];
|
||||
Environment = [
|
||||
"CRT_PATH=%d/telegraf_crt_path"
|
||||
"KEY_PATH=%d/telegraf_key_path"
|
||||
];
|
||||
};
|
||||
};
|
||||
networking.firewall.allowedTCPPorts = [ 9273 ];
|
||||
|
||||
services.telegraf = {
|
||||
enable = true;
|
||||
environmentFiles = [
|
||||
(builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path)
|
||||
];
|
||||
environmentFiles = [ config.clan.core.vars.generators.prometheus.files.password-env.path ];
|
||||
|
||||
extraConfig = {
|
||||
agent.interval = "60s";
|
||||
@@ -147,20 +46,12 @@
|
||||
}
|
||||
];
|
||||
};
|
||||
# sadly there doesn'T seem to exist a telegraf http_client output plugin
|
||||
# sadly there doesn't seem to exist a telegraf http_client output plugin
|
||||
outputs.prometheus_client = {
|
||||
listen = ":9273";
|
||||
metric_version = 2;
|
||||
basic_username = "${auth_user}";
|
||||
basic_username = "prometheus";
|
||||
basic_password = "$${BASIC_AUTH_PWD}";
|
||||
tls_cert = "$${CRT_PATH}";
|
||||
tls_key = "$${KEY_PATH}";
|
||||
};
|
||||
|
||||
outputs.file = {
|
||||
files = [ "/run/telegraf-www/telegraf.json" ];
|
||||
data_format = "json";
|
||||
json_timestamp_units = "1s";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user