clanServices: add ca certs for monitoring/telegraf

This commit is contained in:
Qubasa
2025-09-18 16:15:15 +02:00
parent 74f853bd7c
commit 455268f6ce
14 changed files with 187 additions and 77 deletions

View File

@@ -1,4 +1,4 @@
{ packages, pkgs, ... }:
{ ... }:
{
name = "monitoring";
@@ -28,6 +28,8 @@
services.telegraf.extraConfig = {
agent.interval = lib.mkForce "1s";
outputs.prometheus_client = {
# BUG: We have to disable basic auth here because the prometheus_client
# output plugin will otherwise deadlock Telegraf on startup.
basic_password = lib.mkForce "";
basic_username = lib.mkForce "";
};
@@ -35,17 +37,16 @@
};
};
extraPythonPackages = _p: [
(pkgs.python3.pkgs.toPythonModule packages.${pkgs.system}.clan-cli)
];
# !!! ANY CHANGES HERE MUST BE REFLECTED IN:
# clan_lib/metrics/telegraf.py::get_metrics
testScript =
{ ... }:
{ nodes, ... }:
''
import time
import os
import sys
import subprocess
import ssl
import json
import shlex
import urllib.request
@@ -54,45 +55,44 @@
peer1.wait_for_unit("network-online.target")
peer1.wait_for_unit("telegraf.service")
peer1.wait_for_unit("telegraf-json.service")
peer1.succeed("curl http://localhost:9990/telegraf.json")
peer1.succeed("curl http://localhost:9273/metrics")
# Fetch the basic auth password from the secret file
password = peer1.succeed("cat /var/run/secrets/vars/telegraf/password")
url = f"http://192.168.1.1:9990/telegraf.json"
password = peer1.succeed("cat ${nodes.peer1.clan.core.vars.generators.telegraf.files.password.path}").strip()
credentials = f"prometheus:{password}"
print("Using credentials:", credentials)
time.sleep(10) # wait a bit for telegraf to collect some data
# Fetch the json output from miniserve
print("Using credentials:", credentials)
peer1.succeed(f"curl -k -u {credentials} https://localhost:9990/telegraf.json")
peer1.succeed(f"curl -k -u {credentials} https://localhost:9273/metrics")
cert_path = "${nodes.peer1.clan.core.vars.generators.telegraf-certs.files.crt.path}"
url = "https://192.168.1.1:9990/telegraf.json" # HTTPS required
print("Waiting for /var/run/telegraf-www/telegraf.json to be bigger then 200 bytes")
peer1.wait_until_succeeds(f"test \"$(stat -c%s /var/run/telegraf-www/telegraf.json)\" -ge 200", timeout=30)
encoded_credentials = b64encode(credentials.encode("utf-8")).decode("utf-8")
headers = {"Authorization": f"Basic {encoded_credentials}"}
req = urllib.request.Request(url, headers=headers) # noqa: S310
response = urllib.request.urlopen(req)
# Look for the nixos_systems metric in the json output
# Trust the provided CA/server certificate
context = ssl.create_default_context(cafile=cert_path)
context.check_hostname = False
context.verify_mode = ssl.CERT_REQUIRED
found_system = False
for line in response:
line_str = line.decode("utf-8").strip()
line = json.loads(line_str)
if line["name"] == "nixos_systems":
found_system = True
print("Found nixos_systems metric in json output")
break
assert found_system, "nixos_systems metric not found in json output"
with urllib.request.urlopen(req, context=context, timeout=5) as response:
for raw_line in response:
line_str = raw_line.decode("utf-8").strip()
if not line_str:
continue
obj = json.loads(line_str)
if obj.get("name") == "nixos_systems":
found_system = True
print("Found nixos_systems metric in json output")
break
# TODO: Also exercise the Python client code here; this does not work yet.
# Missing: a way to get the encrypted var from the clan.
#from clan_lib.metrics.version import get_nixos_systems
#from clan_lib.machines.machines import Machine as ClanMachine
#from clan_lib.flake import Flake
#from clan_lib.ssh.remote import Remote
#target_host = Remote("peer1", "192.168.1.1")
#machine = ClanMachine("peer1", flake=Flake("${./.}"))
# data = get_nixos_systems(machine, target_host)
# assert data["current_system"] is not None
assert found_system, "nixos_systems metric not found in json output"
'';
}