diff --git a/clanServices/monitoring/telegraf.nix b/clanServices/monitoring/telegraf.nix
index 94eb91e39..2a80bed7b 100644
--- a/clanServices/monitoring/telegraf.nix
+++ b/clanServices/monitoring/telegraf.nix
@@ -10,22 +10,34 @@
           lib,
           ...
         }:
+        let
+          jsonpath = "/tmp/telegraf.json";
+          auth_user = "prometheus";
+        in
         {
 
           networking.firewall.interfaces = lib.mkIf (settings.allowAllInterfaces == false) (
             builtins.listToAttrs (
               map (name: {
                 inherit name;
-                value.allowedTCPPorts = [ 9273 ];
+                value.allowedTCPPorts = [
+                  9273
+                  9990
+                ];
               }) settings.interfaces
             )
           );
 
-          networking.firewall.allowedTCPPorts = lib.mkIf (settings.allowAllInterfaces == true) [ 9273 ];
+          networking.firewall.allowedTCPPorts = lib.mkIf (settings.allowAllInterfaces == true) [
+            9273
+            9990
+          ];
 
-          clan.core.vars.generators."telegraf-password" = {
-            files.telegraf-password.neededFor = "users";
-            files.telegraf-password.restartUnits = [ "telegraf.service" ];
+          clan.core.vars.generators."telegraf" = {
+
+            files.password.restartUnits = [ "telegraf.service" ];
+            files.password-env.restartUnits = [ "telegraf.service" ];
+            files.miniserve-auth.restartUnits = [ "telegraf-json.service" ];
 
             runtimeInputs = [
               pkgs.coreutils
@@ -35,16 +47,27 @@
 
             script = ''
               PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
-              echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/telegraf-password
+              echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env
+              echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth
+              echo "$PASSWORD" | tr -d "\n" > "$out"/password
             '';
           };
 
+          systemd.services.telegraf-json = {
+            enable = true;
+            wantedBy = [ "multi-user.target" ];
+            # The served file only exists once telegraf has written it, so
+            # start after telegraf and retry instead of failing for good.
+            after = [ "telegraf.service" ];
+            serviceConfig.Restart = "on-failure";
+            serviceConfig.RestartSec = "5s";
+            script = "${pkgs.miniserve}/bin/miniserve -p 9990 ${jsonpath} --auth-file ${config.clan.core.vars.generators.telegraf.files.miniserve-auth.path}";
+          };
+
           services.telegraf = {
             enable = true;
             environmentFiles = [
-              (builtins.toString
-                config.clan.core.vars.generators."telegraf-password".files.telegraf-password.path
-              )
+              (builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path)
             ];
             extraConfig = {
               agent.interval = "60s";
@@ -59,25 +82,35 @@
 
                 exec =
                   let
-                    currentSystemScript = pkgs.writeShellScript "current-system" ''
-                      printf "current_system,path=%s present=0\n" $(readlink /run/current-system)
+                    nixosSystems = pkgs.writeShellScript "current-system" ''
+                      printf "nixos_systems,current_system=%s,booted_system=%s,current_kernel=%s,booted_kernel=%s present=0\n" \
+                        "$(readlink /run/current-system)" "$(readlink /run/booted-system)" \
+                        "$(basename $(echo /run/current-system/kernel-modules/lib/modules/*))" \
+                        "$(basename $(echo /run/booted-system/kernel-modules/lib/modules/*))"
                     '';
                   in
                   [
                     {
                       # Expose the path to current-system as metric. We use
                       # this to check if the machine is up-to-date.
-                      commands = [ currentSystemScript ];
+                      commands = [ nixosSystems ];
                       data_format = "influx";
                     }
                   ];
               };
+              # Sadly there doesn't seem to exist a telegraf http_client output plugin.
               outputs.prometheus_client = {
                 listen = ":9273";
                 metric_version = 2;
-                basic_username = "prometheus";
+                basic_username = auth_user;
                 basic_password = "$${BASIC_AUTH_PWD}";
               };
+
+              outputs.file = {
+                files = [ jsonpath ];
+                data_format = "json";
+                json_timestamp_units = "1s";
+              };
             };
           };
         };
diff --git a/pkgs/clan-cli/clan_lib/metrics/__init__.py b/pkgs/clan-cli/clan_lib/metrics/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pkgs/clan-cli/clan_lib/metrics/telegraf.py b/pkgs/clan-cli/clan_lib/metrics/telegraf.py
new file mode 100644
index 000000000..97f59b301
--- /dev/null
+++ b/pkgs/clan-cli/clan_lib/metrics/telegraf.py
@@ -0,0 +1,98 @@
+import json
+import logging
+import urllib.request
+from base64 import b64encode
+from collections.abc import Iterator
+from typing import Any, TypedDict, cast
+
+from clan_cli.vars.get import get_machine_var
+
+from clan_lib.errors import ClanError
+from clan_lib.machines.machines import Machine
+from clan_lib.ssh.host import Host
+
+log = logging.getLogger(__name__)
+
+# Port on which the monitoring clanService serves telegraf's JSON metrics file.
+METRICS_PORT = 9990
+# Basic-auth user name configured by the monitoring clanService.
+METRICS_USER = "prometheus"
+# Seconds to wait on the metrics endpoint before giving up.
+METRICS_TIMEOUT = 30.0
+
+
+class MetricSample(TypedDict):
+    """A single telegraf metric in telegraf's JSON serialization."""
+
+    fields: dict[str, Any]
+    name: str
+    tags: dict[str, str]
+    timestamp: int
+
+
+def _url_host(address: str) -> str:
+    """Return *address* in URL form, bracketing bare IPv6 literals."""
+    if ":" in address and not address.startswith("["):
+        return f"[{address}]"
+    return address
+
+
+def get_metrics(
+    machine: Machine,
+    target_host: Host,
+) -> Iterator[MetricSample]:
+    """Stream the telegraf metrics that a machine exposes as JSON lines.
+
+    The monitoring clanService serves telegraf's JSON output file over HTTP
+    with basic auth; the password is read from the ``telegraf/password`` var.
+
+    Args:
+        machine: The machine whose metrics are fetched.
+        target_host: Host whose ``address`` is used to reach the machine.
+
+    Yields:
+        One parsed metric per JSON object line; other lines are skipped.
+
+    Raises:
+        ClanError: If the password var is missing or the request fails.
+    """
+    var_name = "telegraf/password"
+    password_var = get_machine_var(machine, var_name)
+    if not password_var.exists:
+        msg = (
+            f"Missing required var '{var_name}' for machine '{machine.name}'.\n"
+            f"Ensure the 'monitoring' clanService is enabled and run `clan machines update {machine.name}`.\n"
+            "For more information, see: https://docs.clan.lol/reference/clanServices/monitoring/"
+        )
+        raise ClanError(msg)
+
+    url = f"http://{_url_host(target_host.address)}:{METRICS_PORT}"
+    password = password_var.value.decode("utf-8")
+    credentials = f"{METRICS_USER}:{password}"
+    encoded_credentials = b64encode(credentials.encode("utf-8")).decode("utf-8")
+    headers = {"Authorization": f"Basic {encoded_credentials}"}
+    req = urllib.request.Request(url, headers=headers)
+
+    try:
+        # The context manager closes the connection even if the caller stops
+        # iterating early.
+        with urllib.request.urlopen(req, timeout=METRICS_TIMEOUT) as response:
+            for line in response:
+                line_str = line.decode("utf-8").strip()
+                if not line_str:
+                    continue
+                try:
+                    sample = json.loads(line_str)
+                except json.JSONDecodeError:
+                    log.warning("Skipping invalid JSON line: %s", line_str)
+                    continue
+                if not isinstance(sample, dict):
+                    log.warning("Skipping non-object JSON line: %s", line_str)
+                    continue
+                yield cast(MetricSample, sample)
+    except Exception as e:
+        msg = (
+            f"Failed to fetch telegraf metrics from {url} for machine '{machine.name}': {e}\n"
+            "Ensure telegraf.service and telegraf-json.service are running and accessible."
+        )
+        raise ClanError(msg) from e
diff --git a/pkgs/clan-cli/clan_lib/metrics/version.py b/pkgs/clan-cli/clan_lib/metrics/version.py
new file mode 100644
index 000000000..151a53087
--- /dev/null
+++ b/pkgs/clan-cli/clan_lib/metrics/version.py
@@ -0,0 +1,95 @@
+import logging
+from dataclasses import dataclass
+
+from clan_lib.api import API
+from clan_lib.errors import ClanError
+from clan_lib.machines.machines import Machine
+from clan_lib.metrics.telegraf import MetricSample, get_metrics
+from clan_lib.nix import nix_eval
+from clan_lib.ssh.localhost import LocalHost
+from clan_lib.ssh.remote import Remote
+
+log = logging.getLogger(__name__)
+
+# Metric emitted by the exec input of the monitoring clanService.
+NIXOS_SYSTEMS_METRIC = "nixos_systems"
+
+
+@dataclass(frozen=True)
+class NixOSSystems:
+    """System paths and kernel versions reported by a machine."""
+
+    current_system: str
+    booted_system: str
+    current_kernel: str
+    booted_kernel: str
+
+
+def get_nixos_systems(
+    machine: Machine, target_host: Remote | LocalHost
+) -> NixOSSystems | None:
+    """Return the newest ``nixos_systems`` sample reported by the target host.
+
+    Returns None when the metrics contain no such sample.
+    """
+    latest: MetricSample | None = None
+    for metric in get_metrics(machine, target_host):
+        if metric["name"] != NIXOS_SYSTEMS_METRIC:
+            continue
+        # telegraf's file output appends, so stale samples can precede the
+        # current one; `>=` lets the later line win when timestamps tie.
+        if latest is None or metric["timestamp"] >= latest["timestamp"]:
+            latest = metric
+
+    if latest is None:
+        return None
+
+    tags = latest["tags"]
+    return NixOSSystems(
+        current_system=tags["current_system"],
+        booted_system=tags["booted_system"],
+        current_kernel=tags["current_kernel"],
+        booted_kernel=tags["booted_kernel"],
+    )
+
+
+@API.register
+def check_machine_up_to_date(
+    machine: Machine,
+    target_host: Remote | LocalHost,
+) -> bool:
+    """Check if a machine needs an update.
+
+    Args:
+        machine: The Machine instance to check.
+        target_host: Remote or LocalHost instance representing the target host.
+
+    Returns:
+        bool: True if the machine needs an update, False otherwise.
+
+    Raises:
+        ClanError: If the machine reports no ``nixos_systems`` metric.
+    """
+    nixos_systems = get_nixos_systems(machine, target_host)
+    if nixos_systems is None:
+        msg = f"Failed to find '{NIXOS_SYSTEMS_METRIC}' metric in the telegraf metrics of machine '{machine.name}'."
+        raise ClanError(msg)
+
+    machine.info(f"Getting system outPath from {machine.name}...")
+
+    # NOTE(review): the return value of nix_eval() is compared directly with a
+    # store path below - confirm it yields the evaluated string and not a
+    # command line that still has to be run.
+    git_out_path = nix_eval(
+        [
+            f"{machine.flake}#nixosConfigurations.'{machine.name}'.config.system.build.toplevel.outPath"
+        ]
+    )
+
+    log.debug(
+        f"Checking if {machine.name} needs an update:\n"
+        f"Machine outPath: {nixos_systems.current_system}\n"
+        f"Git outPath : {git_out_path}\n"
+    )
+
+    return git_out_path != nixos_systems.current_system
diff --git a/pkgs/clan-cli/clan_lib/ssh/host.py b/pkgs/clan-cli/clan_lib/ssh/host.py
index ad9ae9765..63cbb4f61 100644
--- a/pkgs/clan-cli/clan_lib/ssh/host.py
+++ b/pkgs/clan-cli/clan_lib/ssh/host.py
@@ -17,6 +17,12 @@ class Host(Protocol):
     This provides a common interface for both local and remote hosts.
     """
 
+    @property
+    def address(self) -> str:
+        """Return the address of the host."""
+        msg = "Subclasses must implement address property"
+        raise NotImplementedError(msg)
+
     @property
     def command_prefix(self) -> str | None: ...
 
diff --git a/pkgs/clan-cli/clan_lib/ssh/localhost.py b/pkgs/clan-cli/clan_lib/ssh/localhost.py
index 4db3e5b26..f62a12856 100644
--- a/pkgs/clan-cli/clan_lib/ssh/localhost.py
+++ b/pkgs/clan-cli/clan_lib/ssh/localhost.py
@@ -20,6 +20,11 @@ class LocalHost:
     _user: str = field(default_factory=lambda: os.environ.get("USER", "root"))
     _askpass_path: str | None = None
 
+    @property
+    def address(self) -> str:
+        """Return the address of the localhost."""
+        return "localhost"
+
     @property
     def target(self) -> str:
         """Return a descriptive target string for localhost."""