Merge pull request 'working check_machine_up_to_date' (#4754) from Qubasa/clan-core:build_is_installed_api into main
Reviewed-on: https://git.clan.lol/clan/clan-core/pulls/4754
This commit is contained in:
@@ -10,22 +10,34 @@
|
|||||||
lib,
|
lib,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
let
|
||||||
|
jsonpath = "/tmp/telegraf.json";
|
||||||
|
auth_user = "prometheus";
|
||||||
|
in
|
||||||
{
|
{
|
||||||
|
|
||||||
networking.firewall.interfaces = lib.mkIf (settings.allowAllInterfaces == false) (
|
networking.firewall.interfaces = lib.mkIf (settings.allowAllInterfaces == false) (
|
||||||
builtins.listToAttrs (
|
builtins.listToAttrs (
|
||||||
map (name: {
|
map (name: {
|
||||||
inherit name;
|
inherit name;
|
||||||
value.allowedTCPPorts = [ 9273 ];
|
value.allowedTCPPorts = [
|
||||||
|
9273
|
||||||
|
9990
|
||||||
|
];
|
||||||
}) settings.interfaces
|
}) settings.interfaces
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
networking.firewall.allowedTCPPorts = lib.mkIf (settings.allowAllInterfaces == true) [ 9273 ];
|
networking.firewall.allowedTCPPorts = lib.mkIf (settings.allowAllInterfaces == true) [
|
||||||
|
9273
|
||||||
|
9990
|
||||||
|
];
|
||||||
|
|
||||||
clan.core.vars.generators."telegraf-password" = {
|
clan.core.vars.generators."telegraf" = {
|
||||||
files.telegraf-password.neededFor = "users";
|
|
||||||
files.telegraf-password.restartUnits = [ "telegraf.service" ];
|
files.password.restartUnits = [ "telegraf.service" ];
|
||||||
|
files.password-env.restartUnits = [ "telegraf.service" ];
|
||||||
|
files.miniserve-auth.restartUnits = [ "telegraf.service" ];
|
||||||
|
|
||||||
runtimeInputs = [
|
runtimeInputs = [
|
||||||
pkgs.coreutils
|
pkgs.coreutils
|
||||||
@@ -35,16 +47,22 @@
|
|||||||
|
|
||||||
script = ''
|
script = ''
|
||||||
PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
|
PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
|
||||||
echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/telegraf-password
|
echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env
|
||||||
|
echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth
|
||||||
|
echo "$PASSWORD" | tr -d "\n" > "$out"/password
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
systemd.services.telegraf-json = {
|
||||||
|
enable = true;
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
script = "${pkgs.miniserve}/bin/miniserve -p 9990 ${jsonpath} --auth-file ${config.clan.core.vars.generators.telegraf.files.miniserve-auth.path}";
|
||||||
|
};
|
||||||
|
|
||||||
services.telegraf = {
|
services.telegraf = {
|
||||||
enable = true;
|
enable = true;
|
||||||
environmentFiles = [
|
environmentFiles = [
|
||||||
(builtins.toString
|
(builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path)
|
||||||
config.clan.core.vars.generators."telegraf-password".files.telegraf-password.path
|
|
||||||
)
|
|
||||||
];
|
];
|
||||||
extraConfig = {
|
extraConfig = {
|
||||||
agent.interval = "60s";
|
agent.interval = "60s";
|
||||||
@@ -59,25 +77,35 @@
|
|||||||
|
|
||||||
exec =
|
exec =
|
||||||
let
|
let
|
||||||
currentSystemScript = pkgs.writeShellScript "current-system" ''
|
nixosSystems = pkgs.writeShellScript "current-system" ''
|
||||||
printf "current_system,path=%s present=0\n" $(readlink /run/current-system)
|
printf "nixos_systems,current_system=%s,booted_system=%s,current_kernel=%s,booted_kernel=%s present=0\n" \
|
||||||
|
"$(readlink /run/current-system)" "$(readlink /run/booted-system)" \
|
||||||
|
"$(basename $(echo /run/current-system/kernel-modules/lib/modules/*))" \
|
||||||
|
"$(basename $(echo /run/booted-system/kernel-modules/lib/modules/*))"
|
||||||
'';
|
'';
|
||||||
in
|
in
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
# Expose the path to current-system as metric. We use
|
# Expose the path to current-system as metric. We use
|
||||||
# this to check if the machine is up-to-date.
|
# this to check if the machine is up-to-date.
|
||||||
commands = [ currentSystemScript ];
|
commands = [ nixosSystems ];
|
||||||
data_format = "influx";
|
data_format = "influx";
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
# Sadly, there doesn't seem to be a telegraf http_client output plugin.
|
||||||
outputs.prometheus_client = {
|
outputs.prometheus_client = {
|
||||||
listen = ":9273";
|
listen = ":9273";
|
||||||
metric_version = 2;
|
metric_version = 2;
|
||||||
basic_username = "prometheus";
|
basic_username = "${auth_user}";
|
||||||
basic_password = "$${BASIC_AUTH_PWD}";
|
basic_password = "$${BASIC_AUTH_PWD}";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
outputs.file = {
|
||||||
|
files = [ jsonpath ];
|
||||||
|
data_format = "json";
|
||||||
|
json_timestamp_units = "1s";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
0
pkgs/clan-cli/clan_lib/metrics/__init__.py
Normal file
0
pkgs/clan-cli/clan_lib/metrics/__init__.py
Normal file
71
pkgs/clan-cli/clan_lib/metrics/telegraf.py
Normal file
71
pkgs/clan-cli/clan_lib/metrics/telegraf.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import urllib.request
|
||||||
|
from base64 import b64encode
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from typing import Any, TypedDict, cast
|
||||||
|
|
||||||
|
from clan_cli.vars.get import get_machine_var
|
||||||
|
|
||||||
|
from clan_lib.errors import ClanError
|
||||||
|
from clan_lib.machines.machines import Machine
|
||||||
|
from clan_lib.ssh.host import Host
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MetricSample(TypedDict):
    """One metric record as decoded from a single JSON line of telegraf output."""

    # Field values of the measurement, keyed by field name.
    fields: dict[str, Any]
    # Measurement name (e.g. the consumer in version.py looks for "nixos_systems").
    name: str
    # String-valued tags attached to the measurement.
    tags: dict[str, str]
    # Sample time as an integer; presumably Unix seconds — TODO confirm against
    # the telegraf `json_timestamp_units` setting.
    timestamp: int
|
||||||
|
|
||||||
|
|
||||||
|
def get_metrics(
    machine: Machine,
    target_host: Host,
) -> Iterator[MetricSample]:
    """Stream the telegraf metrics a machine exposes over HTTP.

    Requests ``http://<target_host.address>:9990`` with HTTP basic auth (user
    ``prometheus``, password taken from the machine var ``telegraf/password``)
    and decodes the response as one JSON object per line.

    This is a generator: nothing is fetched, and no error is raised, until the
    returned iterator is first advanced.

    Args:
        machine: The machine whose ``telegraf/password`` var is used to authenticate.
        target_host: Host whose ``address`` is queried.

    Yields:
        One ``MetricSample`` per non-empty, valid JSON line. Lines that are not
        valid JSON are logged at warning level and skipped.

    Raises:
        ClanError: If the ``telegraf/password`` var does not exist, or if the
            request or reading the response fails for any reason.
    """
    url = f"http://{target_host.address}:9990"
    username = "prometheus"
    var_name = "telegraf/password"
    password_var = get_machine_var(machine, var_name)
    if not password_var.exists:
        msg = (
            f"Missing required var '{var_name}' for machine '{machine.name}'.\n"
            f"Ensure the 'monitoring' clanService is enabled and run `clan machines update {machine.name}`.\n"
            "For more information, see: https://docs.clan.lol/reference/clanServices/monitoring/"
        )
        raise ClanError(msg)

    password = password_var.value.decode("utf-8")
    credentials = f"{username}:{password}"

    encoded_credentials = b64encode(credentials.encode("utf-8")).decode("utf-8")
    headers = {"Authorization": f"Basic {encoded_credentials}"}
    req = urllib.request.Request(url, headers=headers)

    try:
        # NOTE(review): no timeout is passed, so an unresponsive host can block
        # the caller indefinitely — consider adding one.
        # The context manager closes the connection even when the consumer
        # stops iterating early or an error occurs mid-stream.
        with urllib.request.urlopen(req) as response:
            for line in response:
                line_str = line.decode("utf-8").strip()
                if not line_str:
                    continue
                try:
                    sample = json.loads(line_str)
                except json.JSONDecodeError:
                    log.warning(f"Skipping invalid JSON line: {line_str}")
                    continue
                yield cast(MetricSample, sample)
    except Exception as e:
        # Deliberately broad: every transport/decoding failure is reported to
        # callers as a ClanError, with the original exception chained.
        msg = (
            f"Failed to fetch Prometheus metrics from {url} for machine '{machine.name}': {e}\n"
            "Ensure the telegraf.service is running and accessible."
        )
        raise ClanError(msg) from e
|
||||||
74
pkgs/clan-cli/clan_lib/metrics/version.py
Normal file
74
pkgs/clan-cli/clan_lib/metrics/version.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from clan_lib.api import API
|
||||||
|
from clan_lib.errors import ClanError
|
||||||
|
from clan_lib.machines.machines import Machine
|
||||||
|
from clan_lib.metrics.telegraf import get_metrics
|
||||||
|
from clan_lib.nix import nix_eval
|
||||||
|
from clan_lib.ssh.localhost import LocalHost
|
||||||
|
from clan_lib.ssh.remote import Remote
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class NixOSSystems:
    """Immutable record of the system/kernel tags reported by the ``nixos_systems`` metric."""

    # Value of the metric's "current_system" tag.
    current_system: str
    # Value of the metric's "booted_system" tag.
    booted_system: str
    # Value of the metric's "current_kernel" tag.
    current_kernel: str
    # Value of the metric's "booted_kernel" tag.
    booted_kernel: str
|
||||||
|
|
||||||
|
|
||||||
|
def get_nixos_systems(
    machine: Machine, target_host: Remote | LocalHost
) -> NixOSSystems | None:
    """Return the first ``nixos_systems`` metric of the target host, if any.

    Metrics are pulled lazily via ``get_metrics`` and consumption stops at the
    first sample named ``nixos_systems``. Returns ``None`` when the stream
    ends without such a sample.
    """
    candidates = (
        sample
        for sample in get_metrics(machine, target_host)
        if sample["name"] == "nixos_systems"
    )
    found = next(candidates, None)
    if found is None:
        return None

    tags = found["tags"]
    return NixOSSystems(
        current_system=tags["current_system"],
        booted_system=tags["booted_system"],
        current_kernel=tags["current_kernel"],
        booted_kernel=tags["booted_kernel"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
@API.register
def check_machine_up_to_date(
    machine: Machine,
    target_host: Remote | LocalHost,
) -> bool:
    """Check if a machine needs an update.

    Compares the ``current_system`` reported by the machine's ``nixos_systems``
    telegraf metric with the ``system.build.toplevel.outPath`` evaluated from
    the machine's flake.

    Note that, despite the function name, the result is True when the machine
    is *not* up to date.

    Args:
        machine: The Machine instance to check.
        target_host: Remote or LocalHost instance representing the target host.

    Returns:
        bool: True if the machine needs an update, False otherwise.

    Raises:
        ClanError: If no ``nixos_systems`` metric is found for the machine.
    """
    nixos_systems = get_nixos_systems(machine, target_host)

    if nixos_systems is None:
        # The metric looked up by get_nixos_systems is named "nixos_systems".
        msg = "Failed to find 'nixos_systems' metric in telegraf logs."
        raise ClanError(msg)

    machine.info(f"Getting system outPath from {machine.name}...")

    # NOTE(review): the return type of nix_eval is not visible from here. If it
    # returns a command line rather than the evaluated store path, the
    # comparison below is always True — confirm.
    git_out_path = nix_eval(
        [
            f"{machine.flake}#nixosConfigurations.'{machine.name}'.config.system.build.toplevel.outPath"
        ]
    )

    log.debug(
        f"Checking if {machine.name} needs an update:\n"
        f"Machine outPath: {nixos_systems.current_system}\n"
        f"Git outPath    : {git_out_path}\n"
    )

    return git_out_path != nixos_systems.current_system
|
||||||
@@ -17,6 +17,12 @@ class Host(Protocol):
|
|||||||
This provides a common interface for both local and remote hosts.
|
This provides a common interface for both local and remote hosts.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@property
def address(self) -> str:
    """Address of this host.

    This default always raises; implementations are expected to provide
    their own ``address``.
    """
    reason = "Subclasses must implement address property"
    raise NotImplementedError(reason)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def command_prefix(self) -> str | None: ...
|
def command_prefix(self) -> str | None: ...
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,11 @@ class LocalHost:
|
|||||||
_user: str = field(default_factory=lambda: os.environ.get("USER", "root"))
|
_user: str = field(default_factory=lambda: os.environ.get("USER", "root"))
|
||||||
_askpass_path: str | None = None
|
_askpass_path: str | None = None
|
||||||
|
|
||||||
|
@property
def address(self) -> str:
    """Address of the local machine, which is always ``"localhost"``."""
    return "localhost"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def target(self) -> str:
|
def target(self) -> str:
|
||||||
"""Return a descriptive target string for localhost."""
|
"""Return a descriptive target string for localhost."""
|
||||||
|
|||||||
Reference in New Issue
Block a user