Merge pull request 'Add monitoring service' (#4756) from monitoring-service into main

Reviewed-on: https://git.clan.lol/clan/clan-core/pulls/4756
This commit is contained in:
pinpox
2025-08-15 08:57:36 +00:00
6 changed files with 150 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
## Usage
```nix
# Example inventory entry that enables the monitoring service.
inventory.instances = {
monitoring = {
# Name of the clan service module to instantiate.
module.name = "monitoring";
# Give the `telegraf` role to machines selected by the `all` tag.
roles.telegraf.tags.all = {
# Interfaces on which the metrics port is opened in the firewall
# (the role's default is [ "zt+" ]).
settings.interfaces = [ "wg-clan" ];
};
};
};
```
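This service will eventually set up a full monitoring stack for your clan. For
now, only the `telegraf` role is implemented. It exposes the store path of the
currently deployed system configuration as a Prometheus metric on TCP port
9273, so you can check whether a machine needs an update. The port is opened
in the firewall only on the interfaces listed in `settings.interfaces`
(default: `zt+`), unless `settings.allowAllInterfaces` is set to `true`.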

View File

@@ -0,0 +1,28 @@
# Clan service definition for `monitoring`.
#
# The outer function receives `packages` from the importing flake module; it
# is not referenced in this file but is kept so existing callers stay valid.
{ packages }:
{ ... }:
{
  _class = "clan.service";

  manifest = {
    name = "clan-core/monitoring";
    description = "Monitoring service for the nodes in your clan";
    readme = builtins.readFile ./README.md;
  };

  # Settings accepted by the `telegraf` role. The role's behaviour is
  # implemented in ./telegraf.nix, where these settings only select which
  # firewall rules are generated.
  roles.telegraf.interface =
    { lib, ... }:
    {
      options = {
        allowAllInterfaces = lib.mkOption {
          type = lib.types.bool;
          default = false;
          description = "If true, the Telegraf metrics port is opened in the firewall on all interfaces. Otherwise, it is only opened on the interfaces listed in `interfaces`.";
        };

        interfaces = lib.mkOption {
          type = lib.types.listOf lib.types.str;
          default = [ "zt+" ];
          description = "List of network interfaces on which the metrics port is opened in the firewall. Ignored when `allowAllInterfaces` is true.";
        };
      };
    };

  imports = [ ./telegraf.nix ];
}

View File

@@ -0,0 +1,23 @@
# Flake module for the monitoring service: registers the service module under
# `clan.modules.monitoring` and declares a per-system NixOS test that uses the
# same module.
{ self, lib, ... }:
let
  inherit (lib.modules) importApply;

  # ./default.nix applied to the flake's own packages; shared by both the
  # exported module and the test below.
  monitoring = importApply ./default.nix {
    packages = self.packages;
  };
in
{
  clan.modules = {
    inherit monitoring;
  };

  perSystem =
    { ... }:
    {
      clan.nixosTests.monitoring = {
        imports = [ ./tests/vm/default.nix ];
        clan.modules = {
          inherit monitoring;
        };
      };
    };
}

View File

@@ -0,0 +1,57 @@
# Implementation of the `telegraf` role: enables Telegraf with a set of system
# inputs, publishes them through the prometheus_client output, and opens the
# metrics port in the firewall according to the role settings.
{
  roles.telegraf.perInstance =
    { settings, ... }:
    let
      # TCP port of the prometheus_client output. Defined once because the
      # firewall rules and the listen address below must always agree.
      metricsPort = 9273;
    in
    {
      nixosModule =
        { pkgs, lib, ... }:
        {
          # Restricted mode: open the port only on the configured interfaces.
          networking.firewall.interfaces = lib.mkIf (!settings.allowAllInterfaces) (
            lib.genAttrs settings.interfaces (_interface: {
              allowedTCPPorts = [ metricsPort ];
            })
          );

          # Unrestricted mode: open the port on every interface.
          networking.firewall.allowedTCPPorts = lib.mkIf settings.allowAllInterfaces [ metricsPort ];

          services.telegraf = {
            enable = true;
            extraConfig = {
              agent.interval = "60s";
              inputs = {
                diskio = { };
                kernel_vmstat = { };
                system = { };
                mem = { };
                systemd_units = { };
                swap = { };
                exec =
                  let
                    # Emits one influx line whose `path` tag is the target of
                    # /run/current-system. readlink is taken from coreutils
                    # explicitly so the script does not depend on the
                    # service's PATH, and the substitution is quoted so the
                    # result is always passed to printf as a single argument.
                    currentSystemScript = pkgs.writeShellScript "current-system" ''
                      printf "current_system,path=%s present=0\n" "$(${pkgs.coreutils}/bin/readlink /run/current-system)"
                    '';
                  in
                  [
                    {
                      # Expose the path to current-system as metric. We use
                      # this to check if the machine is up-to-date.
                      commands = [ currentSystemScript ];
                      data_format = "influx";
                    }
                  ];
              };
              outputs.prometheus_client = {
                # No host part, so the firewall rules above are what limit
                # which interfaces can reach the endpoint.
                listen = ":${toString metricsPort}";
                metric_version = 2;
              };
            };
          };
        };
    };
}

View File

@@ -0,0 +1,24 @@
# VM test for the monitoring service: one machine (`peer1`) with the
# `telegraf` role assigned through the inventory.
{
  name = "monitoring";

  clan = {
    directory = ./.;
    inventory = {
      machines.peer1 = { };
      instances."test" = {
        module.name = "monitoring";
        module.input = "self";
        roles.telegraf.machines.peer1 = { };
      };
    };
  };

  testScript =
    { ... }:
    ''
      start_all()

      # Booting alone does not prove the service works: make sure Telegraf
      # actually started and that its Prometheus endpoint is listening.
      peer1.wait_for_unit("telegraf.service")
      peer1.wait_for_open_port(9273)
    '';
}

View File

@@ -103,6 +103,7 @@ nav:
- reference/clanServices/localbackup.md
- reference/clanServices/matrix-synapse.md
- reference/clanServices/mycelium.md
- reference/clanServices/monitoring.md
- reference/clanServices/packages.md
- reference/clanServices/sshd.md
- reference/clanServices/state-version.md