Compare commits

..

4 Commits

Author SHA1 Message Date
pinpox
bdaff0a8a4 Add favicon 2025-10-28 10:09:14 +01:00
pinpox
fabbfcaab6 fix template 2025-10-28 01:01:07 +01:00
pinpox
98cfaac849 Add prometheus console 2025-10-26 21:54:14 +01:00
pinpox
decb91a529 clanServices/monitoring: add prometheus role 2025-10-26 12:09:05 +01:00
44 changed files with 779 additions and 539 deletions

View File

@@ -0,0 +1,24 @@
{ lib }:
let
  # Translate one { condition, time?, description } spec into the attrset
  # shape Prometheus expects for an alerting rule. `time` defaults to "2m"
  # when a spec does not set it.
  toRule = alertName: spec: {
    alert = alertName;
    expr = spec.condition;
    for = spec.time or "2m";
    labels = { };
    annotations.description = spec.description;
  };

  alertSpecs = {
    # TODO Remove this alert, just for testing
    "Filesystem > = 10%" = {
      condition = ''disk_used_percent{fstype!~"tmpfs|vfat|devtmpfs|efivarfs"} > 10'';
      time = "1m";
      description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 90% space left on its filesystem.";
    };
    filesystem_full_80percent = {
      condition = ''disk_used_percent{fstype!~"tmpfs|vfat|devtmpfs|efivarfs"} > 80'';
      time = "1m";
      description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem.";
    };
  };
in
lib.mapAttrsToList toRule alertSpecs

View File

@@ -24,5 +24,48 @@
};
};
imports = [ ./telegraf.nix ];
roles.prometheus = {
description = "Prometheus monitoring daemon. Will collect metrics from all hosts with the telegraf role";
interface =
{ lib, ... }:
{
options.webExternalUrl = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
example = "https://prometheus.tld";
description = "The URL under which Prometheus is externally reachable";
};
};
};
imports = [
./telegraf.nix
./prometheus.nix
];
perMachine.nixosModule =
{ pkgs, ... }:
{
clan.core.vars.generators."prometheus" = {
share = true;
files.password.restartUnits = [
"telegraf.service"
"prometheus.service"
];
files.password-env.restartUnits = [ "telegraf.service" ];
runtimeInputs = [
pkgs.coreutils
pkgs.xkcdpass
];
script = ''
xkcdpass --numwords 6 --delimiter - --count 1 | tr -d "\n" > $out/password
printf 'BASIC_AUTH_PWD=%s\n' "$(cat $out/password)" > $out/password-env
'';
};
};
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View File

@@ -0,0 +1,11 @@
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:svgjs="http://svgjs.dev/svgjs" width="1000" height="1000"><g clip-path="url(#SvgjsClipPath1007)"><rect width="1000" height="1000" fill="#ffffff"></rect><g transform="matrix(5.132341080724394,0,0,5.132341080724394,217.38764012391061,149.97935090550055)"><svg xmlns="http://www.w3.org/2000/svg" version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:svgjs="http://svgjs.dev/svgjs" width="110.13" height="136.39"><svg id="Layer_1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 110.13 136.39">
<defs>
<style>
.cls-1 {
fill: #231f20;
}
</style>
<clipPath id="SvgjsClipPath1007"><rect width="1000" height="1000" x="0" y="0" rx="350" ry="350"></rect></clipPath></defs>
<path class="cls-1" d="M88.27,30.81h16.69c1.77,0,3.21-1.44,3.21-3.21v-12.84c0-1.77-1.44-3.21-3.21-3.21h-5.26c-1.7,0-3.08-1.38-3.08-3.08V3.21c0-1.77-1.44-3.21-3.21-3.21h-47.49c-1.77,0-3.21,1.44-3.21,3.21v5.26c0,1.7-1.38,3.08-3.08,3.08h-5.26c-1.77,0-3.21,1.44-3.21,3.21v5.26c0,1.7-1.38,3.08-3.08,3.08h-5.26c-1.77,0-3.21,1.44-3.21,3.21,0,0-.77-1.95-.77,34.47,0,32.56.77,29.7.77,29.7,0,1.77,1.44,3.21,3.21,3.21h5.26c1.7,0,3.08,1.38,3.08,3.08v5.39c0,1.7,1.38,3.08,3.08,3.08h5.39c1.7,0,3.08,1.38,3.08,3.08v5.26c0,1.77,1.44,3.21,3.21,3.21h46.21c1.77,0,3.21-1.44,3.21-3.21v-5.26c0-1.7,1.38-3.08,3.08-3.08h8.5c1.77,0,3.21-1.44,3.21-3.21v-15.3c0-1.77-1.44-3.21-3.21-3.21h-19.93c-1.77,0-3.21,1.44-3.21,3.21v7.73c0,1.7-1.38,3.08-3.08,3.08h-23.36c-1.7,0-3.08-1.38-3.08-3.08v-7.83c0-1.77-1.44-3.21-3.21-3.21h-7.83c-1.7,0-2.66.25-3.08-3.08-.13-1.07-.2-2.38-.3-4.13-.25-4.41-.47-2.64-.47-15.89,0-18.52.48-23.85.77-26.42s1.38-3.08,3.08-3.08h7.83c1.77,0,3.21-1.44,3.21-3.21v-5.26c0-1.7,1.38-3.08,3.08-3.08h24.65c1.7,0,3.08,1.38,3.08,3.08v5.26c0,1.77,1.44,3.21,3.21,3.21Z"></path>
<path class="cls-1" d="M28.49,113.03h-3.79c-.74,0-1.33-.6-1.33-1.33v-3.79c0-1.47-1.19-2.67-2.67-2.67h-10.24c-1.47,0-2.67,1.19-2.67,2.67v3.79c0,.74-.6,1.33-1.33,1.33h-3.79c-1.47,0-2.67,1.19-2.67,2.67v10.24c0,1.47,1.19,2.67,2.67,2.67h3.79c.74,0,1.33.6,1.33,1.33v3.79c0,1.47,1.19,2.67,2.67,2.67h10.24c1.47,0,2.67-1.19,2.67-2.67v-3.79c0-.74.6-1.33,1.33-1.33h3.79c1.47,0,2.67-1.19,2.67-2.67v-10.24c0-1.47-1.19-2.67-2.67-2.67Z"></path>
</svg></svg></g></g></svg>

After

Width:  |  Height:  |  Size: 2.3 KiB

View File

@@ -0,0 +1,249 @@
<!DOCTYPE html> <html>
<!-- Prometheus console template for the clan monitoring service.
     Rendered server-side by Prometheus' Go template engine; `query`,
     `sortByLabel` and `reReplaceAll` are Prometheus console-template
     functions, not client-side code. -->
<head>
<meta charset="utf-8">
<title>Clan status</title>
<link rel="icon" type="image/png" href="favicon-48x48.png" sizes="48x48" />
<link rel="icon" type="image/svg+xml" href="favicon.svg" />
<link rel="shortcut icon" href="favicon.ico" />
<link rel="apple-touch-icon" sizes="180x180" href="apple-touch-icon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
/* NOTE(review): these rules appear to duplicate the standalone console
   stylesheet shipped next to this template — keep the two in sync, or
   drop one of them. */
:root {
--dark: rgb(22, 35, 36);
--light: rgb(229, 231, 235);
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
margin: 0;
padding: 20px;
background: var(--dark);
}
.container {
max-width: 1400px;
margin: 0 auto;
background: var(--light);
padding: 30px;
border-radius: 8px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
h1 {
margin-top: 0;
color: #333;
border-bottom: 2px solid var(--dark);
padding-bottom: 10px;
}
h2 {
color: #555;
margin-top: 30px;
}
table {
width: 100%;
border-collapse: collapse;
margin: 20px 0;
}
th {
background: var(--dark);
color: var(--light);
padding: 12px;
text-align: left;
font-weight: 600;
}
td {
padding: 10px 12px;
border-bottom: 1px solid #ddd;
}
tr:hover {
background: var(--light);
}
.status-up {
color: #28a745;
font-weight: bold;
}
.status-down {
color: #dc3545;
font-weight: bold;
}
.alert-success {
background: #d4edda;
color: #155724;
padding: 12px;
border-radius: 4px;
border: 1px solid #c3e6cb;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 20px;
margin: 20px 0;
}
.card {
border: 1px solid #ddd;
border-radius: 4px;
padding: 15px;
}
.metric-value {
font-size: 1.2em;
font-weight: bold;
color: var(--dark);
}
</style>
</head>
<body>
<div class="container">
<h1>Clan Status</h1>
<h2>Instances</h2>
<!-- One row per scraped target. The "up" metric is 1 while the last
     scrape succeeded; the hostname is the instance label with
     everything from the first dot onward stripped. -->
<table>
<thead>
<tr>
<th>Host</th>
<th>Status</th>
<th>CPU Usage</th>
<th>Memory Usage</th>
<th>Disk Usage</th>
</tr>
</thead>
<tbody>
{{ range query "up" | sortByLabel "instance" }}
{{ $hostname := reReplaceAll "\\..*" "" .Labels.instance }}
<tr>
<td>{{ $hostname }}</td>
<td>
{{ if eq .Value 1.0 }}
<span class="status-up">UP</span>
{{ else }}
<span class="status-down">DOWN</span>
{{ end }}
</td>
<td>
<!-- CPU/memory/disk cells below all follow the same pattern:
     run a per-host query, show the first sample, fall back to N/A
     when the query returns no data (e.g. host down). -->
{{ $cpuQuery := query (printf "100 - cpu_usage_idle{cpu=\"cpu-total\",host=\"%s\"}" $hostname) }}
{{ if $cpuQuery }}
{{ with $cpuQuery | first }}
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
{{ end }}
{{ else }}
N/A
{{ end }}
</td>
<td>
{{ $memQuery := query (printf "(1 - (mem_available{host=\"%s\"} / mem_total{host=\"%s\"})) * 100" $hostname $hostname) }}
{{ if $memQuery }}
{{ with $memQuery | first }}
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
{{ end }}
{{ else }}
N/A
{{ end }}
</td>
<td>
{{ $diskQuery := query (printf "(1 - (disk_free{host=\"%s\",path=\"/\"} / disk_total{host=\"%s\",path=\"/\"})) * 100" $hostname $hostname) }}
{{ if $diskQuery }}
{{ with $diskQuery | first }}
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
{{ end }}
{{ else }}
N/A
{{ end }}
</td>
</tr>
{{ end }}
</tbody>
</table>
<h2>Services</h2>
<!-- Placeholder section: rows are commented out until per-service
     health metrics are available. -->
<table>
<thead>
<tr>
<th>Service</th>
<th>Host</th>
<th>State</th>
</tr>
</thead>
<tbody>
<!-- <tr> -->
<!-- <td>Vaultwarden</td> -->
<!-- <td>kiwi</td> -->
<!-- <td> -->
<!-- <span class="status-up">UP</span> -->
<!-- </td> -->
<!-- </tr> -->
</tbody>
</table>
<!-- <h2>NixOS Systems</h2> -->
<!-- <table> -->
<!-- <thead> -->
<!-- <tr> -->
<!-- <th>Host</th> -->
<!-- <th>Booted System</th> -->
<!-- <th>Current System</th> -->
<!-- <th>Booted Kernel</th> -->
<!-- <th>Current Kernel</th> -->
<!-- </tr> -->
<!-- </thead> -->
<!-- <tbody> -->
<!-- {{ range query "nixos_systems_present" | sortByLabel "host" }} -->
<!-- <tr> -->
<!-- <td>{{ .Labels.host }}</td> -->
<!-- <td style="font-family: monospace; font-size: 0.85em;">{{ .Labels.booted_system }}</td> -->
<!-- <td style="font-family: monospace; font-size: 0.85em;">{{ .Labels.current_system }}</td> -->
<!-- <td>{{ .Labels.booted_kernel }}</td> -->
<!-- <td>{{ .Labels.current_kernel }}</td> -->
<!-- </tr> -->
<!-- {{ end }} -->
<!-- </tbody> -->
<!-- </table> -->
<h2>Failed Systemd Units</h2>
<!-- Units currently in the systemd "failed" sub-state, per the
     systemd_units_sub_code metric. -->
{{ $failedUnits := query "systemd_units_sub_code{sub=\"failed\"}" }}
{{ if $failedUnits }}
<table>
<thead>
<tr>
<th>Host</th>
<th>Unit</th>
</tr>
</thead>
<tbody>
{{ range $failedUnits | sortByLabel "host" }}
<tr>
<td>{{ .Labels.host }}</td>
<td style="color: #dc3545;">{{ .Labels.name }}</td>
</tr>
{{ end }}
</tbody>
</table>
{{ else }}
<div class="alert-success">No failed systemd units</div>
{{ end }}
<h2>Active Alerts</h2>
<!-- Firing alerts read from Prometheus' built-in ALERTS meta-metric.
     Some alerts only carry `instance`, not `host`, hence the `or`. -->
{{ with query "ALERTS{alertstate=\"firing\"}" }}
<table>
<thead>
<tr>
<th>Host</th>
<th>Alert</th>
<th>Value</th>
</tr>
</thead>
<tbody>
{{ range . }}
<tr>
<td>{{ or .Labels.host .Labels.instance }}</td>
<td>{{ .Labels.alertname }}</td>
<td>{{ .Value }}</td>
</tr>
{{ end }}
</tbody>
</table>
{{ else }}
<div class="alert-success">No active alerts</div>
{{ end }}
</div>
</body>
</html>

View File

@@ -0,0 +1,80 @@
/* Stylesheet for the Clan status Prometheus console.
   NOTE(review): these rules appear identical to the <style> block embedded
   in the HTML console template — keep the two in sync, or drop one. */

/* Shared palette: dark page background, light content card. */
:root {
--dark: rgb(22, 35, 36);
--light: rgb(229, 231, 235);
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
margin: 0;
padding: 20px;
background: var(--dark);
}
/* Centered card that holds all page content. */
.container {
max-width: 1400px;
margin: 0 auto;
background: var(--light);
padding: 30px;
border-radius: 8px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
h1 {
margin-top: 0;
color: #333;
border-bottom: 2px solid var(--dark);
padding-bottom: 10px;
}
h2 {
color: #555;
margin-top: 30px;
}
/* Status tables (instances, services, failed units, alerts). */
table {
width: 100%;
border-collapse: collapse;
margin: 20px 0;
}
th {
background: var(--dark);
color: var(--light);
padding: 12px;
text-align: left;
font-weight: 600;
}
td {
padding: 10px 12px;
border-bottom: 1px solid #ddd;
}
tr:hover {
background: var(--light);
}
/* Up/down badges rendered inside the status column. */
.status-up {
color: #28a745;
font-weight: bold;
}
.status-down {
color: #dc3545;
font-weight: bold;
}
/* Green "all clear" banner shown when a section has no entries. */
.alert-success {
background: #d4edda;
color: #155724;
padding: 12px;
border-radius: 4px;
border: 1px solid #c3e6cb;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 20px;
margin: 20px 0;
}
.card {
border: 1px solid #ddd;
border-radius: 4px;
padding: 15px;
}
/* Numeric readouts (CPU/memory/disk percentages). */
.metric-value {
font-size: 1.2em;
font-weight: bold;
color: var(--dark);
}

View File

@@ -0,0 +1,83 @@
{
  # Prometheus server role for the clan monitoring service.
  # Scrapes the telegraf exporter on every machine that has the
  # `telegraf` role in the same instance.
  roles.prometheus.perInstance =
    {
      settings,
      instanceName,
      roles,
      ...
    }:
    {
      nixosModule =
        {
          config,
          lib,
          pkgs,
          ...
        }:
        {
          systemd.services.prometheus = {
            serviceConfig = {
              # Hand the shared scrape password to the unit via systemd
              # credentials; %d expands to the unit's credentials directory.
              # The bind mount makes it visible at the fixed path that
              # basic_auth.password_file below expects.
              LoadCredential = "password:${config.clan.core.vars.generators.prometheus.files.password.path}";
              BindReadOnlyPaths = "%d/password:/etc/prometheus/password";
            };
          };
          services.prometheus = {
            enable = true;
            # TODO what do we set here? do we even need something?
            # TODO this should be a export
            # "https://prometheus.${config.clan.core.settings.tld}";
            webExternalUrl = settings.webExternalUrl;
            # Configure console templates and libraries paths
            extraFlags = [
              "--storage.tsdb.retention.time=30d"
              "--web.console.templates=${./prometheus-consoles}"
              "--web.console.libraries=${./prometheus-consoles}"
            ];
            # Alerting rules are generated at build time from
            # ./alert-rules.nix; toJSON output is also valid YAML, so the
            # .yml rule file parses fine.
            ruleFiles = [
              (pkgs.writeText "prometheus-rules.yml" (
                builtins.toJSON {
                  groups = [
                    {
                      name = "alerting-rules";
                      rules = import ./alert-rules.nix { inherit lib; };
                    }
                  ];
                }
              ))
            ];
            scrapeConfigs = [
              {
                job_name = "telegraf";
                scrape_interval = "60s";
                metrics_path = "/metrics";
                # Password file is the credential bind-mounted above.
                basic_auth.username = "prometheus";
                basic_auth.password_file = "/etc/prometheus/password";
                static_configs = [
                  {
                    # Scrape all machines with the `telegraf` role
                    # https://prometheus:<password>@<host>.<tld>:9273/metrics
                    # scheme = "https";
                    # scheme = "http";
                    targets = map (m: "${m}.${config.clan.core.settings.tld}:9273") (
                      lib.attrNames roles.telegraf.machines
                    );
                    # Tag every target with the instance it belongs to.
                    labels.type = instanceName;
                  }
                ];
              }
            ];
          };
        };
    };
}

View File

@@ -1,128 +1,32 @@
{
roles.telegraf.perInstance =
{ settings, ... }:
{ ... }:
{
nixosModule =
{
config,
pkgs,
lib,
...
}:
let
auth_user = "prometheus";
in
{
warnings =
lib.optionals (settings.allowAllInterfaces != null) [
"monitoring.settings.allowAllInterfaces is deprecated and and has no effect. Please remove it from your inventory."
"The monitoring service will now always listen on all interfaces over https."
]
++ (lib.optionals (settings.interfaces != null) [
"monitoring.settings.interfaces is deprecated and and has no effect. Please remove it from your inventory."
"The monitoring service will now always listen on all interfaces over https."
]);
networking.firewall.allowedTCPPorts = [
9273
9990
];
clan.core.vars.generators."telegraf-certs" = {
files.crt = {
restartUnits = [ "telegraf.service" ];
deploy = true;
secret = false;
};
files.key = {
mode = "0600";
restartUnits = [ "telegraf.service" ];
};
runtimeInputs = [
pkgs.openssl
];
# TODO: Implement automated certificate rotation instead of using a 100-year expiration
script = ''
openssl req -x509 -nodes -newkey rsa:4096 \
-days 36500 \
-keyout "$out"/key \
-out "$out"/crt \
-subj "/C=US/ST=CA/L=San Francisco/O=Example Corp/OU=IT/CN=example.com"
'';
};
clan.core.vars.generators."telegraf" = {
files.password.restartUnits = [ "telegraf.service" ];
files.password-env.restartUnits = [ "telegraf.service" ];
files.miniserve-auth.restartUnits = [ "telegraf.service" ];
dependencies = [ "telegraf-certs" ];
runtimeInputs = [
pkgs.coreutils
pkgs.xkcdpass
pkgs.mkpasswd
];
script = ''
PASSWORD=$(xkcdpass --numwords 4 --delimiter - --count 1 | tr -d "\n")
echo "BASIC_AUTH_PWD=$PASSWORD" > "$out"/password-env
echo "${auth_user}:$PASSWORD" > "$out"/miniserve-auth
echo "$PASSWORD" | tr -d "\n" > "$out"/password
'';
};
systemd.services.telegraf-json = {
enable = true;
wantedBy = [ "multi-user.target" ];
after = [ "telegraf.service" ];
requires = [ "telegraf.service" ];
serviceConfig = {
LoadCredential = [
"auth_file_path:${config.clan.core.vars.generators.telegraf.files.miniserve-auth.path}"
"telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}"
"telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}"
];
Environment = [
"AUTH_FILE_PATH=%d/auth_file_path"
"CRT_PATH=%d/telegraf_crt_path"
"KEY_PATH=%d/telegraf_key_path"
];
Restart = "on-failure";
User = "telegraf";
Group = "telegraf";
RuntimeDirectory = "telegraf-www";
};
script = "${pkgs.miniserve}/bin/miniserve -p 9990 /run/telegraf-www --auth-file \"$AUTH_FILE_PATH\" --tls-cert \"$CRT_PATH\" --tls-key \"$KEY_PATH\"";
};
systemd.services.telegraf = {
serviceConfig = {
LoadCredential = [
"telegraf_crt_path:${config.clan.core.vars.generators.telegraf-certs.files.crt.path}"
"telegraf_key_path:${config.clan.core.vars.generators.telegraf-certs.files.key.path}"
];
Environment = [
"CRT_PATH=%d/telegraf_crt_path"
"KEY_PATH=%d/telegraf_key_path"
];
};
};
networking.firewall.allowedTCPPorts = [ 9273 ];
services.telegraf = {
enable = true;
environmentFiles = [
(builtins.toString config.clan.core.vars.generators.telegraf.files.password-env.path)
];
environmentFiles = [ config.clan.core.vars.generators.prometheus.files.password-env.path ];
extraConfig = {
agent.interval = "60s";
inputs = {
# More input plugins available at:
# https://github.com/influxdata/telegraf/tree/master/plugins/inputs
diskio = { };
disk = { };
cpu = { };
processes = { };
kernel_vmstat = { };
system = { };
mem = { };
@@ -147,20 +51,12 @@
}
];
};
# sadly there doesn'T seem to exist a telegraf http_client output plugin
# sadly there doesn't seem to exist a telegraf http_client output plugin
outputs.prometheus_client = {
listen = ":9273";
metric_version = 2;
basic_username = "${auth_user}";
basic_username = "prometheus";
basic_password = "$${BASIC_AUTH_PWD}";
tls_cert = "$${CRT_PATH}";
tls_key = "$${KEY_PATH}";
};
outputs.file = {
files = [ "/run/telegraf-www/telegraf.json" ];
data_format = "json";
json_timestamp_units = "1s";
};
};
};

View File

@@ -39,6 +39,7 @@
...
}:
let
uniqueStrings = list: builtins.attrNames (builtins.groupBy lib.id list);
# Collect searchDomains from all servers in this instance
allServerSearchDomains = lib.flatten (
lib.mapAttrsToList (_name: machineConfig: machineConfig.settings.certificate.searchDomains or [ ]) (
@@ -46,7 +47,7 @@
)
);
# Merge client's searchDomains with all servers' searchDomains
searchDomains = lib.uniqueStrings (settings.certificate.searchDomains ++ allServerSearchDomains);
searchDomains = uniqueStrings (settings.certificate.searchDomains ++ allServerSearchDomains);
in
{
clan.core.vars.generators.openssh-ca = lib.mkIf (searchDomains != [ ]) {

View File

@@ -22,7 +22,6 @@ in
../../clanServices/syncthing
# Required modules
../../nixosModules/clanCore
../../nixosModules/machineModules
# Dependencies like clan-cli
../../pkgs/clan-cli
];

View File

@@ -140,6 +140,9 @@
pkgs,
...
}:
let
uniqueStrings = list: builtins.attrNames (builtins.groupBy lib.id list);
in
{
imports = [
(import ./shared.nix {
@@ -156,7 +159,7 @@
config = {
systemd.services.zerotier-inventory-autoaccept =
let
machines = lib.uniqueStrings (
machines = uniqueStrings (
(lib.optionals (roles ? moon) (lib.attrNames roles.moon.machines))
++ (lib.optionals (roles ? controller) (lib.attrNames roles.controller.machines))
++ (lib.optionals (roles ? peer) (lib.attrNames roles.peer.machines))

View File

@@ -21,7 +21,6 @@ in
../../clanServices/zerotier
# Required modules
../../nixosModules/clanCore
../../nixosModules/machineModules
# Dependencies like clan-cli
../../pkgs/clan-cli
];

6
devFlake/flake.lock generated
View File

@@ -105,11 +105,11 @@
},
"nixpkgs-dev": {
"locked": {
"lastModified": 1761631514,
"narHash": "sha256-VsXz+2W4DFBozzppbF9SXD9pNcv17Z+c/lYXvPJi/eI=",
"lastModified": 1761458099,
"narHash": "sha256-XeAdn1NidGKXSwlepyjH+n58hsCDqbpx1M8sdDM2Ggc=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "a0b0d4b52b5f375658ca8371dc49bff171dbda91",
"rev": "d8cc1036c65d3c9468a91443a75b51276279ac61",
"type": "github"
},
"original": {

6
flake.lock generated
View File

@@ -71,11 +71,11 @@
]
},
"locked": {
"lastModified": 1761339987,
"narHash": "sha256-IUaawVwItZKi64IA6kF6wQCLCzpXbk2R46dHn8sHkig=",
"lastModified": 1760721282,
"narHash": "sha256-aAHphQbU9t/b2RRy2Eb8oMv+I08isXv2KUGFAFn7nCo=",
"owner": "nix-darwin",
"repo": "nix-darwin",
"rev": "7cd9aac79ee2924a85c211d21fafd394b06a38de",
"rev": "c3211fcd0c56c11ff110d346d4487b18f7365168",
"type": "github"
},
"original": {

View File

@@ -39,10 +39,32 @@ in
};
modules = [
clan-core.modules.clan.default
{
checks.minNixpkgsVersion = {
assertion = lib.versionAtLeast nixpkgs.lib.version "25.11";
message = ''
Nixpkgs version: ${nixpkgs.lib.version} is incompatible with clan-core. (>= 25.11 is recommended)
---
Your version of 'nixpkgs' seems too old for clan-core.
Please read: https://docs.clan.lol/guides/nixpkgs-flake-input
You can ignore this check by setting:
clan.checks.minNixpkgsVersion.ignore = true;
---
'';
};
}
];
};
# Important: !This logic needs to be kept in sync with lib.clan function!
apply = config: clan-core.lib.checkConfig config.checks config;
apply =
config:
lib.deepSeq (lib.mapAttrs (
id: check:
if check.ignore || check.assertion then
null
else
throw "clan.checks.${id} failed with message\n${check.message}"
) config.checks) config;
};
# Mapped flake toplevel outputs

View File

@@ -1,19 +0,0 @@
{ lib, ... }:
/**
Function to assert clan configuration checks.
Arguments:
- 'checks' attribute of clan configuration
- Any: the returned configuration (can be anything, is just passed through)
*/
checks:
lib.deepSeq (
lib.mapAttrs (
id: check:
if check.ignore || check.assertion then
null
else
throw "clan.checks.${id} failed with message\n${check.message}"
) checks
)

View File

@@ -33,23 +33,20 @@
let
nixpkgs = self.inputs.nixpkgs or clan-core.inputs.nixpkgs;
nix-darwin = self.inputs.nix-darwin or clan-core.inputs.nix-darwin;
configuration = (
lib.evalModules {
class = "clan";
specialArgs = {
inherit
self
;
inherit
nixpkgs
nix-darwin
;
};
modules = [
clan-core.modules.clan.default
m
];
}
);
in
clan-core.clanLib.checkConfig configuration.config.checks configuration
lib.evalModules {
class = "clan";
specialArgs = {
inherit
self
;
inherit
nixpkgs
nix-darwin
;
};
modules = [
clan-core.modules.clan.default
m
];
}

View File

@@ -16,8 +16,6 @@ lib.fix (
*/
callLib = file: args: import file ({ inherit lib clanLib; } // args);
checkConfig = clanLib.callLib ./clan/checkConfig.nix { };
evalService = clanLib.callLib ./evalService.nix { };
# ------------------------------------
# ClanLib functions

View File

@@ -53,12 +53,7 @@ in
};
};
}).clan
{
directory = rootPath;
self = {
inputs.nixpkgs.lib.version = "25.11";
};
};
{ config.directory = rootPath; };
in
{
inherit vclan;
@@ -99,12 +94,7 @@ in
};
};
}).clan
{
directory = rootPath;
self = {
inputs.nixpkgs.lib.version = "25.11";
};
};
{ config.directory = rootPath; };
in
{
inherit vclan;

View File

@@ -21,7 +21,6 @@ in
../../../flakeModules
../../../lib
../../../nixosModules/clanCore
../../../nixosModules/machineModules
../../../machines
../../../inventory.json
../../../modules

View File

@@ -7,10 +7,14 @@
...
}:
let
inherit (lib) mkOption types uniqueStrings;
inherit (lib) mkOption types;
inherit (types) attrsWith submoduleWith;
errorContext = "Error context: ${lib.concatStringsSep "." _ctx}";
# TODO:
# Remove once this gets merged upstream; performs in O(n*log(n) instead of O(n^2))
# https://github.com/NixOS/nixpkgs/pull/355616/files
uniqueStrings = list: builtins.attrNames (builtins.groupBy lib.id list);
/**
Merges the role- and machine-settings using the role interface
@@ -77,7 +81,6 @@ let
applySettings =
instanceName: instance:
lib.mapAttrs (roleName: role: {
settings = config.instances.${instanceName}.roles.${roleName}.finalSettings.config;
machines = lib.mapAttrs (machineName: _v: {
settings =
config.instances.${instanceName}.roles.${roleName}.machines.${machineName}.finalSettings.config;
@@ -155,29 +158,6 @@ in
(
{ name, ... }@role:
{
options.finalSettings = mkOption {
default = evalMachineSettings instance.name role.name null role.config.settings { };
type = types.raw;
description = ''
Final evaluated settings of the curent-machine
This contains the merged and evaluated settings of the role interface,
the role settings and the machine settings.
Type: 'configuration' as returned by 'lib.evalModules'
'';
apply = lib.warn ''
=== WANRING ===
'roles.<roleName>.settings' do not contain machine specific settings.
Prefer `machines.<machineName>.settings` instead. (i.e `perInstance: roles.<roleName>.machines.<machineName>.settings`)
If you have a use-case that requires access to the original role settings without machine overrides.
Contact us via matrix (https://matrix.to/#/#clan:clan.lol) or file an issue: https://git.clan.lol
This feature will be removed in the next release
'';
};
# instances.{instanceName}.roles.{roleName}.machines
options.machines = mkOption {
description = ''
@@ -879,11 +859,7 @@ in
instanceRes.nixosModule
]
++ (map (
s:
if builtins.typeOf s == "string" then
lib.warn "String types for 'extraModules' will be deprecated - ${s}" "${directory}/${s}"
else
lib.setDefaultModuleLocation "via inventory.instances.${instanceName}.roles.${roleName}" s
s: if builtins.typeOf s == "string" then "${directory}/${s}" else s
) instanceCfg.roles.${roleName}.extraModules);
};
}

View File

@@ -137,7 +137,6 @@ in
settings = { };
};
};
settings = { };
};
peer = {
machines = {
@@ -147,9 +146,6 @@ in
};
};
};
settings = {
timeout = "foo-peer";
};
};
};
settings = {

View File

@@ -102,23 +102,18 @@ in
specificRoleSettings =
res.importedModulesEvaluated.self-A.result.allMachines.jon.passthru.instances.instance_foo.roles.peer;
};
expected = {
expected = rec {
hasMachineSettings = true;
hasRoleSettings = true;
hasRoleSettings = false;
specificMachineSettings = {
timeout = "foo-peer-jon";
};
specificRoleSettings = {
machines = {
jon = {
settings = {
timeout = "foo-peer-jon";
};
settings = specificMachineSettings;
};
};
settings = {
timeout = "foo-peer";
};
};
};
};

View File

@@ -212,36 +212,6 @@ in
};
};
test_clan_check_simple_fail =
let
eval = clan {
checks.constFail = {
assertion = false;
message = "This is a constant failure";
};
};
in
{
result = eval;
expr = eval.config;
expectedError.type = "ThrownError";
expectedError.msg = "This is a constant failure";
};
test_clan_check_simple_pass =
let
eval = clan {
checks.constFail = {
assertion = true;
message = "This is a constant success";
};
};
in
{
result = eval;
expr = lib.seq eval.config 42;
expected = 42;
};
test_get_var_machine =
let
varsLib = import ./vars.nix { };

View File

@@ -1,16 +0,0 @@
{ lib, nixpkgs, ... }:
{
checks.minNixpkgsVersion = {
assertion = lib.versionAtLeast nixpkgs.lib.version "25.11";
message = ''
Nixpkgs version: ${nixpkgs.lib.version} is incompatible with clan-core. (>= 25.11 is recommended)
---
Your version of 'nixpkgs' seems too old for clan-core.
Please read: https://docs.clan.lol/guides/nixpkgs-flake-input
You can ignore this check by setting:
clan.checks.minNixpkgsVersion.ignore = true;
---
'';
};
}

View File

@@ -1,14 +1,3 @@
/**
Root 'clan' Module
Defines lib.clan and flake-parts.clan options
and all common logic for the 'clan' module.
- has Class _class = "clan"
- _module.args.clan-core: reference to clan-core flake
- _module.args.clanLib: reference to lib.clan function
*/
{ clan-core }:
{
_class = "clan";
@@ -17,8 +6,7 @@
inherit (clan-core) clanLib;
};
imports = [
./top-level-interface.nix
./module.nix
./checks.nix
./interface.nix
];
}

View File

@@ -100,7 +100,7 @@ let
_: machine:
machine.extendModules {
modules = [
(lib.modules.importApply ../../nixosModules/machineModules/overridePkgs.nix {
(lib.modules.importApply ../machineModules/overridePkgs.nix {
pkgs = pkgsFor.${system};
})
];
@@ -167,9 +167,6 @@ in
{ ... }@args:
let
_class =
# _class was added in https://github.com/NixOS/nixpkgs/pull/395141
# Clan relies on it to determine which modules to load
# people need to use at least that version of nixpkgs
args._class or (throw ''
Your version of nixpkgs is incompatible with the latest clan.
Please update nixpkgs input to the latest nixos-unstable or nixpkgs-unstable.
@@ -179,7 +176,7 @@ in
in
{
imports = [
(lib.modules.importApply ../../nixosModules/machineModules/forName.nix {
(lib.modules.importApply ../machineModules/forName.nix {
inherit (config.inventory) meta;
inherit
name
@@ -225,18 +222,6 @@ in
inventoryClass =
let
flakeInputs = config.self.inputs;
# Compute the relative directory path
selfStr = toString config.self;
dirStr = toString directory;
relativeDirectory =
if selfStr == dirStr then
""
else if lib.hasPrefix selfStr dirStr then
lib.removePrefix (selfStr + "/") dirStr
else
# This shouldn't happen in normal usage, but can occur when
# the flake is copied (e.g., in tests). Fall back to empty string.
"";
in
{
_module.args = {
@@ -245,12 +230,7 @@ in
imports = [
../inventoryClass/default.nix
{
inherit
inventory
directory
flakeInputs
relativeDirectory
;
inherit inventory directory flakeInputs;
exportsModule = config.exportsModule;
}
(

View File

@@ -1,28 +1,3 @@
/**
The templates submodule
'clan.templates'
Different kinds supported:
- clan templates: 'clan.templates.clan'
- disko templates: 'clan.templates.disko'
- machine templates: 'clan.templates.machine'
A template has the form:
```nix
{
description: string; # short summary what the template contains
path: path; # path to the template
}
```
The clan API copies the template from the given 'path'
into a target folder. For example,
`./machines/<machine-name>` for 'machine' templates.
*/
{
lib,
...

View File

@@ -81,14 +81,6 @@ in
directory = mkOption {
type = types.path;
};
relativeDirectory = mkOption {
type = types.str;
internal = true;
description = ''
The relative directory path from the flake root to the clan directory.
Empty string if directory equals the flake root.
'';
};
machines = mkOption {
type = types.attrsOf (submodule ({
options = {

View File

@@ -44,6 +44,12 @@ in
description = ''
List of additionally imported `.nix` expressions.
Supported types:
- **Strings**: Interpreted relative to the 'directory' passed to `lib.clan`.
- **Paths**: should be relative to the current file.
- **Any**: Nix expression must be serializable to JSON.
!!! Note
**The import only happens if the machine is part of the service or role.**
@@ -68,7 +74,7 @@ in
```
'';
default = [ ];
type = types.listOf types.raw;
type = types.listOf types.deferredModule;
};
};
}

View File

@@ -3,7 +3,6 @@
directory,
meta,
}:
# The following is a nixos/darwin module
{
_class,
lib,

View File

@@ -225,9 +225,7 @@ def generate_facts(
raise ClanError(msg)
if not was_regenerated and len(machines) > 0:
pass
# Remove message until facts has been propertly deleted
# log.info("All secrets and facts are already up to date")
log.info("All secrets and facts are already up to date")
return was_regenerated

View File

@@ -243,11 +243,7 @@ API.register(get_system_file)
if "oneOf" not in return_type:
msg = (
f"Return type of function '{name}' is not a union type. Expected a union of Success and Error types."
# If the SuccessData type is unsupported it was dropped by Union narrowing.
# This is probably an antifeature
# Introduced because run_generator wanted to use:
# Callable[[Generator], dict[str, str]]
# In its function signature.
# @DavHau: no idea wy exactly this leads to the "oneOf" ot being present, but this should help
"Hint: When using dataclasses as return types, ensure they don't contain public fields with non-serializable types"
)
raise JSchemaTypeError(msg)

View File

@@ -156,28 +156,14 @@ def vm_state_dir(flake_url: str, vm_name: str) -> Path:
def machines_dir(flake: "Flake") -> Path:
# Determine the base path
if flake.is_local:
base_path = flake.path
else:
store_path = flake.store_path
if store_path is None:
msg = "Invalid flake object. Doesn't have a store path"
raise ClanError(msg)
base_path = Path(store_path)
return flake.path / "machines"
# Get the clan directory configuration from Nix
# This is computed in Nix where store paths are consistent
# Returns "" if no custom directory is set
# Fall back to "" if the option doesn't exist (backwards compatibility)
try:
clan_dir = flake.select("clanInternals.inventoryClass.relativeDirectory")
except ClanError:
# Option doesn't exist in older clan-core versions
# Assume no custom directory
clan_dir = ""
return base_path / clan_dir / "machines"
store_path = flake.store_path
if store_path is None:
msg = "Invalid flake object. Doesn't have a store path"
raise ClanError(msg)
return Path(store_path) / "machines"
def specific_machine_dir(machine: "MachineSpecProtocol") -> Path:

View File

@@ -5,18 +5,15 @@ from pathlib import Path
from typing import TYPE_CHECKING
from unittest.mock import MagicMock, patch
import clan_lib.llm.llm_types
import pytest
from clan_lib.flake.flake import Flake
from clan_lib.llm.llm_types import ModelConfig
from clan_lib.llm.orchestrator import get_llm_turn
from clan_lib.llm.service import create_llm_model, run_llm_service
from clan_lib.service_runner import create_service_manager
if TYPE_CHECKING:
from clan_lib.llm.llm_types import ChatResult
from clan_lib.llm.schemas import SessionState
import platform
from clan_lib.llm.schemas import ChatMessage, SessionState
def get_current_mode(session_state: "SessionState") -> str:
@@ -171,80 +168,28 @@ def llm_service() -> Iterator[None]:
service_manager.stop_service("ollama")
@pytest.mark.service_runner
@pytest.mark.usefixtures("mock_nix_shell", "llm_service")
def test_full_conversation_flow(mock_flake: MagicMock) -> None:
"""Test the complete conversation flow by manually calling get_llm_turn at each step.
def execute_multi_turn_workflow(
user_request: str,
flake: Flake | MagicMock,
conversation_history: list["ChatMessage"] | None = None,
provider: str = "ollama",
session_state: "SessionState | None" = None,
) -> "ChatResult":
"""Execute the multi-turn workflow, auto-executing all pending operations.
This test verifies:
- State transitions through discovery -> readme_fetch -> service_selection -> final_decision
- Each step returns the correct next_action
- Conversation history is preserved across turns
- Session state is correctly maintained
This simulates the behavior of the CLI auto-execute loop in workflow.py.
"""
flake = mock_flake
trace_file = Path("~/.ollama/container_test_llm_trace.json").expanduser()
trace_file.unlink(missing_ok=True) # Start fresh
provider = "ollama"
# Override DEFAULT_MODELS with 4-minute timeouts for container tests
clan_lib.llm.llm_types.DEFAULT_MODELS = {
"ollama": ModelConfig(
name="qwen3:4b-instruct",
provider="ollama",
timeout=300, # set inference timeout to 5 minutes as CI may be slow
temperature=0, # set randomness to 0 for consistent test results
),
}
# ========== STEP 1: Initial request (should return next_action for discovery) ==========
print_separator("STEP 1: Initial Request", char="=", width=80)
result = get_llm_turn(
user_request="What VPN options do I have?",
user_request=user_request,
flake=flake,
conversation_history=conversation_history,
provider=provider, # type: ignore[arg-type]
session_state=session_state,
execute_next_action=False,
trace_file=trace_file,
)
# Should have next_action for discovery phase
assert result.next_action is not None, "Should have next_action for discovery"
assert result.next_action["type"] == "discovery"
assert result.requires_user_response is False
assert len(result.proposed_instances) == 0
assert "pending_discovery" in result.session_state
print(f" Next Action: {result.next_action['type']}")
print(f" Description: {result.next_action['description']}")
print_meta_info(result, turn=1, phase="Initial Request")
# ========== STEP 2: Execute discovery (should return next_action for readme_fetch) ==========
print_separator("STEP 2: Execute Discovery", char="=", width=80)
result = get_llm_turn(
user_request="",
flake=flake,
conversation_history=list(result.conversation_history),
provider=provider, # type: ignore[arg-type]
session_state=result.session_state,
execute_next_action=True,
trace_file=trace_file,
)
# Should have next_action for readme fetch OR a clarifying question
if result.next_action:
assert result.next_action["type"] == "fetch_readmes"
assert "pending_readme_fetch" in result.session_state
print(f" Next Action: {result.next_action['type']}")
print(f" Description: {result.next_action['description']}")
else:
# LLM asked a clarifying question
assert result.requires_user_response is True
assert len(result.assistant_message) > 0
print(f" Assistant Message: {result.assistant_message[:100]}...")
print_meta_info(result, turn=2, phase="Discovery Executed")
# ========== STEP 3: Execute readme fetch (if applicable) ==========
if result.next_action and result.next_action["type"] == "fetch_readmes":
print_separator("STEP 3: Execute Readme Fetch", char="=", width=80)
# Auto-execute any pending operations
while result.next_action:
result = get_llm_turn(
user_request="",
flake=flake,
@@ -252,74 +197,187 @@ def test_full_conversation_flow(mock_flake: MagicMock) -> None:
provider=provider, # type: ignore[arg-type]
session_state=result.session_state,
execute_next_action=True,
trace_file=trace_file,
)
# Should have next_action for service selection
assert result.next_action is not None
assert result.next_action["type"] == "service_selection"
assert "pending_service_selection" in result.session_state
print(f" Next Action: {result.next_action['type']}")
print(f" Description: {result.next_action['description']}")
print_meta_info(result, turn=3, phase="Readme Fetch Executed")
return result
if platform.machine() == "aarch64":
pytest.skip(
"aarch64 detected: skipping readme/service-selection and final step for performance reasons"
)
# ========== STEP 4: Execute service selection ==========
print_separator("STEP 4: Execute Service Selection", char="=", width=80)
result = get_llm_turn(
user_request="I want ZeroTier.",
@pytest.mark.service_runner
@pytest.mark.usefixtures("mock_nix_shell", "llm_service")
def test_full_conversation_flow(mock_flake: MagicMock) -> None:
"""Comprehensive test that exercises the complete conversation flow with the actual LLM service.
This test simulates a realistic multi-turn conversation that covers:
- Discovery phase: Initial request and LLM gathering information
- Service selection phase: User choosing from available options
- Final decision phase: Configuring the selected service with specific parameters
- State transitions: pending_service_selection -> pending_final_decision -> completion
- Conversation history preservation across all turns
- Error handling and edge cases
"""
flake = mock_flake
# ========== TURN 1: Discovery Phase - Initial vague request ==========
print_separator("TURN 1: Discovery Phase", char="=", width=80)
result = execute_multi_turn_workflow(
user_request="What VPN options do I have?",
flake=flake,
provider="ollama",
)
# Verify discovery phase behavior
assert result.requires_user_response is True, (
"Should require user response in discovery"
)
assert len(result.conversation_history) >= 2, (
"Should have user + assistant messages"
)
assert result.conversation_history[0]["role"] == "user"
assert result.conversation_history[0]["content"] == "What VPN options do I have?"
assert result.conversation_history[-1]["role"] == "assistant"
assert len(result.assistant_message) > 0, "Assistant should provide a response"
# After multi-turn execution, we may have either:
# - pending_service_selection (if LLM provided options and is waiting for choice)
# - pending_final_decision (if LLM directly selected a service)
# - no pending state (if LLM asked a clarifying question)
# No instances yet
assert len(result.proposed_instances) == 0
assert result.error is None
print_chat_exchange(
"What VPN options do I have?", result.assistant_message, result.session_state
)
print_meta_info(result, turn=1, phase="Discovery")
# ========== TURN 2: Service Selection Phase - User makes a choice ==========
print_separator("TURN 2: Service Selection", char="=", width=80)
user_msg_2 = "I'll use ZeroTier please"
result = execute_multi_turn_workflow(
user_request=user_msg_2,
flake=flake,
conversation_history=list(result.conversation_history),
provider="ollama",
session_state=result.session_state,
)
# Verify conversation history growth and preservation
assert len(result.conversation_history) > 2, "History should grow"
assert result.conversation_history[0]["content"] == "What VPN options do I have?"
assert result.conversation_history[2]["content"] == "I'll use ZeroTier please"
# Should either ask for configuration details or provide direct config
# Most likely will ask for more details (pending_final_decision)
if result.requires_user_response:
# LLM is asking for configuration details
assert len(result.assistant_message) > 0
# Should transition to final decision phase
if "pending_final_decision" not in result.session_state:
# Might still be in service selection asking clarifications
assert "pending_service_selection" in result.session_state
else:
# LLM provided configuration immediately (less likely)
assert len(result.proposed_instances) > 0
assert result.proposed_instances[0]["module"]["name"] == "zerotier"
print_chat_exchange(user_msg_2, result.assistant_message, result.session_state)
print_meta_info(result, turn=2, phase="Service Selection")
# ========== Continue conversation until we reach final decision or completion ==========
max_turns = 10
turn_count = 2
while result.requires_user_response and turn_count < max_turns:
turn_count += 1
# Determine appropriate response based on current state
if "pending_service_selection" in result.session_state:
# Still selecting service
user_request = "Yes, ZeroTier"
phase = "Service Selection (continued)"
elif "pending_final_decision" in result.session_state:
# Configuring the service
user_request = "Set up gchq-local as controller, qube-email as moon, and wintux as peer"
phase = "Final Configuration"
else:
# Generic continuation
user_request = "Yes, that sounds good. Use gchq-local as controller."
phase = "Continuing Conversation"
print_separator(f"TURN {turn_count}: {phase}", char="=", width=80)
result = execute_multi_turn_workflow(
user_request=user_request,
flake=flake,
conversation_history=list(result.conversation_history),
provider=provider, # type: ignore[arg-type]
provider="ollama",
session_state=result.session_state,
execute_next_action=True,
trace_file=trace_file,
)
# Should either have next_action for final_decision OR a clarifying question
if result.next_action:
assert result.next_action["type"] == "final_decision"
assert "pending_final_decision" in result.session_state
print(f" Next Action: {result.next_action['type']}")
print(f" Description: {result.next_action['description']}")
else:
# LLM asked a clarifying question during service selection
assert result.requires_user_response is True
assert len(result.assistant_message) > 0
print(f" Assistant Message: {result.assistant_message[:100]}...")
print_meta_info(result, turn=4, phase="Service Selection Executed")
# Verify conversation history continues to grow
assert len(result.conversation_history) == (turn_count * 2), (
f"History should have {turn_count * 2} messages (turn {turn_count})"
)
# ========== STEP 5: Execute final decision (if applicable) ==========
if result.next_action and result.next_action["type"] == "final_decision":
print_separator("STEP 5: Execute Final Decision", char="=", width=80)
result = get_llm_turn(
user_request="",
flake=flake,
conversation_history=list(result.conversation_history),
provider=provider, # type: ignore[arg-type]
session_state=result.session_state,
execute_next_action=True,
trace_file=trace_file,
)
# Verify history preservation
assert (
result.conversation_history[0]["content"] == "What VPN options do I have?"
)
# Should either have proposed_instances OR ask a clarifying question
if result.proposed_instances:
assert len(result.proposed_instances) > 0
assert result.next_action is None
print(f" Proposed Instances: {len(result.proposed_instances)}")
for inst in result.proposed_instances:
print(f" - {inst['module']['name']}")
else:
# LLM asked a clarifying question
assert result.requires_user_response is True
assert len(result.assistant_message) > 0
print(f" Assistant Message: {result.assistant_message[:100]}...")
print_meta_info(result, turn=5, phase="Final Decision Executed")
print_chat_exchange(
user_request, result.assistant_message, result.session_state
)
print_meta_info(result, turn=turn_count, phase=phase)
# Verify conversation history has grown
assert len(result.conversation_history) > 0
assert result.conversation_history[0]["content"] == "What VPN options do I have?"
# Check for completion
if not result.requires_user_response:
print_separator("CONVERSATION COMPLETED", char="=", width=80)
break
# ========== Final Verification ==========
print_separator("FINAL VERIFICATION", char="=", width=80)
assert turn_count < max_turns, f"Conversation took too many turns ({turn_count})"
# If conversation completed, verify we have valid configuration
if not result.requires_user_response:
assert len(result.proposed_instances) > 0, (
"Should have at least one proposed instance"
)
instance = result.proposed_instances[0]
# Verify instance structure
assert "module" in instance
assert "name" in instance["module"]
assert instance["module"]["name"] in [
"zerotier",
"wireguard",
"yggdrasil",
"mycelium",
]
# Should not be in pending state anymore
assert "pending_service_selection" not in result.session_state
assert "pending_final_decision" not in result.session_state
assert result.error is None, f"Should not have error: {result.error}"
print_separator("FINAL SUMMARY", char="-", width=80, double=False)
print(" Status: SUCCESS")
print(f" Module Name: {instance['module']['name']}")
print(f" Total Turns: {turn_count}")
print(f" Final History Length: {len(result.conversation_history)} messages")
if "roles" in instance:
roles_list = ", ".join(instance["roles"].keys())
print(f" Configuration Roles: {roles_list}")
print(" Errors: None")
print("-" * 80)
else:
# Conversation didn't complete but should have made progress
assert len(result.conversation_history) > 2
assert result.error is None
print_separator("FINAL SUMMARY", char="-", width=80, double=False)
print(" Status: IN PROGRESS")
print(f" Total Turns: {turn_count}")
print(f" Current State: {list(result.session_state.keys())}")
print(f" History Length: {len(result.conversation_history)} messages")
print("-" * 80)

View File

@@ -149,7 +149,6 @@ def call_openai_api(
trace_file: Path | None = None,
stage: str = "unknown",
trace_metadata: dict[str, Any] | None = None,
temperature: float | None = None,
) -> OpenAIChatCompletionResponse:
"""Call the OpenAI API for chat completion.
@@ -161,7 +160,6 @@ def call_openai_api(
trace_file: Optional path to write trace entries for debugging
stage: Stage name for trace entries (default: "unknown")
trace_metadata: Optional metadata to include in trace entries
temperature: Sampling temperature (default: None = use API default)
Returns:
The parsed JSON response from the API
@@ -180,8 +178,6 @@ def call_openai_api(
"messages": messages,
"tools": list(tools),
}
if temperature is not None:
payload["temperature"] = temperature
_debug_log_request("openai", messages, tools)
url = "https://api.openai.com/v1/chat/completions"
headers = {
@@ -260,7 +256,6 @@ def call_claude_api(
trace_file: Path | None = None,
stage: str = "unknown",
trace_metadata: dict[str, Any] | None = None,
temperature: float | None = None,
) -> OpenAIChatCompletionResponse:
"""Call the Claude API (via OpenAI-compatible endpoint) for chat completion.
@@ -273,7 +268,6 @@ def call_claude_api(
trace_file: Optional path to write trace entries for debugging
stage: Stage name for trace entries (default: "unknown")
trace_metadata: Optional metadata to include in trace entries
temperature: Sampling temperature (default: None = use API default)
Returns:
The parsed JSON response from the API
@@ -299,8 +293,6 @@ def call_claude_api(
"messages": messages,
"tools": list(tools),
}
if temperature is not None:
payload["temperature"] = temperature
_debug_log_request("claude", messages, tools)
url = f"{base_url}chat/completions"
@@ -380,7 +372,6 @@ def call_ollama_api(
stage: str = "unknown",
max_tokens: int | None = None,
trace_metadata: dict[str, Any] | None = None,
temperature: float | None = None,
) -> OllamaChatResponse:
"""Call the Ollama API for chat completion.
@@ -393,7 +384,6 @@ def call_ollama_api(
stage: Stage name for trace entries (default: "unknown")
max_tokens: Maximum number of tokens to generate (default: None = unlimited)
trace_metadata: Optional metadata to include in trace entries
temperature: Sampling temperature (default: None = use API default)
Returns:
The parsed JSON response from the API
@@ -409,14 +399,9 @@ def call_ollama_api(
"tools": list(tools),
}
# Add options for max_tokens and temperature if specified
options: dict[str, int | float] = {}
# Add max_tokens limit if specified
if max_tokens is not None:
options["num_predict"] = max_tokens
if temperature is not None:
options["temperature"] = temperature
if options:
payload["options"] = options # type: ignore[typeddict-item]
payload["options"] = {"num_predict": max_tokens} # type: ignore[typeddict-item]
_debug_log_request("ollama", messages, tools)
url = "http://localhost:11434/api/chat"

View File

@@ -73,21 +73,19 @@ class ModelConfig:
name: The model identifier/name
provider: The LLM provider
timeout: Request timeout in seconds (default: 120)
temperature: Sampling temperature for the model (default: None = use API default)
"""
name: str
provider: Literal["openai", "ollama", "claude"]
timeout: int = 120
temperature: float | None = None
# Default model configurations for each provider
DEFAULT_MODELS: dict[Literal["openai", "ollama", "claude"], ModelConfig] = {
"openai": ModelConfig(name="gpt-4o", provider="openai", timeout=60),
"claude": ModelConfig(name="claude-sonnet-4-5", provider="claude", timeout=60),
"ollama": ModelConfig(name="qwen3:4b-instruct", provider="ollama", timeout=180),
"ollama": ModelConfig(name="qwen3:4b-instruct", provider="ollama", timeout=120),
}

View File

@@ -100,7 +100,6 @@ def get_llm_discovery_phase(
trace_file=trace_file,
stage="discovery",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
openai_response, provider="openai"
@@ -114,7 +113,6 @@ def get_llm_discovery_phase(
trace_file=trace_file,
stage="discovery",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
claude_response, provider="claude"
@@ -129,7 +127,6 @@ def get_llm_discovery_phase(
stage="discovery",
max_tokens=300, # Limit output for discovery phase (get_readme calls or short question)
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_ollama_response(
ollama_response, provider="ollama"
@@ -252,7 +249,6 @@ def get_llm_service_selection(
trace_file=trace_file,
stage="select_service",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
openai_response, provider="openai"
@@ -266,7 +262,6 @@ def get_llm_service_selection(
trace_file=trace_file,
stage="select_service",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
claude_response, provider="claude"
@@ -281,7 +276,6 @@ def get_llm_service_selection(
stage="select_service",
max_tokens=600, # Allow space for summary
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_ollama_response(
ollama_response, provider="ollama"
@@ -453,7 +447,6 @@ def get_llm_final_decision(
trace_file=trace_file,
stage="final_decision",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
openai_response, provider="openai"
@@ -469,7 +462,6 @@ def get_llm_final_decision(
trace_file=trace_file,
stage="final_decision",
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_openai_response(
claude_response, provider="claude"
@@ -485,7 +477,6 @@ def get_llm_final_decision(
stage="final_decision",
max_tokens=500, # Limit output to prevent excessive verbosity
trace_metadata=trace_metadata,
temperature=model_config.temperature,
)
function_calls, message_content = parse_ollama_response(
ollama_response, provider="ollama"

View File

@@ -231,7 +231,6 @@ class ChatCompletionRequestPayload(TypedDict, total=False):
messages: list[ChatMessage]
tools: list[ToolDefinition]
stream: NotRequired[bool]
temperature: NotRequired[float]
@dataclass(frozen=True)

View File

@@ -28,11 +28,13 @@ class InventoryInstanceRoleMachine(TypedDict):
InventoryInstanceRoleExtramodulesType = list[Unknown]
InventoryInstanceRoleMachinesType = dict[str, InventoryInstanceRoleMachine]
InventoryInstanceRoleSettingsType = Unknown
InventoryInstanceRoleTagsType = dict[str, Any] | list[str]
class InventoryInstanceRole(TypedDict):
extraModules: NotRequired[InventoryInstanceRoleExtramodulesType]
machines: NotRequired[InventoryInstanceRoleMachinesType]
settings: NotRequired[InventoryInstanceRoleSettingsType]
tags: NotRequired[InventoryInstanceRoleTagsType]

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Literal, TypedDict
from clan_lib.cmd import Log, RunOpts, run
from clan_lib.cmd import RunOpts, run
from clan_lib.errors import ClanError
if TYPE_CHECKING:
@@ -70,7 +70,7 @@ class SystemdUserService:
"""Run systemctl command with --user flag."""
return run(
["systemctl", "--user", action, f"{service_name}.service"],
RunOpts(check=False, log=Log.NONE),
RunOpts(check=False),
)
def _get_property(self, service_name: str, prop: str) -> str:
@@ -240,15 +240,11 @@ class SystemdUserService:
service_name = self._service_name(name)
result = self._systemctl("stop", service_name)
if (
result.returncode != 0
and "not loaded" not in result.stderr.lower()
and "does not exist" not in result.stderr.lower()
):
if result.returncode != 0 and "not loaded" not in result.stderr.lower():
msg = f"Failed to stop service: {result.stderr}"
raise ClanError(msg)
result = self._systemctl("disable", service_name)
self._systemctl("disable", service_name) # Ignore errors for transient units
unit_file = self._unit_file_path(name)
if unit_file.exists():

View File

@@ -241,11 +241,6 @@ def generate_dataclass(
# If we are at the top level, and the attribute name is not explicitly included we only do shallow
field_name = prop.replace("-", "_")
# Skip "extraModules"
# TODO: Introduce separate model that is tied to the serialization format
if "extraModules" in field_name:
continue
# if len(attr_path) == 0 and prop in shallow_attrs:
# field_def = field_name, "dict[str, Any]"
# fields_with_default.append(field_def)