fix template
This commit is contained in:
24
clanServices/monitoring/alert-rules.nix
Normal file
24
clanServices/monitoring/alert-rules.nix
Normal file
@@ -0,0 +1,24 @@
|
||||
# Alert rule definitions, written as a compact attribute set and expanded
# into a list of rule attrsets (`alert`, `expr`, `for`, `labels`,
# `annotations`). The attribute name of each entry becomes the alert name.
{ lib }:
let
  # Expand one `<name> = { condition; description; time ? "2m"; }` entry
  # into a full rule attrset.
  toRule = alertName: spec: {
    alert = alertName;
    expr = spec.condition;
    # Entries that do not set `time` fall back to "2m".
    for = spec.time or "2m";
    labels = { };
    annotations = {
      description = spec.description;
    };
  };

  alerts = {

    # TODO Remove this alert, just for testing
    "Filesystem > = 10%" = {
      condition = ''disk_used_percent{fstype!~"tmpfs|vfat|devtmpfs|efivarfs"} > 10'';
      time = "1m";
      description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 90% space left on its filesystem.";
    };

    filesystem_full_80percent = {
      condition = ''disk_used_percent{fstype!~"tmpfs|vfat|devtmpfs|efivarfs"} > 80'';
      time = "1m";
      description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem.";
    };
  };
in
lib.mapAttrsToList toRule alerts
|
||||
@@ -90,10 +90,10 @@ tr:hover {
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>System Overview</h1>
|
||||
<h1>Clan Status</h1>
|
||||
|
||||
|
||||
<h2>Instance Status</h2>
|
||||
<h2>Instances</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -106,8 +106,9 @@ tr:hover {
|
||||
</thead>
|
||||
<tbody>
|
||||
{{ range query "up" | sortByLabel "instance" }}
|
||||
{{ $hostname := reReplaceAll "\\..*" "" .Labels.instance }}
|
||||
<tr>
|
||||
<td>{{ .Labels.instance }}</td>
|
||||
<td>{{ $hostname }}</td>
|
||||
<td>
|
||||
{{ if eq .Value 1.0 }}
|
||||
<span class="status-up">UP</span>
|
||||
@@ -116,7 +117,7 @@ tr:hover {
|
||||
{{ end }}
|
||||
</td>
|
||||
<td>
|
||||
{{ $cpuQuery := query (printf "100 - (avg by(host) (rate(cpu_seconds_total{mode=\"idle\",host=\"%s\"}[5m])) * 100)" .Labels.instance) }}
|
||||
{{ $cpuQuery := query (printf "100 - cpu_usage_idle{cpu=\"cpu-total\",host=\"%s\"}" $hostname) }}
|
||||
{{ if $cpuQuery }}
|
||||
{{ with $cpuQuery | first }}
|
||||
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
|
||||
@@ -126,7 +127,7 @@ tr:hover {
|
||||
{{ end }}
|
||||
</td>
|
||||
<td>
|
||||
{{ $memQuery := query (printf "(1 - (mem_available_bytes{host=\"%s\"} / mem_total_bytes{host=\"%s\"})) * 100" .Labels.instance .Labels.instance) }}
|
||||
{{ $memQuery := query (printf "(1 - (mem_available{host=\"%s\"} / mem_total{host=\"%s\"})) * 100" $hostname $hostname) }}
|
||||
{{ if $memQuery }}
|
||||
{{ with $memQuery | first }}
|
||||
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
|
||||
@@ -136,7 +137,7 @@ tr:hover {
|
||||
{{ end }}
|
||||
</td>
|
||||
<td>
|
||||
{{ $diskQuery := query (printf "(1 - (disk_free_bytes{host=\"%s\",path=\"/\"} / disk_total_bytes{host=\"%s\",path=\"/\"})) * 100" .Labels.instance .Labels.instance) }}
|
||||
{{ $diskQuery := query (printf "(1 - (disk_free{host=\"%s\",path=\"/\"} / disk_total{host=\"%s\",path=\"/\"})) * 100" $hostname $hostname) }}
|
||||
{{ if $diskQuery }}
|
||||
{{ with $diskQuery | first }}
|
||||
<span class="metric-value">{{ . | value | printf "%.1f" }}%</span>
|
||||
@@ -161,7 +162,7 @@ tr:hover {
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{{ range query "topk(5, 100 - (avg by(host) (rate(cpu_seconds_total{mode=\"idle\"}[5m])) * 100))" }}
|
||||
{{ range query "topk(5, 100 - cpu_usage_idle{cpu=\"cpu-total\"})" }}
|
||||
<tr>
|
||||
<td>{{ .Labels.host }}</td>
|
||||
<td><span class="metric-value">{{ .Value | printf "%.1f" }}%</span></td>
|
||||
@@ -181,7 +182,7 @@ tr:hover {
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{{ range query "topk(5, (1 - (mem_available_bytes / mem_total_bytes)) * 100)" }}
|
||||
{{ range query "topk(5, (1 - (mem_available / mem_total)) * 100)" }}
|
||||
<tr>
|
||||
<td>{{ .Labels.host }}</td>
|
||||
<td><span class="metric-value">{{ .Value | printf "%.1f" }}%</span></td>
|
||||
@@ -197,18 +198,16 @@ tr:hover {
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Alert</th>
|
||||
<th>Host</th>
|
||||
<th>Severity</th>
|
||||
<th>Alert</th>
|
||||
<th>Value</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{{ range . }}
|
||||
<tr>
|
||||
<td>{{ .Labels.alertname }}</td>
|
||||
<td>{{ or .Labels.host .Labels.instance }}</td>
|
||||
<td>{{ .Labels.severity }}</td>
|
||||
<td>{{ .Labels.alertname }}</td>
|
||||
<td>{{ .Value }}</td>
|
||||
</tr>
|
||||
{{ end }}
|
||||
|
||||
@@ -39,6 +39,20 @@
|
||||
"--web.console.templates=${./prometheus-consoles}"
|
||||
"--web.console.libraries=${./prometheus-consoles}"
|
||||
];
|
||||
|
||||
# A single generated rule file containing one group, "alerting-rules",
# whose rules are whatever ./alert-rules.nix evaluates to when given `lib`.
ruleFiles =
  let
    # Serialized with builtins.toJSON; the store file is named *.yml.
    ruleGroups = {
      groups = [
        {
          name = "alerting-rules";
          rules = import ./alert-rules.nix { inherit lib; };
        }
      ];
    };
  in
  [ (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON ruleGroups)) ];
|
||||
|
||||
scrapeConfigs = [
|
||||
{
|
||||
job_name = "telegraf";
|
||||
|
||||
Reference in New Issue
Block a user