Merge "Add monitoring of the swap usage"
diff --git a/linux/files/grafana_dashboards/system_prometheus.json b/linux/files/grafana_dashboards/system_prometheus.json
old mode 100644
new mode 100755
index 735155b..da11d16
--- a/linux/files/grafana_dashboards/system_prometheus.json
+++ b/linux/files/grafana_dashboards/system_prometheus.json
@@ -98,6 +98,7 @@
"dashes": false,
"datasource": null,
"fill": 1,
+ "height": "",
"id": 1,
"legend": {
"avg": false,
@@ -352,7 +353,19 @@
"show": true
}
]
- },
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "General",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
{
"aliasColors": {},
"bars": false,
@@ -652,7 +665,7 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
- "title": "General",
+ "title": "Processes",
"titleSize": "h6"
},
{
@@ -675,7 +688,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
- "thresholdMarkers": true
+ "thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 11,
@@ -753,7 +766,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
- "thresholdMarkers": true
+ "thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 12,
@@ -1535,9 +1548,269 @@
"showTitle": true,
"title": "Network",
"titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": false
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "swap_used_percent{host=\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "Used",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "decimals": null,
+ "fill": 0,
+ "id": 17,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 5,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "swap_used{host=\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "used",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "swap_free{host=\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "free",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 5,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(swap_in{host=\"$host\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "in",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "irate(swap_out{host=\"$host\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "out",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "I/O",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "Swap",
+ "titleSize": "h6"
}
],
"schemaVersion": 14,
+ "sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
@@ -1554,6 +1827,7 @@
"options": [],
"query": "label_values(cpu_usage_idle,host)",
"refresh": 1,
+ "refresh_on_load": true,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
@@ -1574,6 +1848,7 @@
"options": [],
"query": "query_result(diskio_read_bytes{host=\"$host\"})",
"refresh": 1,
+ "refresh_on_load": true,
"regex": "/name=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1594,6 +1869,7 @@
"options": [],
"query": "query_result(disk_free{host=\"$host\"})",
"refresh": 1,
+ "refresh_on_load": true,
"regex": "/path=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1614,6 +1890,7 @@
"options": [],
"query": "query_result(net_bytes_recv{host=\"$host\"})",
"refresh": 1,
+ "refresh_on_load": true,
"regex": "/interface=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1655,5 +1932,5 @@
},
"timezone": "browser",
"title": "System",
- "version": 31
+ "version": 32
}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 485d4c0..771e3fe 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -70,3 +70,33 @@
annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
+ SystemSwapUsed:
+ {%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
+ if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
+ {% raw %}
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: 'Swap usage too high on {{ $labels.host }}'
+ description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }}%)'
+ SystemSwapIn:
+ {%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
+ if: rate(swap_in[2m]) > {{ swap_in_threshold }}
+ {% raw %}
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: 'Swap input throughput too high on {{ $labels.host }}'
+ description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_in_threshold }}B/s)'
+ SystemSwapOut:
+ {%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
+ if: rate(swap_out[2m]) > {{ swap_out_threshold }}
+ {% raw %}
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: 'Swap output throughput too high on {{ $labels.host }}'
+ description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_out_threshold }}B/s)'
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index bc689ae..e9d604f 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -9,4 +9,5 @@
net:
mem:
processes:
+ swap:
system: