Merge "Add prometheus main dashboard"
diff --git a/influxdb/files/grafana_dashboards/influxdb_prometheus.json b/influxdb/files/grafana_dashboards/influxdb_prometheus.json
index 6c6c61d..9b3d0fa 100644
--- a/influxdb/files/grafana_dashboards/influxdb_prometheus.json
+++ b/influxdb/files/grafana_dashboards/influxdb_prometheus.json
@@ -13,6 +13,334 @@
"rows": [
{
"collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 7,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count_scalar(influxdb_up)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "Total instances",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 8,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count_scalar(influxdb_up == 1)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "Running instances",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 9,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count_scalar(influxdb_up == 0)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "Stopped instances",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 10,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "min(haproxy_active_servers{proxy=~\"influxdb-backend\", sv=\"BACKEND\"})",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "InfluxDB backends",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "General",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
"height": "250px",
"panels": [
{
@@ -348,7 +676,7 @@
}
],
"thresholds": "",
- "title": "Go routines",
+ "title": "Goroutines",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -562,7 +890,7 @@
"multi": false,
"name": "server",
"options": [],
- "query": "label_values(influxdb_httpd_authFail, host)",
+ "query": "label_values(influxdb_up, host)",
"refresh": 1,
"regex": "",
"sort": 1,
diff --git a/influxdb/map.jinja b/influxdb/map.jinja
index a2a181c..8cf164d 100644
--- a/influxdb/map.jinja
+++ b/influxdb/map.jinja
@@ -43,6 +43,8 @@
'dropped_points_percentage': 5,
'max_relay_buffer_percentage': 70,
'relay_failed_requests_percentage': 5,
+ 'service_failed_warning_threshold_percent': 0.3,
+ 'service_failed_critical_threshold_percent': 0.6,
},
}, grain='os_family', merge=salt['pillar.get']('influxdb:monitoring')) %}
diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml
index c266dfc..affbd77 100644
--- a/influxdb/meta/prometheus.yml
+++ b/influxdb/meta/prometheus.yml
@@ -5,17 +5,44 @@
server:
alert:
{%- if server.get('http', {}).get('enabled', False) %}
- InfluxdbDown:
+ InfluxdbInfo:
if: >-
- influxdb_up != 1
+ influxdb_up == 0
labels:
- severity: warning
+ severity: info
service: influxdb
annotations:
{%- raw %}
summary: 'InfluxDB service down'
description: 'InfluxDB service is down on node {{ $labels.host }}'
{%- endraw %}
+ InfluxdbWarning:
+ if: >-
+ count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
+ labels:
+ severity: warning
+ service: influxdb
+ annotations:
+ summary: 'More than {{monitoring.service_failed_warning_threshold_percent*100}}% of InfluxDB services are down'
+ description: 'More than {{monitoring.service_failed_warning_threshold_percent*100}}% of InfluxDB services are down'
+ InfluxdbCritical:
+ if: >-
+ count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
+ labels:
+ severity: critical
+ service: influxdb
+ annotations:
+ summary: 'More than {{monitoring.service_failed_critical_threshold_percent*100}}% of InfluxDB services are down'
+ description: 'More than {{monitoring.service_failed_critical_threshold_percent*100}}% of InfluxDB services are down'
+ InfluxdbDown:
+ if: >-
+ count(influxdb_up == 0) == count(influxdb_up)
+ labels:
+ severity: down
+ service: influxdb
+ annotations:
+ summary: 'All InfluxDB services are down'
+ description: 'All InfluxDB services are down'
InfluxdbSeriesNumberHigh:
{%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
if: >-