From 21251292d734477754463af3104b58b3e4fb957b Mon Sep 17 00:00:00 2001
From: Bartosz Kupidura
Date: Wed, 21 Jun 2017 16:01:45 +0200
Subject: [PATCH] Add alerts for influxdb

Change-Id: I564da44ba59653a43dd9bb5d727f4453ba90fa48
---
 influxdb/meta/prometheus.yml | 54 ++++++++++++++++++++++++++++++++++++
 metadata/service/support.yml |  2 ++
 2 files changed, 56 insertions(+)
 create mode 100644 influxdb/meta/prometheus.yml

Review notes (this region is not part of the commit message or the diff):

* Purpose: adds influxdb/meta/prometheus.yml, a Jinja-templated YAML
  fragment that defines four alert rules under server.alert, and enables
  the "prometheus" entry in metadata/service/support.yml.
* Layout: line breaks and indentation were restored. The 2-space nesting
  (server > alert > rule name > if/labels/annotations) is a
  reconstruction; the added-line counts match the hunk headers.
* The three HTTP rules are emitted only when server.http.enabled is true;
  the whole fragment only when pillar.influxdb.server is defined and
  server.enabled is true.
* Each HTTP rule reads its threshold from
  prometheus_server.alert.<RuleName>.var.threshold, defaulting to 5.
  NOTE(review): this template imports only "server" from
  influxdb/map.jinja; "prometheus_server" must be supplied by whatever
  renders this file - confirm.
* "{% raw %}" opens before "labels:" and is closed inside "description"
  (or after it for the first rule) so that "{{ $labels.host }}" and
  "{{ printf ... $value }}" reach the output unrendered, while the
  threshold value is still interpolated by Jinja.
* NOTE(review): the PointsWrittenFail/PointsWrittenDropped expressions
  divide by pointsWrittenOK only, whereas the descriptions say
  "% of written points" - confirm the intended denominator.
* NOTE(review): the first rule matches process_name="influxdb"; verify
  this equals the process_name label actually exported for the daemon.

diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml
new file mode 100644
index 0000000..5404da6
--- /dev/null
+++ b/influxdb/meta/prometheus.yml
@@ -0,0 +1,54 @@
+{%- if pillar.influxdb.server is defined %}
+{%- from "influxdb/map.jinja" import server with context %}
+
+{%- if server.get('enabled', False) %}
+server:
+  alert:
+    ProcstatRunningInfluxdb:
+      if: >-
+        procstat_running{process_name="influxdb"} == 0
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb service is down'
+        description: 'Influxdb service is down on node {{ $labels.host }}'
+      {% endraw %}
+{%- if server.get('http', {}).get('enabled', False) %}
+    InfluxdbHTTPClientError:
+      {%- set influx_http_client_error_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPClientError', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb number of client errors is high'
+        description: '{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_client_error_threshold }}).'
+    InfluxdbHTTPPointsWrittenFail:
+      {%- set influx_http_points_written_fail_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenFail', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_fail_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb too many failed writes'
+        description: '{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_fail_threshold }}).'
+    InfluxdbHTTPPointsWrittenDropped:
+      {%- set influx_http_points_written_dropped_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenDropped', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_dropped_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb too many dropped writes'
+        description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).'
+{%- endif %}
+{%- endif %}
+{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index b99c6a7..e85fc7b 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -13,3 +13,5 @@ parameters:
         enabled: true
       telegraf:
         enabled: true
+      prometheus:
+        enabled: true
-- 
2.32.7