Add Prometheus alerts for InfluxDB
Change-Id: I564da44ba59653a43dd9bb5d727f4453ba90fa48
diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml
new file mode 100644
index 0000000..5404da6
--- /dev/null
+++ b/influxdb/meta/prometheus.yml
@@ -0,0 +1,66 @@
+{%- if pillar.influxdb.server is defined %}
+{%- from "influxdb/map.jinja" import server with context %}
+{#- Prometheus alert rules for InfluxDB. Rules are emitted only when the influxdb server pillar exists and the server is enabled; the HTTP rules additionally require server.http.enabled. #}
+
+{%- if server.get('enabled', False) %}
+server:
+  alert:
+    # Fires when procstat_running for process_name "influxdb" reports 0.
+    ProcstatRunningInfluxdb:
+      if: >-
+        procstat_running{process_name="influxdb"} == 0
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb service is down'
+        description: 'Influxdb service is down on node {{ $labels.host }}'
+      {% endraw %}
+{%- if server.get('http', {}).get('enabled', False) %}
+{#- prometheus_server is not imported by this template. Use it when the rendering context supplies it; otherwise fall back to the prometheus:server pillar so the threshold lookups below do not fail on an undefined name. #}
+{%- if prometheus_server is not defined %}
+{%- set prometheus_server = salt['pillar.get']('prometheus:server', {}) %}
+{%- endif %}
+    # Client-error requests as a percentage of all HTTP requests (2m rates).
+    # Threshold defaults to 5; override via alert.InfluxdbHTTPClientError.var.threshold.
+    InfluxdbHTTPClientError:
+      {%- set influx_http_client_error_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPClientError', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb number of client errors is high'
+        description: '{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_client_error_threshold }}).'
+    # Failed point writes relative to successful point writes (2m rates); threshold defaults to 5.
+    # NOTE(review): the denominator is pointsWrittenOK only, not OK + Fail - confirm this matches the "% of written points" wording.
+    InfluxdbHTTPPointsWrittenFail:
+      {%- set influx_http_points_written_fail_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenFail', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_fail_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb too many failed writes'
+        description: '{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_fail_threshold }}).'
+    # Dropped point writes relative to successful point writes (2m rates); threshold defaults to 5.
+    # NOTE(review): same denominator caveat as InfluxdbHTTPPointsWrittenFail - confirm intended.
+    InfluxdbHTTPPointsWrittenDropped:
+      {%- set influx_http_points_written_dropped_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenDropped', {}).get('var', {}).get('threshold', 5) %}
+      if: >-
+        rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_dropped_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'Influxdb too many dropped writes'
+        description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).'
+{%- endif %}
+{%- endif %}
+{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index b99c6a7..e85fc7b 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -13,3 +13,5 @@
enabled: true
telegraf:
enabled: true
+ prometheus:
+ enabled: true