{%- if pillar.influxdb.server is defined %}
-{%- from "influxdb/map.jinja" import server with context %}
+{%- from "influxdb/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
server:
alert:
- ProcstatRunningInfluxdb:
+{%- if server.get('http', {}).get('enabled', False) %}
+ InfluxdbDown:
if: >-
- procstat_running{process_name="influxdb"} == 0
- {% raw %}
+ influxdb_up != 1
labels:
severity: warning
service: influxdb
annotations:
- summary: 'Influxdb service is down'
- description: 'Influxdb service is down on node {{ $labels.host }}'
- {% endraw %}
-{%- if server.get('http', {}).get('enabled', False) %}
- InfluxdbHTTPClientError:
- {%- set influx_http_client_error_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPClientError', {}).get('var', {}).get('threshold', 5) %}
+ {%- raw %}
+ summary: 'InfluxDB service down'
+ description: 'InfluxDB service is down on node {{ $labels.host }}'
+ {%- endraw %}
+ InfluxdbSeriesNumberHigh:
+ {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
+ if: >-
+ influxdb_database_numSeries >= {{ influx_max_series_threshold }}
+ labels:
+ severity: warning
+ service: influxdb
+ annotations:
+ {% raw %}
+ summary: 'InfluxDB high number of series for {{ $labels.database }}'
+ description: 'The InfluxDB {{ $labels.database }} database is getting close to the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ influx_max_series_threshold }}).'
+ InfluxdbSeriesNumberTooHigh:
+ if: >-
+ influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
+ labels:
+ severity: critical
+ service: influxdb
+ annotations:
+ {% raw %}
+ summary: 'InfluxDB too many series for {{ $labels.database }}'
+ description: 'The InfluxDB {{ $labels.database }} database has exceeded the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ server.data.max_series_per_database }}).'
+ InfluxdbHTTPClientErrors:
+ {%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
if: >-
rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}
{% raw %}
summary: 'Influxdb number of client errors is high'
description: '{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_client_error_threshold }}).'
InfluxdbHTTPPointsWrittenFail:
- {%- set influx_http_points_written_fail_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenFail', {}).get('var', {}).get('threshold', 5) %}
+ {%- set influx_http_points_written_fail_threshold = monitoring.failed_points_percentage %}
if: >-
rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_fail_threshold }}
{% raw %}
summary: 'Influxdb too many failed writes'
description: '{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_fail_threshold }}).'
InfluxdbHTTPPointsWrittenDropped:
- {%- set influx_http_points_written_dropped_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenDropped', {}).get('var', {}).get('threshold', 5) %}
+ {%- set influx_http_points_written_dropped_threshold = monitoring.dropped_points_percentage %}
if: >-
rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_dropped_threshold }}
{% raw %}