{#- InfluxDB / influxdb-relay alert metadata. Everything guarded by this `if` renders only when pillar.influxdb.server is defined; server, relay and monitoring settings come from influxdb/map.jinja. #}
1 {%- if pillar.influxdb.server is defined %}
2 {%- from "influxdb/map.jinja" import server, relay, monitoring with context %}
4 {%- if server.get('enabled', False) or relay.get('enabled') %}
7 {%- if server.get('http', {}).get('enabled', False) %}
16 summary: 'InfluxDB service down'
17 description: 'InfluxDB service is down on node {{ $labels.host }}'
21 count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
{# Format the percentage with %g so binary-float noise (0.3 * 100 == 30.000000000000004) never reaches the alert text. #}
26 summary: 'More than {{ "%g"|format(monitoring.service_failed_warning_threshold_percent|float * 100) }}% of InfluxDB services are down'
27 description: 'More than {{ "%g"|format(monitoring.service_failed_warning_threshold_percent|float * 100) }}% of InfluxDB services are down'
30 count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
{# Format the percentage with %g so binary-float noise (e.g. 0.55 * 100 == 55.00000000000001) never reaches the alert text. #}
35 summary: 'More than {{ "%g"|format(monitoring.service_failed_critical_threshold_percent|float * 100) }}% of InfluxDB services are down'
36 description: 'More than {{ "%g"|format(monitoring.service_failed_critical_threshold_percent|float * 100) }}% of InfluxDB services are down'
39 count(influxdb_up == 0) == count(influxdb_up)
44 summary: 'All InfluxDB services are down'
45 description: 'All InfluxDB services are down'
46 InfluxdbSeriesNumberHigh:
47 {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
49 influxdb_database_numSeries >= {{ influx_max_series_threshold }}
55 summary: 'InfluxDB high number of series for {{ $labels.database }}'
56 description: 'The InfluxDB {{ $labels.database }} database is getting close to the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ influx_max_series_threshold }}).'
57 InfluxdbSeriesNumberTooHigh:
59 influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
65 summary: 'InfluxDB too many series for {{ $labels.database }}'
66 description: 'The InfluxDB {{ $labels.database }} database has exceeded the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ server.data.max_series_per_database }}).'
67 InfluxdbHTTPClientErrors:
68 {%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
70 rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}
76 summary: 'Influxdb number of client errors is high'
77 description: '{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_client_error_threshold }}).'
78 InfluxdbHTTPPointsWrittenFail:
79 {%- set influx_http_points_written_fail_threshold = monitoring.failed_points_percentage %}
81 rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_fail_threshold }}
87 summary: 'Influxdb too many failed writes'
88 description: '{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_fail_threshold }}).'
89 InfluxdbHTTPPointsWrittenDropped:
90 {%- set influx_http_points_written_dropped_threshold = monitoring.dropped_points_percentage %}
92 rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_dropped_threshold }}
98 summary: 'Influxdb too many dropped writes'
99 description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).'
100 {%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
{#- Gather buffer_size_mb (0 when unset) of every backend across all relay listeners.
    dict.items() replaces iteritems(), which does not exist on Python 3; items() works on both Python 2 and 3. #}
101 {%- set buffer_sizes = [] %}
102 {%- for name, listen in relay.listen.items()|sort %}
103 {%- for backend_name, backend in listen.output.items()|sort %}
104 {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %}
{#- Take the largest configured buffer and convert MB to bytes. When no backend was collected the list is
    empty, so fall back to 0 instead of indexing it: the `buffer_size > 0` check then skips the buffer alert
    rather than the render failing on an undefined value. #}
107 {%- set buffer_sizes = buffer_sizes|sort %}
108 {%- set buffer_size = (buffer_sizes[-1] if buffer_sizes else 0) * 1024 * 1024 %}
109 {%- if buffer_size > 0 %}
110 InfluxdbRelayBufferNearFull:
111 {%- set influx_relay_buffer_size_threshold = monitoring.max_relay_buffer_percentage %}
113 influxdb_relay_backend_buffer_bytes > {{ buffer_size }} * {{ influx_relay_buffer_size_threshold }} / 100
117 service: influxdb-relay
119 summary: 'InfluxDB Relay buffer almost full'
120 description: 'The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold={%- endraw %}{{ buffer_size * influx_relay_buffer_size_threshold / 100 }}).'
122 InfluxdbRelayFailedRequests:
123 {%- set influx_relay_failed_requests_threshold = monitoring.relay_failed_requests_percentage %}
125 rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > {{ influx_relay_failed_requests_threshold }}
129 service: influxdb-relay
131 summary: 'InfluxDB Relay too many failed requests'
132 description: '{{ printf `%.1f` $value }}% of requests have been dropped on {{ $labels.instance }} (threshold={%- endraw %}{{ influx_relay_failed_requests_threshold }}).'
{#- relay.telemetry may be absent, so confirm it is defined before calling .get() on it (same guard as the relay alert section). #}
136 {%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
{#- Candidate addresses: start with the configured telemetry bind address, unless it is unset, a 127.* loopback address, or the 0.0.0.0 wildcard. #}
138 {%- set addresses = [] %}
139 {%- if relay.telemetry.get('bind', {}).address is defined and not relay.telemetry.bind.address.startswith('127') and relay.telemetry.bind.address != '0.0.0.0' %}
140 {%- do addresses.append(relay.telemetry.bind.address) %}
142 {%- for address in grains['fqdn_ip4'] %}
143 {%- if not address.startswith('127') %}
144 {%- do addresses.append(address) %}
153 - address: {{ addresses[0] }}
154 port: {{ relay.telemetry.bind.port }}