server:
alert:
{%- if server.get('http', {}).get('enabled', False) %}
+ {#- Per-series alert: fires with severity "info" whenever influxdb_up == 0 (this change renames the rule from InfluxdbDown and lowers it from "warning"). The annotations sit in a raw block so the $labels.host placeholder is emitted verbatim instead of being evaluated by Jinja. #}
- InfluxdbDown:
+ InfluxdbInfo:
if: >-
- influxdb_up != 1
+ influxdb_up == 0
labels:
- severity: warning
+ severity: info
service: influxdb
annotations:
{%- raw %}
summary: 'InfluxDB service down'
description: 'InfluxDB service is down on node {{ $labels.host }}'
{%- endraw %}
+    {#- Fires once the number of instances reporting influxdb_up == 0 reaches
+        count(influxdb_up) multiplied by
+        monitoring.service_failed_warning_threshold_percent. The threshold is used
+        as a plain multiplier, so it is presumably a 0-1 fraction (confirm against
+        the pillar defaults). The percentage shown in the annotations is rounded to
+        one decimal so float artefacts (e.g. 0.3 * 100 = 30.000000000000004) do not
+        leak into the alert text. #}
+    InfluxdbWarning:
+      if: >-
+        count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
+      labels:
+        severity: warning
+        service: influxdb
+      annotations:
+        summary: 'More than {{ (monitoring.service_failed_warning_threshold_percent * 100)|round(1) }}% of InfluxDB services are down'
+        description: 'More than {{ (monitoring.service_failed_warning_threshold_percent * 100)|round(1) }}% of InfluxDB services are down'
+    {#- Fires once the number of instances reporting influxdb_up == 0 reaches
+        count(influxdb_up) multiplied by
+        monitoring.service_failed_critical_threshold_percent. The threshold is used
+        as a plain multiplier, so it is presumably a 0-1 fraction (confirm against
+        the pillar defaults). The percentage shown in the annotations is rounded to
+        one decimal so float artefacts of the multiplication by 100 do not leak
+        into the alert text. #}
+    InfluxdbCritical:
+      if: >-
+        count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
+      labels:
+        severity: critical
+        service: influxdb
+      annotations:
+        summary: 'More than {{ (monitoring.service_failed_critical_threshold_percent * 100)|round(1) }}% of InfluxDB services are down'
+        description: 'More than {{ (monitoring.service_failed_critical_threshold_percent * 100)|round(1) }}% of InfluxDB services are down'
+    {#- Cluster-wide outage: raised when the number of series with influxdb_up == 0
+        equals the total number of influxdb_up series. #}
+    InfluxdbDown:
+      if: 'count(influxdb_up == 0) == count(influxdb_up)'
+      annotations:
+        description: 'All InfluxDB services are down'
+        summary: 'All InfluxDB services are down'
+      labels:
+        service: influxdb
+        severity: down
InfluxdbSeriesNumberHigh:
{%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
if: >-
annotations:
summary: 'Influxdb too many dropped writes'
description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).'
+{%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
+  {#- Relay alerts are rendered only when the relay and its telemetry are enabled. #}
+  {#- Gather buffer_size_mb from every backend of every listener; the largest value
+      sizes the buffer alert. items() is used instead of iteritems() so the template
+      renders on both Python 2 and Python 3. #}
+  {%- set buffer_sizes = [] %}
+  {%- for name, listen in relay.get('listen', {}).items()|sort %}
+    {%- for backend_name, backend in listen.output.items()|sort %}
+      {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %}
+    {%- endfor %}
+  {%- endfor %}
+  {#- With no listener/backend configured the list is empty; fall back to 0 so the
+      template still renders and the buffer alert below is simply omitted. #}
+  {%- set largest_buffer_mb = (buffer_sizes|sort)[-1] if buffer_sizes else 0 %}
+  {%- set buffer_size = largest_buffer_mb * 1024 * 1024 %}
+  {%- if buffer_size > 0 %}
+    InfluxdbRelayBufferNearFull:
+      {%- set influx_relay_buffer_size_threshold = monitoring.max_relay_buffer_percentage %}
+      if: >-
+        influxdb_relay_backend_buffer_bytes > {{ buffer_size }} * {{ influx_relay_buffer_size_threshold }} / 100
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb-relay
+      annotations:
+        summary: 'InfluxDB Relay buffer almost full'
+        description: 'The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold={%- endraw %}{{ buffer_size * influx_relay_buffer_size_threshold / 100 }}).'
+  {%- endif %}
+    {#- Percentage of relay requests that failed over the last 5 minutes, compared
+        with monitoring.relay_failed_requests_percentage. #}
+    InfluxdbRelayFailedRequests:
+      {%- set influx_relay_failed_requests_threshold = monitoring.relay_failed_requests_percentage %}
+      if: >-
+        rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > {{ influx_relay_failed_requests_threshold }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: influxdb-relay
+      annotations:
+        summary: 'InfluxDB Relay too many failed requests'
+        description: '{{ printf `%.1f` $value }}% of requests have failed on {{ $labels.instance }} (threshold={%- endraw %}{{ influx_relay_failed_requests_threshold }}).'
+
{%- endif %}
+{#- Scrape endpoint for the relay telemetry listener, emitted only when both the relay and its telemetry are enabled. The relabel rule matches this endpoint's own address:port in __address__ and writes the minion's "host" grain into the "host" label. NOTE(review): this guard does not test "relay.telemetry is defined" the way the relay alert guard does; confirm an enclosing condition covers a missing telemetry section. NOTE(review): the address is interpolated into the regex unescaped, so its dots match any character. #}
{%- if relay.get('enabled') and relay.telemetry.get('enabled') %}
endpoint:
- address: {{ addresses[0] }}
port: {{ relay.telemetry.bind.port }}
+ relabel_configs:
+ - regex: {{ addresses[0] }}:{{ relay.telemetry.bind.port }}
+ replacement: {{ grains['host'] }}
+ source_labels: "__address__"
+ target_label: "host"
{%- endif %}
{%- endif %}
{%- endif %}
+{%- endif %}
\ No newline at end of file