From c4a9f9d3ad750fe00d127a3679f93c2165cad468 Mon Sep 17 00:00:00 2001 From: Ildar Svetlov Date: Tue, 12 Sep 2017 16:34:53 +0400 Subject: [PATCH 1/1] Add InfluxDB Relay alerts Change-Id: I9299d81a592fdfee33f3f4e1d5ca6300d18d94cc --- influxdb/map.jinja | 2 ++ influxdb/meta/prometheus.yml | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/influxdb/map.jinja b/influxdb/map.jinja index 47a9c00..a2a181c 100644 --- a/influxdb/map.jinja +++ b/influxdb/map.jinja @@ -41,6 +41,8 @@ default: 'http_errors_percentage': 5, 'failed_points_percentage': 5, 'dropped_points_percentage': 5, + 'max_relay_buffer_percentage': 70, + 'relay_failed_requests_percentage': 5, }, }, grain='os_family', merge=salt['pillar.get']('influxdb:monitoring')) %} diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml index ea66b51..c266dfc 100644 --- a/influxdb/meta/prometheus.yml +++ b/influxdb/meta/prometheus.yml @@ -70,6 +70,40 @@ server: annotations: summary: 'Influxdb too many dropped writes' description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).' +{%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %} + {%- set buffer_sizes = [] %} + {%- for name, listen in relay.listen.iteritems()|sort %} + {%- for backend_name, backend in listen.output.iteritems()|sort %} + {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %} + {%- endfor %} + {%- endfor %} + {%- set buffer_sizes = buffer_sizes|sort %} + {%- set buffer_size = buffer_sizes[-1] * 1024 * 1024 %} + {%- if buffer_size > 0 %} + InfluxdbRelayBufferNearFull: + {%- set influx_relay_buffer_size_threshold = monitoring.max_relay_buffer_percentage %} + if: >- + influxdb_relay_backend_buffer_bytes > {{ buffer_size }} * {{ influx_relay_buffer_size_threshold }} / 100 + {% raw %} + labels: + severity: warning + service: influxdb-relay + annotations: + summary: 'InfluxDB Relay buffer almost full' + description: 'The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold={%- endraw %}{{ buffer_size * influx_relay_buffer_size_threshold / 100 }}).' + {%- endif %} + InfluxdbRelayFailedRequests: + {%- set influx_relay_failed_requests_threshold = monitoring.relay_failed_requests_percentage %} + if: >- + rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > {{ influx_relay_failed_requests_threshold }} + {% raw %} + labels: + severity: warning + service: influxdb-relay + annotations: + summary: 'InfluxDB Relay too many failed requests' + description: '{{ printf `%.1f` $value }}% of requests have been dropped on {{ $labels.instance }} (threshold={%- endraw %}{{ influx_relay_failed_requests_threshold }}).' + {%- endif %} {%- if relay.get('enabled') and relay.telemetry.get('enabled') %} @@ -96,3 +130,4 @@ server: {%- endif %} {%- endif %} +{%- endif %} \ No newline at end of file -- 2.32.7