From: Dmitry Kalashnik Date: Fri, 13 Oct 2017 12:39:04 +0000 (+0400) Subject: Add threshold to InfluxDB alarms X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Finfluxdb.git;a=commitdiff_plain;h=096d2bd62baf0382a85ffa05745598d559909940 Add threshold to InfluxDB alarms Change-Id: Ie6e60aa83c56b49ef9f8bf8d39d5440adb5b926b Partial-Bug: PROD-15203 --- diff --git a/influxdb/map.jinja b/influxdb/map.jinja index a2a181c..8cf164d 100644 --- a/influxdb/map.jinja +++ b/influxdb/map.jinja @@ -43,6 +43,8 @@ default: 'dropped_points_percentage': 5, 'max_relay_buffer_percentage': 70, 'relay_failed_requests_percentage': 5, + 'service_failed_warning_threshold_percent': 0.3, + 'service_failed_critical_threshold_percent': 0.6, }, }, grain='os_family', merge=salt['pillar.get']('influxdb:monitoring')) %} diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml index c266dfc..affbd77 100644 --- a/influxdb/meta/prometheus.yml +++ b/influxdb/meta/prometheus.yml @@ -5,17 +5,44 @@ server: alert: {%- if server.get('http', {}).get('enabled', False) %} - InfluxdbDown: + InfluxdbInfo: if: >- - influxdb_up != 1 + influxdb_up == 0 labels: - severity: warning + severity: info service: influxdb annotations: {%- raw %} summary: 'InfluxDB service down' description: 'InfluxDB service is down on node {{ $labels.host }}' {%- endraw %} + InfluxdbWarning: + if: >- + count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }} + labels: + severity: warning + service: influxdb + annotations: + summary: 'At least {{ "%g" | format(monitoring.service_failed_warning_threshold_percent * 100) }}% of InfluxDB services are down' + description: 'At least {{ "%g" | format(monitoring.service_failed_warning_threshold_percent * 100) }}% of InfluxDB services are down' + InfluxdbCritical: + if: >- + count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }} + labels: + severity: critical + service: influxdb + annotations: 
summary: 'At least {{ "%g" | format(monitoring.service_failed_critical_threshold_percent * 100) }}% of InfluxDB services are down' + description: 'At least {{ "%g" | format(monitoring.service_failed_critical_threshold_percent * 100) }}% of InfluxDB services are down' + InfluxdbDown: + if: >- + count(influxdb_up == 0) == count(influxdb_up) + labels: + severity: down + service: influxdb + annotations: + summary: 'All InfluxDB services are down' + description: 'All InfluxDB services are down' InfluxdbSeriesNumberHigh: {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %} if: >-