From: Martin Polreich Date: Wed, 18 Oct 2017 11:24:06 +0000 (+0000) Subject: Merge "Add prometheus main dashboard" X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Felasticsearch.git;a=commitdiff_plain;h=a54736f0beb8be8b5f15ba3ed5b5894f60639d00;hp=ef0b400648327cd4aa2e4c4fbe82fe485daeb17a Merge "Add prometheus main dashboard" --- diff --git a/elasticsearch/files/elasticsearch.yml b/elasticsearch/files/elasticsearch.yml index 5c4bc3e..2236031 100644 --- a/elasticsearch/files/elasticsearch.yml +++ b/elasticsearch/files/elasticsearch.yml @@ -159,7 +159,7 @@ index.number_of_replicas: {{ server.get('index', {}).get('replicas', 1) }} # # path.data: /path/to/data {%- if server.get('path', {}).data is defined %} -path.data = {{ server.path.data }} +path.data: {{ server.path.data }} {%- endif %} # # Can optionally include more than one location, causing data to be striped across diff --git a/elasticsearch/map.jinja b/elasticsearch/map.jinja index 23e2a7b..94f12fa 100644 --- a/elasticsearch/map.jinja +++ b/elasticsearch/map.jinja @@ -26,12 +26,21 @@ Debian: server: host: 127.0.0.1 port: 9200 + binary_port: 9300 RedHat: pkgs: - python-elasticsearch server: host: 127.0.0.1 port: 9200 + binary_port: 9300 {%- endload %} {%- set client = salt['grains.filter_by'](client_defaults, merge=salt['pillar.get']('elasticsearch:client')) %} + +{% set monitoring = salt['grains.filter_by']({ + 'default': { + 'service_failed_warning_threshold_percent': 0.3, + 'service_failed_critical_threshold_percent': 0.6, + }, +}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %} diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml index f0aa983..0e4d7e0 100644 --- a/elasticsearch/meta/prometheus.yml +++ b/elasticsearch/meta/prometheus.yml @@ -1,5 +1,5 @@ {%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %} -{%- from "elasticsearch/map.jinja" import server, client with context %} +{%- from "elasticsearch/map.jinja" import server, client, monitoring with context %} server: alert: @@ -29,15 +29,42 @@ server: {%- endif %} {%- if server.get('enabled', False) %} {%- raw %} - ElasticsearchDown: + ElasticsearchInfo: if: >- - elasticsearch_up{host=~'.*'} != 1 + elasticsearch_up{host=~'.*'} == 0 labels: - severity: warning + severity: info service: elasticsearch annotations: - summary: 'Elasticsearch service down' + summary: 'Elasticsearch service is down' description: 'Elasticsearch service is down on node {{ $labels.host }}' + ElasticsearchWarning: + if: >- + count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %} + labels: + severity: warning + service: elasticsearch + annotations: + summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down' + description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down' + ElasticsearchCritical: + if: >- + count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %} + labels: + severity: critical + service: elasticsearch + annotations: + summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down' + description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down' + ElasticsearchDown: + if: >- + count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'}) + labels: + severity: down + service: elasticsearch + annotations: + summary: 'All Elasticsearch services are down' + description: 'All Elasticsearch services are down' ElasticsearchClusterDiskLowWaterMark: if: >- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85 @@ -49,7 +76,6 @@ server: summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}' description: >- Elasticsearch will not allocate new shards to node {{ $labels.host }} - ElasticsearchClusterDiskHighWaterMark: if: >- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90