Merge "Add prometheus main dashboard"

commit: a54736f0beb8be8b5f15ba3ed5b5894f60639d00 [log] [tgz]
author: Martin Polreich <mpolreich@mirantis.com> Wed Oct 18 11:24:06 2017 +0000
committer: Gerrit Code Review <gerrit2@7cd0c2eb159e> Wed Oct 18 11:24:06 2017 +0000
tree: 5a8d7eeda9b4d127f1646df88cc0d4e864eedbb6
parent: a2d8557f849bcf876b0a6cc7a8816b7928f2b79e [diff]
parent: ef0b400648327cd4aa2e4c4fbe82fe485daeb17a [diff]
diff --git a/elasticsearch/files/elasticsearch.yml b/elasticsearch/files/elasticsearch.yml
index 5c4bc3e..2236031 100644
--- a/elasticsearch/files/elasticsearch.yml
+++ b/elasticsearch/files/elasticsearch.yml

@@ -159,7 +159,7 @@
 #
 # path.data: /path/to/data
 {%- if server.get('path', {}).data is defined %}
-path.data = {{ server.path.data }}
+path.data: {{ server.path.data }}
 {%- endif %}
 #
 # Can optionally include more than one location, causing data to be striped across

diff --git a/elasticsearch/map.jinja b/elasticsearch/map.jinja
index 23e2a7b..94f12fa 100644
--- a/elasticsearch/map.jinja
+++ b/elasticsearch/map.jinja

@@ -26,12 +26,21 @@
   server:
     host: 127.0.0.1
     port: 9200
+    binary_port: 9300
 RedHat:
   pkgs:
   - python-elasticsearch
   server:
     host: 127.0.0.1
     port: 9200
+    binary_port: 9300
 {%- endload %}
 
 {%- set client = salt['grains.filter_by'](client_defaults, merge=salt['pillar.get']('elasticsearch:client')) %}
+
+{% set monitoring = salt['grains.filter_by']({
+    'default': {
+        'service_failed_warning_threshold_percent': 0.3,
+        'service_failed_critical_threshold_percent': 0.6,
+    },
+}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}

diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index f0aa983..0e4d7e0 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml

@@ -1,5 +1,5 @@
 {%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
-{%- from "elasticsearch/map.jinja" import server, client with context %}
+{%- from "elasticsearch/map.jinja" import server, client, monitoring with context %}
 
 server:
   alert:
@@ -29,15 +29,42 @@
 {%- endif %}
 {%- if server.get('enabled', False) %}
 {%- raw %}
-    ElasticsearchDown:
+    ElasticsearchInfo:
       if: >-
-        elasticsearch_up{host=~'.*'} != 1
+        elasticsearch_up{host=~'.*'} == 0
+      labels:
+        severity: info
+        service: elasticsearch
+      annotations:
+        summary: 'Elasticsearch service is down'
+        description: 'Elasticsearch service is down on node {{ $labels.host }}'
+    ElasticsearchWarning:
+      if: >-
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
       labels:
         severity: warning
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch service down'
-        description: 'Elasticsearch service is down on node {{ $labels.host }}'
+        summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
+        description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
+    ElasticsearchCritical:
+      if: >-
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+      labels:
+        severity: critical
+        service: elasticsearch
+      annotations:
+        summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
+        description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
+    ElasticsearchDown:
+      if: >-
+        count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+      labels:
+        severity: down
+        service: elasticsearch
+      annotations:
+        summary: 'All Elasticsearch services are down'
+        description: 'All Elasticsearch services are down'
     ElasticsearchClusterDiskLowWaterMark:
       if: >-
           (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes)  by (host, instance) * 100.0 >= 85
@@ -49,7 +76,6 @@
         summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
         description: >-
           Elasticsearch will not allocate new shards to node {{ $labels.host }}
-
     ElasticsearchClusterDiskHighWaterMark:
       if: >-
           (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
commit	a54736f0beb8be8b5f15ba3ed5b5894f60639d00	[log] [tgz]
author	Martin Polreich <mpolreich@mirantis.com>	Wed Oct 18 11:24:06 2017 +0000
committer	Gerrit Code Review <gerrit2@7cd0c2eb159e>	Wed Oct 18 11:24:06 2017 +0000
tree	5a8d7eeda9b4d127f1646df88cc0d4e864eedbb6
parent	a2d8557f849bcf876b0a6cc7a8816b7928f2b79e [diff]
parent	ef0b400648327cd4aa2e4c4fbe82fe485daeb17a [diff]