Merge "Add prometheus main dashboard"
diff --git a/elasticsearch/files/elasticsearch.yml b/elasticsearch/files/elasticsearch.yml
index 5c4bc3e..2236031 100644
--- a/elasticsearch/files/elasticsearch.yml
+++ b/elasticsearch/files/elasticsearch.yml
@@ -159,7 +159,7 @@
#
# path.data: /path/to/data
{%- if server.get('path', {}).data is defined %}
-path.data = {{ server.path.data }}
+path.data: {{ server.path.data }}
{%- endif %}
#
# Can optionally include more than one location, causing data to be striped across
diff --git a/elasticsearch/map.jinja b/elasticsearch/map.jinja
index 23e2a7b..94f12fa 100644
--- a/elasticsearch/map.jinja
+++ b/elasticsearch/map.jinja
@@ -26,12 +26,21 @@
server:
host: 127.0.0.1
port: 9200
+ binary_port: 9300
RedHat:
pkgs:
- python-elasticsearch
server:
host: 127.0.0.1
port: 9200
+ binary_port: 9300
{%- endload %}
{%- set client = salt['grains.filter_by'](client_defaults, merge=salt['pillar.get']('elasticsearch:client')) %}
+{# Alert thresholds imported by meta/prometheus.yml. Despite the "_percent" suffix the values are fractions of the total service count (0.3 = 30%); the defaults are merged with pillar elasticsearch:monitoring. #}
+{% set monitoring = salt['grains.filter_by']({
+ 'default': {
+ 'service_failed_warning_threshold_percent': 0.3,
+ 'service_failed_critical_threshold_percent': 0.6,
+ },
+}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}
diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index f0aa983..0e4d7e0 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -1,5 +1,5 @@
{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
-{%- from "elasticsearch/map.jinja" import server, client with context %}
+{%- from "elasticsearch/map.jinja" import server, client, monitoring with context %}
server:
alert:
@@ -29,15 +29,42 @@
{%- endif %}
{%- if server.get('enabled', False) %}
{%- raw %}
- ElasticsearchDown:
+ ElasticsearchInfo:  # info-severity alert raised for each elasticsearch_up series whose value is 0
if: >-
- elasticsearch_up{host=~'.*'} != 1
+ elasticsearch_up{host=~'.*'} == 0
+ labels:
+ severity: info
+ service: elasticsearch
+ annotations:
+ summary: 'Elasticsearch service is down'
+ description: 'Elasticsearch service is down on node {{ $labels.host }}'
+ ElasticsearchWarning:  # fires when the count of elasticsearch_up == 0 series reaches the warning fraction of all series; the percentage in the text is formatted with %g so 0.3 renders as 30, not 30.000000000000004
+ if: >-
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
labels:
severity: warning
service: elasticsearch
annotations:
- summary: 'Elasticsearch service down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
+ summary: 'More than {%- endraw %} {{ "%g" | format(monitoring.service_failed_warning_threshold_percent * 100) }}%{%- raw %} of Elasticsearch services are down'
+ description: 'More than {%- endraw %} {{ "%g" | format(monitoring.service_failed_warning_threshold_percent * 100) }}%{%- raw %} of Elasticsearch services are down'
+ ElasticsearchCritical:  # fires when the count of elasticsearch_up == 0 series reaches the critical fraction of all series; the percentage in the text is formatted with %g so 0.6 renders as 60 rather than a raw float product
+ if: >-
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+ labels:
+ severity: critical
+ service: elasticsearch
+ annotations:
+ summary: 'More than {%- endraw %} {{ "%g" | format(monitoring.service_failed_critical_threshold_percent * 100) }}%{%- raw %} of Elasticsearch services are down'
+ description: 'More than {%- endraw %} {{ "%g" | format(monitoring.service_failed_critical_threshold_percent * 100) }}%{%- raw %} of Elasticsearch services are down'
+ ElasticsearchDown:  # fires when the count of elasticsearch_up == 0 series equals the count of all elasticsearch_up series
+ if: >-
+ count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+ labels:
+ severity: down
+ service: elasticsearch
+ annotations:
+ summary: 'All Elasticsearch services are down'
+ description: 'All Elasticsearch services are down'
ElasticsearchClusterDiskLowWaterMark:
if: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
@@ -49,7 +76,6 @@
summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
description: >-
Elasticsearch will not allocate new shards to node {{ $labels.host }}
-
ElasticsearchClusterDiskHighWaterMark:
if: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90