Alerts rationalization for GlusterFS
Change-Id: I7e30171c4c1ae5d37605537816ba81b0f72c1288
Closes-Bug: PROD-19661
diff --git a/glusterfs/map.jinja b/glusterfs/map.jinja
index 928ff1d..3a943f2 100644
--- a/glusterfs/map.jinja
+++ b/glusterfs/map.jinja
@@ -29,6 +29,10 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
- 'glusterfs_service_host': '127.0.0.1'
+ 'glusterfs_service_host': '127.0.0.1',
+ 'inodes_percent_used_minor_threshold_percent': 0.8,
+ 'inodes_percent_used_major_threshold_percent': 0.9,
+ 'space_percent_used_minor_threshold_percent': 0.8,
+ 'space_percent_used_major_threshold_percent': 0.9
}
}, merge=salt['pillar.get']('glusterfs:monitoring')) %}
diff --git a/glusterfs/meta/prometheus.yml b/glusterfs/meta/prometheus.yml
index 9a29d84..d28af65 100644
--- a/glusterfs/meta/prometheus.yml
+++ b/glusterfs/meta/prometheus.yml
@@ -1,18 +1,74 @@
-{%- from "glusterfs/map.jinja" import server with context %}
-
+{%- from "glusterfs/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
-{%- raw %}
server:
alert:
- GlusterFSDown:
+ GlusterfsServiceOutage:  # critical: glusterfs_up != 1 sustained for 2m
if: >-
glusterfs_up != 1
for: 2m
labels:
- severity: warning
+ severity: critical
service: glusterfs
annotations:
- summary: 'GlusterFS service down'
- description: 'GlusterFS service is down on node {{ $labels.host }}'
-{%- endraw %}
+ summary: "GlusterFS service outage"
+ description: "All GlusterFS services are down."
+ GlusterfsInodesUsedMinor:  # minor: inode usage >= minor threshold and < major threshold for 2m
+ if: >-
+ glusterfs_inodes_percent_used >= {{ monitoring.inodes_percent_used_minor_threshold_percent*100 }} and glusterfs_inodes_percent_used < {{ monitoring.inodes_percent_used_major_threshold_percent*100 }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: minor
+ service: glusterfs
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.inodes_percent_used_minor_threshold_percent*100 }}%{%- raw %} of inodes are used on volume"
+ description: "{{ $value }}% of GlusterFS {{ $labels.volume }} volume inodes are used for more than 2 minutes."
+ {%- endraw %}
+ GlusterfsServiceMinor:  # minor: procstat_running for glusterd is below 1; no "for" hold period is set
+ if: >-
+ procstat_running{process_name="glusterd"} < 1
+ {%- raw %}
+ labels:
+ severity: minor
+ service: glusterfs
+ annotations:
+ summary: "GlusterFS service is down"
+ description: "The GlusterFS service on the {{ $labels.host }} host is down."
+ {%- endraw %}
+ GlusterfsInodesUsedMajor:  # major: inode usage at or above the major threshold for 2m
+ if: >-
+ glusterfs_inodes_percent_used >= {{ monitoring.inodes_percent_used_major_threshold_percent*100 }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: major
+ service: glusterfs
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.inodes_percent_used_major_threshold_percent*100 }}%{%- raw %} of inodes are used on volume"
+ description: "{{ $value }}% of GlusterFS {{ $labels.volume }} volume inodes are used for more than 2 minutes."
+ {%- endraw %}
+ GlusterfsSpaceUsedMinor:  # minor: space usage >= minor threshold and < major threshold for 2m
+ if: >-
+ glusterfs_space_percent_used >= {{ monitoring.space_percent_used_minor_threshold_percent*100 }} and glusterfs_space_percent_used < {{ monitoring.space_percent_used_major_threshold_percent*100 }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: minor
+ service: glusterfs
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.space_percent_used_minor_threshold_percent*100 }}%{%- raw %} of disk space is used on volume"
+ description: "{{ $value }}% of GlusterFS {{ $labels.volume }} volume disk space is used for more than 2 minutes."
+ {%- endraw %}
+ GlusterfsSpaceUsedMajor:  # major: space usage at or above the major threshold for 2m
+ if: >-
+ glusterfs_space_percent_used >= {{ monitoring.space_percent_used_major_threshold_percent*100 }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: major
+ service: glusterfs
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.space_percent_used_major_threshold_percent*100 }}%{%- raw %} of disk space is used on volume"
+ description: "{{ $value }}% of GlusterFS {{ $labels.volume }} volume disk space is used for more than 2 minutes."
+ {%- endraw %}
{%- endif %}