Updated defaults for the mon_max_pg_per_osd variable. Added additional alerts for the number of PGs on OSDs.

Change-Id: I5042a166c6c81923c630d05bd7e2499226a707d6
Related-Prod: PROD-26472

commit: c49630e2ea639e406b542dd9fdeddb98ecf4a8bc [log] [tgz]
author: Mateusz Los <mlos@mirantis.com> Thu Jan 17 18:17:37 2019 +0100
committer: Mateusz Los <mlos@mirantis.com> Fri Jan 18 13:46:18 2019 +0100
tree: 8fd3379151e673aad5a2a7f398ace326e829f528
parent: ce784ecf100bea3eba54a7c3800d14cc3996fae9 [diff]
diff --git a/ceph/map.jinja b/ceph/map.jinja
index 3140a4c..b2fde96 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja

@@ -87,6 +87,8 @@
 default:
   cluster_stats: {}
   node_stats: {}
+  osd_pgnum_warning: 200
+  osd_pgnum_critical: 300
 {%- endload %}
 
 {% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}

diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index 86c8a47..d29c409 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml

@@ -225,6 +225,32 @@
       annotations:
         summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
         description: "{{ $value }} bytes of Ceph OSD space (>={% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+    CephOsdPgNumTooHighWarning:
+      {%- endraw %}
+      {%- set threshold = monitoring.osd_pgnum_warning %}
+      if: >-
+        max(ceph_osd_numpg) > {{threshold}}
+      {%- raw %}
+      for: 3m
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
+        description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
+    CephOsdPgNumTooHighCritical:
+      {%- endraw %}
+      {%- set threshold = monitoring.osd_pgnum_critical %}
+      if: >-
+        max(ceph_osd_numpg) > {{threshold}}
+      {%- raw %}
+      for: 3m
+      labels:
+        severity: critical
+        service: ceph
+      annotations:
+        summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
+        description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
       {%- endraw %}
       {%- if setup.pool is defined %}
         {%- for pool_name, pool in setup.pool.iteritems() %}

diff --git a/metadata/service/mon/cluster.yml b/metadata/service/mon/cluster.yml
index 6a10da2..94c484b 100644
--- a/metadata/service/mon/cluster.yml
+++ b/metadata/service/mon/cluster.yml

@@ -7,3 +7,7 @@
   ceph:
     mon:
       enabled: true
+    common:
+      config:
+        mon:
+          mon_max_pg_per_osd: 600

diff --git a/metadata/service/mon/single.yml b/metadata/service/mon/single.yml
index 8be5da5..ea04ea0 100644
--- a/metadata/service/mon/single.yml
+++ b/metadata/service/mon/single.yml

@@ -12,6 +12,9 @@
           caps:
             mon: "allow *"
     common:
+      config:
+        mon:
+          mon_max_pg_per_osd: 600
       keyring:
         admin:
           caps:
commit	c49630e2ea639e406b542dd9fdeddb98ecf4a8bc	[log] [tgz]
author	Mateusz Los <mlos@mirantis.com>	Thu Jan 17 18:17:37 2019 +0100
committer	Mateusz Los <mlos@mirantis.com>	Fri Jan 18 13:46:18 2019 +0100
tree	8fd3379151e673aad5a2a7f398ace326e829f528
parent	ce784ecf100bea3eba54a7c3800d14cc3996fae9 [diff]