Merge "grafana dashboards for ceph mgr prometheus plugin" into release/2019.2.0

commit: b9dbeef4b9e98524f5c5ba15de6d502eb386df2a [log] [tgz]
author: mcp-jenkins <mcp-jenkins@mirantis.com> Fri Dec 28 15:56:28 2018 +0000
committer: Gerrit Code Review <mail@domain.com> Fri Dec 28 15:56:28 2018 +0000
tree: 232df26216f13ee7d40137a48b3cf429a4a5d915
parent: 0a6febb481fcabd2c2a884bb7dc596636852a33e [diff]
parent: d95614f2a04a3198798f6967837510c4adde61f8 [diff]
diff --git a/ceph/meta/fluentd.yml b/ceph/meta/fluentd.yml
new file mode 100644
index 0000000..8bc2794
--- /dev/null
+++ b/ceph/meta/fluentd.yml

@@ -0,0 +1,79 @@
+{%- if pillar.get('fluentd', {}).get('agent', {}).get('enabled', False) %}
+{%- set positiondb = pillar.fluentd.agent.dir.positiondb %}
+agent:  # Fluentd agent fragment; emitted only when pillar fluentd:agent:enabled is truthy
+  config:
+    label:
+      ceph:  # one label holding the four tail inputs, the filter and the match below
+        input:
+          tail_ceph-osd:  # OSD logs -> tag ceph.osd
+            type: tail
+            tag: ceph.osd
+            path: /var/log/ceph/ceph-osd*
+            path_key: log_location  # presumably the record field that receives the source file path
+            pos_file: {{ positiondb }}/ceph.osd.pos  # per-input position file under the agent's positiondb dir
+            parser:
+              type: regexp
+              time_key: Timestamp  # the Timestamp capture of the regex below is the event time
+              time_format: '%Y-%m-%d %H:%M:%S.%N'
+              keep_time_key: false
+              format: >-  # leading timestamp -> Timestamp, remainder of the line -> Payload
+                '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+          tail_ceph-mon:  # monitor logs plus ceph.log and ceph.audit.log -> tag ceph.mon
+            type: tail
+            tag: ceph.mon
+            path: /var/log/ceph/ceph-mon*, /var/log/ceph/ceph.log, /var/log/ceph/ceph.audit.log
+            path_key: log_location
+            pos_file: {{ positiondb }}/ceph.mon.pos
+            parser:
+              type: regexp
+              time_key: Timestamp
+              time_format: '%Y-%m-%d %H:%M:%S.%N'
+              keep_time_key: false
+              format: >-  # same timestamp/payload split as the OSD input
+                '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+          tail_ceph-mgr:  # manager logs -> tag ceph.mgr
+            type: tail
+            tag: ceph.mgr
+            path: /var/log/ceph/ceph-mgr*
+            path_key: log_location
+            pos_file: {{ positiondb }}/ceph.mgr.pos
+            parser:
+              type: regexp
+              time_key: Timestamp
+              time_format: '%Y-%m-%d %H:%M:%S.%N'
+              keep_time_key: false
+              format: >-  # same timestamp/payload split as the OSD input
+                '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+          tail_radosgw:  # RADOS gateway logs (ceph-rgw*) -> tag ceph.radosgw
+            type: tail
+            tag: ceph.radosgw
+            path: /var/log/ceph/ceph-rgw*
+            path_key: log_location
+            pos_file: {{ positiondb }}/ceph.radosgw.pos
+            parser:
+              type: regexp
+              time_key: Timestamp
+              time_format: '%Y-%m-%d %H:%M:%S.%N'
+              keep_time_key: false
+              format: >-  # NOTE(review): only this pattern ends with '$'; the other three inputs omit it
+                '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)$/'
+
+        filter:
+          match_severity:  # adds the static fields listed under "record" to every ceph.** record
+            type: record_transformer
+            tag: ceph.**
+            enable_ruby: true  # NOTE(review): no Ruby expression is used in the values below
+            record:
+              - name: programname
+                value: ceph
+              - name: severity_label
+                value: INFO
+              - name: Severity
+                value: 6  # presumably the syslog "informational" level paired with INFO - confirm
+        match:
+          push_to_default:  # relabels ceph.* records to the default_output label
+            tag: ceph.*
+            type: relabel
+            label: default_output
+
+{%- endif %}

diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index a8755ff..f91a4f2 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml

@@ -1,4 +1,4 @@
-{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup with context %}
+{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup, osd with context %}
 
 {%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
 {% raw %}
@@ -6,7 +6,7 @@
   alert:
     CephClusterHealthMinor:
       if: >-
-        ceph_overall_health == 2
+        ceph_health_status == 1
       for: 3m
       labels:
         severity: minor
@@ -16,7 +16,7 @@
         description: "The Ceph cluster is in the WARNING state. For details, run 'ceph -s'."
     CephClusterHealthCritical:
       if: >-
-        ceph_overall_health == 3
+        ceph_health_status == 2
       for: 3m
       labels:
         severity: critical
@@ -26,29 +26,29 @@
         description: "The Ceph cluster is in the CRITICAL state. For details, run 'ceph -s'."
     CephMonitorDownMinor:
       if: >-
-        100 * (1 - ceph_num_mon_quorum / ceph_num_mon) > 0
+        count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status) > 0
       for: 3m
       labels:
         severity: minor
         service: ceph
       annotations:
         summary: "Ceph Monitors are down"
-        description: "{{ $value }}% of Ceph Monitors are down. For details, run 'ceph -s'."
+        description: "{{ $value }} Ceph Monitors are down. For details, run 'ceph -s'."  # $value is now a count (series total minus their sum), not a percentage
     CephOsdDownMinor:
       if: >-
-        100 * (1 - ceph_osdmap_num_up_osds / ceph_osdmap_num_osds) > 0
+        count(ceph_osd_up) - sum(ceph_osd_up) > 0
       for: 3m
       labels:
         severity: minor
         service: ceph
       annotations:
         summary: "Ceph OSDs are down"
-        description: "{{ $value }}% of Ceph OSDs are down. For details, run 'ceph osd tree'."
+        description: "{{ $value }} Ceph OSDs are down. For details, run 'ceph osd tree'."  # $value is now a count (series total minus their sum), not a percentage
     CephOsdSpaceUsageWarning:
       {%- endraw %}
       {%- set threshold = monitoring.space_used_warning_threshold|default('0.75')|float %}
       if: >-
-        ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+        ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
       {%- raw %}
       for: 3m
       labels:
@@ -61,7 +61,7 @@
       {%- endraw %}
       {%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
       if: >-
-        ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+        ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
       {%- raw %}
       for: 3m
       labels:
@@ -81,7 +81,7 @@
     CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
       {%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
       if: >-
-        ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
+        ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
       for: 3m
       labels:
         severity: warning
@@ -92,7 +92,7 @@
     CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageCritical:
       {%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
       if: >-
-        ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
+        ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
       for: 3m
       labels:
         severity: minor
@@ -100,52 +100,29 @@
       annotations:
         summary: "{{100*threshold}}% of Ceph pool space is used"
         description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
-    {%- if monitoring.cluster_stats.extra_alerts is defined and monitoring.cluster_stats.extra_alerts.get("enabled", False) %}
-    CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteOpsTooHigh:
-      {%- set threshold = monitoring_pool.pool_write_ops_threshold|default('200')|float %}
-      if: >-
-        ceph_pool_stats_write_op_per_sec{name="{{pool_name}}"} > {{threshold}}
-      for: 3m
-      labels:
-        severity: warning
-        service: ceph
-      annotations:
-        summary: "{{threshold}} Ceph pool write operations per second"
-        description: "The number of Ceph {{pool_name}} pool write operations per second is {{threshold}} for 3 minutes."
-    CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteBytesTooHigh:
-      {%- set threshold = monitoring_pool.pool_write_bytes_threshold|default('70000000')|float %}
-      if: >-
-        ceph_pool_stats_write_bytes_sec{name="{{pool_name}}"} > {{threshold}}
-      for: 3m
-      labels:
-        severity: warning
-        service: ceph
-      annotations:
-        summary: "{{threshold}} Ceph pool write bytes per second"
-        description: "The number of Ceph {{pool_name}} pool write bytes per second is {{threshold}} for 3 minutes."
-    CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadOpsTooHigh:
-      {%- set threshold = monitoring_pool.pool_read_ops_threshold|default('1000')|float %}
-      if: >-
-        ceph_pool_stats_read_op_per_sec{name="{{pool_name}}"} > {{threshold}}
-      for: 3m
-      labels:
-        severity: warning
-        service: ceph
-      annotations:
-        summary: "{{threshold}} Ceph pool read operations per second"
-        description: "The number of Ceph {{pool_name}} pool read operations per second is {{threshold}} for 3 minutes."
-    CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadBytesTooHigh:
-      {%- set threshold = monitoring_pool.pool_read_bytes_threshold|default('70000000')|float %}
-      if: >-
-        ceph_pool_stats_read_bytes_sec{name="{{pool_name}}"} > {{threshold}}
-      for: 3m
-      labels:
-        severity: warning
-        service: ceph
-      annotations:
-        summary: "{{threshold}} Ceph pool read bytes per second"
-        description: "The number of Ceph {{pool_name}} pool read bytes per second is {{threshold}} for 3 minutes."
-    {%- endif %}
 {%- endfor %}
 {%- endif %}
 {%- endif %}
+
+
+{#- Register a static Prometheus scrape target for Ceph (port 9283) on nodes where mon is enabled. #}
+{%- if mon is defined and mon.get('enabled') %}
+{#- Keep only the non-loopback entries of the fqdn_ip4 grain; the first one becomes the endpoint. #}
+{%- set routable_ips = [] %}
+{%- for ip in grains['fqdn_ip4'] if not ip.startswith('127.') %}
+  {%- do routable_ips.append(ip) %}
+{%- endfor %}
+{%- set address = routable_ips|first %}
+{%- if address is defined %}
+server:
+  target:
+    static:
+      ceph:
+        enabled: true
+        endpoint:
+        - address: {{ address }}
+          port: 9283
+        honor_labels: true
+
+{%- endif %}
+{%- endif %}

diff --git a/ceph/mgr.sls b/ceph/mgr.sls
index bfc58b1..cda9856 100644
--- a/ceph/mgr.sls
+++ b/ceph/mgr.sls

@@ -83,7 +83,19 @@
 disable_ceph_dashboard:
   cmd.run:
   - name: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module disable dashboard"
-  - onlyif: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep dashboard"
+  - onlyif: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep dashboard"  # disable only while the module is still listed; "unless" would skip the command exactly when it is needed
+  - require:
+    - file: common_config
+    - file: /var/lib/ceph/mgr/{{ common.get('cluster_name', 'ceph') }}-{{ grains.host }}/
+
+{%- endif %}
+
+{%- if pillar.get('prometheus', {}).get('collector',{}).get("enabled", False) %}
+
+enable_prometheus_plugin:  # rendered only when pillar prometheus:collector:enabled is truthy
+  cmd.run:
+  - name: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module enable prometheus"  # turns on the ceph-mgr "prometheus" module
+  - unless: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep prometheus"  # skipped when "prometheus" already appears in the module listing
   - require:
     - file: common_config
     - file: /var/lib/ceph/mgr/{{ common.get('cluster_name', 'ceph') }}-{{ grains.host }}/
@@ -92,4 +104,4 @@
 
 {%- endif %}
 
-{%- endif %}
\ No newline at end of file
+{%- endif %}

diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 2be3736..5c87d50 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml

@@ -15,3 +15,5 @@
         enabled: true
       grafana:
         enabled: true
+      fluentd:
+        enabled: true
commit	b9dbeef4b9e98524f5c5ba15de6d502eb386df2a	[log] [tgz]
author	mcp-jenkins <mcp-jenkins@mirantis.com>	Fri Dec 28 15:56:28 2018 +0000
committer	Gerrit Code Review <mail@domain.com>	Fri Dec 28 15:56:28 2018 +0000
tree	232df26216f13ee7d40137a48b3cf429a4a5d915
parent	0a6febb481fcabd2c2a884bb7dc596636852a33e [diff]
parent	d95614f2a04a3198798f6967837510c4adde61f8 [diff]