Merge "Fix ceph backup script failing while generating tar file with timestamp by moving to double digit hours (%H) from single digit (%k)"
diff --git a/.kitchen.yml b/.kitchen.yml
index 8513cd7..c6c5d32 100644
--- a/.kitchen.yml
+++ b/.kitchen.yml
@@ -34,11 +34,25 @@
name: inspec
sudo: true
+docker_images:
+ - &xenial-20163 <%=ENV['IMAGE_XENIAL_20163'] || 'docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2016.3/salt:2018_11_19'%>
+ - &xenial-20177 <%=ENV['IMAGE_XENIAL_20177'] || 'docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2017.7/salt:2018_11_19'%>
+ - &xenial-stable <%=ENV['IMAGE_XENIAL_STABLE'] || 'docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-stable/salt:2018_11_19'%>
platforms:
- - name: <%=ENV['PLATFORM'] || 'saltstack-ubuntu-xenial-salt-stable' %>
+ - name: xenial-2016.3
driver_config:
- image: <%=ENV['PLATFORM'] || 'epcim/salt:saltstack-ubuntu-xenial-salt-stable'%>
+ image: *xenial-20163
+ platform: ubuntu
+
+ - name: xenial-2017.7
+ driver_config:
+ image: *xenial-20177
+ platform: ubuntu
+
+ - name: xenial-stable
+ driver_config:
+ image: *xenial-stable
platform: ubuntu
suites:
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 2373a24..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-language: python
-python:
-- "2.7.13"
-sudo: required
-services:
- - docker
-
-install:
- - pip install PyYAML
- - pip install virtualenv
- - |
- if [ ! -e Gemfile ]; then
- curl -s -o ./Gemfile 'https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas/salt-formulas-scripts.git;a=blob_plain;f=Gemfile;hb=refs/heads/master'
- fi
- - bundle install
-
-env:
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2016.3 SUITE=ceph-client-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2016.3 SUITE=ceph-mon-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2016.3 SUITE=ceph-osd-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2017.7 SUITE=ceph-client-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2017.7 SUITE=ceph-mon-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2017.7 SUITE=ceph-osd-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2018.3 SUITE=ceph-client-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2018.3 SUITE=ceph-mon-single
- - PLATFORM=epcim/salt:saltstack-ubuntu-xenial-salt-2018.3 SUITE=ceph-osd-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2017.7 SUITE=ceph-client-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2017.7 SUITE=ceph-mon-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2017.7 SUITE=ceph-osd-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2018.3 SUITE=ceph-client-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2018.3 SUITE=ceph-mon-single
-# - PLATFORM=epcim/salt:saltstack-ubuntu-bionic-salt-2018.3 SUITE=ceph-osd-single
-
-before_script:
- - set -o pipefail
- - make test | tail
-
-script:
- - test ! -e .kitchen.yml || bundle exec kitchen converge ${SUITE} || true
- - test ! -e .kitchen.yml || bundle exec kitchen verify ${SUITE} -t tests/integration
-
-notifications:
- webhooks:
- urls:
- - https://webhooks.gitter.im/e/6123573504759330786b
- on_success: change # options: [always|never|change] default: always
- on_failure: never # options: [always|never|change] default: always
- on_start: never # options: [always|never|change] default: always
- on_cancel: never # options: [always|never|change] default: always
- on_error: never # options: [always|never|change] default: always
- email: false
diff --git a/LICENSE b/LICENSE
index ee729da..97c1317 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 tcp cloud a.s.
+Copyright 2019 Mirantis Inc. et al.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -10,4 +10,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
-limitations under the License.
\ No newline at end of file
+limitations under the License.
diff --git a/README.rst b/README.rst
index dcd9462..a0c3d3c 100644
--- a/README.rst
+++ b/README.rst
@@ -357,6 +357,21 @@
enabled: false
+When custom block devices are used (for example, loop devices for testing purposes),
+the proper partition prefix must be specified.
+
+.. code-block:: yaml
+
+ ceph:
+ osd:
+ backend:
+ bluestore:
+ disks:
+ - dev: /dev/loop20
+ block_db: /dev/loop21
+ data_partition_prefix: 'p'
+
+
Ceph client roles - ...Deprecated - use ceph:common instead
--------------------------------------------------------
@@ -826,7 +841,7 @@
Migration from Decapod to salt-formula-ceph
--------------------------------------------
-The following configuration will run a python script which will generate ceph config and osd disk mappings to be put in cluster model.
+The following configuration will run a python script which will generate ceph config and osd disk mappings to be put in cluster model.
.. code-block:: yaml
@@ -844,37 +859,3 @@
* https://github.com/cloud-ee/ceph-salt-formula
* http://ceph.com/ceph-storage/
* http://ceph.com/docs/master/start/intro/
-
-
-Documentation and bugs
-======================
-
-To learn how to install and update salt-formulas, consult the documentation
-available online at:
-
- http://salt-formulas.readthedocs.io/
-
-In the unfortunate event that bugs are discovered, they should be reported to
-the appropriate issue tracker. Use Github issue tracker for specific salt
-formula:
-
- https://github.com/salt-formulas/salt-formula-ceph/issues
-
-For feature requests, bug reports or blueprints affecting entire ecosystem,
-use Launchpad salt-formulas project:
-
- https://launchpad.net/salt-formulas
-
-You can also join salt-formulas-users team and subscribe to mailing list:
-
- https://launchpad.net/~salt-formulas-users
-
-Developers wishing to work on the salt-formulas projects should always base
-their work on master branch and submit pull request against specific formula.
-
- https://github.com/salt-formulas/salt-formula-ceph
-
-Any questions or feedback is always welcome so feel free to join our IRC
-channel:
-
- #salt-formulas @ irc.freenode.net
diff --git a/ceph/common.sls b/ceph/common.sls
index c6067b0..b445355 100644
--- a/ceph/common.sls
+++ b/ceph/common.sls
@@ -25,8 +25,8 @@
common_config:
file.managed:
- name: {{ common.prefix_dir }}/etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf
- - user: ceph
- - group: ceph
+ - user: root
+ - group: root
- source: salt://ceph/files/{{ common.version }}/ceph.conf.{{ grains.os_family }}
- template: jinja
{% if not common.get('container_mode', False) %}
diff --git a/ceph/files/grafana_dashboards/ceph-cluster_prometheus.json b/ceph/files/grafana_dashboards/ceph-cluster_prometheus.json
new file mode 100644
index 0000000..2f54c74
--- /dev/null
+++ b/ceph/files/grafana_dashboards/ceph-cluster_prometheus.json
@@ -0,0 +1,1011 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Ceph cluster overview",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072771425,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": true,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 128, 45, 0.9)",
+ "rgba(237, 129, 40, 0.9)",
+ "rgb(255, 0, 0)"
+ ],
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 21,
+ "interval": "1m",
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "ceph_health_status{instance=~'$instance'}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "timeFrom": "1m",
+ "title": "Health Status",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "OK",
+ "value": "0"
+ },
+ {
+ "op": "=",
+ "text": "WARN",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "ERR",
+ "value": "2"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 47,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_osd_stat_bytes_used{instance=~\"$instance\"})/sum(ceph_osd_stat_bytes{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Used",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "70,80",
+ "title": "Capacity used",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 53,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Active",
+ "color": "#508642",
+ "fill": 1,
+ "stack": "A"
+ },
+ {
+ "alias": "Total",
+ "color": "#f9e2d2"
+ },
+ {
+ "alias": "Degraded",
+ "color": "#eab839"
+ },
+ {
+ "alias": "Undersized",
+ "color": "#f9934e"
+ },
+ {
+ "alias": "Inconsistent",
+ "color": "#e24d42"
+ },
+ {
+ "alias": "Down",
+ "color": "#bf1b00"
+ },
+ {
+ "alias": "Inactive",
+ "color": "#bf1b00",
+ "fill": 4,
+ "linewidth": 0,
+ "stack": "A"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_pg_total",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Total",
+ "refId": "A"
+ },
+ {
+ "expr": "ceph_pg_active",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Active",
+ "refId": "B"
+ },
+ {
+ "expr": "ceph_pg_total - ceph_pg_active",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inactive",
+ "refId": "G"
+ },
+ {
+ "expr": "ceph_pg_undersized",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Undersized",
+ "refId": "F"
+ },
+ {
+ "expr": "ceph_pg_degraded",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Degraded",
+ "refId": "C"
+ },
+ {
+ "expr": "ceph_pg_inconsistent",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inconsistent",
+ "refId": "D"
+ },
+ {
+ "expr": "ceph_pg_down",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Down",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PG States",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 66,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Avg Apply Latency",
+ "color": "#7eb26d"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "quantile(0.95, ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Apply Latency P_95",
+ "refId": "A"
+ },
+ {
+ "expr": "quantile(0.95, ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Commit Latency P_95",
+ "refId": "B"
+ },
+ {
+ "expr": "avg(ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Apply Latency",
+ "refId": "C"
+ },
+ {
+ "expr": "avg(ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Commit Latency",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 45,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 0.5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_op_w_in_bytes{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(irate(ceph_osd_op_r_out_bytes{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster I/O",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 62,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(deriv(ceph_pool_bytes_used{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Bytes",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "In-/Egress",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": " Egress (-) / Ingress (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "rgb(0, 254, 255)",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "min": null,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": null,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 55,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "span": 12,
+ "targets": [
+ {
+ "expr": "ceph_osd_stat_bytes_used{instance=~'$instance'} / ceph_osd_stat_bytes{instance=~'$instance'}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "1m",
+ "intervalFactor": 1,
+ "legendFormat": "Util (%)",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "timeFrom": null,
+ "title": "OSD Capacity Utilization",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": null,
+ "format": "percentunit",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": null,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 59,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg{instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "#PGs",
+ "refId": "A"
+ }
+ ],
+ "title": "PGs per OSD",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": null,
+ "format": "none",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 15
+ },
+ "id": 64,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_recovery_ops[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Op/s",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Recovery Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Recovery Ops/s",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "cluster"
+ ],
+ "templating": {
+ "list": [
+ {
+ "auto": true,
+ "auto_count": 10,
+ "auto_min": "1m",
+ "current": {
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ "datasource": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "Interval",
+ "multi": false,
+ "name": "interval",
+ "options": [
+ {
+ "selected": true,
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "refresh": 2,
+ "skipUrlSync": false,
+ "type": "interval"
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 0,
+ "hideLabel": false,
+ "includeAll": true,
+ "label": "Exporter Instance",
+ "multi": false,
+ "multiFormat": "glob",
+ "name": "instance",
+ "options": [],
+ "query": "label_values(ceph_health_status, instance)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Ceph - Cluster",
+ "version": 7
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/hosts-overview_prometheus.json b/ceph/files/grafana_dashboards/hosts-overview_prometheus.json
new file mode 100644
index 0000000..bdc6a90
--- /dev/null
+++ b/ceph/files/grafana_dashboards/hosts-overview_prometheus.json
@@ -0,0 +1,1068 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072569576,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 0,
+ "y": 0
+ },
+ "id": 26,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(ceph_mon_quorum_status)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Monitors",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 3,
+ "y": 0
+ },
+ "id": 24,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "Monitors down",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 6,
+ "y": 0
+ },
+ "id": 5,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "count(sum by (instance) (ceph_disk_occupation))",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "15s",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "OSD Hosts",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#890f02"
+ ],
+ "datasource": null,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 9,
+ "y": 0
+ },
+ "id": 22,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(ceph_osd_up) - sum(ceph_osd_up)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "OSDs down",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 4
+ },
+ "id": 30,
+ "panels": [],
+ "title": "",
+ "type": "row"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "description": "IOPS Load at the device as reported by the OS on all OSD hosts",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 0,
+ "y": 5
+ },
+ "id": 2,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "sum (irate(diskio_reads{host=~\"($osd_hosts).*\"}[5m]) + irate(diskio_writes{host=~\"($osd_hosts).*\"}[5m]))",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Physical IOPS",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 3,
+ "y": 5
+ },
+ "id": 9,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "avg ((mem_total{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"} - (\n mem_free{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"} + \n mem_cached{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"} + \n mem_buffered{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"} +\n mem_slab{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"}\n )) /\n mem_total{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"})",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG RAM Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)",
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 6,
+ "y": 5
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "avg (\n irate(diskio_io_time{host=~\"($osd_hosts).*\"}[5m]) / 10\n)",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG Disk Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "description": "Total send/receive network load across all hosts in the ceph cluster",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 9,
+ "y": 5
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "sum (\n irate(net_bytes_recv{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n ) +\nsum (\n irate(net_bytes_sent{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) \n )",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Network Load",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": null,
+ "decimals": 0,
+ "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 12,
+ "y": 5
+ },
+ "id": 6,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "Value",
+ "targets": [
+ {
+ "expr": "avg(\n 1-(cpu_usage_idle{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"}/100)\n )",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG CPU Busy",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 9
+ },
+ "id": 28,
+ "panels": [],
+ "title": "",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Top 10 hosts by network load",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 10
+ },
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10, (sum by(host) (\n (\n irate(net_bytes_recv{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n ) +\n (\n irate(net_bytes_sent{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n ))\n )\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network Load - Top 10",
+ "tooltip": {
+ "shared": true,
+ "sort": 1,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 1,
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Top 10 busiest hosts by CPU usage",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 10
+ },
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10,1-(cpu_usage_idle{host=~\"($osd_hosts|$mon_hosts|$rgw_hosts).*\"}/100))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Busy - Top 10 Hosts",
+ "tooltip": {
+ "shared": true,
+ "sort": 1,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 1,
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": false
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": "",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "osd_hosts",
+ "options": [],
+ "query": "label_values(ceph_disk_occupation, instance)",
+ "refresh": 1,
+ "regex": "([^.]*).*",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "ceph",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "mon_hosts",
+ "options": [],
+ "query": "label_values(ceph_mon_metadata, ceph_daemon)",
+ "refresh": 1,
+ "regex": "mon.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_hosts",
+ "options": [],
+ "query": "label_values(ceph_rgw_qlen, ceph_daemon)",
+ "refresh": 1,
+ "regex": "rgw.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph Hosts Overview",
+ "version": 23
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
similarity index 100%
rename from ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
rename to ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
diff --git a/ceph/files/grafana_dashboards/ceph_osd_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_osd_prometheus.json
similarity index 100%
rename from ceph/files/grafana_dashboards/ceph_osd_prometheus.json
rename to ceph/files/grafana_dashboards/legacy/ceph_osd_prometheus.json
diff --git a/ceph/files/grafana_dashboards/ceph_pools_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
similarity index 96%
rename from ceph/files/grafana_dashboards/ceph_pools_prometheus.json
rename to ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
index 74ca5de..a669b33 100644
--- a/ceph/files/grafana_dashboards/ceph_pools_prometheus.json
+++ b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
@@ -36,7 +36,7 @@
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null",
+ "nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@@ -158,7 +158,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(ceph_pool_usage_max_avail{name=\"$pool\"}) - avg(ceph_pool_usage_bytes_used{name=\"$pool\"})",
+ "expr": "avg(ceph_pool_usage_max_avail{name=\"$pool\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -178,7 +178,7 @@
"step": 60
},
{
- "expr": "avg(ceph_pool_usage_max_avail{name=\"$pool\"})",
+ "expr": "avg(ceph_pool_usage_max_avail{name=\"$pool\"}) + avg(ceph_pool_usage_bytes_used{name=\"$pool\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -284,7 +284,7 @@
"tableColumn": "",
"targets": [
{
- "expr": "avg(ceph_pool_usage_bytes_used{name=\"$pool\"}) / avg(ceph_pool_usage_max_avail{name=\"$pool\"})",
+ "expr": "avg(ceph_pool_usage_bytes_used{name=\"$pool\"}) / (avg(ceph_pool_usage_max_avail{name=\"$pool\"}) + avg(ceph_pool_usage_bytes_used{name=\"$pool\"}))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -437,7 +437,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))",
+ "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))) - 1",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -446,7 +446,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))",
+ "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))) - 1",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -530,7 +530,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m]))",
+ "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) - 1",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -539,7 +539,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m]))",
+ "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) - 1",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -618,7 +618,7 @@
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null",
+ "nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@@ -696,7 +696,7 @@
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null",
+ "nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
diff --git a/ceph/files/grafana_dashboards/osds-detail_prometheus.json b/ceph/files/grafana_dashboards/osds-detail_prometheus.json
new file mode 100644
index 0000000..b9b950b
--- /dev/null
+++ b/ceph/files/grafana_dashboards/osds-detail_prometheus.json
@@ -0,0 +1,747 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072619802,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 14,
+ "panels": [],
+ "title": "OSD Performance",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"osd.[[osd_id]]\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "READs",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"osd.[[osd_id]]\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "WRITEs",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD $osd_id Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 1
+ },
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD $osd_id R/W IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 1
+ },
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read Bytes",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write Bytes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD $osd_id R/W Bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "id": 12,
+ "panels": [],
+ "title": "Physical Device Performance",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 11
+ },
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(label_replace(irate(diskio_read_time[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") / label_replace(label_replace(irate(diskio_reads[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(label_replace(irate(diskio_write_time[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") / label_replace(label_replace(irate(diskio_writes[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Latency for OSD $osd_id",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 11
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(label_replace(irate(diskio_reads[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} READS",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(label_replace(irate(diskio_writes[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} WRITES",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W IOPS for OSD $osd_id",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 11
+ },
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(label_replace(irate(diskio_read_bytes[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} READS",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(label_replace(irate(diskio_write_bytes[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} WRITES",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W Bytes for OSD $osd_id",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 18,
+ "y": 11
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(label_replace(irate(diskio_io_time[1m]), \"instance\", \"$1\", \"host\", \"(.+)\"), \"device\", \"$1\", \"name\", \"(.+)\") and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Util% for OSD $osd_id",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "osd"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "0",
+ "value": "0"
+ },
+ "datasource": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "OSD Id",
+ "multi": false,
+ "name": "osd_id",
+ "options": [],
+ "query": "label_values(ceph_osd_metadata,ceph_daemon)",
+ "refresh": 1,
+ "regex": "osd.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph OSD device details",
+ "version": 8
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/osds-overview_prometheus.json b/ceph/files/grafana_dashboards/osds-overview_prometheus.json
new file mode 100644
index 0000000..c399fc0
--- /dev/null
+++ b/ceph/files/grafana_dashboards/osds-overview_prometheus.json
@@ -0,0 +1,632 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {
+ "@95%ile": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG read",
+ "refId": "A"
+ },
+ {
+ "expr": "max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX read",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Read Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": null,
+ "description": "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 15,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "ceph_daemon",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Highest READ Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": {
+ "@95%ile write": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 12,
+ "y": 0
+ },
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG write",
+ "refId": "A"
+ },
+ {
+ "expr": "max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX write",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile write",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Write Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": null,
+ "description": "This table shows the OSDs that are delivering the 10 highest write latencies within the cluster",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 16,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "ceph_daemon",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Highest WRITE Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Each bar indicates the number of OSDs that have a PG count in a specific range as shown on the x axis.",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "repeatDirection": "h",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "PGs per OSD",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Distribution of PGs per OSD",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "transparent": false,
+ "type": "graph",
+ "xaxis": {
+ "buckets": 20,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": "# of OSDs",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 15
+ },
+ "id": 20,
+ "panels": [],
+ "title": "R/W Profile",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Show the read/write workload profile over time",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 16
+ },
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(irate(ceph_pool_rd[1m])))",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(irate(ceph_pool_wr[1m])))",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": "36h",
+ "timeShift": null,
+ "title": "Read/Write Profile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "osd"
+ ],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph OSD Overview",
+ "version": 11
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/pool-overview_prometheus.json b/ceph/files/grafana_dashboards/pool-overview_prometheus.json
new file mode 100644
index 0000000..a58e6a2
--- /dev/null
+++ b/ceph/files/grafana_dashboards/pool-overview_prometheus.json
@@ -0,0 +1,730 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072836850,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 14,
+ "panels": [],
+ "repeat": null,
+ "title": "Pool Overview",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 5,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 1
+ },
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg((rate(ceph_pool_rd{pool_id=~\"[[pool_id]]\"}[1m]) + rate(ceph_pool_wr{pool_id=~\"[[pool_id]]\"}[1m])) * on(pool_id,instance) group_left(name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) without (instance)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{name}}",
+ "refId": "F"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Client IOPS by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 5,
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg((rate(ceph_pool_rd_bytes{pool_id=~\"[[pool_id]]\"}[1m]) + rate(ceph_pool_wr_bytes{pool_id=~\"[[pool_id]]\"}[1m])) * on(pool_id,instance) group_left(name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) without (instance)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}}",
+ "refId": "A",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Client Throughput by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 14
+ },
+ "id": 15,
+ "panels": [],
+ "repeat": null,
+ "title": "Top 5's",
+ "type": "row"
+ },
+ {
+ "columns": [
+ {
+ "text": "Current",
+ "value": "current"
+ }
+ ],
+ "datasource": null,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 15
+ },
+ "id": 3,
+ "links": [],
+ "minSpan": 12,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 6,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Time",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "id",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "IOPS (R+W)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(5,(label_replace((irate(ceph_pool_rd{pool_id=~\"[[pool_id]]\"}[1m]) + irate(ceph_pool_wr{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") * on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) )",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 2,
+ "refId": "A",
+ "textEditor": true
+ }
+ ],
+ "title": "Top 5 Pools by Client IOPS",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [
+ {
+ "text": "Current",
+ "value": "current"
+ }
+ ],
+ "datasource": null,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 15
+ },
+ "id": 4,
+ "links": [],
+ "minSpan": 12,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 6,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "id",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Throughput",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "decbytes"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(5,(label_replace((irate(ceph_pool_rd_bytes{pool_id=~\"[[pool_id]]\"}[1m]) + irate(ceph_pool_wr_bytes{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") * on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) )",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 2,
+ "refId": "A",
+ "textEditor": true
+ }
+ ],
+ "title": "Top 5 Pools by Throughput",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [],
+ "datasource": null,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 15
+ },
+ "id": 5,
+ "links": [],
+ "minSpan": 8,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 5,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Time",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Capacity Used",
+ "colorMode": "value",
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Value",
+ "thresholds": [
+ "70",
+ "85"
+ ],
+ "type": "number",
+ "unit": "percentunit"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(5,((ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)) * on(pool_id) group_left(name) ceph_pool_metadata))",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "D"
+ }
+ ],
+ "title": "Top 5 Pools By Capacity Used",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "pool"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "pool_id",
+ "options": [],
+ "query": "label_values(ceph_pool_metadata,pool_id)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": "Pool Name",
+ "multi": false,
+ "name": "pool_name",
+ "options": [],
+ "query": "label_values(ceph_pool_metadata,name)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Ceph Pools Overview",
+ "version": 11
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/radosgw-detail_prometheus.json b/ceph/files/grafana_dashboards/radosgw-detail_prometheus.json
new file mode 100644
index 0000000..fa95510
--- /dev/null
+++ b/ceph/files/grafana_dashboards/radosgw-detail_prometheus.json
@@ -0,0 +1,401 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072859805,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "panels": [],
+ "repeat": null,
+ "title": "RGW Host Detail : $rgw_servers",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 34,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[1m]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[1m]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT {{ceph_daemon}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$rgw_servers GET/PUT Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 6,
+ "y": 1
+ },
+ "id": 18,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_get_b{ceph_daemon=~\"[[rgw_servers]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_put_b{ceph_daemon=~\"[[rgw_servers]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth by HTTP Operation",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "GETs": "#7eb26d",
+ "Other": "#447ebc",
+ "PUTs": "#eab839",
+ "Requests": "#3f2b5b",
+ "Requests Failed": "#bf1b00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 13,
+ "y": 1
+ },
+ "id": 14,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Requests Failed {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "D"
+ },
+ {
+ "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[1m]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[1m]) +\n rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Other {{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "HTTP Request Breakdown",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "rgw"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [],
+ "query": "label_values(ceph_rgw_req, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph RGW Instance Detail",
+ "version": 12
+}
+{%- endraw %}
diff --git a/ceph/files/grafana_dashboards/radosgw-overview_prometheus.json b/ceph/files/grafana_dashboards/radosgw-overview_prometheus.json
new file mode 100644
index 0000000..1cafb7a
--- /dev/null
+++ b/ceph/files/grafana_dashboards/radosgw-overview_prometheus.json
@@ -0,0 +1,631 @@
+{%- raw %}
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1545072879350,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [],
+ "title": "RGW Overview - All Gateways",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 1
+ },
+ "id": 29,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(rate(ceph_rgw_get_initial_lat_sum[1m]) / rate(ceph_rgw_get_initial_lat_count[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET AVG",
+ "refId": "A"
+ },
+ {
+ "expr": "avg(rate(ceph_rgw_put_initial_lat_sum[1m]) / rate(ceph_rgw_put_initial_lat_count[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT AVG",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average GET/PUT Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 7,
+ "x": 8,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[1m]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Requests/sec by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Latencies are shown per RGW host to provide a visual indication of GET latency imbalance across RGW hosts",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 6,
+ "x": 15,
+ "y": 1
+ },
+ "id": 31,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[1m]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[1m]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "GET Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Total bytes transferred in/out of all radosgw instances within the cluster",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(ceph_rgw_get_b[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(ceph_rgw_put_b[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth Consumed by Type",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Total bytes transferred in/out through get/put operations, by radosgw instance",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 8,
+ "y": 8
+ },
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[1m]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n (label_replace(rate(ceph_rgw_put_b[1m]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 15,
+ "y": 8
+ },
+ "id": 32,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[1m]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[1m]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PUT Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": false
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "rgw"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": null,
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [],
+ "query": "label_values(ceph_rgw_req, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph RGW Overview",
+ "version": 13
+}
+{%- endraw %}
diff --git a/ceph/files/jewel/ceph.conf.Debian b/ceph/files/jewel/ceph.conf.Debian
index 351eae2..b2656ce 100644
--- a/ceph/files/jewel/ceph.conf.Debian
+++ b/ceph/files/jewel/ceph.conf.Debian
@@ -102,7 +102,7 @@
{%- if radosgw.identity.engine == 'keystone' %}
{%- set ident = radosgw.identity %}
rgw keystone api version = {{ ident.get('api_version', 3) }}
-rgw keystone url = {{ ident.host }}:{{ ident.get('port', '5000') }}
+rgw keystone url = {{ ident.get('protocol', 'https') }}://{{ ident.host }}:{{ ident.get('port', '5000') }}
rgw keystone accepted roles = {{ ident.get('accepted_roles', '_member_, Member, admin, swiftoperator') }}
rgw keystone revocation interval = {{ ident.get('revocation_interval', '1000000') }}
rgw keystone implicit tenants = {{ ident.get('implicit_tenants', 'false') }}
diff --git a/ceph/files/kraken/ceph.conf.Debian b/ceph/files/kraken/ceph.conf.Debian
index 351eae2..b2656ce 100644
--- a/ceph/files/kraken/ceph.conf.Debian
+++ b/ceph/files/kraken/ceph.conf.Debian
@@ -102,7 +102,7 @@
{%- if radosgw.identity.engine == 'keystone' %}
{%- set ident = radosgw.identity %}
rgw keystone api version = {{ ident.get('api_version', 3) }}
-rgw keystone url = {{ ident.host }}:{{ ident.get('port', '5000') }}
+rgw keystone url = {{ ident.get('protocol', 'https') }}://{{ ident.host }}:{{ ident.get('port', '5000') }}
rgw keystone accepted roles = {{ ident.get('accepted_roles', '_member_, Member, admin, swiftoperator') }}
rgw keystone revocation interval = {{ ident.get('revocation_interval', '1000000') }}
rgw keystone implicit tenants = {{ ident.get('implicit_tenants', 'false') }}
diff --git a/ceph/files/luminous/ceph.conf.Debian b/ceph/files/luminous/ceph.conf.Debian
index 351eae2..b2656ce 100644
--- a/ceph/files/luminous/ceph.conf.Debian
+++ b/ceph/files/luminous/ceph.conf.Debian
@@ -102,7 +102,7 @@
{%- if radosgw.identity.engine == 'keystone' %}
{%- set ident = radosgw.identity %}
rgw keystone api version = {{ ident.get('api_version', 3) }}
-rgw keystone url = {{ ident.host }}:{{ ident.get('port', '5000') }}
+rgw keystone url = {{ ident.get('protocol', 'https') }}://{{ ident.host }}:{{ ident.get('port', '5000') }}
rgw keystone accepted roles = {{ ident.get('accepted_roles', '_member_, Member, admin, swiftoperator') }}
rgw keystone revocation interval = {{ ident.get('revocation_interval', '1000000') }}
rgw keystone implicit tenants = {{ ident.get('implicit_tenants', 'false') }}
diff --git a/ceph/meta/fluentd.yml b/ceph/meta/fluentd.yml
new file mode 100644
index 0000000..8bc2794
--- /dev/null
+++ b/ceph/meta/fluentd.yml
@@ -0,0 +1,79 @@
+{%- if pillar.get('fluentd', {}).get('agent', {}).get('enabled', False) %}
+{%- set positiondb = pillar.fluentd.agent.dir.positiondb %}
+agent:
+ config:
+ label:
+ ceph:
+ input:
+ tail_ceph-osd:
+ type: tail
+ tag: ceph.osd
+ path: /var/log/ceph/ceph-osd*
+ path_key: log_location
+ pos_file: {{ positiondb }}/ceph.osd.pos
+ parser:
+ type: regexp
+ time_key: Timestamp
+ time_format: '%Y-%m-%d %H:%M:%S.%N'
+ keep_time_key: false
+ format: >-
+ '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+ tail_ceph-mon:
+ type: tail
+ tag: ceph.mon
+ path: /var/log/ceph/ceph-mon*, /var/log/ceph/ceph.log, /var/log/ceph/ceph.audit.log
+ path_key: log_location
+ pos_file: {{ positiondb }}/ceph.mon.pos
+ parser:
+ type: regexp
+ time_key: Timestamp
+ time_format: '%Y-%m-%d %H:%M:%S.%N'
+ keep_time_key: false
+ format: >-
+ '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+ tail_ceph-mgr:
+ type: tail
+ tag: ceph.mgr
+ path: /var/log/ceph/ceph-mgr*
+ path_key: log_location
+ pos_file: {{ positiondb }}/ceph.mgr.pos
+ parser:
+ type: regexp
+ time_key: Timestamp
+ time_format: '%Y-%m-%d %H:%M:%S.%N'
+ keep_time_key: false
+ format: >-
+ '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)/'
+ tail_radosgw:
+ type: tail
+ tag: ceph.radosgw
+ path: /var/log/ceph/ceph-rgw*
+ path_key: log_location
+ pos_file: {{ positiondb }}/ceph.radosgw.pos
+ parser:
+ type: regexp
+ time_key: Timestamp
+ time_format: '%Y-%m-%d %H:%M:%S.%N'
+ keep_time_key: false
+ format: >-
+ '/^(?<Timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) (?<Payload>.*)$/'
+
+ filter:
+ match_severity:
+ type: record_transformer
+ tag: ceph.**
+ enable_ruby: true
+ record:
+ - name: programname
+ value: ceph
+ - name: severity_label
+ value: INFO
+ - name: Severity
+ value: 6
+ match:
+ push_to_default:
+ tag: ceph.*
+ type: relabel
+ label: default_output
+
+{%- endif %}
diff --git a/ceph/meta/grafana.yml b/ceph/meta/grafana.yml
index bfa369d..f43431b 100644
--- a/ceph/meta/grafana.yml
+++ b/ceph/meta/grafana.yml
@@ -1,13 +1,48 @@
+{%- from "ceph/map.jinja" import common with context -%}
+
+{%- if common.version is defined %}
dashboard:
+ {%- if common.version in ['kraken', 'jewel'] %}
ceph_cluster_prometheus:
datasource: prometheus
format: json
- template: ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
+ template: ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
ceph_osd_prometheus:
datasource: prometheus
format: json
- template: ceph/files/grafana_dashboards/ceph_osd_prometheus.json
+ template: ceph/files/grafana_dashboards/legacy/ceph_osd_prometheus.json
ceph_pools_prometheus:
datasource: prometheus
format: json
- template: ceph/files/grafana_dashboards/ceph_pools_prometheus.json
+ template: ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
+ {%- else %}
+ ceph_cluster_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/ceph-cluster_prometheus.json
+ ceph_osd_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/osds-overview_prometheus.json
+ ceph_osd_details_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/osds-detail_prometheus.json
+ ceph_pools_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/pool-overview_prometheus.json
+ ceph_hosts_overview_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/hosts-overview_prometheus.json
+ ceph_radosgw_overview_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/radosgw-overview_prometheus.json
+ ceph_radosgw_detail_prometheus:
+ datasource: prometheus
+ format: json
+ template: ceph/files/grafana_dashboards/radosgw-detail_prometheus.json
+ {%- endif %}
+{%- endif %}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index 0afcfc2..86c8a47 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -1,7 +1,10 @@
-{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup with context %}
+{%- from "ceph/map.jinja" import common, mon, monitoring, setup with context -%}
-{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
-{% raw %}
+{%- if common.version is defined %}
+ {%- if common.version in ['kraken', 'jewel'] -%}
+
+ {%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
+ {%- raw %}
server:
alert:
CephClusterHealthMinor:
@@ -55,8 +58,8 @@
severity: warning
service: ceph
annotations:
- summary: "{%-endraw %}{{100*threshold}}{%- raw %}% of Ceph space is used"
- description: "{{ $value }} bytes of Ceph OSD space (>= {%-endraw %}{{100*threshold}}{%- raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+ summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>={% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
CephOsdSpaceUsageMajor:
{%- endraw %}
{%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
@@ -68,46 +71,20 @@
severity: major
service: ceph
annotations:
- summary: "{%-endraw %}{{100*threshold}}{%- raw %}% of Ceph space is used"
- description: "{{ $ value }} bytes of Ceph OSD space (>= {%-endraw %}{{100*threshold}}{%- raw %}%) is used for 3 minutes. For details, run 'ceph df'."
- CephServiceApplyLatencyTooHigh:
+ summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>={% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
{%- endraw %}
- {%- set threshold = monitoring.apply_latency_threshold|default('0.007')|float %}
- if: >-
- avg(ceph_apply_latency_sum) / avg(ceph_apply_latency_avgcount) > {{threshold}}
- {%- raw %}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "Ceph apply latency reached the limit of {%- endraw %}{{threshold}}{%- raw %}s"
- description: "The average Ceph apply latency is more than {%- endraw %}{{threshold}}{%- raw %} seconds for 3 minutes."
- CephServiceCommitLatencyTooHigh:
- {%- endraw %}
- {%- set threshold = monitoring.commit_latency_threshold|default('0.7')|float %}
- if: >-
- avg(ceph_commit_latency_sum) / avg(ceph_commitcycle_latency_avgcount) > {{threshold}}
- {%- raw %}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "Ceph commit latency reached the limit of {%- endraw %}{{threshold}}{%- raw %}s"
- description: "The average Ceph commit latency is more than {%- endraw %}{{threshold}}{%- raw %} seconds for 3 minutes."
-{% endraw %}
-{%- if setup.pool is defined %}
-{%- for pool_name, pool in setup.pool.iteritems() %}
- {%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
- {%- set monitoring_pool = monitoring.pool[pool_name] %}
- {%- else %}
- {%- set monitoring_pool = monitoring %}
- {%- endif %}
+ {%- if setup.pool is defined %}
+ {%- for pool_name, pool in setup.pool.items() %}
+ {%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
+ {%- set monitoring_pool = monitoring.pool[pool_name] %}
+ {%- else %}
+ {%- set monitoring_pool = monitoring %}
+ {%- endif %}
CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
- {%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
+ {%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{{pool_name}}"} / ceph_pool_usage_max_avail{name="{{pool_name}}"} > {{threshold}}
+ ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
for: 3m
labels:
severity: warning
@@ -115,10 +92,10 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
- CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageMinor:
- {%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageCritical:
+ {%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{{pool_name}}"} / ceph_pool_usage_max_avail{name="{{pool_name}}"} > {{threshold}}
+ ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
for: 3m
labels:
severity: minor
@@ -126,8 +103,9 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- if monitoring.cluster_stats.extra_alerts is defined and monitoring.cluster_stats.extra_alerts.get("enabled", False) %}
CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteOpsTooHigh:
- {%- set threshold = monitoring_pool.pool_write_ops_threshold|default('200')|float %}
+ {%- set threshold = monitoring_pool.pool_write_ops_threshold|default('200')|float %}
if: >-
ceph_pool_stats_write_op_per_sec{name="{{pool_name}}"} > {{threshold}}
for: 3m
@@ -138,7 +116,7 @@
summary: "{{threshold}} Ceph pool write operations per second"
description: "The number of Ceph {{pool_name}} pool write operations per second is {{threshold}} for 3 minutes."
CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteBytesTooHigh:
- {%- set threshold = monitoring_pool.pool_write_bytes_threshold|default('70000000')|float %}
+ {%- set threshold = monitoring_pool.pool_write_bytes_threshold|default('70000000')|float %}
if: >-
ceph_pool_stats_write_bytes_sec{name="{{pool_name}}"} > {{threshold}}
for: 3m
@@ -149,7 +127,7 @@
summary: "{{threshold}} Ceph pool write bytes per second"
description: "The number of Ceph {{pool_name}} pool write bytes per second is {{threshold}} for 3 minutes."
CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadOpsTooHigh:
- {%- set threshold = monitoring_pool.pool_read_ops_threshold|default('1000')|float %}
+ {%- set threshold = monitoring_pool.pool_read_ops_threshold|default('1000')|float %}
if: >-
ceph_pool_stats_read_op_per_sec{name="{{pool_name}}"} > {{threshold}}
for: 3m
@@ -160,7 +138,7 @@
summary: "{{threshold}} Ceph pool read operations per second"
description: "The number of Ceph {{pool_name}} pool read operations per second is {{threshold}} for 3 minutes."
CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadBytesTooHigh:
- {%- set threshold = monitoring_pool.pool_read_bytes_threshold|default('70000000')|float %}
+ {%- set threshold = monitoring_pool.pool_read_bytes_threshold|default('70000000')|float %}
if: >-
ceph_pool_stats_read_bytes_sec{name="{{pool_name}}"} > {{threshold}}
for: 3m
@@ -170,6 +148,133 @@
annotations:
summary: "{{threshold}} Ceph pool read bytes per second"
description: "The number of Ceph {{pool_name}} pool read bytes per second is {{threshold}} for 3 minutes."
-{%- endfor %}
-{%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- endif -%}
+
+ {%- else -%}
+
+ {%- if mon is defined and mon.get('enabled') %}
+ {%- raw %}
+server:
+ alert:
+ CephClusterHealthMinor:
+ if: >-
+ ceph_health_status == 1
+ for: 3m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "Ceph cluster health is WARNING"
+ description: "The Ceph cluster is in the WARNING state. For details, run 'ceph -s'."
+ CephClusterHealthCritical:
+ if: >-
+ ceph_health_status == 2
+ for: 3m
+ labels:
+ severity: critical
+ service: ceph
+ annotations:
+ summary: "Ceph cluster health is CRITICAL"
+ description: "The Ceph cluster is in the CRITICAL state. For details, run 'ceph -s'."
+ CephMonitorDownMinor:
+ if: >-
+ count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status) > 0
+ for: 3m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "Ceph Monitors are down"
+ description: "{{ $value }} of Ceph Monitors are down. For details, run 'ceph -s'."
+ CephOsdDownMinor:
+ if: >-
+ count(ceph_osd_up) - sum(ceph_osd_up) > 0
+ for: 3m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "Ceph OSDs are down"
+ description: "{{ $value }} of Ceph OSDs are down. For details, run 'ceph osd tree'."
+ CephOsdSpaceUsageWarning:
+ {%- endraw %}
+ {%- set threshold = monitoring.space_used_warning_threshold|default('0.75')|float %}
+ if: >-
+ ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
+ {%- raw %}
+ for: 3m
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>={% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+ CephOsdSpaceUsageMajor:
+ {%- endraw %}
+ {%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
+ if: >-
+ ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
+ {%- raw %}
+ for: 3m
+ labels:
+ severity: major
+ service: ceph
+ annotations:
+ summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>={% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+ {%- endraw %}
+ {%- if setup.pool is defined %}
+ {%- for pool_name, pool in setup.pool.items() %}
+ {%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
+ {%- set monitoring_pool = monitoring.pool[pool_name] %}
+ {%- else %}
+ {%- set monitoring_pool = monitoring %}
+ {%- endif %}
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
+ {%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
+ if: >-
+ ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "{{100*threshold}}% of Ceph pool space is used"
+ description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageCritical:
+ {%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
+ if: >-
+ ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
+ labels:
+ severity: critical
+ service: ceph
+ annotations:
+ summary: "{{100*threshold}}% of Ceph pool space is used"
+ description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- endfor %}
+ {%- endif -%}
+
+ {%- set fqdn_ip4_addresses = [] %}
+ {%- for addr in grains['fqdn_ip4'] %}
+ {%- if not addr.startswith('127.') %}
+ {%- do fqdn_ip4_addresses.append(addr) %}
+ {%- endif %}
+ {%- endfor %}
+ {%- set address = fqdn_ip4_addresses[0] %}
+ {%- if address is defined %}
+ target:
+ static:
+ ceph:
+ enabled: true
+ endpoint:
+ - address: {{ address }}
+ port: 9283
+ honor_labels: true
+ {%- endif %}
+ {%- endif %}
+ {%- endif %}
{%- endif %}
diff --git a/ceph/meta/telegraf.yml b/ceph/meta/telegraf.yml
index 0fbb00a..cbbe981 100644
--- a/ceph/meta/telegraf.yml
+++ b/ceph/meta/telegraf.yml
@@ -1,67 +1,70 @@
-{%- from "ceph/map.jinja" import mon, osd, monitoring with context %}
+{%- from "ceph/map.jinja" import common, mon, osd, monitoring with context -%}
-{%- if mon is defined and mon.get('enabled') %}
+{%- if common.version is defined %}
+ {%- if common.version in ['kraken', 'jewel'] -%}
+
+ {%- if mon is defined and mon.get('enabled') %}
remote_agent:
input:
ceph:
template: ceph/files/telegraf.conf
-{%- if monitoring.cluster_stats is defined %}
+ {%- if monitoring.cluster_stats is defined %}
ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('false') }}
gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('true') }}
gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('true') }}
-{%- if monitoring.cluster_stats.ceph_binary is defined %}
+ {%- if monitoring.cluster_stats.ceph_binary is defined %}
ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
-{%- endif %}
-{%- if monitoring.cluster_stats.socket_dir is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.socket_dir is defined %}
socket_dir: {{ monitoring.cluster_stats.socket_dir }}
-{%- endif %}
-{%- if monitoring.cluster_stats.mon_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.mon_prefix is defined %}
mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
-{%- endif %}
-{%- if monitoring.cluster_stats.osd_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.osd_prefix is defined %}
osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
-{%- endif %}
-{%- if monitoring.interval is defined %}
+ {%- endif %}
+ {%- if monitoring.interval is defined %}
interval: {{ monitoring.interval }}
-{%- endif %}
-{%- else %}
+ {%- endif %}
+ {%- else %}
ceph_user: client.admin
gather_admin_socket_stats: false
gather_cluster_stats: true
gather_pool_loads: true
-{%- endif %}
+ {%- endif %}
agent:
input:
ceph:
template: ceph/files/telegraf.conf
-{%- if monitoring.cluster_stats is defined %}
+ {%- if monitoring.cluster_stats is defined %}
ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('true') }}
gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('false') }}
gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('false') }}
-{%- if monitoring.cluster_stats.ceph_binary is defined %}
+ {%- if monitoring.cluster_stats.ceph_binary is defined %}
ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
-{%- endif %}
-{%- if monitoring.cluster_stats.socket_dir is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.socket_dir is defined %}
socket_dir: {{ monitoring.cluster_stats.socket_dir }}
-{%- endif %}
-{%- if monitoring.cluster_stats.mon_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.mon_prefix is defined %}
mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
-{%- endif %}
-{%- if monitoring.cluster_stats.osd_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.cluster_stats.osd_prefix is defined %}
osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
-{%- endif %}
-{%- if monitoring.interval is defined %}
+ {%- endif %}
+ {%- if monitoring.interval is defined %}
interval: {{ monitoring.interval }}
-{%- endif %}
-{%- else %}
+ {%- endif %}
+ {%- else %}
ceph_user: client.admin
gather_admin_socket_stats: true
gather_cluster_stats: false
gather_pool_loads: false
-{%- endif %}
-{%- elif monitoring.get('cluster_stats').get('enabled') %}
+ {%- endif %}
+ {%- elif monitoring.get('cluster_stats').get('enabled') %}
remote_agent:
input:
ceph:
@@ -70,33 +73,36 @@
gather_admin_socket_stats: false
gather_cluster_stats: true
gather_pool_loads: true
-{%- endif %}
+ {%- endif -%}
-{%- if osd is defined and osd.get('enabled') %}
+ {%- if osd is defined and osd.get('enabled') %}
agent:
input:
ceph:
template: ceph/files/telegraf.conf
fieldpass: [ "apply_latency*", "commitcycle_latency*", "op_latency*", "osdop_append", "osdop_delete", "osdop_read", "osdop_write", "recovery_ops" ]
-{%- if monitoring.node_stats is defined %}
+ {%- if monitoring.node_stats is defined %}
gather_admin_socket_stats: {{ monitoring.node_stats.gather_admin_socket_stats|default('true') }}
gather_cluster_stats: {{ monitoring.node_stats.gather_cluster_stats|default('false') }}
gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('false') }}
-{%- if monitoring.node_stats.socket_dir is defined %}
+ {%- if monitoring.node_stats.socket_dir is defined %}
socket_dir: {{ monitoring.node_stats.socket_dir }}
-{%- endif %}
-{%- if monitoring.node_stats.mon_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.node_stats.mon_prefix is defined %}
mon_prefix: {{ monitoring.node_stats.mon_prefix }}
-{%- endif %}
-{%- if monitoring.node_stats.osd_prefix is defined %}
+ {%- endif %}
+ {%- if monitoring.node_stats.osd_prefix is defined %}
osd_prefix: {{ monitoring.node_stats.osd_prefix }}
-{%- endif %}
-{%- if monitoring.interval is defined %}
+ {%- endif %}
+ {%- if monitoring.interval is defined %}
interval: {{ monitoring.interval }}
-{%- endif %}
-{%- else %}
+ {%- endif %}
+ {%- else %}
gather_admin_socket_stats: true
gather_cluster_stats: false
gather_pool_loads: false
-{%- endif %}
+ {%- endif %}
+ {%- endif -%}
+
+ {%- endif %}
{%- endif %}
diff --git a/ceph/mgr.sls b/ceph/mgr.sls
index bfc58b1..cda9856 100644
--- a/ceph/mgr.sls
+++ b/ceph/mgr.sls
@@ -83,7 +83,19 @@
disable_ceph_dashboard:
cmd.run:
- name: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module disable dashboard"
- - onlyif: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep dashboard"
+ - unless: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep dashboard"
+ - require:
+ - file: common_config
+ - file: /var/lib/ceph/mgr/{{ common.get('cluster_name', 'ceph') }}-{{ grains.host }}/
+
+{%- endif %}
+
+{%- if pillar.get('prometheus', {}).get('collector',{}).get("enabled", False) %}
+
+enable_prometheus_plugin:
+ cmd.run:
+ - name: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module enable prometheus"
+ - unless: "ceph -c /etc/ceph/{{ common.get('cluster_name', 'ceph') }}.conf mgr module ls | grep prometheus"
- require:
- file: common_config
- file: /var/lib/ceph/mgr/{{ common.get('cluster_name', 'ceph') }}-{{ grains.host }}/
@@ -92,4 +104,4 @@
{%- endif %}
-{%- endif %}
\ No newline at end of file
+{%- endif %}
diff --git a/ceph/osd/setup.sls b/ceph/osd/setup.sls
index b927881..852bfe5 100644
--- a/ceph/osd/setup.sls
+++ b/ceph/osd/setup.sls
@@ -24,7 +24,7 @@
{% set dev = disk.dev %}
# for uniqueness
-{% set dev_device = dev + disk.get('data_partition', 1)|string %}
+{% set dev_device = dev + disk.get('data_partition_prefix', '') + disk.get('data_partition', 1)|string %}
#{{ dev }}{{ disk.get('data_partition', 1) }}
diff --git a/debian/control b/debian/control
index e952b46..17cfb41 100644
--- a/debian/control
+++ b/debian/control
@@ -1,12 +1,12 @@
Source: salt-formula-ceph
-Maintainer: Jakub Pavlik <jakub.pavlik@tcpcloud.eu>
+Maintainer: Mirantis Dev <dev@mirantis.com>
Section: admin
Priority: optional
Build-Depends: salt-master, python, python-yaml, debhelper (>= 9)
Standards-Version: 3.9.6
-Homepage: http://www.tcpcloud.eu
-Vcs-Browser: https://github.com/tcpcloud/salt-formula-ceph
-Vcs-Git: https://github.com/tcpcloud/salt-formula-ceph.git
+Homepage: https://www.mirantis.com
+Vcs-Browser: https://gerrit.mcp.mirantis.com/#/admin/projects/salt-formulas/ceph
+Vcs-Git: https://gerrit.mcp.mirantis.com/salt-formulas/ceph.git
Package: salt-formula-ceph
Architecture: all
diff --git a/debian/copyright b/debian/copyright
index a7f2a6c..80fbb95 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,12 +1,12 @@
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: salt-formula-ceph
-Upstream-Contact: Jakub Pavlik <jakub.pavlik@tcpcloud.eu>
-Source: https://github.com/tcpcloud/salt-formula-ceph
+Upstream-Contact: Mirantis Dev <dev@mirantis.com>
+Source: https://gerrit.mcp.mirantis.com/#/admin/projects/salt-formulas/ceph
Files: *
-Copyright: 2014-2015 tcp cloud
+Copyright: 2014-2019 Mirantis Inc. et al
License: Apache-2.0
- Copyright (C) 2014-2015 tcp cloud
+ Copyright (C) 2014-2019 Mirantis Inc. et al
.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/metadata.yml b/metadata.yml
index e4d8b71..a8aeed0 100644
--- a/metadata.yml
+++ b/metadata.yml
@@ -1,3 +1,3 @@
name: "ceph"
version: "0.2"
-source: "https://github.com/salt-formulas/salt-formula-ceph"
+source: "https://gerrit.mcp.mirantis.com/salt-formulas/ceph"
diff --git a/metadata/service/monitoring/cluster_stats.yml b/metadata/service/monitoring/cluster_stats.yml
index 85ae108..fdca597 100644
--- a/metadata/service/monitoring/cluster_stats.yml
+++ b/metadata/service/monitoring/cluster_stats.yml
@@ -8,5 +8,7 @@
ceph:
monitoring:
cluster_stats:
+ extra_alerts:
+ enabled: false
enabled: true
ceph_user: ${_param:ceph_monitoring_user}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 2be3736..5c87d50 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -15,3 +15,5 @@
enabled: true
grafana:
enabled: true
+ fluentd:
+ enabled: true