Merge "Fix for lvm_filters in case if storage.lvm is not defined."
diff --git a/README.rst b/README.rst
index 4d66e8a..9a9b344 100644
--- a/README.rst
+++ b/README.rst
@@ -267,6 +267,21 @@
- '!SUPPORT_SHELLS'
- '!SUPPORT_RESTRICTED'
+Set the SSD I/O scheduler on physical nodes. The default values are
+``enabled: false`` and ``name: deadline``; the udev rule is installed
+only when ``enabled`` is set to ``true``, and only on physical nodes.
+For example, to switch to the ``cfq`` scheduler:
+
+.. code-block:: yaml
+
+    linux:
+      system:
+        ...
+        ssd_scheduler:
+          enabled: true
+          name: cfq
+        ...
+
Linux with package, latest version:
.. code-block:: yaml
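Once the rule has fired on a physical node, the active scheduler is visible in sysfs. A quick sanity check (illustrative: assumes a disk named ``sda`` and a kernel offering the classic schedulers; the bracketed entry is the one in effect):

.. code-block:: console

    $ cat /sys/block/sda/queue/scheduler
    noop [deadline] cfq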
diff --git a/linux/files/60-ssd-scheduler.rules b/linux/files/60-ssd-scheduler.rules
new file mode 100644
index 0000000..e4208ee
--- /dev/null
+++ b/linux/files/60-ssd-scheduler.rules
@@ -0,0 +1,3 @@
+{%- from "linux/map.jinja" import system with context %}
+# set the configured I/O scheduler for non-rotating disks
+ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="{{ system.get('ssd_scheduler', {}).get('name','deadline') }}"
\ No newline at end of file
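The rule file is a Jinja template rendered by Salt; with the ``linux/map.jinja`` defaults added below and no pillar override, it renders to the following plain udev rule (a sketch of the expected output). Note that ``KERNEL=="sd[a-z]"`` matches single-letter devices only, i.e. ``sda`` through ``sdz``:

.. code-block:: none

    # set the configured I/O scheduler for non-rotating disks
    ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="deadline"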
diff --git a/linux/map.jinja b/linux/map.jinja
index 4250ef2..9068aa2 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -27,6 +27,10 @@
            'outfile': '/var/log/atop/daily.log'
        },
        'sosreport': {},
+        'ssd_scheduler': {
+            'enabled': false,
+            'name': 'deadline',
+        },
    },
    'Debian': {
        'pkgs': ['python-apt', 'apt-transport-https', 'libmnl0'],
@@ -60,6 +64,10 @@
            'outfile': '/var/log/atop/daily.log'
        },
        'sosreport': {},
+        'ssd_scheduler': {
+            'enabled': false,
+            'name': 'deadline',
+        },
    },
    'RedHat': {
        'pkgs': ['policycoreutils', 'policycoreutils-python', 'telnet', 'wget'],
@@ -88,7 +96,11 @@
            'logpath': '/var/log/atop',
            'outfile': '/var/log/atop/daily.log'
        },
-        'sosreport': {}
+        'sosreport': {},
+        'ssd_scheduler': {
+            'enabled': false,
+            'name': 'deadline',
+        },
    },
}, grain='os_family', merge=salt['pillar.get']('linux:system')) %}
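Because the map is built with ``merge=salt['pillar.get']('linux:system')``, any pillar data overrides these per-``os_family`` defaults. For example, a pillar like the following (illustrative, mirroring the README example) enables the feature and swaps the scheduler:

.. code-block:: yaml

    linux:
      system:
        ssd_scheduler:
          enabled: true
          name: cfq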
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index a387b16..5fe2f05 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,6 +1,28 @@
{%- from "linux/map.jinja" import monitoring, network with context %}
server:
  alert:
+    {%- raw %}
+    SystemCpuIoWaitWarning:
+      if: >-
+        cpu_usage_iowait > 40
+      for: 10m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "CPU waited for I/O 40% of the time"
+        description: "The CPU on the {{ $labels.host }} node spent more than 40% of its time waiting for I/O during the last 10 minutes."
+    SystemCpuIoWaitCritical:
+      if: >-
+        cpu_usage_iowait > 50
+      for: 10m
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "CPU waited for I/O 50% of the time"
+        description: "The CPU on the {{ $labels.host }} node spent more than 50% of its time waiting for I/O during the last 10 minutes."
+    {%- endraw %}
    {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
    {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
    SystemCpuStealTimeWarning:
@@ -126,6 +148,46 @@
      annotations:
        summary: "Disk {{ $labels.device }} is failing"
        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
+    SystemDiskBacklogWarning:
+      if: >-
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 10
+      for: 10m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} backlog warning"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded a concurrency level of 10 during the last 10 minutes."
+    SystemDiskBacklogCritical:
+      if: >-
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 20
+      for: 10m
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} backlog critical"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded a concurrency level of 20 during the last 10 minutes."
+    SystemDiskRequestQueuedWarning:
+      if: >-
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.9
+      for: 10m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for 90% of the time"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent 90% of the device time in the queue during the last 10 minutes."
+    SystemDiskRequestQueuedCritical:
+      if: >-
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.98
+      for: 10m
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for 98% of the time"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent 98% of the device time in the queue during the last 10 minutes."
    SystemMemoryFullWarning:
      if: >-
        mem_used_percent > 90 and mem_available < 8 * 2^30
@@ -280,7 +342,7 @@
description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
BondInterfaceSlaveDownMajor:
if: >-
- sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
+ sum(bond_slave_status) by (bond,host) <= 0.5 * count(bond_slave_status) by (bond,host)
labels:
severity: major
service: system
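Two notes on the expressions above. Telegraf's ``diskio_weighted_io_time`` counter is reported in milliseconds, so its per-second rate divided by 1000 approximates the average number of requests in flight; the backlog alerts therefore fire at an average queue depth of 10 and 20. A rough worked example (numbers illustrative):

.. code-block:: none

    rate(diskio_weighted_io_time{name="sda"}[1m]) / 1000
    # 15000 ms of weighted I/O time accrued per second
    # -> 15000 / 1000 = 15 requests in flight on average
    # -> above the warning threshold of 10, below the critical threshold of 20

As for the bond alert fix: the right-hand side previously used ``count(bond_slave_status)`` without a ``by`` clause, which aggregates away all labels, so the ``on (bond,host)`` match could never pair the two vectors; grouping both sides ``by (bond,host)`` makes the per-bond comparison well-formed.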
diff --git a/linux/system/init.sls b/linux/system/init.sls
index e2e3ca7..bc9b20d 100644
--- a/linux/system/init.sls
+++ b/linux/system/init.sls
@@ -5,6 +5,9 @@
- linux.system.profile
- linux.system.shell
- linux.system.motd_news
+{%- if system.get('ssd_scheduler', {}).get('enabled', false) and grains.virtual == 'physical' %}
+- linux.system.ssd_scheduler
+{%- endif %}
{%- if system.login_defs is defined %}
- linux.system.login_defs
{%- endif %}
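The include is guarded twice: by the pillar flag and by ``grains.virtual == 'physical'``, so virtual machines (where the grain reports the hypervisor, e.g. ``kvm`` or ``xen``) never receive the rule. What a minion reports can be checked as follows (output illustrative):

.. code-block:: console

    $ salt-call grains.get virtual
    local:
        physical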
diff --git a/linux/system/ssd_scheduler.sls b/linux/system/ssd_scheduler.sls
new file mode 100644
index 0000000..fd1292e
--- /dev/null
+++ b/linux/system/ssd_scheduler.sls
@@ -0,0 +1,21 @@
+{%- from "linux/map.jinja" import system with context %}
+
+{%- if system.ssd_scheduler.enabled %}
+
+create_ssd_scheduler_udev_rule:
+  file.managed:
+    - name: /etc/udev/rules.d/60-ssd-scheduler.rules
+    - source: salt://linux/files/60-ssd-scheduler.rules
+    - user: root
+    - group: root
+    - mode: 0644
+    - makedirs: true
+    - template: jinja
+
+trigger_ssd_scheduler_udev_rule:
+  cmd.run:
+    - name: /bin/udevadm trigger -a queue/scheduler
+    - onchanges:
+      - file: /etc/udev/rules.d/60-ssd-scheduler.rules
+
+{%- endif %}
\ No newline at end of file
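The state pairs a managed template with a one-shot trigger: ``cmd.run`` fires only ``onchanges`` of the rule file, so udev re-evaluates ``queue/scheduler`` once per change rather than on every highstate. To apply just this state (target expression illustrative):

.. code-block:: console

    $ salt 'cmp*' state.apply linux.system.ssd_scheduler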
diff --git a/tests/pillar/network.sls b/tests/pillar/network.sls
index 840fd84..c665119 100644
--- a/tests/pillar/network.sls
+++ b/tests/pillar/network.sls
@@ -3,6 +3,9 @@
    enabled: true
    domain: ci.local
    name: linux.ci.local
+    ssd_scheduler:
+      enabled: false
+      name: cfq
  network:
    enabled: true
    hostname: linux
diff --git a/tests/pillar/network_extended.sls b/tests/pillar/network_extended.sls
index e0e428d..f6703df 100644
--- a/tests/pillar/network_extended.sls
+++ b/tests/pillar/network_extended.sls
@@ -3,6 +3,9 @@
    enabled: true
    domain: ci.local
    name: linux
+    ssd_scheduler:
+      enabled: false
+      name: cfq
  network:
    enabled: true
    hostname: linux
diff --git a/tests/pillar/network_openvswitch.sls b/tests/pillar/network_openvswitch.sls
index dae30e7..5342ea7 100644
--- a/tests/pillar/network_openvswitch.sls
+++ b/tests/pillar/network_openvswitch.sls
@@ -3,6 +3,9 @@
    enabled: true
    domain: local
    name: linux
+    ssd_scheduler:
+      enabled: false
+      name: cfq
  network:
    enabled: true
    hostname: test01
diff --git a/tests/pillar/network_openvswitch_dpdk.sls b/tests/pillar/network_openvswitch_dpdk.sls
index 2e85df7..90ca061 100644
--- a/tests/pillar/network_openvswitch_dpdk.sls
+++ b/tests/pillar/network_openvswitch_dpdk.sls
@@ -3,6 +3,9 @@
    enabled: true
    domain: local
    name: linux
+    ssd_scheduler:
+      enabled: false
+      name: cfq
  network:
    enabled: true
    hostname: test01
diff --git a/tests/pillar/storage.sls b/tests/pillar/storage.sls
index 2e9a7fc..f3e2ac4 100644
--- a/tests/pillar/storage.sls
+++ b/tests/pillar/storage.sls
@@ -3,6 +3,9 @@
    enabled: true
    name: linux
    domain: local
+    ssd_scheduler:
+      enabled: false
+      name: cfq
  network:
    enabled: true
    hostname: linux
diff --git a/tests/pillar/system.sls b/tests/pillar/system.sls
index 9972318..26b15a2 100644
--- a/tests/pillar/system.sls
+++ b/tests/pillar/system.sls
@@ -499,3 +499,6 @@
    logging:
      syslog: true
      syslog_error: true
+    ssd_scheduler:
+      enabled: false
+      name: cfq
diff --git a/tests/pillar/system_duo.sls b/tests/pillar/system_duo.sls
index 89a405a..e0cfec8 100644
--- a/tests/pillar/system_duo.sls
+++ b/tests/pillar/system_duo.sls
@@ -160,4 +160,6 @@
      duo_host: localhost
      duo_ikey: DUO-INTEGRATION-KEY
      duo_skey: DUO-SECRET-KEY
-
+    ssd_scheduler:
+      enabled: false
+      name: cfq
diff --git a/tests/pillar/system_extra.sls b/tests/pillar/system_extra.sls
index ebcc42a..ff1dfcc 100644
--- a/tests/pillar/system_extra.sls
+++ b/tests/pillar/system_extra.sls
@@ -48,3 +48,6 @@
        action: exit 101
      - package: '*'
        action: switch
+    ssd_scheduler:
+      enabled: false
+      name: cfq
\ No newline at end of file