Updated SMART disk prometheus alerts
- added new alerts regarding SMART disk monitoring
- set the attributes parameter to true by default so that raw SMART attribute
data is exported as metrics.
Change-Id: I8ba0dcc824daf5eee68033db7e9f3a14cc37501e
Related-Bug: PROD-27914
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 42d0541..ecf28c3 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -152,6 +152,42 @@
annotations:
summary: "The {{ $labels.device }} disk temperature is high"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes."
+ SystemSMARTDiskReallocatedSectorsCount:  # fires when the Reallocated_Sector_Ct raw value grew within the last 10 minutes
+ if: >-
+ increase(smart_attribute_raw_value{name="Reallocated_Sector_Ct"}[10m]) > 0
+ labels:
+ severity: minor
+ service: system
+ annotations:  # $value below is the result of the "if" expression, i.e. the 10-minute increase, not the attribute's total
+ summary: "The {{ $labels.device }} disk has reallocated sectors"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has reallocated {{ $value }} sectors."
+ SystemSMARTDiskCurrentPendingSectors:  # fires when the Current_Pending_Sector raw value rose within the last 10 minutes
+ if: >-  # delta(), not increase(): this attribute can also go down, and increase() would count a drop as a counter reset
+ delta(smart_attribute_raw_value{name="Current_Pending_Sector"}[10m]) > 0
+ labels:
+ severity: minor
+ service: system
+ annotations:  # $value below is the result of the "if" expression, i.e. the 10-minute rise, not the attribute's total
+ summary: "The {{ $labels.device }} disk has current pending sectors"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'current pending' sectors."
+ SystemSMARTDiskReportedUncorrectableErrors:  # fires when the Reported_Uncorrect raw value grew within the last 10 minutes
+ if: >-
+ increase(smart_attribute_raw_value{name="Reported_Uncorrect"}[10m]) > 0
+ labels:
+ severity: minor
+ service: system
+ annotations:  # $value below is the result of the "if" expression, i.e. the 10-minute increase, not the attribute's total
+ summary: "The {{ $labels.device }} disk has reported uncorrectable errors"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'reported uncorrectable' errors."
+ SystemSMARTDiskEndToEndError:  # fires when the End-to-End_Error raw value grew within the last 10 minutes
+ if: >-
+ increase(smart_attribute_raw_value{name="End-to-End_Error"}[10m]) > 0
+ labels:
+ severity: minor
+ service: system
+ annotations:  # $value below is the result of the "if" expression, i.e. the 10-minute increase, not the attribute's total
+ summary: "The {{ $labels.device }} disk has end-to-end errors"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'end-to-end' errors."
{%- endraw %}
{%- endif %}
SystemMemoryFullWarning:
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 934824a..9a25b69 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -5,6 +5,7 @@
smart:
template: linux/files/smart_telegraf.conf
path: "/usr/sbin/smartctl"
+ attributes: true  # enabled by default so that raw SMART attribute data is exported as metrics
{%- endif %}
cpu:
percpu: false