Adjusted SMART alerts
- Added an alert for the Offline_Uncorrectable SMART parameter.
- Changed severities
- Removed the increase() function from some SMART alerts
Change-Id: Id18313ca80d178ff854c5f07dccefbe78c3342c9
Closes-Bug: PROD-31330
Related-Bug: PROD-31379
(cherry picked from commit 8563bb8cc1a34ddb6c7b5097babc005e5f6d3af7)
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index f3049fa..03e0cca 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -154,36 +154,45 @@
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes."
SystemSMARTDiskReallocatedSectorsCount:
if: >-
- increase(smart_attribute_raw_value{name="Reallocated_Sector_Ct"}[10m]) > 0
+ smart_attribute_raw_value{name="Reallocated_Sector_Ct"} > 0
labels:
- severity: minor
+ severity: major
service: system
annotations:
summary: "The {{ $labels.device }} disk has reallocated sectors"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has reallocated {{ $value }} sectors."
SystemSMARTDiskCurrentPendingSectors:
if: >-
- increase(smart_attribute_raw_value{name="Current_Pending_Sector"}[10m]) > 0
+ smart_attribute_raw_value{name="Current_Pending_Sector"} > 0
labels:
- severity: minor
+ severity: major
service: system
annotations:
summary: "The {{ $labels.device }} disk has current pending sectors"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'current pending' sectors."
SystemSMARTDiskReportedUncorrectableErrors:
if: >-
- increase(smart_attribute_raw_value{name="Reported_Uncorrect"}[10m]) > 0
+ smart_attribute_raw_value{name="Reported_Uncorrect"} > 0
labels:
- severity: minor
+ severity: major
service: system
annotations:
summary: "The {{ $labels.device }} disk has reported uncorrectable errors"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'reported uncorrectable' errors."
+ SystemSMARTDiskOfflineUncorrectableSectors:
+ if: >-
+ smart_attribute_raw_value{name="Offline_Uncorrectable"} > 0
+ labels:
+ severity: major
+ service: system
+ annotations:
+ summary: "The {{ $labels.device }} disk has offline uncorrectable sectors"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'offline uncorrectable' sectors."
SystemSMARTDiskEndToEndError:
if: >-
- increase(smart_attribute_raw_value{name="End-to-End_Error"}[10m]) > 0
+ smart_attribute_raw_value{name="End-to-End_Error"} > 0
labels:
- severity: minor
+ severity: major
service: system
annotations:
summary: "The {{ $labels.device }} disk has end-to-end errors"