Merge "Remove SMART alerts"
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 5a0941c..47abd64 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -126,107 +126,6 @@
annotations:
summary: "Disk {{ $labels.device }} is failing"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
- {%- endraw %}
- {%- if grains.get('virtual', None) == 'physical' %}
- {%- raw %}
- SystemSMARTDiskUDMACrcErrorsTooHigh:
- if: >-
- increase(smart_device_udma_crc_errors[1m]) > 0
- for: 5m
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has UDMA CRC errors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting UDMA CRC errors for 5 minutes."
- SystemSMARTDiskHealthStatus:
- if: >-
- smart_device_health_ok == 0
- for: 1m
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has bad health"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting a bad health status for 1 minute."
- SystemSMARTDiskReadErrorRate:
- if: >-
- increase(smart_device_read_error_rate[1m]) > 0
- for: 5m
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has read errors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased read error rate for 5 minutes."
- SystemSMARTDiskSeekErrorRate:
- if: >-
- increase(smart_device_seek_error_rate[1m]) > 0
- for: 5m
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has seek errors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased seek error rate for 5 minutes."
- SystemSMARTDiskTemperatureHigh:
- if: >-
- smart_device_temp_c >= 60
- for: 5m
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk temperature is high"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes."
- SystemSMARTDiskReallocatedSectorsCount:
- if: >-
- smart_attribute_raw_value{name="Reallocated_Sector_Ct"} > 10
- labels:
- severity: warning
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has reallocated sectors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has reallocated {{ $value }} sectors."
- SystemSMARTDiskCurrentPendingSectors:
- if: >-
- smart_attribute_raw_value{name="Current_Pending_Sector"} > 0
- labels:
- severity: major
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has current pending sectors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'current pending' sectors."
- SystemSMARTDiskReportedUncorrectableErrors:
- if: >-
- smart_attribute_raw_value{name="Reported_Uncorrect"} > 0
- labels:
- severity: major
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has reported uncorrectable errors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'reported uncorrectable' errors."
- SystemSMARTDiskOfflineUncorrectableSectors:
- if: >-
- smart_attribute_raw_value{name="Offline_Uncorrectable"} > 0
- labels:
- severity: major
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has offline uncorrectable sectors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'offline uncorrectable' sectors."
- SystemSMARTDiskEndToEndError:
- if: >-
- smart_attribute_raw_value{name="End-to-End_Error"} > 0
- labels:
- severity: major
- service: system
- annotations:
- summary: "The {{ $labels.device }} disk has end-to-end errors"
- description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'end-to-end' errors."
- {%- endraw %}
- {%- endif %}
- {%- raw %}
SystemMemoryFullWarning:
if: >-
mem_used_percent > 90 and mem_available < 8 * 2^30