Merge pull request #33 from elemoine/stacklight-alarm
Add more alarms
diff --git a/linux/meta/heka.yml b/linux/meta/heka.yml
index 0c4a43c..f63f099 100644
--- a/linux/meta/heka.yml
+++ b/linux/meta/heka.yml
@@ -54,9 +54,145 @@
window: 120
periods: 0
function: avg
+ linux_system_swap_usage_critical:
+ description: 'There is no more swap free space'
+ severity: critical
+ rules:
+ - metric: swap_free
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: max  # max == 0 holds only if no sample in the window is above 0
+ linux_system_swap_activity_warning:
+ description: 'The swap activity is high'
+ severity: warning
+ rules:  # NOTE(review): two rules; how they combine (AND/OR) is not visible here - confirm
+ - metric: swap_io_in
+ relational_operator: '>='
+ threshold: 1048576  # 1024 * 1024, i.e. 1 MiB/s assuming the metric is in bytes/s
+ window: 120
+ periods: 0
+ function: avg
+ - metric: swap_io_out
+ relational_operator: '>='
+ threshold: 1048576  # 1024 * 1024, i.e. 1 MiB/s assuming the metric is in bytes/s
+ window: 120
+ periods: 0
+ function: avg
+ linux_system_swap_usage_warning:
+ description: 'The swap free space is low'
+ severity: warning
+ rules:
+ - metric: swap_percent_used  # description speaks of free space; the rule checks the used share
+ relational_operator: '>='
+ threshold: 0.8  # NOTE(review): implies a 0-1 scale; fs_space_percent_free uses 10 and 5 - confirm
+ window: 60
+ periods: 0
+ function: avg
+ linux_system_root_fs_warning:
+ description: "The root filesystem's free space is low"
+ severity: warning
+ rules:
+ - metric: fs_space_percent_free
+ field:
+ fs: '/'  # the root filesystem named in the description
+ relational_operator: '<'
+ threshold: 10  # warning level; linux_system_root_fs_critical uses 5
+ window: 60
+ periods: 0
+ function: min  # min < threshold: one low sample in the window is enough
+ linux_system_root_fs_critical:
+ description: "The root filesystem's free space is too low"
+ severity: critical
+ rules:
+ - metric: fs_space_percent_free
+ field:
+ fs: '/'  # the root filesystem named in the description
+ relational_operator: '<'
+ threshold: 5  # critical level; linux_system_root_fs_warning uses 10
+ window: 60
+ periods: 0
+ function: min  # min < threshold: one low sample in the window is enough
+ linux_system_network_warning_dropped_rx:
+ description: 'Some received packets have been dropped'
+ severity: warning
+ rules:
+ - metric: if_dropped_rx
+ relational_operator: '>'
+ threshold: 100  # warning level; the critical rx trigger uses 1000
+ window: 60
+ periods: 0
+ function: avg  # presumably the average over the window is compared - confirm
+ linux_system_network_critical_dropped_rx:
+ description: 'Too many received packets have been dropped'
+ severity: critical
+ rules:
+ - metric: if_dropped_rx
+ relational_operator: '>'
+ threshold: 1000  # critical level; the warning rx trigger uses 100
+ window: 60
+ periods: 0
+ function: avg  # presumably the average over the window is compared - confirm
+ linux_system_network_warning_dropped_tx:
+ description: 'Some transmitted packets have been dropped'
+ severity: warning
+ rules:
+ - metric: if_dropped_tx
+ relational_operator: '>'
+ threshold: 100  # warning level; the critical tx trigger uses 1000
+ window: 60
+ periods: 0
+ function: avg  # presumably the average over the window is compared - confirm
+ linux_system_network_critical_dropped_tx:
+ description: 'Too many transmitted packets have been dropped'
+ severity: critical
+ rules:
+ - metric: if_dropped_tx
+ relational_operator: '>'
+ threshold: 1000  # critical level; the warning tx trigger uses 100
+ window: 60  # NOTE(review): sibling rules also set `periods: 0`; omitted here - confirm the default
+ function: avg
+ linux_system_hdd_errors_critical:
+ description: 'Errors on hard drive(s) have been detected'
+ severity: critical
+ no_data_policy: okay  # presumably missing data is treated as OK rather than as a failure - confirm
+ rules:
+ - metric: hdd_errors_rate
+ group_by: [device]  # presumably evaluated separately per device - confirm
+ relational_operator: '>'
+ threshold: 0
+ window: 60
+ periods: 0
+ function: max  # max > 0: a single positive sample in the window matches
alarm:
linux_system_cpu:
alerting: enabled
triggers:
- linux_system_cpu_warning
- linux_system_cpu_critical
+ linux_system_swap:
+ alerting: enabled
+ triggers:  # swap usage (critical, warning) plus swap activity (warning)
+ - linux_system_swap_usage_critical
+ - linux_system_swap_activity_warning
+ - linux_system_swap_usage_warning
+ linux_system_root_fs:
+ alerting: enabled
+ triggers:  # root filesystem free-space triggers, critical listed first
+ - linux_system_root_fs_critical
+ - linux_system_root_fs_warning
+ linux_system_network_rx:
+ alerting: enabled
+ triggers:  # dropped received packets, critical listed first
+ - linux_system_network_critical_dropped_rx
+ - linux_system_network_warning_dropped_rx
+ linux_system_network_tx:
+ alerting: enabled
+ triggers:  # dropped transmitted packets, critical listed first
+ - linux_system_network_critical_dropped_tx
+ - linux_system_network_warning_dropped_tx
+ linux_system_hdd_errors:
+ alerting: enabled_with_notification  # differs from the plain `enabled` used by the alarms above
+ triggers:
+ - linux_system_hdd_errors_critical