initial commit
diff --git a/system/heka/alarm/openstack_compute.yml b/system/heka/alarm/openstack_compute.yml
new file mode 100644
index 0000000..d4fda7d
--- /dev/null
+++ b/system/heka/alarm/openstack_compute.yml
@@ -0,0 +1,90 @@
+parameters:
+ heka:
+ metric_collector:
+ trigger:
+ # Override the linux_system_cpu_critical and linux_system_cpu_warning
+ # triggers to use specific rules on compute nodes
+ linux_system_cpu_critical:  # critical-severity CPU trigger for compute nodes
+ description: 'The CPU usage is too high (compute node)'
+ severity: critical
+ rules:
+ - metric: cpu_wait  # NOTE(review): description says "CPU usage" but the only rule checks cpu_wait
+ relational_operator: '>='
+ threshold: 30  # compared with '>=' against the avg of cpu_wait; the warning trigger uses 20
+ window: 120  # assumes seconds - TODO confirm against the consumer
+ periods: 0
+ function: avg
+ linux_system_cpu_warning:  # warning-severity CPU trigger for compute nodes
+ description: 'The CPU usage is high (compute node)'
+ severity: 'warning'
+ enabled: 'true'  # NOTE(review): quoted string, not a YAML boolean; the critical trigger has no 'enabled' key - confirm
+ rules:
+ - metric: cpu_wait  # NOTE(review): description says "CPU usage" but the only rule checks cpu_wait
+ relational_operator: '>='
+ threshold: 20  # compared with '>=' against the avg of cpu_wait; the critical trigger uses 30
+ window: 120  # assumes seconds - TODO confirm against the consumer
+ periods: 0
+ function: avg
+ alarm:
+ # Tag all the system alarm metrics with "node_role: compute". This
+ # makes it possible to create an alarm cluster for compute nodes.
+ linux_system_cpu:
+ alerting: enabled
+ triggers:  # both triggers are overridden in the trigger section of this file
+ - linux_system_cpu_critical
+ - linux_system_cpu_warning
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ linux_system_swap:
+ alerting: enabled
+ triggers:  # these triggers are not defined in this file
+ - linux_system_swap_usage_critical
+ - linux_system_swap_activity_warning
+ - linux_system_swap_usage_warning
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ linux_system_root_fs:
+ alerting: enabled
+ triggers:  # these triggers are not defined in this file
+ - linux_system_root_fs_critical
+ - linux_system_root_fs_warning
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ linux_system_network_rx:
+ alerting: enabled
+ triggers:  # these triggers are not defined in this file
+ - linux_system_network_critical_dropped_rx
+ - linux_system_network_warning_dropped_rx
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ linux_system_network_tx:
+ alerting: enabled
+ triggers:  # these triggers are not defined in this file
+ - linux_system_network_critical_dropped_tx
+ - linux_system_network_warning_dropped_tx
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ linux_system_hdd_errors:
+ alerting: enabled_with_notification  # the only alarm here that is not plain 'enabled'
+ triggers:  # this trigger is not defined in this file
+ - linux_system_hdd_errors_critical
+ dimension:
+ node_role: compute  # matched by the compute_nodes alarm cluster
+ aggregator:
+ alarm_cluster:
+ compute_nodes:  # cluster built from the alarms tagged "node_role: compute"
+ policy: majority_of_members
+ alerting: enabled_with_notification
+ group_by: hostname
+ match:
+ node_role: compute  # same value as the 'dimension' set on each member alarm
+ members:  # lists every alarm declared in the alarm section of this file
+ - linux_system_cpu
+ - linux_system_swap
+ - linux_system_root_fs
+ - linux_system_network_rx
+ - linux_system_network_tx
+ - linux_system_hdd_errors
+ dimension:
+ cluster_name: compute
+ nagios_host: 01-node-clusters