initial commit

commit: 03ff34ef2becc70bbeb47b209edf350cee769626 [log] [tgz]
author: Ondrej Smola <ondrej.smola@tcpcloud.eu> Thu Dec 01 01:30:33 2016 +0100
committer: Ondrej Smola <ondrej.smola@tcpcloud.eu> Thu Dec 01 01:30:33 2016 +0100
tree: 16e1c37a5677c1d55fa2d59f6445d4db2fa46dc1
diff --git a/system/heka/aggregator/cluster.yml b/system/heka/aggregator/cluster.yml
new file mode 100644
index 0000000..e2ee129
--- /dev/null
+++ b/system/heka/aggregator/cluster.yml

@@ -0,0 +1,30 @@
+classes:
+- service.heka.aggregator.cluster
+parameters:
+  _param:  # defaults for the two nagios_* values referenced at the end of heka.aggregator
+    nagios_default_host_alarm_clusters: 00-clusters
+    nagios_host_dimension_key: nagios_host
+  heka:
+    aggregator:  # every value is a ${_param:...} lookup; only the two defaults above are set in this file
+      influxdb_host: ${_param:heka_influxdb_host}
+      influxdb_port: ${_param:influxdb_port}
+      influxdb_database: ${_param:influxdb_database}
+      influxdb_username: ${_param:influxdb_user}  # source parameter is influxdb_user, not influxdb_username
+      influxdb_password: ${_param:influxdb_password}
+      nagios_host: ${_param:nagios_host}
+      nagios_username: ${_param:nagios_username}
+      nagios_password: ${_param:nagios_password}
+      nagios_port: ${_param:nagios_status_port}  # source parameter is nagios_status_port
+      nagios_default_host_alarm_clusters: ${_param:nagios_default_host_alarm_clusters}
+      nagios_host_dimension_key: ${_param:nagios_host_dimension_key}
+  keepalived:
+    cluster:
+      instance:
+        stacklight_monitor_vip:  # NOTE(review): instance is not declared in this file - confirm where it is defined
+          notify_action:  # aggregator is started under master and stopped under backup and fault
+            master:
+              - service aggregator start
+            backup:
+              - service aggregator stop
+            fault:
+              - service aggregator stop

diff --git a/system/heka/aggregator/single.yml b/system/heka/aggregator/single.yml
new file mode 100644
index 0000000..c252bd6
--- /dev/null
+++ b/system/heka/aggregator/single.yml

@@ -0,0 +1,19 @@
+classes:
+- service.heka.aggregator.single
+parameters:
+  _param:  # defaults for the two nagios_* values referenced at the end of heka.aggregator
+    nagios_default_host_alarm_clusters: 00-clusters
+    nagios_host_dimension_key: nagios_host
+  heka:
+    aggregator:  # every value is a ${_param:...} lookup; only the two defaults above are set in this file
+      influxdb_host: ${_param:heka_influxdb_host}
+      influxdb_port: ${_param:influxdb_port}
+      influxdb_database: ${_param:influxdb_database}
+      influxdb_username: ${_param:influxdb_user}  # source parameter is influxdb_user, not influxdb_username
+      influxdb_password: ${_param:influxdb_password}
+      nagios_host: ${_param:nagios_host}
+      nagios_username: ${_param:nagios_username}
+      nagios_password: ${_param:nagios_password}
+      nagios_port: ${_param:nagios_status_port}  # source parameter is nagios_status_port
+      nagios_default_host_alarm_clusters: ${_param:nagios_default_host_alarm_clusters}
+      nagios_host_dimension_key: ${_param:nagios_host_dimension_key}

diff --git a/system/heka/alarm/openstack_compute.yml b/system/heka/alarm/openstack_compute.yml
new file mode 100644
index 0000000..d4fda7d
--- /dev/null
+++ b/system/heka/alarm/openstack_compute.yml

@@ -0,0 +1,90 @@
+parameters:
+  heka:
+    metric_collector:
+      trigger:
+        # Override the linux_system_cpu_critical and linux_system_cpu_warning
+        # triggers to use specific rules on compute nodes
+        linux_system_cpu_critical:
+          description: 'The CPU usage is too high (compute node)'
+          severity: critical
+          rules:  # single rule: avg of cpu_wait >= 30 with window 120
+          - metric: cpu_wait
+            relational_operator: '>='
+            threshold: 30
+            window: 120
+            periods: 0
+            function: avg
+        linux_system_cpu_warning:
+          description: 'The CPU usage is high (compute node)'
+          severity: 'warning'
+          enabled: 'true'  # NOTE(review): quoted string, not a YAML boolean - confirm the consumer expects this
+          rules:  # single rule: avg of cpu_wait >= 20 with window 120
+          - metric: cpu_wait
+            relational_operator: '>='
+            threshold: 20
+            window: 120
+            periods: 0
+            function: avg
+      alarm:
+        # Tag all the system alarm metrics with "node_role: compute". This
+        # makes it possible to create an alarm cluster for compute nodes.
+        linux_system_cpu:  # uses the two linux_system_cpu_* triggers defined in this file
+          alerting: enabled
+          triggers:
+          - linux_system_cpu_critical
+          - linux_system_cpu_warning
+          dimension:
+            node_role: compute
+        linux_system_swap:  # triggers for this and the following alarms are not defined in this file
+          alerting: enabled
+          triggers:
+          - linux_system_swap_usage_critical
+          - linux_system_swap_activity_warning
+          - linux_system_swap_usage_warning
+          dimension:
+            node_role: compute
+        linux_system_root_fs:
+          alerting: enabled
+          triggers:
+          - linux_system_root_fs_critical
+          - linux_system_root_fs_warning
+          dimension:
+            node_role: compute
+        linux_system_network_rx:
+          alerting: enabled
+          triggers:
+          - linux_system_network_critical_dropped_rx
+          - linux_system_network_warning_dropped_rx
+          dimension:
+            node_role: compute
+        linux_system_network_tx:
+          alerting: enabled
+          triggers:
+          - linux_system_network_critical_dropped_tx
+          - linux_system_network_warning_dropped_tx
+          dimension:
+            node_role: compute
+        linux_system_hdd_errors:
+          alerting: enabled_with_notification  # the only alarm here not set to plain "enabled"
+          triggers:
+          - linux_system_hdd_errors_critical
+          dimension:
+            node_role: compute
+    aggregator:
+      alarm_cluster:
+        compute_nodes:
+          policy: majority_of_members
+          alerting: enabled_with_notification
+          group_by: hostname
+          match:  # same node_role value that the metric_collector alarms in this file set as a dimension
+            node_role: compute
+          members:  # the six alarm names declared under metric_collector.alarm in this file
+          - linux_system_cpu
+          - linux_system_swap
+          - linux_system_root_fs
+          - linux_system_network_rx
+          - linux_system_network_tx
+          - linux_system_hdd_errors
+          dimension:
+            cluster_name: compute
+            nagios_host: 01-node-clusters  # key equals the nagios_host_dimension_key default in system/heka/aggregator/*.yml

diff --git a/system/heka/alarm/openstack_control.yml b/system/heka/alarm/openstack_control.yml
new file mode 100644
index 0000000..7dcb331
--- /dev/null
+++ b/system/heka/alarm/openstack_control.yml

@@ -0,0 +1,102 @@
+parameters:
+  heka:
+    metric_collector:
+      trigger:
+        # Override the linux_system_cpu_critical and linux_system_cpu_warning
+        # triggers to use specific rules on control nodes
+        linux_system_cpu_critical:
+          description: 'The CPU usage is too high (controller node)'
+          severity: critical
+          rules:  # two rules listed; NOTE(review): how multiple rules are combined is not shown in this file
+          - metric: cpu_idle  # avg of cpu_idle <= 5 with window 120
+            relational_operator: '<='
+            threshold: 5
+            window: 120
+            periods: 0
+            function: avg
+          - metric: cpu_wait  # avg of cpu_wait >= 35 with window 120
+            relational_operator: '>='
+            threshold: 35
+            window: 120
+            periods: 0
+            function: avg
+        linux_system_cpu_warning:
+          description: 'The CPU usage is high (controller node)'
+          severity: 'warning'
+          enabled: 'true'  # NOTE(review): quoted string, not a YAML boolean - confirm the consumer expects this
+          rules:  # two rules listed; NOTE(review): how multiple rules are combined is not shown in this file
+          - metric: cpu_idle  # avg of cpu_idle <= 15 with window 120
+            relational_operator: '<='
+            threshold: 15
+            window: 120
+            periods: 0
+            function: avg
+          - metric: cpu_wait  # avg of cpu_wait >= 25 with window 120
+            relational_operator: '>='
+            threshold: 25
+            window: 120
+            periods: 0
+            function: avg
+      alarm:
+        # Tag all the system alarm metrics with "node_role: control". This
+        # makes it possible to create an alarm cluster for control nodes.
+        linux_system_cpu:  # uses the two linux_system_cpu_* triggers defined in this file
+          alerting: enabled
+          triggers:
+          - linux_system_cpu_critical
+          - linux_system_cpu_warning
+          dimension:
+            node_role: control
+        linux_system_swap:  # triggers for this and the following alarms are not defined in this file
+          alerting: enabled
+          triggers:
+          - linux_system_swap_usage_critical
+          - linux_system_swap_activity_warning
+          - linux_system_swap_usage_warning
+          dimension:
+            node_role: control
+        linux_system_root_fs:
+          alerting: enabled
+          triggers:
+          - linux_system_root_fs_critical
+          - linux_system_root_fs_warning
+          dimension:
+            node_role: control
+        linux_system_network_rx:
+          alerting: enabled
+          triggers:
+          - linux_system_network_critical_dropped_rx
+          - linux_system_network_warning_dropped_rx
+          dimension:
+            node_role: control
+        linux_system_network_tx:
+          alerting: enabled
+          triggers:
+          - linux_system_network_critical_dropped_tx
+          - linux_system_network_warning_dropped_tx
+          dimension:
+            node_role: control
+        linux_system_hdd_errors:
+          alerting: enabled_with_notification  # the only alarm here not set to plain "enabled"
+          triggers:
+          - linux_system_hdd_errors_critical
+          dimension:
+            node_role: control
+    aggregator:
+      alarm_cluster:
+        control_nodes:
+          policy: majority_of_members
+          alerting: enabled_with_notification
+          group_by: hostname
+          match:  # same node_role value that the metric_collector alarms in this file set as a dimension
+            node_role: control
+          members:  # the six alarm names declared under metric_collector.alarm in this file
+          - linux_system_cpu
+          - linux_system_swap
+          - linux_system_root_fs
+          - linux_system_network_rx
+          - linux_system_network_tx
+          - linux_system_hdd_errors
+          dimension:
+            cluster_name: control
+            nagios_host: 01-node-clusters  # key equals the nagios_host_dimension_key default in system/heka/aggregator/*.yml

diff --git a/system/heka/log_collector/single.yml b/system/heka/log_collector/single.yml
new file mode 100644
index 0000000..73463e4
--- /dev/null
+++ b/system/heka/log_collector/single.yml

@@ -0,0 +1,7 @@
+classes:
+- service.heka.log_collector.single
+parameters:
+  heka:
+    log_collector:
+      elasticsearch_host: ${_param:heka_elasticsearch_host}  # heka_elasticsearch_host is not defined in this file
+      elasticsearch_port: 9200  # literal value, unlike the host, which is a parameter lookup

diff --git a/system/heka/metric_collector/single.yml b/system/heka/metric_collector/single.yml
new file mode 100644
index 0000000..960d271
--- /dev/null
+++ b/system/heka/metric_collector/single.yml

@@ -0,0 +1,17 @@
+classes:
+- service.heka.metric_collector.single
+parameters:
+  heka:
+    metric_collector:
+      aggregator_host: ${_param:stacklight_monitor_address}  # remote_collector/*.yml uses heka_aggregator_host instead
+      aggregator_port: ${_param:aggregator_port}
+      influxdb_database: lma  # literal here; aggregator/*.yml and remote_collector/*.yml use ${_param:influxdb_database}
+      influxdb_host: ${_param:heka_influxdb_host}
+      influxdb_password: ${_param:influxdb_stacklight_password}  # aggregator/*.yml and remote_collector/*.yml use influxdb_password
+      influxdb_port: 8086  # literal here; aggregator/*.yml and remote_collector/*.yml use ${_param:influxdb_port}
+      influxdb_time_precision: ms
+      influxdb_username: lma  # literal here; aggregator/*.yml and remote_collector/*.yml use ${_param:influxdb_user}
+      nagios_host: ${_param:nagios_host}
+      nagios_username: ${_param:nagios_username}
+      nagios_password: ${_param:nagios_password}
+      nagios_port: ${_param:nagios_status_port}  # source parameter is nagios_status_port

diff --git a/system/heka/remote_collector/cluster.yml b/system/heka/remote_collector/cluster.yml
new file mode 100644
index 0000000..f3344c4
--- /dev/null
+++ b/system/heka/remote_collector/cluster.yml

@@ -0,0 +1,23 @@
+classes:
+- service.heka.remote_collector.cluster
+parameters:
+  heka:
+    remote_collector:  # every value is a ${_param:...} lookup; none of the parameters is defined in this file
+      influxdb_host: ${_param:heka_influxdb_host}
+      influxdb_port: ${_param:influxdb_port}
+      influxdb_database: ${_param:influxdb_database}
+      influxdb_username: ${_param:influxdb_user}  # source parameter is influxdb_user, not influxdb_username
+      influxdb_password: ${_param:influxdb_password}
+      aggregator_host: ${_param:heka_aggregator_host}
+      aggregator_port: ${_param:aggregator_port}
+  keepalived:
+    cluster:
+      instance:
+        stacklight_monitor_vip:  # NOTE(review): instance is not declared in this file - confirm where it is defined
+          notify_action:  # remote_collector is started under master and stopped under backup and fault
+            master:
+              - service remote_collector start
+            backup:
+              - service remote_collector stop
+            fault:
+              - service remote_collector stop

diff --git a/system/heka/remote_collector/single.yml b/system/heka/remote_collector/single.yml
new file mode 100644
index 0000000..df33055
--- /dev/null
+++ b/system/heka/remote_collector/single.yml

@@ -0,0 +1,12 @@
+classes:
+- service.heka.remote_collector.single
+parameters:
+  heka:
+    remote_collector:  # every value is a ${_param:...} lookup; none of the parameters is defined in this file
+      influxdb_host: ${_param:heka_influxdb_host}
+      influxdb_port: ${_param:influxdb_port}
+      influxdb_database: ${_param:influxdb_database}
+      influxdb_username: ${_param:influxdb_user}  # source parameter is influxdb_user, not influxdb_username
+      influxdb_password: ${_param:influxdb_password}
+      aggregator_host: ${_param:heka_aggregator_host}
+      aggregator_port: ${_param:aggregator_port}

diff --git a/system/heka/router/single.yml b/system/heka/router/single.yml
new file mode 100644
index 0000000..8801e42
--- /dev/null
+++ b/system/heka/router/single.yml

@@ -0,0 +1,27 @@
+classes:
+- service.rabbitmq.server.single
+- service.heka.server.amqp.router
+parameters:
+  _param:
+    heka_router_input_exchange: heka
+    heka_router_input_host: ${_param:heka_amqp_host}  # heka_amqp_host is not defined in this file
+    heka_router_input_password: ${_param:heka_amqp_password}  # same parameter as rabbitmq.server.host.log.password below
+    heka_router_input_vhost: log
+    heka_router_input_user: log  # same user name as rabbitmq.server.host.log.user below
+    heka_router_output_host: ${_param:heka_elasticsearch_host}
+    heka_router_prefetch_count: 20
+    rabbitmq_secret_key: secret_key  # NOTE(review): looks like a placeholder value - confirm it is overridden
+    rabbitmq_admin_name: admin
+    rabbitmq_admin_password: workshoplearning42  # NOTE(review): plaintext credential in the model - should come from an override/secret
+    kibana_elasticsearch_host: localhost
+  heka:
+    shipper:
+      enabled: false
+  rabbitmq:
+    server:
+      host:
+        'log':  # NOTE(review): presumably the vhost named by heka_router_input_vhost - confirm
+          enabled: true
+          user: log
+          password: ${_param:heka_amqp_password}
+

diff --git a/system/heka/shipper/single.yml b/system/heka/shipper/single.yml
new file mode 100644
index 0000000..0aee02c
--- /dev/null
+++ b/system/heka/shipper/single.yml

@@ -0,0 +1,10 @@
+classes:
+- service.heka.server.amqp.shipper
+parameters:
+  _param:  # same exchange, vhost, user and AMQP parameters as the heka_router_input_* values in router/single.yml
+    heka_shipper_output_exchange: heka
+    heka_shipper_output_host: ${_param:heka_amqp_host}  # heka_amqp_host is not defined in this file
+    heka_shipper_output_password: ${_param:heka_amqp_password}  # heka_amqp_password is not defined in this file
+    heka_shipper_output_vhost: log
+    heka_shipper_output_user: log
+
commit	03ff34ef2becc70bbeb47b209edf350cee769626	[log] [tgz]
author	Ondrej Smola <ondrej.smola@tcpcloud.eu>	Thu Dec 01 01:30:33 2016 +0100
committer	Ondrej Smola <ondrej.smola@tcpcloud.eu>	Thu Dec 01 01:30:33 2016 +0100
tree	16e1c37a5677c1d55fa2d59f6445d4db2fa46dc1