Add alarms
Change-Id: I805f3298dd5f69f7b4535f949eb943a160665f6c
diff --git a/opencontrail/meta/heka.yml b/opencontrail/meta/heka.yml
index 069dd0f..1f09ef5 100644
--- a/opencontrail/meta/heka.yml
+++ b/opencontrail/meta/heka.yml
@@ -1,32 +1,32 @@
{%- if pillar.opencontrail is defined %}
-{%- if pillar.opencontrail.control is defined %}
-{%- from "opencontrail/map.jinja" import control with context %}
-{%- if control.get('enabled', False) %}
-{%- set controller_ref = control %}
-{%- endif %}
-{%- elif pillar.opencontrail.compute is defined %}
-{%- from "opencontrail/map.jinja" import compute with context %}
-{%- if compute.get('enabled', False) %}
-{%- set compute_ref = compute %}
-{%- endif %}
-{%- endif %}
-{%- if pillar.opencontrail.web is defined %}
-{%- from "opencontrail/map.jinja" import web with context %}
-{%- if web.get('enabled', False) %}
-{%- set web_ref = web %}
-{%- endif %}
-{%- endif %}
+ {%- if pillar.opencontrail.control is defined %}
+ {%- from "opencontrail/map.jinja" import control with context %}
+ {%- if control.get('enabled', False) %}
+ {%- set controller_ref = control %}
+ {%- endif %}
+ {%- elif pillar.opencontrail.compute is defined %}
+ {%- from "opencontrail/map.jinja" import compute with context %}
+ {%- if compute.get('enabled', False) %}
+ {%- set compute_ref = compute %}
+ {%- endif %}
+ {%- endif %}
+ {%- if pillar.opencontrail.web is defined %}
+ {%- from "opencontrail/map.jinja" import web with context %}
+ {%- if web.get('enabled', False) %}
+ {%- set web_ref = web %}
+ {%- endif %}
+ {%- endif %}
{%- endif %}
{%- if controller_ref is defined or compute_ref is defined %}
log_collector:
-{%- if controller_ref is defined %}
+ {%- if controller_ref is defined %}
splitter:
java:
engine: regex
delimiter: '\n([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} - )'
delimiter_eol: false
-{%- endif %}
+ {%- endif %}
decoder:
contrail:
engine: sandbox
@@ -38,7 +38,7 @@
module_file: /usr/share/lma_collector/decoders/contrail_supervisor_log.lua
module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
adjust_timezone: true
-{%- if controller_ref is defined %}
+ {%- if controller_ref is defined %}
contrail_collector:
engine: sandbox
module_file: /usr/share/lma_collector/decoders/contrail_collector_log.lua
@@ -64,14 +64,14 @@
module_file: /usr/share/lma_collector/decoders/ifmap.lua
module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
adjust_timezone: true
-{%- endif %}
-{%- if web_ref is defined %}
+ {%- endif %}
+ {%- if web_ref is defined %}
redis:
engine: sandbox
module_file: /usr/share/lma_collector/decoders/redis.lua
module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
adjust_timezone: true
-{%- endif %}
+ {%- endif %}
input:
contrail_supervisor_log:
engine: logstreamer
@@ -89,7 +89,7 @@
priority: ["^Seq"]
decoder: "contrail_supervisor_decoder"
splitter: "TokenSplitter"
-{%- if controller_ref is defined %}
+ {%- if controller_ref is defined %}
contrail_collector_log:
engine: logstreamer
log_directory: "/var/log"
@@ -130,7 +130,7 @@
priority: ["^Seq"]
decoder: "ifmap_decoder"
splitter: "java_splitter"
-{%- endif %}
+ {%- endif %}
contrail_main_log:
engine: logstreamer
log_directory: "/var/log"
@@ -139,7 +139,7 @@
priority: ["^Seq"]
decoder: "contrail_decoder"
splitter: "TokenSplitter"
-{%- if web_ref is defined and web_ref.get('cache', {}).get('engine', '') == 'redis' %}
+ {%- if web_ref is defined and web_ref.get('cache', {}).get('engine', '') == 'redis' %}
redis_log:
engine: logstreamer
log_directory: "/var/log"
@@ -148,5 +148,454 @@
priority: ["^Seq"]
decoder: "redis_decoder"
splitter: "TokenSplitter"
-{%- endif %}
+ {%- endif %}
+metric_collector:
+ trigger:
+ {%- if controller_ref is defined %}
+ {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
+ 'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
+ 'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
+ 'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
+ 'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
+ 'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ description: "There is no {{ contrail_process }} process running"
+ severity: critical
+ rules:
+ - metric: lma_components_processes
+ fields:
+ service: '{{ contrail_process }}'
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: min
+ {%- endfor %}
+ xmpp_number_of_sessions_lo:
+ description: "There are no xmpp-number-of-sessions"
+ severity: warning
+ rules:
+ - metric: xmpp-number-of-sessions
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ xmpp_number_of_sessions_hi:
+ description: "There are too many xmpp-number-of-sessions"
+ severity: warning
+ rules:
+ - metric: xmpp-number-of-sessions
+ relational_operator: '>='  # "too many": fire at or above the limit, not only on an exact match
+ threshold: 500
+ window: 100
+ periods: 0
+ function: min
+ xmpp_number_of_sessions_diff:
+ description: "Number of xmpp-number-of-sessions changed between checks is too high"
+ severity: warning
+ rules:
+ - metric: xmpp-number-of-sessions
+ relational_operator: '>='
+ threshold: 100
+ window: 100
+ periods: 0
+ function: diff
+ vrouter_xmpp_of_sessions_lo:
+ description: "There are no vrouter-xmpp sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-xmpp
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ vrouter_xmpp_of_sessions_hi:
+ description: "There are too many vrouter-xmpp sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-xmpp  # same metric name as the _lo and _diff triggers for this counter
+ relational_operator: '>='  # "too many": fire at or above the limit, not only on an exact match
+ threshold: 10
+ window: 100
+ periods: 0
+ function: min
+ vrouter_xmpp_of_sessions_diff:
+ description: "Number of vrouter-xmpp changed between checks is too high"
+ severity: warning
+ rules:
+ - metric: vrouter-xmpp
+ relational_operator: '>='
+ threshold: 5
+ window: 100
+ periods: 0
+ function: diff
+ vrouter_xmpp_dns_of_sessions_lo:
+ description: "There are no vrouter-dns-xmpp sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-dns-xmpp
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ vrouter_xmpp_dns_of_sessions_hi:
+ description: "There are too many vrouter-dns-xmpp sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-dns-xmpp  # same metric name as the _lo and _diff triggers for this counter
+ relational_operator: '>='  # "too many": fire at or above the limit, not only on an exact match
+ threshold: 10
+ window: 100
+ periods: 0
+ function: min
+ vrouter_xmpp_dns_of_sessions_diff:
+ description: "Number of vrouter-dns-xmpp changed between checks is too high"
+ severity: warning
+ rules:
+ - metric: vrouter-dns-xmpp
+ relational_operator: '>='
+ threshold: 5
+ window: 100
+ periods: 0
+ function: diff
+ vrouter_lls_sessions_lo:
+ description: "There are no vrouter-vrouter-lls sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-lls
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ vrouter_lls_sessions_hi:
+ description: "There are too many vrouter-vrouter-lls sessions"
+ severity: warning
+ rules:
+ - metric: vrouter-lls
+ relational_operator: '>='  # "too many": fire at or above the limit, not only on an exact match
+ threshold: 10
+ window: 100
+ periods: 0
+ function: min
+ vrouter_lls_of_sessions_diff:
+ description: "Number of vrouter-vrouter-lls changed between checks is too high"
+ severity: warning
+ rules:
+ - metric: vrouter-lls
+ relational_operator: '>='
+ threshold: 5
+ window: 100
+ periods: 0
+ function: diff
+ xmpp_number_of_sessions_up:
+ description: "There are no active XMPP sessions "
+ severity: warning
+ rules:
+ - metric: xmpp-number-of-sessions-up
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ xmpp_number_of_sessions_down:
+ description: "There are inactive XMPP sessions"
+ severity: warning
+ rules:
+ - metric: xmpp-number-of-sessions-down
+ relational_operator: '>='
+ threshold: 1
+ window: 100
+ periods: 0
+ function: min
+ bgp_session_number:
+ description: "There are no BGP sessions"
+ severity: warning
+ rules:
+ - metric: bgp-session-number
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ bgp_session_number_up:
+ description: "There are no active BGP sessions "
+ severity: warning
+ rules:
+ - metric: bgp-session-number-up
+ relational_operator: '=='
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ bgp_session_number_down:
+ description: "There are inactive BGP sessions"
+ severity: warning
+ rules:
+ - metric: bgp-session-number-down
+ relational_operator: '>='
+ threshold: 1
+ window: 100
+ periods: 0
+ function: min
+ vrouter_openedsockets:
+ description: "There are too many sockets opened for vRouter"
+ severity: warning
+ rules:
+ - metric: vrouter-openedsockets
+ relational_operator: '>='
+ threshold: 0  # NOTE(review): '>= 0' holds for any non-negative count, so this looks like it always fires - confirm the intended limit
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_active:
+ description: "There are too many vrouter flows"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-active
+ relational_operator: '>='
+ threshold: 1200
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_zero_active:
+ description: "There are no active flows"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-active
+ relational_operator: '=='
+ threshold: 0
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_created:
+ description: "There are too many vrouter flows created"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-created
+ relational_operator: '>='
+ threshold: 1000
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_discard:
+ description: "There are too many vrouter flows: discards"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-discard
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_drop:
+ description: "There are too many vrouter flows: drops"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-drop
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_frag_err:
+ description: "There are too many vrouter flows: fragment errors"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-frag-err
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_invalid_nh:
+ description: "There are too many vrouter flows: invalid_nh"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-invalid-nh
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_composite_invalid_interface:
+ description: "There are too many vrouter flows: composite_invalid_interface"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-composite-invalid-interface
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_invalid_label:
+ description: "There are too many vrouter flows: invalid_label"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-invalid-label
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flows_flow_queue_limit_exceeded:
+ description: "There are too many vrouter flows: flow_queue_limit_exceeded"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-flow-queue-limit-exceeded
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ vrouter_flow_full:
+ description: "There are too many vrouter flows: flow_table_full"
+ severity: warning
+ rules:
+ - metric: vrouter-flows-flow-table-full
+ relational_operator: '>='
+ threshold: 100
+ window: 120
+ periods: 0
+ function: min
+ cassandra_cluster_endpoint_down:
+ description: "Cassandra Cluster Endpoint is down"
+ severity: critical
+ rules:
+ - metric: DownEndpointCount
+ relational_operator: '>'
+ threshold: 0
+ window: 100
+ periods: 0
+ function: min
+ {%- endif %}
+ {%- if web_ref is defined %}
+ {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ description: "There is no {{ contrail_process }} process running"
+ severity: critical
+ rules:
+ - metric: lma_components_processes
+ fields:
+ service: '{{ contrail_process }}'
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: min
+ {%- endfor %}
+ {%- endif %}
+ {%- if compute_ref is defined %}
+ {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ description: "There is no {{ contrail_process }} process running"
+ severity: critical
+ rules:
+ - metric: lma_components_processes
+ fields:
+ service: '{{ contrail_process }}'  # raw process name, as in the controller and web trigger loops
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: min
+ {%- endfor %}
+ {%- endif %}
+ alarm:
+ {%- if controller_ref is defined %}
+ {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
+ 'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
+ 'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
+ 'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
+ 'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
+ 'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ alerting: enabled
+ triggers:
+ - {{ contrail_process|replace("-", "_") }}
+ dimension:
+ service: contrail-processes-control
+ {%- endfor %}
+ {%- endif %}
+ {%- if web_ref is defined %}
+ {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ alerting: enabled
+ triggers:
+ - {{ contrail_process|replace("-", "_") }}
+ dimension:
+ service: contrail-processes-control
+ {%- endfor %}
+ {%- endif %}
+ {%- if compute_ref is defined %}
+ {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
+ ) %}
+ {{ contrail_process|replace("-", "_") }}:
+ alerting: enabled
+ triggers:
+ - {{ contrail_process|replace("-", "_") }}
+ dimension:
+ service: contrail-processes-compute
+ {%- endfor %}
+ {%- endif %}
+aggregator:
+ alarm_cluster:
+ {%- if controller_ref is defined %}
+ contrail_processes_control:
+ policy: highest_severity
+ group_by: hostname
+ alerting: enabled
+ match:
+ service: contrail-processes-control
+ members:  # plain alarm names; a trailing ':' would turn each entry into a one-key mapping
+ {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
+ 'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
+ 'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
+ 'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
+ 'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
+ 'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
+ ) %}
+ - {{ contrail_process|replace("-", "_") }}
+ {%- endfor %}
+ dimension:
+ service: contrail-control
+ {%- endif %}
+ {%- if web_ref is defined %}
+ contrail_processes_web:  # distinct key: the compute cluster already uses contrail_processes_compute
+ policy: highest_severity
+ group_by: hostname
+ alerting: enabled
+ match:
+ service: contrail-processes-control
+ members:  # plain alarm names; a trailing ':' would turn each entry into a one-key mapping
+ {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
+ ) %}
+ - {{ contrail_process|replace("-", "_") }}
+ {%- endfor %}
+ dimension:
+ service: contrail-control
+ {%- endif %}
+ {%- if compute_ref is defined %}
+ contrail_processes_compute:
+ policy: highest_severity
+ group_by: hostname
+ alerting: enabled
+ match:
+ service: contrail-processes-compute
+ members:  # plain alarm names; a trailing ':' would turn each entry into a one-key mapping
+ {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
+ ) %}
+ - {{ contrail_process|replace("-", "_") }}
+ {%- endfor %}
+ dimension:
+ service: contrail-compute
+ {%- endif %}
{%- endif %}