blob: bf4f3c64fd15d4c7771145f5d8ea9ad3e935d15e [file] [log] [blame]
{% from "nova/map.jinja" import controller with context %}
{#- apache_wsgi is truthy only when controller.get('enabled') is truthy and controller.version is not one of the listed releases; it gates the nova_placement_wsgi decoder and the nova_placement_wsgi_log input. #}
{%- set apache_wsgi = controller.get('enabled') and controller.version not in ('juno', 'kilo', 'liberty', 'mitaka', 'newton') %}
# Log collection for Nova: decoders, one splitter and logstreamer inputs.
log_collector:
  decoder:
    # Sandbox decoder used by the nova_log input ("nova_decoder").
    nova:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/openstack_log.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      adjust_timezone: true
{%- if pillar.nova.compute is defined %}
    # Rendered only when the nova compute pillar is defined.
    libvirt:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/libvirt_log.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
{%- endif %}
{%- if apache_wsgi %}
    # Rendered only when apache_wsgi is truthy.
    nova_placement_wsgi:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/apache_wsgi_log.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      config:
        logger: openstack.nova
        apache_log_pattern: >-
          %v:%p %h %l %u %t \"%r\" %>s %D %O \"%{Referer}i\" \"%{User-Agent}i\"
{%- endif %}
  splitter:
    # Token splitter referenced by the nova_log input ("nova_splitter").
    nova:
      engine: token
      delimiter: '\n'
  input:
    # Files matching nova/<Service>.log with an optional numeric suffix;
    # Service and Seq are captured from the file name.
    nova_log:
      engine: logstreamer
      log_directory: "/var/log"
      file_match: 'nova/(?P<Service>.+)\.log\.?(?P<Seq>\d*)$'
      differentiator: ['nova', '_', 'Service']
      priority: ["^Seq"]
      decoder: "nova_decoder"
      splitter: "nova_splitter"
{%- if pillar.nova.compute is defined %}
    # Rendered only when the nova compute pillar is defined.
    libvirt_log:
      engine: logstreamer
      log_directory: "/var/log"
      file_match: 'libvirt/libvirtd.log'
      differentiator: ['libvirt']
      decoder: "libvirt_decoder"
      splitter: "TokenSplitter"
{%- endif %}
{%- if apache_wsgi %}
    # Rendered only when apache_wsgi is truthy.
    nova_placement_wsgi_log:
      engine: logstreamer
      log_directory: "/var/log/apache2"
      # The optional numeric suffix is captured as Seq so that the priority
      # entry below refers to a group that exists (same suffix pattern as
      # nova_log above).
      file_match: 'nova_placement_access\.log\.?(?P<Seq>\d*)$'
      differentiator: ['nova-placement-wsgi']
      priority: ["^Seq"]
      decoder: "nova_placement_wsgi_decoder"
      splitter: "TokenSplitter"
{%- endif %}
# Triggers and alarms declared under the metric_collector key.
metric_collector:
  trigger:
    # Warning when the max of log_messages (service nova, level error)
    # exceeds 0.1 over the configured window; missing data counts as okay.
    nova_logs_error:
      description: 'Too many errors have been detected in Nova logs'
      severity: warning
      no_data_policy: okay
      rules:
        - metric: log_messages
          field:
            service: nova
            level: error
          relational_operator: '>'
          threshold: 0.1
          window: 70
          periods: 0
          function: max
{%- if pillar.nova.controller is defined %}
    # Controller only: down when the last local API check value is 0.
    nova_api_local_endpoint:
      description: 'Nova API is locally down'
      severity: down
      rules:
        - metric: openstack_check_local_api
          field:
            service: nova-api
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: last
{%- endif %}
{%- if pillar.nova.compute is defined %}
    # Compute only: down when the last libvirt_check value is 0.
    libvirt_check:
      description: 'Libvirt cannot be checked'
      severity: down
      rules:
        - metric: libvirt_check
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: last
{%- endif %}
  alarm:
{%- if pillar.nova.compute is defined %}
    # Compute-side alarms; each binds one trigger to a service dimension.
    nova_logs_compute:
      alerting: enabled
      triggers:
        - nova_logs_error
      dimension:
        service: nova-logs-compute
    libvirt_check:
      alerting: enabled
      triggers:
        - libvirt_check
      dimension:
        service: libvirt-check
{%- endif %}
{%- if pillar.nova.controller is defined %}
    # Controller-side alarms; each binds one trigger to a service dimension.
    nova_logs:
      alerting: enabled
      triggers:
        - nova_logs_error
      dimension:
        service: nova-logs
    nova_api_endpoint:
      alerting: enabled
      triggers:
        - nova_api_local_endpoint
      dimension:
        service: nova-api-endpoint
{%- endif %}
# Triggers and alarms declared under the remote_collector key. Everything
# below trigger: and alarm: is rendered only when the nova controller pillar
# is defined.
remote_collector:
  trigger:
{%- if pillar.nova.controller is defined %}
    # Down when the last API check value for nova-api is 0.
    nova_api_check_failed:
      description: 'Endpoint check for nova-api is failed'
      severity: down
      rules:
        - metric: openstack_check_api
          field:
            service: nova-api
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: last
{%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
    # Warning: at least two instances are up and at least one is down.
    nova_{{ nova_service }}_two_up:
      description: 'Some Nova {{ nova_service }}s are down'
      severity: warning
      logical_operator: and
      rules:
        - metric: openstack_nova_services
          field:
            service: {{ nova_service }}
            state: up
          relational_operator: '>='
          threshold: 2
          window: 60
          periods: 0
          function: last
        - metric: openstack_nova_services
          field:
            service: {{ nova_service }}
            state: down
          relational_operator: '>'
          threshold: 0
          window: 60
          periods: 0
          function: last
    # Critical: exactly one instance is up and the up percentage is below 100.
    nova_{{ nova_service }}_one_up:
      description: 'Only one Nova {{ nova_service }} is up'
      severity: critical
      logical_operator: and
      rules:
        - metric: openstack_nova_services
          field:
            service: {{ nova_service }}
            state: up
          relational_operator: '=='
          threshold: 1
          window: 60
          periods: 0
          function: last
        - metric: openstack_nova_services_percent
          field:
            service: {{ nova_service }}
            state: up
          relational_operator: '<'
          threshold: 100
          window: 60
          periods: 0
          function: last
    # Down: no instance is up.
    nova_{{ nova_service }}_zero_up:
      description: 'All Nova {{ nova_service }}s are down or disabled'
      severity: down
      rules:
        - metric: openstack_nova_services
          field:
            service: {{ nova_service }}
            state: up
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: last
{%- endfor %}
    # we treat "up" and "disabled" states in the same way, considering
    # that "disabled" should not be treated as an error
    # NOTE(review): only one rule is listed here, so logical_operator has
    # nothing to combine - confirm whether a second rule was intended.
    nova_compute_some_down:
      description: 'Some Nova computes are down'
      severity: warning
      logical_operator: and
      rules:
        - metric: openstack_nova_services_percent
          field:
            service: compute
            state: down
          relational_operator: '>'
          threshold: 0
          window: 60
          periods: 0
          function: last
    # Critical when more than 50 percent of computes are down.
    nova_compute_majority_down:
      description: 'Majority of Nova computes are down'
      severity: critical
      rules:
        - metric: openstack_nova_services_percent
          field:
            service: compute
            state: down
          relational_operator: '>'
          threshold: 50
          window: 60
          periods: 0
          function: last
    # Down when 100 percent of computes are down.
    nova_compute_all_down:
      description: 'All Nova computes are down'
      severity: down
      rules:
        - metric: openstack_nova_services_percent
          field:
            service: compute
            state: down
          relational_operator: '=='
          threshold: 100
          window: 60
          periods: 0
          function: last
    # Warning when the max free VCPU count over the window equals 0. The
    # threshold is 0 to match the description and the parallel free-memory
    # trigger below.
    nova_total_free_vcpu_warning:
      description: 'There is no VCPU available for new instances'
      severity: warning
      rules:
        - metric: openstack_nova_total_free_vcpus
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: max
    # Warning when the max free RAM value over the window equals 0.
    nova_total_free_memory_warning:
      description: 'There is no memory available for new instances'
      severity: warning
      rules:
        - metric: openstack_nova_total_free_ram
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: max
    # Per-aggregate free RAM percentage: warning below 10.0, critical below
    # 1.0; missing data counts as okay for both.
    nova_aggregates_free_memory_warning:
      description: 'The nova aggregates free memory percent is low'
      severity: warning
      no_data_policy: okay
      rules:
        - metric: openstack_nova_aggregate_free_ram_percent
          group_by: [aggregate]
          relational_operator: '<'
          threshold: 10.0
          window: 70
          periods: 0
          function: min
    nova_aggregates_free_memory_critical:
      description: 'The nova aggregates free memory percent is too low'
      severity: critical
      no_data_policy: okay
      rules:
        - metric: openstack_nova_aggregate_free_ram_percent
          group_by: [aggregate]
          relational_operator: '<'
          threshold: 1.0
          window: 70
          periods: 0
          function: min
{%- endif %}
  alarm:
{%- if pillar.nova.controller is defined %}
    # Each alarm binds one or more of the triggers above to a service
    # dimension.
    nova_api_check:
      alerting: enabled
      triggers:
        - nova_api_check_failed
      dimension:
        service: nova-api-check
{%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
    nova_{{ nova_service }}:
      alerting: enabled
      triggers:
        - nova_{{ nova_service }}_zero_up
        - nova_{{ nova_service }}_one_up
        - nova_{{ nova_service }}_two_up
      dimension:
        service: nova-{{ nova_service }}
{%- endfor %}
    nova_total_free_vcpu:
      alerting: enabled
      triggers:
        - nova_total_free_vcpu_warning
      dimension:
        service: nova-total-free-vcpu
    nova_total_free_memory:
      alerting: enabled
      triggers:
        - nova_total_free_memory_warning
      dimension:
        service: nova-total-free-memory
    nova_aggregates_free_memory:
      alerting: enabled
      triggers:
        - nova_aggregates_free_memory_critical
        - nova_aggregates_free_memory_warning
      dimension:
        service: nova-aggregates-free-memory
    nova_compute:
      alerting: enabled
      triggers:
        - nova_compute_all_down
        - nova_compute_majority_down
        - nova_compute_some_down
      dimension:
        service: nova-compute
{%- endif %}
# Alarm clusters declared under the aggregator key. Service-level clusters
# carry a dimension service of nova-data or nova-control; the nova_control
# and nova_data clusters at the end list those clusters as members.
aggregator:
  alarm_cluster:
    # Clusters grouped by hostname.
    nova_logs_compute:
      policy: majority_of_node_members
      group_by: hostname
      alerting: enabled
      match:
        service: nova-logs-compute
      members:
        - nova_logs_compute
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    nova_libvirt:
      policy: majority_of_node_members
      group_by: hostname
      alerting: enabled
      match:
        service: libvirt-check
      members:
        - libvirt_check
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    nova_logs:
      policy: status_of_members
      group_by: hostname
      alerting: enabled
      match:
        service: nova-logs
      members:
        - nova_logs
      dimension:
        service: nova-control
        nagios_host: 01-service-clusters
    nova_api_endpoint:
      policy: availability_of_members
      group_by: hostname
      alerting: enabled
      match:
        service: nova-api-endpoint
      members:
        - nova_api_endpoint
      dimension:
        service: nova-control
        nagios_host: 01-service-clusters
    # Clusters without group_by, all using the highest_severity policy.
    nova_api_check:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-api-check
      members:
        - nova_api_check
      dimension:
        service: nova-control
        nagios_host: 01-service-clusters
{%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
    nova_{{ nova_service }}:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-{{ nova_service }}
      members:
        - nova_{{ nova_service }}
      dimension:
        service: nova-control
        nagios_host: 01-service-clusters
{%- endfor %}
    nova_total_free_vcpu:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-total-free-vcpu
      members:
        - nova_total_free_vcpu
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    nova_total_free_memory:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-total-free-memory
      members:
        - nova_total_free_memory
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    nova_aggregates_free_memory:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-aggregates-free-memory
      members:
        - nova_aggregates_free_memory
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    nova_compute:
      policy: highest_severity
      alerting: enabled
      match:
        service: nova-compute
      members:
        - nova_compute
      dimension:
        service: nova-data
        nagios_host: 01-service-clusters
    # Top-level clusters; these are the only ones with
    # alerting: enabled_with_notification.
    # NOTE(review): haproxy_nova-control is not defined in this file -
    # presumably provided by another formula; verify.
    nova_control:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: nova-control
      members:
        - nova_logs
        - nova_api_endpoint
        - nova_api_check
{%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
        - nova_{{ nova_service }}
{%- endfor %}
        - haproxy_nova-control
      dimension:
        cluster_name: nova-control
        nagios_host: 00-top-clusters
    nova_data:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: nova-data
      members:
        - nova_logs_compute
        - nova_libvirt
        - nova_total_free_vcpu
        - nova_total_free_memory
        - nova_aggregates_free_memory
        - nova_compute
      dimension:
        cluster_name: nova-data
        nagios_host: 00-top-clusters