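{#-
  Prometheus alert, recording-rule, and scrape-target definitions for the
  Telegraf formula. The block below resolves the address used to scrape the
  prometheus_client output: when Telegraf binds to 0.0.0.0, the first
  non-loopback address from the fqdn_ip4 grain is used instead.
-#}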
{%- from "telegraf/map.jinja" import agent with context -%}
{%- if agent.output is defined %}
{%- if agent.output.prometheus_client is defined %}
{%- if agent.output.prometheus_client.bind.address == '0.0.0.0' %}
{%- set fqdn_ip4_addresses = [] %}
{%- for addr in grains['fqdn_ip4'] %}
{%- if not addr.startswith('127.') %}
{%- do fqdn_ip4_addresses.append(addr) %}
{%- endif %}
{%- endfor %}
{%- set address = fqdn_ip4_addresses[0] %}
{%- else %}
{%- set address = agent.output.prometheus_client.bind.address %}
{%- endif %}
{%- endif %}
{%- endif %}
{%- raw %}
server:
  alert:
    NodeDown:
      if: >-
        sum by (host, url) (label_replace((up{job=~"telegraf|fluentd"}), "url", "$1", "instance", "(.*):.*")) + on (url) group_left abs(clamp_max(ping_result_code, 1) - 1) == 0
      labels:
        severity: critical
        service: system
      annotations:
        summary: "The {{ $labels.host }} node is down"
        description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}; the Telegraf and Fluentd targets on the {{ $labels.host }} node are down."
    TelegrafGatherErrors:
      if: >-
        rate(internal_agent_gather_errors[10m]) > 0
      labels:
        severity: major
        service: telegraf
      annotations:
        summary: "Telegraf failed to gather metrics"
        description: "Telegraf has been reporting gathering errors on the {{ $labels.host }} node for the last 10 minutes."
{%- endraw %}
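{#-
  Reclass model drift alerts, rendered only when the Reclass storage pillar is
  enabled with a local data source, e.g. (illustrative pillar only):
    reclass:
      storage:
        enabled: true
        data_source:
          engine: local
-#}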
{%- if pillar.reclass is defined %}
{%- if pillar.reclass.get('storage', {}).get('enabled', False) and pillar.reclass.get('storage', {}).get('data_source',{}).get('engine',"") == "local" %}
    ReclassUnstagedChanges:
      if: >-
        reclass_files_unstaged_changes > 0
      for: 30m
      labels:
        severity: warning
        service: reclass
      annotations:
        summary: 'Reclass model has been modified'
{%- raw %}
        description: '{{ $labels.value }} files under {{ $labels.directory }} have been modified without being committed'
{%- endraw %}
    ReclassStagedChanges:
      if: >-
        reclass_files_staged_changes > 0
      for: 30m
      labels:
        severity: warning
        service: reclass
      annotations:
        summary: 'Reclass model has diverged from expected local state'
{%- raw %}
        description: '{{ $labels.value }} files under {{ $labels.directory }} have diverged from expected local state'
{%- endraw %}
    ReclassRemoteDesync:
      if: >-
        reclass_remote_desync > 0
      for: 30m
      labels:
        severity: warning
        service: reclass
      annotations:
        summary: 'Local Reclass model state has diverged from the remote state'
{%- raw %}
        description: '{{ $labels.value }} local files under {{ $labels.directory }} have diverged from expected remote state'
{%- endraw %}
{%- endif %}
{%- endif %}
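{#-
  Open vSwitch alerts for Neutron gateway and compute nodes, rendered per
  component when it is enabled with the 'ovs' mechanism, e.g. (illustrative
  pillar only):
    neutron:
      compute:
        enabled: true
        backend:
          mechanism:
            ovs: {}
-#}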
{%- if pillar.neutron is defined %}
{%- for component in ['gateway', 'compute'] %}
{%- set neutron_config = pillar.neutron.get(component, {}) %}
{%- if neutron_config.get('enabled', False) == True and 'ovs' in neutron_config.get('backend', {}).get('mechanism', {}).keys() %}
{%- raw %}
    OVSTooManyPortRunningOnAgent:
      if: >-
        sum by (host) (ovs_bridge_status) > 1500
      for: 2m
      labels:
        severity: major
        service: neutron
      annotations:
        summary: "High number of OVS ports on host"
        description: "The number of OVS ports on the {{ $labels.host }} node is {{ $value }} (ovs-vsctl list port), which has exceeded the expected limit for 2 minutes."
    OVSErrorOnPort:
      if: >-
        ovs_bridge_status == 2
      for: 2m
      labels:
        severity: critical
        service: neutron
      annotations:
        summary: "OVS port is reporting errors"
        description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} has been reporting errors for 2 minutes."
    OVSNonInternalPortDown:
      if: >-
        ovs_bridge_status{type!="internal"} == 0
      for: 5m
      labels:
        severity: critical
        service: neutron
      annotations:
        summary: "Non-internal OVS port is down"
        description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} has been reporting status down for 5 minutes."
    OVSGatherFailed:
      if: >-
        ovs_bridge_check == 0
      labels:
        severity: critical
        service: neutron
      annotations:
        summary: "Failed to gather OVS information"
        description: "Failed to gather OVS information on the {{ $labels.host }} node."
{%- endraw %}
{%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
    OVSInstanceArpingCheckDown:
      if: instance_arping_check_up == 0
      for: 2m
      labels:
        severity: major
        service: ovs
      annotations:
        summary: "The OVS instance arping check is down"
{%- raw %}
description: "The OVS instance arping check on the {{ $labels.host }} node is down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- endif %}
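{#-
  Instance ping availability alert for OpenContrail compute nodes, rendered
  only when pillar.opencontrail.compute.enabled is true.
-#}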
{%- if pillar.opencontrail is defined %}
{%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
    OpencontrailInstancePingCheckDown:
      if: instance_ping_check_up == 0
      for: 2m
      labels:
        severity: major
        service: contrail
      annotations:
        summary: "The OpenContrail instance ping check is down"
{%- raw %}
description: "The Opencontrail instance ping check on the {{ $labels.host }} node is down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- endif %}
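{#-
  Recording rules that pre-aggregate the instance arping checks on OVS-backed
  Neutron nodes into per-instance averages and fleet-wide counters
  (total:openstack_nova_instance_failed / total:openstack_nova_instance_all).
-#}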
{%- if pillar.neutron is defined %}
{%- if (pillar.neutron.get('gateway', {}).get('enabled', False) == True and 'ovs' in pillar.neutron.get('gateway', {}).get('backend', {}).get('mechanism', {}).keys())
      or (pillar.neutron.get('compute', {}).get('enabled', False) == True and 'ovs' in pillar.neutron.get('compute', {}).get('backend', {}).get('mechanism', {}).keys() and pillar.neutron.get('compute', {}).get('dhcp_agent_enabled', False) == True) %}
  recording:
    instance_id:instance_arping_success:
      query: >-
        avg(instance_arping_success) by (id)
    instance_id:instance_arping_success:avg10m:for10m:
      query: >-
        avg_over_time(instance_id:instance_arping_success[10m]) and instance_id:instance_arping_success and instance_id:instance_arping_success offset 10m
    total:instance_id:instance_arping_success:avg10m:for10m:
      query: >-
        count(instance_id:instance_arping_success:avg10m:for10m)
    total:instance_id:instance_arping_success:avg10m:for10m:eq0:
      query: >-
        count(instance_id:instance_arping_success:avg10m:for10m == 0)
    total:openstack_nova_instance_failed:
      query: >-
        count(instance_id:instance_arping_success:avg10m:for10m == 0 or on(id) openstack_nova_instance_status == 2) or vector(0)
    total:openstack_nova_instance_all:
      query: >-
        count(instance_id:instance_arping_success:avg10m:for10m or on(id) openstack_nova_instance_status) or vector(0)
{%- endif %}
{%- endif %}
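{#-
  Equivalent recording rules based on the instance ping checks when the
  OpenContrail compute role is enabled.
-#}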
{%- if pillar.opencontrail is defined %}
{%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
  recording:
    instance_id:instance_ping_success:
      query: >-
        avg(instance_ping_success) by (id) * on(id) instance_ping_valid or on(id) instance_ping_valid
    instance_id:instance_ping_success:avg10m:for10m:
      query: >-
        avg_over_time(instance_id:instance_ping_success[10m]) and instance_id:instance_ping_success and instance_id:instance_ping_success offset 10m
    total:instance_id:instance_ping_success:avg10m:for10m:
      query: >-
        count(instance_id:instance_ping_success:avg10m:for10m)
    total:instance_id:instance_ping_success:avg10m:for10m:eq0:
      query: >-
        count(instance_id:instance_ping_success:avg10m:for10m == 0)
    total:openstack_nova_instance_failed:
      query: >-
        count(instance_id:instance_ping_success:avg10m:for10m == 0 or on(id) openstack_nova_instance_status == 2) or vector(0)
    total:openstack_nova_instance_all:
      query: >-
        count(instance_id:instance_ping_success:avg10m:for10m or on(id) openstack_nova_instance_status) or vector(0)
{%- endif %}
{%- endif %}
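{#-
  Static Prometheus scrape target for the Telegraf prometheus_client output,
  rendered only when a bind address could be resolved above. Scheme and TLS
  options are passed through from the Telegraf agent configuration when
  present, and a relabeling rule maps the scrape address back to the node's
  host label.
-#}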
{%- if address is defined %}
target:
  static:
    telegraf:
{%- if agent.output.prometheus_client.scheme is defined %}
      scheme: {{ agent.output.prometheus_client.scheme }}
{%- endif %}
{%- if agent.output.prometheus_client.tls_config is defined %}
      tls_config:
{%- if agent.output.prometheus_client.tls_config.skip_verify is defined %}
        skip_verify: {{ agent.output.prometheus_client.tls_config.skip_verify }}
{%- endif %}
{%- if agent.output.prometheus_client.tls_config.ca_file is defined %}
        ca_file: {{ agent.output.prometheus_client.tls_config.ca_file }}
{%- endif %}
{%- if agent.output.prometheus_client.tls_config.cert_name is defined %}
        cert_name: {{ agent.output.prometheus_client.tls_config.cert_name }}
{%- endif %}
{%- if agent.output.prometheus_client.tls_config.key_name is defined %}
        key_name: {{ agent.output.prometheus_client.tls_config.key_name }}
{%- endif %}
{%- endif %}
      endpoint:
        - address: {{ address }}
          port: {{ agent.output.prometheus_client.bind.port }}
      relabel_configs:
        - regex: {{ address }}:{{ agent.output.prometheus_client.bind.port }}
          replacement: {{ grains['host'] }}
          source_labels: "__address__"
          target_label: "host"
      honor_labels: true
{%- endif %}