| {%- from "telegraf/map.jinja" import agent with context -%} |
| {%- if agent.output is defined %} |
| {%- if agent.output.prometheus_client is defined %} |
| {%- if agent.output.prometheus_client.bind.address == '0.0.0.0' %} |
| {%- set fqdn_ip4_addresses = [] %} |
| {%- for addr in grains['fqdn_ip4'] %} |
| {%- if not addr.startswith('127.') %} |
| {%- do fqdn_ip4_addresses.append(addr) %} |
| {%- endif %} |
| {%- endfor %} |
| {%- set address = fqdn_ip4_addresses[0] %} |
| {%- else %} |
| {%- set address = agent.output.prometheus_client.bind.address %} |
| {%- endif %} |
| {%- endif %} |
| {%- endif %} |
| {%- raw %} |
| server: |
| alert: |
| NodeDown: |
| if: >- |
| sum by (host, url) (label_replace((up{job=~"telegraf|fluentd"}), "url", "$1", "instance", "(.*):.*")) + on (url) group_left abs(clamp_max(ping_result_code, 1) - 1) == 0 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "The {{ $labels.host }} node is down" |
| description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}, the Telegraf and Fluentd targets on the {{ $labels.host }} node are down." |
| TelegrafGatherErrors: |
| if: >- |
| rate(internal_agent_gather_errors[10m]) > 0 |
| labels: |
| severity: major |
| service: telegraf |
| annotations: |
| summary: "Telegraf failed to gather metrics" |
| description: "Telegraf has gathering errors on the {{ $labels.host }} node for the last 10 minutes." |
| {%- endraw %} |
| {%- if pillar.reclass is defined %} |
| {%- if pillar.reclass.get('storage', {}).get('enabled', False) and pillar.reclass.get('storage', {}).get('data_source',{}).get('engine',"") == "local" %} |
| ReclassUnstagedChanges: |
| if: >- |
| reclass_files_unstaged_changes > 0 |
| for: 30m |
| labels: |
| severity: warning |
| service: reclass |
| annotations: |
| summary: 'Reclass model has been modified' |
| {%- raw %} |
| description: '{{ $labels.value }} files under {{ $labels.directory }} have been modified without being committed' |
| {%- endraw %} |
| ReclassStagedChanges: |
| if: >- |
| reclass_files_staged_changes > 0 |
| for: 30m |
| labels: |
| severity: warning |
| service: reclass |
| annotations: |
| summary: 'Reclass model has diverged from expected local state' |
| {%- raw %} |
| description: '{{ $labels.value }} files under {{ $labels.directory }} have diverged from expected local state' |
| {%- endraw %} |
| ReclassRemoteDesync: |
| if: >- |
| reclass_remote_desync > 0 |
| for: 30m |
| labels: |
| severity: warning |
| service: reclass |
| annotations: |
| summary: 'Local Reclass model state has diverted from remote state' |
| {%- raw %} |
| description: '{{ $labels.value }} local files under {{ $labels.directory }} have diverged from expected remote state' |
| {%- endraw %} |
| {%- endif %} |
| {%- endif %} |
| {%- if pillar.neutron is defined %} |
| {%- for component in ['gateway', 'compute'] %} |
| {%- set neutron_config = pillar.neutron.get(component, {}) %} |
| {%- if neutron_config.get('enabled', False) == True and 'ovs' in neutron_config.get('backend', {}).get('mechanism', {}).keys() %} |
| {%- raw %} |
| OVSTooManyPortRunningOnAgent: |
| if: >- |
| sum by (host) (ovs_bridge_status) > 1500 |
| for: 2m |
| labels: |
| severity: major |
| service: neutron |
| annotations: |
| summary: "High number of ovs ports on host" |
| description: "The number of ovs port is {{ $value }} (ovs-vsctl list port ) on {{ $labels.host }} which is more than the expected limit for 2 minutes." |
| OVSErrorOnPort: |
| if: >- |
| ovs_bridge_status == 2 |
| for: 2m |
| labels: |
| severity: critical |
| service: neutron |
| annotations: |
| summary: "OVS port is reporting error" |
| description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting errors for 2 minutes." |
| OVSNonInternalPortDown: |
| if: >- |
| ovs_bridge_status{type!="internal"} == 0 |
| for: 5m |
| labels: |
| severity: critical |
| service: neutron |
| annotations: |
| summary: "Non internal ovs port is down" |
| description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting status down for 5 minutes." |
| OVSGatherFailed: |
| if: >- |
| ovs_bridge_check == 0 |
| labels: |
| severity: critical |
| service: neutron |
| annotations: |
| summary: "Failed to Gather OVS information" |
| description: "Failed to Gather OVS information on host {{ $labels.host }}" |
| {%- endraw %} |
| {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %} |
| OVSInstanceArpingCheckDown: |
| if: instance_arping_check_up == 0 |
| for: 2m |
| labels: |
| severity: major |
| service: ovs |
| annotations: |
| summary: "The OVS instance arping check is down" |
| {%- raw %} |
| description: "The OVS instance arping check on the {{ $labels.host }} node is down for 2 minutes." |
| {%- endraw %} |
| {%- endif %} |
| {%- endif %} |
| {%- endfor %} |
| {%- endif %} |
| {%- if pillar.opencontrail is defined %} |
| {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %} |
| OpencontrailInstancePingCheckDown: |
| if: instance_ping_check_up == 0 |
| for: 2m |
| labels: |
| severity: major |
| service: contrail |
| annotations: |
| summary: "The Opencontrail instance ping check is down" |
| {%- raw %} |
| description: "The Opencontrail instance ping check on the {{ $labels.host }} node is down for 2 minutes." |
| {%- endraw %} |
| {%- endif %} |
| {%- endif %} |
| {%- if pillar.neutron is defined %} |
| {%- if (pillar.neutron.get('gateway', {}).get('enabled', False) == True and 'ovs' in pillar.neutron.get('gateway', {}).get('backend', {}).get('mechanism', {}).keys()) %} |
| or (pillar.neutron.get('compute', {}).get('enabled', False) == True and 'ovs' in pillar.neutron.get('compute', {}).get('backend', {}).get('mechanism', {}).keys() and pillar.neutron.get('compute', {}).get('dhcp_agent_enabled', False) == True) %} |
| recording: |
| instance_id:instance_arping_success: |
| query: >- |
| avg(instance_arping_success) by (id) |
| instance_id:instance_arping_success:avg10m:for10m: |
| query: >- |
| avg_over_time(instance_id:instance_arping_success[10m]) and instance_id:instance_arping_success and instance_id:instance_arping_success offset 10m |
| total:instance_id:instance_arping_success:avg10m:for10m: |
| query: >- |
| count(instance_id:instance_arping_success:avg10m:for10m) |
| total:instance_id:instance_arping_success:avg10m:for10m:eq0: |
| query: >- |
| count(instance_id:instance_arping_success:avg10m:for10m == 0) |
| total:openstack_nova_instance_failed: |
| query: >- |
| count(instance_id:instance_arping_success:avg10m:for10m == 0 or on(id) openstack_nova_instance_status == 2) or vector(0) |
| total:openstack_nova_instance_all: |
| query: >- |
| count(instance_id:instance_arping_success:avg10m:for10m or on(id) openstack_nova_instance_status) or vector(0) |
| {%- endif %} |
| {%- endif %} |
| {%- if pillar.opencontrail is defined %} |
| {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %} |
| recording: |
| instance_id:instance_ping_success: |
| query: >- |
| avg(instance_ping_success) by (id) * on(id) instance_ping_valid or on(id) instance_ping_valid |
| instance_id:instance_ping_success:avg10m:for10m: |
| query: >- |
| avg_over_time(instance_id:instance_ping_success[10m]) and instance_id:instance_ping_success and instance_id:instance_ping_success offset 10m |
| total:instance_id:instance_ping_success:avg10m:for10m: |
| query: >- |
| count(instance_id:instance_ping_success:avg10m:for10m) |
| total:instance_id:instance_ping_success:avg10m:for10m:eq0: |
| query: >- |
| count(instance_id:instance_ping_success:avg10m:for10m == 0) |
| total:openstack_nova_instance_failed: |
| query: >- |
| count(instance_id:instance_ping_success:avg10m:for10m == 0 or on(id) openstack_nova_instance_status == 2) or vector(0) |
| total:openstack_nova_instance_all: |
| query: >- |
| count(instance_id:instance_ping_success:avg10m:for10m or on(id) openstack_nova_instance_status) or vector(0) |
| {%- endif %} |
| {%- endif %} |
| {%- if address is defined %} |
| target: |
| static: |
| telegraf: |
| {%- if agent.output.prometheus_client.scheme is defined %} |
| scheme: {{ agent.output.prometheus_client.scheme }} |
| {%- endif %} |
| {%- if agent.output.prometheus_client.tls_config is defined %} |
| tls_config: |
| {%- if agent.output.prometheus_client.tls_config.skip_verify is defined %} |
| skip_verify: {{ agent.output.prometheus_client.tls_config.skip_verify }} |
| {%- endif %} |
| {%- if agent.output.prometheus_client.tls_config.ca_file is defined %} |
| ca_file: {{ agent.output.prometheus_client.tls_config.ca_file }} |
| {%- endif %} |
| {%- if agent.output.prometheus_client.tls_config.cert_name is defined %} |
| cert_name: {{ agent.output.prometheus_client.tls_config.cert_name }} |
| {%- endif %} |
| {%- if agent.output.prometheus_client.tls_config.key_name is defined %} |
| key_name: {{ agent.output.prometheus_client.tls_config.key_name }} |
| {%- endif %} |
| {%- endif %} |
| endpoint: |
| - address: {{ address }} |
| port: {{ agent.output.prometheus_client.bind.port }} |
| relabel_configs: |
| - regex: {{ address }}:{{ agent.output.prometheus_client.bind.port }} |
| replacement: {{ grains['host'] }} |
| source_labels: "__address__" |
| target_label: "host" |
| honor_labels: true |
| {%- endif %} |