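#
# Heka metadata for the kubernetes Salt formula: Calico log parsing for the
# LMA collector, plus health triggers, alarms and alarm clusters for
# Kubernetes master and pool nodes.
#
# A minimal pillar sketch (illustrative; it assumes the usual
# kubernetes:master / kubernetes:pool pillar layout consumed by map.jinja)
# that enables the Calico sections below:
#
#   kubernetes:
#     pool:
#       enabled: true
#       network:
#         calico:
#           enabled: true
#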
{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
{%- set kube_services = ('apiserver', 'scheduler', 'controller-manager') %}
{%- if pool.get('enabled', False) %}
{%- set network = pool.get('network', {}) %}
{%- else %}
{%- if master.get('enabled', False) %}
{%- set network = master.get('network', {}) %}
{%- endif %}
{%- endif %}
{%- if network is defined and network.get('calico', {}).get('enabled', False) %}
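# Parse Calico logs with the shared calico.lua sandbox decoder; each decoder
# instance tags lines with its calico_service (felix, bird or confd).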
log_collector:
  decoder:
    calico_felix:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/calico.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      adjust_timezone: true
      config:
        calico_service: 'felix'
    calico_bird:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/calico.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      adjust_timezone: true
      config:
        calico_service: 'bird'
    calico_confd:
      engine: sandbox
      module_file: /usr/share/lma_collector/decoders/calico.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      adjust_timezone: true
      config:
        calico_service: 'confd'
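  # Tail the runit/svlogd-style log directories under /var/log/calico/: the
  # live "current" file plus rotated "@<timestamp>.s|.u" files, ordered by
  # the Seq capture group.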
  input:
    calico_felix:
      engine: logstreamer
      log_directory: "/var/log"
      file_match: 'calico/felix/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$'
      differentiator: ['calico', '.', 'felix']
      priority: ["^Seq"]
      decoder: "calico_felix_decoder"
      splitter: "TokenSplitter"
    calico_bird:
      engine: logstreamer
      log_directory: "/var/log"
      file_match: 'calico/bird(?P<Service>6*)/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$'
      differentiator: ['calico', '.', 'bird', 'Service']
      priority: ["^Seq"]
      decoder: "calico_bird_decoder"
      splitter: "TokenSplitter"
    calico_confd:
      engine: logstreamer
      log_directory: "/var/log"
      file_match: 'calico/confd/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$'
      differentiator: ['calico', '.', 'confd']
      priority: ["^Seq"]
      decoder: "calico_confd_decoder"
      splitter: "TokenSplitter"
{%- endif %}
{%- if master.get('enabled', False) or pool.get('enabled', False) %}
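# Node-local health triggers: each one fires when the last value of the
# watched metric within the 60-second window matches the rule below.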
metric_collector:
  trigger:
{%- if master.get('enabled', False) %}
{%- for kube_service in kube_services %}
    k8s-{{ kube_service }}_local_endpoint:
      description: 'K8s {{ kube_service }} is locally down'
      severity: down
      rules:
      - metric: k8s_service_health
        field:
          service: {{ kube_service }}
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
{%- endfor %}
{%- endif %}
{%- if pool.get('enabled', False) %}
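    # kubelet and kube-proxy health on every pool node.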
    k8s-kubelet_local_endpoint:
      description: 'K8s kubelet is locally down'
      severity: down
      rules:
      - metric: k8s_service_health
        field:
          service: kubelet
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    k8s-proxy_local_endpoint:
      description: 'K8s proxy is locally down'
      severity: down
      rules:
      - metric: k8s_service_health
        field:
          service: proxy
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
{%- endif %}
{%- if network is defined and network.get('calico', {}).get('enabled', False) %}
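    # Calico process checks: process_processes counts matching processes,
    # so '== 0' means the service is gone.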
    calico_docker:
      description: "There is no docker calico-node process running"
      severity: down
      rules:
      - metric: process_processes
        field:
          service: calico-docker
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    calico_felix:
      description: "There is no calico-felix process running"
      severity: down
      rules:
      - metric: process_processes
        field:
          service: calico-felix
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    calico_bird:
      description: "There is no calico-bird process running"
      severity: down
      rules:
      - metric: process_processes
        field:
          service: calico-bird
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    calico_bird6:
      description: "There is no calico-bird6 process running"
      severity: down
      rules:
      - metric: process_processes
        field:
          service: calico-bird6
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    calico_confd:
      description: "There is no calico-confd process running"
      severity: down
      rules:
      - metric: process_processes
        field:
          service: calico-confd
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
{%- endif %}
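  # Alarms attach the triggers above to a service or process dimension so
  # the aggregator below can match and roll them up.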
  alarm:
{%- if master.get('enabled', False) %}
{%- for kube_service in kube_services %}
    k8s-{{ kube_service }}_endpoint:
      alerting: enabled
      triggers:
      - k8s-{{ kube_service }}_local_endpoint
      dimension:
        service: k8s-{{ kube_service }}-endpoint
{%- endfor %}
{%- endif %}
{%- if pool.get('enabled', False) %}
    k8s-kubelet_endpoint:
      alerting: enabled
      triggers:
      - k8s-kubelet_local_endpoint
      dimension:
        service: k8s-kubelet-endpoint
    k8s-proxy_endpoint:
      alerting: enabled
      triggers:
      - k8s-proxy_local_endpoint
      dimension:
        service: k8s-proxy-endpoint
{%- endif %}
{%- if network is defined and network.get('calico', {}).get('enabled', False) %}
    calico_docker:
      alerting: enabled
      triggers:
      - calico_docker
      dimension:
        process: calico-docker
    calico_felix:
      alerting: enabled
      triggers:
      - calico_felix
      dimension:
        process: calico-felix
    calico_bird:
      alerting: enabled
      triggers:
      - calico_bird
      dimension:
        process: calico-bird
    calico_bird6:
      alerting: enabled
      triggers:
      - calico_bird6
      dimension:
        process: calico-bird6
    calico_confd:
      alerting: enabled
      triggers:
      - calico_confd
      dimension:
        process: calico-confd
{%- endif %}
{%- endif %}
{%- if master.get('enabled', False) %}
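# Cluster-wide checks evaluated by the remote collector: apiserver health
# behind the VIP and the share of nodes reporting not_ready.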
remote_collector:
  trigger:
    k8s-apiserver_vip:
      description: 'K8s apiserver is down'
      severity: down
      rules:
      - metric: k8s_service_health_vip
        field:
          service: apiserver
        relational_operator: '=='
        threshold: 0
        window: 60
        periods: 0
        function: last
    k8s_node_some_not_ready:
      description: 'Some k8s nodes are not ready'
      severity: warning
      logical_operator: and
      rules:
      - metric: k8s_nodes
        field:
          status: not_ready
        relational_operator: '>'
        threshold: 0
        window: 120
        periods: 0
        function: last
    k8s_node_majority_not_ready:
      description: 'Majority of k8s nodes are not ready'
      severity: critical
      rules:
      - metric: k8s_nodes_percent
        field:
          status: not_ready
        relational_operator: '>'
        threshold: 50
        window: 120
        periods: 0
        function: last
    k8s_node_all_not_ready:
      description: 'All k8s nodes are not ready'
      severity: down
      rules:
      - metric: k8s_nodes_percent
        field:
          status: not_ready
        relational_operator: '=='
        threshold: 100
        window: 60
        periods: 0
        function: last
  alarm:
    k8s-apiserver-vip:
      alerting: enabled
      triggers:
      - k8s-apiserver_vip
      dimension:
        service: k8s-apiserver-vip
    k8s-nodes-not-ready:
      alerting: enabled
      triggers:
      - k8s_node_all_not_ready
      - k8s_node_majority_not_ready
      - k8s_node_some_not_ready
      dimension:
        service: k8s-nodes
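# Roll per-node alarms up into alarm clusters and map them to Nagios hosts:
# per-service clusters under 01-service-clusters, the k8s-pool, k8s-master
# and calico top-level clusters under 00-top-clusters.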
aggregator:
  alarm_cluster:
    k8s-apiserver_vip:
      policy: highest_severity
      alerting: enabled
      match:
        service: k8s-apiserver-vip
      members:
      - k8s-apiserver-vip
      dimension:
        service: k8s-master
        nagios_host: 01-service-clusters
{%- for kube_service in kube_services %}
    k8s-{{ kube_service }}_endpoint:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        service: k8s-{{ kube_service }}-endpoint
      members:
      - k8s-{{ kube_service }}_endpoint
      dimension:
        service: k8s-master
        nagios_host: 01-service-clusters
{%- endfor %}
    k8s-kubelet_endpoint:
      policy: status_of_members
      alerting: enabled
      group_by: hostname
      match:
        service: k8s-kubelet-endpoint
      members:
      - k8s-kubelet_endpoint
      dimension:
        service: k8s-pool
        nagios_host: 01-service-clusters
    k8s-proxy_endpoint:
      policy: status_of_members
      alerting: enabled
      group_by: hostname
      match:
        service: k8s-proxy-endpoint
      members:
      - k8s-proxy_endpoint
      dimension:
        service: k8s-pool
        nagios_host: 01-service-clusters
    k8s-nodes:
      policy: highest_severity
      alerting: enabled
      group_by: member
      match:
        service: k8s-nodes
      members:
      - k8s-nodes-not-ready
      dimension:
        service: k8s-pool
        nagios_host: 01-service-clusters
    k8s_pool:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: k8s-pool
      members:
      - k8s-kubelet_endpoint
      - k8s-proxy_endpoint
      - k8s-nodes
      dimension:
        cluster_name: k8s-pool
        nagios_host: 00-top-clusters
    k8s_master:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: k8s-master
      members:
      - k8s-apiserver_endpoint
      - k8s-scheduler_endpoint
      - k8s-controller-manager_endpoint
      - k8s-apiserver_vip
      dimension:
        cluster_name: k8s-master
        nagios_host: 00-top-clusters
{%- if master.network is defined and master.network.get('calico', {}).get('enabled', False) %}
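    # Calico clusters key off master.network (the master role's own Calico
    # flag) rather than the node-level 'network' variable, since the
    # aggregator runs on the master.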
    calico_docker:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        process: calico-docker
      members:
      - calico_docker
      dimension:
        service: calico
        nagios_host: 01-service-clusters
    calico_felix:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        process: calico-felix
      members:
      - calico_felix
      dimension:
        service: calico
        nagios_host: 01-service-clusters
    calico_bird:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        process: calico-bird
      members:
      - calico_bird
      dimension:
        service: calico
        nagios_host: 01-service-clusters
    calico_bird6:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        process: calico-bird6
      members:
      - calico_bird6
      dimension:
        service: calico
        nagios_host: 01-service-clusters
    calico_confd:
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        process: calico-confd
      members:
      - calico_confd
      dimension:
        service: calico
        nagios_host: 01-service-clusters
    calico:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: calico
      members:
      - calico_docker
      - calico_felix
      - calico_bird
      - calico_bird6
      - calico_confd
      dimension:
        cluster_name: calico
        nagios_host: 00-top-clusters
{%- endif %}
{%- endif %}