| {%- from "kubernetes/map.jinja" import master with context %} |
| {%- from "kubernetes/map.jinja" import pool with context %} |
| |
| {#- Control-plane services that get generated triggers, alarms and clusters further down. #} |
| {% set kube_services = ('apiserver', 'scheduler', 'controller-manager') %} |
| |
| {#- Select the network settings of the enabled role; the pool role takes precedence over master. `network` is left undefined when neither role is enabled. #} |
| {%- if pool.get('enabled', False) %} |
| {% set network = pool.get('network', {}) %} |
| {%- elif master.get('enabled', False) %} |
| {% set network = master.get('network', {}) %} |
| {% endif %} |
| |
| {%- if network is defined and network.get('calico', {}).get('enabled', False) %} |
| # Calico log collection. This section is rendered only when `network` was |
| # set above and its calico entry is enabled. |
| log_collector: |
| decoder: |
| # One sandbox decoder per calico service. All three load the same Lua module |
| # and differ only in the calico_service value passed through config. |
| calico_felix: |
| engine: sandbox |
| module_file: /usr/share/lma_collector/decoders/calico.lua |
| module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules |
| adjust_timezone: true |
| config: |
| calico_service: 'felix' |
| calico_bird: |
| engine: sandbox |
| module_file: /usr/share/lma_collector/decoders/calico.lua |
| module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules |
| adjust_timezone: true |
| config: |
| calico_service: 'bird' |
| calico_confd: |
| engine: sandbox |
| module_file: /usr/share/lma_collector/decoders/calico.lua |
| module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules |
| adjust_timezone: true |
| config: |
| calico_service: 'confd' |
| input: |
| # One logstreamer input per calico service, all rooted at /var/log. |
| # file_match accepts a file named "current", or "@" followed by hex digits |
| # and ".s" or ".u", inside the service directory. The hex digits are captured |
| # as Seq, which is the group that priority refers to. |
| # NOTE(review): each decoder value is a decoder key from above plus a |
| # "_decoder" suffix - presumably appended by the consumer of this file; confirm. |
| calico_felix: |
| engine: logstreamer |
| log_directory: "/var/log" |
| file_match: 'calico/felix/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$' |
| differentiator: ['calico', '.', 'felix'] |
| priority: ["^Seq"] |
| decoder: "calico_felix_decoder" |
| splitter: "TokenSplitter" |
| calico_bird: |
| engine: logstreamer |
| log_directory: "/var/log" |
| # Service captures any run of "6" that follows "bird" (empty for bird, |
| # "6" for bird6) and is appended to the differentiator. |
| file_match: 'calico/bird(?P<Service>6*)/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$' |
| differentiator: ['calico', '.', 'bird', 'Service'] |
| priority: ["^Seq"] |
| decoder: "calico_bird_decoder" |
| splitter: "TokenSplitter" |
| calico_confd: |
| engine: logstreamer |
| log_directory: "/var/log" |
| file_match: 'calico/confd/(current|@(?P<Seq>[0-9a-f][0-9a-f]*)\.[su])$' |
| differentiator: ['calico', '.', 'confd'] |
| priority: ["^Seq"] |
| decoder: "calico_confd_decoder" |
| splitter: "TokenSplitter" |
| {%- endif %} |
| {%- if master.get('enabled', False) or pool.get('enabled', False) %} |
| # Triggers and alarms defined under metric_collector; rendered when the |
| # master role or the pool role is enabled. |
| metric_collector: |
| trigger: |
| {%- if master.get('enabled', False) %} |
| # Master role: one trigger per entry of kube_services. Each rule compares the |
| # last k8s_service_health value for that service with 0. |
| {% for kube_service in kube_services %} |
| k8s-{{ kube_service }}_local_endpoint: |
| description: 'K8s {{ kube_service }} is locally down' |
| severity: down |
| rules: |
| - metric: k8s_service_health |
| field: |
| service: {{ kube_service }} |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| {%- endfor %} |
| {%- endif %} |
| {%- if pool.get('enabled', False) %} |
| # Pool role: the same k8s_service_health check for the kubelet and proxy services. |
| k8s-kubelet_local_endpoint: |
| description: 'K8s kubelet is locally down' |
| severity: down |
| rules: |
| - metric: k8s_service_health |
| field: |
| service: kubelet |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| k8s-proxy_local_endpoint: |
| description: 'K8s proxy is locally down' |
| severity: down |
| rules: |
| - metric: k8s_service_health |
| field: |
| service: proxy |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| {%- endif %} |
| {%- if network is defined and network.get('calico', {}).get('enabled', False) %} |
| # Calico: one trigger per process name (docker, felix, bird, bird6, confd). |
| # Each rule compares the last process_processes value for that service with 0. |
| calico_docker: |
| description: "There is no docker calico-node process running" |
| severity: down |
| rules: |
| - metric: process_processes |
| field: |
| service: calico-docker |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| calico_felix: |
| description: "There is no calico-felix process running" |
| severity: down |
| rules: |
| - metric: process_processes |
| field: |
| service: calico-felix |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| calico_bird: |
| description: "There is no calico-bird process running" |
| severity: down |
| rules: |
| - metric: process_processes |
| field: |
| service: calico-bird |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| calico_bird6: |
| description: "There is no calico-bird6 process running" |
| severity: down |
| rules: |
| - metric: process_processes |
| field: |
| service: calico-bird6 |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| calico_confd: |
| description: "There is no calico-confd process running" |
| severity: down |
| rules: |
| - metric: process_processes |
| field: |
| service: calico-confd |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| {%- endif %} |
| |
| # Each alarm below names the trigger defined above that it uses and declares |
| # a dimension: service for the k8s alarms, process for the calico alarms. |
| # The alarms are guarded by the same conditions as their triggers. |
| alarm: |
| {%- if master.get('enabled', False) %} |
| {%- for kube_service in kube_services %} |
| k8s-{{ kube_service }}_endpoint: |
| alerting: enabled |
| triggers: |
| - k8s-{{ kube_service }}_local_endpoint |
| dimension: |
| service: k8s-{{ kube_service }}-endpoint |
| {%- endfor %} |
| {%- endif %} |
| {%- if pool.get('enabled', False) %} |
| k8s-kubelet_endpoint: |
| alerting: enabled |
| triggers: |
| - k8s-kubelet_local_endpoint |
| dimension: |
| service: k8s-kubelet-endpoint |
| k8s-proxy_endpoint: |
| alerting: enabled |
| triggers: |
| - k8s-proxy_local_endpoint |
| dimension: |
| service: k8s-proxy-endpoint |
| {%- endif %} |
| {%- if network is defined and network.get('calico', {}).get('enabled', False) %} |
| calico_docker: |
| alerting: enabled |
| triggers: |
| - calico_docker |
| dimension: |
| process: calico-docker |
| calico_felix: |
| alerting: enabled |
| triggers: |
| - calico_felix |
| dimension: |
| process: calico-felix |
| calico_bird: |
| alerting: enabled |
| triggers: |
| - calico_bird |
| dimension: |
| process: calico-bird |
| calico_bird6: |
| alerting: enabled |
| triggers: |
| - calico_bird6 |
| dimension: |
| process: calico-bird6 |
| calico_confd: |
| alerting: enabled |
| triggers: |
| - calico_confd |
| dimension: |
| process: calico-confd |
| {%- endif %} |
| {%- endif %} |
| |
| {%- if master.get('enabled', False) %} |
| # Triggers and alarms defined under remote_collector; rendered only when the |
| # master role is enabled. |
| remote_collector: |
| trigger: |
| # Fires with severity down when the last k8s_service_health_vip value for |
| # the apiserver equals 0. |
| k8s-apiserver_vip: |
| description: 'K8s apiserver is down' |
| severity: down |
| rules: |
| - metric: k8s_service_health_vip |
| field: |
| service: apiserver |
| relational_operator: '==' |
| threshold: 0 |
| window: 60 |
| periods: 0 |
| function: last |
| # Node readiness is covered by three triggers of increasing severity: |
| # some not ready (count > 0, warning), majority not ready (> 50 percent, |
| # critical) and all not ready (== 100 percent, down). |
| # NOTE(review): logical_operator is set on the next trigger although it has |
| # a single rule - confirm whether a second rule is missing or the key is a leftover. |
| k8s_node_some_not_ready: |
| description: 'Some k8s nodes are not ready' |
| severity: warning |
| logical_operator: and |
| rules: |
| - metric: k8s_nodes |
| field: |
| status: not_ready |
| relational_operator: '>' |
| threshold: 0 |
| window: 120 |
| periods: 0 |
| function: last |
| k8s_node_majority_not_ready: |
| description: 'Majority of k8s nodes are not ready' |
| severity: critical |
| rules: |
| - metric: k8s_nodes_percent |
| field: |
| status: not_ready |
| relational_operator: '>' |
| threshold: 50 |
| window: 120 |
| periods: 0 |
| function: last |
| k8s_node_all_not_ready: |
| description: 'All k8s nodes are not ready' |
| severity: down |
| rules: |
| - metric: k8s_nodes_percent |
| field: |
| status: not_ready |
| relational_operator: '==' |
| threshold: 100 |
| window: 60 |
| periods: 0 |
| function: last |
| alarm: |
| k8s-apiserver-vip: |
| alerting: enabled |
| triggers: |
| - k8s-apiserver_vip |
| dimension: |
| service: k8s-apiserver-vip |
| # A single alarm lists the three node readiness triggers, from the most |
| # severe (down) to the least severe (warning). |
| k8s-nodes-not-ready: |
| alerting: enabled |
| triggers: |
| - k8s_node_all_not_ready |
| - k8s_node_majority_not_ready |
| - k8s_node_some_not_ready |
| dimension: |
| service: k8s-nodes |
| {%- endif %} |
| |
| {%- if master.get('enabled', False) %} |
| # Alarm clusters defined under aggregator; rendered only when the master role |
| # is enabled. Clusters with nagios_host 01-service-clusters match one alarm |
| # dimension each; the clusters with nagios_host 00-top-clusters (k8s_pool, |
| # k8s_master, calico) list those service clusters as their members. |
| aggregator: |
| alarm_cluster: |
| k8s-apiserver_vip: |
| policy: highest_severity |
| alerting: enabled |
| match: |
| service: k8s-apiserver-vip |
| members: |
| - k8s-apiserver-vip |
| dimension: |
| service: k8s-master |
| nagios_host: 01-service-clusters |
| {%- for kube_service in kube_services %} |
| k8s-{{ kube_service }}_endpoint: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| service: k8s-{{ kube_service }}-endpoint |
| members: |
| - k8s-{{ kube_service }}_endpoint |
| dimension: |
| service: k8s-master |
| nagios_host: 01-service-clusters |
| {%- endfor %} |
| k8s-kubelet_endpoint: |
| policy: status_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| service: k8s-kubelet-endpoint |
| members: |
| - k8s-kubelet_endpoint |
| dimension: |
| service: k8s-pool |
| nagios_host: 01-service-clusters |
| k8s-proxy_endpoint: |
| policy: status_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| service: k8s-proxy-endpoint |
| members: |
| - k8s-proxy_endpoint |
| dimension: |
| service: k8s-pool |
| nagios_host: 01-service-clusters |
| k8s-nodes: |
| policy: highest_severity |
| alerting: enabled |
| group_by: member |
| match: |
| service: k8s-nodes |
| members: |
| - k8s-nodes-not-ready |
| dimension: |
| service: k8s-pool |
| nagios_host: 01-service-clusters |
| k8s_pool: |
| policy: highest_severity |
| alerting: enabled_with_notification |
| match: |
| service: k8s-pool |
| members: |
| - k8s-kubelet_endpoint |
| - k8s-proxy_endpoint |
| - k8s-nodes |
| dimension: |
| cluster_name: k8s-pool |
| nagios_host: 00-top-clusters |
| k8s_master: |
| policy: highest_severity |
| alerting: enabled_with_notification |
| match: |
| service: k8s-master |
| members: |
| # The endpoint members are generated from kube_services so that this list |
| # cannot drift from the per-service clusters generated earlier in this section. |
| {%- for kube_service in kube_services %} |
| - k8s-{{ kube_service }}_endpoint |
| {%- endfor %} |
| - k8s-apiserver_vip |
| dimension: |
| cluster_name: k8s-master |
| nagios_host: 00-top-clusters |
| {%- if master.network is defined and master.network.get('calico', {}).get('enabled', False) %} |
| # Calico clusters are keyed on the master role's own network settings: |
| # one cluster per calico process, plus the top-level calico cluster. |
| calico_docker: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| process: calico-docker |
| members: |
| - calico_docker |
| dimension: |
| service: calico |
| nagios_host: 01-service-clusters |
| calico_felix: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| process: calico-felix |
| members: |
| - calico_felix |
| dimension: |
| service: calico |
| nagios_host: 01-service-clusters |
| calico_bird: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| process: calico-bird |
| members: |
| - calico_bird |
| dimension: |
| service: calico |
| nagios_host: 01-service-clusters |
| calico_bird6: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| process: calico-bird6 |
| members: |
| - calico_bird6 |
| dimension: |
| service: calico |
| nagios_host: 01-service-clusters |
| calico_confd: |
| policy: availability_of_members |
| alerting: enabled |
| group_by: hostname |
| match: |
| process: calico-confd |
| members: |
| - calico_confd |
| dimension: |
| service: calico |
| nagios_host: 01-service-clusters |
| calico: |
| policy: highest_severity |
| alerting: enabled_with_notification |
| match: |
| service: calico |
| members: |
| - calico_docker |
| - calico_felix |
| - calico_bird |
| - calico_bird6 |
| - calico_confd |
| dimension: |
| cluster_name: calico |
| nagios_host: 00-top-clusters |
| {%- endif %} |
| {%- endif %} |