主机
- 主机内存使用率超过阈值
1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
阈值:0.9
- 主机 CPU 使用率超过阈值
1 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (host_name)
阈值:0.85
- 主机硬盘使用率超过阈值
1 - avg without (fstype)(node_filesystem_free_bytes{fstype!='rootfs',mountpoint!~'/(run|var|snap).*'} / node_filesystem_size_bytes{fstype!='rootfs',mountpoint!~'/(run|var|snap).*'})
阈值:0.8
Windows
- Windows 主机内存使用率超过阈值
1 - windows_os_physical_memory_free_bytes{job="windows_exporter"} / windows_cs_physical_memory_bytes
阈值:0.9
- Windows 主机 CPU 使用率超过阈值
1 - (avg by (host_ip,host_name) (irate(windows_cpu_time_total{job="windows_exporter",mode="idle"}[1m])))
阈值:0.85
- Windows 主机硬盘使用率超过阈值
1 - windows_logical_disk_free_bytes{job="windows_exporter",volume!~"HarddiskVolume.*"} / windows_logical_disk_size_bytes
阈值:0.8
Kubernetes
- 集群数量发生变化
count(count(kube_node_created{}) by (cluster))
阈值:不等于真实值
- Pod CPU 被限制核数
sum (rate (container_cpu_cfs_throttled_seconds_total{}[5m])) by (pod)
阈值: 大于 1
- Pod CPU 被限制比例
sum by (cluster, namespace, pod)(irate(container_cpu_cfs_throttled_periods_total{container!="POD", container!=""}[5m])) / sum by (cluster, namespace, pod)( irate(container_cpu_cfs_periods_total{container!="POD", container!=""}[5m])) > 0.5
阈值: 大于 0.5
- POD 10 分钟重启次数超过3次
sum (increase (kube_pod_container_status_restarts_total{}[10m])) by (cluster, namespace,pod)
阈值:大于 3
- 5 分钟不可用节点增量
sum(kube_node_status_condition{status!="true"}) by (cluster, exported_node, condition) - sum(kube_node_status_condition{status!="true"} offset 5m) by (cluster, exported_node, condition)
阈值:大于 0
- 10 分钟内 POD 发生 OOM
avg_over_time(kube_pod_container_status_last_terminated_reason{reason='OOMKilled'}[10m]) > 0 and avg_over_time(kube_pod_container_status_last_terminated_reason{reason='OOMKilled'}[10m]) < 1
阈值:大于 0(表达式已将结果限定在 0 与 1 之间,有返回值即表示 10 分钟内发生过 OOM)
- POD 等待
sum by (namespace, pod, reason) (kube_pod_container_status_waiting_reason{})
阈值:大于 0
- 节点时间不同步
min_over_time(node_timex_sync_status[5m])
阈值:等于 0
- controller 的调度深度
sum by (cluster) (workqueue_depth{})
阈值:大于 0
- 5 分钟队列增加总数
sum by (cluster) (rate(workqueue_adds_total{}[5m]))
阈值:大于 50
- 对象在队列中停留的时间
histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{}[5m])) by (cluster, le))
阈值:大于 0.5
ClickHouse
- 副本同步队列出现堆积
clickhouse_replicas_max_queue_size{}
阈值:大于 100
Elasticsearch
- Elastic_Cluster_Health_RED 有主分片没能正常运行
elasticsearch_cluster_health_status{color="red"}
阈值:等于 1
- Elasticsearch_health_up 集群处于非健康状态
elasticsearch_cluster_health_up{}
阈值:不等于 1
- ES heap 内存占用率高于90%
sum by ( cluster_name, host) (elasticsearch_jvm_memory_used_bytes{area="heap"}) / sum by ( cluster_name, host) (elasticsearch_jvm_memory_max_bytes{area="heap"})
阈值:大于 0.9
- ES 待处理的任务数大于 10
elasticsearch_cluster_health_number_of_pending_tasks{}
阈值:大于10
MongoDB
- MongoDB ops commands操作过高
mongodb_commands_per_sec{} or cloudwatch_aws_doc_db_opcounters_command_average{}
阈值:大于 5000
- MongoDB 连接数过高
mongodb_open_connections{} or cloudwatch_aws_doc_db_database_connections_average{}
阈值:大于5000
- MongoDB wiredTiger cache 达到驱逐阈值
sum by (cluster_name) (mongodb_wtcache_current_bytes{}) / sum by (cluster_name) (mongodb_wtcache_max_bytes_configured{}) * 100
阈值:大于90
- MongoDB wiredTiger dirty cache 达到驱逐阈值
sum by (cluster_name) (mongodb_wtcache_tracked_dirty_bytes{}) / sum by (cluster_name) (mongodb_wtcache_max_bytes_configured{}) * 100
阈值:大于20
Redis
- 服务状态异常
redis_up{job="telegraf"}
阈值:<1
- instance 当前内存使用率过大
redis_memory_used_bytes{job="telegraf"} / redis_config_maxmemory
阈值:>0.8
- instance 内存使用率10分钟增长超过50%
(redis_memory_used_bytes{job="telegraf"} / redis_config_maxmemory) - (redis_memory_used_bytes{job="telegraf"} offset 10m / redis_config_maxmemory offset 10m)
阈值:> 0.5
- cluster 当前内存使用率过大
sum(redis_memory_used_bytes{job="telegraf"}) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}) by (cluster_name)
阈值: > 0.8
- cluster 内存使用率10分钟增长超过10%
(sum(redis_memory_used_bytes{job="telegraf"}) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}) by (cluster_name))- (sum(redis_memory_used_bytes{job="telegraf"}offset 10m) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}offset 10m) by (cluster_name))
阈值:> 0.1
- QPS 10分钟内增长超过30% 并且 当前QPS>500
sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m])) and ((sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] )) -sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] offset 10m)) ) /sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] offset 10m))> 0.3)
阈值:> 500
- 连接数 10分钟内增长超过30% 并且 当前连接数>500
sum by (cluster_name) (redis_connected_clients{job="telegraf"}) and (((sum by (cluster_name) (redis_connected_clients{job="telegraf"}) -sum by (cluster_name) (redis_connected_clients{job="telegraf"}offset 10m))) / sum by (cluster_name) (redis_connected_clients{job="telegraf"}offset 10m)> 0.3)
阈值:> 500
RabbitMQ
- 实例探活异常
avg by (cluster_name,addr) (rabbitmq_up{})
阈值:< 1
- 内存使用率过大
sum by (cluster_name,region,addr) (rabbitmq_node_mem_used{}) / sum by (cluster_name,region,addr) (rabbitmq_node_mem_limit{})
阈值:> 0.85
- 连接数使用率过大
sum by (cluster_name,region,addr) (rabbitmq_sockets_used{}) / sum by (cluster_name,region,addr) (rabbitmq_sockets_available{})
阈值:> 0.85
Pika
- Pika 节点ops过高
sum by(cluster_name, addr ) (rate(pika_total_commands_processed[5m]))
阈值:> 120000
- Pika 节点状态异常
avg(pika_up{}) by (cluster_name,addr)
阈值:< 1
- Pika slave lag过大
avg(pika_slave_lag{}) by (cluster_name,addr)
阈值:>100000
- Pika master 连接异常
avg(pika_master_link_up{}) by (cluster_name,addr)
阈值:<1
MySQL
- 磁盘使用率过大
(huaweicloud_sys_rds_rds048_disk_used_size / huaweicloud_sys_rds_rds047_disk_total_size) or (1 - aws_rdsdisk_free / aws_rdsdisk_total)
阈值:> 0.8
- CPU 使用率过大
huaweicloud_sys_rds_rds001_cpu_util or aws_rds_cpuutilization_average
阈值:> 80
- 慢查过多
rate(mysql_slow_queries{}[5m])
阈值:> 50
- 连接数过大
mysql_threads_connected{}
阈值:> 2000
- 连接数使用率过大
mysql_threads_connected{} / mysql_max_used_connections{}
阈值:> 0.9
- MySQL QPS过大
rate(mysql_queries{}[5m])
阈值:> 8000
- MySQL 探活失败
mysql_up{job="telegraf-mysql"}
阈值:< 1
- MySQL 磁盘使用率过大
((huaweicloud_sys_rds_rds048_disk_used_size / huaweicloud_sys_rds_rds047_disk_total_size) and on (name) mysql_up{}) or ((1 - aws_rdsdisk_free / aws_rdsdisk_total) and on (name) mysql_up{})
阈值:> 0.8
- MySQL CPU使用率过大
(huaweicloud_sys_rds_rds001_cpu_util and on (name) mysql_up{}) or (aws_rds_cpuutilization_average and on (dbinstance_identifier) mysql_up{})
阈值:> 80
Kafka
- kafka队列积压过多
avg without(host_ip) (sum by(consumergroup, topic, cluster_name, host_ip) (kafka_consumergroup_lag{})) > 1000 and on (cluster_name, consumergroup, topic ) sum without(partition, job, host_ip) (delta(kafka_consumergroup_current_offset{}[1d]))
阈值:!= 0
- Kafka 存在复制不足的分区
avg_over_time(kafka_server_replicamanager_underreplicatedpartitions{}[1m])
阈值:>=1
- Kafka 存在没有领导者的分区
avg_over_time(kafka_controller_kafkacontroller_offlinepartitionscount{}[1m])
阈值:> 0
- Kafka 集群中控制器的数量等于 0
sum by(cluster_name) (avg_over_time(kafka_controller_kafkacontroller_activecontrollercount{}[2m]))
阈值:=0
- Kafka brokers数量不足
kafka_brokers{}
阈值:=0
- Zookeeper instance is down
zk_up{}
阈值:=0
- Zookeeper 连接数过高
zk_num_alive_connections{}
阈值:> 100
- Zookeeper 超出服务器处理能力的排队请求数量
zk_outstanding_requests{}
阈值:>100
- Zookeeper服务器响应客户端请求花费的平均时间过长(ms)
zk_avg_latency{}
阈值:>100
Etcd
- Etcd Server has no leader
etcd_server_has_leader{}
阈值:= 0
- Etcd proposals failed num > 10 within 2min
increase(etcd_server_proposals_failed_total{}[2m])
阈值:>10
- Etcd Disk fsync durations > 500ms
histogram_quantile(0.99, sum by(instance,le,cluster_name) (rate(etcd_disk_wal_fsync_duration_seconds_bucket{}[2m])))
阈值:> 0.5
- Etcd Disk commit durations > 250ms
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{}[2m]))
阈值: > 0.25
- Etcd leader changes num > 2 within 5min
increase(etcd_server_leader_changes_seen_total{}[5m])
阈值:> 2
- Etcd DB 使用空间超过85%
etcd_mvcc_db_total_size_in_bytes{} / etcd_server_quota_backend_bytes{}
阈值:> 0.85
参考
- https://help.aliyun.com/document_detail/176180.html#section-78l-udp-gcs
- https://blog.csdn.net/weixin_43798031/article/details/123430196