常用的各类资源 Prometheus 告警语句

2023年 1月 4日 43.6k 0

主机

  • 主机内存使用率超过阈值

1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes阈值:0.9

  • 主机 CPU 使用率超过阈值

1 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (host_name)阈值:0.85

  • 主机硬盘使用率超过阈值

1 - avg without (fstype)(node_filesystem_free_bytes{fstype!='rootfs',mountpoint!~'/(run|var|snap).*'} / node_filesystem_size_bytes{fstype!='rootfs',mountpoint!~'/(run|var|snap).*'})阈值:0.8

Windows

  • Windows 主机内存使用率超过阈值

1 - windows_os_physical_memory_free_bytes{job="windows_exporter"} / windows_cs_physical_memory_bytes阈值:0.9

  • Windows 主机 CPU 使用率超过阈值

1 - (avg by (host_ip,host_name) (irate(windows_cpu_time_total{job="windows_exporter",mode="idle"}[1m])))阈值:0.85

  • Windows 主机硬盘使用率超过阈值

1 - windows_logical_disk_free_bytes{job="windows_exporter",volume!~"HarddiskVolume.*"} / windows_logical_disk_size_bytes阈值:0.8

Kubernetes

  • 集群数量发生变化

count(count(kube_node_created{}) by (cluster))阈值:不等于真实值

  • Pod CPU 被限制核数

sum (rate (container_cpu_cfs_throttled_seconds_total{}[5m])) by (pod)阈值: 大于 1

  • Pod CPU 被限制比例

sum by (cluster, namespace, pod)(irate(container_cpu_cfs_throttled_periods_total{container!="POD", container!=""}[5m])) / sum by (cluster, namespace, pod)( irate(container_cpu_cfs_periods_total{container!="POD", container!=""}[5m])) > 0.5阈值: 大于 0.5

  • POD 10 分钟重启次数超过3次

sum (increase (kube_pod_container_status_restarts_total{}[10m])) by (cluster, namespace,pod)阈值:大于 3

  • 5 分钟不可用节点增量

sum(kube_node_status_condition{status!="true"}) by (cluster, exported_node, condition) - sum(kube_node_status_condition{status!="true"} offset 5m) by (cluster, exported_node, condition)阈值:大于 0

  • 10 分钟内 POD 发生 OOM

avg_over_time(kube_pod_container_status_last_terminated_reason{reason='OOMKilled'}[10m]) > 0 and avg_over_time(kube_pod_container_status_last_terminated_reason{reason='OOMKilled'}[10m]) < 1阈值:大于 0

  • POD 等待

sum by (namespace, pod, reason) (kube_pod_container_status_waiting_reason{})阈值:大于 0

  • 节点时间不同步

min_over_time(node_timex_sync_status[5m])阈值:等于 0

  • controller 的调度深度

sum by (cluster) (workqueue_depth{})阈值:大于 0

  • 5 分钟队列增加总数

sum by (cluster) (rate(workqueue_adds_total{}[5m]))阈值:大于 50

  • 对象在队列中停留的时间

histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{}[5m])) by (cluster, le))阈值:大于 0.5

ClickHouse

  • 副本同步队列出现堆积

clickhouse_replicas_max_queue_size{}阈值:大于 100

Elasticsearch

  • Elastic_Cluster_Health_RED 有主分片没能正常运行

elasticsearch_cluster_health_status{color="red"}阈值:等于 1

  • Elasticsearch_health_up 集群处于非健康状态

elasticsearch_cluster_health_up{}阈值:不等于 1

  • ES heap 内存占用率高于90%

sum by ( cluster_name, host) (elasticsearch_jvm_memory_used_bytes{area="heap"}) / sum by ( cluster_name, host) (elasticsearch_jvm_memory_max_bytes{area="heap"})阈值:大于 0.9

  • ES 待处理的任务数大于 10

elasticsearch_cluster_health_number_of_pending_tasks{}阈值:大于10

MongoDB

  • MongoDB ops commands操作过高

mongodb_commands_per_sec{} or cloudwatch_aws_doc_db_opcounters_command_average{}阈值:大于 5000

  • MongoDB 连接数过高

mongodb_open_connections{} or cloudwatch_aws_doc_db_database_connections_average{}阈值:大于5000

  • MongoDB wiredTiger cache 达到驱逐阈值

sum by (cluster_name) (mongodb_wtcache_current_bytes{}) / sum by (cluster_name) (mongodb_wtcache_max_bytes_configured{}) * 100阈值:大于90

  • MongoDB wiredTiger dirty cache 达到驱逐阈值

sum by (cluster_name) (mongodb_wtcache_tracked_dirty_bytes{}) / sum by (cluster_name) (mongodb_wtcache_max_bytes_configured{}) * 100阈值:大于20

Redis

  • 服务状态异常

redis_up{job="telegraf"}阈值:<1

  • instance 当前内存使用率过大

redis_memory_used_bytes{job="telegraf"} / redis_config_maxmemory阈值:>0.8

  • instance 内存使用率10分钟增长超过50%

(redis_memory_used_bytes{job="telegraf"} / redis_config_maxmemory) - (redis_memory_used_bytes{job="telegraf"} offset 10m / redis_config_maxmemory offset 10m)阈值:> 0.5

  • cluster 当前内存使用率过大

sum(redis_memory_used_bytes{job="telegraf"}) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}) by (cluster_name)阈值: > 0.8

  • cluster 内存使用率10分钟增长超过10%

(sum(redis_memory_used_bytes{job="telegraf"}) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}) by (cluster_name))- (sum(redis_memory_used_bytes{job="telegraf"}offset 10m) by (cluster_name) / sum(redis_config_maxmemory{job="telegraf"}offset 10m) by (cluster_name))阈值:> 0.1

  • QPS 10分钟内增长超过30% 并且 当前QPS>500

sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m])) and ((sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] )) -sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] offset 10m)) ) /sum by (cluster_name) (rate(redis_commands_processed_total{job="telegraf"}[5m] offset 10m))> 0.3)阈值:> 500

  • 连接数 10分钟内增长超过30% 并且 当前连接数>500

sum by (cluster_name) (redis_connected_clients{job="telegraf"}) and (((sum by (cluster_name) (redis_connected_clients{job="telegraf"}) -sum by (cluster_name) (redis_connected_clients{job="telegraf"}offset 10m))) / sum by (cluster_name) (redis_connected_clients{job="telegraf"}offset 10m)> 0.3)阈值:> 500

RabbitMQ

  • 实例探活异常

avg by (cluster_name,addr) (rabbitmq_up{})阈值:< 1

  • 内存使用率过大

sum by (cluster_name,region,addr) (rabbitmq_node_mem_used{}) / sum by (cluster_name,region,addr) (rabbitmq_node_mem_limit{})阈值:> 0.85

  • 连接数使用率过大

sum by (cluster_name,region,addr) (rabbitmq_sockets_used{}) / sum by (cluster_name,region,addr) (rabbitmq_sockets_available{})阈值:> 0.85

Pika

  • Pika 节点ops过高

sum by(cluster_name, addr ) (rate(pika_total_commands_processed[5m]))阈值:> 120000

  • Pika 节点状态异常

avg(pika_up{}) by (cluster_name,addr)阈值:< 1

  • Pika slave lag过大

avg(pika_slave_lag{}) by (cluster_name,addr)阈值:>100000

  • Pika master 连接异常

avg(pika_master_link_up{}) by (cluster_name,addr)阈值:<1

MySQL

  • 磁盘使用率过大

(huaweicloud_sys_rds_rds048_disk_used_size / huaweicloud_sys_rds_rds047_disk_total_size) or (1 - aws_rdsdisk_free / aws_rdsdisk_total)阈值:> 0.8

  • CPU 使用率过大

huaweicloud_sys_rds_rds001_cpu_util or aws_rds_cpuutilization_average阈值:> 80

  • 慢查过多

rate(mysql_slow_queries{}[5m])阈值:> 50

  • 连接数过大

mysql_threads_connected{}阈值:> 2000

  • 连接数使用率过大

mysql_threads_connected{} / mysql_variables_max_connections{}阈值:> 0.9

  • MySQL QPS过大

rate(mysql_queries{}[5m])阈值:> 8000

  • MySQL 探活失败

mysql_up{job="telegraf-mysql"}阈值:< 1

  • MySQL 磁盘使用率过大

((huaweicloud_sys_rds_rds048_disk_used_size / huaweicloud_sys_rds_rds047_disk_total_size) and on (name) mysql_up{}) or ((1 - aws_rdsdisk_free / aws_rdsdisk_total) and on (name) mysql_up{})阈值:> 0.8

  • MySQL CPU使用率过大

(huaweicloud_sys_rds_rds001_cpu_util and on (name) mysql_up{}) or (aws_rds_cpuutilization_average and on (dbinstance_identifier) mysql_up{})阈值:> 80

Kafka

  • kafka队列积压过多

avg without(host_ip) (sum by(consumergroup, topic, cluster_name, host_ip) (kafka_consumergroup_lag{})) > 1000 and on (cluster_name, consumergroup, topic ) sum without(partition, job, host_ip) (delta(kafka_consumergroup_current_offset{}[1d]))阈值:!= 0

  • Kafka 存在复制不足的分区

avg_over_time(kafka_server_replicamanager_underreplicatedpartitions{}[1m])阈值:>=1

  • Kafka 存在没有领导者的分区

avg_over_time(kafka_controller_kafkacontroller_offlinepartitionscount{}[1m])阈值:> 0

  • Kafka 集群中控制器的数量等于 0

sum by(cluster_name) (avg_over_time(kafka_controller_kafkacontroller_activecontrollercount{}[2m]))阈值:=0

  • Kafka brokers数量不足

kafka_brokers{}阈值:=0

  • Zookeeper instance is down

zk_up{}阈值:=0

  • Zookeeper 连接数过高

zk_num_alive_connections{}阈值:> 100

  • Zookeeper 超出服务器处理能力的排队请求数量

zk_outstanding_requests{}阈值:>100

  • Zookeeper服务器响应客户端请求花费的平均时间过长(ms)

zk_avg_latency{}阈值:>100

Etcd

  • Etcd Server has no leader

etcd_server_has_leader{}阈值:= 0

  • Etcd proposals failed num > 10 within 2min

increase(etcd_server_proposals_failed_total{}[2m])阈值:>10

  • Etcd Disk fsync durations > 500ms

histogram_quantile(0.99, sum by(instance,le,cluster_name) (rate(etcd_disk_wal_fsync_duration_seconds_bucket{}[2m])))阈值:> 0.5

  • Etcd Disk commit durations > 250ms

histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{}[2m]))阈值: > 0.25

  • Etcd leader changes num > 2 within 5min

increase(etcd_server_leader_changes_seen_total{}[5m])阈值:> 2

  • Etcd DB 使用空间超过85%

etcd_mvcc_db_total_size_in_bytes{} / etcd_server_quota_backend_bytes{}阈值:> 0.85

参考

  • https://help.aliyun.com/document_detail/176180.html#section-78l-udp-gcs
  • https://blog.csdn.net/weixin_43798031/article/details/123430196

相关文章

KubeSphere 部署向量数据库 Milvus 实战指南
探索 Kubernetes 持久化存储之 Longhorn 初窥门径
征服 Docker 镜像访问限制!KubeSphere v3.4.1 成功部署全攻略
那些年在 Terraform 上吃到的糖和踩过的坑
无需 Kubernetes 测试 Kubernetes 网络实现
Kubernetes v1.31 中的移除和主要变更

发布评论