Galera Cluster监控
授权数据库
GRANT SELECT ON *.* TO 'zabbix'@'127.0.0.1' IDENTIFIED BY '123';
追加到配置和脚本
echo "UserParameter=maria.db[*],/etc/zabbix/scripts/mariadb.sh $1" >> /etc/zabbix/zabbix_agentd.conf
zabbixmy账号配置文件这里需要写一个配置文件,主要保存上面授权的账号和密码,通过脚本来调用,并不写在脚本中
cat > /etc/zabbix/zabbixmy.conf << EOF
[client]
host=127.0.0.1
user=zabbix
password='123'
EOF
脚本文件 cat /etc/zabbix/scripts/mariadb.sh脚本文件较长,主要包含了基础的状态健康和缓存监控,并没有其他,之前有写过FPMM监控,发现很多其实用不上,当有太多集群数据库的时候,itmes精简是非常必要的
#/bin/bash
DEF="--defaults-file=/etc/zabbix/zabbixmy.conf"
MYSQL='/usr/local/mariadb/bin/mysqladmin'
ARGS=1
if [ $# -ne "$ARGS" ];then
echo "Please input one arguement:"
fi
case $1 in
Com_update)
result=`${MYSQL} $DEF extended-status |awk '/Com_updateW/{print $4}'`
echo $result
;;
Slow_queries)
result=`${MYSQL} $DEF extended-status |awk '/Slow_queries/{print $4}'`
echo $result
;;
com_select)
result=`${MYSQL} $DEF extended-status |awk '/Com_selectW/{print $4}'`
echo $result
;;
Com_insert)
result=`${MYSQL} $DEF extended-status |awk '/Com_insertW/{print $4}'`
echo $result
;;
Com_delete)
result=`${MYSQL} $DEF extended-status |awk '/Com_deleteW/{print $4}'`
echo $result
;;
#查询的数量
Questions)
result=`${MYSQL} $DEF status|awk '/Questions/{print $6}'`
echo $result
;;
#已经建立的链接
Threads_connected)
result=`${MYSQL} $DEF "extended-status"|awk '/Threads_connected/{print $4}'`
echo $result
;;
#正在运行的连接
Threads_running)
result=`${MYSQL} $DEF "extended-status"|awk '/Threads_running/{print $4}'`
echo $result
;;
#由于服务器内部本身导致的错误
Connection_errors_internal)
result=`${MYSQL} $DEF "extended-status"|awk '/Connection_errors_internal/{print $4}'`
echo $result
;;
#尝试与服务器建立连接但是失败的次数
Aborted_connects)
result=`${MYSQL} $DEF "extended-status"|awk '/Aborted_connects/{print $4}'`
echo $result
;;
#由于到达最大连接数导致的错误
Connection_errors_max_connections)
result=`${MYSQL} $DEF "extended-status"|awk '/Connection_errors_max_connections/{print $4}'`
echo $result
;;
#Innodb_buffer读取缓存请求的数量
Innodb_buffer_pool_read_requests)
result=`${MYSQL} $DEF "extended-status"|awk '/Innodb_buffer_pool_read_requests/{print $4}'`
echo $result
;;
#Innodb_buffer需要读取磁盘的请求数
Innodb_buffer_pool_reads)
result=`${MYSQL} $DEF "extended-status"|awk '/Innodb_buffer_pool_reads/{print $4}'`
echo $result
;;
#Innodb_buffer BP中总页面数
Innodb_buffer_pool_pages_total)
result=`${MYSQL} $DEF "extended-status"|awk '/Innodb_buffer_pool_pages_total/{print $4}'`
echo $result
;;
#Innodb_buffer空页数
Innodb_buffer_pool_pages_free)
result=`${MYSQL} $DEF "extended-status"|awk '/Innodb_buffer_pool_pages_free/{print $4}'`
echo $result
;;
#wsrep_cluster_status集群状态
wsrep_cluster_status)
result=`${MYSQL} $DEF "extended-status"|awk '/wsrep_cluster_status/{print $4}'`
echo $result
;;
#wsrep_cluster_size集群成员
wsrep_cluster_size)
result=`${MYSQL} $DEF "extended-status"|awk '/wsrep_cluster_size/{print $4}'`
echo $result
;;
#wsrep_ready
wsrep_ready)
result=`${MYSQL} $DEF "extended-status"|awk '/wsrep_ready/{print $4}'`
echo $result
;;
#wsrep_local_recv_queue_avg平均请求队列长度
wsrep_local_recv_queue_avg)
result=`${MYSQL} $DEF "extended-status"|awk '/wsrep_local_recv_queue_avg/{print $4}'`
echo $result
;;
#wsrep_local_send_queue_avg上次查询之后的平均发送队列长度
wsrep_local_send_queue_avg)
result=`${MYSQL} $DEF "extended-status"|awk '/wsrep_local_send_queue_avg/{print $4}'`
echo $result
;;
mping)
result=`${MYSQL} $DEF ping|grep -c alive`
echo $result
;;
*)
echo "Usage:$0(Com_update|Slow_queries|Com_select|Com_insert|Com_delete|Questions|Threads_connected|Threads_running|Connection_errors_internal|Aborted_connects|Connection_errors_max_connections|Innodb_buffer_pool_read_requests|Innodb_buffer_pool_reads|Innodb_buffer|Innodb_buffer_pool_pages_free|wsrep_cluster_status|wsrep_cluster_size|wsrep_ready|wsrep_local_recv_queue_avg|wsrep_local_send_queue_avg|mping)"
;;
esac
itmes
items并没有截图,主要的项如下:
Aborted_connects尝试与服务器建立连接但是失败的次数 maria.db[Aborted_connects] 30s 7d
Com_delete删除_30s maria.db[Com_delete] 30s 7d
Com_insert插入_30s maria.db[Com_insert] 30s 7d
com_select查询_30s maria.db[com_select] 30s 7d
Com_update修改_30s maria.db[Com_update] 30s 7d
Connection_errors_internal由于服务器内部本身导致的错误 maria.db[Connection_errors_internal] 30s 7d
Connection_errors_max_connections由于到达最大连接数导致的错误 maria.db[Connection_errors_max_connections] 30s 7d
Innodb_buffer_pool_pages_free空页数 maria.db[Innodb_buffer_pool_pages_free] 30s 7d
Innodb_buffer_pool_pages_totalBP中总页面数 maria.db[Innodb_buffer_pool_pages_total] 30s 7d
Innodb_buffer_pool_reads需要读取磁盘的请求数 maria.db[Innodb_buffer_pool_reads] 30s 7d
Innodb_buffer_pool_read_requests读取缓存请求的数量 maria.db[Innodb_buffer_pool_read_requests] 30s 7d
mping maria.db[mping] 30s 7d
Questions查询的数量_2m maria.db[Questions] 2m 7d
Slow_queries_慢查询_5m maria.db[Slow_queries] 5m 7d
Threads_connected已经建立的链接 maria.db[Threads_connected] 30s 7d
Threads_running正在运行的连接 maria.db[Threads_running] 30s 7d
wsrep_cluster_size集群成员 maria.db[wsrep_cluster_size] 30s 7d
wsrep_cluster_status集群状态 maria.db[wsrep_cluster_status] 30s 7d
wsrep_local_recv_queue_avg平均请求队列长度 maria.db[wsrep_local_recv_queue_avg] 30s 7d
wsrep_local_send_queue_avg上次查询之后的平均发送队列长度 maria.db[wsrep_local_send_queue_avg] 30s 7d
wsrep_ready maria.db[wsrep_ready] 30s 7d
Triggers
这里有报警的阈值
{HOST.NAME} Node is not ready {Mariadb_Customize_0607:maria.db[wsrep_ready].regexp(ON)}<>1
{HOST.NAME} mariadb is down! {Mariadb_Customize_0607:maria.db[mping].last()}=0
{HOST.NAME} Mariadb Cluster chenage {Mariadb_Customize_0607:maria.db[wsrep_cluster_size].last(1m)}<>3
{HOST.NAME} Innodb_buffer_pool缓存的命用率低于90%
({Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_read_requests].last(0)}-{Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_reads].last(0)})/{Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_read_requests].last(0)}*100<95
{HOST.NAME} Innodb_buffer_pool缓存的使用率高于99%
({Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_pages_total].last(0)}-{Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_pages_free].last(0)})/{Mariadb_Customize_0607:maria.db[Innodb_buffer_pool_pages_total].last(0)}*100>99
{HOST.NAME} cluster_status no-Primary {Mariadb_Customize_0607:maria.db[wsrep_cluster_status].regexp(Primary)}<>1
命中率低于%90,数据库如果没有使用也会报innodb_buffer_pool计算: Innodb_buffer_pool_read_requests记录了读取请求的数量,而Innodb_buffer_pool_reads记录了缓冲池无法满足,因而只能从磁盘读取的请求数量,也就是说,如果Innodb_buffer_pool_reads的值开始增加,意味着数据库性能大有问题。缓存的使用率和命中率可以通过如下方法计算:
(Innodb_buffer_pool_pages_total - Innodb_buffer_pool_pages_free) /
Innodb_buffer_pool_pages_total * 100%
(Innodb_buffer_pool_read_requests - Innodb_buffer_pool_reads) /
Innodb_buffer_pool_read_requests * 100%
如果数据库从磁盘进行大量读取,而缓冲池还有许多闲置空间,这可能是因为缓存最近才清理过,还处于预热阶段。
集群监控项目:
1,mysql -e "show status;" |awk '/wsrep_cluster_status/{print $2}'|grep -c Primary
2,wsrep_cluster_status显示集群里节点的主状态。标准返回primary。如返回non-Primary或其他值说明是多个节点改变导致的节点丢失或者脑裂。如果所有节点都返回不是Primary,则要重设quorum。具体参见http://galeracluster.com/documentation-webpages/quorumreset.html如果返回都正常,说明复制机制在每个节点都能正常工作,下一步该检查每个节点的状态确保他们都能收到write-set
show global status like 'wsrep_cluster_status';
+----------------------+---------+
| Variable_name | Value |
+----------------------+---------+
| wsrep_cluster_status | Primary |
+----------------------+---------+
2,mysql -e "show status;" |awk '/wsrep_cluster_size/{print $2}'
wsrep_cluster_size显示了集群中节点的个数
3,mysql -e "show status;" |awk '/wsrep_cluster_state_uuid/{print $2}'
wsrep_cluster_conf_id显示了整个集群的变化次数。所有节点都应相同,否则说明某个节点与集群断开了
节点状态:1,mysql -e "show status;" |awk '/wsrep_ready/{print $2}'|grep -c ON
wsrep_ready显示了节点是否可以接受queries。ON表示正常,如果是OFF几乎所有的query都会报错,报错信息提示2, mysql -e "show status;" |awk '/wsrep_connected/{print $2}'|grep -c ON
SHOW GLOBAL STATUS LIKE 'wsrep_connected’显示该节点是否与其他节点有网络连接。(实验得知,当把某节点的网卡down掉之后,该值仍为on。说明网络还在)丢失连接的问题可能在于配置wsrep_cluster_address或wsrep_cluster_name的错误3,mysql -e "show status;" |awk '/wsrep_local_state_comment/{print $2}'|grep -c Initialized
wsrep_local_state_comment 以人能读懂的方式显示节点的状态,正常的返回值是Joining, Waiting on SST, Joined, Synced or Donor,返回Initialized说明已不在正常工作状态
健康状态:1,mysql -e "show status;" |awk '/wsrep_local_recv_queue_avg/{print $2}'
平均请求队列长度。当返回值大于0时,说明apply write-sets比收write-set慢,有等待。堆积太多可能导致启动flow control2,mysql -e "show status;" |awk '/wsrep_local_send_queue_avg/{print $2}'
显示自上次查询之后的平均发送队列长度。比如网络瓶颈和flow control都可能是原因
主从监控
授权:
Welcome to the MariaDB monitor. Commands end with ; or g.
Your MariaDB connection id is 217076
Server version: 10.0.29-MariaDB-wsrep MariaDB Server, wsrep_25.16.rc3fc46e
Copyright (c) 2000, 2016, Oracle, MariaDB Corporation Ab and others.
Type 'help;' or 'h' for help. Type 'c' to clear the current input statement.
MariaDB [(none)]> GRANT replication client on *.* TO 'zabbix'@'127.0.0.1' IDENTIFIED BY 'password';
Query OK, 0 rows affected (0.02 sec)
MariaDB [(none)]> exit
Bye
如果只是要监控主从线程,就需要这个文件
### 用户文件
cat > /etc/zabbix/zabbixmy.conf << EOF
[client]
host=127.0.0.1
user=zabbix
password='password'
EOF
调用脚本
cat /etc/zabbix/scripts/IO_SQL.sh
#/bin/bash
DEF="--defaults-file=/etc/zabbix/zabbixmy.conf"
MYSQL='/usr/local/mariadb/bin/mysql'
ARGS=1
if [ $# -ne "$ARGS" ];then
echo "Please input one arguement:"
fi
case $1 in
Slave_IO_Running)
result=`${MYSQL} $DEF -e "show slave statusG"|awk '/Slave_IO_Running/{print $2}'`
echo $result
;;
Slave_SQL_Running)
result=`${MYSQL} $DEF -e "show slave statusG"|awk '/Slave_SQL_Running/{print $2}'`
echo $result
;;
*)
echo "Usage:$0(Slave_SQL_Running|Slave_IO_Running)"
;;
esac
追加到zabbix_agentd.confecho "UserParameter=maria.IO_SQL[*],/etc/zabbix/scripts/IO_SQL.sh $1" >> /etc/zabbix/zabbix_agentd.conf
items如下:
maria.IO_SQL[Slave_IO_Running]
maria.IO_SQL[Slave_SQL_Running]
Triggers 如下:
{HOST.NAME} Mariadb Slave SQL Not Running {Mariadb_M-S_Thread:maria.IO_SQL[Slave_SQL_Running].regexp(ON)}<>1 Enabled
{HOST.NAME} Mariadb Slave IO Not Running {Mariadb_M-S_Thread:maria.IO_SQL[Slave_IO_Running].regexp(ON)}<>1