Although MySQL is a mainstream relational database, published material on health checks is scarce, and in this respect it lags far behind Oracle. Driven by production requirements, this article checks MySQL health from every angle and compiles the output into an inspection report.
Notes
1. The inspection report is generated by a Python script that can remotely inspect multiple database instances in batch. The data comes from two sources: the databases themselves (queried over pymysql connections) and the monitoring platform (fetched through its API).
2. The data the Python program collects is written into a Word template via the docx.Document module. All the screenshots below come from this template; the code I filled into the Word template corresponds one-to-one with the SQL statements analyzed below, so it can be used as-is.
-> Click here to download the Word template
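To make the two building blocks concrete, here is a minimal sketch of the pymysql side: run one statement and store the rows in a dict. The host, account, and query are placeholders; the real script reads its settings from a config file.

import pymysql

def run_query(conn, sql):
    # Execute one SQL statement and return all rows as a list of dicts.
    with conn.cursor(pymysql.cursors.DictCursor) as cur:
        cur.execute(sql)
        return cur.fetchall()

conn = pymysql.connect(host="192.168.1.100", port=3306, user="inspector",
                       password="***", charset="utf8mb4")
context = {}  # inspection data, later rendered into the Word template
context["checktime"] = run_query(conn, "select now() as checktime")
conn.close()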
The inspection report covers the following sections; add or remove items to fit your own needs.
- Inspection summary
- Operating system overview
- Database overview
- Database backups
- Database performance checks
- Database monitoring trend charts
1. Inspection Summary
This part summarizes the entire report so the reader gets an overall picture of how the database is running. Its data is an aggregate analysis of the sections that follow.
Screenshot
2. Operating System Overview
Because we inspect multiple databases remotely and in batch, the CPU, memory, and disk-usage information shown below is all fetched through the monitoring platform's API. If that feels cumbersome, you can also run shell commands locally to collect the data, which makes the code much simpler; a sketch of the local approach follows the screenshot below.
Screenshot
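For reference, a minimal sketch of that local alternative, collecting the same metrics with shell commands via subprocess. The commands are Linux examples and may need adjusting for your OS.

import subprocess

def sh(cmd):
    # Run a shell command and return its trimmed stdout.
    return subprocess.run(cmd, shell=True, capture_output=True,
                          text=True).stdout.strip()

cpu_num = sh("nproc")                                          # CPU core count
cpu_idle = sh("vmstat 1 2 | tail -1 | awk '{print $15}'") + "%"  # CPU idle
mem_util = sh("free | awk '/Mem/{printf \"%.2f%%\", $3/$2*100}'")  # memory used
disk_use = sh("df -h / | awk 'NR==2{print $5}'")               # root FS usage
print(cpu_num, cpu_idle, mem_util, disk_use)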
Below is an example of fetching data from the Zabbix API in Python.
import os
import json
import http.cookiejar
import requests
from urllib import request, parse
from configparser import ConfigParser
from mod_logger import getlogger

# Read the Zabbix connection settings from zabbix.ini next to this script.
cfg = ConfigParser()
path = os.path.split(os.path.realpath(__file__))[0] + "/" + "zabbix.ini"
cfg.read(path)
logger = getlogger()
ZABBIX_URL = cfg.get("ZABBIX", "ZABBIX_URL")
ZABBIX_USERNAME = cfg.get("ZABBIX", "ZABBIX_USERNAME")
ZABBIX_PASSWORD = cfg.get("ZABBIX", "ZABBIX_PASSWORD")
url = "{}/api_jsonrpc.php".format(ZABBIX_URL)   # JSON-RPC API endpoint
header = {"Content-Type": "application/json"}
gr_url = ZABBIX_URL + "/chart2.php"             # graph image endpoint
index_url = ZABBIX_URL + "/index.php"           # web login page (for graph downloads)
image_dir = "./picture"                         # where downloaded graph images are saved
def get_zabbix_authID():
    """Log in to the Zabbix API and return the auth token."""
    data = {
        "jsonrpc": "2.0",
        "method": "user.login",
        "params": {
            "user": ZABBIX_USERNAME,
            "password": ZABBIX_PASSWORD
        },
        "id": 1,
    }
    # The API expects a JSON string, so serialize the dict first.
    value = json.dumps(data).encode('utf-8')
    req = request.Request(url, headers=header, data=value)
    # Authenticate and fetch the auth ID.
    try:
        result = request.urlopen(req)
    except Exception as e:
        logger.error("Auth failed, please check your username and password: %s", e)
        return "INVALID"
    else:
        response = result.read()
        # The response is bytes: decode it to a string, then parse the JSON.
        page = json.loads(response.decode('utf-8'))
        result.close()
        authID = page.get('result')
        return authID
def get_zabbix_hostID(authID, host):
    """Look up the Zabbix host ID for the given host name/IP."""
    data = {
        "jsonrpc": "2.0",
        "method": "host.get",
        "params": {
            "output": ["hostid", "host"],
            "filter": {"host": [host]},
            "selectInterfaces": ["interfaceid", "ip"]
        },
        "auth": authID,  # the auth token obtained above
        "id": 2
    }
    r = requests.post(url, headers=header, data=json.dumps(data))
    if r.json().get('result'):
        return r.json()['result'][0]['hostid']
    else:
        logger.error("Failed to get hostID")
        return -1
def get_zabbix_graphID(authID, hostID, graphname):
    """Look up the graph ID for a named graph on the given host."""
    graph_id_data = {
        "jsonrpc": "2.0",
        "method": "graph.get",
        "params": {
            "output": "extend",
            "hostids": hostID,
            "filter": {"name": graphname},
            "sortfield": "name"},
        "auth": authID,
        "id": 1}
    graph_data = requests.post(url, headers=header,
                               data=json.dumps(graph_id_data))
    if graph_data.json().get('result'):
        return graph_data.json()['result'][0]["graphid"]
    else:
        logger.error("Failed to get graphID")
        return -1
def GetGraph(gurl, values, image_dir):
    """Log in to the Zabbix web UI and download a rendered graph image."""
    # Keep the web session alive across requests with a cookie jar.
    cookiejar = http.cookiejar.CookieJar()
    urlOpener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
    login_values = {"name": ZABBIX_USERNAME, "password": ZABBIX_PASSWORD,
                    "autologin": 1, "enter": "Sign in"}
    data = parse.urlencode(login_values).encode("utf-8")
    req = request.Request(index_url, data)
    try:
        urlOpener.open(req, timeout=10)
    except ValueError:
        logger.error(index_url + " failed to open")
        return -1
    # Request the rendered graph image with the logged-in session.
    data = parse.urlencode(values).encode("utf-8")
    req = request.Request(gurl, data)
    pic_url = urlOpener.open(req)
    image = pic_url.read()
    image_name = "%s/%s.png" % (image_dir,
                                values["hostname"] + "_" + values["name"])
    with open(image_name, 'wb+') as f:
        f.write(image)
    return image_name
def get_host_pic(host, pic_name):
    """Fetch the named graph for a host and save it as a PNG; return the path."""
    authID = get_zabbix_authID()
    if authID == "INVALID":
        logger.error("Error logging in to Zabbix!")
        return -1
    hostID = get_zabbix_hostID(authID, host)
    graphID = get_zabbix_graphID(authID, hostID, pic_name)
    # Render the last 30 days at 1200x300.
    values = {"graphid": graphID, "name": pic_name, "hostname": host,
              "width": 1200, "height": 300, "from": "now-30d", "to": "now",
              "profileIdx": "web.graphics.filter"}
    image_name = GetGraph(gr_url, values, image_dir)
    if image_name != -1:
        return image_name
    else:
        logger.error("Failed to get host pic " + host + ":" + pic_name)
        return -1
#############################################
def main():
    auth = get_zabbix_authID()
    hostid = get_zabbix_hostID(auth, "192.168.1.100")
    mem_grphid = get_zabbix_graphID(auth, hostid, '内存使用率')  # memory utilization graph
    cpu_grphid = get_zabbix_graphID(auth, hostid, 'CPU使用率')   # CPU utilization graph
    get_host_pic("192.168.1.100", '内存使用率')
    # print(auth, hostid, mem_grphid, cpu_grphid)

if __name__ == "__main__":
    main()
Then, in the main program, fetch the data through the API and store it in a dict.
### Fetch OS metrics from the Zabbix API.
cfg2 = ConfigParser()
conf_path = os.path.split(os.path.realpath(__file__))[0] + "/" + "zabbix.ini"
cfg2.read(conf_path)
ZABBIX_URL = cfg2.get("ZABBIX", "ZABBIX_URL")
ZABBIX_USERNAME = cfg2.get("ZABBIX", "ZABBIX_USERNAME")
ZABBIX_PASSWORD = cfg2.get("ZABBIX", "ZABBIX_PASSWORD")
url = "{}/api_jsonrpc.php".format(ZABBIX_URL)
zabbix = zabbix_api.Zabbix_API(url, ZABBIX_USERNAME, ZABBIX_PASSWORD)
# Obtain the API token.
zabbix.get_token()
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + " [Info] zabbix token: " + zabbix.token)
""" Fetch host information """
# history_value:
# History object types. Possible values (default: 3):
# 0 - numeric float;
# 1 - character;
# 2 - log;
# 3 - numeric unsigned;
# 4 - text.
# Get the hostid; this overwrites the hostID obtained in the backup step above.
zabbix.get_hostID(group_ip)
# print('hostid: ' + zabbix.hostid)
# Get the hostname.
hostname = zabbix.get_data(key_="system.hostname", history_value=1)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + " [Info] hostname: " + hostname)
# Create an empty dict to hold the inspection data collected from the
# database queries and the Zabbix API.
context = {}
self.context = context
# Example: fetch CPU info and store it in the dict.
"""
CPU core count: system.cpu.num; CPU idle: system.cpu.util[,idle]; CPU utilization: system.cpu.util
"""
# CPU core count
cpu_num = zabbix.get_data(key_="system.cpu.num", history_value=3)
if cpu_num:
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + " [Info] cpu_num: " + cpu_num)
# CPU idle percentage
cpu_idle = zabbix.get_data(key_="system.cpu.util[,idle]", history_value=0)
if cpu_idle:
    cpu_idle = "%.2f" % float(cpu_idle)
    cpu_idle = str(cpu_idle) + '%'
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + " [Info] cpu_idle: " + str(cpu_idle))
# CPU utilization
cpu_util = zabbix.get_data(key_="system.cpu.util", history_value=0)
if cpu_util:
    cpu_util = "%.2f" % float(cpu_util)
    cpu_util = str(cpu_util) + '%'
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + " [Info] cpu_util: " + str(cpu_util))
# Add the CPU info to the dict.
if cpu_num:
    self.context.setdefault('cpuinfo', []).append({'ip_addr': group_ip, "cpu_num": cpu_num, "cpu_idle": cpu_idle, "cpu_util": cpu_util})
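The Word template decides where each item lands; as a hedged sketch of that final step, here is one way rows such as context['cpuinfo'] can be appended to a table with docx. The template file name and table index are assumptions, not the article's actual template.

from docx import Document

cpuinfo = [{"ip_addr": "192.168.1.100", "cpu_num": "8",
            "cpu_idle": "85.00%", "cpu_util": "15.00%"}]  # e.g. self.context['cpuinfo']
doc = Document("mysql_check_template.docx")  # assumed template file name
table = doc.tables[0]                        # assume the CPU table is the first table
for row in cpuinfo:
    cells = table.add_row().cells
    cells[0].text = row["ip_addr"]
    cells[1].text = row["cpu_num"]
    cells[2].text = row["cpu_idle"]
    cells[3].text = row["cpu_util"]
doc.save("mysql_check_report.docx")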
3. Database Overview
All of this data comes from the database itself: it is queried with SQL, stored in dicts by the Python program, processed, and written into the Word document. It mainly covers:
- Instance basics
- MySQL high-availability info
- Key database parameters
- Database size
- Database user info
- Database object overview
- Session/connection info
- Data file info
- Database status info
Screenshots: (incomplete, for reference only)
4. Database Backups
Backup information comes from the monitoring platform. Of course, if the inspection script runs locally, a shell command is even simpler; a sketch of that approach follows the screenshot below.
Screenshot
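A hedged sketch of such a local check, assuming xtrabackup-style files under a made-up backup directory, reporting the age and size of the newest backup:

import glob
import os
import time

backup_dir = "/data/backup"   # hypothetical backup directory
files = sorted(glob.glob(backup_dir + "/*.xb*"), key=os.path.getmtime)
if files:
    latest = files[-1]
    age_h = (time.time() - os.path.getmtime(latest)) / 3600
    print("latest backup: %s (%.1f hours old, %.1f GB)"
          % (latest, age_h, os.path.getsize(latest) / 1024 ** 3))
else:
    print("no backup files found in " + backup_dir)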
5. Database Performance Checks
This part bears directly on database health and matters most of all. Beyond the items listed below, add or remove checks to fit your own needs.
- InnoDB details
- InnoDB lock waits
- Metadata lock info
- Top 10 tables by space used
- Top 10 indexes by space used
- Tables without a primary or unique key
- Data objects not using the InnoDB engine
- Tables with more than 6 indexes
- Composite indexes with more than 4 columns
- Tables with more than 50 columns
- Top 10 tables by physical IO
- Top 10 memory consumers in the instance
- Top 10 slow SQL
- Top 10 full-table-scan SQL
- Top 10 full-table-scan tables
- Top 10 SQL using temporary tables
- Top 10 by row operations
- Unused indexes
- Top 10 auto-increment usage ratios
- Redundant indexes
Below are all the SQL statements used in the inspection report. You can run them yourself to test; they also correspond one-to-one with the code in the Word template, so they can be used as-is.
[variables]
#basic info
co_date =select DATE_FORMAT(now(),"%Y-%m") as CO_DATE from dual;
version =select variable_value from performance_schema.session_variables where variable_name in ('version','version_comment');
checktime =select now() as checktime from dual;
platform =select variable_value from performance_schema.session_variables where variable_name in ('version_compile_os','version_compile_machine');
createtime =select DATE_FORMAT(min(set_time),"%Y-%m-%d") as createtime from sys.sys_config where set_by is NULL;
instancetime =select DATE_FORMAT(date_sub(now(), INTERVAL variable_value SECOND),"%Y-%m-%d") started_at from performance_schema.global_status where variable_name='Uptime';
#mgrinfo
mgrinfo =SELECT MEMBER_ID, MEMBER_HOST, MEMBER_PORT, MEMBER_STATE, IF(global_status.VARIABLE_NAME IS NOT NULL, 'PRIMARY', 'SECONDARY') AS MEMBER_ROLE FROM performance_schema.replication_group_members LEFT JOIN performance_schema.global_status ON global_status.VARIABLE_NAME = 'group_replication_primary_member' AND global_status.VARIABLE_VALUE = replication_group_members.MEMBER_ID;
#important parameters
import_para =select variable_name,variable_value from performance_schema.session_variables where variable_name in ('enforce_gtid_consistency','gtid_mode','binlog_format','binlog_row_image','log_bin_trust_function_creators','expire_logs_days','sync_binlog','innodb_flush_log_at_trx_commit','open_files_limit','explicit_defaults_for_timestamp','max_allowed_packet','character_set_server','transaction_isolation','lower_case_table_names','default_time_zone','wait_timeout','max_user_connections','max_connections','table_definition_cache','table_open_cache','innodb_buffer_pool_size','innodb_io_capacity','innodb_io_capacity_max','innodb_flush_method','innodb_thread_concurrency');
db_files =select variable_name,variable_value from performance_schema.session_variables where variable_name in ('innodb_temp_data_file_path','innodb_data_file_path','datadir','tmpdir','innodb_log_files_in_group','log_bin_basename','relay_log_basename','relay_log_info_file','pid_file','socket','innodb_log_group_home_dir');
#processlist
processlist =select ID col1,USER col2,HOST col3,DB col4,COMMAND col5,TIME col6,STATE col7 from information_schema.processlist;
#big transactions: number of transactions running longer than 5 minutes
sql5min =select count(*) count_num from information_schema.innodb_trx where TIME_TO_SEC(timediff(now(),trx_started))> 300;
#InnoDB lock waits
innodb_trx =select trx_mysql_thread_id col1,trx_state col2,trx_started col3,now() col4,trx_query col5,trx_operation_state col6 from information_schema.innodb_trx;
#metadata lock info
mdlinfo =select object_schema col1,object_name col2,waiting_thread_id col3,waiting_query col4,blocking_thread_id col5 from sys.schema_table_lock_waits;
#DB size
db_size =select a.SCHEMA_NAME,a.DEFAULT_CHARACTER_SET_NAME as CHARACTER_NAME, a.DEFAULT_COLLATION_NAME as COLLATION_NAME, truncate(sum(data_length)/1024/1024/1024, 2) as 'table_size', truncate(sum(index_length)/1024/1024/1024, 2) as 'index_size', truncate(sum(data_length + index_length)/1024/1024/1024, 2) as 'total_size', truncate(sum(data_free)/1024/1024/1024, 2) as 'free_size' from INFORMATION_SCHEMA.SCHEMATA a left outer join information_schema.tables b on a.SCHEMA_NAME = b.TABLE_SCHEMA where a.SCHEMA_NAME not in ('sys', 'information_schema', 'performance_schema', 'mysql') group by a.SCHEMA_NAME, a.DEFAULT_CHARACTER_SET_NAME, a.DEFAULT_COLLATION_NAME order by sum(data_length) desc, sum(index_length) desc;
#userinfo
userinfo =select user as col1,host as col2,Grant_priv as col3,plugin as col4,account_locked as col5 from mysql.user where user not in ('mysql.infoschema','mysql.session','mysql.sys');
#database status info
global_status =select VARIABLE_NAME,VARIABLE_VALUE from performance_schema.global_status where VARIABLE_NAME in ('Com_lock_instance','Com_lock_tables','Com_unlock_instance','Com_unlock_tables','Handler_external_lock','Innodb_row_lock_current_waits','Innodb_row_lock_time','Innodb_row_lock_time_avg','Innodb_row_lock_time_max','Innodb_row_lock_waits','Key_blocks_not_flushed','Key_blocks_unused','Key_blocks_used','Locked_connects','Performance_schema_locker_lost','Table_locks_immediate','Table_locks_waited','Uptime','Questions','Threads_connected','Threads_created','Threads_running');
#tabletop10
tbtop10 =SELECT table_schema AS col1, table_name AS col2,a. ENGINE as col3, a.CREATE_TIME col4, a.TABLE_COLLATION col5, table_rows AS col6,TRUNCATE(a.DATA_LENGTH/1024/1024/1024, 2) AS col7,TRUNCATE(index_length/1024/1024/1024, 2) AS col8, TRUNCATE((data_length + index_length)/1024/1024/1024, 2) AS col9, TRUNCATE(a.DATA_FREE/1024/1024/1024, 2) AS col10 FROM information_schema.TABLES a where table_schema not in ('sys','information_schema','performance_schema','mysql') ORDER BY (data_length + index_length) DESC limit 10;
#indextop10
#top 10 indexes by space used
idxtop10 =select iis.database_name as col1,iis.table_name as col2,iis.index_name as col3,round((iis.stat_value * @@innodb_page_size)/1024/1024/1024, 2) as col4, s.NON_UNIQUE as col5,s.INDEX_TYPE as col6,GROUP_CONCAT(s.COLUMN_NAME order by SEQ_IN_INDEX) as col7 from (select * from mysql.innodb_index_stats WHERE index_name not in ('PRIMARY', 'GEN_CLUST_INDEX') and stat_name = 'size' order by (stat_value * @@innodb_page_size) desc limit 10) iis left join INFORMATION_SCHEMA.STATISTICS s on (iis.database_name = s.TABLE_SCHEMA and iis.table_name = s.TABLE_NAME and iis.index_name = s.INDEX_NAME) where iis.database_name not in ('sys', 'information_schema', 'performance_schema', 'mysql') GROUP BY iis.database_name,iis.TABLE_NAME,iis.INDEX_NAME, (iis.stat_value * @@innodb_page_size), s.NON_UNIQUE,s.INDEX_TYPE order by (stat_value * @@innodb_page_size) desc limit 10;
#tables without a primary or unique key
nopk =select table_schema as col1, table_name as col2 from information_schema.tables where table_type = 'BASE TABLE' and (table_schema, table_name) not in (select /*+ subquery(materialization) */ a.TABLE_SCHEMA, a.TABLE_NAME from information_schema.TABLE_CONSTRAINTS a where a.CONSTRAINT_TYPE in ('PRIMARY KEY', 'UNIQUE') and table_schema not in ('mysql', 'information_schema', 'sys', 'performance_schema')) AND table_schema not in ('mysql', 'information_schema', 'sys', 'performance_schema');
#object counts per schema
obnum =select db as col1,object_type as col2,count as col3 from sys.schema_object_overview where db not in ('mysql', 'information_schema', 'sys', 'performance_schema');
#objects not using the InnoDB engine
noinnodb =select table_schema as col1,table_name as col2,engine as col3 from information_schema.tables where engine!='innodb' and table_schema not in('mysql','information_schema','performance_schema');
#tables with more than 6 indexes
indexnum5 =select table_schema as col1, table_name as col2,count(*) as col3 from (select distinct table_schema,table_name, INDEX_NAME from information_schema.STATISTICS where table_schema not in ('information_schema','mysql','performance_schema','sys') ) a group by table_schema,table_name having col3>6 order by table_schema,col3 desc,table_name;
#composite indexes with more than 4 columns
indexcolnum =select table_schema col1, table_name col2,index_name,count(index_name) col3 from information_schema.STATISTICS where table_schema not in('sys','information_schema','performance_schema','mysql') group by table_schema,table_name,index_name having col3>4 order by table_schema, col3 desc,table_name,index_name;
#tables with more than 50 columns
colnum50 =select TABLE_SCHEMA col1, TABLE_NAME col2, count(COLUMN_NAME) col3 from information_schema.COLUMNS where TABLE_SCHEMA not in('mysql', 'information_schema', 'sys', 'performance_schema') group by TABLE_SCHEMA, TABLE_NAME having col3 > 50 order by table_name;
#top 10 by physical IO (per file)
iotop10 =select file col1,total_read col2,avg_read col3,total_written col4,avg_write col5,total col6 from sys.io_global_by_file_by_bytes limit 10;
#top 10 memory consumers in the instance
memtop10 =select event_name col1,current_count col2,current_alloc col3,current_avg_alloc col4 from sys.memory_global_by_current_bytes limit 10;
#top 10 slow SQL
sqltop10 =select s.db col1, s.total_latency col2, s.avg_latency col3,s.last_seen col5, (select sl.DIGEST_TEXT from `PERFORMANCE_SCHEMA`.events_statements_summary_by_digest sl where s.digest = sl.DIGEST limit 1) col6 FROM sys.statement_analysis s where s.db not in ('mysql','information_schema','performance_schema','sys') and s.avg_latency like '% s' group by s.avg_latency order by s.avg_latency desc limit 10;
#top 10 SQL doing full table scans
fullscantop10 =SELECT s.db col1, s.no_index_used_count col2, s.no_index_used_pct col3, s.last_seen col4, (select sl.DIGEST_TEXT from `PERFORMANCE_SCHEMA`.events_statements_summary_by_digest sl where s.digest = sl.DIGEST limit 1) col5 FROM sys.statements_with_full_table_scans s where s.db not in ('mysql','information_schema','performance_schema','sys') LIMIT 10;
#top 10 tables by full table scans
fullscantbtop10 =select object_schema col1,object_name col2 ,rows_full_scanned col3 ,latency col4 from sys.schema_tables_with_full_table_scans limit 10;
#top 10 SQL using temporary tables
tmpsqltop10 =SELECT s.db col1, s.disk_tmp_tables col2, s.tmp_tables_to_disk_pct col3, s.last_seen col4, (select sl.DIGEST_TEXT from `PERFORMANCE_SCHEMA`.events_statements_summary_by_digest sl where s.digest = sl.DIGEST limit 1) col5 FROM sys.statements_with_temp_tables s where s.db not in ('mysql','information_schema','performance_schema','sys') LIMIT 10;
#top 30 objects by row operations
rowsdmltop30 =select table_schema col1,table_name col2,select_latency col3,rows_inserted col4,rows_updated col5,rows_deleted col6 from sys.schema_index_statistics limit 30;
#unused indexes
unuseidx =select object_schema col1,object_name col2,index_name col3 from sys.schema_unused_indexes;
#top 10 auto-increment usage ratios
incrtop10 =select table_schema col1,table_name col2,column_name col3,data_type col4,max_value col5,auto_increment col6,auto_increment_ratio col7 from sys.schema_auto_increment_columns limit 10;
#redundant indexes
redundantidx =select table_schema col1,table_name col2,redundant_index_name col3,redundant_index_columns col4,sql_drop_index col5 from sys.schema_redundant_indexes limit 10;
#version
myversion =select version() version from dual;
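One practical note on driving this list from Python: the statements contain literal % characters (the DATE_FORMAT masks), which collide with ConfigParser's default interpolation, so disable it when loading. A minimal sketch, assuming the statements above are saved in a file named check_sql.ini under the [variables] section and reusing the pymysql connection style from earlier:

import pymysql
from configparser import ConfigParser

sqlcfg = ConfigParser(interpolation=None)  # keep literal '%' in DATE_FORMAT etc.
sqlcfg.read("check_sql.ini")               # assumed file holding the [variables] section

conn = pymysql.connect(host="192.168.1.100", user="inspector",
                       password="***", charset="utf8mb4")
results = {}
with conn.cursor(pymysql.cursors.DictCursor) as cur:
    for name in sqlcfg.options("variables"):
        cur.execute(sqlcfg.get("variables", name))
        results[name] = cur.fetchall()  # list of dicts, keyed by variable name
conn.close()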
Screenshots (partial)
6. Database Historical Trend Charts
Fetching historical trend charts through the API in Python is straightforward, and plenty of ready-made code exists online, so it is not repeated here.
Screenshots (the full report is long; only part is shown here)
The sections above laid out the inspection framework and its contents in detail. The Python code is not shown in full because everyone implements it differently, but the basic logic is the same: query the data into dicts, then assemble the dicts into the Word template format and save.
Finally, a supplementary note on the code that analyzes the dict data: it applies judgment rules you define yourself and writes the qualifying findings into the inspection summary in the first part of the report.
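That logic depends on each site's own rules; as a minimal sketch of the idea, assuming the results and context dicts from the earlier sketches (the keys follow the [variables] names, and the thresholds are illustrative):

findings = []
# Tables without a primary or unique key are a replication/performance risk.
if results["nopk"]:
    findings.append("%d tables have no primary or unique key" % len(results["nopk"]))
# Transactions open longer than 5 minutes.
if results["sql5min"][0]["count_num"] > 0:
    findings.append("long-running transactions (>5 min) were found")
# Redundant indexes waste space and slow down writes.
if results["redundantidx"]:
    findings.append("redundant indexes exist; see the drop statements in section 5")
# The summary section of the Word template is filled from this list.
context["summary"] = findings if findings else ["no obvious risks found"]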