CPU CPU使用/cpu_count 超过 90% #容量#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保连接数使用率正常
col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';
二、请描述告警出现的可能原因
1、有效率差的语句,sql性能问题
2、语句并发过高
三、预计多久不处理就可以出现UIOC(P1)
CPU高可能导致业务语句执行缓慢甚至数据库不可用,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
select to_char(round(aa.value/cc.value,2))||'%' from v$sysmetric aa,v$parameter cc where aa.METRIC_NAME ='CPU Usage Per Sec' and aa.group_id=2 and cc.name ='cpu_count';
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt;
create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt;
create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt;
create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt;
create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;
create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt;
create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;
六、需要反馈上级以及运营的信息
将导致CPU异常的语句反馈给运营,评估决策是否kill会话
七、建议处理的流程,步骤和对应命令
(一)一般情况下异常语句导致CPU冲高会伴随等待事件,常规的定位方法如下:
1、 检查数据库的等待事件情况以及对应的语句
--查看当前数据库的top等待事件情况以及对应的语句
col EVENT for a40
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and sql_id is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;
--查看最近一分钟内,最消耗CPU的语句
select sql_id,count(*),
round(count(*)/sum(count(*)) over (),2) pctload
from v$active_session_history
where sample_time > sysdate -1/(24*60)
and session_type 'BACKGROUND'
and session_state= 'ON CPU'
group by sql_id
order by count(*) desc;
--查看近10分钟的会话状态分布,确定哪些sql处于on cpu模式,看sql有无性能问题
select sql_id, session_state, username, count(*)
from v$active_session_history a, dba_users s
where sample_time > sysdate - 10 / 24 / 60
and a.USER_ID = s.user_id
group by sql_id, session_state, username
order by 4 desc;
2、查询近1个小时每15分钟的TOP 5 SQL
select * from (select t.* ,row_number() over(partition by time order by cpu_time desc) rn from
(select to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi') time,s.sql_id,sum(s.executions_delta) execs,
sum(s.cpu_time_delta/1000/1000)cpu_time ,sum(s.elapsed_time_delta/1000/1000) elapsed_time
from dba_hist_sqlstat s , dba_hist_snapshot b
where s.instance_number=b.instance_number
and s.snap_id = b.snap_id
and b.end_interval_time > sysdate-1/24
group by to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi'),sql_id
)t) where rn 0
order by s.snap_id desc;
4、查看语句的执行计划明细,进一步分析是否是SQL性能问题
set linesize 150 pagesize 999
select * from table(dbms_xplan.display_cursor('&sql_id',&child_no,'ADVANCED'));
select * from table(dbms_xplan.display_cursor('&sql_id',null,'ADVANCED'));
select * from table(dbms_xplan.display_awr('&sql_id',&plan_hash_value,null,'ADVANCED'));
select * from table(dbms_xplan.display_sql_plan_baseline('&sql_handle','&plan_name'));
5、固化较优的执行计划,常见下面三种,待补充引用
固化历史较优执行计划
加hint生成新的执行计划并固化
从其他库迁移spm导入正确的执行计划
6、应急情况下,可根据语句或者等待事件KILL会话(需要运营决策可以KILL)
select 'alter system kill session ' || '''' || sid || ',' || serial# || ',@' || inst_id || '''' || ' immediate;'
from gv$session
where event='&event'//sql_id='&sql_id'
and type = 'USER';
7、查看语句涉及的对象的统计信息 (需要升级DA以及领导决策)
--查看语句涉及的对象信息
-- TABLE INFO
select distinct t.owner,t.table_name,t.num_rows,t.last_analyzed,t.degree,t.row_movement from v$sql_plan s, dba_tables t
where s.sql_id = '&1'
and s.object_type like '%TABLE%' and s.object_name = t.table_name and s.object_owner = t.owner order by last_analyzed;
-- INDEX INFO
select distinct t.owner,t.index_name,t.index_type,t.uniqueness,t.distinct_keys,t.num_rows,t.last_analyzed,t.degree from v$sql_plan s, dba_indexes t
where s.sql_id = '&1'
and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.owner order by last_analyzed;
-- INDEX PARTITION INFO
select distinct t.index_owner, t.index_name, t.column_name, t.column_position, t.descend,c.num_distinct,c.low_value,c.high_value,c.num_nulls,c.num_buckets,c.last_analyzed from v$sql_plan s, dba_ind_columns t, dba_tab_cols c
where s.sql_id = '&1'
and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.index_owner and t.table_owner = c.owner and t.table_name = c.table_name and t.column_name = c.column_name order by last_analyzed;
-- DDL INFO
select distinct t.owner, t.object_name,t.status,t.created, t.last_ddl_time, t.timestamp from v$sql_plan s, dba_objects t
where s.sql_id = '&1'
and s.object_name = t.OBJECT_NAME and s.object_owner = t.owner order by timestamp;
--查看对象的统计信息
select owner,table_name,column_name,num_distinct,density,last_analyzed,histogram from dba_tab_col_statistics where table_name='&table_name';
--收集统计信息(需要升级DA以及领导决策)
exec dbms_stats.gather_table_stats(ownname => '&owner',tabname => '&table_name',estimate_percent => 10,cascade => true);
8、判断是少索引,并且需要紧急创建索引(需要升级DA以及领导决策)
create index xxx.xxx on xxx.xxx(xxx) tablespace xxx initrans xxx online parallel xxx;
ACTIVE SESSION 超过300 #性能#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';
二、请描述告警出现的可能原因
1、异常等待事件导致活跃连接突增
三、预计多久不处理就可以出现UIOC(P1)
等待严重影响业务语句正常执行,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
select 'session_active:'||count(*) from v$session where status='ACTIVE' and TYPE !='BACKGROUND';
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt;
create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt;
create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt;
create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt;
create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;
create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt;
create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;
六、需要反馈上级以及运营的信息
将等待的情况反馈运营/DA,同时确认业务的影响情况,根据情况确认是否kill异常等待事件对应的会话
七、建议处理的流程,步骤和对应命令
1、查看数据库后台进程有没有被堵塞
select inst_id,type,count(*) from gv$session where blocking_session'' and status='ACTIVE' group by inst_id,type;
2、检查数据库的等待事件情况,确定top等待以及topsql
--查看当前的top等待和topsql
col EVENT for a40
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;
--ASH查看用户的top等待事件
select *
from (select event, username, count(*)
from v$active_session_history a, dba_users s
where sample_time >=
to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS')
AND sample_time =
to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS')
AND sample_time =to_timestamp('&begin_time','yyyy-mm-dd hh24:mi:ss') and sample_time oradebug unlimit
SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)
等待1分钟
SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)
等待1分钟
SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate 10 (RAC环境) --默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267
等待1分钟
SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate 10 (RAC环境) -----默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267
等待1分钟
SQL> oradebug tracefile_name --------保留trace文件
2)检查数据库是否有占用undo较大的会话,会影响重启数据库后的恢复时间
--查看单个会话占用最大的undo空间
with bksize as
(select /*+materialize */ value
from v$parameter where NAME = 'db_block_size')
select 'undo_session_size:'||nvl(max(round(tr.USED_UBLK * bksize.value / 1024 / 1024)), 0) || ' M' "undo_session_size(MB)"
from v$session se, v$transaction tr, bksize
where se.TADDR = tr.ADDR(+);
3)临时屏蔽crontab
crontab -l >.cron_file_bak
>cron_null
crontab cron_null
4)停监听 --需要先同运营确认是否可以停监听,注意标黄的需要替换,而且有的库存在多个监听,都要停掉
GI单实例: crsctl stop res $ORACLE_SID.lsnr -f
RAC环境: srvctl stop listener -l $ORACLE_SID
VCS单实例: lsnrctl stop $ORACLE_SID
5)如果连接数已满无法进入数据库,考虑从OS层面kill本实例LOCAL=NO的进程,使用下面语句匹配
ORA-00020: maximum number of processes (1500) exceeded
--替换实例名
ps -ef|grep -v grep|grep LOCAL=NO|grep -w oracle$ORACLE_SID |awk '{print $2}'|xargs kill -9
6)后台登陆数据库检查等待事件是否已释放,一定要确认好了,正常再启监听
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;
7)启动监听
GI单实例: crsctl start res $ORACLE_SID.lsnr
RAC环境: srvctl start listener -l $ORACLE_SID
VCS单实例: lsnrctl start $ORACLE_SID
8)启动crontab
crontab .cron_file_bak
9)检查GG
cd $GGS_HOME
./ggsci
info all
如果进程异常,使用start 进程名启动
八、验证问题已经恢复的步骤和命令
1、检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';
2、检查数据库的等待事件情况
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;
PROCESS 使用超过95% #配置#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保连接数使用率正常
col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';
二、请描述告警出现的可能原因
1、应用服务器连接配置超出数据库限制
2、异常等待事件导致连接堆积,常见场景是发版或者新上业务语句解析异常
三、预计多久不处理就可以出现UIOC(P1)
连接数打满时应用无法新建数据库连接,当达到90%时需要立即介入处理,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--查看当前连接数使用情况,process_pct代表使用率
col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;
六、需要反馈上级以及运营的信息
将连接会话明细发给运营确认应用是否有异常,是否可以停部分应用,是否最近有扩容应用服务器的情况,评估紧急kill会话
如果有扩容应用服务器,联系DA评估是否安排变更调整连接数
七、建议处理的流程,步骤和对应命令
1、查看最近连接使用情况,可以初步看出,是从哪个时间点突增的
set line 1000;
set pagesize 1000;
col PROCESS_VALUE for a20;
select end_time, current_process,process_value ,used_pct from
(select to_char(trunc(end_time,'mi'), 'YYYY-MM-DD HH24:MI:SS') end_time,
round(value*limit_value/100) current_process,
limit_value process_value,
round(value,2)||'%' used_pct
from v$sysmetric_history,
(select limit_value from v$resource_limit where resource_name = 'processes')
where metric_name = 'Process Limit %') order by end_time desc;
2、检查当前连接分布明细
--连接分布
select * from (select username,osuser,status,machine,count(*) from v$session group by username ,osuser,status,machine order by 5 desc) where rownum 0
and &beg_snap < snap_id
and snap_id