其他1

2023年 9月 2日 30.8k 0

CPU CPU使用/cpu_count 超过 90% #容量#

一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)

检查连接情况,确保连接数使用率正常

col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';

二、请描述告警出现的可能原因

1、有效率差的语句,sql性能问题

2、语句并发过高

三、预计多久不处理就可以出现UIOC(P1)

CPU高可能导致业务语句执行缓慢甚至数据库不可用,需15分钟内处理正常

四、告警检查的步骤(通过本步骤确定告警是否是误告)

select to_char(round(aa.value/cc.value,2))||'%'  from v$sysmetric aa,v$parameter cc  where aa.METRIC_NAME ='CPU Usage Per Sec' and aa.group_id=2 and cc.name ='cpu_count';

五、需要保存现场的步骤/命令

create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt;
create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt;
create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt;
create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt;
create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;
create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt;
create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;

六、需要反馈上级以及运营的信息

将导致CPU异常的语句反馈给运营,评估决策是否kill会话

七、建议处理的流程,步骤和对应命令

(一)一般情况下异常语句导致CPU冲高会伴随等待事件,常规的定位方法如下:

1、 检查数据库的等待事件情况以及对应的语句

--查看当前数据库的top等待事件情况以及对应的语句

col EVENT for a40
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and sql_id is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;

--查看最近一分钟内,最消耗CPU的语句

select sql_id,count(*), 
round(count(*)/sum(count(*)) over (),2) pctload 
from v$active_session_history 
where sample_time > sysdate -1/(24*60) 
and session_type 'BACKGROUND' 
and session_state= 'ON CPU' 
group by sql_id 
order by count(*) desc; 

--查看近10分钟的会话状态分布,确定哪些sql处于on cpu模式,看sql有无性能问题

select sql_id, session_state, username, count(*)
from v$active_session_history a, dba_users s
where sample_time > sysdate - 10 / 24 / 60
and a.USER_ID = s.user_id
group by sql_id, session_state, username
order by 4 desc;

2、查询近1个小时每15分钟的TOP 5 SQL

select * from (select t.* ,row_number() over(partition by time order by cpu_time desc) rn from 
(select to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi') time,s.sql_id,sum(s.executions_delta) execs,
sum(s.cpu_time_delta/1000/1000)cpu_time ,sum(s.elapsed_time_delta/1000/1000) elapsed_time 
from dba_hist_sqlstat s , dba_hist_snapshot b 
where s.instance_number=b.instance_number
and s.snap_id = b.snap_id
and b.end_interval_time  > sysdate-1/24
group by to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi'),sql_id
)t) where rn 0
order by s.snap_id desc;

4、查看语句的执行计划明细,进一步分析是否是SQL性能问题

set linesize 150 pagesize 999
select * from table(dbms_xplan.display_cursor('&sql_id',&child_no,'ADVANCED'));
select * from table(dbms_xplan.display_cursor('&sql_id',null,'ADVANCED'));
select * from table(dbms_xplan.display_awr('&sql_id',&plan_hash_value,null,'ADVANCED'));
select * from table(dbms_xplan.display_sql_plan_baseline('&sql_handle','&plan_name'));  

5、固化较优的执行计划,常见下面三种,待补充引用

固化历史较优执行计划

加hint生成新的执行计划并固化

从其他库迁移spm导入正确的执行计划

6、应急情况下,可根据语句或者等待事件KILL会话(需要运营决策可以KILL)

select 'alter system kill session ' || '''' || sid || ',' || serial# || ',@' || inst_id || '''' || ' immediate;'
  from gv$session
 where event='&event'//sql_id='&sql_id'
   and type = 'USER';

7、查看语句涉及的对象的统计信息 (需要升级DA以及领导决策)

--查看语句涉及的对象信息

-- TABLE INFO 
select distinct t.owner,t.table_name,t.num_rows,t.last_analyzed,t.degree,t.row_movement from v$sql_plan s, dba_tables t
 where s.sql_id = '&1'
   and s.object_type like '%TABLE%' and s.object_name = t.table_name and s.object_owner = t.owner order by last_analyzed;
-- INDEX INFO
select distinct t.owner,t.index_name,t.index_type,t.uniqueness,t.distinct_keys,t.num_rows,t.last_analyzed,t.degree  from v$sql_plan s, dba_indexes t
 where s.sql_id = '&1'
   and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.owner order by last_analyzed;
-- INDEX PARTITION INFO
select distinct t.index_owner, t.index_name, t.column_name, t.column_position,  t.descend,c.num_distinct,c.low_value,c.high_value,c.num_nulls,c.num_buckets,c.last_analyzed  from v$sql_plan s, dba_ind_columns t, dba_tab_cols c
 where s.sql_id = '&1'
   and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.index_owner and t.table_owner = c.owner and t.table_name = c.table_name and t.column_name = c.column_name order by last_analyzed;
-- DDL INFO
select distinct t.owner, t.object_name,t.status,t.created, t.last_ddl_time, t.timestamp from v$sql_plan s, dba_objects t
 where s.sql_id = '&1'
   and s.object_name = t.OBJECT_NAME and s.object_owner = t.owner order by timestamp;

--查看对象的统计信息

select owner,table_name,column_name,num_distinct,density,last_analyzed,histogram from dba_tab_col_statistics  where table_name='&table_name';

--收集统计信息(需要升级DA以及领导决策)

exec dbms_stats.gather_table_stats(ownname => '&owner',tabname => '&table_name',estimate_percent => 10,cascade => true);

8、判断是少索引,并且需要紧急创建索引(需要升级DA以及领导决策)

create index xxx.xxx on xxx.xxx(xxx)  tablespace xxx initrans xxx online parallel xxx;

ACTIVE SESSION 超过300 #性能#

一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)

检查连接情况,确保能正常登录、有应用连接、连接数使用率正常

col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';

二、请描述告警出现的可能原因

1、异常等待事件导致活跃连接突增

三、预计多久不处理就可以出现UIOC(P1)

等待严重影响业务语句正常执行,需15分钟内处理正常

四、告警检查的步骤(通过本步骤确定告警是否是误告)

select 'session_active:'||count(*) from v$session where status='ACTIVE' and TYPE !='BACKGROUND';

五、需要保存现场的步骤/命令

create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt;
create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt;
create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt;
create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt;
create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;
create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt;
create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;

六、需要反馈上级以及运营的信息

将等待的情况反馈运营/DA,同时确认业务的影响情况,根据情况确认是否kill异常等待事件对应的会话

七、建议处理的流程,步骤和对应命令

1、查看数据库后台进程有没有被堵塞

select inst_id,type,count(*) from gv$session where blocking_session'' and status='ACTIVE' group by inst_id,type;

2、检查数据库的等待事件情况,确定top等待以及topsql

--查看当前的top等待和topsql
col EVENT for a40
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;

--ASH查看用户的top等待事件
select *
  from (select event, username, count(*)
          from v$active_session_history a, dba_users s
         where sample_time >=
               to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS')
           AND sample_time =
               to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS')
           AND sample_time =to_timestamp('&begin_time','yyyy-mm-dd hh24:mi:ss')  and sample_time oradebug unlimit

SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)

等待1分钟

SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)

等待1分钟

SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate  10 (RAC环境) --默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267

等待1分钟

SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate  10 (RAC环境) -----默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267

等待1分钟

SQL> oradebug tracefile_name --------保留trace文件

2)检查数据库是否有占用undo较大的会话,会影响重启数据库后的恢复时间

--查看单个会话占用最大的undo空间

with bksize as
 (select /*+materialize */ value
    from v$parameter where NAME = 'db_block_size')
select 'undo_session_size:'||nvl(max(round(tr.USED_UBLK * bksize.value / 1024 / 1024)), 0) || ' M' "undo_session_size(MB)"
  from v$session se, v$transaction tr, bksize
 where se.TADDR = tr.ADDR(+);

3)临时屏蔽crontab

crontab -l >.cron_file_bak

>cron_null

crontab cron_null

4)停监听    --需要先同运营确认是否可以停监听,注意标黄的需要替换,而且有的库存在多个监听,都要停掉

GI单实例:  crsctl stop res $ORACLE_SID.lsnr -f

RAC环境: srvctl stop listener -l $ORACLE_SID

VCS单实例: lsnrctl stop $ORACLE_SID

5)如果连接数已满无法进入数据库,考虑从OS层面kill本实例LOCAL=NO的进程,使用下面语句匹配

ORA-00020: maximum number of processes (1500) exceeded

--替换实例名

ps -ef|grep -v grep|grep LOCAL=NO|grep -w oracle$ORACLE_SID |awk '{print $2}'|xargs kill -9

6)后台登陆数据库检查等待事件是否已释放,一定要确认好了,正常再启监听

select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;

7)启动监听

GI单实例:  crsctl start res $ORACLE_SID.lsnr

RAC环境: srvctl start listener -l $ORACLE_SID

VCS单实例: lsnrctl start $ORACLE_SID

8)启动crontab

crontab .cron_file_bak

9)检查GG

cd $GGS_HOME

./ggsci

info all

如果进程异常,使用start 进程名启动

八、验证问题已经恢复的步骤和命令

1、检查连接情况,确保能正常登录、有应用连接、连接数使用率正常

col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';

2、检查数据库的等待事件情况

select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class 'Idle' group by inst_id,sql_id,event order by count(*) desc;

PROCESS 使用超过95% #配置#

一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)

检查连接情况,确保连接数使用率正常

col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';

二、请描述告警出现的可能原因

1、应用服务器连接配置超出数据库限制

2、异常等待事件导致连接堆积,常见场景是发版或者新上业务语句解析异常

三、预计多久不处理就可以出现UIOC(P1)

连接数打满时应用无法新建数据库连接,当达到90%时需要立即介入处理,需15分钟内处理正常

四、告警检查的步骤(通过本步骤确定告警是否是误告)

--查看当前连接数使用情况,process_pct代表使用率

col PROCESS_LIMIT for a20;
select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct
from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';

五、需要保存现场的步骤/命令

create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt;
create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt;
create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;

六、需要反馈上级以及运营的信息

将连接会话明细发给运营确认应用是否有异常,是否可以停部分应用,是否最近有扩容应用服务器的情况,评估紧急kill会话

如果有扩容应用服务器,联系DA评估是否安排变更调整连接数

七、建议处理的流程,步骤和对应命令

1、查看最近连接使用情况,可以初步看出,是从哪个时间点突增的

set line 1000;
set pagesize 1000;
col PROCESS_VALUE for a20;
select end_time, current_process,process_value ,used_pct from 
(select to_char(trunc(end_time,'mi'), 'YYYY-MM-DD HH24:MI:SS') end_time, 
round(value*limit_value/100) current_process,
limit_value process_value,
round(value,2)||'%' used_pct
from v$sysmetric_history,
(select limit_value from v$resource_limit where resource_name = 'processes') 
where metric_name = 'Process Limit %') order by end_time desc;

2、检查当前连接分布明细

--连接分布

select * from (select username,osuser,status,machine,count(*) from v$session group by username ,osuser,status,machine order by 5 desc) where rownum 0
                   and &beg_snap < snap_id
                   and snap_id

相关文章

Oracle如何使用授予和撤销权限的语法和示例
Awesome Project: 探索 MatrixOrigin 云原生分布式数据库
下载丨66页PDF,云和恩墨技术通讯(2024年7月刊)
社区版oceanbase安装
Oracle 导出CSV工具-sqluldr2
ETL数据集成丨快速将MySQL数据迁移至Doris数据库

发布评论