-- name: lfsdiag.sql
-- ------------------------------------------------------------------------
-- author: michael polaski - oracle support services
-- purpose:
-- this script is intended to provide a user friendly guide to troubleshoot
-- log file sync waits. the script will look at important parameters involved
-- in log file sync waits, log file sync wait histogram data, and the script
-- will look at the worst average log file sync times in the active session
-- history data and awr data and dump information to help determine why those
-- times were the highest. the script will create a file called
-- lfsdiag_<dbname>_<timestamp>.out in your local directory.
-- session setup: silence command echo and row-count feedback so that the
-- spooled report only contains diagnostic output.
set echo off
set feedback off

-- capture the run time and the spool file extension into the substitution
-- variables "timestamp" and "suffix" (new_value keeps the last fetched value).
column timecol new_value timestamp
column spool_extension new_value suffix
select to_char(sysdate, 'mondd_hh24mi') timecol,
       '.out' spool_extension
  from sys.dual;

-- capture the database name, followed by an underscore, into "dbname".
column output new_value dbname
select value || '_' output
  from v$parameter
 where name = 'db_name';

-- the report is written to lfsdiag_<dbname>_<timestamp>.out
-- fix: the timestamp variable reference had been garbled into a
-- multiplication sign, which produced a wrong spool file name.
spool lfsdiag_&&dbname&&timestamp&&suffix
set trim on
set trims on
set lines 130
set pages 100
set verify off

-- pin optimizer behaviour so the diagnostic queries get the same plans on
-- every release the script is run against.
alter session set optimizer_features_enable = '10.2.0.4';
prompt lfsdiag data for &&dbname&&timestamp
prompt note: all timings are in milliseconds (1000 milliseconds = 1 second)
prompt
prompt important parameters relating to log file sync waits:
-- per-instance listing of archive, commit, event and lgwr related parameters.
-- log_archive_dest_state parameters set to 'enable' and log_archive_format
-- are left out of the listing.
column name format a40 wra
column value format a40 wra
select p.inst_id,
       p.name,
       p.value
  from gv$parameter p
 where (   (p.name like '%log_archive%' and p.value is not null)
        or p.name like '%commit%'
        or p.name like '%event=%'
        or p.name like '%lgwr%')
   and p.name not in (select x.name
                        from gv$parameter x
                       where (x.name like '%log_archive_dest_state%' and x.value = 'enable')
                          or x.name = 'log_archive_format')
 order by p.inst_id, p.name, p.value;
prompt histogram data for lfs and other related waits:
prompt approach: look at the wait distribution for log file sync waits
prompt by looking at "wait_time_milli". look at the high wait times then
prompt see if you can correlate those with other related wait events.
column event format a40 wra
-- first report: wait histogram grouped by event, then instance, then bucket.
-- upper() makes the lgwr and lns matches case-insensitive. those wait event
-- names are spelled in capitals, so the lower-case patterns never matched.
select h.inst_id,
       h.event,
       h.wait_time_milli,
       h.wait_count
  from gv$event_histogram h
 where h.event in ('log file sync',
                   'gcs log flush sync',
                   'log file parallel write',
                   'wait for scn ack',
                   'log file switch completion',
                   'gc cr grant 2-way',
                   'gc buffer busy',
                   'gc current block 2-way')
    or upper(h.event) like '%LGWR%'
    or upper(h.event) like '%LNS%'
 order by h.event desc, h.inst_id, h.wait_time_milli;
prompt ordered by wait_time_milli
-- second report: the same rows, ordered by histogram bucket first so that
-- events can be compared bucket by bucket. the select, from and where
-- clauses were missing here, leaving a bare order by that could not run.
select h.inst_id,
       h.event,
       h.wait_time_milli,
       h.wait_count
  from gv$event_histogram h
 where h.event in ('log file sync',
                   'gcs log flush sync',
                   'log file parallel write',
                   'wait for scn ack',
                   'log file switch completion',
                   'gc cr grant 2-way',
                   'gc buffer busy',
                   'gc current block 2-way')
    or upper(h.event) like '%LGWR%'
    or upper(h.event) like '%LNS%'
 order by h.wait_time_milli, h.inst_id, h.event desc;
prompt redo write stats
prompt "redo write time" in centiseconds (100 per second)
prompt 11.1: "redo write broadcast ack time" in centiseconds (100 per second)
prompt 11.2: "redo write broadcast ack time" in microseconds (1000 per millisecond)
column value format 99999999999999999999
column milliseconds format 99999999999999.999
-- redo write statistics per instance with a milliseconds column derived from
-- the units described in the prompts above:
--   redo write time               centiseconds, so value * 10 (all versions)
--   redo write broadcast ack time centiseconds on 11.1, so value * 10
--                                 microseconds on 11.2, so value / 1000
-- any other statistic or version yields null in the milliseconds column.
-- v$instance is joined without a predicate on purpose: it only supplies the
-- version string of the instance the script is connected to.
-- fix: the 11.2 branch had lost its nested decode (unbalanced parentheses)
-- and the statement had no terminator.
select v.version,
       ss.inst_id,
       ss.name,
       ss.value,
       decode(substr(v.version, 1, 4),
              '11.1', decode(ss.name,
                             'redo write time', ss.value * 10,
                             'redo write broadcast ack time', ss.value * 10),
              '11.2', decode(ss.name,
                             'redo write time', ss.value * 10,
                             'redo write broadcast ack time', ss.value / 1000),
              decode(ss.name, 'redo write time', ss.value * 10)) milliseconds
  from gv$sysstat ss,
       v$instance v
 where ss.name like 'redo write%'
   and ss.value > 0
 order by v.version, ss.inst_id, ss.name;
prompt ash threshold...
prompt this will be the threshold in milliseconds for average log file sync
prompt times. this will be used for the next queries to look for the worst
prompt 'log file sync' minutes. any minutes that have an average log file
prompt sync time greater than the threshold will be analyzed further.
-- average the 'log file sync' wait per instance and minute, sort the averages
-- from worst to best, keep the first five and store the smallest of those in
-- the "threshold" substitution variable for the queries that follow.
column threshold_in_ms new_value threshold format 999999999.999
select min(worst.threshold_in_ms) threshold_in_ms
  from (select ash.inst_id,
               to_char(ash.sample_time, 'mondd_hh24mi') minute,
               avg(ash.time_waited) / 1000 threshold_in_ms
          from gv$active_session_history ash
         where ash.event = 'log file sync'
         group by ash.inst_id, to_char(ash.sample_time, 'mondd_hh24mi')
         order by threshold_in_ms desc) worst
 where rownum <= 5;
prompt ash worst minutes for log file sync waits:
prompt approach: these are the minutes where the avg log file sync time
prompt was the highest (in milliseconds).
column event format a30 tru
column program format a35 tru
column total_wait_time format 999999999999.999
column avg_time_waited format 999999999999.999
-- one row per minute and instance whose average 'log file sync' wait is
-- above the threshold captured earlier in the script.
select to_char(ash.sample_time, 'mondd_hh24mi') minute,
       ash.inst_id,
       ash.event,
       sum(ash.time_waited) / 1000 total_wait_time,
       count(*) waits,
       avg(ash.time_waited) / 1000 avg_time_waited
  from gv$active_session_history ash
 where ash.event = 'log file sync'
 group by to_char(ash.sample_time, 'mondd_hh24mi'), ash.inst_id, ash.event
having avg(ash.time_waited) / 1000 > &&threshold
 order by minute, ash.inst_id;
prompt ash lfs background process waits during worst minutes:
prompt approach: what is lgwr doing when 'log file sync' waits
prompt are happening? lms info may be relevent for broadcast
prompt on commit and lns data may be relevant for dataguard.
prompt if more details are needed see the ash details for worst
prompt minutes section at the bottom of the report.
column inst format 999
column program format a35 wra
-- for every minute whose average 'log file sync' wait exceeded the threshold,
-- summarise what the lgwr, lms and lns processes (and the sessions waiting on
-- 'log file sync') were waiting for.
-- fix: the statement had lost its select keyword, part of its select list and
-- both from clauses, so it could not be parsed.
-- fix: upper() makes the program match case-insensitive. background process
-- names appear in capitals in the program column, so the lower-case patterns
-- never matched.
select to_char(ash.sample_time, 'mondd_hh24mi') minute,
       ash.inst_id inst,
       ash.program,
       ash.event,
       sum(ash.time_waited) / 1000 total_wait_time,
       count(*) waits,
       avg(ash.time_waited) / 1000 avg_time_waited
  from gv$active_session_history ash
 where to_char(ash.sample_time, 'mondd_hh24mi') in
       (select to_char(lfs.sample_time, 'mondd_hh24mi')
          from gv$active_session_history lfs
         where lfs.event = 'log file sync'
         group by to_char(lfs.sample_time, 'mondd_hh24mi'), lfs.inst_id
        having avg(lfs.time_waited) / 1000 > &&threshold)
   and (   upper(ash.program) like '%LGWR%'
        or upper(ash.program) like '%LMS%'
        or upper(ash.program) like '%LNS%'
        or ash.event = 'log file sync')
 group by to_char(ash.sample_time, 'mondd_hh24mi'), ash.inst_id, ash.program, ash.event
 order by minute, inst, program, total_wait_time desc, event;
prompt awr worst avg log file sync snaps:
prompt approach: these are the awr snaps where the average 'log file sync'
prompt times were the highest.
column begin format a12 tru
column end format a12 tru
column name format a13 tru
-- the inline view ranks every 'log file sync' row in wrh$_system_event by its
-- average wait and the rownum filter keeps the first three snap ids. the
-- outer query then shows the 'log file sync' figures for those snapshots on
-- every instance.
select dhs.snap_id,
       dhs.instance_number inst,
       to_char(dhs.begin_interval_time, 'mondd_hh24mi') begin,
       to_char(dhs.end_interval_time, 'mondd_hh24mi') end,
       en.name,
       se.time_waited_micro / 1000 total_wait_time,
       se.total_waits,
       se.time_waited_micro / 1000 / se.total_waits avg_time_waited
  from dba_hist_snapshot dhs,
       wrh$_system_event se,
       v$event_name en
 where se.snap_id = dhs.snap_id
   and se.instance_number = dhs.instance_number
   and en.event_id = se.event_id
   and en.name = 'log file sync'
   and dhs.snap_id in
       (select worst.snap_id
          from (select se2.snap_id,
                       se2.time_waited_micro / 1000 / se2.total_waits avg_time_waited
                  from wrh$_system_event se2,
                       v$event_name en2
                 where se2.event_id = en2.event_id
                   and en2.name = 'log file sync'
                 order by avg_time_waited desc) worst
         where rownum < 4)
 order by dhs.snap_id, dhs.instance_number;
prompt awr redo write stats
column stat_name format a30 tru
-- redo write statistics recorded in awr for the three snapshots with the
-- highest average 'log file sync' wait. the milliseconds column uses the same
-- unit rules as the live redo write stats report:
--   redo write time               value * 10 (all versions)
--   redo write broadcast ack time value * 10 on 11.1, value / 1000 on 11.2
-- v$instance is joined without a predicate on purpose: it only supplies the
-- version string of the instance the script is connected to.
-- fix: the statement had lost its select keyword, the version and value
-- columns, most of the decode and its terminator.
select v.version,
       ss.snap_id,
       ss.instance_number inst,
       sn.stat_name,
       ss.value,
       decode(substr(v.version, 1, 4),
              '11.1', decode(sn.stat_name,
                             'redo write time', ss.value * 10,
                             'redo write broadcast ack time', ss.value * 10),
              '11.2', decode(sn.stat_name,
                             'redo write time', ss.value * 10,
                             'redo write broadcast ack time', ss.value / 1000),
              decode(sn.stat_name, 'redo write time', ss.value * 10)) milliseconds
  from wrh$_sysstat ss,
       wrh$_stat_name sn,
       v$instance v
 where ss.stat_id = sn.stat_id
   and sn.stat_name like 'redo write%'
   and ss.value > 0
   and ss.snap_id in
       (select worst.snap_id
          from (select se.snap_id,
                       se.time_waited_micro / 1000 / se.total_waits avg_time_waited
                  from wrh$_system_event se,
                       v$event_name en
                 where se.event_id = en.event_id
                   and en.name = 'log file sync'
                 order by avg_time_waited desc) worst
         where rownum < 4)
 order by v.version, ss.snap_id, ss.instance_number, sn.stat_name;
prompt awr lfs and other related waits for worst lfs awrs:
prompt approach: these are the awr snaps where the average 'log file sync'
prompt times were the highest. look at related waits at those times.
column name format a40 tru
-- 'log file sync' and related wait events for the three snapshots with the
-- highest average 'log file sync' wait, worst average first per snapshot.
-- fix: the snap_id subquery was missing after "in", so the statement could
-- not be parsed. the first line of the approach prompt was missing as well.
-- fix: upper() makes the lgwr and lns matches case-insensitive. those wait
-- event names are spelled in capitals, so the lower-case patterns never
-- matched.
select se.snap_id,
       se.instance_number inst,
       en.name,
       se.total_waits,
       se.time_waited_micro / 1000 total_wait_time,
       se.time_waited_micro / 1000 / se.total_waits avg_time_waited
  from wrh$_system_event se,
       v$event_name en
 where se.event_id = en.event_id
   and (   en.name in ('log file sync',
                       'gcs log flush sync',
                       'log file parallel write',
                       'wait for scn ack',
                       'log file switch completion',
                       'gc cr grant 2-way',
                       'gc buffer busy',
                       'gc current block 2-way')
        or upper(en.name) like '%LGWR%'
        or upper(en.name) like '%LNS%')
   and se.snap_id in
       (select worst.snap_id
          from (select se2.snap_id,
                       se2.time_waited_micro / 1000 / se2.total_waits avg_time_waited
                  from wrh$_system_event se2,
                       v$event_name en2
                 where se2.event_id = en2.event_id
                   and en2.name = 'log file sync'
                 order by avg_time_waited desc) worst
         where rownum < 4)
 order by se.snap_id, avg_time_waited desc;
prompt awr histogram data for lfs and other related waits for worst lfs awrs:
prompt note: this query won't work on 10.2 - ora-942
-- first report: awr wait histogram of 'log file sync' and related events for
-- the three snapshots with the highest average 'log file sync' wait, grouped
-- by event name within each snapshot.
-- fix: the event name column, the event filter and the snap_id subquery were
-- missing, so the statement could not be parsed.
-- upper() makes the lgwr and lns matches case-insensitive because those wait
-- event names are spelled in capitals.
select eh.snap_id,
       eh.instance_number inst,
       en.name,
       eh.wait_time_milli,
       eh.wait_count
  from wrh$_event_histogram eh,
       v$event_name en
 where eh.event_id = en.event_id
   and (   en.name in ('log file sync',
                       'gcs log flush sync',
                       'log file parallel write',
                       'wait for scn ack',
                       'log file switch completion',
                       'gc cr grant 2-way',
                       'gc buffer busy',
                       'gc current block 2-way')
        or upper(en.name) like '%LGWR%'
        or upper(en.name) like '%LNS%')
   and eh.snap_id in
       (select worst.snap_id
          from (select se.snap_id,
                       se.time_waited_micro / 1000 / se.total_waits avg_time_waited
                  from wrh$_system_event se,
                       v$event_name en2
                 where se.event_id = en2.event_id
                   and en2.name = 'log file sync'
                 order by avg_time_waited desc) worst
         where rownum < 4)
 order by eh.snap_id, en.name desc, eh.instance_number, eh.wait_time_milli;
prompt ordered by wait_time_milli
-- second report: the same rows ordered by histogram bucket within each
-- snapshot. only a bare order by was left of this statement.
select eh.snap_id,
       eh.instance_number inst,
       en.name,
       eh.wait_time_milli,
       eh.wait_count
  from wrh$_event_histogram eh,
       v$event_name en
 where eh.event_id = en.event_id
   and (   en.name in ('log file sync',
                       'gcs log flush sync',
                       'log file parallel write',
                       'wait for scn ack',
                       'log file switch completion',
                       'gc cr grant 2-way',
                       'gc buffer busy',
                       'gc current block 2-way')
        or upper(en.name) like '%LGWR%'
        or upper(en.name) like '%LNS%')
   and eh.snap_id in
       (select worst.snap_id
          from (select se.snap_id,
                       se.time_waited_micro / 1000 / se.total_waits avg_time_waited
                  from wrh$_system_event se,
                       v$event_name en2
                 where se.event_id = en2.event_id
                   and en2.name = 'log file sync'
                 order by avg_time_waited desc) worst
         where rownum < 4)
 order by eh.snap_id, eh.wait_time_milli, eh.instance_number, en.name desc;
prompt ash details for worst minutes:
prompt approach: if you cannot determine the problem from the data
prompt above, you may need to look at the details of what each session
prompt is doing during each 'bad' snap. most likely you will want to
prompt note the times of the high log file sync waits, look at what
prompt lgwr is doing at those times, and go from there...
column program format a45 wra
column sample_time format a25 tru
column time_waited format 999999.999
column p1 format a40 tru
column p2 format a40 tru
column p3 format a40 tru
-- every ash sample taken during a minute whose average 'log file sync' wait
-- exceeded the threshold, with the wait parameters labelled by their text.
-- fix: the statement had lost the instance, program and event columns and its
-- whole from and where clause, so it could not be parsed.
select ash.sample_time,
       ash.inst_id inst,
       ash.session_id,
       ash.program,
       ash.event,
       ash.time_waited / 1000 time_waited,
       ash.p1text || ': ' || ash.p1 p1,
       ash.p2text || ': ' || ash.p2 p2,
       ash.p3text || ': ' || ash.p3 p3
  from gv$active_session_history ash
 where to_char(ash.sample_time, 'mondd_hh24mi') in
       (select to_char(lfs.sample_time, 'mondd_hh24mi')
          from gv$active_session_history lfs
         where lfs.event = 'log file sync'
         group by to_char(lfs.sample_time, 'mondd_hh24mi'), lfs.inst_id
        having avg(lfs.time_waited) / 1000 > &&threshold)
 order by ash.sample_time, ash.inst_id, ash.session_id, ash.program, ash.event;
-- stamp the report with its completion time, close the spool file and tell
-- the user where the report was written.
-- fix: the timestamp variable reference in the final prompt had been garbled
-- into a multiplication sign, so the printed file name was wrong.
select to_char(sysdate, 'mondd hh24:mi:ss') time
  from dual;
spool off
prompt output file is: lfsdiag_&&dbname&&timestamp&&suffix