February 2019 ~ 迪貝之家

Microsoft SQL Server Log Shipping

A technology that has existed since SQL Server 7. Simply put, it keeps a copy of a database synchronized with its source. The copy can be opened for read access to offload work from the production database, or it can serve purely as a standby, either locally or at a remote site. How is it done? SQL Agent jobs automate the backup, file copy, and restore steps on a schedule.
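
At its core those jobs just automate a backup / copy / restore cycle. Below is a minimal sketch of the three steps, with a hypothetical database name and paths; real setups normally use the SSMS wizard or the sp_add_log_shipping_* procedures rather than hand-rolled commands like these:

-- On the primary: back up the transaction log to a share the secondary can reach
BACKUP LOG [SalesDB] TO DISK = N'\\backupshare\logship\SalesDB_20190201.trn';

-- (the copy job moves the .trn file over to the secondary server)

-- On the secondary: restore it, keeping the copy readable between restores
RESTORE LOG [SalesDB]
FROM DISK = N'D:\logship\SalesDB_20190201.trn'
WITH STANDBY = N'D:\logship\SalesDB_undo.tuf';
-- use WITH NORECOVERY instead if the copy is a pure standby and does not need to be readable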

Oracle GoldenGate

A great tool for synchronizing databases across sites in near real time, for example during a data center relocation.

Nagios Database Operations Automation

When I first designed this architecture I ruled out remote agents, because in an organization with strict separation of duties, asking other teams to install new software on existing systems is rarely feasible. Being the DBA, all I can do is treat the database instance itself as one big agent and find ways to do everything I need from inside the database.
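
In practice that just means every check is a SQL script executed over a normal client connection, and the script itself decides whether things are healthy. A hypothetical minimal check in the same spirit as the scripts later in this post (print N'Y' when everything is fine, otherwise N'N:' plus the details, so the Nagios plugin only has to look at the first character):

-- hypothetical example: flag blocked sessions on a SQL Server instance
DECLARE @blocked int;
SELECT @blocked = count(*) FROM sys.dm_exec_requests WHERE blocking_session_id <> 0;
IF @blocked = 0
   print N'Y';
ELSE
   print N'N:' + cast(@blocked as varchar) + ' blocked sessions';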


Oracle: How to Kill a Session Without Top-Level Privileges

Under a typical separation of duties, the application (AP) team is not normally granted the privilege to kill sessions, but urgent requests to cut a database connection still come in from time to time. A DBA is not sitting around idle, on call just to kill sessions for you, so I wrote a stored procedure that lets the AP team kill their own sessions. This is only a stopgap, though: my advice is still to find the root cause and fix the underlying problem; that is the right way to work.

create or replace procedure kill_session(sid_in varchar2, serial_num_in varchar2) authid definer
as
  not_allowed  exception;
  killed exception;
  ses_rec v$session%rowtype;
  v_file  UTL_FILE.FILE_TYPE;
  kill_log_file varchar2(100) := '.kill_ses_'||to_char(sysdate,'YYYYMMDD')||'.log';
  current_session_user varchar2(100);
  client_host          varchar2(100);
  client_ip            varchar2(100);
  client_terminal      varchar2(100);
  client_module        varchar2(100);
  msg varchar2(100);
begin


   -- re-create the log directory object on every call; the procedure owner
   -- needs CREATE ANY DIRECTORY granted directly for this to work
   execute immediate 'create or replace directory kill_sess_log_dir as ''/tmp''';

   -- look up the target session; raises NO_DATA_FOUND if it does not exist
   select * into ses_rec
   from v$session
   where sid = sid_in
   and   serial# = serial_num_in;

   -- never allow killing system/background sessions
   if ses_rec.username in ('SYS','SYSTEM','DBSNMP','SYSMAN') or ses_rec.username is null
   then
       raise not_allowed;
   end if;

   if ses_rec.status = 'KILLED'
   then
       raise killed;
   end if;

   -- the owner needs ALTER SYSTEM granted directly for this definer-rights procedure to work
   execute immediate 'alter system kill session ''' || sid_in || ', ' || serial_num_in || '''' ;

   -- capture who asked for the kill, for the audit log
   SELECT SYS_CONTEXT('USERENV','SESSION_USER'),
          SYS_CONTEXT('USERENV','HOST'),
          SYS_CONTEXT('USERENV','IP_ADDRESS'),
          SYS_CONTEXT('USERENV','TERMINAL'),
          SYS_CONTEXT('USERENV','MODULE')
   INTO   current_session_user,
          client_host,
          client_ip,
          client_terminal,
          client_module
   FROM   DUAL;


 v_file := UTL_FILE.FOPEN(location     => 'KILL_SESS_LOG_DIR',
                           filename     => kill_log_file,
                           open_mode    => 'a',
                           max_linesize => 32767);
msg := to_char(sysdate,'YYYYMMDDHH24MISS')||','||current_session_user||','||client_host||','||client_ip||','||client_terminal ||','||sid_in||','||serial_num_in||','||client_module ;
 utl_file.put_line(v_file,msg);
 utl_file.fclose(v_file);

 exception
    when no_data_found
    then
    dbms_output.put_line('Your input find no session');
   when not_allowed
   then
    dbms_output.put_line('Hey! You are not allowed to kill this session.');
   when killed
   then
    dbms_output.put_line('This session has already been killed.');
   when others
   then
     declare
       error_code number        := SQLCODE;
       error_msg  varchar2(300) := SQLERRM;
     begin
       if error_code != 0 then
         dbms_output.put_line(error_msg);
         if UTL_FILE.IS_OPEN(v_file) then
           utl_file.fclose(v_file);
         end if;
       end if;
     end;
end;
/
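
A hedged example of how the AP team would actually use it (the schema, account names, and SID/SERIAL# are made up): the procedure owner grants EXECUTE, and the AP user calls it from SQL*Plus. Because the procedure runs with definer rights, the kill and the directory creation rely on the owner's directly granted privileges, not the caller's.

-- as the DBA / procedure owner
GRANT EXECUTE ON kill_session TO ap_user;

-- as the AP user
SET SERVEROUTPUT ON
EXEC dba_owner.kill_session('1234', '56789');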

T-SQL: Checking for Long Queries Against Your Own Thresholds

DECLARE slow_query_cursor CURSOR FORWARD_ONLY READ_ONLY FOR
SELECT      wait_type,
wait_time/1000 wait_time,
            status   ,
            session_id  ,
            cpu_time/1000 cpu_time  ,
            convert(varchar,start_time, 120) start_time  ,
            total_elapsed_time/1000 total_elapsed_time,
            name   
FROM        sys.dm_exec_requests r
                        LEFT JOIN sys.databases d ON (r.database_id=d.database_id)
WHERE       r.session_id > 50 AND r.session_id <> @@SPID   -- skip system sessions and this script itself
and         r.total_elapsed_time/1000 > 600                -- running longer than 10 minutes
and         r.wait_time/1000 > 60                          -- and waiting for more than 60 seconds
and         wait_type not like '%BROKER_TRANSMITTER%'
order by total_elapsed_time desc;

DECLARE @outputinfo varchar(MAX);
set   @outputinfo = N'N';
DECLARE @wait_type varchar(100);
DECLARE @wait_time bigint;
DECLARE @status varchar(100);
DECLARE @session_id int;
DECLARE @cpu_time bigint;
DECLARE @start_time varchar(100);
DECLARE @total_elapsed_time bigint;
DECLARE @name varchar(100);

OPEN slow_query_cursor;

FETCH NEXT
FROM slow_query_cursor
INTO @wait_type,@wait_time,@status,@session_id,@cpu_time,@start_time,@total_elapsed_time,@name;

WHILE @@FETCH_STATUS = 0
begin

set   @outputinfo =  @outputinfo + ':' + @name +'/' + @start_time + '/'+ cast(@session_id as varchar) + '/'+ cast(@total_elapsed_time as varchar) + '/' + cast(@cpu_time as varchar) + '/' + @status + '/' ;
set   @outputinfo =  @outputinfo + cast(@wait_time as varchar) +'/' + @wait_type;

   FETCH NEXT
   FROM slow_query_cursor
   INTO @wait_type,@wait_time,@status,@session_id,@cpu_time,@start_time,@total_elapsed_time,@name;
end

   -- Close the cursor
CLOSE slow_query_cursor;
-- Deallocate the cursor
DEALLOCATE slow_query_cursor;

if len(@outputinfo) = 1
   print N'Y';
else
   print  @outputinfo;
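
The cursor only captures the session_id and timing figures. If you also want to see what a flagged session is actually executing, a follow-up query along these lines should do it (123 is just an example session_id):

SELECT r.session_id, t.text
FROM   sys.dm_exec_requests r
CROSS APPLY sys.dm_exec_sql_text(r.sql_handle) t
WHERE  r.session_id = 123;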




Using SQL to Find the Top Waiting Query Among Postgres Queries Running Longer Than 10 Minutes

SET client_min_messages TO NOTICE;
do $$
declare
  version_check    boolean;
  r1               record;
  r2               record;
  outputinfo       varchar(80) :='N' ;
  waiting_threshold interval := '10 min';

begin

   -- text comparison of the server version: anything newer than 9.1 uses the newer pg_stat_activity columns (pid, query, state)
   SELECT substring((string_to_array(version(), ' '))[2] for 3) > '9.1' into version_check;
    if version_check
    then
      for r1 in SELECT date_trunc('second',current_timestamp - query_start) AS runtime,
                       datname, usename, substring(query for 20) query, pid
                FROM pg_stat_activity
                WHERE state = 'active'
                AND current_timestamp - query_start >= waiting_threshold
                -- and query !~* '(vacuum|analyze)'
                AND waiting
                ORDER BY 1 DESC limit 1
      loop
  outputinfo := '';
  outputinfo := outputinfo || cast(r1.pid as varchar) ||'/'||cast(r1.runtime as varchar)||'/'||r1.query ;
              for r2 in SELECT blocked_locks.pid     AS blocked_pid,
                               blocked_activity.usename  AS blocked_user,
                               blocking_locks.pid     AS blocking_pid,
                               blocking_activity.usename AS blocking_user,
                               blocked_activity.query    AS blocked_statement,
                               substring(blocking_activity.query for 20)  AS current_statement_in_blocking_process
                               FROM  pg_catalog.pg_locks         blocked_locks
                               JOIN pg_catalog.pg_stat_activity blocked_activity
   ON blocked_activity.pid = blocked_locks.pid
                               AND  blocked_activity.pid = r1.pid
                               JOIN pg_catalog.pg_locks         blocking_locks
                               ON blocking_locks.locktype = blocked_locks.locktype
                               AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE
                               AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
                               AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
                               AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
                               AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
                               AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
                               AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
                               AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
                               AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
                               AND blocking_locks.pid != blocked_locks.pid
                               JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
                               WHERE NOT blocked_locks.GRANTED

        loop
          outputinfo := outputinfo ||':' ||cast(r2.blocking_pid as varchar)||'/'||r2.current_statement_in_blocking_process;
    end loop;
        end loop;
    else
      for r1 in SELECT date_trunc('second',current_timestamp - query_start) AS runtime,
                       datname, usename, substring(current_query for 20) query, waiting, procpid
                FROM pg_stat_activity
                WHERE current_timestamp - query_start >= waiting_threshold
                -- and query !~* '(vacuum|analyze)'
                AND waiting
                ORDER BY 1 DESC limit 1
      loop
         outputinfo := '';
         outputinfo := outputinfo || cast(r1.procpid as varchar) ||'/'||cast(r1.runtime as varchar)||'/'||r1.query ;
              for r2 in SELECT blocked_locks.pid     AS blocked_pid,
                               blocked_activity.usename  AS blocked_user,
                               blocking_locks.pid     AS blocking_pid,
                               blocking_activity.usename AS blocking_user,
                               blocked_activity.current_query    AS blocked_statement,
                               substring(blocking_activity.current_query for 20)  AS current_statement_in_blocking_process
                               FROM  pg_catalog.pg_locks         blocked_locks
                               JOIN pg_catalog.pg_stat_activity blocked_activity
         ON blocked_activity.procpid = blocked_locks.pid
   and  blocked_activity.procpid = r1.procpid
                               JOIN pg_catalog.pg_locks         blocking_locks
                               ON blocking_locks.locktype = blocked_locks.locktype
                               AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE
                               AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
                               AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
                               AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
                               AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
                               AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
                               AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
                               AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
                               AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
                               AND blocking_locks.pid != blocked_locks.pid
                               JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.procpid = blocking_locks.pid
                               WHERE NOT blocked_locks.GRANTED

        loop
          outputinfo := outputinfo ||':' ||cast(r2.blocking_pid as varchar)||'/'||r2.current_statement_in_blocking_process;
    end loop;

       end loop;
    end if;
raise notice '%', outputinfo;
end$$;

Using SQL to Find the Single Longest-Running Postgres Query Over 10 Minutes

SET client_min_messages TO NOTICE;
do $$
declare
  version_check    boolean;
  r1               record;
  outputinfo       varchar(80) := 'N';
  waiting_threshold interval := '10 min';

begin

   -- text comparison of the server version, as in the previous script
   SELECT substring((string_to_array(version(), ' '))[2] for 3) > '9.1' into version_check;
    if version_check
    then
      for r1 in  SELECT  date_trunc('second',current_timestamp - query_start) AS runtime,
                            datname, usename,substring(query for 40) query,waiting,pid
                                                FROM pg_stat_activity
                                                WHERE state = 'active'
                                                AND current_timestamp - query_start >=  waiting_threshold
                                                and query !~* '(vacuum|analyze)'
                                                ORDER BY 1 DESC limit 1
            loop
                  outputinfo := '';
                  outputinfo := outputinfo || cast(r1.pid as varchar) ||'/'||cast(r1.runtime as varchar)||'/'||r1.query ;
            end loop;
    else
      for r1 in  SELECT  date_trunc('second',current_timestamp - query_start) AS runtime,
                            datname, usename, substring(current_query for 40) query,waiting,procpid
                                                FROM pg_stat_activity
                                                where current_timestamp - query_start >=  waiting_threshold
                                                and current_query !~* '(vacuum|analyze|idle)'
                                                ORDER BY 1 DESC limit 1
           loop
                 outputinfo := '';
                 outputinfo := outputinfo || cast(r1.procpid as varchar) ||'/'||cast(r1.runtime as varchar)||'/'||r1.query ;
           end loop;

    end if;
        raise notice '%', outputinfo;
end$$;

How to Use SQL to Search the Postgres Server Log for the Keywords FATAL, PANIC, or WARNING

select to_tsvector(pg_read_file(current_log, 0, size))  @@ to_tsquery('(FATAL|PANIC|WARNING)')   -- offset 0 = read from the start of the file
from (
 select case current_setting('log_directory') when 'pg_log' then current_setting('data_directory')||'/pg_log/'||file
                                               else current_setting('log_directory')||'/'||file end as current_log ,
 (pg_stat_file(current_setting('log_directory')||'/'||file)).modification,(pg_stat_file(current_setting('log_directory')||'/'||file)).size
 FROM  pg_ls_dir(current_setting('log_directory')||'/') as list(file)
  ORDER BY 2 DESC
  LIMIT 1 ) as latest_log;   -- the derived table needs an alias in Postgres
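
The query above only answers yes or no. If you also want to see the offending lines, a variant along the following lines should work (same latest-log subquery, with the file split into lines and filtered by a regular expression; treat it as a sketch, and like the original it needs superuser privileges):

select line
from (
 select regexp_split_to_table(pg_read_file(current_log, 0, size), E'\n') as line
 from (
  select case current_setting('log_directory') when 'pg_log' then current_setting('data_directory')||'/pg_log/'||file
                                                else current_setting('log_directory')||'/'||file end as current_log ,
  (pg_stat_file(current_setting('log_directory')||'/'||file)).modification,(pg_stat_file(current_setting('log_directory')||'/'||file)).size
  FROM  pg_ls_dir(current_setting('log_directory')||'/') as list(file)
   ORDER BY 2 DESC
   LIMIT 1 ) as latest_log
) as log_lines
where line ~ '(FATAL|PANIC|WARNING)';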

T-SQL: Checking SQL Server Login Password Expiration Dates

DECLARE password_expire_cursor CURSOR FORWARD_ONLY READ_ONLY FOR
SELECT  name,cast(LOGINPROPERTY(name, 'DaysUntilExpiration')  as int) expired_day
FROM    sys.sql_logins
WHERE is_policy_checked = 1
and   cast(LOGINPROPERTY(name, 'DaysUntilExpiration')  as int) > 0
and    cast(LOGINPROPERTY(name, 'DaysUntilExpiration')  as int) < 7
and   LOGINPROPERTY(name, 'PasswordLastSetTime') > '1900-01-01 08:00:00.000' ;
DECLARE @name varchar(100);
DECLARE @expired_day int;
DECLARE @outputinfo varchar(MAX);
set   @outputinfo = N'N';

-- Open the cursor
OPEN password_expire_cursor;
-- Retrieve one row at a time from the cursor
FETCH NEXT
FROM password_expire_cursor
INTO @name,@expired_day ;

WHILE @@FETCH_STATUS = 0
begin

set   @outputinfo =  @outputinfo + ':' + @name + '/'+ cast(@expired_day as char) ;

   FETCH NEXT
   FROM password_expire_cursor
   INTO @name,@expired_day;
end

   -- Close the cursor
CLOSE password_expire_cursor;
-- Deallocate the cursor
DEALLOCATE password_expire_cursor;

if len(@outputinfo) = 1
   print N'Y';
else
   print  @outputinfo;


T-SQL: Checking Free Space on Windows Logical Disks

IF OBJECT_ID('tempdb..#G_DB_FILE_LIST') IS NOT NULL
    DROP TABLE #G_DB_FILE_LIST;
create table #G_DB_FILE_LIST (current_db varchar(100), physical_name varchar(300));

exec sp_MSforeachdb 'use [?];

begin
INSERT  #G_DB_FILE_LIST select db_name() , physical_name From  sys.database_files;
end'
DECLARE logic_disk_cursor CURSOR FORWARD_ONLY READ_ONLY FOR
SELECT  distinct current_db, substring(physical_name,1,1) as logic_disk FROM #G_DB_FILE_LIST;
DECLARE @logic_disk_free_size TABLE (logic_disk char, free_mb int, warning_flag char)
DECLARE @current_db varchar(100);
DECLARE @logic_disk char;
DECLARE @cnt int;
DECLARE @outputinfo varchar(MAX);
DECLARE @free_mb int;
set   @outputinfo = N'N';

Declare outputinfo_cursor   CURSOR FORWARD_ONLY READ_ONLY FOR
select logic_disk, free_mb from @logic_disk_free_size where warning_flag = '1';


insert @logic_disk_free_size(logic_disk , free_mb ) exec MASTER..xp_fixeddrives;

-- Open the cursor
OPEN logic_disk_cursor;
-- Retrieve one row at a time from the cursor
FETCH NEXT
FROM logic_disk_cursor
INTO @current_db,@logic_disk;

-- Process and retrieve new rows until no more are available
WHILE @@FETCH_STATUS = 0
  BEGIN
    select  @cnt = count(*) from  @logic_disk_free_size where logic_disk = @logic_disk and free_mb <= 20480;   -- flag drives with 20 GB or less free
   if @cnt >= 1
    update @logic_disk_free_size set warning_flag = '1' where logic_disk = @logic_disk;

   FETCH NEXT
   FROM logic_disk_cursor
   INTO @current_db,@logic_disk;
  END
-- Close the cursor
CLOSE logic_disk_cursor;
-- Deallocate the cursor
DEALLOCATE logic_disk_cursor;

OPEN outputinfo_cursor;

FETCH NEXT
FROM outputinfo_cursor
INTO @logic_disk,@free_mb;

WHILE @@FETCH_STATUS = 0
begin

set   @outputinfo =  @outputinfo + ':' + @logic_disk + '/'+ cast(@free_mb as varchar) ;

   FETCH NEXT
   FROM outputinfo_cursor
   INTO @logic_disk,@free_mb;
end

   -- Close the cursor
CLOSE outputinfo_cursor;
-- Deallocate the cursor
DEALLOCATE outputinfo_cursor;

if len(@outputinfo) = 1
   print N'Y';
else
   print  @outputinfo;
select * from   @logic_disk_free_size
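
On SQL Server 2008 R2 SP1 and later there is a simpler alternative to xp_fixeddrives (which only reports drive letters): sys.dm_os_volume_stats also covers mount points. A hedged sketch:

SELECT DISTINCT vs.volume_mount_point,
       vs.available_bytes / 1048576 AS free_mb
FROM   sys.master_files mf
CROSS APPLY sys.dm_os_volume_stats(mf.database_id, mf.file_id) vs;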

T-SQL: Reading the SQL Server Error Log

Can't you just view the log in SSMS? Yes, of course, but when the log contains thousands or tens of thousands of entries, how long are you going to spend dragging the scroll bar? In my DBA career I have never had the luxury of managing only a handful of databases; if you are that lucky, go ahead and scroll away. Here is a capture from one of my systems: two "login failed" entries thrown every minute. I verified the client address, and it is the AP server's IP, but I have no idea why they log in to the database as sa. When I asked the AP team, they said, "No, we never use the sa account..." Well, it's the customer's outsourced vendor, so what can you say?
[Screenshot: SQL Server error log]
I googled the original script and tweaked it until it became my own. If you need it, feel free to take it and adapt it to your own requirements.

DECLARE @filecnt int
DECLARE @tbFile TABLE (myfilecnt int, myfiledate datetime, myfilesize int)
DECLARE @tbLog TABLE (rid INT IDENTITY, LogDate datetime, ProcessInfo nvarchar(10),Text nvarchar(3950))

-- enumerate the current and archived error log files
INSERT @tbFile EXEC master.dbo.sp_enumerrorlogs

SELECT @filecnt = MAX(myfilecnt) FROM @tbFile

-- walk from the oldest archive back to the current log, keeping only the keywords we care about
WHILE (@filecnt >= 0)
BEGIN
 INSERT @tbLog EXEC master.dbo.xp_readerrorlog @filecnt,1,N'錯誤'
 INSERT @tbLog EXEC master.dbo.xp_readerrorlog @filecnt,1,N'Failed'
 INSERT @tbLog EXEC master.dbo.xp_readerrorlog 0,1,N'Server is listening on'
 SET @filecnt = @filecnt - 1
END

SELECT LogDate N'記錄時間', ProcessInfo N'來源', Text N'訊息'
FROM @tbLog
where logdate between getdate() -1.5 and getdate()
and  processinfo not in ('登入' ,'Logon')
ORDER BY rid DESC

杜書伍 on "the civilization of beautiful laziness": he is right. It is precisely because we are lazy that we look for ways to get rid of tedious work. Progress comes from human nature, and laziness is human nature. Who isn't bothered by hassle?

How to Do the Most Basic Routine Checks on Postgres


Postgres is an RDBMS that writes almost everything to its log files, so for day-to-day operations I honestly would not know how to manage my Postgres instances without a tool to analyze those logs. The tool I use is pgBadger 3.5; if you are interested, ask Google and you will find everything you need. When you open a pgBadger report, the list of available reports is right at the top.

pgBadger: PostgreSQL log analyzer


Normalized reports are marked with a "(N)".

Each report section is separated by a pipe symbol. It is not realistic to go through every section during a daily check; from personal experience, I only look at the following reports:

Screenshots of these report tables would come out at too low a resolution here to convey what I want to show, so I grabbed the HTML code of the reports above, trimmed it, and pasted it below. When you view a pgBadger report on a desktop screen it certainly does not look this rough.

Let's look at the reports. They are quite self-explanatory.

Most frequent events (N)

Rank 1: FATAL: sorry, too many clients already
Rank 2: ERROR: could not serialize access due to concurrent update
Rank 3: ERROR: duplicate key value violates unique constraint "..."


Most frequent waiting queries (N)

Rank 1 | Count: 2 | Total wait time: 16.695s | Min/Max/Avg duration: 8.321s/8.374s/4.187s
INSERT into public.trans_log (card_nbr,header_termino,merchant_id,message_type,process_code,prog_name,receive_date,resp_code,rrn,system_ip,term_id,trans_date,id )
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$10,$10,$10);

Queries that waited the most

Rank 1 | Wait time: 8.374s
INSERT into public.trans_log (card_nbr,header_termino,merchant_id,message_type,process_code,prog_name,receive_date,resp_code,rrn,system_ip,term_id,trans_date,id )
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$10,$10,$10);

Rank 2 | Wait time: 8.321s
INSERT into public.trans_log (card_nbr,header_termino,merchant_id,message_type,process_code,prog_name,receive_date,resp_code,rrn,system_ip,term_id,trans_date,id )
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$10,$10,$10);

Slowest queries

Rank 1 | Duration: 14.598s
SELECT count(*) FROM (
  SELECT DISTINCT s_time, e_time, request_id, STATUS
  FROM common_status )
WHERE STATUS = 'Success'
AND to_char(s_time,'yyyy/MM/dd') = to_char(now() - interval '1 days','yyyy/MM/dd');

Rank 2 | Duration: 14.397s
SELECT fun_id, count(fun_id)
FROM ( SELECT fun_id, s_time
       FROM ( SELECT DISTINCT s_time, e_time, request_id, fun_id
              FROM common_status )
       WHERE to_char(s_time,'yyyy/MM/dd') = to_char(now() - interval '1 days','yyyy/MM/dd') )
GROUP BY fun_id;

Hourly Statistics

During the daily check I only look at the checkpoint sync figures here, because they relate to overall I/O. The chart below has been trimmed; the full report is not this compact.
[Chart: checkpoint sync time]
This is a shell script I run from cron. It generates the report covering 08:00 yesterday through 08:00 today, so when I get to the office I can see how the system behaved the previous day.

LOG_DIR=/data/pg_log
BADGER_HOME=~/steven/ma/pgbadger-3.5
LOG_DATE=`date +%Y%m%d`

LOG_BEGIN="$(date --date='1 days ago' +%Y-%m-%d) 08:00:00"
LOG_END="$(date  +%Y-%m-%d) 08:00:00"
PREV_DATE=$(date --date='1 days ago' +%Y-%m-%d)
CURR_DATE=`date +%Y-%m-%d`
$BADGER_HOME/pgbadger -f stderr -b "$LOG_BEGIN" -e "$LOG_END" -G  -o ~/steven/log/YOURDB."$LOG_DATE".html $LOG_DIR/*$PREV_DATE*.log $LOG_DIR/*$CURR_DATE*.log

cd ~/steven/log/
HOST="your ftp host"
USER="your ftp account"
PASSWD="your ftp account password"
/usr/bin/ftp -nv <<!x
open $HOST
user $USER $PASSWD
prompt
cd pgbager_parse
binary
mput YOURDB.$LOG_DATE.html
bye
!x

SQL Is a Language That Works on Sets of Data

That statement is basically true, but it describes how SQL processes data. Many programmers, for convenience, like to cram all of their logic into a single SQL statement, and the result is a program that runs erratically: every so often it simply cannot finish within the expected window. The case below happened on Postgres. The developer had wrapped a subquery inside a WHERE condition; the data showed it had been running for 46 minutes and 11 seconds without returning any rows, so the developer had to kill it. Because it never finished, even with the auto_explain module enabled and the relevant parameters set, no execution plan or statistics were ever written to the log, which made this case very hard to debug at the time.
[Screenshot: auto_explain settings and the killed query]
With no other data to go on, all I could do was fall back on the most basic tuning principle and ask the developer to rewrite the statement: simplify the logic. I asked him to turn that subquery into a loop, feeding each value from the original subquery's result set, one at a time, into the right-hand side of the WHERE condition, instead of letting the RDBMS map the two data sets against each other. He did ask whether it could be rewritten with a WITH clause (CTE) instead. If you understand how the database works, a CTE still leaves the data matching to the database engine, so I did not expect any improvement, though I kept that to myself. He said he wanted to test it, and I said go ahead. After testing he came back and confirmed that the WITH rewrite did not solve the problem, so he would try my suggestion.
They then coordinated with the owners of the related systems and ran two batches on the production system during the time window in which the problem used to occur: one batch of roughly 80,000 rows and one of nearly 50,000 rows, which completed in about 4 minutes and 1.5 minutes respectively. So the rewrite I suggested did help. Below is their rewritten program.
[Screenshot: auto_explain output for the rewritten loop]
From the figures the system logged, each loop iteration took about 40 ms, but it clearly did not run 50,000 iterations for 50,000 rows. Looking further down, the right-hand side of store_id is now a single value, which confirms the matching is no longer done by the RDBMS. What I want to remind developers of is this: writing programs that are correct and stable is your responsibility. When execution times become erratic, first ask yourself whether the problematic SQL statement can be simplified. Do not be so lazy that you throw everything at the database and expect it to sort things out.
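
The rewritten code itself only survives as a screenshot, but the pattern looks roughly like this minimal PL/pgSQL sketch (all table and column names other than store_id are hypothetical):

DO $$
DECLARE
  r record;
BEGIN
  -- the old subquery now runs once, on its own ...
  FOR r IN SELECT DISTINCT store_id FROM staging_orders LOOP
    -- ... and each value is fed in as a single literal on the right-hand side,
    -- so the RDBMS no longer has to map the two result sets against each other
    UPDATE sales_summary
       SET processed = true
     WHERE store_id = r.store_id;
  END LOOP;
END $$;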