閱讀順序
《Postgresql源碼(30)Postgresql索引基礎B-linked-tree》
《Postgresql源碼(31)Btree索引相關系統表和整體結構》
《Postgresql源碼(32)Btree索引分裂前後結構差異對比》
《Postgresql源碼(33)Btree索引讀——整體流程&_bt_first》
《Postgresql源碼(34)Btree索引讀——_bt_first搜尋部分分析》
《Postgresql源碼(36)Btree索引讀——_bt_next搜尋部分分析》
總結
- BTScanPosData會在so->currPos->items緩存目前查詢頁面的ctid。每次查詢完一個頁面,會使用_bt_steppage更新currPos的内容。
- 葉子頁面加鎖特點:_bt_steppage會使用
打開下一個頁面,加上鎖之後會開始檢查資料,把符合要求的資料的heap tid記錄到so->currPos->items中。同一時間隻會持一把鎖。_bt_readnextpage
- branch頁面加鎖特點:btree在爬非葉子節點的時候,從root到leaf的過程是用_bt_relandgetbuf加鎖的,也就是先放一個老的鎖,在加下一個頁面的鎖。同一時間隻會持一把鎖。(
)_bt_search
頁面結構和預期
繼續分析34篇提到的這條SQL
用于分析的SQL預期:_bt_next會掃過1、2、4三個leaf頁面
-- 起始 > (1,19) 終止 < (3,59)
-- PAGE1:掃描1條:itemoffset 141-141
-- PAGE2:掃描140條:itemoffset 2-141
-- PAGE4:掃描139條:itemoffset 2-140
select * from t81 where id>139 and id<419;
複制
資料建構
create table t81(id int, info text);
create index idx_t81_id_info on t81(id,info);
insert into t81 select generate_series(1,149370), md5(random()::text);
select * from bt_metap('idx_t81_id_info');
magic | version | root | level | fastroot | fastlevel
--------+---------+------+-------+----------+-----------
340322 | 2 | 116 | 2 | 116 | 2
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 116);
itemoffset | ctid
------------+----------
1 | (3,1)
2 | (115,1)
3 | (227,1)
4 | (338,1)
5 | (449,1)
6 | (560,1)
7 | (671,1)
8 | (782,1)
9 | (893,1)
10 | (1004,1)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 1);
itemoffset | ctid
------------+---------
1 | (1,21)
2 | (0,1)
3 | (0,2)
4 | (0,3)
5 | (0,4)
...
140 | (1,19) <------------------------ start
141 | (1,20)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 2);
itemoffset | ctid
------------+---------
1 | (2,41)
2 | (1,21)
3 | (1,22)
4 | (1,23)
5 | (1,24)
6 | (1,25)
...
140 | (2,39)
141 | (2,40)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 4);
itemoffset | ctid
------------+---------
1 | (3,61)
2 | (2,41)
3 | (2,42)
4 | (2,43)
5 | (2,44)
6 | (2,45)
7 | (2,46)
...
140 | (3,59) <------------------------ end
141 | (3,60)
select * from t81 where ctid='(1,19)';
id | info
-----+----------------------------------
143 | 2c33d634b63de9c16306ac81abd2dcf2
select * from t81 where ctid='(3,59)';
id | info
-----+----------------------------------
423 | d13e311ad061155067e44af2c655d5b2
-- 起始 > (1,19) 終止 < (3,59)
-- PAGE1:掃描1條:itemoffset 141-141
-- PAGE2:掃描140條:itemoffset 2-141
-- PAGE4:掃描139條:itemoffset 2-140
select * from t81 where id>139 and id<419;
複制
前面《Postgresql源碼(33)Btree索引讀——整體流程&_bt_first》
已經對定位初始頁做了分析,這裡已經拿到了初始ctid,繼續分析後面的搜尋流程。
重點關注scan過程用到的資料結構。
_bt_next
b PortalRun
b _bt_next
在_bt_first執行後,BTScanOpaque的資料已經完整,這裡緩存了查詢下一條所需要的全部資料。
函數流程比較簡單:
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTScanPosItem *currItem;
/*
* Advance to next tuple on current page; or if there's no more, try to
* step to the next page with data.
*/
if (ScanDirectionIsForward(dir))
{
if (++so->currPos.itemIndex > so->currPos.lastItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
else
{
if (--so->currPos.itemIndex < so->currPos.firstItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
複制
這裡重點看下BTScanOpaque用到的currPos資料結構和目前内容。
typedef struct BTScanOpaqueData
{
...
BTScanPosData currPos; /* current position data */
...
} BTScanOpaqueData;
複制
BTScanPosData
typedef struct BTScanPosData
{
Buffer buf; /* if valid, the buffer is pinned */
XLogRecPtr lsn; /* pos in the WAL stream when page was read */
BlockNumber currPage; /* page referenced by items array */
BlockNumber nextPage; /* page's right link when we scanned it */
/*
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
* We can clear the appropriate one of these flags when _bt_checkkeys()
* returns continuescan = false.
*/
bool moreLeft;
bool moreRight;
/*
* If we are doing an index-only scan, nextTupleOffset is the first free
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
/*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
* array back-to-front, so we start at the last slot and fill downwards.
* Hence we need both a first-valid-entry and a last-valid-entry counter.
* itemIndex is a cursor showing which entry was last returned to caller.
*/
int firstItem; /* first valid index in items[] */
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;
typedef struct BTScanPosItem /* what we remember about each match */
{
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem;
複制
執行SQL後,第一次_bt_next
-- 起始 > (1,19) 終止 < (3,59)
-- PAGE1:掃描1條:itemoffset 141-141
-- PAGE2:掃描140條:itemoffset 2-141
-- PAGE4:掃描139條:itemoffset 2-140
select * from t81 where id>139 and id<419;
複制
這裡應該掃描PAGE1的最後一條:
(gdb) p so->currPos
{
buf = 150,
lsn = 128202491872,
currPage = 1,
nextPage = 2,
moreLeft = 0 '\000',
moreRight = 1 '\001',
nextTupleOffset = 48,
firstItem = 0,
lastItem = 0,
itemIndex = 0,
items = {
{heapTid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 20}, indexOffset = 141, tupleOffset = 0}}
}
複制
注意第一次執行後,會進入
_bt_steppage
緩存下一頁的資料,下一次執行_bt_next:
(gdb) p so->currPos
{
buf = 152,
lsn = 128202499320,
currPage = 2,
nextPage = 4,
moreLeft = 1 '\001',
moreRight = 1 '\001',
nextTupleOffset = 6720,
firstItem = 0,
lastItem = 139,
itemIndex = 0,
items = {
{heapTid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 21}, indexOffset = 2, tupleOffset = 0},
{heapTid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 22}, indexOffset = 3, tupleOffset = 48},
{heapTid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 23}, indexOffset = 4, tupleOffset = 96},
...
}
複制
取數循環
ExecutePlan:外層循環,每次拿一條
/*
* Loop until we've processed the proper number of tuples from the plan.
*/
for (;;)
ExecProcNode
ExecIndexOnlyScan
ExecScan
ExecScanFetch
IndexOnlyNext:内部拼TupleTableSlot傳回
index_getnext_tid
btgettuple
_bt_next
(1)按so->currPos.items找到目前緩存的heaptid
(2)記錄scan->xs_itup:{t_tid = {ip_blkid = {bi_hi = 0, bi_lo = 1}, ip_posid = 22}, t_info = 16432}
index_fetch_heap
StoreIndexTuple
複制