What kind of structure is pgsql's FSM?

1# Posted on 2011-04-04 14:38
So far I only understand that the FSM is a binary tree, and that its pages also form a tree.

What does MaxFSMRequestSize do?
#include "postgres.h"

#include "access/htup.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/freespace.h"
#include "storage/fsm_internals.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/smgr.h"
#include "utils/rel.h"

/*
 * We use just one byte to store the amount of free space on a page, so we
 * divide the amount of free space a page can have into 256 different
 * categories. The highest category, 255, represents a page with at least
 * MaxFSMRequestSize bytes of free space, and the second highest category
 * represents the range from 254 * FSM_CAT_STEP, inclusive, to
 * MaxFSMRequestSize, exclusive.
 *
 * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
 * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
 * categories look like this:
 *
 * Range        Category
 * 0    - 31        0
 * 32   - 63        1
 * ...    ...     ...
 * 8096 - 8127    253
 * 8128 - 8163    254
 * 8164 - 8192    255
 *
 * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
 * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
 * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
 * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
 * completely empty page, that would mean that we could never satisfy a
 * request of exactly MaxFSMRequestSize bytes.
 */
#define FSM_CATEGORIES     256
#define FSM_CAT_STEP       (BLCKSZ / FSM_CATEGORIES)
#define MaxFSMRequestSize  MaxHeapTupleSize

/*
 * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
 * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
 * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
 * this means that 4096 bytes is the smallest BLCKSZ that we can get away
 * with a 3-level tree, and 512 is the smallest we support.
 */
#define FSM_TREE_DEPTH   ((SlotsPerFSMPage >= 1626) ? 3 : 4)

#define FSM_ROOT_LEVEL   (FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0

/*
 * The internal FSM routines work on a logical addressing scheme. Each
 * level of the tree can be thought of as a separately addressable file.
 */
typedef struct
{
    int     level;          /* level */
    int     logpageno;      /* page number within the level */
} FSMAddress;

/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};

/* functions to navigate the tree */
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
static BlockNumber fsm_logical_to_physical(FSMAddress addr);

static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);

/* functions to convert amount of free space to a FSM category */
static uint8 fsm_space_avail_to_cat(Size avail);
static uint8 fsm_space_needed_to_cat(Size needed);
static Size fsm_space_cat_to_avail(uint8 cat);

/* workhorse functions for various operations */
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
                              uint8 newValue, uint8 minValue);
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);

/******** Public API ********/

/*
 * GetPageWithFreeSpace - try to find a page in the given relation with
 *        at least the specified amount of free space.
 *
 * If successful, return the block number; if not, return InvalidBlockNumber.
 *
 * The caller must be prepared for the possibility that the returned page
 * will turn out to have too little space available by the time the caller
 * gets a lock on it.  In that case, the caller should report the actual
 * amount of free space available on that page and then try again (see
 * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
 * extend the relation.
 */
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
{
    uint8   min_cat = fsm_space_needed_to_cat(spaceNeeded);

    return fsm_search(rel, min_cat);
}

/*
 * RecordAndGetPageWithFreeSpace - update info about a page and try again.
 *
 * We provide this combo form to save some locking overhead, compared to
 * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
 * also some effort to return a page close to the old page; if there's a
 * page with enough free space on the same FSM page where the old page
 * is located, it is preferred.
 */
BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
                              Size oldSpaceAvail, Size spaceNeeded)
{
    int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
    int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
    FSMAddress  addr;
    uint16      slot;
    int         search_slot;

    /* Get the location of the FSM byte representing the heap block */
    addr = fsm_get_location(oldPage, &slot);

    search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);

    /*
     * If fsm_set_and_search found a suitable new block, return that.
     * Otherwise, search as usual.
     */
    if (search_slot != -1)
        return fsm_get_heap_blk(addr, search_slot);
    else
        return fsm_search(rel, search_cat);
}

/*
 * RecordPageWithFreeSpace - update info about a page.
 *
 * Note that if the new spaceAvail value is higher than the old value stored
 * in the FSM, the space might not become visible to searchers until the next
 * FreeSpaceMapVacuum call, which updates the upper level pages.
 */
void
RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
{
    int         new_cat = fsm_space_avail_to_cat(spaceAvail);
    FSMAddress  addr;
    uint16      slot;

    /* Get the location of the FSM byte representing the heap block */
    addr = fsm_get_location(heapBlk, &slot);

    fsm_set_and_search(rel, addr, slot, new_cat, 0);
}

/*
 * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
 *        WAL replay
 */

/*
 * GetRecordedFreeSpace - return the amount of free space on a particular page,
 *        according to the FSM.
 */
Size
GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
{
    FSMAddress  addr;
    uint16      slot;
    Buffer      buf;
    uint8       cat;

    /* Get the location of the FSM byte representing the heap block */
    addr = fsm_get_location(heapBlk, &slot);

    buf = fsm_readbuf(rel, addr, false);
    if (!BufferIsValid(buf))
        return 0;
    cat = fsm_get_avail(BufferGetPage(buf), slot);
    ReleaseBuffer(buf);

    return fsm_space_cat_to_avail(cat);
}

/*
 * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
 */
void
FreeSpaceMapVacuum(Relation rel)
{
    bool    dummy;

    /*
     * Traverse the tree in depth-first order. The tree is stored physically
     * in depth-first order, so this should be pretty I/O efficient.
     */
    fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
}

/******** Internal routines ********/

/*
 * Return the category corresponding to x bytes of free space.
 */
static uint8
fsm_space_avail_to_cat(Size avail)
{
    int     cat;

    Assert(avail < BLCKSZ);

    if (avail >= MaxFSMRequestSize)
        return 255;

    cat = avail / FSM_CAT_STEP;

    /*
     * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
     * more.
     */
    if (cat > 254)
        cat = 254;

    return (uint8) cat;
}

/*
 * Return the lower bound of the range of free space represented by given
 * category.
 */
static Size
fsm_space_cat_to_avail(uint8 cat)
{
    /* The highest category represents exactly MaxFSMRequestSize bytes. */
    if (cat == 255)
        return MaxFSMRequestSize;
    else
        return cat * FSM_CAT_STEP;
}

/*
 * Which category does a page need to have, to accommodate x bytes of data?
 * While fsm_space_avail_to_cat() rounds down, this needs to round up.
 */
static uint8
fsm_space_needed_to_cat(Size needed)
{
    int     cat;

    /* Can't ask for more space than the highest category represents */
    if (needed > MaxFSMRequestSize)
        elog(ERROR, "invalid FSM request size %lu",
             (unsigned long) needed);

    if (needed == 0)
        return 1;

    cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;

    if (cat > 255)
        cat = 255;

    return (uint8) cat;
}

/*
 * Returns the physical block number of an FSM page.
 */
static BlockNumber
fsm_logical_to_physical(FSMAddress addr)
{
    BlockNumber pages;
    int         leafno;
    int         l;

    /*
     * Calculate the logical page number of the first leaf page below the
     * given page.
     */
    leafno = addr.logpageno;
    for (l = 0; l < addr.level; l++)
        leafno *= SlotsPerFSMPage;

    /* Count upper level nodes required to address the leaf page */
    pages = 0;
    for (l = 0; l < FSM_TREE_DEPTH; l++)
    {
        pages += leafno + 1;
        leafno /= SlotsPerFSMPage;
    }

    /*
     * If the page we were asked for wasn't at the bottom level, subtract the
     * additional lower level pages we counted above.
     */
    pages -= addr.level;

    /* Turn the page count into 0-based block number */
    return pages - 1;
}

/*
 * Return the FSM location corresponding to given heap block.
 */
static FSMAddress
fsm_get_location(BlockNumber heapblk, uint16 *slot)
{
    FSMAddress  addr;

    addr.level = FSM_BOTTOM_LEVEL;
    addr.logpageno = heapblk / SlotsPerFSMPage;
    *slot = heapblk % SlotsPerFSMPage;

    return addr;
}

/*
 * Return the heap block number corresponding to given location in the FSM.
 */
static BlockNumber
fsm_get_heap_blk(FSMAddress addr, uint16 slot)
{
    Assert(addr.level == FSM_BOTTOM_LEVEL);
    return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
}

/*
 * Given a logical address of a child page, get the logical page number of
 * the parent, and the slot within the parent corresponding to the child.
 */
static FSMAddress
fsm_get_parent(FSMAddress child, uint16 *slot)
{
    FSMAddress  parent;

    Assert(child.level < FSM_ROOT_LEVEL);

    parent.level = child.level + 1;
    parent.logpageno = child.logpageno / SlotsPerFSMPage;
    *slot = child.logpageno % SlotsPerFSMPage;

    return parent;
}

/*
 * Given a logical address of a parent page, and a slot number, get the
 * logical address of the corresponding child page.
 */
static FSMAddress
fsm_get_child(FSMAddress parent, uint16 slot)
{
    FSMAddress  child;

    Assert(parent.level > FSM_BOTTOM_LEVEL);

    child.level = parent.level - 1;
    child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;

    return child;
}

/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
    BlockNumber blkno = fsm_logical_to_physical(addr);
    Buffer      buf;

    RelationOpenSmgr(rel);

    /*
     * If we haven't cached the size of the FSM yet, check it first.  Also
     * recheck if the requested block seems to be past end, since our cached
     * value might be stale.  (We send smgr inval messages on truncation, but
     * not on extension.)
     */
    if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
        blkno >= rel->rd_smgr->smgr_fsm_nblocks)
    {
        if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
            rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
                                                         FSM_FORKNUM);
        else
            rel->rd_smgr->smgr_fsm_nblocks = 0;
    }

    /* Handle requests beyond EOF */
    if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
    {
        if (extend)
            fsm_extend(rel, blkno + 1);
        else
            return InvalidBuffer;
    }

    /*
     * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
     * information is not accurate anyway, so it's better to clear corrupt
     * pages than error out. Since the FSM changes are not WAL-logged, the
     * so-called torn page problem on crash can lead to pages with corrupt
     * headers, for example.
     */
    buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
    if (PageIsNew(BufferGetPage(buf)))
        PageInit(BufferGetPage(buf), BLCKSZ, 0);
    return buf;
}

/*
 * Ensure that the FSM fork is at least fsm_nblocks long, extending
 * it if necessary with empty pages. And by empty, I mean pages filled
 * with zeros, meaning there's no free space.
 */
static void
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
{
    BlockNumber fsm_nblocks_now;
    Page        pg;

    pg = (Page) palloc(BLCKSZ);
    PageInit(pg, BLCKSZ, 0);

    /*
     * We use the relation extension lock to lock out other backends trying to
     * extend the FSM at the same time. It also locks out extension of the
     * main fork, unnecessarily, but extending the FSM happens seldom enough
     * that it doesn't seem worthwhile to have a separate lock tag type for
     * it.
     *
     * Note that another backend might have extended or created the relation
     * by the time we get the lock.
     */
    LockRelationForExtension(rel, ExclusiveLock);

    /* Might have to re-open if a cache flush happened */
    RelationOpenSmgr(rel);

    /*
     * Create the FSM file first if it doesn't exist.  If smgr_fsm_nblocks is
     * positive then it must exist, no need for an smgrexists call.
     */
    if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
         rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
        !smgrexists(rel->rd_smgr, FSM_FORKNUM))
        smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);

    fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);

    while (fsm_nblocks_now < fsm_nblocks)
    {
        smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
                   (char *) pg, rel->rd_istemp);
        fsm_nblocks_now++;
    }

    /* Update local cache with the up-to-date size */
    rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;

    UnlockRelationForExtension(rel, ExclusiveLock);

    pfree(pg);
}

/*
 * Set value in given FSM page and slot.
 *
 * If minValue > 0, the updated page is also searched for a page with at
 * least minValue of free space. If one is found, its slot number is
 * returned, -1 otherwise.
 */
static int
fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
                   uint8 newValue, uint8 minValue)
{
    Buffer  buf;
    Page    page;
    int     newslot = -1;

    buf = fsm_readbuf(rel, addr, true);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    page = BufferGetPage(buf);

    if (fsm_set_avail(page, slot, newValue))
        MarkBufferDirty(buf);

    if (minValue != 0)
    {
        /* Search while we still hold the lock */
        newslot = fsm_search_avail(buf, minValue,
                                   addr.level == FSM_BOTTOM_LEVEL,
                                   true);
    }

    UnlockReleaseBuffer(buf);

    return newslot;
}

/*
 * Search the tree for a heap page with at least min_cat of free space
 */
static BlockNumber
fsm_search(Relation rel, uint8 min_cat)
{
    int         restarts = 0;
    FSMAddress  addr = FSM_ROOT_ADDRESS;

    for (;;)
    {
        int     slot;
        Buffer  buf;
        uint8   max_avail = 0;

        /* Read the FSM page. */
        buf = fsm_readbuf(rel, addr, false);

        /* Search within the page */
        if (BufferIsValid(buf))
        {
            LockBuffer(buf, BUFFER_LOCK_SHARE);
            slot = fsm_search_avail(buf, min_cat,
                                    (addr.level == FSM_BOTTOM_LEVEL),
                                    false);
            if (slot == -1)
                max_avail = fsm_get_max_avail(BufferGetPage(buf));
            UnlockReleaseBuffer(buf);
        }
        else
            slot = -1;

        if (slot != -1)
        {
            /*
             * Descend the tree, or return the found block if we're at the
             * bottom.
             */
            if (addr.level == FSM_BOTTOM_LEVEL)
                return fsm_get_heap_blk(addr, slot);

            addr = fsm_get_child(addr, slot);
        }
        else if (addr.level == FSM_ROOT_LEVEL)
        {
            /*
             * At the root, failure means there's no page with enough free
             * space in the FSM. Give up.
             */
            return InvalidBlockNumber;
        }
        else
        {
            uint16      parentslot;
            FSMAddress  parent;

            /*
             * At lower level, failure can happen if the value in the upper-
             * level node didn't reflect the value on the lower page. Update
             * the upper node, to avoid falling into the same trap again, and
             * start over.
             *
             * There's a race condition here, if another backend updates this
             * page right after we release it, and gets the lock on the parent
             * page before us. We'll then update the parent page with the now
             * stale information we had. It's OK, because it should happen
             * rarely, and will be fixed by the next vacuum.
             */
            parent = fsm_get_parent(addr, &parentslot);
            fsm_set_and_search(rel, parent, parentslot, max_avail, 0);

            /*
             * If the upper pages are badly out of date, we might need to loop
             * quite a few times, updating them as we go. Any inconsistencies
             * should eventually be corrected and the loop should end. Looping
             * indefinitely is nevertheless scary, so provide an emergency
             * valve.
             */
            if (restarts++ > 10000)
                return InvalidBlockNumber;

            /* Start search all over from the root */
            addr = FSM_ROOT_ADDRESS;
        }
    }
}


/*
 * Recursive guts of FreeSpaceMapVacuum
 */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
    Buffer  buf;
    Page    page;
    uint8   max_avail;

    /* Read the page if it exists, or return EOF */
    buf = fsm_readbuf(rel, addr, false);
    if (!BufferIsValid(buf))
    {
        *eof_p = true;
        return 0;
    }
    else
        *eof_p = false;

    page = BufferGetPage(buf);

    /*
     * Recurse into children, and fix the information stored about them at
     * this level.
     */
    if (addr.level > FSM_BOTTOM_LEVEL)
    {
        int     slot;
        bool    eof = false;

        for (slot = 0; slot < SlotsPerFSMPage; slot++)
        {
            int     child_avail;

            CHECK_FOR_INTERRUPTS();

            /* After we hit end-of-file, just clear the rest of the slots */
            if (!eof)
                child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
            else
                child_avail = 0;

            /* Update information about the child */
            if (fsm_get_avail(page, slot) != child_avail)
            {
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                fsm_set_avail(BufferGetPage(buf), slot, child_avail);
                MarkBufferDirty(buf);
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            }
        }
    }

    max_avail = fsm_get_max_avail(BufferGetPage(buf));

    /*
     * Reset the next slot pointer. This encourages the use of low-numbered
     * pages, increasing the chances that a later vacuum can truncate the
     * relation.
     */
    ((FSMPage) PageGetContents(page))->fp_next_slot = 0;

    ReleaseBuffer(buf);

    return max_avail;
}

2# Posted on 2011-04-04 14:45
Free Space Map
--------------

The purpose of the free space map is to quickly locate a page with enough
free space to hold a tuple to be stored; or to determine that no such page
exists and the relation must be extended by one page.  As of PostgreSQL 8.4
each relation has its own, extensible free space map stored in a separate
"fork" of its relation.  This eliminates the disadvantages of the former
fixed-size FSM.

It is important to keep the map small so that it can be searched rapidly.
Therefore, we don't attempt to record the exact free space on a page.
We allocate one map byte to each page, allowing us to record free space
at a granularity of 1/256th of a page.  Another way to say it is that
the stored value is the free space divided by BLCKSZ/256 (rounding down).
We assume that the free space must always be less than BLCKSZ, since
all pages have some overhead; so the maximum map value is 255.
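
To make the arithmetic concrete, here is a small standalone sketch of my own (not part of
freespace.c) mirroring the rounding rules of fsm_space_avail_to_cat() and
fsm_space_needed_to_cat() from the source pasted in the first post, assuming the default
8 kB BLCKSZ and taking 8164 bytes as one possible value of MaxFSMRequestSize (the exact
value depends on alignment):

/* Standalone sketch of the FSM category arithmetic; not the real freespace.c. */
#include <stdio.h>
#include <stdint.h>

#define BLCKSZ          8192
#define FSM_CATEGORIES  256
#define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)   /* 32 bytes per category */
#define MAX_FSM_REQUEST 8164    /* assumed MaxFSMRequestSize for 8 kB pages */

/* Record direction: round the actual free space DOWN to a category. */
static uint8_t avail_to_cat(unsigned avail)
{
    if (avail >= MAX_FSM_REQUEST)
        return 255;                     /* top category: "at least MaxFSMRequestSize" */
    unsigned cat = avail / FSM_CAT_STEP;
    return (uint8_t) (cat > 254 ? 254 : cat);
}

/* Request direction: round the needed space UP, so any match really fits. */
static uint8_t needed_to_cat(unsigned needed)
{
    if (needed == 0)
        return 1;
    unsigned cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
    return (uint8_t) (cat > 255 ? 255 : cat);
}

int main(void)
{
    unsigned sizes[] = {0, 31, 32, 100, 8128, 8163, 8164};
    for (int i = 0; i < (int) (sizeof(sizes) / sizeof(sizes[0])); i++)
        printf("%5u bytes -> stored cat %3d, request cat %3d\n",
               sizes[i], avail_to_cat(sizes[i]), needed_to_cat(sizes[i]));
    return 0;
}

Recorded free space rounds down, a request rounds up, and anything of MaxFSMRequestSize
or more collapses into category 255. That is essentially what MaxFSMRequestSize is for:
it marks the point beyond which the map only says "this page can take the largest
possible request".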

To assist in fast searching, the map isn't simply an array of per-page
entries, but has a tree structure above those entries.  There is a tree
structure of pages, and a tree structure within each page, as described
below.

FSM page structure
------------------

Within each FSM page, we use a binary tree structure where leaf nodes store
the amount of free space on heap pages (or lower level FSM pages, see
"Higher-level structure" below), with one leaf node per heap page. A non-leaf
node stores the max amount of free space on any of its children.

For example:

    4
4     2
3 4   0 2    <- This level represents heap pages

We need two basic operations: search and update.

To search for a page with X amount of free space, traverse down the tree
along a path where n >= X, until you hit the bottom. If both children of a
node satisfy the condition, you can pick either one arbitrarily.

To update the amount of free space on a page to X, first update the leaf node
corresponding to the heap page, then "bubble up" the change to upper nodes,
by walking up to each parent and recomputing its value as the max of its
two children.  Repeat until reaching the root or a parent whose value
doesn't change.
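
To illustrate those two operations, here is a toy sketch of an in-page max tree over four
leaf slots (my own simplification, not the real fsm_set_avail()/fsm_search_avail(), which
handle the imperfect tree layout and buffer locking):

/* Toy in-page FSM tree: 3 internal nodes + 4 leaves in one array.
 * Index 0 is the root; children of node i are 2*i+1 and 2*i+2;
 * leaves (the per-heap-page categories) live at indexes 3..6. */
#include <stdio.h>

#define LEAVES     4
#define FIRST_LEAF (LEAVES - 1)          /* 3, for a perfect tree */
static unsigned char node[2 * LEAVES - 1];

/* Update one leaf and bubble the new max up toward the root. */
static void set_avail(int slot, unsigned char value)
{
    int i = FIRST_LEAF + slot;
    node[i] = value;
    while (i > 0)
    {
        int parent = (i - 1) / 2;
        unsigned char mx = node[2 * parent + 1];
        if (node[2 * parent + 2] > mx)
            mx = node[2 * parent + 2];
        if (node[parent] == mx)
            break;                       /* nothing changes further up */
        node[parent] = mx;
        i = parent;
    }
}

/* Walk down along children whose value >= min; return slot or -1. */
static int search_avail(unsigned char min)
{
    if (node[0] < min)
        return -1;                       /* the root already tells us there is no fit */
    int i = 0;
    while (i < FIRST_LEAF)
        i = (node[2 * i + 1] >= min) ? 2 * i + 1 : 2 * i + 2;
    return i - FIRST_LEAF;
}

int main(void)
{
    set_avail(0, 3); set_avail(1, 4); set_avail(2, 0); set_avail(3, 2);
    printf("need 4 -> slot %d\n", search_avail(4));   /* slot 1 */
    printf("need 5 -> slot %d\n", search_avail(5));   /* -1 */
    return 0;
}

With the leaf values 3, 4, 0, 2 from the diagram above, a search for 4 descends to slot 1,
and a search for 5 fails at the root without looking at the rest of the tree.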

This data structure has a couple of nice properties:
- to discover that there is no page with X bytes of free space, you only
  need to look at the root node
- by varying which child to traverse to in the search algorithm, when you have
  a choice, we can implement various strategies, like preferring pages closer
  to a given page, or spreading the load across the table.

Higher-level routines that use FSM pages access them through the fsm_set_avail()
and fsm_search_avail() functions. The interface to those functions hides the
page's internal tree structure, treating the FSM page as a black box that has
a certain number of "slots" for storing free space information.  (However,
the higher routines have to be aware of the tree structure of the whole map.)

The binary tree is stored on each FSM page as an array. Because the page
header takes some space on a page, the binary tree isn't perfect. That is,
a few right-most leaf nodes are missing, and there are some useless non-leaf
nodes at the right. So the tree looks something like this:

       0
   1       2
3   4   5   6
7 8 9 A B

where the numbers denote each node's position in the array.  Note that the
tree is guaranteed complete above the leaf level; only some leaf nodes are
missing.  This is reflected in the number of usable "slots" per page not
being an exact power of 2.
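
For a rough feel for the numbers, the sketch below recomputes the slot count for the
default 8 kB block size; the header sizes are assumptions of mine rather than the
authoritative macros in fsm_internals.h, but they land on the "~4000" figure used below:

/* Back-of-the-envelope slot count for one FSM page (assumed header sizes,
 * not the authoritative fsm_internals.h definitions). */
#include <stdio.h>

int main(void)
{
    const int blcksz          = 8192;
    const int page_header     = 24;     /* assumed MAXALIGN'd page header */
    const int fsm_page_header = 4;      /* assumed: just fp_next_slot */
    int nodes_per_page        = blcksz - page_header - fsm_page_header;
    int nonleaf_nodes         = blcksz / 2 - 1;   /* upper part of the binary tree */
    int leaf_nodes            = nodes_per_page - nonleaf_nodes;

    printf("nodes=%d nonleaf=%d slots per FSM page=%d\n",
           nodes_per_page, nonleaf_nodes, leaf_nodes);
    /* prints roughly: nodes=8164 nonleaf=4095 slots per FSM page=4069 */
    return 0;
}

With these assumed sizes a default-size page holds roughly 4069 slots, comfortably above
the 1626 threshold that the FSM_TREE_DEPTH macro in freespace.c uses to settle on a
3-level tree.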

A FSM page also has a next slot pointer, fp_next_slot, that determines where
to start the next search for free space within that page.  The reason for that
is to spread out the pages that are returned by FSM searches.  When several
backends are concurrently inserting into a relation, contention can be avoided
by having them insert into different pages.  But it is also desirable to fill
up pages in sequential order, to get the benefit of OS prefetching and batched
writes.  The FSM is responsible for making that happen, and the next slot
pointer helps provide the desired behavior.

Higher-level structure
----------------------

To scale up the data structure described above beyond a single page, we
maintain a similar tree-structure across pages. Leaf nodes in higher level
pages correspond to lower level FSM pages. The root node within each page
has the same value as the corresponding leaf node on its parent page.

The root page is always stored at physical block 0.

For example, assuming each FSM page can hold information about 4 pages (in
reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ),
we get a disk layout like this:

0     <-- page 0 at level 2 (root page)
  0     <-- page 0 at level 1
   0     <-- page 0 at level 0
   1     <-- page 1 at level 0
   2     <-- ...
   3
  1     <-- page 1 at level 1
   4
   5
   6
   7
  2
   8
   9
   10
   11
  3
   12
   13
   14
   15

where the numbers are page numbers *at that level*, starting from 0.

To find the physical block # corresponding to leaf page n, we need to
count the number of leaf and upper-level pages preceding page n.
This turns out to be

y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1

where F is the fanout (4 in the above example). The first term n is the number
of preceding leaf pages, the second term is the number of pages at level 1,
and so forth.
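
The formula can be transcribed directly. The sketch below is an illustrative
re-implementation of the leaf-page case only (the real fsm_logical_to_physical() in the
first post also handles upper-level pages by stepping down to the leftmost leaf below and
then subtracting the extra lower-level pages it counted), checked against the fanout-4
layout above:

/* Physical block number of leaf page n, for fanout F and a tree of given depth:
 * y = n + (n/F + 1) + (n/F^2 + 1) + ... + 1, counted as pages, then made 0-based. */
#include <stdio.h>

static unsigned leaf_to_physical(unsigned n, unsigned fanout, int depth)
{
    unsigned pages = 0;
    unsigned leafno = n;
    for (int level = 0; level < depth; level++)
    {
        pages += leafno + 1;      /* pages up to and including this ancestor */
        leafno /= fanout;
    }
    return pages - 1;             /* 0-based block number */
}

int main(void)
{
    /* Fanout 4, depth 3, as in the layout sketch above. */
    for (unsigned n = 0; n < 6; n++)
        printf("leaf page %u -> physical block %u\n", n, leaf_to_physical(n, 4, 3));
    return 0;
}

For fanout 4 this prints blocks 2, 3, 4, 5, 7, 8 for leaf pages 0..5, matching the layout
sketch (block 0 is the root, block 1 the first level-1 page, block 6 the second level-1
page).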

To keep things simple, the tree is always constant height. To cover the
maximum relation size of 2^32-1 blocks, three levels is enough with the default
BLCKSZ (4000^3 > 2^32).

Addressing
----------

The higher-level routines operate on "logical" addresses, consisting of
- level,
- logical page number, and
- slot (if applicable)

Bottom level FSM pages have level of 0, the level above that 1, and root 2.
As in the diagram above, logical page number is the page number at that level,
starting from 0.
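
As a worked example of this addressing, the sketch below runs the same modulo/divide
arithmetic as fsm_get_location() and fsm_get_parent() from the first post, using a toy
fanout of 4 in place of the real SlotsPerFSMPage (roughly 4000 for 8 kB blocks):

/* Trace the FSM addresses that cover one heap block, for a toy fanout. */
#include <stdio.h>

#define FANOUT     4     /* stand-in for SlotsPerFSMPage */
#define ROOT_LEVEL 2     /* 3-level tree: levels 0, 1, 2 */

int main(void)
{
    unsigned heapblk = 27;

    /* Bottom-level location: which leaf FSM page and slot track this block. */
    unsigned level = 0;
    unsigned logpageno = heapblk / FANOUT;
    unsigned slot = heapblk % FANOUT;
    printf("heap block %u -> level %u, logical page %u, slot %u\n",
           heapblk, level, logpageno, slot);

    /* Walk up: each parent page holds one slot per child page. */
    while (level < ROOT_LEVEL)
    {
        slot = logpageno % FANOUT;
        logpageno /= FANOUT;
        level++;
        printf("                 level %u, logical page %u, slot %u\n",
               level, logpageno, slot);
    }
    return 0;
}

For heap block 27 this reports level 0 page 6 slot 3, then level 1 page 1 slot 2, then
level 2 (root) page 0 slot 1: root slot 1 summarizes level-1 page 1, which covers
bottom-level pages 4..7 and hence heap blocks 16..31.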

Locking
-------

When traversing down to search for free space, only one page is locked at a
time: the parent page is released before locking the child. If the child page
is concurrently modified, and there no longer is free space on the child page
when you land on it, you need to start from scratch (after correcting the
parent page, so that you don't get into an infinite loop).

We use shared buffer locks when searching, but exclusive buffer lock when
updating a page.  However, the next slot search pointer is updated during
searches even though we have only a shared lock.  fp_next_slot is just a hint
and we can easily reset it if it gets corrupted; so it seems better to accept
some risk of that type than to pay the overhead of exclusive locking.

Recovery
--------

The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
self-correcting measures to repair possible corruption.

First of all, whenever a value is set on an FSM page, the root node of the
page is compared against the new value after bubbling up the change is
finished. It should be greater than or equal to the value just set, or we
have a corrupted page, with a parent somewhere with too small a value.
Secondly, corrupted pages can also be detected while we search, traversing down
the tree: that check will notice if a parent node is set to too high a value.
In both cases, the upper nodes on the page are immediately rebuilt, fixing
the corruption.

Vacuum updates all the bottom level pages with correct amount of free space
on the heap pages, fixing any outdated values there. After the heap and
index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is
scanned in depth-first order. This fixes any discrepancies between upper
and lower level FSM pages.

TODO
----

- fastroot to avoid traversing upper nodes with just 1 child
- use a different system for tables that fit into one FSM page, with a
  mechanism to switch to the real thing as it grows.

3# Posted on 2011-05-03 09:36

I'm lost. That looks like it's just the source code, isn't it?