diff options
Diffstat (limited to 'fs/jfs/jfs_xtree.c')
-rw-r--r-- | fs/jfs/jfs_xtree.c | 4485 |
1 files changed, 4485 insertions, 0 deletions
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c new file mode 100644 index 0000000..11c58c5 --- /dev/null +++ b/fs/jfs/jfs_xtree.c @@ -0,0 +1,4485 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ + if (!(RC))\ + {\ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ + (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ + (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ + {\ + jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ + BT_PUTPAGE(MP);\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +struct xtsplit { + struct metapage *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + struct pxdlist *pxdlist; +}; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, + s64 xoff, int *cmpp, struct btstack * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + struct xtsplit * split, struct btstack * btstack); + +static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, + struct metapage ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + xtpage_t * fp, struct btstack * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, struct btstack * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* External references */ + +/* + * debug control + */ +/* #define _JFS_DEBUG_XTREE 1 */ + + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 size, xoff, xend; + int xlen; + s64 xaddr; + + *plen = 0; + + if (!no_check) { + /* is lookup offset beyond eof ? */ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) { + jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)", + (ulong) lstart, (ulong) size); + return 0; + } + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) { + jfs_err("xtLookup: xtSearch returned %d", rc); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) + goto out; + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtLookupList() + * + * function: map a single logical extent into a list of physical extent; + * + * parameter: + * struct inode *ip, + * struct lxdlist *lxdlist, lxd list (in) + * struct xadlist *xadlist, xad list (in/out) + * int flag) + * + * coverage of lxd by xad under assumption of + * . lxd's are ordered and disjoint. + * . xad's are ordered and disjoint. + * + * return: + * 0: success + * + * note: a page being written (even a single byte) is backed fully, + * except the last page which is only backed with blocks + * required to cover the last byte; + * the extent backing a page is fully contained within an xad; + */ +int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, + struct xadlist * xadlist, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + lxd_t *lxd; + xad_t *xad, *pxd; + s64 size, lstart, lend, xstart, xend, pstart; + s64 llen, xlen, plen; + s64 xaddr, paddr; + int nlxd, npxd, maxnpxd; + + npxd = xadlist->nxad = 0; + maxnpxd = xadlist->maxnxad; + pxd = xadlist->xad; + + nlxd = lxdlist->nlxd; + lxd = lxdlist->lxd; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + + size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* + * search for the xad entry covering the logical extent + */ + search: + if (lstart >= size) + return 0; + + if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) + return rc; + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ +//map: + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad on the next sibling page ? */ + if (index == le16_to_cpu(p->header.nextindex)) { + if (p->header.flag & BT_ROOT) + goto mapend; + + if ((bn = le64_to_cpu(p->header.next)) == 0) + goto mapend; + + XT_PUTPAGE(mp); + + /* get next sibling page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = XTENTRYSTART; + } + + xad = &p->xad[index]; + + /* + * is lxd covered by xad ? + */ + compare: + xstart = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xstart + xlen; + xaddr = addressXAD(xad); + + compare1: + if (xstart < lstart) + goto compare2; + + /* (lstart <= xstart) */ + + /* lxd is NOT covered by xad */ + if (lend <= xstart) { + /* + * get next lxd + */ + if (--nlxd == 0) + goto mapend; + lxd++; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + if (lstart >= size) + goto mapend; + + /* compare with the current xad */ + goto compare1; + } + /* lxd is covered by xad */ + else { /* (xstart < lend) */ + + /* initialize new pxd */ + pstart = xstart; + plen = min(lend - xstart, xlen); + paddr = xaddr; + + goto cover; + } + + /* (xstart < lstart) */ + compare2: + /* lxd is covered by xad */ + if (lstart < xend) { + /* initialize new pxd */ + pstart = lstart; + plen = min(xend - lstart, llen); + paddr = xaddr + (lstart - xstart); + + goto cover; + } + /* lxd is NOT covered by xad */ + else { /* (xend <= lstart) */ + + /* + * get next xad + * + * linear search next xad covering lxd on + * the current xad page, and then tree search + */ + if (index == le16_to_cpu(p->header.nextindex) - 1) { + if (p->header.flag & BT_ROOT) + goto mapend; + + XT_PUTPAGE(mp); + goto search; + } else { + index++; + xad++; + + /* compare with new xad */ + goto compare; + } + } + + /* + * lxd is covered by xad and a new pxd has been initialized + * (lstart <= xstart < lend) or (xstart < lstart < xend) + */ + cover: + /* finalize pxd corresponding to current xad */ + XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr); + + if (++npxd >= maxnpxd) + goto mapend; + pxd++; + + /* + * lxd is fully covered by xad + */ + if (lend <= xend) { + /* + * get next lxd + */ + if (--nlxd == 0) + goto mapend; + lxd++; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + if (lstart >= size) + goto mapend; + + /* + * test for old xad covering new lxd + * (old xstart < new lstart) + */ + goto compare2; + } + /* + * lxd is partially covered by xad + */ + else { /* (xend < lend) */ + + /* + * get next xad + * + * linear search next xad covering lxd on + * the current xad page, and then next xad page search + */ + if (index == le16_to_cpu(p->header.nextindex) - 1) { + if (p->header.flag & BT_ROOT) + goto mapend; + + if ((bn = le64_to_cpu(p->header.next)) == 0) + goto mapend; + + XT_PUTPAGE(mp); + + /* get next sibling page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = XTENTRYSTART; + xad = &p->xad[index]; + } else { + index++; + xad++; + } + + /* + * test for new xad covering old lxd + * (old lstart < new xstart) + */ + goto compare; + } + + mapend: + xadlist->nxad = npxd; + +//out: + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, /* offset of extent */ + int *cmpp, struct btstack * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + struct btframe *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + */ + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + + jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if (cmp == 0) { + rc = -EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen))) + goto out; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { + DQUOT_FREE_BLOCK(ip, xlen); + goto out; + } + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) { + dbFree(ip, xaddr, (s64) xlen); + DQUOT_FREE_BLOCK(ip, xlen); + } + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, struct xtsplit * split, struct btstack * btstack) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; /* split page */ + struct metapage *rmp; + s64 rbn; /* new right page block number */ + struct metapage *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + struct btframe *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + struct pxdlist pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + sp->header.nextindex = + cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page <sp> into <sp> and a new right page <rp>. + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return <rp> pinned and its block number <rpbn>. + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + + XT_PUTPAGE(smp); + + if (rc) + return -EIO; + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame <parent> */ + + /* keep current child pages <rcp> pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page <rp> + */ + /* get/pin the parent page <sp> */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + XT_PUTPAGE(rcmp); + return rc; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return <rp> pinned and its block number <rpbn>. + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) { + XT_PUTPAGE(smp); + return rc; + } + + XT_PUTPAGE(smp); + /* keep new child page <rp> pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + sp->header.nextindex = + cpu_to_le16(le16_to_cpu(sp->header.nextindex) + + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *sxtlck = NULL, *rxtlck = NULL; + int quota_allocation = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + + /* Allocate blocks to quota. */ + if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { + rc = -EDQUOT; + goto clean_up; + } + + quota_allocation += lengthPXD(pxd); + + /* + * allocate the new right page for the split + */ + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) { + rc = -EIO; + goto clean_up; + } + + jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (struct xtlock *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (struct xtlock *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of <sp> and <rp> + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) + */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of <sp> + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + goto clean_up; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return rc; + + clean_up: + + /* Rollback quota allocation. */ + if (quota_allocation) + DQUOT_FREE_BLOCK(ip, quota_allocation); + + return (rc); +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) +{ + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + struct pxdlist *pxdlist; + struct tlock *tlck; + struct xtlock *xtlck; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { + release_metapage(rmp); + return -EDQUOT; + } + + jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + s64 xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + + jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); + return -EIO; + } + + /* extension must be contiguous */ + xad = &p->xad[index]; + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); + return -EIO; + } + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + +#ifdef _NOTYET +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + struct tlock *tlck; + struct xtlock *xtlck = 0; + struct tlock *mtlck; + struct maplock *pxdlock; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); + return -EIO; + } + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTailgate: the entry found is not the last entry"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (struct maplock *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#endif /* _NOTYET */ + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + int newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); + + if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* nXAD must be completely contained within XAD */ + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: nXAD in not completely contained within XAD"); + return -EIO; + } + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); + return -EIO; + } +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } else { + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); + return -EIO; + } + + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: unexpected value of index"); + return -EIO; + } + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + int nsplit, nblocks, xlen; + struct pxdlist pxdlist; + pxd_t *pxd; + + xaddr = *xaddrp; + xlen = *xlenp; + jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = -EEXIST; + goto out; + } +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#ifdef _STILL_TO_PORT + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return -ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + struct btframe *parent; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) { + XT_PUTPAGE(fmp); + return rc; + } + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page <sp> */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(mp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. + offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + jfs_info("xtDeleteUp(entry): 0x%lx[%d]", + (ulong) parent->bn, index); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +int +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + struct tblock *tblk; + struct tlock *tlck; + struct xtlock *xtlck; + struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + struct pxd_lock *pxdlock; + struct btstack btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return -ESTALE; /* stale extent */ + + jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jfs_info("xtRelocate: parent xad entry validated."); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. + * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, TRUE); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("xtRelocate: target data extent relocated."); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = + txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = + txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jfs_info("xtRelocate: target xtpage relocated."); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jfs_info("xtRelocate: update parent xad entry."); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, struct btstack * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + struct btframe *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) { + XT_PUTPAGE(mp); + return -ESTALE; + } + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. + * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + + /* + * acquire a transaction lock on the root + * + * action: + */ + txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + struct metapage *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + struct btstack btstack; + struct btframe *parent; + struct tblock *tblk = NULL; + struct tlock *tlck = NULL; + struct xtlock *xtlck = NULL; + struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region <teof, eof> + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + + /* Since this is the rightmost leaf, and we may have already freed + * a page that was formerly to the right, let's make sure that the + * next pointer is zero. + */ + if (p->header.next) { + if (log) + /* + * Make sure this change to the header is logged. + * If we really truncate this leaf, the flag + * will be changed to tlckTRUNCATE + */ + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + BT_MARK_DIRTY(mp, ip); + p->header.next = 0; + } + + freed = 0; + + /* does region covered by leaf page precede Teof ? */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * The "data" for a directory is indexed by the block + * device's address space. This metadata must be invalidated + * here + */ + if (S_ISDIR(ip->i_mode) && (teof == 0)) + invalidate_xad_metapages(ip, *xad); + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + xtlck->twm.offset = index; + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (struct pxd_lock *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. + nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + struct tlock *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update quota allocation to reflect freed blocks */ + DQUOT_FREE_BLOCK(ip, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero lenghth for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + struct btstack btstack; + int cmp; + int index; + int locked_leaves = 0; + struct metapage *mp; + xtpage_t *p; + struct btframe *parent; + int rc; + struct tblock *tblk; + struct tlock *tlck = NULL; + xad_t *xad; + int xlen; + s64 xoff; + struct xtlock *xtlck = NULL; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, &cmp, &btstack, 0); + if (rc) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTruncate_pmap: did not find extent"); + return -EIO; + } + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckFREE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = index; + + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + + +#ifdef _JFS_DEBUG_XTREE +/* + * xtDisplayTree() + * + * function: traverse forward + */ +int xtDisplayTree(struct inode *ip) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + s64 bn, pbn; + int index, lastindex, v, h; + xad_t *xad; + struct btstack btstack; + struct btframe *btsp; + struct btframe *parent; + + printk("display B+-tree.\n"); + + /* clear stack */ + btsp = btstack.stack; + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + v = h = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries forward from first index */ + index = XTENTRYSTART; + lastindex = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) { + /* + * first access of each internal page + */ + goto getChild; + } else { /* (p->header.flag & BT_LEAF) */ + + /* + * first access of each leaf page + */ + printf("leaf page "); + xtDisplayPage(ip, bn, p); + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + } + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) + /* current page must have been root */ + return; + + /* + * parent page scan completed + */ + if ((index = parent->index) == (lastindex = parent->lastindex)) { + /* go back up to the parent page */ + goto getParent; + } + + /* + * parent page has entries remaining + */ + /* get back the parent page */ + bn = parent->bn; + /* v = parent->level; */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* get next parent entry */ + index++; + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* push/save current parent entry for the child page */ + btsp->bn = pbn = bn; + btsp->index = index; + btsp->lastindex = lastindex; + /* btsp->level = v; */ + /* btsp->node = h; */ + ++btsp; + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index, + (ulong) bn); + v++; + h = index; + + /* process the child page */ + goto getPage; +} + + +/* + * xtDisplayPage() + * + * function: display page + */ +int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p) +{ + int rc = 0; + xad_t *xad; + s64 xaddr, xoff; + int xlen, i, j; + + /* display page control */ + printf("bn:0x%lx flag:0x%x nextindex:%d\n", + (ulong) bn, p->header.flag, + le16_to_cpu(p->header.nextindex)); + + /* display entries */ + xad = &p->xad[XTENTRYSTART]; + for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex); + i++, xad++, j++) { + xoff = offsetXAD(xad); + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + printf("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff, + (ulong) xaddr, xlen); + + if (j == 4) { + printf("\n"); + j = 0; + } + } + + printf("\n"); +} +#endif /* _JFS_DEBUG_XTREE */ + + +#ifdef _JFS_WIP +/* + * xtGather() + * + * function: + * traverse for allocation acquiring tlock at commit time + * (vs at the time of update) logging backward top down + * + * note: + * problem - establishing that all new allocation have been + * processed both for append and random write in sparse file + * at the current entry at the current subtree root page + * + */ +int xtGather(btree_t *t) +{ + int rc = 0; + xtpage_t *p; + u64 bn; + int index; + btentry_t *e; + struct btstack btstack; + struct btsf *parent; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* new root is NOT pointed by a new entry + if (p->header.flag & NEW) + allocate new page lock; + write a NEWPAGE log; + */ + + dopage: + /* + * first access of each page: + */ + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_LEAF) { + /* + * first access of each leaf page + */ + /* process leaf page entries backward */ + for (; index >= XTENTRYSTART; index--) { + e = &p->xad[index]; + /* + * if newpage, log NEWPAGE. + * + if (e->flag & XAD_NEW) { + nfound =+ entry->length; + update current page lock for the entry; + newpage(entry); + * + * if moved, log move. + * + } else if (e->flag & XAD_MOVED) { + reset flag; + update current page lock for the entry; + } + */ + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + return 0; + + if ((index = parent->index) == XTENTRYSTART) { + /* + * parent page scan completed + */ + /* go back up to the parent page */ + goto getParent; + } else { + /* + * parent page has entries remaining + */ + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return -EIO; + + /* first subroot page which + * covers all new allocated blocks + * itself not new/modified. + * (if modified from split of descendent, + * go down path of split page) + + if (nfound == nnew && + !(p->header.flag & (NEW | MOD))) + exit scan; + */ + + /* process parent page entries backward */ + index--; + } + } else { + /* + * first access of each internal page + */ + } + + /* + * internal page: go down to child page of current entry + */ + + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get current entry for the child page */ + e = &p->xad[index]; + + /* + * first access of each internal entry: + */ + /* + * if new entry, log btree_tnewentry. + * + if (e->flag & XAD_NEW) + update parent page lock for the entry; + */ + + /* release parent page */ + XT_PUTPAGE(mp); + + /* get child page */ + bn = e->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * first access of each non-root page: + */ + /* + * if new, log btree_newpage. + * + if (p->header.flag & NEW) + allocate new page lock; + write a NEWPAGE log (next, prev); + */ + + /* process the child page */ + goto dopage; + + out: + return 0; +} +#endif /* _JFS_WIP */ + + +#ifdef CONFIG_JFS_STATISTICS +int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, + int *eof, void *data) +{ + int len = 0; + off_t begin; + + len += sprintf(buffer, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + + begin = offset; + *start = buffer + begin; + len -= begin; + + if (len > length) + len = length; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +#endif |