From 31bd61f2bb79e098117d823e054342b03aa87668 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:45:37 +1000 Subject: [XFS] Move memory allocations for log tracing out of the critical path Memory allocations for log->l_grant_trace and iclog->ic_trace are done on demand when the first event is logged. In xlog_state_get_iclog_space() we call xlog_trace_iclog() under a spinlock and allocating memory here can cause us to sleep with a spinlock held and deadlock the system. For the log grant tracing we use KM_NOSLEEP but that means we can lose trace entries. Since there is no locking to serialize the log grant tracing we could race and have multiple allocations and leak memory. So move the allocations to where we initialize the log/iclog structures. Use KM_NOFS to avoid recursing into the filesystem and drop log->l_trace since it's not even used. SGI-PV: 983738 SGI-Modid: xfs-linux-melb:xfs-kern:31896a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 60 ++++++++++++++++++++++++++++++++++----------------- fs/xfs/xfs_log_priv.h | 1 - 2 files changed, 40 insertions(+), 21 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccba14e..7812bd0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_NOFS); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -1231,6 +1259,7 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_grant_lock); sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1285,6 +1314,8 @@ xlog_alloc_log(xfs_mount_t *mp, sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + xlog_trace_iclog_alloc(iclog); + iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ @@ -1565,11 +1596,7 @@ xlog_dealloc_log(xlog_t *log) sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; @@ -1578,14 +1605,7 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c8a5b22..e7d8f84 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -448,7 +448,6 @@ typedef struct log { int l_grant_write_bytes; #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif -- cgit v1.1 From 6efdf281777eb07fac28ac2b2d7df1e619ee6da1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Sep 2008 16:49:33 +1000 Subject: [XFS] Fix regression introduced by remount fixup Logically we would return an error in xfs_fs_remount code to prevent users from believing they might have changed mount options using remount which can't be changed. But unfortunately mount(8) adds all options from mtab and fstab to the mount arguments in some cases so we can't blindly reject options, but have to check for each specified option if it actually differs from the currently set option and only reject it if that's the case. Until that is implemented we return success for every remount request, and silently ignore all options that we can't actually change. SGI-PV: 985710 SGI-Modid: xfs-linux-melb:xfs-kern:31908a Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 73c65f1..18d3c84 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1302,9 +1302,29 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 printk(KERN_INFO "XFS: mount option \"%s\" not supported for remount\n", p); return -EINVAL; +#else + return 0; +#endif } } -- cgit v1.1 From 364f358a734ddcd827c662ccbfa58ee3ac510762 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:50:14 +1000 Subject: [XFS] Prevent direct I/O from mapping extents beyond eof With the help from some tracing I found that we try to map extents beyond eof when doing a direct I/O read. It appears that the way to inform the generic direct I/O path (ie do_direct_IO()) that we have breached eof is to return an unmapped buffer from xfs_get_blocks_direct(). This will cause do_direct_IO() to jump to the hole handling code where is will check for eof and then abort. This problem was found because a direct I/O read was trying to map beyond eof and was encountering delayed allocations. The delayed allocations beyond eof are speculative allocations and they didn't get converted when the direct I/O flushed the file because there was only enough space in the current AG to convert and write out the dirty pages within eof. Note that xfs_iomap_write_allocate() wont necessarily convert all the delayed allocation passed to it - it will return after allocating the first extent - so if the delayed allocation extends beyond eof then it will stay that way. SGI-PV: 983683 SGI-Modid: xfs-linux-melb:xfs-kern:31929a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f42f80a..a44d68e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1338,6 +1338,10 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) -- cgit v1.1 From b5b8c9acd547244eb2b7d0280ba38b9dd01971cc Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 17 Sep 2008 16:50:50 +1000 Subject: [XFS] Fix barrier status change detection. The current code in xlog_iodone() uses the wrong macro to check if the barrier has been cleared due to an EOPNOTSUPP error form the lower layer. SGI-PV: 986143 SGI-Modid: xfs-linux-melb:xfs-kern:31984a Signed-off-by: David Chinner Signed-off-by: Nathaniel W. Turner Signed-off-by: Peter Leckie Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7812bd0..503ea89 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1037,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp) * layer, it means the underlyin device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) { l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" -- cgit v1.1 From f9114eba1eb08ee75fd0f1eee780f0290fb3c043 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 17 Sep 2008 16:51:21 +1000 Subject: [XFS] Prevent lockdep false positives when locking two inodes. If we call xfs_lock_two_inodes() to grab both the iolock and the ilock, then drop the ilocks on both inodes, then grab them again (as xfs_swap_extents() does) then lockdep will report a locking order problem. This is a false positive. To avoid this, disallow xfs_lock_two_inodes() fom locking both inode locks at once - force calers to make two separate calls. This means that nested dropping and regaining of the ilocks will retain the same lockdep subclass and so lockdep will not see anything wrong with this code. SGI-PV: 986238 SGI-Modid: xfs-linux-melb:xfs-kern:31999a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Peter Leckie Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_dfrag.c | 9 ++++++++- fs/xfs/xfs_vnodeops.c | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 760f4c5..75b0cd4 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -149,7 +149,14 @@ xfs_swap_extents( sbp = &sxp->sx_stat; - xfs_lock_two_inodes(ip, tip, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; /* Verify that both files have the same format */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index aa238c8..98a0aec 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1838,6 +1838,12 @@ again: #endif } +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ void xfs_lock_two_inodes( xfs_inode_t *ip0, @@ -1848,6 +1854,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { -- cgit v1.1 From e1f5dbd7077eebec794452a516cb02f1669b036d Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:52:13 +1000 Subject: [XFS] Fix use-after-free with buffers We have a use-after-free issue where log completions access buffers via the buffer log item and the buffer has already been freed. Fix this by taking a reference on the buffer when attaching the buffer log item and release the hold when the buffer log item is detached and we no longer need the buffer. Also create a new function xfs_buf_item_free() to combine some common code. SGI-PV: 985757 SGI-Modid: xfs-linux-melb:xfs-kern:32025a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_buf_item.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 608c30c..002fc26 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,6 +732,7 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); @@ -867,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse( (XFS_BUF_IODONE_FUNC(bp) != NULL)) { XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1120,6 +1126,7 @@ xfs_buf_iodone( ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1136,18 +1143,7 @@ xfs_buf_iodone( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) -- cgit v1.1 From 2fd6f6ec64ff347447d26646ac6188f3658b383c Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:52:50 +1000 Subject: [XFS] Don't do I/O beyond eof when unreserving space When unreserving space with boundaries that are not block aligned we round up the start and round down the end boundaries and then use this function, xfs_zero_remaining_bytes(), to zero the parts of the blocks that got dropped during the rounding. The problem is we don't consider if these blocks are beyond eof. Worse still is if we encounter delayed allocations beyond eof we will try to use the magic delayed allocation block number as a real block number. If the file size is ever extended to expose these blocks then we'll go through xfs_zero_eof() to zero them anyway. SGI-PV: 983683 SGI-Modid: xfs-linux-melb:xfs-kern:32055a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 98a0aec..8b6812f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3160,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -3176,6 +3183,17 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); -- cgit v1.1 From f1ccd2955157e1aff992f6aaaba0944209076220 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 26 Sep 2008 12:16:46 +1000 Subject: [XFS] Fix extent list corruption in xfs_iext_irec_compact_full(). If we don't move all the records from the next buffer into the current buffer then we need to update the er_extoff field of the next buffer as we shift the remaining records to the start of the buffer. SGI-PV: 987159 SGI-Modid: xfs-linux-melb:xfs-kern:32165a Signed-off-by: Lachlan McIlroy Signed-off-by: Eric Sandeen Signed-off-by: Russell Cattelan --- fs/xfs/xfs_inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00e80df..419cfc2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4584,6 +4584,7 @@ xfs_iext_irec_compact_full( (XFS_LINEAR_EXTS - erp_next->er_extcount) * sizeof(xfs_bmbt_rec_t)); + erp_next->er_extoff += ext_diff; } } -- cgit v1.1 From 71a8c87fb300b601eacf7a86cc6c6322fe827bfd Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 26 Sep 2008 12:17:57 +1000 Subject: [XFS] Remove xfs_iext_irec_compact_full() Yet another bug was found in xfs_iext_irec_compact_full() and while the source of the bug was found it wasn't an easy task to track it down because the conditions are very difficult to reproduce. A HUGE thank-you goes to Russell Cattelan and Eric Sandeen for their significant effort in tracking down the source of this corruption. xfs_iext_irec_compact_full() and xfs_iext_irec_compact_pages() are almost identical - they both compact indirect extent lists by moving extents from subsequent buffers into earlier ones. xfs_iext_irec_compact_pages() only moves extents if all of the extents in the next buffer will fit into the empty space in the buffer before it. xfs_iext_irec_compact_full() will go a step further and move part of the next buffer if all the extents wont fit. It will then shift the remaining extents in the next buffer up to the start of the buffer. The bug here was that we did not update er_extoff and this caused extent list corruption. It does not appear that this extra functionality gains us much. Calling xfs_iext_irec_compact_pages() instead will do a good enough job at compacting the indirect list and will be quicker too. For the case in xfs_iext_indirect_to_direct() the total number of extents in the indirect list will fit into one buffer so we will never need the extra functionality of xfs_iext_irec_compact_full() there. Also xfs_iext_irec_compact_pages() doesn't need to do a memmove() (the buffers will never overlap) so we don't want the performance hit that can incur. SGI-PV: 987159 SGI-Modid: xfs-linux-melb:xfs-kern:32166a Signed-off-by: Lachlan McIlroy Signed-off-by: Eric Sandeen --- fs/xfs/xfs_inode.c | 95 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 92 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 419cfc2..dbd9cef 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; @@ -4449,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4471,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4516,92 +4513,6 @@ xfs_iext_irec_compact_pages( } /* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - - while (erp_idx < nlists - 1) { - /* - * Check how many extent records are available in this irec. - * If there is none skip the whole exercise. - */ - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - if (ext_avail) { - - /* - * Copy over as many as possible extent records into - * the previous page. - */ - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - - /* - * If the next irec is empty now we can simply - * remove it. - */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * If the next irec is not empty move up the content - * that has not been copied to the previous page to - * the beggining of this one. - */ - } else { - memmove(erp_next->er_extbuf, &ep_next[ext_diff], - erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - erp_next->er_extoff += ext_diff; - } - } - - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - -/* * This is called to update the er_extoff field in the indirection * array when extents have been added or removed from one of the * extent lists. erp_idx contains the irec index to begin updating -- cgit v1.1