summaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c191
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c587
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c57
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h33
11 files changed, 645 insertions, 304 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2a..ac1c7e8 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
- ASSERT(!bp->b_relse);
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- if (bp->b_relse) {
- atomic_inc(&bp->b_hold);
- spin_unlock(&pag->pag_buf_lock);
- bp->b_relse(bp);
- } else if (!(bp->b_flags & XBF_STALE) &&
+ if (!(bp->b_flags & XBF_STALE) &&
atomic_read(&bp->b_lru_ref)) {
xfs_buf_lru_add(bp);
spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c242..cbe6595 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
#define XB_PAGES 2
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
void *b_addr; /* virtual address of buffer */
struct work_struct b_iodone_work;
xfs_buf_iodone_t b_iodone; /* I/O completion function */
- xfs_buf_relse_t b_relse; /* releasing function */
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
#define XFS_BUF_SET_START(bp) do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
static inline void xfs_buf_relse(xfs_buf_t *bp)
{
- if (!bp->b_relse)
- xfs_buf_unlock(bp);
+ xfs_buf_unlock(bp);
xfs_buf_rele(bp);
}
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 0000000..05201ae
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t start,
+ xfs_fsblock_t len,
+ xfs_fsblock_t minlen,
+ __uint64_t *blocks_trimmed)
+{
+ struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_perag *pag;
+ int error;
+ int i;
+
+ pag = xfs_perag_get(mp, agno);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error || !agbp)
+ goto out_put_perag;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+ /*
+ * Force out the log. This means any transactions that might have freed
+ * space before we took the AGF buffer lock are now on disk, and the
+ * volatile disk cache is flushed.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Look up the longest btree in the AGF and start with it.
+ */
+ error = xfs_alloc_lookup_le(cur, 0,
+ XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+ if (error)
+ goto out_del_cursor;
+
+ /*
+ * Loop until we are done with all extents that are large
+ * enough to be worth discarding.
+ */
+ while (i) {
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+
+ error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+ if (error)
+ goto out_del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+ ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+ /*
+ * Too small? Give up.
+ */
+ if (flen < minlen) {
+ trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+ goto out_del_cursor;
+ }
+
+ /*
+ * If the extent is entirely outside of the range we are
+ * supposed to discard skip it. Do not bother to trim
+ * down partially overlapping ranges for now.
+ */
+ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+ XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+ trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the
+ * discard and try again the next time.
+ */
+ if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+ trace_xfs_discard_busy(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ trace_xfs_discard_extent(mp, agno, fbno, flen);
+ error = -blkdev_issue_discard(bdev,
+ XFS_AGB_TO_DADDR(mp, agno, fbno),
+ XFS_FSB_TO_BB(mp, flen),
+ GFP_NOFS, 0);
+ if (error)
+ goto out_del_cursor;
+ *blocks_trimmed += flen;
+
+next_extent:
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto out_del_cursor;
+ }
+
+out_del_cursor:
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+xfs_ioc_trim(
+ struct xfs_mount *mp,
+ struct fstrim_range __user *urange)
+{
+ struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ unsigned int granularity = q->limits.discard_granularity;
+ struct fstrim_range range;
+ xfs_fsblock_t start, len, minlen;
+ xfs_agnumber_t start_agno, end_agno, agno;
+ __uint64_t blocks_trimmed = 0;
+ int error, last_error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&range, urange, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+
+ /*
+ * Truncating down the len isn't actually quite correct, but using
+ * XFS_B_TO_FSB would mean we trivially get overflows for values
+ * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
+ * used by the fstrim application. In the end it really doesn't
+ * matter as trimming blocks is an advisory interface.
+ */
+ start = XFS_B_TO_FSBT(mp, range.start);
+ len = XFS_B_TO_FSBT(mp, range.len);
+ minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+ start_agno = XFS_FSB_TO_AGNO(mp, start);
+ if (start_agno >= mp->m_sb.sb_agcount)
+ return -XFS_ERROR(EINVAL);
+
+ end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+ if (end_agno >= mp->m_sb.sb_agcount)
+ end_agno = mp->m_sb.sb_agcount - 1;
+
+ for (agno = start_agno; agno <= end_agno; agno++) {
+ error = -xfs_trim_extents(mp, agno, start, len, minlen,
+ &blocks_trimmed);
+ if (error)
+ last_error = error;
+ }
+
+ if (last_error)
+ return last_error;
+
+ range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ if (copy_to_user(urange, &range, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 0000000..e82b6dd
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad42..a55c1b4 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
#include "xfs_trace.h"
#include <linux/dcache.h>
+#include <linux/falloc.h>
static const struct vm_operations_struct xfs_file_vm_ops;
/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+ struct xfs_inode *ip,
+ int type)
+{
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_lock(&VFS_I(ip)->i_mutex);
+ xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_iunlock(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_ilock_demote(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
* xfs_iozero
*
* xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
if (unlikely(ioflags & IO_ISDIRECT)) {
+ xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
if (inode->i_mapping->nrpages) {
ret = -xfs_flushinval_pages(ip,
(iocb->ki_pos & PAGE_CACHE_MASK),
-1, FI_REMAPF_LOCKED);
+ if (ret) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+ }
}
- mutex_unlock(&inode->i_mutex);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return ret;
- }
- }
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ } else
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
@@ -285,7 +319,7 @@ xfs_file_aio_read(
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -309,7 +343,7 @@ xfs_file_splice_read(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
@@ -317,10 +351,61 @@ xfs_file_splice_read(
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
+STATIC void
+xfs_aio_write_isize_update(
+ struct inode *inode,
+ loff_t *ppos,
+ ssize_t bytes_written)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fsize_t isize = i_size_read(inode);
+
+ if (bytes_written > 0)
+ XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+ if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+ *ppos > isize))
+ *ppos = isize;
+
+ if (*ppos > ip->i_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ if (*ppos > ip->i_size)
+ ip->i_size = *ppos;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occured. In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+ struct xfs_inode *ip)
+{
+ if (ip->i_new_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ ip->i_new_size = 0;
+ if (ip->i_d.di_size > ip->i_size)
+ ip->i_d.di_size = ip->i_size;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * couuld cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
STATIC ssize_t
xfs_file_splice_write(
struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize, new_size;
+ xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -355,27 +440,9 @@ xfs_file_splice_write(
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- if (ip->i_new_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ xfs_aio_write_isize_update(inode, ppos, ret);
+ xfs_aio_write_newsize_update(ip);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -562,247 +629,314 @@ out_lock:
return error;
}
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
STATIC ssize_t
-xfs_file_aio_write(
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+xfs_file_aio_write_checks(
+ struct file *file,
+ loff_t *pos,
+ size_t *count,
+ int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0, error = 0;
- int ioflags = 0;
- xfs_fsize_t isize, new_size;
- int iolock;
- size_t ocount = 0, count;
- int need_i_mutex;
+ xfs_fsize_t new_size;
+ int error = 0;
- XFS_STATS_INC(xs_write_calls);
+ error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+ if (error) {
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ *iolock = 0;
+ return error;
+ }
- BUG_ON(iocb->ki_pos != pos);
+ new_size = *pos + *count;
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+ file_update_time(file);
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero any
+ * blocks that fall between the existing EOF and the start of this
+ * write.
+ */
+ if (*pos > ip->i_size)
+ error = -xfs_zero_eof(ip, *pos, ip->i_size);
- error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
- count = ocount;
- if (count == 0)
- return 0;
-
- xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+ /*
+ * If we're writing the file then make sure to clear the setuid and
+ * setgid bits if the process is not being run by root. This keeps
+ * people from modifying setuid and setgid binaries.
+ */
+ return file_remove_suid(file);
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+}
-relock:
- if (ioflags & IO_ISDIRECT) {
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- } else {
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ size_t count = ocount;
+ int unaligned_io = 0;
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ *iolock = 0;
+ if ((pos & target->bt_smask) || (count & target->bt_smask))
+ return -XFS_ERROR(EINVAL);
+
+ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+ unaligned_io = 1;
+
+ if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+ *iolock = XFS_IOLOCK_EXCL;
+ else
+ *iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+
+ if (mapping->nrpages) {
+ WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+ ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+ FI_REMAPF_LOCKED);
+ if (ret)
+ return ret;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
-
-start:
- error = -generic_write_checks(file, &pos, &count,
- S_ISBLK(inode->i_mode));
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- goto out_unlock_mutex;
+ /*
+ * If we are doing unaligned IO, wait for all other IO to drain,
+ * otherwise demote the lock if we had to flush cached pages
+ */
+ if (unaligned_io)
+ xfs_ioend_wait(ip);
+ else if (*iolock == XFS_IOLOCK_EXCL) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ *iolock = XFS_IOLOCK_SHARED;
}
- if (ioflags & IO_ISDIRECT) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_direct_write(iocb, iovp,
+ &nr_segs, pos, &iocb->ki_pos, count, ocount);
- if ((pos & target->bt_smask) || (count & target->bt_smask)) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- return XFS_ERROR(-EINVAL);
- }
+ /* No fallback to buffered IO on errors for XFS. */
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
- if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
- goto start;
- }
- }
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int enospc = 0;
+ size_t count = ocount;
- new_size = pos + count;
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
- if (likely(!(ioflags & IO_INVIS)))
- file_update_time(file);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+write_retry:
+ trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+ pos, &iocb->ki_pos, count, ret);
/*
- * If the offset is beyond the size of the file, we have a couple
- * of things to do. First, if there is already space allocated
- * we need to either create holes or zero the disk or ...
- *
- * If there is a page where the previous size lands, we need
- * to zero it out up to the new size.
+ * if we just got an ENOSPC, flush the inode now we aren't holding any
+ * page locks and retry *once*
*/
-
- if (pos > ip->i_size) {
- error = xfs_zero_eof(ip, pos, ip->i_size);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out_unlock_internal;
- }
+ if (ret == -ENOSPC && !enospc) {
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (ret)
+ return ret;
+ enospc = 1;
+ goto write_retry;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ current->backing_dev_info = NULL;
+ return ret;
+}
- /*
- * If we're writing the file then make sure to clear the
- * setuid and setgid bits if the process is not being run
- * by root. This keeps people from modifying setuid and
- * setgid binaries.
- */
- error = -file_remove_suid(file);
- if (unlikely(error))
- goto out_unlock_internal;
+STATIC ssize_t
+xfs_file_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int iolock;
+ size_t ocount = 0;
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ XFS_STATS_INC(xs_write_calls);
- if ((ioflags & IO_ISDIRECT)) {
- if (mapping->nrpages) {
- WARN_ON(need_i_mutex == 0);
- error = xfs_flushinval_pages(ip,
- (pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock_internal;
- }
+ BUG_ON(iocb->ki_pos != pos);
- if (need_i_mutex) {
- /* demote the lock now the cached pages are gone */
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- mutex_unlock(&inode->i_mutex);
+ ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ if (ret)
+ return ret;
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- }
+ if (ocount == 0)
+ return 0;
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
- /*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
- */
- if (ret >= 0 && ret != count) {
- XFS_STATS_ADD(xs_write_bytes, ret);
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
- pos += ret;
- count -= ret;
+ if (unlikely(file->f_flags & O_DIRECT))
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
+ else
+ ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
- ioflags &= ~IO_ISDIRECT;
- xfs_iunlock(ip, iolock);
- goto relock;
- }
- } else {
- int enospc = 0;
- ssize_t ret2 = 0;
+ xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-write_retry:
- trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
- ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, ret);
- /*
- * if we just got an ENOSPC, flush the inode now we
- * aren't holding any page locks and retry *once*
- */
- if (ret2 == -ENOSPC && !enospc) {
- error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (error)
- goto out_unlock_internal;
- enospc = 1;
- goto write_retry;
- }
- ret = ret2;
- }
+ if (ret <= 0)
+ goto out_unlock;
- current->backing_dev_info = NULL;
+ /* Handle various SYNC-type writes */
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
+ int error, error2;
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
- iocb->ki_pos = isize;
+ xfs_rw_iunlock(ip, iolock);
+ error = filemap_write_and_wait_range(mapping, pos, end);
+ xfs_rw_ilock(ip, iolock);
- if (iocb->ki_pos > ip->i_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (iocb->ki_pos > ip->i_size)
- ip->i_size = iocb->ki_pos;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error2 = -xfs_file_fsync(file,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+ if (error)
+ ret = error;
+ else if (error2)
+ ret = error2;
}
- error = -ret;
- if (ret <= 0)
- goto out_unlock_internal;
+out_unlock:
+ xfs_aio_write_newsize_update(ip);
+ xfs_rw_iunlock(ip, iolock);
+ return ret;
+}
- XFS_STATS_ADD(xs_write_bytes, ret);
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ long error;
+ loff_t new_size = 0;
+ xfs_flock64_t bf;
+ xfs_inode_t *ip = XFS_I(inode);
+ int cmd = XFS_IOC_RESVSP;
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error2;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
- xfs_iunlock(ip, iolock);
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
+ bf.l_whence = 0;
+ bf.l_start = offset;
+ bf.l_len = len;
- error2 = filemap_write_and_wait_range(mapping, pos, end);
- if (!error)
- error = error2;
- if (need_i_mutex)
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, iolock);
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error2 = -xfs_file_fsync(file,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- if (!error)
- error = error2;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = XFS_IOC_UNRESVSP;
+
+ /* check the new inode size is valid before allocating */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
}
- out_unlock_internal:
- if (ip->i_new_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
- /*
- * If this was a direct or synchronous I/O that failed (such
- * as ENOSPC) then part of the I/O may have been written to
- * disk before the error occured. In this case the on-disk
- * file size may have been adjusted beyond the in-memory file
- * size and now needs to be truncated back.
- */
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+ if (error)
+ goto out_unlock;
+
+ /* Change file size if needed */
+ if (new_size) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
- xfs_iunlock(ip, iolock);
- out_unlock_mutex:
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
- return -error;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
}
+
STATIC int
xfs_file_open(
struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .fallocate = xfs_file_fallocate,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9..f5e2a19 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
#include "xfs_dfrag.h"
#include "xfs_fsops.h"
#include "xfs_vnodeops.h"
+#include "xfs_discard.h"
#include "xfs_quota.h"
#include "xfs_inode_item.h"
#include "xfs_export.h"
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
/*
* Extent size must be a multiple of the appropriate block
- * size, if set at all.
+ * size, if set at all. It must also be smaller than the
+ * maximum extent size supported by the filesystem.
+ *
+ * Also, for non-realtime files, limit the extent size hint to
+ * half the size of the AGs in the filesystem so alignment
+ * doesn't result in extents larger than an AG.
*/
if (fa->fsx_extsize != 0) {
- xfs_extlen_t size;
+ xfs_extlen_t size;
+ xfs_fsblock_t extsize_fsb;
+
+ extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+ if (extsize_fsb > MAXEXTLEN) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
if (XFS_IS_REALTIME_INODE(ip) ||
((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
mp->m_sb.sb_blocklog;
} else {
size = mp->m_sb.sb_blocksize;
+ if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
}
if (fa->fsx_extsize % size) {
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
trace_xfs_file_ioctl(ip);
switch (cmd) {
+ case FITRIM:
+ return xfs_ioc_trim(mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da54403..bd57278 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/falloc.h>
#include <linux/fiemap.h>
#include <linux/slab.h>
@@ -505,61 +504,6 @@ xfs_vn_setattr(
return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
}
-STATIC long
-xfs_vn_fallocate(
- struct inode *inode,
- int mode,
- loff_t offset,
- loff_t len)
-{
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
- int cmd = XFS_IOC_RESVSP;
-
- /* preallocation on directories not yet supported */
- error = -ENODEV;
- if (S_ISDIR(inode->i_mode))
- goto out_error;
-
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- if (mode & FALLOC_FL_PUNCH_HOLE)
- cmd = XFS_IOC_UNRESVSP;
-
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
- if (error)
- goto out_unlock;
- }
-
- error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
- if (error)
- goto out_unlock;
-
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
- }
-
-out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-out_error:
- return error;
-}
-
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
/*
@@ -653,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
- .fallocate = xfs_vn_fallocate,
.fiemap = xfs_vn_fiemap,
};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bd07f73..9731898 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1414,7 +1414,7 @@ xfs_fs_freeze(
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
- return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+ return -xfs_fs_log_dummy(mp);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a02480d..e22f005 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -362,7 +362,7 @@ xfs_quiesce_data(
/* mark the log as covered if needed */
if (xfs_log_need_covered(mp))
- error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+ error2 = xfs_fs_log_dummy(mp);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
int error;
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_log_force(mp, 0);
- xfs_reclaim_inodes(mp, 0);
/* dgc: errors ignored here */
- error = xfs_qm_sync(mp, SYNC_TRYLOCK);
if (mp->m_super->s_frozen == SB_UNFROZEN &&
xfs_log_need_covered(mp))
- error = xfs_fs_log_dummy(mp, 0);
+ error = xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+ xfs_reclaim_inodes(mp, 0);
+ error = xfs_qm_sync(mp, SYNC_TRYLOCK);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092..ee3cee0 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
#include "xfs.h"
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
+#include "xfs_error.h"
static struct ctl_table_header *xfs_table_header;
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
return ret;
}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+ ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ xfs_panic_mask = *valp;
+#ifdef DEBUG
+ xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+ }
+ return ret;
+}
#endif /* CONFIG_PROC_FS */
static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
.data = &xfs_params.panic_mask.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_panic_mask_proc_handler,
.extra1 = &xfs_params.panic_mask.min,
.extra2 = &xfs_params.panic_mask.max
},
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 647af2a..2d0bcb4 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_discard_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
OpenPOWER on IntegriCloud