summaryrefslogtreecommitdiffstats
path: root/sys/kern
diff options
context:
space:
mode:
authormav <mav@FreeBSD.org>2015-04-22 18:11:34 +0000
committermav <mav@FreeBSD.org>2015-04-22 18:11:34 +0000
commit90d0d1c5a567c8cc6751443c046b50fa722d88a5 (patch)
treef55317f65bc3859ba254b5b703052ad324b34bcf /sys/kern
parent2682de6543196abd41241bf0c25fc39c35682cde (diff)
downloadFreeBSD-src-90d0d1c5a567c8cc6751443c046b50fa722d88a5.zip
FreeBSD-src-90d0d1c5a567c8cc6751443c046b50fa722d88a5.tar.gz
Make AIO not allocate pbufs for unmapped I/O, like r281825.
While there, make a few more performance optimizations. On a 40-core system doing many 512-byte AIO reads from an array of raw SSDs, this change removes lock contention inside the pbuf allocator and devfs, and the bottleneck on the single AIO completion taskqueue thread. It improves peak AIO performance from ~600K to ~1.3M IOPS. MFC after: 2 weeks
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/vfs_aio.c206
1 files changed, 105 insertions, 101 deletions
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index c7e602e..0bfcf2d 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -59,10 +59,12 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
+#include <geom/geom.h>
#include <machine/atomic.h>
#include <vm/vm.h>
+#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
@@ -232,9 +234,10 @@ struct aiocblist {
int jobstate; /* (b) job state */
int inputcharge; /* (*) input blockes */
int outputcharge; /* (*) output blockes */
- struct buf *bp; /* (*) private to BIO backend,
- * buffer pointer
- */
+ struct bio *bp; /* (*) BIO backend BIO pointer */
+ struct buf *pbuf; /* (*) BIO backend buffer pointer */
+ struct vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
+ int npages; /* BIO backend number of pages */
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
@@ -243,7 +246,6 @@ struct aiocblist {
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) kernel I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
- struct task biotask; /* (*) private to BIO backend */
uint64_t seqno; /* (*) job number */
int pending; /* (a) number of pending I/O, aio_fsync only */
};
@@ -344,11 +346,10 @@ static void aio_process_mlock(struct aiocblist *aiocbe);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *job,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
-static void aio_physwakeup(struct buf *bp);
+static void aio_physwakeup(struct bio *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void biohelper(void *, int);
static void aio_daemon(void *param);
static void aio_swake_cb(struct socket *, struct sockbuf *);
static int aio_unload(void);
@@ -1294,13 +1295,15 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
struct aiocb *cb;
struct file *fp;
- struct buf *bp;
+ struct bio *bp;
+ struct buf *pbuf;
struct vnode *vp;
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
struct aioliojob *lj;
- int error, ref;
+ int error, ref, unmap, poff;
+ vm_prot_t prot;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
@@ -1309,107 +1312,121 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
return (-1);
vp = fp->f_vnode;
-
- /*
- * If its not a disk, we don't want to return a positive error.
- * It causes the aio code to not fall through to try the thread
- * way when you're talking to a regular file.
- */
- if (!vn_isdisk(vp, &error)) {
- if (error == ENOTBLK)
- return (-1);
- else
- return (error);
- }
-
- if (vp->v_bufobj.bo_bsize == 0)
- return (-1);
-
- if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+ if (vp->v_type != VCHR)
return (-1);
-
- if (cb->aio_nbytes >
- MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+ if (vp->v_bufobj.bo_bsize == 0)
return (-1);
-
- ki = p->p_aioinfo;
- if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+ if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
ref = 0;
csw = devvn_refthread(vp, &dev, &ref);
if (csw == NULL)
return (ENXIO);
+
+ if ((csw->d_flags & D_DISK) == 0) {
+ error = -1;
+ goto unref;
+ }
if (cb->aio_nbytes > dev->si_iosize_max) {
error = -1;
goto unref;
}
- /* Create and build a buffer header for a transfer. */
- bp = (struct buf *)getpbuf(NULL);
- BUF_KERNPROC(bp);
+ ki = p->p_aioinfo;
+ poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+ unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
+ if (unmap) {
+ if (cb->aio_nbytes > MAXPHYS) {
+ error = -1;
+ goto unref;
+ }
+ } else {
+ if (cb->aio_nbytes > MAXPHYS - poff) {
+ error = -1;
+ goto unref;
+ }
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+ error = -1;
+ goto unref;
+ }
+ }
+ aiocbe->bp = bp = g_alloc_bio();
+ if (!unmap) {
+ aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(pbuf);
+ }
AIO_LOCK(ki);
ki->kaio_count++;
- ki->kaio_buffer_count++;
+ if (!unmap)
+ ki->kaio_buffer_count++;
lj = aiocbe->lio;
if (lj)
lj->lioj_count++;
- AIO_UNLOCK(ki);
-
- /*
- * Get a copy of the kva from the physical buffer.
- */
- error = 0;
-
- bp->b_bcount = cb->aio_nbytes;
- bp->b_bufsize = cb->aio_nbytes;
- bp->b_iodone = aio_physwakeup;
- bp->b_saveaddr = bp->b_data;
- bp->b_data = (void *)(uintptr_t)cb->aio_buf;
- bp->b_offset = cb->aio_offset;
- bp->b_iooffset = cb->aio_offset;
- bp->b_blkno = btodb(cb->aio_offset);
- bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-
- /*
- * Bring buffer into kernel space.
- */
- if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
- error = EFAULT;
- goto doerror;
- }
-
- AIO_LOCK(ki);
- aiocbe->bp = bp;
- bp->b_caller1 = (void *)aiocbe;
TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
aiocbe->jobstate = JOBST_JOBQBUF;
cb->_aiocb_private.status = cb->aio_nbytes;
AIO_UNLOCK(ki);
- atomic_add_int(&num_queue_count, 1);
- atomic_add_int(&num_buf_aio, 1);
-
- bp->b_error = 0;
+ bp->bio_length = cb->aio_nbytes;
+ bp->bio_bcount = cb->aio_nbytes;
+ bp->bio_done = aio_physwakeup;
+ bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->bio_offset = cb->aio_offset;
+ bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+ bp->bio_dev = dev;
+ bp->bio_caller1 = (void *)aiocbe;
+
+ prot = VM_PROT_READ;
+ if (cb->aio_lio_opcode == LIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ if ((aiocbe->npages = vm_fault_quick_hold_pages(
+ &curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
+ sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+ if (!unmap) {
+ pmap_qenter((vm_offset_t)pbuf->b_data,
+ aiocbe->pages, aiocbe->npages);
+ bp->bio_data = pbuf->b_data + poff;
+ } else {
+ bp->bio_ma = aiocbe->pages;
+ bp->bio_ma_n = aiocbe->npages;
+ bp->bio_ma_offset = poff;
+ bp->bio_data = unmapped_buf;
+ bp->bio_flags |= BIO_UNMAPPED;
+ }
- TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
+ atomic_add_int(&num_queue_count, 1);
+ if (!unmap)
+ atomic_add_int(&num_buf_aio, 1);
/* Perform transfer. */
- dev_strategy_csw(dev, csw, bp);
+ csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
AIO_LOCK(ki);
+ aiocbe->jobstate = JOBST_NULL;
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
ki->kaio_count--;
- ki->kaio_buffer_count--;
+ if (!unmap)
+ ki->kaio_buffer_count--;
if (lj)
lj->lioj_count--;
- aiocbe->bp = NULL;
AIO_UNLOCK(ki);
- relpbuf(bp, NULL);
+ if (pbuf) {
+ relpbuf(pbuf, NULL);
+ aiocbe->pbuf = NULL;
+ }
+ g_destroy_bio(bp);
+ aiocbe->bp = NULL;
unref:
dev_relthread(dev, ref);
return (error);
@@ -1787,8 +1804,6 @@ no_kqueue:
}
#endif
queueit:
- /* No buffer for daemon I/O. */
- aiocbe->bp = NULL;
atomic_add_int(&num_queue_count, 1);
AIO_LOCK(ki);
@@ -2425,54 +2440,43 @@ sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
return (error);
}
-/*
- * Called from interrupt thread for physio, we should return as fast
- * as possible, so we schedule a biohelper task.
- */
static void
-aio_physwakeup(struct buf *bp)
+aio_physwakeup(struct bio *bp)
{
- struct aiocblist *aiocbe;
-
- aiocbe = (struct aiocblist *)bp->b_caller1;
- taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
-}
-
-/*
- * Task routine to perform heavy tasks, process wakeup, and signals.
- */
-static void
-biohelper(void *context, int pending)
-{
- struct aiocblist *aiocbe = context;
- struct buf *bp;
+ struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
struct proc *userp;
struct kaioinfo *ki;
int nblks;
+ /* Release mapping into kernel space. */
+ if (aiocbe->pbuf) {
+ pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
+ relpbuf(aiocbe->pbuf, NULL);
+ aiocbe->pbuf = NULL;
+ atomic_subtract_int(&num_buf_aio, 1);
+ }
+ vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
+
bp = aiocbe->bp;
+ aiocbe->bp = NULL;
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
- aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
aiocbe->uaiocb._aiocb_private.error = 0;
- if (bp->b_ioflags & BIO_ERROR)
- aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ if (bp->bio_flags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
nblks = btodb(aiocbe->uaiocb.aio_nbytes);
if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
aiocbe->outputcharge += nblks;
else
aiocbe->inputcharge += nblks;
- aiocbe->bp = NULL;
TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
ki->kaio_buffer_count--;
aio_bio_done_notify(userp, aiocbe, DONE_BUF);
AIO_UNLOCK(ki);
- /* Release mapping into kernel space. */
- vunmapbuf(bp);
- relpbuf(bp, NULL);
- atomic_subtract_int(&num_buf_aio, 1);
+ g_destroy_bio(bp);
}
/* syscall - wait for the next completion of an aio request */
OpenPOWER on IntegriCloud