summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authornetchild <netchild@FreeBSD.org>2006-10-15 14:22:14 +0000
committernetchild <netchild@FreeBSD.org>2006-10-15 14:22:14 +0000
commit183bd5a34bb7e6bdd1231cfafcc185bc5eb27d7f (patch)
tree37b5d24e960676debab482dc664dda0e3c7ec546
parent7b28cc9b2faecf0b23e4e115f9cd784767065ea8 (diff)
downloadFreeBSD-src-183bd5a34bb7e6bdd1231cfafcc185bc5eb27d7f.zip
FreeBSD-src-183bd5a34bb7e6bdd1231cfafcc185bc5eb27d7f.tar.gz
MFP4 (with some minor changes):
Implement the linux_io_* syscalls (AIO). They are only enabled if the native AIO code is available (either compiled in to the kernel or as a module) at the time the functions are used. If the AIO stuff is not available there will be a ENOSYS. From the submitter: ---snip--- DESIGN NOTES: 1. Linux permits a process to own multiple AIO queues (distinguished by "context"), but FreeBSD creates only one single AIO queue per process. My code maintains a request queue (STAILQ of queue(3)) per "context", and throws all AIO requests of all contexts owned by a process into the single FreeBSD per-process AIO queue. When the process calls io_destroy(2), io_getevents(2), io_submit(2) and io_cancel(2), my code can pick out requests owned by the specified context from the single FreeBSD per-process AIO queue according to the per-context request queues maintained by my code. 2. The request queue maintained by my code stores contrast information between Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks (struct aiocb). FreeBSD IO control block actually exists in userland memory space, required by FreeBSD native aio_XXXXXX(2). 3. It is quite troubling that the function io_getevents() of libaio-0.3.105 needs to use Linux-specific "struct aio_ring", which is a partial mirror of context in user space. I would rather take the address of context in kernel as the context ID, but the io_getevents() of libaio forces me to take the address of the "ring" in user space as the context ID. To my surprise, one comment line in the file "io_getevents.c" of libaio-0.3.105 reads: Ben will hate me for this REFERENCE: 1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/ (include/linux/aio_abi.h, fs/aio.c) 2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/ (io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2)) 3. Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html The design notes: http://lse.sourceforge.net/io/aionotes.txt 4. The package libaio, both source and binary: http://rpmfind.net/linux/rpm2html/search.php?query=libaio Simple transparent interface to Linux AIO system calls. 5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/ POSIX AIO implementation based on Linux AIO system calls (depending on libaio). ---snip--- Submitted by: Li, Xiao <intron@intron.ac>
-rw-r--r--sys/amd64/linux32/linux.h2
-rw-r--r--sys/amd64/linux32/syscalls.master10
-rw-r--r--sys/compat/linux/linux_aio.c1349
-rw-r--r--sys/compat/linux/linux_aio.h98
-rw-r--r--sys/conf/files.amd641
-rw-r--r--sys/conf/files.i3861
-rw-r--r--sys/i386/linux/linux.h2
-rw-r--r--sys/i386/linux/syscalls.master10
-rw-r--r--sys/kern/vfs_aio.c8
-rw-r--r--sys/modules/aio/Makefile2
-rw-r--r--sys/modules/linux/Makefile12
11 files changed, 1476 insertions, 19 deletions
diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h
index 9c8ed5c..cf4f471 100644
--- a/sys/amd64/linux32/linux.h
+++ b/sys/amd64/linux32/linux.h
@@ -840,4 +840,6 @@ typedef int l_mqd_t;
#define THREADING_FLAGS (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
+#include <compat/linux/linux_aio.h>
+
#endif /* !_AMD64_LINUX_LINUX_H_ */
diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master
index 34f2f0d..6836441 100644
--- a/sys/amd64/linux32/syscalls.master
+++ b/sys/amd64/linux32/syscalls.master
@@ -406,11 +406,11 @@
242 AUE_NULL UNIMPL linux_sched_getaffinity
243 AUE_NULL UNIMPL linux_set_thread_area
244 AUE_NULL UNIMPL linux_get_thread_area
-245 AUE_NULL UNIMPL linux_io_setup
-246 AUE_NULL UNIMPL linux_io_destroy
-247 AUE_NULL UNIMPL linux_io_getevents
-248 AUE_NULL UNIMPL linux_io_submit
-249 AUE_NULL UNIMPL linux_io_cancel
+245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); }
+246 AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); }
+247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); }
+248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); }
+249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); }
250 AUE_NULL STD { int linux_fadvise64(void); }
251 AUE_NULL UNIMPL
252 AUE_EXIT STD { int linux_exit_group(int error_code); }
diff --git a/sys/compat/linux/linux_aio.c b/sys/compat/linux/linux_aio.c
new file mode 100644
index 0000000..b6c6674
--- /dev/null
+++ b/sys/compat/linux/linux_aio.c
@@ -0,0 +1,1349 @@
+/*-
+ * Copyright (c) 2006 Li, Xiao <intron@intron.ac>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/eventhandler.h>
+#include <sys/aio.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <vm/uma.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/linker.h>
+#include <sys/sysctl.h>
+#include <sys/syscall.h>
+#include <sys/sysproto.h>
+
+#ifdef COMPAT_LINUX32
+#include <machine/../linux32/linux.h>
+#include <machine/../linux32/linux32_proto.h>
+#else
+#include <machine/../linux/linux.h>
+#include <machine/../linux/linux_proto.h>
+#endif
+
+#define LINUX_AIO_DEBUG
+
+/*
+ * Linux Kernel Implementation of Asynchronous I/O
+ */
+
+#ifdef LINUX_AIO_DEBUG
+
+/* Print arguments of syscall */
+#define DARGPRINTF(fmt, ...) printf("linux(%ld): %s("fmt")\n", \
+ (long)td->td_proc->p_pid, __func__, __VA_ARGS__)
+/* Print message in syscall function */
+#define DPRINTF(fmt, ...) printf(LMSG("%s(): " fmt), \
+ __func__, __VA_ARGS__)
+/* Print message in non-syscall function, the one more "P" means "private" */
+#define DPPRINTF(fmt, ...) printf("linux(): %s(): " fmt "\n", \
+ __func__, __VA_ARGS__)
+
+#else
+
+#define DARGPRINTF(fmt, ...)
+#define DPRINTF(fmt, ...)
+#define DPPRINTF(fmt, ...)
+
+#endif
+
+/*
+ * DATA STRUCTURE HIERARCHY
+ *
+ * +--------------------+ +--------------------+
+ * context_list ---> | context | ---> | context | ---> ...
+ * SLIST |(owned by a process)| |(owned by a process)|
+ * | | | |
+ * | ctx_req | | ctx_req |
+ * +----|---------------+ +----|---------------+
+ * | STAILQ | STAILQ
+ * v v
+ * +------------+ +------------+
+ * | request | | request |
+ * | | | |
+ * |.req_pbsd | |.req_pbsd |
+ * |.req_porig | |.req_porig |
+ * |.req_linux | |.req_linux |
+ * | | | |
+ * +------------+ +------------+
+ * | |
+ * v v
+ * +------------+ +------------+
+ * | request | | request |
+ * | | | |
+ * |.req_pbsd | |.req_pbsd |
+ * |.req_porig | |.req_porig |
+ * |.req_linux | |.req_linux |
+ * | | | |
+ * +------------+ +------------+
+ * | |
+ * v v
+ * ... ...
+ */
+
+struct linux_aio_context;
+
+struct linux_aio_request {
+ struct aiocb *req_pbsd; /* Userland clone for FreeBSD */
+ struct linux_iocb *req_porig; /* Userland original control block */
+ struct linux_iocb req_linux; /* Copy of original control block */
+ STAILQ_ENTRY(linux_aio_request) req_ctx_entry;
+};
+
+struct linux_aio_context {
+ struct sx ctx_sx;
+ pid_t ctx_pid;
+ struct linux_aio_ring *ctx_pring;
+ int ctx_nreq_max; /* Maximum request number */
+ int ctx_nreq_cur; /* Current request number */
+ STAILQ_HEAD(,linux_aio_request) ctx_req;
+ SLIST_ENTRY(linux_aio_context) ctx_list_entry;
+};
+static SLIST_HEAD(,linux_aio_context) linux_aio_context_list;
+
+#define LINUX_AIO_REQ_HOOK(pctx, preq) { \
+ STAILQ_INSERT_TAIL(&((pctx)->ctx_req), (preq), req_ctx_entry); \
+ (pctx)->ctx_nreq_cur ++; \
+}
+
+#define LINUX_AIO_REQ_UNHOOK(pctx, preq) { \
+ STAILQ_REMOVE(&((pctx)->ctx_req), (preq), linux_aio_request, \
+ req_ctx_entry); \
+ (pctx)->ctx_nreq_cur --; \
+}
+
+#define LINUX_AIO_REQ_FOREACH(pctx, preq) \
+ STAILQ_FOREACH((preq), &((pctx)->ctx_req), req_ctx_entry)
+
+#define LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) \
+ STAILQ_FOREACH_SAFE((preq), &((pctx)->ctx_req), req_ctx_entry, \
+ (ptmpreq))
+
+#define LINUX_AIO_CTX_LOCK(pctx) sx_xlock(&((pctx)->ctx_sx))
+
+#define LINUX_AIO_CTX_UNLOCK(pctx) sx_unlock(&((pctx)->ctx_sx))
+
+#define LINUX_AIO_CTX_HOOK(pctx) \
+ SLIST_INSERT_HEAD(&linux_aio_context_list, (pctx), ctx_list_entry)
+
+#define LINUX_AIO_CTX_UNHOOK(pctx) \
+ SLIST_REMOVE(&linux_aio_context_list, (pctx), \
+ linux_aio_context, ctx_list_entry)
+
+#define LINUX_AIO_CTX_FOREACH(pctx) \
+ SLIST_FOREACH((pctx), &linux_aio_context_list, ctx_list_entry)
+
+#define LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) \
+ SLIST_FOREACH_SAFE((pctx), &linux_aio_context_list, \
+ ctx_list_entry, (ptmpctx))
+
+#define LINUX_AIO_CTX_MATCH(pctx, ctxid, pid) \
+ ((linux_aio_context_t)(pctx)->ctx_pring == (ctxid) \
+ && (pctx)->ctx_pid == (pid))
+
+static struct mtx linux_aio_context_list_mtx;
+
+#define LINUX_AIO_CTX_LIST_LOCK() mtx_lock(&linux_aio_context_list_mtx)
+
+#define LINUX_AIO_CTX_LIST_UNLOCK() mtx_unlock(&linux_aio_context_list_mtx)
+
+/*
+ * The following two macros are substantially identical to the two macros
+ * AIO_(UN)LOCK in /sys/kern/vfs_aio.c. Thus, the mutex much be unlocked
+ * before calling functions of FreeBSD native AIO module.
+ *
+ * XXX
+ * I ASSUME the member "kaio_mtx" is the first element of "struct kaioinfo".
+ */
+#define LINUX_AIO_LOCK(p) { \
+ if ((p)->p_aioinfo == NULL) \
+ p_aio_init_aioinfo(p); \
+ mtx_lock((struct mtx *)((p)->p_aioinfo)); \
+}
+
+#define LINUX_AIO_UNLOCK(p) { \
+ if ((p)->p_aioinfo == NULL) \
+ p_aio_init_aioinfo(p); \
+ mtx_unlock((struct mtx *)((p)->p_aioinfo)); \
+}
+
+static uma_zone_t linux_aio_context_zone, linux_aio_request_zone;
+
+static eventhandler_tag linux_aio_exit_tag;
+
+/*
+ * XXX
+ * Calling external function/variable declared with "static" is DANGEROUS !!!
+ * Compiler may use register to transfer calling arguments for optimization,
+ * which is NOT a normal calling way and can cause kernel crash.
+ */
+
+#define NATIVE_AIO_MODULE_NAME "aio"
+static struct mod_depend native_aio_module_depend = {1, 1, 1};
+static linker_file_t native_aio_module_handle = NULL;
+
+/* Mirror of sysctls in /sys/kern/vfs_aio.c */
+#define NATIVE_AIO_SYSCTL_CAPACITY_PROC "vfs.aio.max_aio_queue_per_proc"
+static int native_aio_capacity_proc;
+#define NATIVE_AIO_SYSCTL_CAPACITY_SYS "vfs.aio.max_aio_queue"
+static int native_aio_capacity_sys;
+
+/* For declaration of aio_aqueue(), defined in /sys/kern/vfs_aio.c */
+struct aioliojob;
+
+/* Functions in /sys/kern/vfs_aio.c, XXX defined with "static" */
+#define GET_INTERNAL_FUNC_POINTER(s) { \
+ * ((caddr_t *) & p_ ## s) = linker_file_lookup_symbol( \
+ native_aio_module_handle, #s, FALSE); \
+ if (p_ ## s == NULL) \
+ break; \
+}
+static void (*p_aio_init_aioinfo) (struct proc *p);
+static int (*p_aio_aqueue) (struct thread *td, struct aiocb *job,
+ struct aioliojob *lio, int type, int osigev);
+
+/* System calls in /sys/kern/vfs_aio.c */
+#define DEFINE_SYSCALL_POINTER_VARIABLE(s) \
+ static int (* p_ ## s) (struct thread *, struct s ## _args *)
+#define GET_SYSCALL_POINTER(s) { \
+ * ((sy_call_t **) & p_ ## s) = sysent[SYS_ ## s].sy_call; \
+ if ((sy_call_t *) p_ ## s == (sy_call_t *)lkmressys) \
+ break; \
+}
+DEFINE_SYSCALL_POINTER_VARIABLE(aio_return);
+DEFINE_SYSCALL_POINTER_VARIABLE(aio_suspend);
+DEFINE_SYSCALL_POINTER_VARIABLE(aio_cancel);
+DEFINE_SYSCALL_POINTER_VARIABLE(aio_error);
+
+static int user_mem_rw_verify(void *p, size_t s)
+{
+ char buf[256];
+ size_t i;
+ int nerr = 0;
+
+ for (i = 0; i < s; i += sizeof(buf)) {
+ /* Verify reading */
+ nerr = copyin((char *)p+i, buf, MIN(sizeof(buf), s-i));
+ if (nerr != 0)
+ break;
+
+ /* Verify writing */
+ nerr = copyout(buf, (char *)p+i, MIN(sizeof(buf), s-i));
+ if (nerr != 0)
+ break;
+ }
+
+ return (nerr);
+}
+
+/* Allocate memory in user space */
+static int user_malloc(struct thread *td, void **pp, size_t s)
+{
+ struct mmap_args mmaparg;
+ int nerr;
+ register_t r;
+
+ r = td->td_retval[0];
+
+ mmaparg.addr = NULL;
+ mmaparg.len = s;
+ mmaparg.prot = PROT_READ | PROT_WRITE;
+ mmaparg.flags = MAP_PRIVATE | MAP_ANON;
+ mmaparg.fd = -1;
+ mmaparg.pad = 0;
+ mmaparg.pos = 0;
+
+ nerr = mmap(td, &mmaparg);
+
+ if (nerr == 0) {
+ *pp = (void *)td->td_retval[0];
+ DPPRINTF("%lu bytes allocated at %p", (unsigned long)s, *pp);
+ }
+
+ td->td_retval[0] = r;
+
+ return (nerr);
+}
+
+/* Free memory in user space */
+static int user_free(struct thread *td, void *p, size_t s)
+{
+ struct munmap_args munmaparg;
+ int nerr;
+ register_t r;
+
+ r = td->td_retval[0];
+
+ munmaparg.addr = p;
+ munmaparg.len = s;
+
+ nerr = munmap(td, &munmaparg);
+
+ td->td_retval[0] = r;
+ DPPRINTF("%lu bytes at %p", (unsigned long)s, p);
+
+ return (nerr);
+}
+
+#ifdef LINUX_AIO_DEBUG
+
+static void linux_aio_dump_freebsd_aiocb(struct aiocb *piocb, int isuserland)
+{
+ struct aiocb localcb, *pcb;
+ int nerr = 0;
+
+ if (isuserland) {
+ nerr = copyin(piocb, &localcb, sizeof(localcb));
+ pcb = &localcb;
+ } else
+ pcb = piocb;
+
+ DPPRINTF("Dump struct aiocb (%p, %s): %s",
+ piocb, (isuserland?"userland":"kernel"),
+ (nerr?"Failure":""));
+ if (!nerr) {
+ DPPRINTF("aio_fildes: %d",
+ pcb->aio_fildes);
+ DPPRINTF("aio_offset: %lu",
+ (unsigned long) pcb->aio_offset);
+ DPPRINTF("aio_buf: %p",
+ pcb->aio_buf);
+ DPPRINTF("aio_nbytes: %lu",
+ (unsigned long) pcb->aio_nbytes);
+ DPPRINTF("aio_lio_opcode: %d",
+ pcb->aio_lio_opcode);
+ DPPRINTF("aio_reqprio: %d",
+ pcb->aio_reqprio);
+ DPPRINTF("aio_sigevent.sigev_notify: %d",
+ pcb->aio_sigevent.sigev_notify);
+ DPPRINTF("aio_sigevent.sigev_signo: %d",
+ pcb->aio_sigevent.sigev_signo);
+ }
+}
+
+#define DUMP_FREEBSD_AIOCB(p, isu) linux_aio_dump_freebsd_aiocb((p), (isu));
+
+#define DUMP_TIMESPEC(f, t ,a) \
+ DPRINTF("%s%ld second + %ld nanosecond%s", \
+ (f), (long)(t)->tv_sec, (long)(t)->tv_nsec, (a));
+
+#else /* ! LINUX_AIO_DEBUG */
+
+#define DUMP_FREEBSD_AIOCB(p, isu)
+#define DUMP_TIMESPEC(f, t, a)
+
+#endif /* LINUX_AIO_DEBUG */
+
+static int iocb_reformat(struct linux_iocb *plnx, struct aiocb *pbsd)
+{
+ int nerr = 0;
+
+ bzero(pbsd, sizeof(*pbsd));
+
+ pbsd->aio_fildes = plnx->aio_fildes; /* File descriptor */
+ pbsd->aio_offset = plnx->aio_offset; /* File offset for I/O */
+ pbsd->aio_buf = (void *)(unsigned long) plnx->aio_buf; /*
+ * User space
+ * I/O buffer
+ */
+ pbsd->aio_nbytes = plnx->aio_nbytes; /* Number of bytes for I/O */
+ switch (plnx->aio_lio_opcode) { /* LIO opcode */
+ case LINUX_IOCB_CMD_PREAD:
+ pbsd->aio_lio_opcode = LIO_READ;
+ break;
+ case LINUX_IOCB_CMD_PWRITE:
+ pbsd->aio_lio_opcode = LIO_WRITE;
+ break;
+ case LINUX_IOCB_CMD_FSYNC:
+ case LINUX_IOCB_CMD_FDSYNC:
+ pbsd->aio_lio_opcode = LIO_SYNC;
+ break;
+#if 0
+ case LINUX_IOCB_CMD_PREADX:
+ break;
+ case LINUX_IOCB_CMD_POLL:
+ break;
+#endif
+ case LINUX_IOCB_CMD_NOOP:
+ pbsd->aio_lio_opcode = LIO_NOP;
+ break;
+ default:
+ nerr = EINVAL;
+ break;
+ }
+ if (nerr != 0) {
+ DPPRINTF("Unsupported aio_lio_opcode: %u",
+ (unsigned)plnx->aio_lio_opcode);
+ return (nerr);
+ }
+ pbsd->aio_reqprio = plnx->aio_reqprio; /* Request priority */
+ pbsd->aio_sigevent.sigev_notify = SIGEV_NONE; /* No signal to deliver */
+ pbsd->aio_sigevent.sigev_signo = 0; /* No signal to deliver */
+
+ return (nerr);
+}
+
+static int link_to_native_aio_module(struct thread *td)
+{
+ int nerr;
+
+ if (native_aio_module_handle != NULL) {
+ /* Linking has been done successfully. */
+ return (0);
+ }
+
+ nerr = linker_reference_module(NATIVE_AIO_MODULE_NAME,
+ &native_aio_module_depend, &native_aio_module_handle);
+ if (nerr)
+ return (nerr);
+
+ do {
+ nerr = EINVAL;
+
+ /* Kernel internal functions */
+ GET_INTERNAL_FUNC_POINTER(aio_init_aioinfo);
+ GET_INTERNAL_FUNC_POINTER(aio_aqueue);
+
+ /* System calls */
+ GET_SYSCALL_POINTER(aio_return);
+ GET_SYSCALL_POINTER(aio_suspend);
+ GET_SYSCALL_POINTER(aio_cancel);
+ GET_SYSCALL_POINTER(aio_error);
+
+ nerr = 0;
+ } while (0);
+
+ if (nerr) {
+ linker_release_module(NULL, NULL, native_aio_module_handle);
+ native_aio_module_handle = NULL;
+
+ printf(LMSG("Unable to link to the native module \""
+ NATIVE_AIO_MODULE_NAME "\"."));
+
+ return (nerr);
+ }
+
+ return (0);
+}
+
+#define LINK_TO_NATIVE_AIO_MODULE() \
+ if (link_to_native_aio_module(td)) { \
+ printf(LMSG("Please load the module \"" \
+ NATIVE_AIO_MODULE_NAME "\"" \
+ "to provide FreeBSD " \
+ "native Asynchronous I/O support.")); \
+ return (ENOSYS); \
+}
+
+static int mirror_native_aio_sysctl(struct thread *td)
+{
+ int nerr = 0;
+ size_t l;
+
+ l = sizeof(native_aio_capacity_proc);
+ nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_PROC,
+ &native_aio_capacity_proc, &l, NULL, 0,
+ NULL ,0);
+ if (nerr)
+ return (nerr);
+
+ l = sizeof(native_aio_capacity_sys);
+ nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_SYS,
+ &native_aio_capacity_sys, &l, NULL, 0,
+ NULL ,0);
+ if (nerr)
+ return (nerr);
+
+ DPRINTF(NATIVE_AIO_SYSCTL_CAPACITY_PROC "=%d, "
+ NATIVE_AIO_SYSCTL_CAPACITY_SYS "=%d",
+ native_aio_capacity_proc,
+ native_aio_capacity_sys);
+
+ return (nerr);
+}
+
+/* Linux system call io_setup(2) */
+int linux_io_setup(struct thread *td, struct linux_io_setup_args *args)
+{
+ struct proc *p;
+ struct linux_aio_ring *pring, ring;
+ struct linux_aio_context *pctx = NULL, *ptmpctx;
+ linux_aio_context_t ctx_id;
+ int nerr = 0, nr, nrall, nq, arg_nr_reqs;
+
+ DARGPRINTF("%u, %p", args->nr_reqs, args->ctxp);
+ LINK_TO_NATIVE_AIO_MODULE();
+ nerr = mirror_native_aio_sysctl(td);
+ if (nerr) {
+ printf(LMSG("linux_io_setup(): Unable to query sysctls "
+ NATIVE_AIO_SYSCTL_CAPACITY_PROC
+ " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS
+ " ."));
+ return (nerr);
+ }
+
+ /* Signed integer is a little safer than unsigned */
+ arg_nr_reqs = args->nr_reqs;
+ if (arg_nr_reqs <= 0)
+ return (EINVAL);
+
+ if (arg_nr_reqs > native_aio_capacity_proc
+ || arg_nr_reqs > native_aio_capacity_sys) {
+ printf(LMSG("linux_io_setup(): Please increase sysctls "
+ NATIVE_AIO_SYSCTL_CAPACITY_PROC
+ " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS
+ " ."));
+ return (ENOMEM);
+ }
+
+ nerr = user_mem_rw_verify(args->ctxp, sizeof(*(args->ctxp)));
+ if (nerr != 0)
+ return (nerr);
+
+ copyin(args->ctxp, &ctx_id, sizeof(ctx_id));
+ if (ctx_id != 0) /* "Not initialized", described by io_setup(2) */
+ return (EINVAL);
+
+ p = td->td_proc;
+
+ /* Get a new "ring" */
+ nerr = user_malloc(td, (void **)&pring, sizeof(*pring));
+ if (nerr != 0)
+ return (nerr);
+
+ /* Get a new context */
+ pctx = uma_zalloc(linux_aio_context_zone, M_WAITOK);
+
+ LINUX_AIO_CTX_LIST_LOCK();
+
+ /* Count request capacity of all contexts belonging to this process */
+ nr = 0;
+ nrall = 0;
+ nq = 0;
+ LINUX_AIO_CTX_FOREACH(ptmpctx) {
+ if (ptmpctx->ctx_pid == p->p_pid) {
+ nr += ptmpctx->ctx_nreq_max;
+ nq ++;
+ }
+ nrall += ptmpctx->ctx_nreq_max;
+ }
+ DPRINTF("%d queues of %d requests totally allocated for this process, "
+ "%d requests' total capacity for the whole system",
+ nq, nr, nrall);
+
+ /* Check whether there are enough resources for requested queue */
+ if (arg_nr_reqs > native_aio_capacity_proc - nr
+ || arg_nr_reqs > native_aio_capacity_sys - nrall) {
+ printf(LMSG("linux_io_setup(): "
+ "Please increase sysctls "
+ NATIVE_AIO_SYSCTL_CAPACITY_PROC
+ " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS " ."
+ "Besides %d queues of %d requests totally "
+ "for this process, and %d requests' queues "
+ "totally for the whole system, "
+ "this Linux application needs one more "
+ "AIO queue of %d requests' capacity."),
+ nq, nr, nrall, arg_nr_reqs);
+ LINUX_AIO_CTX_LIST_UNLOCK();
+ DPRINTF("Free context %p", pctx);
+ uma_zfree(linux_aio_context_zone, pctx);
+ user_free(td, pring, sizeof(*pring));
+ return (ENOMEM);
+ }
+
+ /* Initialize the new context */
+ sx_init(&(pctx->ctx_sx), "linux_aio_context");
+ pctx->ctx_pid = p->p_pid;
+ pctx->ctx_pring = pring;
+ pctx->ctx_nreq_max = arg_nr_reqs;
+ pctx->ctx_nreq_cur = 0;
+ STAILQ_INIT(&(pctx->ctx_req));
+
+ /* Hook the new context to global context list */
+ LINUX_AIO_CTX_HOOK(pctx);
+
+ LINUX_AIO_CTX_LIST_UNLOCK();
+
+ /* Initialize the new "ring" */
+ DPRINTF("initialize the \"ring\" %p", pring);
+ bzero(&ring, sizeof(ring));
+ ring.ring_id = 1;
+ ring.ring_nr = arg_nr_reqs;
+ ring.ring_head = 0;
+ ring.ring_tail = 1;
+ ring.ring_magic = LINUX_AIO_RING_MAGIC;
+ ring.ring_compat_features = LINUX_AIO_RING_COMPAT_FEATURES;
+ ring.ring_incompat_features = LINUX_AIO_RING_INCOMPAT_FEATURES;
+ ring.ring_header_length = sizeof(ring);
+ copyout(&ring, pring, sizeof(ring)); /* It has been hooked before */
+
+ /* Substantial return value */
+ ctx_id = (linux_aio_context_t)pctx->ctx_pring;
+ copyout(&ctx_id, args->ctxp, sizeof(ctx_id));
+ DPRINTF("returned context: %lx -> %p", (unsigned long)ctx_id, pctx);
+
+ return (nerr);
+}
+
+/* Linux system call io_destroy(2) */
+int linux_io_destroy(struct thread *td, struct linux_io_destroy_args *args)
+{
+ int nerr = 0;
+ struct proc *p;
+ struct linux_aio_context *pctx;
+ struct linux_aio_request *preq, *ptmpreq;
+ struct aio_cancel_args cancelargs;
+ struct aio_return_args aioretargs;
+
+ DARGPRINTF("%lx", (unsigned long)args->ctx);
+ LINK_TO_NATIVE_AIO_MODULE();
+
+ p = td->td_proc;
+
+ /*
+ * Locking:
+ *
+ * LINUX_AIO_LOCK(p); <----------------+
+ * ... |
+ * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+ * ... | |
+ * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+ * ... |
+ * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+ * LINUX_AIO_UNLOCK(p); <----------------+ |
+ * ... |
+ * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+ */
+
+ LINUX_AIO_LOCK(p);
+
+ /* Find the context in context list */
+ LINUX_AIO_CTX_LIST_LOCK();
+ LINUX_AIO_CTX_FOREACH(pctx) {
+ if (LINUX_AIO_CTX_MATCH(pctx, args->ctx, p->p_pid))
+ break;
+ }
+ LINUX_AIO_CTX_LIST_UNLOCK();
+
+ /* Unable to find the context */
+ if (pctx == NULL) {
+ LINUX_AIO_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx, pctx);
+
+ /* Unhook the context from context list */
+ DPRINTF("Unhook context %p", pctx);
+ LINUX_AIO_CTX_UNHOOK(pctx);
+
+ LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+ LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+ /* Real cleanup */
+ LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+ DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
+ preq->req_porig, preq->req_pbsd);
+
+ /* Cancel FreeBSD native clone */
+ cancelargs.fd = preq->req_linux.aio_fildes;
+ cancelargs.aiocbp = preq->req_pbsd;
+ p_aio_cancel(td, &cancelargs);
+ DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);
+ if (td->td_retval[0] == AIO_NOTCANCELED)
+ printf(LMSG("linux_io_destroy(): Asynchronous IO "
+ "request (Linux: %p, FreeBSD: %p) "
+ "cannot be cancelled. "
+ "***** Both User Space "
+ "and Kernel Memory Leaked! *****"),
+ preq->req_porig, preq->req_pbsd);
+
+ LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+ if (td->td_retval[0] == AIO_ALLDONE) {
+ aioretargs.aiocbp = preq->req_pbsd;
+ p_aio_return(td, &aioretargs);
+ DPRINTF("aio_return(%p) returned %ld",
+ aioretargs.aiocbp,
+ (long)td->td_retval[0]);
+
+ td->td_retval[0] = AIO_ALLDONE;
+ }
+
+ /* Free user space clone of the request */
+ if (td->td_retval[0] != AIO_NOTCANCELED) /*
+ * XXX How to avoid
+ * memory leak here?
+ */
+ user_free(td, preq->req_pbsd,
+ sizeof(*(preq->req_pbsd)));
+
+ /* Free kernel structure of the request */
+ uma_zfree(linux_aio_request_zone, preq);
+
+ td->td_retval[0] = 0;
+ }
+
+ LINUX_AIO_CTX_UNLOCK(pctx);
+
+ sx_destroy(&(pctx->ctx_sx));
+
+ /* Free the "ring" */
+ DPRINTF("free the \"ring\" %p", pctx->ctx_pring);
+ user_free(td, pctx->ctx_pring, sizeof(*pctx->ctx_pring));
+
+ /* Free destroyed context */
+ uma_zfree(linux_aio_context_zone, pctx);
+
+ return (nerr);
+}
+
+/* Linux system call io_getevents(2) */
+int linux_io_getevents(struct thread *td, struct linux_io_getevents_args *args)
+{
+ int i, j, nerr = 0;
+ struct proc *p;
+ struct l_timespec l_timeout;
+ struct timespec timeout, *u_ptimeout, t1, t2;
+ struct linux_aio_context *pctx;
+ struct linux_aio_request *preq, *ptmpreq;
+ struct linux_io_event evt;
+ struct aio_return_args aioretargs;
+ struct aio_error_args aioerrargs;
+ register_t aio_ret, aio_err;
+ struct aiocb ** u_aiocbp;
+ struct aio_suspend_args aiosusargs;
+
+ DARGPRINTF("%lx, %ld, %ld, %p, %p",
+ (unsigned long) args->ctx_id,
+ (long)args->min_nr, (long)args->nr,
+ args->events, args->timeout);
+ LINK_TO_NATIVE_AIO_MODULE();
+
+ if (args->nr <= 0)
+ return (EINVAL);
+
+ if (args->min_nr < 0)
+ return (EINVAL);
+
+ nerr = user_mem_rw_verify(args->events,
+ sizeof(*(args->events)) * args->nr);
+ if (nerr != 0)
+ return (nerr);
+
+ if (args->timeout != NULL) {
+ nerr = copyin(args->timeout, &l_timeout, sizeof(l_timeout));
+ if (nerr != 0)
+ return (nerr);
+ timeout.tv_sec = l_timeout.tv_sec;
+ timeout.tv_nsec = l_timeout.tv_nsec;
+ DUMP_TIMESPEC("User specified timeout: ", &timeout, "");
+ }
+
+ p = td->td_proc;
+
+ /*
+ * Locking:
+ *
+ * LINUX_AIO_LOCK(p); <----------------+
+ * ... |
+ * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+ * ... | |
+ * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+ * ... |
+ * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+ * LINUX_AIO_UNLOCK(p); <----------------+ |
+ * ... |
+ * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+ */
+
+ LINUX_AIO_LOCK(p);
+
+ /* Find the context in context list */
+ LINUX_AIO_CTX_LIST_LOCK();
+ LINUX_AIO_CTX_FOREACH(pctx) {
+ if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+ break;
+ }
+ LINUX_AIO_CTX_LIST_UNLOCK();
+
+ /* Unable to find the context */
+ if (pctx == NULL) {
+ LINUX_AIO_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+ LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+ LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+ if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+ td->td_retval[0] = 0; /* No queued request */
+ DPRINTF("No request in queue (context: %p) at all, "
+ "return directly", pctx);
+ } else { /* Deal with the request queue */
+ i = 0; /*
+ * This variable's value will be the return value
+ * of linux_io_getevents()
+ */
+
+ nerr = user_malloc(td, (void **)&u_aiocbp,
+ sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+ if (nerr != 0)
+ goto skip_substantial_0;
+
+ nerr = user_malloc(td, (void **)&u_ptimeout,
+ sizeof(*u_ptimeout));
+ if (nerr != 0)
+ goto skip_substantial_1;
+
+ for (i = 0;i < args->nr;) {
+
+ /* Collecting finished requests and waiting for queued requests */
+
+ LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+
+ /* Collect all finished requests */
+
+ if (i >= args->nr) /* Full */
+ break;
+
+ aioerrargs.aiocbp = preq->req_pbsd;
+ p_aio_error(td, &aioerrargs);
+ aio_ret = td->td_retval[0];
+ td->td_retval[0] = 0;
+
+ DPRINTF("aio_error(%p) (Linux: %p) "
+ "returned %ld%s",
+ aioerrargs.aiocbp,
+ preq->req_porig,
+ (long)aio_ret,
+ aio_ret == EINPROGRESS ?
+ "(EINPROGRESS)" : "" );
+
+ if (aio_ret == EINPROGRESS)
+ continue;
+
+ /* Done */
+ LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+ aioretargs.aiocbp = preq->req_pbsd;
+ aio_err = p_aio_return(td, &aioretargs);
+ aio_ret = td->td_retval[0];
+ td->td_retval[0] = 0;
+
+ DPRINTF("aio_return(%p) (Linux: %p) "
+ "returned %ld, errno=%ld",
+ aioretargs.aiocbp,
+ preq->req_porig,
+ (long)aio_ret,
+ (long)aio_err);
+
+ evt.data = preq->req_linux.aio_data;
+ evt.obj = (uint64_t)(unsigned long)
+ preq->req_porig;
+ if (aio_ret >= 0) {
+ /* Normal return (success) */
+ evt.res = aio_ret;
+ } else { /* Error code (failure) */
+ /*
+ * Translate FreeBSD error code
+ * to Linux's
+ */
+ evt.res =
+ p->p_sysent->sv_errtbl[aio_err];
+ }
+ DPRINTF("context %p (Linux: %p): "
+ "io_event.res=%lld",
+ preq->req_pbsd,
+ preq->req_porig,
+ (long long)evt.res);
+ evt.res2 = 0;
+
+ copyout(&evt, &(args->events[i]), sizeof(evt));
+
+ uma_zfree(linux_aio_request_zone, preq);
+
+ i ++;
+ } /* End of collecting all finished requests */
+
+ if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+ /* No request remained in this context */
+ DPRINTF("returning(context %p): "
+ "request queue is empty",
+ pctx);
+ break;
+ }
+
+ if (i >= args->nr) { /* Full */
+ DPRINTF("returning(context %p): user space "
+ "event array is full",
+ pctx);
+ break;
+ }
+
+ if (i >= args->min_nr) {
+ /* Met the minimum requirement */
+ DPRINTF("returning(context %p): "
+ "met the minimum requirement",
+ pctx);
+ break;
+ }
+
+ if (args->timeout != NULL) {
+ if (! timespecisset(&timeout)) { /* Timed out */
+ DPRINTF("returning(context %p): "
+ "no time remaining",
+ pctx);
+ break;
+ }
+ }
+
+ if (args->timeout != NULL) {
+ nanouptime(&t1); /* Time before aio_suspend() */
+ DUMP_TIMESPEC("T1: ", &t1,
+ " (uptime before calling aio_suspend())");
+ }
+
+ /* Prepare arguments for aio_suspend() */
+ j = 0;
+ LINUX_AIO_REQ_FOREACH(pctx, preq) {
+ copyout(&(preq->req_pbsd), &(u_aiocbp[j]),
+ sizeof(preq->req_pbsd));
+ j++;
+ }
+ MPASS(j == pctx->ctx_nreq_cur);
+ aiosusargs.aiocbp = u_aiocbp;
+ aiosusargs.nent = j;
+
+ if (args->timeout != NULL) {
+ copyout(&timeout, u_ptimeout, sizeof(timeout));
+ aiosusargs.timeout = u_ptimeout;
+ DUMP_TIMESPEC("Time remained: ", &timeout, "");
+ } else {
+ aiosusargs.timeout = NULL;
+ }
+
+ aio_err = p_aio_suspend(td, &aiosusargs);
+ DPRINTF("aio_suspend(%p, %d, %p) returned %ld",
+ aiosusargs.aiocbp, aiosusargs.nent,
+ aiosusargs.timeout, (long)aio_err);
+
+ if (args->timeout != NULL) {
+ nanouptime(&t2); /* Time after aio_suspend() */
+ DUMP_TIMESPEC("T2: ", &t2,
+ " (uptime after calling aio_suspend())");
+ timespecsub(&t2, &t1); /*
+ * Time spent by
+ * aio_suspend()
+ */
+ DUMP_TIMESPEC("T_delta: ", &t2,
+ " (time spent by calling aio_suspend())");
+ if (timespeccmp(&t2, &timeout, >=)) {
+ timespecclear(&timeout); /* Timed out */
+ } else {
+ timespecsub(&timeout, &t2);
+ /* Time remaining */
+ }
+ DUMP_TIMESPEC("Time remained: ", &timeout, "");
+ }
+
+ if (aio_err == EAGAIN) { /* Timed out */
+ DPRINTF("returning(context %p): "
+ "timed out after calling aio_suspend()",
+ pctx);
+ break;
+ }
+ } /*
+ * End of collecting finished requests
+ * and waiting for queued requests
+ */
+
+ l_timeout.tv_sec = timeout.tv_sec;
+ l_timeout.tv_nsec = timeout.tv_nsec;
+ copyout(&l_timeout, args->timeout, sizeof(l_timeout));
+ /* No matter whether successfully or not */
+
+ nerr = user_free(td, u_ptimeout, sizeof(*u_ptimeout));
+skip_substantial_1:
+ nerr = user_free(td, u_aiocbp,
+ sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+skip_substantial_0:
+ td->td_retval[0] = i;
+ /* user_free() resets td->td_retval[0] to 0 */
+ DPRINTF("%d requests are unhooked from the context %p", i, pctx);
+ } /* End of dealing with request queue */
+
+ LINUX_AIO_CTX_UNLOCK(pctx);
+
+ return (nerr);
+}
+
+/* Linux system call io_submit(2) */
+int linux_io_submit(struct thread *td, struct linux_io_submit_args *args)
+{
+ int i, nerr = 0;
+ struct proc *p;
+ struct linux_aio_context *pctx;
+ struct linux_aio_request req, *preq;
+ struct linux_iocb *porig;
+ struct aiocb iocb, *piocb;
+
+ DARGPRINTF("%lx, %ld, %p", (unsigned long)args->ctx_id,
+ (long)args->nr, args->iocbpp);
+ LINK_TO_NATIVE_AIO_MODULE();
+
+ if (args->nr <= 0)
+ return (EINVAL);
+
+ p = td->td_proc;
+
+ /*
+ * Locking:
+ *
+ * LINUX_AIO_LOCK(p); <----------------+
+ * ... |
+ * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+ * ... | |
+ * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+ * ... |
+ * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+ * LINUX_AIO_UNLOCK(p); <----------------+ |
+ * ... |
+ * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+ */
+
+ LINUX_AIO_LOCK(p);
+
+ /* Find the context in context list */
+ LINUX_AIO_CTX_LIST_LOCK();
+ LINUX_AIO_CTX_FOREACH(pctx) {
+ if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+ break;
+ }
+ LINUX_AIO_CTX_LIST_UNLOCK();
+
+ /* Unable to find the context */
+ if (pctx == NULL) {
+ LINUX_AIO_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+ LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+ LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+ for (i = 0; pctx->ctx_nreq_cur < pctx->ctx_nreq_max && i < args->nr;
+ i++) {
+ /* Get user space Linux control block */
+ nerr = copyin(&(args->iocbpp[i]), &porig, sizeof(porig));
+ if (nerr != 0)
+ break;
+ nerr = copyin(porig, &(req.req_linux), sizeof(req.req_linux));
+ if (nerr != 0)
+ break;
+
+ /* Create user space FreeBSD control block clone */
+ nerr = iocb_reformat(&(req.req_linux), &iocb);
+ if (nerr != 0)
+ break;
+ nerr = user_malloc(td, (void **)&piocb, sizeof(*piocb));
+ if (nerr != 0)
+ break;
+ nerr = copyout(&iocb, piocb, sizeof(iocb));
+ if (nerr != 0)
+ break;
+ DUMP_FREEBSD_AIOCB(piocb, 1);
+
+ /* Submit user space control block */
+ nerr = p_aio_aqueue(td, piocb, NULL, iocb.aio_lio_opcode, 0);
+ if (nerr != 0) {
+ user_free(td, piocb, sizeof(*piocb));
+ break;
+ }
+
+ req.req_porig = porig;
+ req.req_pbsd = piocb;
+
+ /* Hook request to the context */
+ preq = uma_zalloc(linux_aio_request_zone, M_WAITOK);
+ memcpy(preq, &req, sizeof(req));
+ DPRINTF("Linux IOCB %p (aio_lio_opcode=%u, aio_fildes=%u), "
+ "FreeBSD IOCB %p",
+ preq->req_porig,
+ (unsigned)preq->req_linux.aio_lio_opcode,
+ (unsigned)preq->req_linux.aio_fildes,
+ preq->req_pbsd);
+ LINUX_AIO_REQ_HOOK(pctx, preq);
+ }
+
+ LINUX_AIO_CTX_UNLOCK(pctx);
+
+ if (i > 0) {
+ td->td_retval[0] = i;
+ nerr = 0;
+ }
+
+ if (i == 0 && nerr == 0)
+ nerr = EAGAIN; /* No request is successfully submitted */
+
+ return (nerr);
+}
+
+/* Linux system call io_cancel(2) */
+int linux_io_cancel(struct thread *td, struct linux_io_cancel_args *args)
+{
+ int nerr = 0;
+ struct proc *p;
+ struct linux_iocb lcb;
+ struct linux_aio_context *pctx;
+ struct linux_aio_request *preq;
+ struct linux_io_event evt;
+ struct aio_cancel_args aiocnclargs;
+
+ DARGPRINTF("%lx, %p, %p", (unsigned long)args->ctx_id,
+ args->iocb, args->result);
+ LINK_TO_NATIVE_AIO_MODULE();
+
+ nerr = copyin(args->iocb, &lcb, sizeof(lcb));
+ if (nerr != 0)
+ return (nerr);
+
+ nerr = user_mem_rw_verify(args->result, sizeof(*(args->result)));
+ if (nerr != 0)
+ return (nerr);
+
+ p = td->td_proc;
+
+ /*
+ * Locking:
+ *
+ * LINUX_AIO_LOCK(p); <----------------+
+ * ... |
+ * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+ * ... | |
+ * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+ * ... |
+ * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+ * LINUX_AIO_UNLOCK(p); <----------------+ |
+ * ... |
+ * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+ */
+
+ LINUX_AIO_LOCK(p);
+
+ /* Find the context in context list */
+ LINUX_AIO_CTX_LIST_LOCK();
+ LINUX_AIO_CTX_FOREACH(pctx) {
+ if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+ break;
+ }
+ LINUX_AIO_CTX_LIST_UNLOCK();
+
+ /* Unable to find the context */
+ if (pctx == NULL) {
+ LINUX_AIO_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+ LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+ LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+ LINUX_AIO_REQ_FOREACH(pctx, preq) {
+ if (preq->req_porig == args->iocb
+ && preq->req_linux.aio_key == lcb.aio_key)
+ break;
+ }
+
+ if (preq == NULL) {
+ DPRINTF("Unable to find IO control block %p", args->iocb);
+ nerr = EINVAL;
+ } else { /* Found the request in context */
+ DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
+ preq->req_porig, preq->req_pbsd);
+
+ /* Cancel FreeBSD native clone */
+ aiocnclargs.fd = preq->req_linux.aio_fildes;
+ aiocnclargs.aiocbp = preq->req_pbsd;
+ p_aio_cancel(td, &aiocnclargs);
+ DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);
+
+ if (td->td_retval[0] == AIO_CANCELED) {
+ /* Cancellation succeeded */
+ LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+ evt.data = preq->req_linux.aio_data;
+ evt.obj = (uint64_t)(unsigned long) preq->req_porig;
+ evt.res = p->p_sysent->sv_errtbl[ECANCELED];
+ evt.res2 = 0;
+
+ /* Fill in user space structure linux_io_event */
+ copyout(&evt, args->result, sizeof(evt));
+
+ /* Free user space clone of the request */
+ user_free(td, preq->req_pbsd,
+ sizeof(*(preq->req_pbsd)));
+
+ /* Free kernel structure of the request */
+ uma_zfree(linux_aio_request_zone, preq);
+ } else if (td->td_retval[0] == AIO_ALLDONE) {
+ nerr = EINVAL; /*
+ * This value of Linux 2.6.15
+ * is really confusing !!!
+ */
+ } else { /* AIO_NOTCANCELED */
+ nerr = EAGAIN;
+ }
+
+ td->td_retval[0] = 0;
+ }
+
+ LINUX_AIO_CTX_UNLOCK(pctx);
+
+ return (nerr);
+}
+
+static void linux_aio_proc_rundown(void *arg, struct proc *p)
+{
+ struct linux_aio_context *pctx, *ptmpctx;
+ struct linux_aio_request *preq, *ptmpreq;
+
+ /*
+ * FreeBSD module "aio" can do more essential native cleanup
+ * (i.e. cancelling all queued requests) itself.
+ */
+
+ LINUX_AIO_CTX_LIST_LOCK();
+
+ LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) {
+ if (pctx->ctx_pid == p->p_pid) {
+ LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+ DPPRINTF("Free request %p from context %p "
+ "(ring: %p)",
+ preq, pctx, pctx->ctx_pring);
+ LINUX_AIO_REQ_UNHOOK(pctx, preq);
+ uma_zfree(linux_aio_request_zone, preq);
+ }
+
+ DPPRINTF("Free context %p (ring: %p)",
+ pctx, pctx->ctx_pring);
+
+ /* Unhook it from context list */
+ LINUX_AIO_CTX_UNHOOK(pctx);
+
+ /* Free it really */
+ sx_destroy(&(pctx->ctx_sx));
+ uma_zfree(linux_aio_context_zone, pctx);
+
+ DPPRINTF("The remaining context list is %s",
+ (SLIST_EMPTY(&linux_aio_context_list) ?
+ "empty":"not empty"));
+ }
+ }
+
+ LINUX_AIO_CTX_LIST_UNLOCK();
+}
+
+/*
+ * Module constructor/destructor
+ */
+static int
+linux_aio_modload(struct module *module, int cmd, void *arg)
+{
+ int nerr = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ linux_aio_context_zone = uma_zcreate("LINUXAIOCTX",
+ sizeof(struct linux_aio_context),
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ linux_aio_request_zone = uma_zcreate("LINUXAIOREQ",
+ sizeof(struct linux_aio_request),
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ mtx_init(&linux_aio_context_list_mtx,
+ "linux_aio_context_list", NULL, MTX_DEF);
+ SLIST_INIT(&linux_aio_context_list);
+ linux_aio_exit_tag = EVENTHANDLER_REGISTER(process_exit,
+ linux_aio_proc_rundown,
+ NULL, EVENTHANDLER_PRI_ANY);
+ break;
+ case MOD_UNLOAD:
+ LINUX_AIO_CTX_LIST_LOCK();
+ if (!SLIST_EMPTY(&linux_aio_context_list)) {
+ nerr = EBUSY;
+ LINUX_AIO_CTX_LIST_UNLOCK();
+ break;
+ }
+ EVENTHANDLER_DEREGISTER(process_exit, linux_aio_exit_tag);
+ LINUX_AIO_CTX_LIST_UNLOCK();
+ mtx_destroy(&linux_aio_context_list_mtx);
+ uma_zdestroy(linux_aio_request_zone);
+ uma_zdestroy(linux_aio_context_zone);
+ if (native_aio_module_handle != NULL) {
+ /*
+ * linker_release_module() cannot be used here.
+ * It tries to hold "kld_sx", conflicting against
+ * module_unload().
+ */
+ linker_file_unload(native_aio_module_handle,
+ LINKER_UNLOAD_NORMAL);
+ }
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ nerr = EINVAL;
+ break;
+ }
+ return (nerr);
+}
+
+static moduledata_t linux_aio_mod = {
+ "linuxaio",
+ &linux_aio_modload,
+ NULL
+};
+
+DECLARE_MODULE(linuxaio, linux_aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
diff --git a/sys/compat/linux/linux_aio.h b/sys/compat/linux/linux_aio.h
new file mode 100644
index 0000000..96028a8
--- /dev/null
+++ b/sys/compat/linux/linux_aio.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2006 Li, Xiao <intron@intron.ac>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Linux Kernel Implementation of Asynchronous I/O
+ */
+
+#ifndef _LINUX_AIO_H_
+#define _LINUX_AIO_H_
+
+typedef unsigned long linux_aio_context_t;
+
+enum {
+ LINUX_IOCB_CMD_PREAD = 0,
+ LINUX_IOCB_CMD_PWRITE = 1,
+ LINUX_IOCB_CMD_FSYNC = 2,
+ LINUX_IOCB_CMD_FDSYNC = 3,
+#if 0
+ LINUX_IOCB_CMD_PREADX = 4,
+ LINUX_IOCB_CMD_POLL = 5,
+#endif
+ LINUX_IOCB_CMD_NOOP = 6,
+};
+
+struct linux_io_event {
+ uint64_t data;
+ uint64_t obj;
+ int64_t res;
+ int64_t res2;
+};
+
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+#define LINUX_AIO_PADDED(x,y) x,y
+#elif _BYTE_ORDER == _BIG_ENDIAN
+#define LINUX_AIO_PADDED(x,y) y,x
+#else
+#error Unidentified byte order !!!
+#endif
+
+struct linux_iocb {
+ uint64_t aio_data;
+ uint32_t LINUX_AIO_PADDED(aio_key, aio_reserved1);
+
+ uint16_t aio_lio_opcode;
+ int16_t aio_reqprio;
+ uint32_t aio_fildes;
+
+ uint64_t aio_buf;
+ uint64_t aio_nbytes;
+ int64_t aio_offset;
+
+ uint64_t aio_reserved2; /* TODO: use this for a (struct sigevent *) */
+ uint64_t aio_reserved3;
+
+};
+
+/* User space context information structure */
+struct linux_aio_ring {
+ l_uint ring_id;
+ l_uint ring_nr;
+ l_uint ring_head;
+ l_uint ring_tail;
+#define LINUX_AIO_RING_MAGIC 0xa10a10a1
+ l_uint ring_magic;
+#define LINUX_AIO_RING_COMPAT_FEATURES 1
+ l_uint ring_compat_features;
+#define LINUX_AIO_RING_INCOMPAT_FEATURES 0
+ l_uint ring_incompat_features;
+ l_uint ring_header_length; /* Size of this structure */
+
+ struct linux_io_event ring_io_events[0];
+};
+
+#endif /* !_LINUX_AIO_H_ */
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 15c71b4..47d416f 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -240,6 +240,7 @@ amd64/linux32/linux32_locore.s optional compat_linux32 \
amd64/linux32/linux32_machdep.c optional compat_linux32
amd64/linux32/linux32_sysent.c optional compat_linux32
amd64/linux32/linux32_sysvec.c optional compat_linux32
+compat/linux/linux_aio.c optional compat_linux32
compat/linux/linux_emul.c optional compat_linux32
compat/linux/linux_file.c optional compat_linux32
compat/linux/linux_futex.c optional compat_linux32
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 11419d0..04a5080 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -85,6 +85,7 @@ rr232x_lib.o optional rr232x \
#
compat/linprocfs/linprocfs.c optional linprocfs
compat/linsysfs/linsysfs.c optional linsysfs
+compat/linux/linux_aio.c optional compat_linux
compat/linux/linux_emul.c optional compat_linux
compat/linux/linux_file.c optional compat_linux
compat/linux/linux_futex.c optional compat_linux
diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h
index 9ad9353..1ca7381 100644
--- a/sys/i386/linux/linux.h
+++ b/sys/i386/linux/linux.h
@@ -803,4 +803,6 @@ typedef int l_mqd_t;
#define THREADING_FLAGS (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
+#include <compat/linux/linux_aio.h>
+
#endif /* !_I386_LINUX_LINUX_H_ */
diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master
index 61ba1d2..6b00584 100644
--- a/sys/i386/linux/syscalls.master
+++ b/sys/i386/linux/syscalls.master
@@ -409,11 +409,11 @@
242 AUE_NULL UNIMPL linux_sched_getaffinity
243 AUE_NULL STD { int linux_set_thread_area(struct l_user_desc *desc); }
244 AUE_NULL STD { int linux_get_thread_area(struct l_user_desc *desc); }
-245 AUE_NULL UNIMPL linux_io_setup
-246 AUE_NULL UNIMPL linux_io_destroy
-247 AUE_NULL UNIMPL linux_io_getevents
-248 AUE_NULL UNIMPL linux_io_submit
-249 AUE_NULL UNIMPL linux_io_cancel
+245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); }
+246 AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); }
+247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); }
+248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); }
+249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); }
250 AUE_NULL STD { int linux_fadvise64(void); }
251 AUE_NULL UNIMPL
252 AUE_EXIT STD { int linux_exit_group(int error_code); }
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 49f18d9..a9b6b6e 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -313,12 +313,12 @@ static struct mtx aio_sock_mtx;
static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
-static void aio_init_aioinfo(struct proc *p);
+void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(int *);
-static int aio_aqueue(struct thread *td, struct aiocb *job,
+int aio_aqueue(struct thread *td, struct aiocb *job,
struct aioliojob *lio, int type, int osigev);
static void aio_physwakeup(struct buf *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
@@ -498,7 +498,7 @@ aio_unload(void)
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
-static void
+void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
@@ -1322,7 +1322,7 @@ aio_swake_cb(struct socket *so, struct sockbuf *sb)
* Queue a new AIO request. Choosing either the threaded or direct physio VCHR
* technique is done in this code.
*/
-static int
+int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
int type, int oldsigev)
{
diff --git a/sys/modules/aio/Makefile b/sys/modules/aio/Makefile
index 4136135..c73f651 100644
--- a/sys/modules/aio/Makefile
+++ b/sys/modules/aio/Makefile
@@ -5,4 +5,6 @@
KMOD= aio
SRCS= vfs_aio.c opt_vfs_aio.h vnode_if.h
+EXPORT_SYMS= aio_init_aioinfo aio_aqueue
+
.include <bsd.kmod.mk>
diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile
index db8ea9c..ca6b467 100644
--- a/sys/modules/linux/Makefile
+++ b/sys/modules/linux/Makefile
@@ -8,11 +8,13 @@ CFLAGS+=-DCOMPAT_IA32 -DCOMPAT_LINUX32
.PATH: ${.CURDIR}/../../compat/linux ${.CURDIR}/../../${MACHINE_ARCH}/linux${SFX}
KMOD= linux
-SRCS= linux${SFX}_dummy.c linux_emul.c linux_file.c linux_futex.c linux_getcwd.c linux_ioctl.c \
- linux_ipc.c linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
- linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c linux${SFX}_sysvec.c \
- linux_uid16.c linux_util.c linux_time.c opt_inet6.h opt_mac.h opt_compat.h opt_posix.h \
- vnode_if.h device_if.h bus_if.h
+SRCS= linux_aio.c linux${SFX}_dummy.c linux_emul.c linux_file.c \
+ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
+ linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
+ linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
+ linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \
+ opt_inet6.h opt_mac.h opt_compat.h opt_posix.h vnode_if.h \
+ device_if.h bus_if.h
OBJS= linux${SFX}_locore.o
.if ${MACHINE_ARCH} == "i386"
OpenPOWER on IntegriCloud