diff options
-rw-r--r-- | sys/amd64/linux32/linux.h | 2 | ||||
-rw-r--r-- | sys/amd64/linux32/syscalls.master | 10 | ||||
-rw-r--r-- | sys/compat/linux/linux_aio.c | 1349 | ||||
-rw-r--r-- | sys/compat/linux/linux_aio.h | 98 | ||||
-rw-r--r-- | sys/conf/files.amd64 | 1 | ||||
-rw-r--r-- | sys/conf/files.i386 | 1 | ||||
-rw-r--r-- | sys/i386/linux/linux.h | 2 | ||||
-rw-r--r-- | sys/i386/linux/syscalls.master | 10 | ||||
-rw-r--r-- | sys/kern/vfs_aio.c | 8 | ||||
-rw-r--r-- | sys/modules/aio/Makefile | 2 | ||||
-rw-r--r-- | sys/modules/linux/Makefile | 12 |
11 files changed, 1476 insertions, 19 deletions
diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index 9c8ed5c..cf4f471 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -840,4 +840,6 @@ typedef int l_mqd_t; #define THREADING_FLAGS (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND) +#include <compat/linux/linux_aio.h> + #endif /* !_AMD64_LINUX_LINUX_H_ */ diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master index 34f2f0d..6836441 100644 --- a/sys/amd64/linux32/syscalls.master +++ b/sys/amd64/linux32/syscalls.master @@ -406,11 +406,11 @@ 242 AUE_NULL UNIMPL linux_sched_getaffinity 243 AUE_NULL UNIMPL linux_set_thread_area 244 AUE_NULL UNIMPL linux_get_thread_area -245 AUE_NULL UNIMPL linux_io_setup -246 AUE_NULL UNIMPL linux_io_destroy -247 AUE_NULL UNIMPL linux_io_getevents -248 AUE_NULL UNIMPL linux_io_submit -249 AUE_NULL UNIMPL linux_io_cancel +245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); } +246 AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); } +247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); } +248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); } +249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); } 250 AUE_NULL STD { int linux_fadvise64(void); } 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } diff --git a/sys/compat/linux/linux_aio.c b/sys/compat/linux/linux_aio.c new file mode 100644 index 0000000..b6c6674 --- /dev/null +++ b/sys/compat/linux/linux_aio.c @@ -0,0 +1,1349 @@ +/*- + * Copyright (c) 2006 Li, Xiao <intron@intron.ac>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/eventhandler.h> +#include <sys/aio.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/queue.h> +#include <vm/uma.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/linker.h> +#include <sys/sysctl.h> +#include <sys/syscall.h> +#include <sys/sysproto.h> + +#ifdef COMPAT_LINUX32 +#include <machine/../linux32/linux.h> +#include <machine/../linux32/linux32_proto.h> +#else +#include <machine/../linux/linux.h> +#include <machine/../linux/linux_proto.h> +#endif + +#define LINUX_AIO_DEBUG + +/* + * Linux Kernel Implementation of Asynchronous I/O + */ + +#ifdef LINUX_AIO_DEBUG + +/* Print arguments of syscall */ +#define DARGPRINTF(fmt, ...) printf("linux(%ld): %s("fmt")\n", \ + (long)td->td_proc->p_pid, __func__, __VA_ARGS__) +/* Print message in syscall function */ +#define DPRINTF(fmt, ...) printf(LMSG("%s(): " fmt), \ + __func__, __VA_ARGS__) +/* Print message in non-syscall function, the one more "P" means "private" */ +#define DPPRINTF(fmt, ...) printf("linux(): %s(): " fmt "\n", \ + __func__, __VA_ARGS__) + +#else + +#define DARGPRINTF(fmt, ...) +#define DPRINTF(fmt, ...) +#define DPPRINTF(fmt, ...) + +#endif + +/* + * DATA STRUCTURE HIERARCHY + * + * +--------------------+ +--------------------+ + * context_list ---> | context | ---> | context | ---> ... + * SLIST |(owned by a process)| |(owned by a process)| + * | | | | + * | ctx_req | | ctx_req | + * +----|---------------+ +----|---------------+ + * | STAILQ | STAILQ + * v v + * +------------+ +------------+ + * | request | | request | + * | | | | + * |.req_pbsd | |.req_pbsd | + * |.req_porig | |.req_porig | + * |.req_linux | |.req_linux | + * | | | | + * +------------+ +------------+ + * | | + * v v + * +------------+ +------------+ + * | request | | request | + * | | | | + * |.req_pbsd | |.req_pbsd | + * |.req_porig | |.req_porig | + * |.req_linux | |.req_linux | + * | | | | + * +------------+ +------------+ + * | | + * v v + * ... ... + */ + +struct linux_aio_context; + +struct linux_aio_request { + struct aiocb *req_pbsd; /* Userland clone for FreeBSD */ + struct linux_iocb *req_porig; /* Userland original control block */ + struct linux_iocb req_linux; /* Copy of original control block */ + STAILQ_ENTRY(linux_aio_request) req_ctx_entry; +}; + +struct linux_aio_context { + struct sx ctx_sx; + pid_t ctx_pid; + struct linux_aio_ring *ctx_pring; + int ctx_nreq_max; /* Maximum request number */ + int ctx_nreq_cur; /* Current request number */ + STAILQ_HEAD(,linux_aio_request) ctx_req; + SLIST_ENTRY(linux_aio_context) ctx_list_entry; +}; +static SLIST_HEAD(,linux_aio_context) linux_aio_context_list; + +#define LINUX_AIO_REQ_HOOK(pctx, preq) { \ + STAILQ_INSERT_TAIL(&((pctx)->ctx_req), (preq), req_ctx_entry); \ + (pctx)->ctx_nreq_cur ++; \ +} + +#define LINUX_AIO_REQ_UNHOOK(pctx, preq) { \ + STAILQ_REMOVE(&((pctx)->ctx_req), (preq), linux_aio_request, \ + req_ctx_entry); \ + (pctx)->ctx_nreq_cur --; \ +} + +#define LINUX_AIO_REQ_FOREACH(pctx, preq) \ + STAILQ_FOREACH((preq), &((pctx)->ctx_req), req_ctx_entry) + +#define LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) \ + STAILQ_FOREACH_SAFE((preq), &((pctx)->ctx_req), req_ctx_entry, \ + (ptmpreq)) + +#define LINUX_AIO_CTX_LOCK(pctx) sx_xlock(&((pctx)->ctx_sx)) + +#define LINUX_AIO_CTX_UNLOCK(pctx) sx_unlock(&((pctx)->ctx_sx)) + +#define LINUX_AIO_CTX_HOOK(pctx) \ + SLIST_INSERT_HEAD(&linux_aio_context_list, (pctx), ctx_list_entry) + +#define LINUX_AIO_CTX_UNHOOK(pctx) \ + SLIST_REMOVE(&linux_aio_context_list, (pctx), \ + linux_aio_context, ctx_list_entry) + +#define LINUX_AIO_CTX_FOREACH(pctx) \ + SLIST_FOREACH((pctx), &linux_aio_context_list, ctx_list_entry) + +#define LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) \ + SLIST_FOREACH_SAFE((pctx), &linux_aio_context_list, \ + ctx_list_entry, (ptmpctx)) + +#define LINUX_AIO_CTX_MATCH(pctx, ctxid, pid) \ + ((linux_aio_context_t)(pctx)->ctx_pring == (ctxid) \ + && (pctx)->ctx_pid == (pid)) + +static struct mtx linux_aio_context_list_mtx; + +#define LINUX_AIO_CTX_LIST_LOCK() mtx_lock(&linux_aio_context_list_mtx) + +#define LINUX_AIO_CTX_LIST_UNLOCK() mtx_unlock(&linux_aio_context_list_mtx) + +/* + * The following two macros are substantially identical to the two macros + * AIO_(UN)LOCK in /sys/kern/vfs_aio.c. Thus, the mutex much be unlocked + * before calling functions of FreeBSD native AIO module. + * + * XXX + * I ASSUME the member "kaio_mtx" is the first element of "struct kaioinfo". + */ +#define LINUX_AIO_LOCK(p) { \ + if ((p)->p_aioinfo == NULL) \ + p_aio_init_aioinfo(p); \ + mtx_lock((struct mtx *)((p)->p_aioinfo)); \ +} + +#define LINUX_AIO_UNLOCK(p) { \ + if ((p)->p_aioinfo == NULL) \ + p_aio_init_aioinfo(p); \ + mtx_unlock((struct mtx *)((p)->p_aioinfo)); \ +} + +static uma_zone_t linux_aio_context_zone, linux_aio_request_zone; + +static eventhandler_tag linux_aio_exit_tag; + +/* + * XXX + * Calling external function/variable declared with "static" is DANGEROUS !!! + * Compiler may use register to transfer calling arguments for optimization, + * which is NOT a normal calling way and can cause kernel crash. + */ + +#define NATIVE_AIO_MODULE_NAME "aio" +static struct mod_depend native_aio_module_depend = {1, 1, 1}; +static linker_file_t native_aio_module_handle = NULL; + +/* Mirror of sysctls in /sys/kern/vfs_aio.c */ +#define NATIVE_AIO_SYSCTL_CAPACITY_PROC "vfs.aio.max_aio_queue_per_proc" +static int native_aio_capacity_proc; +#define NATIVE_AIO_SYSCTL_CAPACITY_SYS "vfs.aio.max_aio_queue" +static int native_aio_capacity_sys; + +/* For declaration of aio_aqueue(), defined in /sys/kern/vfs_aio.c */ +struct aioliojob; + +/* Functions in /sys/kern/vfs_aio.c, XXX defined with "static" */ +#define GET_INTERNAL_FUNC_POINTER(s) { \ + * ((caddr_t *) & p_ ## s) = linker_file_lookup_symbol( \ + native_aio_module_handle, #s, FALSE); \ + if (p_ ## s == NULL) \ + break; \ +} +static void (*p_aio_init_aioinfo) (struct proc *p); +static int (*p_aio_aqueue) (struct thread *td, struct aiocb *job, + struct aioliojob *lio, int type, int osigev); + +/* System calls in /sys/kern/vfs_aio.c */ +#define DEFINE_SYSCALL_POINTER_VARIABLE(s) \ + static int (* p_ ## s) (struct thread *, struct s ## _args *) +#define GET_SYSCALL_POINTER(s) { \ + * ((sy_call_t **) & p_ ## s) = sysent[SYS_ ## s].sy_call; \ + if ((sy_call_t *) p_ ## s == (sy_call_t *)lkmressys) \ + break; \ +} +DEFINE_SYSCALL_POINTER_VARIABLE(aio_return); +DEFINE_SYSCALL_POINTER_VARIABLE(aio_suspend); +DEFINE_SYSCALL_POINTER_VARIABLE(aio_cancel); +DEFINE_SYSCALL_POINTER_VARIABLE(aio_error); + +static int user_mem_rw_verify(void *p, size_t s) +{ + char buf[256]; + size_t i; + int nerr = 0; + + for (i = 0; i < s; i += sizeof(buf)) { + /* Verify reading */ + nerr = copyin((char *)p+i, buf, MIN(sizeof(buf), s-i)); + if (nerr != 0) + break; + + /* Verify writing */ + nerr = copyout(buf, (char *)p+i, MIN(sizeof(buf), s-i)); + if (nerr != 0) + break; + } + + return (nerr); +} + +/* Allocate memory in user space */ +static int user_malloc(struct thread *td, void **pp, size_t s) +{ + struct mmap_args mmaparg; + int nerr; + register_t r; + + r = td->td_retval[0]; + + mmaparg.addr = NULL; + mmaparg.len = s; + mmaparg.prot = PROT_READ | PROT_WRITE; + mmaparg.flags = MAP_PRIVATE | MAP_ANON; + mmaparg.fd = -1; + mmaparg.pad = 0; + mmaparg.pos = 0; + + nerr = mmap(td, &mmaparg); + + if (nerr == 0) { + *pp = (void *)td->td_retval[0]; + DPPRINTF("%lu bytes allocated at %p", (unsigned long)s, *pp); + } + + td->td_retval[0] = r; + + return (nerr); +} + +/* Free memory in user space */ +static int user_free(struct thread *td, void *p, size_t s) +{ + struct munmap_args munmaparg; + int nerr; + register_t r; + + r = td->td_retval[0]; + + munmaparg.addr = p; + munmaparg.len = s; + + nerr = munmap(td, &munmaparg); + + td->td_retval[0] = r; + DPPRINTF("%lu bytes at %p", (unsigned long)s, p); + + return (nerr); +} + +#ifdef LINUX_AIO_DEBUG + +static void linux_aio_dump_freebsd_aiocb(struct aiocb *piocb, int isuserland) +{ + struct aiocb localcb, *pcb; + int nerr = 0; + + if (isuserland) { + nerr = copyin(piocb, &localcb, sizeof(localcb)); + pcb = &localcb; + } else + pcb = piocb; + + DPPRINTF("Dump struct aiocb (%p, %s): %s", + piocb, (isuserland?"userland":"kernel"), + (nerr?"Failure":"")); + if (!nerr) { + DPPRINTF("aio_fildes: %d", + pcb->aio_fildes); + DPPRINTF("aio_offset: %lu", + (unsigned long) pcb->aio_offset); + DPPRINTF("aio_buf: %p", + pcb->aio_buf); + DPPRINTF("aio_nbytes: %lu", + (unsigned long) pcb->aio_nbytes); + DPPRINTF("aio_lio_opcode: %d", + pcb->aio_lio_opcode); + DPPRINTF("aio_reqprio: %d", + pcb->aio_reqprio); + DPPRINTF("aio_sigevent.sigev_notify: %d", + pcb->aio_sigevent.sigev_notify); + DPPRINTF("aio_sigevent.sigev_signo: %d", + pcb->aio_sigevent.sigev_signo); + } +} + +#define DUMP_FREEBSD_AIOCB(p, isu) linux_aio_dump_freebsd_aiocb((p), (isu)); + +#define DUMP_TIMESPEC(f, t ,a) \ + DPRINTF("%s%ld second + %ld nanosecond%s", \ + (f), (long)(t)->tv_sec, (long)(t)->tv_nsec, (a)); + +#else /* ! LINUX_AIO_DEBUG */ + +#define DUMP_FREEBSD_AIOCB(p, isu) +#define DUMP_TIMESPEC(f, t, a) + +#endif /* LINUX_AIO_DEBUG */ + +static int iocb_reformat(struct linux_iocb *plnx, struct aiocb *pbsd) +{ + int nerr = 0; + + bzero(pbsd, sizeof(*pbsd)); + + pbsd->aio_fildes = plnx->aio_fildes; /* File descriptor */ + pbsd->aio_offset = plnx->aio_offset; /* File offset for I/O */ + pbsd->aio_buf = (void *)(unsigned long) plnx->aio_buf; /* + * User space + * I/O buffer + */ + pbsd->aio_nbytes = plnx->aio_nbytes; /* Number of bytes for I/O */ + switch (plnx->aio_lio_opcode) { /* LIO opcode */ + case LINUX_IOCB_CMD_PREAD: + pbsd->aio_lio_opcode = LIO_READ; + break; + case LINUX_IOCB_CMD_PWRITE: + pbsd->aio_lio_opcode = LIO_WRITE; + break; + case LINUX_IOCB_CMD_FSYNC: + case LINUX_IOCB_CMD_FDSYNC: + pbsd->aio_lio_opcode = LIO_SYNC; + break; +#if 0 + case LINUX_IOCB_CMD_PREADX: + break; + case LINUX_IOCB_CMD_POLL: + break; +#endif + case LINUX_IOCB_CMD_NOOP: + pbsd->aio_lio_opcode = LIO_NOP; + break; + default: + nerr = EINVAL; + break; + } + if (nerr != 0) { + DPPRINTF("Unsupported aio_lio_opcode: %u", + (unsigned)plnx->aio_lio_opcode); + return (nerr); + } + pbsd->aio_reqprio = plnx->aio_reqprio; /* Request priority */ + pbsd->aio_sigevent.sigev_notify = SIGEV_NONE; /* No signal to deliver */ + pbsd->aio_sigevent.sigev_signo = 0; /* No signal to deliver */ + + return (nerr); +} + +static int link_to_native_aio_module(struct thread *td) +{ + int nerr; + + if (native_aio_module_handle != NULL) { + /* Linking has been done successfully. */ + return (0); + } + + nerr = linker_reference_module(NATIVE_AIO_MODULE_NAME, + &native_aio_module_depend, &native_aio_module_handle); + if (nerr) + return (nerr); + + do { + nerr = EINVAL; + + /* Kernel internal functions */ + GET_INTERNAL_FUNC_POINTER(aio_init_aioinfo); + GET_INTERNAL_FUNC_POINTER(aio_aqueue); + + /* System calls */ + GET_SYSCALL_POINTER(aio_return); + GET_SYSCALL_POINTER(aio_suspend); + GET_SYSCALL_POINTER(aio_cancel); + GET_SYSCALL_POINTER(aio_error); + + nerr = 0; + } while (0); + + if (nerr) { + linker_release_module(NULL, NULL, native_aio_module_handle); + native_aio_module_handle = NULL; + + printf(LMSG("Unable to link to the native module \"" + NATIVE_AIO_MODULE_NAME "\".")); + + return (nerr); + } + + return (0); +} + +#define LINK_TO_NATIVE_AIO_MODULE() \ + if (link_to_native_aio_module(td)) { \ + printf(LMSG("Please load the module \"" \ + NATIVE_AIO_MODULE_NAME "\"" \ + "to provide FreeBSD " \ + "native Asynchronous I/O support.")); \ + return (ENOSYS); \ +} + +static int mirror_native_aio_sysctl(struct thread *td) +{ + int nerr = 0; + size_t l; + + l = sizeof(native_aio_capacity_proc); + nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_PROC, + &native_aio_capacity_proc, &l, NULL, 0, + NULL ,0); + if (nerr) + return (nerr); + + l = sizeof(native_aio_capacity_sys); + nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_SYS, + &native_aio_capacity_sys, &l, NULL, 0, + NULL ,0); + if (nerr) + return (nerr); + + DPRINTF(NATIVE_AIO_SYSCTL_CAPACITY_PROC "=%d, " + NATIVE_AIO_SYSCTL_CAPACITY_SYS "=%d", + native_aio_capacity_proc, + native_aio_capacity_sys); + + return (nerr); +} + +/* Linux system call io_setup(2) */ +int linux_io_setup(struct thread *td, struct linux_io_setup_args *args) +{ + struct proc *p; + struct linux_aio_ring *pring, ring; + struct linux_aio_context *pctx = NULL, *ptmpctx; + linux_aio_context_t ctx_id; + int nerr = 0, nr, nrall, nq, arg_nr_reqs; + + DARGPRINTF("%u, %p", args->nr_reqs, args->ctxp); + LINK_TO_NATIVE_AIO_MODULE(); + nerr = mirror_native_aio_sysctl(td); + if (nerr) { + printf(LMSG("linux_io_setup(): Unable to query sysctls " + NATIVE_AIO_SYSCTL_CAPACITY_PROC + " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS + " .")); + return (nerr); + } + + /* Signed integer is a little safer than unsigned */ + arg_nr_reqs = args->nr_reqs; + if (arg_nr_reqs <= 0) + return (EINVAL); + + if (arg_nr_reqs > native_aio_capacity_proc + || arg_nr_reqs > native_aio_capacity_sys) { + printf(LMSG("linux_io_setup(): Please increase sysctls " + NATIVE_AIO_SYSCTL_CAPACITY_PROC + " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS + " .")); + return (ENOMEM); + } + + nerr = user_mem_rw_verify(args->ctxp, sizeof(*(args->ctxp))); + if (nerr != 0) + return (nerr); + + copyin(args->ctxp, &ctx_id, sizeof(ctx_id)); + if (ctx_id != 0) /* "Not initialized", described by io_setup(2) */ + return (EINVAL); + + p = td->td_proc; + + /* Get a new "ring" */ + nerr = user_malloc(td, (void **)&pring, sizeof(*pring)); + if (nerr != 0) + return (nerr); + + /* Get a new context */ + pctx = uma_zalloc(linux_aio_context_zone, M_WAITOK); + + LINUX_AIO_CTX_LIST_LOCK(); + + /* Count request capacity of all contexts belonging to this process */ + nr = 0; + nrall = 0; + nq = 0; + LINUX_AIO_CTX_FOREACH(ptmpctx) { + if (ptmpctx->ctx_pid == p->p_pid) { + nr += ptmpctx->ctx_nreq_max; + nq ++; + } + nrall += ptmpctx->ctx_nreq_max; + } + DPRINTF("%d queues of %d requests totally allocated for this process, " + "%d requests' total capacity for the whole system", + nq, nr, nrall); + + /* Check whether there are enough resources for requested queue */ + if (arg_nr_reqs > native_aio_capacity_proc - nr + || arg_nr_reqs > native_aio_capacity_sys - nrall) { + printf(LMSG("linux_io_setup(): " + "Please increase sysctls " + NATIVE_AIO_SYSCTL_CAPACITY_PROC + " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS " ." + "Besides %d queues of %d requests totally " + "for this process, and %d requests' queues " + "totally for the whole system, " + "this Linux application needs one more " + "AIO queue of %d requests' capacity."), + nq, nr, nrall, arg_nr_reqs); + LINUX_AIO_CTX_LIST_UNLOCK(); + DPRINTF("Free context %p", pctx); + uma_zfree(linux_aio_context_zone, pctx); + user_free(td, pring, sizeof(*pring)); + return (ENOMEM); + } + + /* Initialize the new context */ + sx_init(&(pctx->ctx_sx), "linux_aio_context"); + pctx->ctx_pid = p->p_pid; + pctx->ctx_pring = pring; + pctx->ctx_nreq_max = arg_nr_reqs; + pctx->ctx_nreq_cur = 0; + STAILQ_INIT(&(pctx->ctx_req)); + + /* Hook the new context to global context list */ + LINUX_AIO_CTX_HOOK(pctx); + + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Initialize the new "ring" */ + DPRINTF("initialize the \"ring\" %p", pring); + bzero(&ring, sizeof(ring)); + ring.ring_id = 1; + ring.ring_nr = arg_nr_reqs; + ring.ring_head = 0; + ring.ring_tail = 1; + ring.ring_magic = LINUX_AIO_RING_MAGIC; + ring.ring_compat_features = LINUX_AIO_RING_COMPAT_FEATURES; + ring.ring_incompat_features = LINUX_AIO_RING_INCOMPAT_FEATURES; + ring.ring_header_length = sizeof(ring); + copyout(&ring, pring, sizeof(ring)); /* It has been hooked before */ + + /* Substantial return value */ + ctx_id = (linux_aio_context_t)pctx->ctx_pring; + copyout(&ctx_id, args->ctxp, sizeof(ctx_id)); + DPRINTF("returned context: %lx -> %p", (unsigned long)ctx_id, pctx); + + return (nerr); +} + +/* Linux system call io_destroy(2) */ +int linux_io_destroy(struct thread *td, struct linux_io_destroy_args *args) +{ + int nerr = 0; + struct proc *p; + struct linux_aio_context *pctx; + struct linux_aio_request *preq, *ptmpreq; + struct aio_cancel_args cancelargs; + struct aio_return_args aioretargs; + + DARGPRINTF("%lx", (unsigned long)args->ctx); + LINK_TO_NATIVE_AIO_MODULE(); + + p = td->td_proc; + + /* + * Locking: + * + * LINUX_AIO_LOCK(p); <----------------+ + * ... | + * LINUX_AIO_CTX_LIST_LOCK(); <--+ | + * ... | | + * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ | + * ... | + * LINUX_AIO_CTX_LOCK(pctx); <---------|---+ + * LINUX_AIO_UNLOCK(p); <----------------+ | + * ... | + * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+ + */ + + LINUX_AIO_LOCK(p); + + /* Find the context in context list */ + LINUX_AIO_CTX_LIST_LOCK(); + LINUX_AIO_CTX_FOREACH(pctx) { + if (LINUX_AIO_CTX_MATCH(pctx, args->ctx, p->p_pid)) + break; + } + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Unable to find the context */ + if (pctx == NULL) { + LINUX_AIO_UNLOCK(p); + return (EINVAL); + } + + DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx, pctx); + + /* Unhook the context from context list */ + DPRINTF("Unhook context %p", pctx); + LINUX_AIO_CTX_UNHOOK(pctx); + + LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */ + LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */ + + /* Real cleanup */ + LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) { + DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)", + preq->req_porig, preq->req_pbsd); + + /* Cancel FreeBSD native clone */ + cancelargs.fd = preq->req_linux.aio_fildes; + cancelargs.aiocbp = preq->req_pbsd; + p_aio_cancel(td, &cancelargs); + DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]); + if (td->td_retval[0] == AIO_NOTCANCELED) + printf(LMSG("linux_io_destroy(): Asynchronous IO " + "request (Linux: %p, FreeBSD: %p) " + "cannot be cancelled. " + "***** Both User Space " + "and Kernel Memory Leaked! *****"), + preq->req_porig, preq->req_pbsd); + + LINUX_AIO_REQ_UNHOOK(pctx, preq); + + if (td->td_retval[0] == AIO_ALLDONE) { + aioretargs.aiocbp = preq->req_pbsd; + p_aio_return(td, &aioretargs); + DPRINTF("aio_return(%p) returned %ld", + aioretargs.aiocbp, + (long)td->td_retval[0]); + + td->td_retval[0] = AIO_ALLDONE; + } + + /* Free user space clone of the request */ + if (td->td_retval[0] != AIO_NOTCANCELED) /* + * XXX How to avoid + * memory leak here? + */ + user_free(td, preq->req_pbsd, + sizeof(*(preq->req_pbsd))); + + /* Free kernel structure of the request */ + uma_zfree(linux_aio_request_zone, preq); + + td->td_retval[0] = 0; + } + + LINUX_AIO_CTX_UNLOCK(pctx); + + sx_destroy(&(pctx->ctx_sx)); + + /* Free the "ring" */ + DPRINTF("free the \"ring\" %p", pctx->ctx_pring); + user_free(td, pctx->ctx_pring, sizeof(*pctx->ctx_pring)); + + /* Free destroyed context */ + uma_zfree(linux_aio_context_zone, pctx); + + return (nerr); +} + +/* Linux system call io_getevents(2) */ +int linux_io_getevents(struct thread *td, struct linux_io_getevents_args *args) +{ + int i, j, nerr = 0; + struct proc *p; + struct l_timespec l_timeout; + struct timespec timeout, *u_ptimeout, t1, t2; + struct linux_aio_context *pctx; + struct linux_aio_request *preq, *ptmpreq; + struct linux_io_event evt; + struct aio_return_args aioretargs; + struct aio_error_args aioerrargs; + register_t aio_ret, aio_err; + struct aiocb ** u_aiocbp; + struct aio_suspend_args aiosusargs; + + DARGPRINTF("%lx, %ld, %ld, %p, %p", + (unsigned long) args->ctx_id, + (long)args->min_nr, (long)args->nr, + args->events, args->timeout); + LINK_TO_NATIVE_AIO_MODULE(); + + if (args->nr <= 0) + return (EINVAL); + + if (args->min_nr < 0) + return (EINVAL); + + nerr = user_mem_rw_verify(args->events, + sizeof(*(args->events)) * args->nr); + if (nerr != 0) + return (nerr); + + if (args->timeout != NULL) { + nerr = copyin(args->timeout, &l_timeout, sizeof(l_timeout)); + if (nerr != 0) + return (nerr); + timeout.tv_sec = l_timeout.tv_sec; + timeout.tv_nsec = l_timeout.tv_nsec; + DUMP_TIMESPEC("User specified timeout: ", &timeout, ""); + } + + p = td->td_proc; + + /* + * Locking: + * + * LINUX_AIO_LOCK(p); <----------------+ + * ... | + * LINUX_AIO_CTX_LIST_LOCK(); <--+ | + * ... | | + * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ | + * ... | + * LINUX_AIO_CTX_LOCK(pctx); <---------|---+ + * LINUX_AIO_UNLOCK(p); <----------------+ | + * ... | + * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+ + */ + + LINUX_AIO_LOCK(p); + + /* Find the context in context list */ + LINUX_AIO_CTX_LIST_LOCK(); + LINUX_AIO_CTX_FOREACH(pctx) { + if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid)) + break; + } + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Unable to find the context */ + if (pctx == NULL) { + LINUX_AIO_UNLOCK(p); + return (EINVAL); + } + + DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx); + + LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */ + LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */ + + if (STAILQ_EMPTY(&(pctx->ctx_req))) { + td->td_retval[0] = 0; /* No queued request */ + DPRINTF("No request in queue (context: %p) at all, " + "return directly", pctx); + } else { /* Deal with the request queue */ + i = 0; /* + * This variable's value will be the return value + * of linux_io_getevents() + */ + + nerr = user_malloc(td, (void **)&u_aiocbp, + sizeof(*u_aiocbp) * pctx->ctx_nreq_max); + if (nerr != 0) + goto skip_substantial_0; + + nerr = user_malloc(td, (void **)&u_ptimeout, + sizeof(*u_ptimeout)); + if (nerr != 0) + goto skip_substantial_1; + + for (i = 0;i < args->nr;) { + + /* Collecting finished requests and waiting for queued requests */ + + LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) { + + /* Collect all finished requests */ + + if (i >= args->nr) /* Full */ + break; + + aioerrargs.aiocbp = preq->req_pbsd; + p_aio_error(td, &aioerrargs); + aio_ret = td->td_retval[0]; + td->td_retval[0] = 0; + + DPRINTF("aio_error(%p) (Linux: %p) " + "returned %ld%s", + aioerrargs.aiocbp, + preq->req_porig, + (long)aio_ret, + aio_ret == EINPROGRESS ? + "(EINPROGRESS)" : "" ); + + if (aio_ret == EINPROGRESS) + continue; + + /* Done */ + LINUX_AIO_REQ_UNHOOK(pctx, preq); + + aioretargs.aiocbp = preq->req_pbsd; + aio_err = p_aio_return(td, &aioretargs); + aio_ret = td->td_retval[0]; + td->td_retval[0] = 0; + + DPRINTF("aio_return(%p) (Linux: %p) " + "returned %ld, errno=%ld", + aioretargs.aiocbp, + preq->req_porig, + (long)aio_ret, + (long)aio_err); + + evt.data = preq->req_linux.aio_data; + evt.obj = (uint64_t)(unsigned long) + preq->req_porig; + if (aio_ret >= 0) { + /* Normal return (success) */ + evt.res = aio_ret; + } else { /* Error code (failure) */ + /* + * Translate FreeBSD error code + * to Linux's + */ + evt.res = + p->p_sysent->sv_errtbl[aio_err]; + } + DPRINTF("context %p (Linux: %p): " + "io_event.res=%lld", + preq->req_pbsd, + preq->req_porig, + (long long)evt.res); + evt.res2 = 0; + + copyout(&evt, &(args->events[i]), sizeof(evt)); + + uma_zfree(linux_aio_request_zone, preq); + + i ++; + } /* End of collecting all finished requests */ + + if (STAILQ_EMPTY(&(pctx->ctx_req))) { + /* No request remained in this context */ + DPRINTF("returning(context %p): " + "request queue is empty", + pctx); + break; + } + + if (i >= args->nr) { /* Full */ + DPRINTF("returning(context %p): user space " + "event array is full", + pctx); + break; + } + + if (i >= args->min_nr) { + /* Met the minimum requirement */ + DPRINTF("returning(context %p): " + "met the minimum requirement", + pctx); + break; + } + + if (args->timeout != NULL) { + if (! timespecisset(&timeout)) { /* Timed out */ + DPRINTF("returning(context %p): " + "no time remaining", + pctx); + break; + } + } + + if (args->timeout != NULL) { + nanouptime(&t1); /* Time before aio_suspend() */ + DUMP_TIMESPEC("T1: ", &t1, + " (uptime before calling aio_suspend())"); + } + + /* Prepare arguments for aio_suspend() */ + j = 0; + LINUX_AIO_REQ_FOREACH(pctx, preq) { + copyout(&(preq->req_pbsd), &(u_aiocbp[j]), + sizeof(preq->req_pbsd)); + j++; + } + MPASS(j == pctx->ctx_nreq_cur); + aiosusargs.aiocbp = u_aiocbp; + aiosusargs.nent = j; + + if (args->timeout != NULL) { + copyout(&timeout, u_ptimeout, sizeof(timeout)); + aiosusargs.timeout = u_ptimeout; + DUMP_TIMESPEC("Time remained: ", &timeout, ""); + } else { + aiosusargs.timeout = NULL; + } + + aio_err = p_aio_suspend(td, &aiosusargs); + DPRINTF("aio_suspend(%p, %d, %p) returned %ld", + aiosusargs.aiocbp, aiosusargs.nent, + aiosusargs.timeout, (long)aio_err); + + if (args->timeout != NULL) { + nanouptime(&t2); /* Time after aio_suspend() */ + DUMP_TIMESPEC("T2: ", &t2, + " (uptime after calling aio_suspend())"); + timespecsub(&t2, &t1); /* + * Time spent by + * aio_suspend() + */ + DUMP_TIMESPEC("T_delta: ", &t2, + " (time spent by calling aio_suspend())"); + if (timespeccmp(&t2, &timeout, >=)) { + timespecclear(&timeout); /* Timed out */ + } else { + timespecsub(&timeout, &t2); + /* Time remaining */ + } + DUMP_TIMESPEC("Time remained: ", &timeout, ""); + } + + if (aio_err == EAGAIN) { /* Timed out */ + DPRINTF("returning(context %p): " + "timed out after calling aio_suspend()", + pctx); + break; + } + } /* + * End of collecting finished requests + * and waiting for queued requests + */ + + l_timeout.tv_sec = timeout.tv_sec; + l_timeout.tv_nsec = timeout.tv_nsec; + copyout(&l_timeout, args->timeout, sizeof(l_timeout)); + /* No matter whether successfully or not */ + + nerr = user_free(td, u_ptimeout, sizeof(*u_ptimeout)); +skip_substantial_1: + nerr = user_free(td, u_aiocbp, + sizeof(*u_aiocbp) * pctx->ctx_nreq_max); +skip_substantial_0: + td->td_retval[0] = i; + /* user_free() resets td->td_retval[0] to 0 */ + DPRINTF("%d requests are unhooked from the context %p", i, pctx); + } /* End of dealing with request queue */ + + LINUX_AIO_CTX_UNLOCK(pctx); + + return (nerr); +} + +/* Linux system call io_submit(2) */ +int linux_io_submit(struct thread *td, struct linux_io_submit_args *args) +{ + int i, nerr = 0; + struct proc *p; + struct linux_aio_context *pctx; + struct linux_aio_request req, *preq; + struct linux_iocb *porig; + struct aiocb iocb, *piocb; + + DARGPRINTF("%lx, %ld, %p", (unsigned long)args->ctx_id, + (long)args->nr, args->iocbpp); + LINK_TO_NATIVE_AIO_MODULE(); + + if (args->nr <= 0) + return (EINVAL); + + p = td->td_proc; + + /* + * Locking: + * + * LINUX_AIO_LOCK(p); <----------------+ + * ... | + * LINUX_AIO_CTX_LIST_LOCK(); <--+ | + * ... | | + * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ | + * ... | + * LINUX_AIO_CTX_LOCK(pctx); <---------|---+ + * LINUX_AIO_UNLOCK(p); <----------------+ | + * ... | + * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+ + */ + + LINUX_AIO_LOCK(p); + + /* Find the context in context list */ + LINUX_AIO_CTX_LIST_LOCK(); + LINUX_AIO_CTX_FOREACH(pctx) { + if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid)) + break; + } + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Unable to find the context */ + if (pctx == NULL) { + LINUX_AIO_UNLOCK(p); + return (EINVAL); + } + + DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx); + + LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */ + LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */ + + for (i = 0; pctx->ctx_nreq_cur < pctx->ctx_nreq_max && i < args->nr; + i++) { + /* Get user space Linux control block */ + nerr = copyin(&(args->iocbpp[i]), &porig, sizeof(porig)); + if (nerr != 0) + break; + nerr = copyin(porig, &(req.req_linux), sizeof(req.req_linux)); + if (nerr != 0) + break; + + /* Create user space FreeBSD control block clone */ + nerr = iocb_reformat(&(req.req_linux), &iocb); + if (nerr != 0) + break; + nerr = user_malloc(td, (void **)&piocb, sizeof(*piocb)); + if (nerr != 0) + break; + nerr = copyout(&iocb, piocb, sizeof(iocb)); + if (nerr != 0) + break; + DUMP_FREEBSD_AIOCB(piocb, 1); + + /* Submit user space control block */ + nerr = p_aio_aqueue(td, piocb, NULL, iocb.aio_lio_opcode, 0); + if (nerr != 0) { + user_free(td, piocb, sizeof(*piocb)); + break; + } + + req.req_porig = porig; + req.req_pbsd = piocb; + + /* Hook request to the context */ + preq = uma_zalloc(linux_aio_request_zone, M_WAITOK); + memcpy(preq, &req, sizeof(req)); + DPRINTF("Linux IOCB %p (aio_lio_opcode=%u, aio_fildes=%u), " + "FreeBSD IOCB %p", + preq->req_porig, + (unsigned)preq->req_linux.aio_lio_opcode, + (unsigned)preq->req_linux.aio_fildes, + preq->req_pbsd); + LINUX_AIO_REQ_HOOK(pctx, preq); + } + + LINUX_AIO_CTX_UNLOCK(pctx); + + if (i > 0) { + td->td_retval[0] = i; + nerr = 0; + } + + if (i == 0 && nerr == 0) + nerr = EAGAIN; /* No request is successfully submitted */ + + return (nerr); +} + +/* Linux system call io_cancel(2) */ +int linux_io_cancel(struct thread *td, struct linux_io_cancel_args *args) +{ + int nerr = 0; + struct proc *p; + struct linux_iocb lcb; + struct linux_aio_context *pctx; + struct linux_aio_request *preq; + struct linux_io_event evt; + struct aio_cancel_args aiocnclargs; + + DARGPRINTF("%lx, %p, %p", (unsigned long)args->ctx_id, + args->iocb, args->result); + LINK_TO_NATIVE_AIO_MODULE(); + + nerr = copyin(args->iocb, &lcb, sizeof(lcb)); + if (nerr != 0) + return (nerr); + + nerr = user_mem_rw_verify(args->result, sizeof(*(args->result))); + if (nerr != 0) + return (nerr); + + p = td->td_proc; + + /* + * Locking: + * + * LINUX_AIO_LOCK(p); <----------------+ + * ... | + * LINUX_AIO_CTX_LIST_LOCK(); <--+ | + * ... | | + * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ | + * ... | + * LINUX_AIO_CTX_LOCK(pctx); <---------|---+ + * LINUX_AIO_UNLOCK(p); <----------------+ | + * ... | + * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+ + */ + + LINUX_AIO_LOCK(p); + + /* Find the context in context list */ + LINUX_AIO_CTX_LIST_LOCK(); + LINUX_AIO_CTX_FOREACH(pctx) { + if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid)) + break; + } + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Unable to find the context */ + if (pctx == NULL) { + LINUX_AIO_UNLOCK(p); + return (EINVAL); + } + + DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx); + + LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */ + LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */ + + LINUX_AIO_REQ_FOREACH(pctx, preq) { + if (preq->req_porig == args->iocb + && preq->req_linux.aio_key == lcb.aio_key) + break; + } + + if (preq == NULL) { + DPRINTF("Unable to find IO control block %p", args->iocb); + nerr = EINVAL; + } else { /* Found the request in context */ + DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)", + preq->req_porig, preq->req_pbsd); + + /* Cancel FreeBSD native clone */ + aiocnclargs.fd = preq->req_linux.aio_fildes; + aiocnclargs.aiocbp = preq->req_pbsd; + p_aio_cancel(td, &aiocnclargs); + DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]); + + if (td->td_retval[0] == AIO_CANCELED) { + /* Cancellation succeeded */ + LINUX_AIO_REQ_UNHOOK(pctx, preq); + + evt.data = preq->req_linux.aio_data; + evt.obj = (uint64_t)(unsigned long) preq->req_porig; + evt.res = p->p_sysent->sv_errtbl[ECANCELED]; + evt.res2 = 0; + + /* Fill in user space structure linux_io_event */ + copyout(&evt, args->result, sizeof(evt)); + + /* Free user space clone of the request */ + user_free(td, preq->req_pbsd, + sizeof(*(preq->req_pbsd))); + + /* Free kernel structure of the request */ + uma_zfree(linux_aio_request_zone, preq); + } else if (td->td_retval[0] == AIO_ALLDONE) { + nerr = EINVAL; /* + * This value of Linux 2.6.15 + * is really confusing !!! + */ + } else { /* AIO_NOTCANCELED */ + nerr = EAGAIN; + } + + td->td_retval[0] = 0; + } + + LINUX_AIO_CTX_UNLOCK(pctx); + + return (nerr); +} + +static void linux_aio_proc_rundown(void *arg, struct proc *p) +{ + struct linux_aio_context *pctx, *ptmpctx; + struct linux_aio_request *preq, *ptmpreq; + + /* + * FreeBSD module "aio" can do more essential native cleanup + * (i.e. cancelling all queued requests) itself. + */ + + LINUX_AIO_CTX_LIST_LOCK(); + + LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) { + if (pctx->ctx_pid == p->p_pid) { + LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) { + DPPRINTF("Free request %p from context %p " + "(ring: %p)", + preq, pctx, pctx->ctx_pring); + LINUX_AIO_REQ_UNHOOK(pctx, preq); + uma_zfree(linux_aio_request_zone, preq); + } + + DPPRINTF("Free context %p (ring: %p)", + pctx, pctx->ctx_pring); + + /* Unhook it from context list */ + LINUX_AIO_CTX_UNHOOK(pctx); + + /* Free it really */ + sx_destroy(&(pctx->ctx_sx)); + uma_zfree(linux_aio_context_zone, pctx); + + DPPRINTF("The remaining context list is %s", + (SLIST_EMPTY(&linux_aio_context_list) ? + "empty":"not empty")); + } + } + + LINUX_AIO_CTX_LIST_UNLOCK(); +} + +/* + * Module constructor/destructor + */ +static int +linux_aio_modload(struct module *module, int cmd, void *arg) +{ + int nerr = 0; + + switch (cmd) { + case MOD_LOAD: + linux_aio_context_zone = uma_zcreate("LINUXAIOCTX", + sizeof(struct linux_aio_context), + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + linux_aio_request_zone = uma_zcreate("LINUXAIOREQ", + sizeof(struct linux_aio_request), + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + mtx_init(&linux_aio_context_list_mtx, + "linux_aio_context_list", NULL, MTX_DEF); + SLIST_INIT(&linux_aio_context_list); + linux_aio_exit_tag = EVENTHANDLER_REGISTER(process_exit, + linux_aio_proc_rundown, + NULL, EVENTHANDLER_PRI_ANY); + break; + case MOD_UNLOAD: + LINUX_AIO_CTX_LIST_LOCK(); + if (!SLIST_EMPTY(&linux_aio_context_list)) { + nerr = EBUSY; + LINUX_AIO_CTX_LIST_UNLOCK(); + break; + } + EVENTHANDLER_DEREGISTER(process_exit, linux_aio_exit_tag); + LINUX_AIO_CTX_LIST_UNLOCK(); + mtx_destroy(&linux_aio_context_list_mtx); + uma_zdestroy(linux_aio_request_zone); + uma_zdestroy(linux_aio_context_zone); + if (native_aio_module_handle != NULL) { + /* + * linker_release_module() cannot be used here. + * It tries to hold "kld_sx", conflicting against + * module_unload(). + */ + linker_file_unload(native_aio_module_handle, + LINKER_UNLOAD_NORMAL); + } + break; + case MOD_SHUTDOWN: + break; + default: + nerr = EINVAL; + break; + } + return (nerr); +} + +static moduledata_t linux_aio_mod = { + "linuxaio", + &linux_aio_modload, + NULL +}; + +DECLARE_MODULE(linuxaio, linux_aio_mod, SI_SUB_VFS, SI_ORDER_ANY); diff --git a/sys/compat/linux/linux_aio.h b/sys/compat/linux/linux_aio.h new file mode 100644 index 0000000..96028a8 --- /dev/null +++ b/sys/compat/linux/linux_aio.h @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2006 Li, Xiao <intron@intron.ac>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Linux Kernel Implementation of Asynchronous I/O + */ + +#ifndef _LINUX_AIO_H_ +#define _LINUX_AIO_H_ + +typedef unsigned long linux_aio_context_t; + +enum { + LINUX_IOCB_CMD_PREAD = 0, + LINUX_IOCB_CMD_PWRITE = 1, + LINUX_IOCB_CMD_FSYNC = 2, + LINUX_IOCB_CMD_FDSYNC = 3, +#if 0 + LINUX_IOCB_CMD_PREADX = 4, + LINUX_IOCB_CMD_POLL = 5, +#endif + LINUX_IOCB_CMD_NOOP = 6, +}; + +struct linux_io_event { + uint64_t data; + uint64_t obj; + int64_t res; + int64_t res2; +}; + +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define LINUX_AIO_PADDED(x,y) x,y +#elif _BYTE_ORDER == _BIG_ENDIAN +#define LINUX_AIO_PADDED(x,y) y,x +#else +#error Unidentified byte order !!! +#endif + +struct linux_iocb { + uint64_t aio_data; + uint32_t LINUX_AIO_PADDED(aio_key, aio_reserved1); + + uint16_t aio_lio_opcode; + int16_t aio_reqprio; + uint32_t aio_fildes; + + uint64_t aio_buf; + uint64_t aio_nbytes; + int64_t aio_offset; + + uint64_t aio_reserved2; /* TODO: use this for a (struct sigevent *) */ + uint64_t aio_reserved3; + +}; + +/* User space context information structure */ +struct linux_aio_ring { + l_uint ring_id; + l_uint ring_nr; + l_uint ring_head; + l_uint ring_tail; +#define LINUX_AIO_RING_MAGIC 0xa10a10a1 + l_uint ring_magic; +#define LINUX_AIO_RING_COMPAT_FEATURES 1 + l_uint ring_compat_features; +#define LINUX_AIO_RING_INCOMPAT_FEATURES 0 + l_uint ring_incompat_features; + l_uint ring_header_length; /* Size of this structure */ + + struct linux_io_event ring_io_events[0]; +}; + +#endif /* !_LINUX_AIO_H_ */ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 15c71b4..47d416f 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -240,6 +240,7 @@ amd64/linux32/linux32_locore.s optional compat_linux32 \ amd64/linux32/linux32_machdep.c optional compat_linux32 amd64/linux32/linux32_sysent.c optional compat_linux32 amd64/linux32/linux32_sysvec.c optional compat_linux32 +compat/linux/linux_aio.c optional compat_linux32 compat/linux/linux_emul.c optional compat_linux32 compat/linux/linux_file.c optional compat_linux32 compat/linux/linux_futex.c optional compat_linux32 diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index 11419d0..04a5080 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -85,6 +85,7 @@ rr232x_lib.o optional rr232x \ # compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_aio.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_futex.c optional compat_linux diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h index 9ad9353..1ca7381 100644 --- a/sys/i386/linux/linux.h +++ b/sys/i386/linux/linux.h @@ -803,4 +803,6 @@ typedef int l_mqd_t; #define THREADING_FLAGS (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND) +#include <compat/linux/linux_aio.h> + #endif /* !_I386_LINUX_LINUX_H_ */ diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master index 61ba1d2..6b00584 100644 --- a/sys/i386/linux/syscalls.master +++ b/sys/i386/linux/syscalls.master @@ -409,11 +409,11 @@ 242 AUE_NULL UNIMPL linux_sched_getaffinity 243 AUE_NULL STD { int linux_set_thread_area(struct l_user_desc *desc); } 244 AUE_NULL STD { int linux_get_thread_area(struct l_user_desc *desc); } -245 AUE_NULL UNIMPL linux_io_setup -246 AUE_NULL UNIMPL linux_io_destroy -247 AUE_NULL UNIMPL linux_io_getevents -248 AUE_NULL UNIMPL linux_io_submit -249 AUE_NULL UNIMPL linux_io_cancel +245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); } +246 AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); } +247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); } +248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); } +249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); } 250 AUE_NULL STD { int linux_fadvise64(void); } 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 49f18d9..a9b6b6e 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -313,12 +313,12 @@ static struct mtx aio_sock_mtx; static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; -static void aio_init_aioinfo(struct proc *p); +void aio_init_aioinfo(struct proc *p); static void aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(int *); -static int aio_aqueue(struct thread *td, struct aiocb *job, +int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lio, int type, int osigev); static void aio_physwakeup(struct buf *bp); static void aio_proc_rundown(void *arg, struct proc *p); @@ -498,7 +498,7 @@ aio_unload(void) * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ -static void +void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; @@ -1322,7 +1322,7 @@ aio_swake_cb(struct socket *so, struct sockbuf *sb) * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ -static int +int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, int type, int oldsigev) { diff --git a/sys/modules/aio/Makefile b/sys/modules/aio/Makefile index 4136135..c73f651 100644 --- a/sys/modules/aio/Makefile +++ b/sys/modules/aio/Makefile @@ -5,4 +5,6 @@ KMOD= aio SRCS= vfs_aio.c opt_vfs_aio.h vnode_if.h +EXPORT_SYMS= aio_init_aioinfo aio_aqueue + .include <bsd.kmod.mk> diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile index db8ea9c..ca6b467 100644 --- a/sys/modules/linux/Makefile +++ b/sys/modules/linux/Makefile @@ -8,11 +8,13 @@ CFLAGS+=-DCOMPAT_IA32 -DCOMPAT_LINUX32 .PATH: ${.CURDIR}/../../compat/linux ${.CURDIR}/../../${MACHINE_ARCH}/linux${SFX} KMOD= linux -SRCS= linux${SFX}_dummy.c linux_emul.c linux_file.c linux_futex.c linux_getcwd.c linux_ioctl.c \ - linux_ipc.c linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \ - linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c linux${SFX}_sysvec.c \ - linux_uid16.c linux_util.c linux_time.c opt_inet6.h opt_mac.h opt_compat.h opt_posix.h \ - vnode_if.h device_if.h bus_if.h +SRCS= linux_aio.c linux${SFX}_dummy.c linux_emul.c linux_file.c \ + linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ + linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \ + linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \ + linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \ + opt_inet6.h opt_mac.h opt_compat.h opt_posix.h vnode_if.h \ + device_if.h bus_if.h OBJS= linux${SFX}_locore.o .if ${MACHINE_ARCH} == "i386" |