diff options
Diffstat (limited to 'sys/compat/cloudabi')
-rw-r--r-- | sys/compat/cloudabi/cloudabi_clock.c | 34 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_fd.c | 383 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_file.c | 490 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_futex.c | 1122 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_proc.c | 14 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_sock.c | 118 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_thread.c | 18 | ||||
-rw-r--r-- | sys/compat/cloudabi/cloudabi_util.h | 38 |
8 files changed, 2175 insertions, 42 deletions
diff --git a/sys/compat/cloudabi/cloudabi_clock.c b/sys/compat/cloudabi/cloudabi_clock.c index 8462f23..ed32cf6 100644 --- a/sys/compat/cloudabi/cloudabi_clock.c +++ b/sys/compat/cloudabi/cloudabi_clock.c @@ -80,31 +80,27 @@ cloudabi_convert_timespec(const struct timespec *in, cloudabi_timestamp_t *out) return (0); } +/* Fetches the time value of a clock. */ int -cloudabi_sys_clock_res_get(struct thread *td, - struct cloudabi_sys_clock_res_get_args *uap) +cloudabi_clock_time_get(struct thread *td, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t *ret) { struct timespec ts; - cloudabi_timestamp_t cts; int error; clockid_t clockid; - error = cloudabi_convert_clockid(uap->clock_id, &clockid); - if (error != 0) - return (error); - error = kern_clock_getres(td, clockid, &ts); + error = cloudabi_convert_clockid(clock_id, &clockid); if (error != 0) return (error); - error = cloudabi_convert_timespec(&ts, &cts); + error = kern_clock_gettime(td, clockid, &ts); if (error != 0) return (error); - td->td_retval[0] = cts; - return (0); + return (cloudabi_convert_timespec(&ts, ret)); } int -cloudabi_sys_clock_time_get(struct thread *td, - struct cloudabi_sys_clock_time_get_args *uap) +cloudabi_sys_clock_res_get(struct thread *td, + struct cloudabi_sys_clock_res_get_args *uap) { struct timespec ts; cloudabi_timestamp_t cts; @@ -114,7 +110,7 @@ cloudabi_sys_clock_time_get(struct thread *td, error = cloudabi_convert_clockid(uap->clock_id, &clockid); if (error != 0) return (error); - error = kern_clock_gettime(td, clockid, &ts); + error = kern_clock_getres(td, clockid, &ts); if (error != 0) return (error); error = cloudabi_convert_timespec(&ts, &cts); @@ -123,3 +119,15 @@ cloudabi_sys_clock_time_get(struct thread *td, td->td_retval[0] = cts; return (0); } + +int +cloudabi_sys_clock_time_get(struct thread *td, + struct cloudabi_sys_clock_time_get_args *uap) +{ + cloudabi_timestamp_t ts; + int error; + + error = cloudabi_clock_time_get(td, uap->clock_id, &ts); + td->td_retval[0] = 
ts; + return (error); +} diff --git a/sys/compat/cloudabi/cloudabi_fd.c b/sys/compat/cloudabi/cloudabi_fd.c index 8b72c19..17177d2 100644 --- a/sys/compat/cloudabi/cloudabi_fd.c +++ b/sys/compat/cloudabi/cloudabi_fd.c @@ -27,13 +27,65 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/capsicum.h> #include <sys/filedesc.h> #include <sys/proc.h> +#include <sys/mman.h> +#include <sys/socketvar.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> +#include <sys/systm.h> #include <sys/unistd.h> +#include <sys/vnode.h> #include <compat/cloudabi/cloudabi_proto.h> +#include <compat/cloudabi/cloudabi_syscalldefs.h> +#include <compat/cloudabi/cloudabi_util.h> + +/* Translation between CloudABI and Capsicum rights. */ +#define RIGHTS_MAPPINGS \ + MAPPING(CLOUDABI_RIGHT_FD_DATASYNC, CAP_FSYNC) \ + MAPPING(CLOUDABI_RIGHT_FD_READ, CAP_READ) \ + MAPPING(CLOUDABI_RIGHT_FD_SEEK, CAP_SEEK) \ + MAPPING(CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS, CAP_FCNTL) \ + MAPPING(CLOUDABI_RIGHT_FD_SYNC, CAP_FSYNC) \ + MAPPING(CLOUDABI_RIGHT_FD_TELL, CAP_SEEK_TELL) \ + MAPPING(CLOUDABI_RIGHT_FD_WRITE, CAP_WRITE) \ + MAPPING(CLOUDABI_RIGHT_FILE_ADVISE) \ + MAPPING(CLOUDABI_RIGHT_FILE_ALLOCATE, CAP_WRITE) \ + MAPPING(CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY, CAP_MKDIRAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_CREATE_FILE, CAP_CREATE) \ + MAPPING(CLOUDABI_RIGHT_FILE_CREATE_FIFO, CAP_MKFIFOAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_LINK_SOURCE, CAP_LINKAT_SOURCE) \ + MAPPING(CLOUDABI_RIGHT_FILE_LINK_TARGET, CAP_LINKAT_TARGET) \ + MAPPING(CLOUDABI_RIGHT_FILE_OPEN, CAP_LOOKUP) \ + MAPPING(CLOUDABI_RIGHT_FILE_READDIR, CAP_READ) \ + MAPPING(CLOUDABI_RIGHT_FILE_READLINK, CAP_LOOKUP) \ + MAPPING(CLOUDABI_RIGHT_FILE_RENAME_SOURCE, CAP_RENAMEAT_SOURCE) \ + MAPPING(CLOUDABI_RIGHT_FILE_RENAME_TARGET, CAP_RENAMEAT_TARGET) \ + MAPPING(CLOUDABI_RIGHT_FILE_STAT_FGET, CAP_FSTAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE, CAP_FTRUNCATE) \ + MAPPING(CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES, CAP_FUTIMES) \ + 
MAPPING(CLOUDABI_RIGHT_FILE_STAT_GET, CAP_FSTATAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES, CAP_FUTIMESAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_SYMLINK, CAP_SYMLINKAT) \ + MAPPING(CLOUDABI_RIGHT_FILE_UNLINK, CAP_UNLINKAT) \ + MAPPING(CLOUDABI_RIGHT_MEM_MAP, CAP_MMAP) \ + MAPPING(CLOUDABI_RIGHT_MEM_MAP_EXEC, CAP_MMAP_X) \ + MAPPING(CLOUDABI_RIGHT_POLL_FD_READWRITE, CAP_EVENT) \ + MAPPING(CLOUDABI_RIGHT_POLL_MODIFY, CAP_KQUEUE_CHANGE) \ + MAPPING(CLOUDABI_RIGHT_POLL_PROC_TERMINATE, CAP_EVENT) \ + MAPPING(CLOUDABI_RIGHT_POLL_WAIT, CAP_KQUEUE_EVENT) \ + MAPPING(CLOUDABI_RIGHT_PROC_EXEC, CAP_FEXECVE) \ + MAPPING(CLOUDABI_RIGHT_SOCK_ACCEPT, CAP_ACCEPT) \ + MAPPING(CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY, CAP_BINDAT) \ + MAPPING(CLOUDABI_RIGHT_SOCK_BIND_SOCKET, CAP_BIND) \ + MAPPING(CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY, CAP_CONNECTAT) \ + MAPPING(CLOUDABI_RIGHT_SOCK_CONNECT_SOCKET, CAP_CONNECT) \ + MAPPING(CLOUDABI_RIGHT_SOCK_LISTEN, CAP_LISTEN) \ + MAPPING(CLOUDABI_RIGHT_SOCK_SHUTDOWN, CAP_SHUTDOWN) \ + MAPPING(CLOUDABI_RIGHT_SOCK_STAT_GET, CAP_GETPEERNAME, \ + CAP_GETSOCKNAME, CAP_GETSOCKOPT) int cloudabi_sys_fd_close(struct thread *td, struct cloudabi_sys_fd_close_args *uap) @@ -46,11 +98,19 @@ int cloudabi_sys_fd_create1(struct thread *td, struct cloudabi_sys_fd_create1_args *uap) { + struct filecaps fcaps = {}; struct socket_args socket_args = { .domain = AF_UNIX, }; switch (uap->type) { + case CLOUDABI_FILETYPE_POLL: + cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_KQUEUE); + return (kern_kqueue(td, 0, &fcaps)); + case CLOUDABI_FILETYPE_SHARED_MEMORY: + cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_FTRUNCATE, + CAP_MMAP_RWX); + return (kern_shm_open(td, SHM_ANON, O_RDWR, 0, &fcaps)); case CLOUDABI_FILETYPE_SOCKET_DGRAM: socket_args.type = SOCK_DGRAM; return (sys_socket(td, &socket_args)); @@ -69,10 +129,24 @@ int cloudabi_sys_fd_create2(struct thread *td, struct cloudabi_sys_fd_create2_args *uap) { + struct filecaps fcaps1 = {}, fcaps2 = {}; int fds[2]; int 
error; switch (uap->type) { + case CLOUDABI_FILETYPE_FIFO: + /* + * CloudABI pipes are unidirectional. Restrict rights on + * the pipe to simulate this. + */ + cap_rights_init(&fcaps1.fc_rights, CAP_EVENT, CAP_FCNTL, + CAP_FSTAT, CAP_READ); + fcaps1.fc_fcntls = CAP_FCNTL_SETFL; + cap_rights_init(&fcaps2.fc_rights, CAP_EVENT, CAP_FCNTL, + CAP_FSTAT, CAP_WRITE); + fcaps2.fc_fcntls = CAP_FCNTL_SETFL; + error = kern_pipe(td, fds, 0, &fcaps1, &fcaps2); + break; case CLOUDABI_FILETYPE_SOCKET_DGRAM: error = kern_socketpair(td, AF_UNIX, SOCK_DGRAM, 0, fds); break; @@ -160,22 +234,323 @@ cloudabi_sys_fd_seek(struct thread *td, struct cloudabi_sys_fd_seek_args *uap) return (sys_lseek(td, &lseek_args)); } +/* Converts a file descriptor to a CloudABI file descriptor type. */ +cloudabi_filetype_t +cloudabi_convert_filetype(const struct file *fp) +{ + struct socket *so; + struct vnode *vp; + + switch (fp->f_type) { + case DTYPE_FIFO: + return (CLOUDABI_FILETYPE_FIFO); + case DTYPE_KQUEUE: + return (CLOUDABI_FILETYPE_POLL); + case DTYPE_PIPE: + return (CLOUDABI_FILETYPE_FIFO); + case DTYPE_PROCDESC: + return (CLOUDABI_FILETYPE_PROCESS); + case DTYPE_SHM: + return (CLOUDABI_FILETYPE_SHARED_MEMORY); + case DTYPE_SOCKET: + so = fp->f_data; + switch (so->so_type) { + case SOCK_DGRAM: + return (CLOUDABI_FILETYPE_SOCKET_DGRAM); + case SOCK_SEQPACKET: + return (CLOUDABI_FILETYPE_SOCKET_SEQPACKET); + case SOCK_STREAM: + return (CLOUDABI_FILETYPE_SOCKET_STREAM); + default: + return (CLOUDABI_FILETYPE_UNKNOWN); + } + case DTYPE_VNODE: + vp = fp->f_vnode; + switch (vp->v_type) { + case VBLK: + return (CLOUDABI_FILETYPE_BLOCK_DEVICE); + case VCHR: + return (CLOUDABI_FILETYPE_CHARACTER_DEVICE); + case VDIR: + return (CLOUDABI_FILETYPE_DIRECTORY); + case VFIFO: + return (CLOUDABI_FILETYPE_FIFO); + case VLNK: + return (CLOUDABI_FILETYPE_SYMBOLIC_LINK); + case VREG: + return (CLOUDABI_FILETYPE_REGULAR_FILE); + case VSOCK: + return (CLOUDABI_FILETYPE_SOCKET_STREAM); + default: + return 
(CLOUDABI_FILETYPE_UNKNOWN); + } + default: + return (CLOUDABI_FILETYPE_UNKNOWN); + } +} + +/* Removes rights that conflict with the file descriptor type. */ +void +cloudabi_remove_conflicting_rights(cloudabi_filetype_t filetype, + cloudabi_rights_t *base, cloudabi_rights_t *inheriting) +{ + + /* + * CloudABI has a small number of additional rights bits to + * disambiguate between multiple purposes. Remove the bits that + * don't apply to the type of the file descriptor. + * + * As file descriptor access modes (O_ACCMODE) have been fully + * replaced by rights bits, CloudABI distinguishes between + * rights that apply to the file descriptor itself (base) versus + * rights of new file descriptors derived from them + * (inheriting). The code below approximates the pair by + * decomposing depending on the file descriptor type. + * + * We need to be somewhat accurate about which actions can + * actually be performed on the file descriptor, as functions + * like fcntl(fd, F_GETFL) are emulated on top of this. 
+ */ + switch (filetype) { + case CLOUDABI_FILETYPE_DIRECTORY: + *base &= CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS | + CLOUDABI_RIGHT_FD_SYNC | CLOUDABI_RIGHT_FILE_ADVISE | + CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY | + CLOUDABI_RIGHT_FILE_CREATE_FILE | + CLOUDABI_RIGHT_FILE_CREATE_FIFO | + CLOUDABI_RIGHT_FILE_LINK_SOURCE | + CLOUDABI_RIGHT_FILE_LINK_TARGET | + CLOUDABI_RIGHT_FILE_OPEN | + CLOUDABI_RIGHT_FILE_READDIR | + CLOUDABI_RIGHT_FILE_READLINK | + CLOUDABI_RIGHT_FILE_RENAME_SOURCE | + CLOUDABI_RIGHT_FILE_RENAME_TARGET | + CLOUDABI_RIGHT_FILE_STAT_FGET | + CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES | + CLOUDABI_RIGHT_FILE_STAT_GET | + CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES | + CLOUDABI_RIGHT_FILE_SYMLINK | + CLOUDABI_RIGHT_FILE_UNLINK | + CLOUDABI_RIGHT_POLL_FD_READWRITE | + CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY | + CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY; + *inheriting &= CLOUDABI_RIGHT_FD_DATASYNC | + CLOUDABI_RIGHT_FD_READ | + CLOUDABI_RIGHT_FD_SEEK | + CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS | + CLOUDABI_RIGHT_FD_SYNC | + CLOUDABI_RIGHT_FD_TELL | + CLOUDABI_RIGHT_FD_WRITE | + CLOUDABI_RIGHT_FILE_ADVISE | + CLOUDABI_RIGHT_FILE_ALLOCATE | + CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY | + CLOUDABI_RIGHT_FILE_CREATE_FILE | + CLOUDABI_RIGHT_FILE_CREATE_FIFO | + CLOUDABI_RIGHT_FILE_LINK_SOURCE | + CLOUDABI_RIGHT_FILE_LINK_TARGET | + CLOUDABI_RIGHT_FILE_OPEN | + CLOUDABI_RIGHT_FILE_READDIR | + CLOUDABI_RIGHT_FILE_READLINK | + CLOUDABI_RIGHT_FILE_RENAME_SOURCE | + CLOUDABI_RIGHT_FILE_RENAME_TARGET | + CLOUDABI_RIGHT_FILE_STAT_FGET | + CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE | + CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES | + CLOUDABI_RIGHT_FILE_STAT_GET | + CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES | + CLOUDABI_RIGHT_FILE_SYMLINK | + CLOUDABI_RIGHT_FILE_UNLINK | + CLOUDABI_RIGHT_MEM_MAP | + CLOUDABI_RIGHT_MEM_MAP_EXEC | + CLOUDABI_RIGHT_POLL_FD_READWRITE | + CLOUDABI_RIGHT_PROC_EXEC | + CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY | + CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY; + break; + case CLOUDABI_FILETYPE_FIFO: + *base &= 
CLOUDABI_RIGHT_FD_READ | + CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS | + CLOUDABI_RIGHT_FD_WRITE | + CLOUDABI_RIGHT_FILE_STAT_FGET | + CLOUDABI_RIGHT_POLL_FD_READWRITE; + *inheriting = 0; + break; + case CLOUDABI_FILETYPE_POLL: + *base &= ~CLOUDABI_RIGHT_FILE_ADVISE; + *inheriting = 0; + break; + case CLOUDABI_FILETYPE_PROCESS: + *base &= ~(CLOUDABI_RIGHT_FILE_ADVISE | + CLOUDABI_RIGHT_POLL_FD_READWRITE); + *inheriting = 0; + break; + case CLOUDABI_FILETYPE_REGULAR_FILE: + *base &= CLOUDABI_RIGHT_FD_DATASYNC | + CLOUDABI_RIGHT_FD_READ | + CLOUDABI_RIGHT_FD_SEEK | + CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS | + CLOUDABI_RIGHT_FD_SYNC | + CLOUDABI_RIGHT_FD_TELL | + CLOUDABI_RIGHT_FD_WRITE | + CLOUDABI_RIGHT_FILE_ADVISE | + CLOUDABI_RIGHT_FILE_ALLOCATE | + CLOUDABI_RIGHT_FILE_STAT_FGET | + CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE | + CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES | + CLOUDABI_RIGHT_MEM_MAP | + CLOUDABI_RIGHT_MEM_MAP_EXEC | + CLOUDABI_RIGHT_POLL_FD_READWRITE | + CLOUDABI_RIGHT_PROC_EXEC; + *inheriting = 0; + break; + case CLOUDABI_FILETYPE_SHARED_MEMORY: + *base &= ~(CLOUDABI_RIGHT_FD_SEEK | + CLOUDABI_RIGHT_FD_TELL | + CLOUDABI_RIGHT_FILE_ADVISE | + CLOUDABI_RIGHT_FILE_ALLOCATE | + CLOUDABI_RIGHT_FILE_READDIR); + *inheriting = 0; + break; + case CLOUDABI_FILETYPE_SOCKET_DGRAM: + case CLOUDABI_FILETYPE_SOCKET_SEQPACKET: + case CLOUDABI_FILETYPE_SOCKET_STREAM: + *base &= CLOUDABI_RIGHT_FD_READ | + CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS | + CLOUDABI_RIGHT_FD_WRITE | + CLOUDABI_RIGHT_FILE_STAT_FGET | + CLOUDABI_RIGHT_POLL_FD_READWRITE | + CLOUDABI_RIGHT_SOCK_ACCEPT | + CLOUDABI_RIGHT_SOCK_BIND_SOCKET | + CLOUDABI_RIGHT_SOCK_CONNECT_SOCKET | + CLOUDABI_RIGHT_SOCK_LISTEN | + CLOUDABI_RIGHT_SOCK_SHUTDOWN | + CLOUDABI_RIGHT_SOCK_STAT_GET; + break; + default: + *inheriting = 0; + break; + } +} + +/* Converts FreeBSD's Capsicum rights to CloudABI's set of rights. 
*/ +static void +convert_capabilities(const cap_rights_t *capabilities, + cloudabi_filetype_t filetype, cloudabi_rights_t *base, + cloudabi_rights_t *inheriting) +{ + cloudabi_rights_t rights; + + /* Convert FreeBSD bits to CloudABI bits. */ + rights = 0; +#define MAPPING(cloudabi, ...) do { \ + if (cap_rights_is_set(capabilities, ##__VA_ARGS__)) \ + rights |= (cloudabi); \ +} while (0); + RIGHTS_MAPPINGS +#undef MAPPING + + *base = rights; + *inheriting = rights; + cloudabi_remove_conflicting_rights(filetype, base, inheriting); +} + int cloudabi_sys_fd_stat_get(struct thread *td, struct cloudabi_sys_fd_stat_get_args *uap) { + cloudabi_fdstat_t fsb = {}; + struct filedesc *fdp; + struct file *fp; + seq_t seq; + cap_rights_t rights; + int error, oflags; + bool modified; + + /* Obtain file descriptor properties. */ + fdp = td->td_proc->p_fd; + do { + error = fget_unlocked(fdp, uap->fd, cap_rights_init(&rights), + &fp, &seq); + if (error != 0) + return (error); + if (fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EBADF); + } - /* Not implemented. */ - return (ENOSYS); + rights = *cap_rights(fdp, uap->fd); + oflags = OFLAGS(fp->f_flag); + fsb.fs_filetype = cloudabi_convert_filetype(fp); + + modified = fd_modified(fdp, uap->fd, seq); + fdrop(fp, td); + } while (modified); + + /* Convert file descriptor flags. */ + if (oflags & O_APPEND) + fsb.fs_flags |= CLOUDABI_FDFLAG_APPEND; + if (oflags & O_NONBLOCK) + fsb.fs_flags |= CLOUDABI_FDFLAG_NONBLOCK; + if (oflags & O_SYNC) + fsb.fs_flags |= CLOUDABI_FDFLAG_SYNC; + + /* Convert capabilities to CloudABI rights. */ + convert_capabilities(&rights, fsb.fs_filetype, + &fsb.fs_rights_base, &fsb.fs_rights_inheriting); + return (copyout(&fsb, (void *)uap->buf, sizeof(fsb))); +} + +/* Converts CloudABI rights to a set of Capsicum capabilities. */ +int +cloudabi_convert_rights(cloudabi_rights_t in, cap_rights_t *out) +{ + + cap_rights_init(out); +#define MAPPING(cloudabi, ...) 
do { \ + if (in & (cloudabi)) { \ + cap_rights_set(out, ##__VA_ARGS__); \ + in &= ~(cloudabi); \ + } \ +} while (0); + RIGHTS_MAPPINGS +#undef MAPPING + if (in != 0) + return (ENOTCAPABLE); + return (0); } int cloudabi_sys_fd_stat_put(struct thread *td, struct cloudabi_sys_fd_stat_put_args *uap) { + cloudabi_fdstat_t fsb; + cap_rights_t rights; + int error, oflags; - /* Not implemented. */ - return (ENOSYS); + error = copyin(uap->buf, &fsb, sizeof(fsb)); + if (error != 0) + return (error); + + if (uap->flags == CLOUDABI_FDSTAT_FLAGS) { + /* Convert flags. */ + oflags = 0; + if (fsb.fs_flags & CLOUDABI_FDFLAG_APPEND) + oflags |= O_APPEND; + if (fsb.fs_flags & CLOUDABI_FDFLAG_NONBLOCK) + oflags |= O_NONBLOCK; + if (fsb.fs_flags & (CLOUDABI_FDFLAG_SYNC | + CLOUDABI_FDFLAG_DSYNC | CLOUDABI_FDFLAG_RSYNC)) + oflags |= O_SYNC; + return (kern_fcntl(td, uap->fd, F_SETFL, oflags)); + } else if (uap->flags == CLOUDABI_FDSTAT_RIGHTS) { + /* Convert rights. */ + error = cloudabi_convert_rights( + fsb.fs_rights_base | fsb.fs_rights_inheriting, &rights); + if (error != 0) + return (error); + return (kern_cap_rights_limit(td, uap->fd, &rights)); + } + return (EINVAL); } int diff --git a/sys/compat/cloudabi/cloudabi_file.c b/sys/compat/cloudabi/cloudabi_file.c index b508517..cdf0585 100644 --- a/sys/compat/cloudabi/cloudabi_file.c +++ b/sys/compat/cloudabi/cloudabi_file.c @@ -27,14 +27,23 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/capsicum.h> +#include <sys/dirent.h> #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/namei.h> +#include <sys/proc.h> +#include <sys/stat.h> #include <sys/syscallsubr.h> +#include <sys/uio.h> +#include <sys/vnode.h> #include <compat/cloudabi/cloudabi_proto.h> #include <compat/cloudabi/cloudabi_syscalldefs.h> +#include <compat/cloudabi/cloudabi_util.h> + +#include <security/mac/mac_framework.h> static MALLOC_DEFINE(M_CLOUDABI_PATH, "cloudabipath", "CloudABI pathnames"); @@ -133,9 +142,31 @@ int 
cloudabi_sys_file_create(struct thread *td, struct cloudabi_sys_file_create_args *uap) { + char *path; + int error; + + error = copyin_path(uap->path, uap->pathlen, &path); + if (error != 0) + return (error); - /* Not implemented. */ - return (ENOSYS); + /* + * CloudABI processes cannot interact with UNIX credentials and + * permissions. Depend on the umask that is set prior to + * execution to restrict the file permissions. + */ + switch (uap->type) { + case CLOUDABI_FILETYPE_DIRECTORY: + error = kern_mkdirat(td, uap->fd, path, UIO_SYSSPACE, 0777); + break; + case CLOUDABI_FILETYPE_FIFO: + error = kern_mkfifoat(td, uap->fd, path, UIO_SYSSPACE, 0666); + break; + default: + error = EINVAL; + break; + } + cloudabi_freestr(path); + return (error); } int @@ -166,18 +197,304 @@ int cloudabi_sys_file_open(struct thread *td, struct cloudabi_sys_file_open_args *uap) { + cloudabi_fdstat_t fds; + cap_rights_t rights; + struct filecaps fcaps = {}; + struct nameidata nd; + struct file *fp; + struct vnode *vp; + char *path; + int error, fd, fflags; + bool read, write; + + error = copyin(uap->fds, &fds, sizeof(fds)); + if (error != 0) + return (error); + + /* All the requested rights should be set on the descriptor. */ + error = cloudabi_convert_rights( + fds.fs_rights_base | fds.fs_rights_inheriting, &rights); + if (error != 0) + return (error); + cap_rights_set(&rights, CAP_LOOKUP); + + /* Convert rights to corresponding access mode. */ + read = (fds.fs_rights_base & (CLOUDABI_RIGHT_FD_READ | + CLOUDABI_RIGHT_FILE_READDIR | CLOUDABI_RIGHT_MEM_MAP_EXEC)) != 0; + write = (fds.fs_rights_base & (CLOUDABI_RIGHT_FD_DATASYNC | + CLOUDABI_RIGHT_FD_WRITE | CLOUDABI_RIGHT_FILE_ALLOCATE | + CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE)) != 0; + fflags = write ? read ? FREAD | FWRITE : FWRITE : FREAD; + + /* Convert open flags. 
*/ + if ((uap->oflags & CLOUDABI_O_CREAT) != 0) { + fflags |= O_CREAT; + cap_rights_set(&rights, CAP_CREATE); + } + if ((uap->oflags & CLOUDABI_O_DIRECTORY) != 0) + fflags |= O_DIRECTORY; + if ((uap->oflags & CLOUDABI_O_EXCL) != 0) + fflags |= O_EXCL; + if ((uap->oflags & CLOUDABI_O_TRUNC) != 0) { + fflags |= O_TRUNC; + cap_rights_set(&rights, CAP_FTRUNCATE); + } + if ((fds.fs_flags & CLOUDABI_FDFLAG_APPEND) != 0) + fflags |= O_APPEND; + if ((fds.fs_flags & CLOUDABI_FDFLAG_NONBLOCK) != 0) + fflags |= O_NONBLOCK; + if ((fds.fs_flags & (CLOUDABI_FDFLAG_SYNC | CLOUDABI_FDFLAG_DSYNC | + CLOUDABI_FDFLAG_RSYNC)) != 0) { + fflags |= O_SYNC; + cap_rights_set(&rights, CAP_FSYNC); + } + if ((uap->fd & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) == 0) + fflags |= O_NOFOLLOW; + if (write && (fflags & (O_APPEND | O_TRUNC)) == 0) + cap_rights_set(&rights, CAP_SEEK); + + /* Allocate new file descriptor. */ + error = falloc_noinstall(td, &fp); + if (error != 0) + return (error); + fp->f_flag = fflags & FMASK; + + /* Open path. */ + error = copyin_path(uap->path, uap->pathlen, &path); + if (error != 0) { + fdrop(fp, td); + return (error); + } + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, uap->fd, + &rights, td); + error = vn_open(&nd, &fflags, 0777 & ~td->td_proc->p_fd->fd_cmask, fp); + cloudabi_freestr(path); + if (error != 0) { + /* Custom operations provided. */ + if (error == ENXIO && fp->f_ops != &badfileops) + goto success; + + /* + * POSIX compliance: return ELOOP in case openat() is + * called on a symbolic link and O_NOFOLLOW is set. + */ + if (error == EMLINK) + error = ELOOP; + fdrop(fp, td); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + filecaps_free(&nd.ni_filecaps); + fp->f_vnode = vp = nd.ni_vp; + + /* Install vnode operations if no custom operations are provided. */ + if (fp->f_ops == &badfileops) { + fp->f_seqcount = 1; + finit(fp, (fflags & FMASK) | (fp->f_flag & FHASLOCK), + DTYPE_VNODE, vp, &vnops); + } + VOP_UNLOCK(vp, 0); + + /* Truncate file. 
*/ + if (fflags & O_TRUNC) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) { + fdrop(fp, td); + return (error); + } + } + +success: + /* Determine which Capsicum rights to set on the file descriptor. */ + cloudabi_remove_conflicting_rights(cloudabi_convert_filetype(fp), + &fds.fs_rights_base, &fds.fs_rights_inheriting); + cloudabi_convert_rights(fds.fs_rights_base | fds.fs_rights_inheriting, + &fcaps.fc_rights); + if (cap_rights_is_set(&fcaps.fc_rights)) + fcaps.fc_fcntls = CAP_FCNTL_SETFL; + + error = finstall(td, fp, &fd, fflags, &fcaps); + fdrop(fp, td); + if (error != 0) + return (error); + td->td_retval[0] = fd; + return (0); +} + +/* Converts a FreeBSD directory entry structure and writes it to userspace. */ +static int +write_dirent(struct dirent *bde, cloudabi_dircookie_t cookie, struct uio *uio) +{ + cloudabi_dirent_t cde = { + .d_next = cookie, + .d_ino = bde->d_fileno, + .d_namlen = bde->d_namlen, + }; + size_t len; + int error; + + /* Convert file type. */ + switch (bde->d_type) { + case DT_BLK: + cde.d_type = CLOUDABI_FILETYPE_BLOCK_DEVICE; + break; + case DT_CHR: + cde.d_type = CLOUDABI_FILETYPE_CHARACTER_DEVICE; + break; + case DT_DIR: + cde.d_type = CLOUDABI_FILETYPE_DIRECTORY; + break; + case DT_FIFO: + cde.d_type = CLOUDABI_FILETYPE_FIFO; + break; + case DT_LNK: + cde.d_type = CLOUDABI_FILETYPE_SYMBOLIC_LINK; + break; + case DT_REG: + cde.d_type = CLOUDABI_FILETYPE_REGULAR_FILE; + break; + case DT_SOCK: + /* The exact socket type cannot be derived. */ + cde.d_type = CLOUDABI_FILETYPE_SOCKET_STREAM; + break; + default: + cde.d_type = CLOUDABI_FILETYPE_UNKNOWN; + break; + } + + /* Write directory entry structure. */ + len = sizeof(cde) < uio->uio_resid ? sizeof(cde) : uio->uio_resid; + error = uiomove(&cde, len, uio); + if (error != 0) + return (error); - /* Not implemented. */ - return (ENOSYS); + /* Write filename. */ + len = bde->d_namlen < uio->uio_resid ? 
bde->d_namlen : uio->uio_resid; + return (uiomove(bde->d_name, len, uio)); } int cloudabi_sys_file_readdir(struct thread *td, struct cloudabi_sys_file_readdir_args *uap) { + struct iovec iov = { + .iov_base = uap->buf, + .iov_len = uap->nbyte + }; + struct uio uio = { + .uio_iov = &iov, + .uio_iovcnt = 1, + .uio_resid = iov.iov_len, + .uio_segflg = UIO_USERSPACE, + .uio_rw = UIO_READ, + .uio_td = td + }; + struct file *fp; + struct vnode *vp; + void *readbuf; + cap_rights_t rights; + cloudabi_dircookie_t offset; + int error; + + /* Obtain directory vnode. */ + error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp); + if (error != 0) { + if (error == EINVAL) + return (ENOTDIR); + return (error); + } + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } - /* Not implemented. */ - return (ENOSYS); + /* + * Call VOP_READDIR() and convert resulting data until the user + * provided buffer is filled. + */ + readbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); + offset = uap->cookie; + vp = fp->f_vnode; + while (uio.uio_resid > 0) { + struct iovec readiov = { + .iov_base = readbuf, + .iov_len = MAXBSIZE + }; + struct uio readuio = { + .uio_iov = &readiov, + .uio_iovcnt = 1, + .uio_rw = UIO_READ, + .uio_segflg = UIO_SYSSPACE, + .uio_td = td, + .uio_resid = MAXBSIZE, + .uio_offset = offset + }; + struct dirent *bde; + unsigned long *cookies, *cookie; + size_t readbuflen; + int eof, ncookies; + + /* Validate file type. */ + vn_lock(vp, LK_SHARED | LK_RETRY); + if (vp->v_type != VDIR) { + VOP_UNLOCK(vp, 0); + error = ENOTDIR; + goto done; + } +#ifdef MAC + error = mac_vnode_check_readdir(td->td_ucred, vp); + if (error != 0) { + VOP_UNLOCK(vp, 0); + goto done; + } +#endif /* MAC */ + + /* Read new directory entries. */ + cookies = NULL; + ncookies = 0; + error = VOP_READDIR(vp, &readuio, fp->f_cred, &eof, + &ncookies, &cookies); + VOP_UNLOCK(vp, 0); + if (error != 0) + goto done; + + /* Convert entries to CloudABI's format. 
*/ + readbuflen = MAXBSIZE - readuio.uio_resid; + bde = readbuf; + cookie = cookies; + while (readbuflen >= offsetof(struct dirent, d_name) && + uio.uio_resid > 0 && ncookies > 0) { + /* Ensure that the returned offset always increases. */ + if (readbuflen >= bde->d_reclen && bde->d_fileno != 0 && + *cookie > offset) { + error = write_dirent(bde, *cookie, &uio); + if (error != 0) { + free(cookies, M_TEMP); + goto done; + } + } + + if (offset < *cookie) + offset = *cookie; + ++cookie; + --ncookies; + readbuflen -= bde->d_reclen; + bde = (struct dirent *)((char *)bde + bde->d_reclen); + } + free(cookies, M_TEMP); + if (eof) + break; + } + +done: + fdrop(fp, td); + free(readbuf, M_TEMP); + if (error != 0) + return (error); + + /* Return number of bytes copied to userspace. */ + td->td_retval[0] = uap->nbyte - uio.uio_resid; + return (0); } int @@ -220,40 +537,185 @@ cloudabi_sys_file_rename(struct thread *td, return (error); } +/* Converts a FreeBSD stat structure to a CloudABI stat structure. */ +static void +convert_stat(const struct stat *sb, cloudabi_filestat_t *csb) +{ + cloudabi_filestat_t res = { + .st_dev = sb->st_dev, + .st_ino = sb->st_ino, + .st_nlink = sb->st_nlink, + .st_size = sb->st_size, + }; + + cloudabi_convert_timespec(&sb->st_atim, &res.st_atim); + cloudabi_convert_timespec(&sb->st_mtim, &res.st_mtim); + cloudabi_convert_timespec(&sb->st_ctim, &res.st_ctim); + *csb = res; +} + int cloudabi_sys_file_stat_fget(struct thread *td, struct cloudabi_sys_file_stat_fget_args *uap) { + struct stat sb; + cloudabi_filestat_t csb; + struct file *fp; + cap_rights_t rights; + cloudabi_filetype_t filetype; + int error; + + /* Fetch file descriptor attributes. 
*/ + error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FSTAT), &fp); + if (error != 0) + return (error); + error = fo_stat(fp, &sb, td->td_ucred, td); + if (error != 0) { + fdrop(fp, td); + return (error); + } + filetype = cloudabi_convert_filetype(fp); + fdrop(fp, td); + + /* Convert attributes to CloudABI's format. */ + convert_stat(&sb, &csb); + csb.st_filetype = filetype; + return (copyout(&csb, uap->buf, sizeof(csb))); +} + +/* Converts timestamps to arguments to futimens() and utimensat(). */ +static void +convert_utimens_arguments(const cloudabi_filestat_t *fs, + cloudabi_fsflags_t flags, struct timespec *ts) +{ + + if ((flags & CLOUDABI_FILESTAT_ATIM_NOW) != 0) { + ts[0].tv_nsec = UTIME_NOW; + } else if ((flags & CLOUDABI_FILESTAT_ATIM) != 0) { + ts[0].tv_sec = fs->st_atim / 1000000000; + ts[0].tv_nsec = fs->st_atim % 1000000000; + } else { + ts[0].tv_nsec = UTIME_OMIT; + } - /* Not implemented. */ - return (ENOSYS); + if ((flags & CLOUDABI_FILESTAT_MTIM_NOW) != 0) { + ts[1].tv_nsec = UTIME_NOW; + } else if ((flags & CLOUDABI_FILESTAT_MTIM) != 0) { + ts[1].tv_sec = fs->st_mtim / 1000000000; + ts[1].tv_nsec = fs->st_mtim % 1000000000; + } else { + ts[1].tv_nsec = UTIME_OMIT; + } } int cloudabi_sys_file_stat_fput(struct thread *td, struct cloudabi_sys_file_stat_fput_args *uap) { + cloudabi_filestat_t fs; + struct timespec ts[2]; + int error; + + error = copyin(uap->buf, &fs, sizeof(fs)); + if (error != 0) + return (error); - /* Not implemented. */ - return (ENOSYS); + /* + * Only support truncation and timestamp modification separately + * for now, to prevent unnecessary code duplication. + */ + if ((uap->flags & CLOUDABI_FILESTAT_SIZE) != 0) { + /* Call into kern_ftruncate() for file truncation. 
*/ + if ((uap->flags & ~CLOUDABI_FILESTAT_SIZE) != 0) + return (EINVAL); + return (kern_ftruncate(td, uap->fd, fs.st_size)); + } else if ((uap->flags & (CLOUDABI_FILESTAT_ATIM | + CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | + CLOUDABI_FILESTAT_MTIM_NOW)) != 0) { + /* Call into kern_futimens() for timestamp modification. */ + if ((uap->flags & ~(CLOUDABI_FILESTAT_ATIM | + CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | + CLOUDABI_FILESTAT_MTIM_NOW)) != 0) + return (EINVAL); + convert_utimens_arguments(&fs, uap->flags, ts); + return (kern_futimens(td, uap->fd, ts, UIO_SYSSPACE)); + } + return (EINVAL); } int cloudabi_sys_file_stat_get(struct thread *td, struct cloudabi_sys_file_stat_get_args *uap) { + struct stat sb; + cloudabi_filestat_t csb; + char *path; + int error; + + error = copyin_path(uap->path, uap->pathlen, &path); + if (error != 0) + return (error); + + error = kern_statat(td, + (uap->fd & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0 ? 0 : + AT_SYMLINK_NOFOLLOW, uap->fd, path, UIO_SYSSPACE, &sb, NULL); + cloudabi_freestr(path); + if (error != 0) + return (error); - /* Not implemented. */ - return (ENOSYS); + /* Convert results and return them. */ + convert_stat(&sb, &csb); + if (S_ISBLK(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_BLOCK_DEVICE; + else if (S_ISCHR(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_CHARACTER_DEVICE; + else if (S_ISDIR(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_DIRECTORY; + else if (S_ISFIFO(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_FIFO; + else if (S_ISREG(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_REGULAR_FILE; + else if (S_ISSOCK(sb.st_mode)) { + /* Inaccurate, but the best that we can do. 
*/ + csb.st_filetype = CLOUDABI_FILETYPE_SOCKET_STREAM; + } else if (S_ISLNK(sb.st_mode)) + csb.st_filetype = CLOUDABI_FILETYPE_SYMBOLIC_LINK; + else + csb.st_filetype = CLOUDABI_FILETYPE_UNKNOWN; + return (copyout(&csb, uap->buf, sizeof(csb))); } int cloudabi_sys_file_stat_put(struct thread *td, struct cloudabi_sys_file_stat_put_args *uap) { + cloudabi_filestat_t fs; + struct timespec ts[2]; + char *path; + int error; + + /* + * Only support timestamp modification for now, as there is no + * truncateat(). + */ + if ((uap->flags & ~(CLOUDABI_FILESTAT_ATIM | + CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | + CLOUDABI_FILESTAT_MTIM_NOW)) != 0) + return (EINVAL); - /* Not implemented. */ - return (ENOSYS); + error = copyin(uap->buf, &fs, sizeof(fs)); + if (error != 0) + return (error); + error = copyin_path(uap->path, uap->pathlen, &path); + if (error != 0) + return (error); + + convert_utimens_arguments(&fs, uap->flags, ts); + error = kern_utimensat(td, uap->fd, path, UIO_SYSSPACE, ts, + UIO_SYSSPACE, (uap->fd & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0 ? + 0 : AT_SYMLINK_NOFOLLOW); + cloudabi_freestr(path); + return (error); } int diff --git a/sys/compat/cloudabi/cloudabi_futex.c b/sys/compat/cloudabi/cloudabi_futex.c index 9d4fdf8..aec2f33 100644 --- a/sys/compat/cloudabi/cloudabi_futex.c +++ b/sys/compat/cloudabi/cloudabi_futex.c @@ -26,22 +26,1136 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <sys/umtx.h> + #include <compat/cloudabi/cloudabi_proto.h> +#include <compat/cloudabi/cloudabi_syscalldefs.h> +#include <compat/cloudabi/cloudabi_util.h> + +/* + * Futexes for CloudABI. + * + * On most systems, futexes are implemented as objects of a single type + * on which a set of operations can be performed. 
CloudABI makes a clear + * distinction between locks and condition variables. A lock may have + * zero or more associated condition variables. A condition variable is + * always associated with exactly one lock. There is a strict topology. + * This approach has two advantages: + * + * - This topology is guaranteed to be acyclic. Requeueing of threads + * only happens in one direction (from condition variables to locks). + * This eases locking. + * - It means that a futex object for a lock exists when it is unlocked, + * but has threads waiting on associated condition variables. Threads + * can be requeued to a lock even if the thread performing the wakeup + * does not have the lock mapped in its address space. + * + * This futex implementation only implements a single lock type, namely + * a read-write lock. A regular mutex type would not be necessary, as + * the read-write lock is as efficient as a mutex if used as such. + * Userspace futex locks are 32 bits in size: + * + * - 1 bit: has threads waiting in kernel-space. + * - 1 bit: is write-locked. + * - 30 bits: + * - if write-locked: thread ID of owner. + * - if not write-locked: number of read locks held. + * + * A condition variable is also 32 bits in size. Its value is modified + * by kernel-space exclusively. Zero indicates that it has no waiting + * threads. Non-zero indicates the opposite. + * + * This implementation is optimal, in the sense that it only wakes up + * threads if they can actually continue execution. It does not suffer + * from the thundering herd problem. If multiple threads waiting on a + * condition variable need to be woken up, only a single thread is + * scheduled. All other threads are 'donated' to this thread. After the + * thread manages to reacquire the lock, it requeues its donated threads + * to the lock. + * + * TODO(ed): Integrate this functionality into kern_umtx.c instead. + * TODO(ed): Store futex objects in a hash table. + * TODO(ed): Add actual priority inheritance.
+ * TODO(ed): Let futex_queue also take priorities into account. + * TODO(ed): Make locking fine-grained. + * TODO(ed): Perform sleeps until an actual absolute point in time, + * instead of converting the timestamp to a relative value. + */ + +struct futex_address; +struct futex_condvar; +struct futex_lock; +struct futex_queue; +struct futex_waiter; + +/* Identifier of a location in memory. */ +struct futex_address { + struct umtx_key fa_key; +}; + +/* A set of waiting threads. */ +struct futex_queue { + STAILQ_HEAD(, futex_waiter) fq_list; + unsigned int fq_count; +}; + +/* Condition variables. */ +struct futex_condvar { + /* Address of the condition variable. */ + struct futex_address fc_address; + + /* The lock the waiters should be moved to when signalled. */ + struct futex_lock * fc_lock; + + /* Threads waiting on the condition variable. */ + struct futex_queue fc_waiters; + /* + * Number of threads blocked on this condition variable, or + * being blocked on the lock after being requeued. + */ + unsigned int fc_waitcount; + + /* Global list pointers. */ + LIST_ENTRY(futex_condvar) fc_next; +}; + +/* Read-write locks. */ +struct futex_lock { + /* Address of the lock. */ + struct futex_address fl_address; + + /* + * Current owner of the lock. LOCK_UNMANAGED if the lock is + * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case + * the owner is not known (e.g., when the lock is read-locked). + */ + cloudabi_tid_t fl_owner; +#define LOCK_UNMANAGED 0x0 +#define LOCK_OWNER_UNKNOWN 0x1 + + /* Writers blocked on the lock. */ + struct futex_queue fl_writers; + /* Readers blocked on the lock. */ + struct futex_queue fl_readers; + /* Number of threads blocked on this lock + condition variables. */ + unsigned int fl_waitcount; + + /* Global list pointers. */ + LIST_ENTRY(futex_lock) fl_next; +}; + +/* Information associated with a thread blocked on an object. */ +struct futex_waiter { + /* Thread ID. 
*/ + cloudabi_tid_t fw_tid; + /* Condition variable used for waiting. */ + struct cv fw_wait; + + /* Queue this waiter is currently placed in. */ + struct futex_queue * fw_queue; + /* List pointers of fw_queue. */ + STAILQ_ENTRY(futex_waiter) fw_next; + + /* Lock has been acquired. */ + bool fw_locked; + /* If not locked, threads that should block after acquiring. */ + struct futex_queue fw_donated; +}; + +/* Global data structures. */ +static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex"); + +static struct sx futex_global_lock; +SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock"); + +static LIST_HEAD(, futex_lock) futex_lock_list = + LIST_HEAD_INITIALIZER(&futex_lock_list); +static LIST_HEAD(, futex_condvar) futex_condvar_list = + LIST_HEAD_INITIALIZER(&futex_condvar_list); + +/* Utility functions. */ +static void futex_lock_assert(const struct futex_lock *); +static struct futex_lock *futex_lock_lookup_locked(struct futex_address *); +static void futex_lock_release(struct futex_lock *); +static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *); +static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *); +static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *); +static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *); +static unsigned int futex_queue_count(const struct futex_queue *); +static void futex_queue_init(struct futex_queue *); +static void futex_queue_requeue(struct futex_queue *, struct futex_queue *, + unsigned int); +static int futex_queue_sleep(struct futex_queue *, struct futex_lock *, + struct futex_waiter *, struct thread *, cloudabi_clockid_t, + cloudabi_timestamp_t, cloudabi_timestamp_t); +static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *); +static void futex_queue_wake_up_all(struct futex_queue *); +static void futex_queue_wake_up_best(struct futex_queue *); +static void futex_queue_wake_up_donate(struct futex_queue *, unsigned 
int); +static int futex_user_load(uint32_t *, uint32_t *); +static int futex_user_store(uint32_t *, uint32_t); +static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t); + +/* + * futex_address operations. + */ + +static int +futex_address_create(struct futex_address *fa, struct thread *td, + const void *object, cloudabi_mflags_t scope) +{ + + KASSERT(td == curthread, + ("Can only create umtx keys for the current thread")); + switch (scope) { + case CLOUDABI_MAP_PRIVATE: + return (umtx_key_get(object, TYPE_FUTEX, THREAD_SHARE, + &fa->fa_key)); + case CLOUDABI_MAP_SHARED: + return (umtx_key_get(object, TYPE_FUTEX, AUTO_SHARE, + &fa->fa_key)); + default: + return (EINVAL); + } +} + +static void +futex_address_free(struct futex_address *fa) +{ + + umtx_key_release(&fa->fa_key); +} + +static bool +futex_address_match(const struct futex_address *fa1, + const struct futex_address *fa2) +{ + + return (umtx_key_match(&fa1->fa_key, &fa2->fa_key)); +} + +/* + * futex_condvar operations. + */ + +static void +futex_condvar_assert(const struct futex_condvar *fc) +{ + + KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters), + ("Total number of waiters cannot be smaller than the wait queue")); + futex_lock_assert(fc->fc_lock); +} + +static int +futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address, + cloudabi_mflags_t scope, struct futex_condvar **fcret) +{ + struct futex_address fa_condvar; + struct futex_condvar *fc; + int error; + + error = futex_address_create(&fa_condvar, td, address, scope); + if (error != 0) + return (error); + + sx_xlock(&futex_global_lock); + LIST_FOREACH(fc, &futex_condvar_list, fc_next) { + if (futex_address_match(&fc->fc_address, &fa_condvar)) { + /* Found matching condition variable object.
*/ + futex_address_free(&fa_condvar); + futex_condvar_assert(fc); + *fcret = fc; + return (0); + } + } + sx_xunlock(&futex_global_lock); + futex_address_free(&fa_condvar); + return (ENOENT); +} + +static int +futex_condvar_lookup_or_create(struct thread *td, + const cloudabi_condvar_t *condvar, cloudabi_mflags_t condvar_scope, + const cloudabi_lock_t *lock, cloudabi_mflags_t lock_scope, + struct futex_condvar **fcret) +{ + struct futex_address fa_condvar, fa_lock; + struct futex_condvar *fc; + struct futex_lock *fl; + int error; + + error = futex_address_create(&fa_condvar, td, condvar, condvar_scope); + if (error != 0) + return (error); + error = futex_address_create(&fa_lock, td, lock, lock_scope); + if (error != 0) { + futex_address_free(&fa_condvar); + return (error); + } + + sx_xlock(&futex_global_lock); + LIST_FOREACH(fc, &futex_condvar_list, fc_next) { + if (!futex_address_match(&fc->fc_address, &fa_condvar)) + continue; + fl = fc->fc_lock; + if (!futex_address_match(&fl->fl_address, &fa_lock)) { + /* Condition variable is owned by a different lock. */ + futex_address_free(&fa_condvar); + futex_address_free(&fa_lock); + sx_xunlock(&futex_global_lock); + return (EINVAL); + } + + /* Found fully matching condition variable. */ + futex_address_free(&fa_condvar); + futex_address_free(&fa_lock); + futex_condvar_assert(fc); + *fcret = fc; + return (0); + } + + /* None found. Create new condition variable object. */ + fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK); + fc->fc_address = fa_condvar; + fc->fc_lock = futex_lock_lookup_locked(&fa_lock); + futex_queue_init(&fc->fc_waiters); + fc->fc_waitcount = 0; + LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next); + *fcret = fc; + return (0); +} + +static void +futex_condvar_release(struct futex_condvar *fc) +{ + struct futex_lock *fl; + + futex_condvar_assert(fc); + fl = fc->fc_lock; + if (fc->fc_waitcount == 0) { + /* Condition variable has no waiters. Deallocate it. 
*/ + futex_address_free(&fc->fc_address); + LIST_REMOVE(fc, fc_next); + free(fc, M_FUTEX); + } + futex_lock_release(fl); +} + +static int +futex_condvar_unmanage(struct futex_condvar *fc, + cloudabi_condvar_t *condvar) +{ + + if (futex_queue_count(&fc->fc_waiters) != 0) + return (0); + return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS)); +} + +/* + * futex_lock operations. + */ + +static void +futex_lock_assert(const struct futex_lock *fl) +{ + + /* + * A futex lock can only be kernel-managed if it has waiters. + * Vice versa: if a futex lock has waiters, it must be + * kernel-managed. + */ + KASSERT((fl->fl_owner == LOCK_UNMANAGED) == + (futex_queue_count(&fl->fl_readers) == 0 && + futex_queue_count(&fl->fl_writers) == 0), + ("Managed locks must have waiting threads")); + KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED, + ("Lock with no waiters must be unmanaged")); +} + +static int +futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address, + cloudabi_mflags_t scope, struct futex_lock **flret) +{ + struct futex_address fa; + int error; + + error = futex_address_create(&fa, td, address, scope); + if (error != 0) + return (error); + + sx_xlock(&futex_global_lock); + *flret = futex_lock_lookup_locked(&fa); + return (0); +} + +static struct futex_lock * +futex_lock_lookup_locked(struct futex_address *fa) +{ + struct futex_lock *fl; + + LIST_FOREACH(fl, &futex_lock_list, fl_next) { + if (futex_address_match(&fl->fl_address, fa)) { + /* Found matching lock object. */ + futex_address_free(fa); + futex_lock_assert(fl); + return (fl); + } + } + + /* None found. Create new lock object. 
*/ + fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK); + fl->fl_address = *fa; + fl->fl_owner = LOCK_UNMANAGED; + futex_queue_init(&fl->fl_readers); + futex_queue_init(&fl->fl_writers); + fl->fl_waitcount = 0; + LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next); + return (fl); +} + +static int +futex_lock_rdlock(struct futex_lock *fl, struct thread *td, + cloudabi_lock_t *lock, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) +{ + struct futex_waiter fw; + int error; + + error = futex_lock_tryrdlock(fl, lock); + if (error == EBUSY) { + /* Suspend execution. */ + KASSERT(fl->fl_owner != LOCK_UNMANAGED, + ("Attempted to sleep on an unmanaged lock")); + error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td, + clock_id, timeout, precision); + KASSERT((error == 0) == fw.fw_locked, + ("Should have locked write lock on success")); + KASSERT(futex_queue_count(&fw.fw_donated) == 0, + ("Lock functions cannot receive threads")); + } + if (error != 0) + futex_lock_unmanage(fl, lock); + return (error); +} + +static void +futex_lock_release(struct futex_lock *fl) +{ + + futex_lock_assert(fl); + if (fl->fl_waitcount == 0) { + /* Lock object is unreferenced. Deallocate it. */ + KASSERT(fl->fl_owner == LOCK_UNMANAGED, + ("Attempted to free a managed lock")); + futex_address_free(&fl->fl_address); + LIST_REMOVE(fl, fl_next); + free(fl, M_FUTEX); + } + sx_xunlock(&futex_global_lock); +} + +static int +futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock) +{ + cloudabi_lock_t cmp, old; + int error; + + if (futex_queue_count(&fl->fl_readers) == 0 && + futex_queue_count(&fl->fl_writers) == 0) { + /* Lock should be unmanaged. */ + fl->fl_owner = LOCK_UNMANAGED; + + /* Clear kernel-managed bit. 
*/ + error = futex_user_load(lock, &old); + if (error != 0) + return (error); + for (;;) { + cmp = old; + error = futex_user_cmpxchg(lock, cmp, &old, + cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED); + if (error != 0) + return (error); + if (old == cmp) + break; + } + } + return (0); +} + +/* Sets an owner of a lock, based on a userspace lock value. */ +static void +futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock) +{ + + /* Lock has no explicit owner. */ + if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) { + fl->fl_owner = LOCK_OWNER_UNKNOWN; + return; + } + lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED); + + /* Don't allow userspace to silently unlock. */ + if (lock == LOCK_UNMANAGED) { + fl->fl_owner = LOCK_OWNER_UNKNOWN; + return; + } + fl->fl_owner = lock; +} + +static int +futex_lock_unlock(struct futex_lock *fl, struct thread *td, + cloudabi_lock_t *lock) +{ + int error; + + /* Validate that this thread is allowed to unlock. */ + error = futex_lock_update_owner(fl, lock); + if (error != 0) + return (error); + if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid) + return (EPERM); + return (futex_lock_wake_up_next(fl, lock)); +} + +/* Syncs in the owner of the lock from userspace if needed. */ +static int +futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address) +{ + cloudabi_lock_t lock; + int error; + + if (fl->fl_owner == LOCK_OWNER_UNKNOWN) { + error = futex_user_load(address, &lock); + if (error != 0) + return (error); + futex_lock_set_owner(fl, lock); + } + return (0); +} + +static int +futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address) +{ + cloudabi_lock_t old, cmp; + int error; + + if (fl->fl_owner != LOCK_UNMANAGED) { + /* Lock is already acquired. */ + return (EBUSY); + } + + old = CLOUDABI_LOCK_UNLOCKED; + for (;;) { + if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) { + /* + * Userspace lock is kernel-managed, even though + * the kernel disagrees. 
+ */ + return (EINVAL); + } + + if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) { + /* + * Lock is not write-locked. Attempt to acquire + * it by increasing the read count. + */ + cmp = old; + error = futex_user_cmpxchg(address, cmp, &old, cmp + 1); + if (error != 0) + return (error); + if (old == cmp) { + /* Success. */ + return (0); + } + } else { + /* Lock is write-locked. Make it kernel-managed. */ + cmp = old; + error = futex_user_cmpxchg(address, cmp, &old, + cmp | CLOUDABI_LOCK_KERNEL_MANAGED); + if (error != 0) + return (error); + if (old == cmp) { + /* Success. */ + futex_lock_set_owner(fl, cmp); + return (EBUSY); + } + } + } +} + +static int +futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address, + cloudabi_tid_t tid, bool force_kernel_managed) +{ + cloudabi_lock_t old, new, cmp; + int error; + + if (fl->fl_owner == tid) { + /* Attempted to acquire lock recursively. */ + return (EDEADLK); + } + if (fl->fl_owner != LOCK_UNMANAGED) { + /* Lock is already acquired. */ + return (EBUSY); + } + + old = CLOUDABI_LOCK_UNLOCKED; + for (;;) { + if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) { + /* + * Userspace lock is kernel-managed, even though + * the kernel disagrees. + */ + return (EINVAL); + } + if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) { + /* Attempted to acquire lock recursively. */ + return (EDEADLK); + } + + if (old == CLOUDABI_LOCK_UNLOCKED) { + /* Lock is unlocked. Attempt to acquire it. */ + new = tid | CLOUDABI_LOCK_WRLOCKED; + if (force_kernel_managed) + new |= CLOUDABI_LOCK_KERNEL_MANAGED; + error = futex_user_cmpxchg(address, + CLOUDABI_LOCK_UNLOCKED, &old, new); + if (error != 0) + return (error); + if (old == CLOUDABI_LOCK_UNLOCKED) { + /* Success. */ + if (force_kernel_managed) + fl->fl_owner = tid; + return (0); + } + } else { + /* Lock is still locked. Make it kernel-managed. 
*/ + cmp = old; + error = futex_user_cmpxchg(address, cmp, &old, + cmp | CLOUDABI_LOCK_KERNEL_MANAGED); + if (error != 0) + return (error); + if (old == cmp) { + /* Success. */ + futex_lock_set_owner(fl, cmp); + return (EBUSY); + } + } + } +} + +static int +futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock) +{ + cloudabi_tid_t tid; + int error; + + /* + * Determine which thread(s) to wake up. Prefer waking up + * writers over readers to prevent write starvation. + */ + if (futex_queue_count(&fl->fl_writers) > 0) { + /* Transfer ownership to a single write-locker. */ + if (futex_queue_count(&fl->fl_writers) > 1 || + futex_queue_count(&fl->fl_readers) > 0) { + /* Lock should remain managed afterwards. */ + tid = futex_queue_tid_best(&fl->fl_writers); + error = futex_user_store(lock, + tid | CLOUDABI_LOCK_WRLOCKED | + CLOUDABI_LOCK_KERNEL_MANAGED); + if (error != 0) + return (error); + + futex_queue_wake_up_best(&fl->fl_writers); + fl->fl_owner = tid; + } else { + /* Lock can become unmanaged afterwards. */ + error = futex_user_store(lock, + futex_queue_tid_best(&fl->fl_writers) | + CLOUDABI_LOCK_WRLOCKED); + if (error != 0) + return (error); + + futex_queue_wake_up_best(&fl->fl_writers); + fl->fl_owner = LOCK_UNMANAGED; + } + } else { + /* Transfer ownership to all read-lockers (if any). */ + error = futex_user_store(lock, + futex_queue_count(&fl->fl_readers)); + if (error != 0) + return (error); + + /* Wake up all threads. 
*/ + futex_queue_wake_up_all(&fl->fl_readers); + fl->fl_owner = LOCK_UNMANAGED; + } + return (0); +} + +static int +futex_lock_wrlock(struct futex_lock *fl, struct thread *td, + cloudabi_lock_t *lock, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, + struct futex_queue *donated) +{ + struct futex_waiter fw; + int error; + + error = futex_lock_trywrlock(fl, lock, td->td_tid, + futex_queue_count(donated) > 0); + + if (error == 0 || error == EBUSY) { + /* Put donated threads in queue before suspending. */ + KASSERT(futex_queue_count(donated) == 0 || + fl->fl_owner != LOCK_UNMANAGED, + ("Lock should be managed if we are going to donate")); + futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX); + } else { + /* + * This thread cannot deal with the donated threads. + * Wake up the next thread and let it try it by itself. + */ + futex_queue_wake_up_donate(donated, UINT_MAX); + } + + if (error == EBUSY) { + /* Suspend execution if the lock was busy. */ + KASSERT(fl->fl_owner != LOCK_UNMANAGED, + ("Attempted to sleep on an unmanaged lock")); + error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td, + clock_id, timeout, precision); + KASSERT((error == 0) == fw.fw_locked, + ("Should have locked write lock on success")); + KASSERT(futex_queue_count(&fw.fw_donated) == 0, + ("Lock functions cannot receive threads")); + } + if (error != 0) + futex_lock_unmanage(fl, lock); + return (error); +} + +/* + * futex_queue operations. + */ + +static cloudabi_tid_t +futex_queue_tid_best(const struct futex_queue *fq) +{ + + return (STAILQ_FIRST(&fq->fq_list)->fw_tid); +} + +static unsigned int +futex_queue_count(const struct futex_queue *fq) +{ + + return (fq->fq_count); +} + +static void +futex_queue_init(struct futex_queue *fq) +{ + + STAILQ_INIT(&fq->fq_list); + fq->fq_count = 0; +} + +/* Converts a relative timestamp to an sbintime. 
*/ +static sbintime_t +futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts) +{ + cloudabi_timestamp_t s, ns; + + s = ts / 1000000000; + ns = ts % 1000000000; + if (s > INT32_MAX) + return (INT64_MAX); + return ((s << 32) + (ns << 32) / 1000000000); +} + +/* Converts an absolute timestamp and precision to a pair of sbintime values. */ +static int +futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, + sbintime_t *sbttimeout, sbintime_t *sbtprecision) +{ + cloudabi_timestamp_t now; + int error; + + /* Make the time relative. */ + error = cloudabi_clock_time_get(td, clock_id, &now); + if (error != 0) + return (error); + timeout = timeout < now ? 0 : timeout - now; + + *sbttimeout = futex_queue_convert_timestamp_relative(timeout); + *sbtprecision = futex_queue_convert_timestamp_relative(precision); + return (0); +} + +static int +futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl, + struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) +{ + sbintime_t sbttimeout, sbtprecision; + int error; + + /* Initialize futex_waiter object. */ + fw->fw_tid = td->td_tid; + fw->fw_locked = false; + futex_queue_init(&fw->fw_donated); + + if (timeout != UINT64_MAX) { + /* Convert timeout duration. */ + error = futex_queue_convert_timestamp(td, clock_id, timeout, + precision, &sbttimeout, &sbtprecision); + if (error != 0) + return (error); + } + + /* Place object in the queue. */ + fw->fw_queue = fq; + STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next); + ++fq->fq_count; + + cv_init(&fw->fw_wait, "futex"); + ++fl->fl_waitcount; + + futex_lock_assert(fl); + if (timeout == UINT64_MAX) { + /* Wait without a timeout. */ + error = cv_wait_sig(&fw->fw_wait, &futex_global_lock); + } else { + /* Wait respecting the timeout. 
*/ + error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock, + sbttimeout, sbtprecision, 0); + futex_lock_assert(fl); + if (error == EWOULDBLOCK && + fw->fw_queue != NULL && fw->fw_queue != fq) { + /* + * We got signalled on a condition variable, but + * observed a timeout while waiting to reacquire + * the lock. In other words, we didn't actually + * time out. Go back to sleep and wait for the + * lock to be reacquired. + */ + error = cv_wait_sig(&fw->fw_wait, &futex_global_lock); + } + } + futex_lock_assert(fl); + + --fl->fl_waitcount; + cv_destroy(&fw->fw_wait); + + fq = fw->fw_queue; + if (fq == NULL) { + /* Thread got dequeued, so we've slept successfully. */ + return (0); + } + + /* Thread is still enqueued. Remove it. */ + KASSERT(error != 0, ("Woken up thread is still enqueued")); + STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next); + --fq->fq_count; + return (error == EWOULDBLOCK ? ETIMEDOUT : error); +} + +/* Moves up to nwaiters waiters from one queue to another. */ +static void +futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto, + unsigned int nwaiters) +{ + struct futex_waiter *fw; + + /* Move waiters to the target queue. */ + while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) { + fw = STAILQ_FIRST(&fqfrom->fq_list); + STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next); + --fqfrom->fq_count; + + fw->fw_queue = fqto; + STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next); + ++fqto->fq_count; + } +} + +/* Wakes up all waiters in a queue. */ +static void +futex_queue_wake_up_all(struct futex_queue *fq) +{ + struct futex_waiter *fw; + + STAILQ_FOREACH(fw, &fq->fq_list, fw_next) { + fw->fw_locked = true; + fw->fw_queue = NULL; + cv_signal(&fw->fw_wait); + } + + STAILQ_INIT(&fq->fq_list); + fq->fq_count = 0; +} + +/* + * Wakes up the best waiter (i.e., the waiter having the highest + * priority) in a queue. 
+ */ +static void +futex_queue_wake_up_best(struct futex_queue *fq) +{ + struct futex_waiter *fw; + + fw = STAILQ_FIRST(&fq->fq_list); + fw->fw_locked = true; + fw->fw_queue = NULL; + cv_signal(&fw->fw_wait); + + STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next); + --fq->fq_count; +} + +static void +futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters) +{ + struct futex_waiter *fw; + + fw = STAILQ_FIRST(&fq->fq_list); + if (fw == NULL) + return; + fw->fw_locked = false; + fw->fw_queue = NULL; + cv_signal(&fw->fw_wait); + + STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next); + --fq->fq_count; + futex_queue_requeue(fq, &fw->fw_donated, nwaiters); +} + +/* + * futex_user operations. Used to adjust values in userspace. + */ + +static int +futex_user_load(uint32_t *obj, uint32_t *val) +{ + + return (fueword32(obj, val) != 0 ? EFAULT : 0); +} + +static int +futex_user_store(uint32_t *obj, uint32_t val) +{ + + return (suword32(obj, val) != 0 ? EFAULT : 0); +} + +static int +futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new) +{ + + return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0); +} + +/* + * Blocking calls: acquiring locks, waiting on condition variables. + */ + +int +cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar, + cloudabi_mflags_t condvar_scope, cloudabi_lock_t *lock, + cloudabi_mflags_t lock_scope, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) +{ + struct futex_condvar *fc; + struct futex_lock *fl; + struct futex_waiter fw; + int error, error2; + + /* Lookup condition variable object. */ + error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock, + lock_scope, &fc); + if (error != 0) + return (error); + fl = fc->fc_lock; + + /* + * Set the condition variable to something other than + * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads + * call into the kernel to perform wakeups. 
+ */ + error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS); + if (error != 0) { + futex_condvar_release(fc); + return (error); + } + + /* Drop the lock. */ + error = futex_lock_unlock(fl, td, lock); + if (error != 0) { + futex_condvar_unmanage(fc, condvar); + futex_condvar_release(fc); + return (error); + } + + /* Go to sleep. */ + ++fc->fc_waitcount; + error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td, + clock_id, timeout, precision); + if (fw.fw_locked) { + /* Waited and got the lock assigned to us. */ + KASSERT(futex_queue_count(&fw.fw_donated) == 0, + ("Received threads while being locked")); + } else if (error == 0 || error == ETIMEDOUT) { + if (error != 0) + futex_condvar_unmanage(fc, condvar); + /* + * Got woken up without having the lock assigned to us. + * This can happen in two cases: + * + * 1. We observed a timeout on a condition variable. + * 2. We got signalled on a condition variable while the + * associated lock is unlocked. We are the first + * thread that gets woken up. This thread is + * responsible for reacquiring the userspace lock. + */ + error2 = futex_lock_wrlock(fl, td, lock, + CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, &fw.fw_donated); + if (error2 != 0) + error = error2; + } else { + KASSERT(futex_queue_count(&fw.fw_donated) == 0, + ("Received threads on error")); + futex_condvar_unmanage(fc, condvar); + futex_lock_unmanage(fl, lock); + } + --fc->fc_waitcount; + futex_condvar_release(fc); + return (error); +} + +int +cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock, + cloudabi_mflags_t scope, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) +{ + struct futex_lock *fl; + int error; + + /* Look up lock object. 
*/ + error = futex_lock_lookup(td, lock, scope, &fl); + if (error != 0) + return (error); + + error = futex_lock_rdlock(fl, td, lock, clock_id, timeout, + precision); + futex_lock_release(fl); + return (error); +} + +int +cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock, + cloudabi_mflags_t scope, cloudabi_clockid_t clock_id, + cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) +{ + struct futex_lock *fl; + struct futex_queue fq; + int error; + + /* Look up lock object. */ + error = futex_lock_lookup(td, lock, scope, &fl); + if (error != 0) + return (error); + + futex_queue_init(&fq); + error = futex_lock_wrlock(fl, td, lock, clock_id, timeout, + precision, &fq); + futex_lock_release(fl); + return (error); +} + +/* + * Non-blocking calls: releasing locks, signalling condition variables. + */ int cloudabi_sys_condvar_signal(struct thread *td, struct cloudabi_sys_condvar_signal_args *uap) { + struct futex_condvar *fc; + struct futex_lock *fl; + cloudabi_nthreads_t nwaiters; + int error; + + nwaiters = uap->nwaiters; + if (nwaiters == 0) { + /* No threads to wake up. */ + return (0); + } + + /* Look up futex object. */ + error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc); + if (error != 0) { + /* Race condition: condition variable with no waiters. */ + return (error == ENOENT ? 0 : error); + } + fl = fc->fc_lock; + + if (fl->fl_owner == LOCK_UNMANAGED) { + /* + * The lock is currently not managed by the kernel, + * meaning we must attempt to acquire the userspace lock + * first. We cannot requeue threads to an unmanaged lock, + * as these threads will then never be scheduled. + * + * Unfortunately, the memory address of the lock is + * unknown from this context, meaning that we cannot + * acquire the lock on behalf of the first thread to be + * scheduled. The lock may even not be mapped within the + * address space of the current thread. + * + * To solve this, wake up a single waiter that will + * attempt to acquire the lock. 
Donate all of the other + * waiters that need to be woken up to this waiter, so + * it can requeue them after acquiring the lock. + */ + futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1); + } else { + /* + * Lock is already managed by the kernel. This makes it + * easy, as we can requeue the threads from the + * condition variable directly to the associated lock. + */ + futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters); + } - /* Not implemented. */ - return (ENOSYS); + /* Clear userspace condition variable if all waiters are gone. */ + error = futex_condvar_unmanage(fc, uap->condvar); + futex_condvar_release(fc); + return (error); } int cloudabi_sys_lock_unlock(struct thread *td, struct cloudabi_sys_lock_unlock_args *uap) { + struct futex_lock *fl; + int error; - /* Not implemented. */ - return (ENOSYS); + error = futex_lock_lookup(td, uap->lock, uap->scope, &fl); + if (error != 0) + return (error); + error = futex_lock_unlock(fl, td, uap->lock); + futex_lock_release(fl); + return (error); } diff --git a/sys/compat/cloudabi/cloudabi_proc.c b/sys/compat/cloudabi/cloudabi_proc.c index 1f4418f..d917337 100644 --- a/sys/compat/cloudabi/cloudabi_proc.c +++ b/sys/compat/cloudabi/cloudabi_proc.c @@ -27,8 +27,11 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/capsicum.h> +#include <sys/filedesc.h> #include <sys/imgact.h> #include <sys/lock.h> +#include <sys/module.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/signalvar.h> @@ -43,14 +46,19 @@ cloudabi_sys_proc_exec(struct thread *td, struct cloudabi_sys_proc_exec_args *uap) { struct image_args args; + struct vmspace *oldvmspace; int error; + error = pre_execve(td, &oldvmspace); + if (error != 0) + return (error); error = exec_copyin_data_fds(td, &args, uap->data, uap->datalen, uap->fds, uap->fdslen); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } + post_execve(td, error, oldvmspace); return (error); } @@ -67,10 +75,12 @@ int 
cloudabi_sys_proc_fork(struct thread *td, struct cloudabi_sys_proc_fork_args *uap) { + struct filecaps fcaps = {}; struct proc *p2; int error, fd; - error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2, &fd, 0); + cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_EVENT); + error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2, &fd, 0, &fcaps); if (error != 0) return (error); /* Return the file descriptor to the parent process. */ @@ -129,3 +139,5 @@ cloudabi_sys_proc_raise(struct thread *td, PROC_UNLOCK(p); return (0); } + +MODULE_VERSION(cloudabi, 1); diff --git a/sys/compat/cloudabi/cloudabi_sock.c b/sys/compat/cloudabi/cloudabi_sock.c index 877571c..3aefab1 100644 --- a/sys/compat/cloudabi/cloudabi_sock.c +++ b/sys/compat/cloudabi/cloudabi_sock.c @@ -27,14 +27,62 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/capsicum.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/protosw.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> #include <sys/systm.h> #include <sys/un.h> +#include <net/vnet.h> + +#include <netinet/in.h> + #include <compat/cloudabi/cloudabi_proto.h> #include <compat/cloudabi/cloudabi_syscalldefs.h> +#include <compat/cloudabi/cloudabi_util.h> + +/* Converts FreeBSD's struct sockaddr to CloudABI's cloudabi_sockaddr_t. */ +void +cloudabi_convert_sockaddr(const struct sockaddr *sa, socklen_t sal, + cloudabi_sockaddr_t *rsa) +{ + const struct sockaddr_in *sin; + const struct sockaddr_in6 *sin6; + + /* Zero-sized socket address. 
*/ + if (sal < offsetof(struct sockaddr, sa_family) + sizeof(sa->sa_family)) + return; + + switch (sa->sa_family) { + case AF_INET: + if (sal < sizeof(struct sockaddr_in)) + return; + sin = (const struct sockaddr_in *)sa; + rsa->sa_family = CLOUDABI_AF_INET; + memcpy(&rsa->sa_inet.addr, &sin->sin_addr, + sizeof(rsa->sa_inet.addr)); + /* ntohs(): the CloudABI port field holds the port in host byte order. */ rsa->sa_inet.port = ntohs(sin->sin_port); + return; + case AF_INET6: + if (sal < sizeof(struct sockaddr_in6)) + return; + sin6 = (const struct sockaddr_in6 *)sa; + rsa->sa_family = CLOUDABI_AF_INET6; + memcpy(&rsa->sa_inet6.addr, &sin6->sin6_addr, + sizeof(rsa->sa_inet6.addr)); + rsa->sa_inet6.port = ntohs(sin6->sin6_port); + return; + case AF_UNIX: + /* Only the family is set; no pathname is copied. */ rsa->sa_family = CLOUDABI_AF_UNIX; + return; + } +} /* Copies a pathname into a UNIX socket address structure. */ static int @@ -62,9 +110,27 @@ int cloudabi_sys_sock_accept(struct thread *td, struct cloudabi_sys_sock_accept_args *uap) { + struct sockaddr *sa; + cloudabi_sockstat_t ss = {}; + socklen_t sal; + int error; - /* Not implemented. */ - return (ENOSYS); + if (uap->buf == NULL) { + /* Only return the new file descriptor number. */ + return (kern_accept(td, uap->s, NULL, NULL, NULL)); + } else { + /* Also return properties of the new socket descriptor. */ + /* sal is passed by pointer; it starts as the larger of the IPv4 and IPv6 sockaddr sizes. */ sal = MAX(sizeof(struct sockaddr_in), + sizeof(struct sockaddr_in6)); + error = kern_accept(td, uap->s, (void *)&sa, &sal, NULL); + if (error != 0) + return (error); + + /* TODO(ed): Fill the other members of cloudabi_sockstat_t. 
*/ + cloudabi_convert_sockaddr(sa, sal, &ss.ss_peername); + /* sa was filled in by kern_accept() and is freed here with M_SONAME. */ free(sa, M_SONAME); + return (copyout(&ss, uap->buf, sizeof(ss))); + } } int @@ -134,7 +200,51 @@ int cloudabi_sys_sock_stat_get(struct thread *td, struct cloudabi_sys_sock_stat_get_args *uap) { + cloudabi_sockstat_t ss = {}; + cap_rights_t rights; + struct file *fp; + struct sockaddr *sa; + struct socket *so; + int error; + + /* NOTE(review): the three rights are OR'ed into a single argument; confirm cap_rights_init() accepts that for these constants. */ error = getsock_cap(td, uap->fd, cap_rights_init(&rights, + CAP_GETSOCKOPT | CAP_GETPEERNAME | CAP_GETSOCKNAME), &fp, NULL); + if (error != 0) + return (error); + so = fp->f_data; + + CURVNET_SET(so->so_vnet); + + /* Set ss_sockname. A failing pru_sockaddr() leaves it zeroed; that error is not returned to the caller. */ + error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + if (error == 0) { + cloudabi_convert_sockaddr(sa, sa->sa_len, &ss.ss_sockname); + free(sa, M_SONAME); + } + + /* Set ss_peername, only when the socket is connected or confirming. */ + if ((so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) != 0) { + error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); + if (error == 0) { + cloudabi_convert_sockaddr(sa, sa->sa_len, + &ss.ss_peername); + free(sa, M_SONAME); + } + } + + CURVNET_RESTORE(); + + /* Set ss_error under the socket lock, clearing so_error when CLOUDABI_SOCKSTAT_CLEAR_ERROR is requested. */ + SOCK_LOCK(so); + ss.ss_error = cloudabi_convert_errno(so->so_error); + if ((uap->flags & CLOUDABI_SOCKSTAT_CLEAR_ERROR) != 0) + so->so_error = 0; + SOCK_UNLOCK(so); + + /* Set ss_state. */ + if ((so->so_options & SO_ACCEPTCONN) != 0) + ss.ss_state |= CLOUDABI_SOCKSTAT_ACCEPTCONN; - /* Not implemented. 
*/ - return (ENOSYS); + fdrop(fp, td); + return (copyout(&ss, uap->buf, sizeof(ss))); } diff --git a/sys/compat/cloudabi/cloudabi_thread.c b/sys/compat/cloudabi/cloudabi_thread.c index 4b9eb54..8aee708 100644 --- a/sys/compat/cloudabi/cloudabi_thread.c +++ b/sys/compat/cloudabi/cloudabi_thread.c @@ -29,16 +29,30 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/proc.h> #include <sys/sched.h> +#include <sys/syscallsubr.h> #include <compat/cloudabi/cloudabi_proto.h> +#include <compat/cloudabi/cloudabi_syscalldefs.h> int cloudabi_sys_thread_exit(struct thread *td, struct cloudabi_sys_thread_exit_args *uap) { + struct cloudabi_sys_lock_unlock_args cloudabi_sys_lock_unlock_args = { + .lock = uap->lock, + .scope = uap->scope, + }; - /* Not implemented. */ - return (ENOSYS); + /* Wake up joining thread by unlocking uap->lock; the unlock's return value is ignored. */ + cloudabi_sys_lock_unlock(td, &cloudabi_sys_lock_unlock_args); + + /* + * Attempt to terminate the thread. Terminate the process if + * it's the last thread. + */ + kern_thr_exit(td); + exit1(td, 0, 0); + /* NOTREACHED */ } int diff --git a/sys/compat/cloudabi/cloudabi_util.h b/sys/compat/cloudabi/cloudabi_util.h index d9e6f12..10da229 100644 --- a/sys/compat/cloudabi/cloudabi_util.h +++ b/sys/compat/cloudabi/cloudabi_util.h @@ -28,14 +28,52 @@ #ifndef _CLOUDABI_UTIL_H_ #define _CLOUDABI_UTIL_H_ +#include <sys/socket.h> + #include <compat/cloudabi/cloudabi_syscalldefs.h> +struct file; +struct thread; struct timespec; +/* Fetches the time value of a clock into the cloudabi_timestamp_t pointer; returns 0 or an error. */ +int cloudabi_clock_time_get(struct thread *, cloudabi_clockid_t, + cloudabi_timestamp_t *); + /* Converts a FreeBSD errno to a CloudABI errno. */ cloudabi_errno_t cloudabi_convert_errno(int); +/* Converts FreeBSD's struct sockaddr to CloudABI's cloudabi_sockaddr_t. */ +void cloudabi_convert_sockaddr(const struct sockaddr *, socklen_t, + cloudabi_sockaddr_t *); + +/* Converts a file descriptor to a CloudABI file descriptor type. 
*/ +cloudabi_filetype_t cloudabi_convert_filetype(const struct file *); + +/* Converts CloudABI rights to a set of Capsicum capabilities. NOTE(review): cap_rights_t is used here, but the only includes visible in this header are <sys/socket.h> and cloudabi_syscalldefs.h - confirm includers provide <sys/capsicum.h>. */ +int cloudabi_convert_rights(cloudabi_rights_t, cap_rights_t *); + +/* Removes rights that conflict with the file descriptor type. */ +void cloudabi_remove_conflicting_rights(cloudabi_filetype_t, + cloudabi_rights_t *, cloudabi_rights_t *); + /* Converts a struct timespec to a CloudABI timestamp. */ int cloudabi_convert_timespec(const struct timespec *, cloudabi_timestamp_t *); +/* + * Blocking futex functions. + * + * These functions are called by CloudABI's polling system calls to + * sleep on a lock or condition variable. + */ +int cloudabi_futex_condvar_wait(struct thread *, cloudabi_condvar_t *, + cloudabi_mflags_t, cloudabi_lock_t *, cloudabi_mflags_t, cloudabi_clockid_t, + cloudabi_timestamp_t, cloudabi_timestamp_t); +int cloudabi_futex_lock_rdlock(struct thread *, cloudabi_lock_t *, + cloudabi_mflags_t, cloudabi_clockid_t, cloudabi_timestamp_t, + cloudabi_timestamp_t); +int cloudabi_futex_lock_wrlock(struct thread *, cloudabi_lock_t *, + cloudabi_mflags_t, cloudabi_clockid_t, cloudabi_timestamp_t, + cloudabi_timestamp_t); + #endif |