author    | Renato Botelho <renato@netgate.com> | 2016-07-19 11:28:18 -0300
committer | Renato Botelho <renato@netgate.com> | 2016-07-19 11:28:18 -0300
commit    | a3c20a378f7a9fa76e9301a43ac64ed07057d01e (patch)
tree      | d14045c61b471f327f487431e05b0c4fe0cd8d76 /sys
parent    | 3a4027cfafa37c1a0c0b05987c0edb1452c7bd2b (diff)
parent    | 3f1f4f0e73b6d12c01e8cad4791d23e8e56127db (diff)
download  | FreeBSD-src-a3c20a378f7a9fa76e9301a43ac64ed07057d01e.zip, FreeBSD-src-a3c20a378f7a9fa76e9301a43ac64ed07057d01e.tar.gz
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys')
63 files changed, 1075 insertions, 1082 deletions
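Two of the more user-visible changes merged here are the reworked Linux personality(2) handling (sys/compat/linux/linux_misc.c plus the new sys/compat/linux/linux_mmap.c) and the ZFS ARC limits becoming runtime-tunable sysctls (sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c). The two sketches below are illustrative only; they are not part of the diff.

The personality handler now keeps the persona per process in struct linux_pemuldata and follows the Linux query convention: passing 0xffffffff returns the current persona without changing it. A minimal check from a Linux binary, assuming glibc's personality(2) wrapper and its READ_IMPLIES_EXEC constant:

```c
#include <stdio.h>
#include <sys/personality.h>

int
main(void)
{
	/*
	 * 0xffffffff is the Linux "query" value: the handler returns the
	 * old persona and only stores a new one when per != 0xffffffff.
	 */
	int old = personality(0xffffffff);

	if (old == -1) {
		perror("personality");
		return (1);
	}
	printf("current persona: 0x%x\n", old);
	if (old & READ_IMPLIES_EXEC)
		printf("READ_IMPLIES_EXEC set: PROT_READ mappings "
		    "are also mapped executable\n");
	return (0);
}
```

On the ZFS side, vfs.zfs.arc_max and vfs.zfs.arc_min change from read-only loader tunables to CTLTYPE_U64 sysctl handlers, so they can be read and adjusted on a running system. A hedged example of reading the current limit from FreeBSD userland via sysctlbyname(3):

```c
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t arc_max;
	size_t len = sizeof(arc_max);

	/* The new handler exposes the value as a 64-bit quantity ("QU"). */
	if (sysctlbyname("vfs.zfs.arc_max", &arc_max, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vfs.zfs.arc_max = %ju bytes\n", (uintmax_t)arc_max);

	/*
	 * Writes are now accepted as well (subject to the sanity checks in
	 * sysctl_vfs_zfs_arc_max): pass a new uint64_t via the newp/newlen
	 * arguments instead of NULL/0.
	 */
	return (0);
}
```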
diff --git a/sys/amd64/linux/linux.h b/sys/amd64/linux/linux.h index 639499a..b30568f 100644 --- a/sys/amd64/linux/linux.h +++ b/sys/amd64/linux/linux.h @@ -139,13 +139,6 @@ struct l_rlimit { l_ulong rlim_max; }; -/* mmap options */ -#define LINUX_MAP_SHARED 0x0001 -#define LINUX_MAP_PRIVATE 0x0002 -#define LINUX_MAP_FIXED 0x0010 -#define LINUX_MAP_ANON 0x0020 -#define LINUX_MAP_GROWSDOWN 0x0100 - /* * stat family of syscalls */ diff --git a/sys/amd64/linux/linux_machdep.c b/sys/amd64/linux/linux_machdep.c index 9664dac..80c48aa 100644 --- a/sys/amd64/linux/linux_machdep.c +++ b/sys/amd64/linux/linux_machdep.c @@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$"); #include <compat/linux/linux_ipc.h> #include <compat/linux/linux_file.h> #include <compat/linux/linux_misc.h> +#include <compat/linux/linux_mmap.h> #include <compat/linux/linux_signal.h> #include <compat/linux/linux_util.h> #include <compat/linux/linux_emul.h> @@ -122,181 +123,19 @@ linux_set_upcall_kse(struct thread *td, register_t stack) return (0); } -#define STACK_SIZE (2 * 1024 * 1024) -#define GUARD_SIZE (4 * PAGE_SIZE) - int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { - struct proc *p = td->td_proc; - struct mmap_args /* { - caddr_t addr; - size_t len; - int prot; - int flags; - int fd; - long pad; - off_t pos; - } */ bsd_args; - int error; - struct file *fp; - cap_rights_t rights; - - LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx", - args->addr, args->len, args->prot, - args->flags, args->fd, args->pgoff); - - error = 0; - bsd_args.flags = 0; - fp = NULL; - - /* - * Linux mmap(2): - * You must specify exactly one of MAP_SHARED and MAP_PRIVATE - */ - if (! ((args->flags & LINUX_MAP_SHARED) ^ - (args->flags & LINUX_MAP_PRIVATE))) - return (EINVAL); - - if (args->flags & LINUX_MAP_SHARED) - bsd_args.flags |= MAP_SHARED; - if (args->flags & LINUX_MAP_PRIVATE) - bsd_args.flags |= MAP_PRIVATE; - if (args->flags & LINUX_MAP_FIXED) - bsd_args.flags |= MAP_FIXED; - if (args->flags & LINUX_MAP_ANON) - bsd_args.flags |= MAP_ANON; - else - bsd_args.flags |= MAP_NOSYNC; - if (args->flags & LINUX_MAP_GROWSDOWN) - bsd_args.flags |= MAP_STACK; - /* - * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC - * on Linux/i386. We do this to ensure maximum compatibility. - * Linux/ia64 does the same in i386 emulation mode. - */ - bsd_args.prot = args->prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - - /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ - bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : args->fd; - if (bsd_args.fd != -1) { - /* - * Linux follows Solaris mmap(2) description: - * The file descriptor fildes is opened with - * read permission, regardless of the - * protection options specified. - */ - - error = fget(td, bsd_args.fd, - cap_rights_init(&rights, CAP_MMAP), &fp); - if (error != 0 ) - return (error); - if (fp->f_type != DTYPE_VNODE) { - fdrop(fp, td); - return (EINVAL); - } - - /* Linux mmap() just fails for O_WRONLY files */ - if (!(fp->f_flag & FREAD)) { - fdrop(fp, td); - return (EACCES); - } - - fdrop(fp, td); - } - - if (args->flags & LINUX_MAP_GROWSDOWN) { - /* - * The Linux MAP_GROWSDOWN option does not limit auto - * growth of the region. Linux mmap with this option - * takes as addr the inital BOS, and as len, the initial - * region size. It can then grow down from addr without - * limit. However, Linux threads has an implicit internal - * limit to stack size of STACK_SIZE. 
Its just not - * enforced explicitly in Linux. But, here we impose - * a limit of (STACK_SIZE - GUARD_SIZE) on the stack - * region, since we can do this with our mmap. - * - * Our mmap with MAP_STACK takes addr as the maximum - * downsize limit on BOS, and as len the max size of - * the region. It then maps the top SGROWSIZ bytes, - * and auto grows the region down, up to the limit - * in addr. - * - * If we don't use the MAP_STACK option, the effect - * of this code is to allocate a stack region of a - * fixed size of (STACK_SIZE - GUARD_SIZE). - */ - - if ((caddr_t)PTRIN(args->addr) + args->len > - p->p_vmspace->vm_maxsaddr) { - /* - * Some Linux apps will attempt to mmap - * thread stacks near the top of their - * address space. If their TOS is greater - * than vm_maxsaddr, vm_map_growstack() - * will confuse the thread stack with the - * process stack and deliver a SEGV if they - * attempt to grow the thread stack past their - * current stacksize rlimit. To avoid this, - * adjust vm_maxsaddr upwards to reflect - * the current stacksize rlimit rather - * than the maximum possible stacksize. - * It would be better to adjust the - * mmap'ed region, but some apps do not check - * mmap's return value. - */ - PROC_LOCK(p); - p->p_vmspace->vm_maxsaddr = (char *)USRSTACK - - lim_cur(p, RLIMIT_STACK); - PROC_UNLOCK(p); - } - - /* - * This gives us our maximum stack size and a new BOS. - * If we're using VM_STACK, then mmap will just map - * the top SGROWSIZ bytes, and let the stack grow down - * to the limit at BOS. If we're not using VM_STACK - * we map the full stack, since we don't have a way - * to autogrow it. - */ - if (args->len > STACK_SIZE - GUARD_SIZE) { - bsd_args.addr = (caddr_t)PTRIN(args->addr); - bsd_args.len = args->len; - } else { - bsd_args.addr = (caddr_t)PTRIN(args->addr) - - (STACK_SIZE - GUARD_SIZE - args->len); - bsd_args.len = STACK_SIZE - GUARD_SIZE; - } - } else { - bsd_args.addr = (caddr_t)PTRIN(args->addr); - bsd_args.len = args->len; - } - bsd_args.pos = (off_t)args->pgoff; - - error = sys_mmap(td, &bsd_args); - - LINUX_CTR2(mmap2, "return: %d (%p)", - error, td->td_retval[0]); - return (error); + return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, + args->flags, args->fd, args->pgoff)); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { - struct mprotect_args bsd_args; - - LINUX_CTR(mprotect); - bsd_args.addr = uap->addr; - bsd_args.len = uap->len; - bsd_args.prot = uap->prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - return (sys_mprotect(td, &bsd_args)); + return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len, uap->prot)); } int diff --git a/sys/amd64/linux/linux_proto.h b/sys/amd64/linux/linux_proto.h index 4946805..4651788 100644 --- a/sys/amd64/linux/linux_proto.h +++ b/sys/amd64/linux/linux_proto.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. 
* $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 294368 2016-01-20 01:09:53Z jhb + * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #ifndef _LINUX_SYSPROTO_H_ @@ -499,7 +499,7 @@ struct linux_mknod_args { char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)]; }; struct linux_personality_args { - char per_l_[PADL_(l_ulong)]; l_ulong per; char per_r_[PADR_(l_ulong)]; + char per_l_[PADL_(l_uint)]; l_uint per; char per_r_[PADR_(l_uint)]; }; struct linux_ustat_args { char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)]; diff --git a/sys/amd64/linux/linux_syscall.h b/sys/amd64/linux/linux_syscall.h index 874651c..88fa022 100644 --- a/sys/amd64/linux/linux_syscall.h +++ b/sys/amd64/linux/linux_syscall.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 294368 2016-01-20 01:09:53Z jhb + * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #define LINUX_SYS_read 0 diff --git a/sys/amd64/linux/linux_syscalls.c b/sys/amd64/linux/linux_syscalls.c index fcea91e..1b71749 100644 --- a/sys/amd64/linux/linux_syscalls.c +++ b/sys/amd64/linux/linux_syscalls.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 294368 2016-01-20 01:09:53Z jhb + * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ const char *linux_syscallnames[] = { diff --git a/sys/amd64/linux/linux_sysent.c b/sys/amd64/linux/linux_sysent.c index 14d5c87..50302e8 100644 --- a/sys/amd64/linux/linux_sysent.c +++ b/sys/amd64/linux/linux_sysent.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. 
* $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 294368 2016-01-20 01:09:53Z jhb + * created from FreeBSD: stable/10/sys/amd64/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #include <sys/param.h> diff --git a/sys/amd64/linux/linux_systrace_args.c b/sys/amd64/linux/linux_systrace_args.c index 5dcdd55..be99518 100644 --- a/sys/amd64/linux/linux_systrace_args.c +++ b/sys/amd64/linux/linux_systrace_args.c @@ -1120,7 +1120,7 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) /* linux_personality */ case 135: { struct linux_personality_args *p = params; - iarg[0] = p->per; /* l_ulong */ + iarg[0] = p->per; /* l_uint */ *n_args = 1; break; } @@ -4112,7 +4112,7 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 135: switch(ndx) { case 0: - p = "l_ulong"; + p = "l_uint"; break; default: break; diff --git a/sys/amd64/linux/syscalls.master b/sys/amd64/linux/syscalls.master index 50ddd92..a2c4a0b 100644 --- a/sys/amd64/linux/syscalls.master +++ b/sys/amd64/linux/syscalls.master @@ -270,7 +270,7 @@ 133 AUE_MKNOD STD { int linux_mknod(char *path, l_int mode, \ l_dev_t dev); } 134 AUE_USELIB UNIMPL uselib -135 AUE_PERSONALITY STD { int linux_personality(l_ulong per); } +135 AUE_PERSONALITY STD { int linux_personality(l_uint per); } 136 AUE_NULL STD { int linux_ustat(l_dev_t dev, \ struct l_ustat *ubuf); } 137 AUE_STATFS STD { int linux_statfs(char *path, \ diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index 02d12f5..97da878 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -165,13 +165,6 @@ struct l_rusage { l_long ru_nivcsw; } __packed; -/* mmap options */ -#define LINUX_MAP_SHARED 0x0001 -#define LINUX_MAP_PRIVATE 0x0002 -#define LINUX_MAP_FIXED 0x0010 -#define LINUX_MAP_ANON 0x0020 -#define LINUX_MAP_GROWSDOWN 0x0100 - struct l_mmap_argv { l_uintptr_t addr; l_size_t len; diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c index b38afb3..187ec15 100644 --- a/sys/amd64/linux32/linux32_machdep.c +++ b/sys/amd64/linux32/linux32_machdep.c @@ -70,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include <amd64/linux32/linux32_proto.h> #include <compat/linux/linux_ipc.h> #include <compat/linux/linux_misc.h> +#include <compat/linux/linux_mmap.h> #include <compat/linux/linux_signal.h> #include <compat/linux/linux_util.h> #include <compat/linux/linux_emul.h> @@ -84,9 +85,6 @@ struct l_old_select_argv { l_uintptr_t timeout; } __packed; -static int linux_mmap_common(struct thread *td, l_uintptr_t addr, - l_size_t len, l_int prot, l_int flags, l_int fd, - l_loff_t pos); static void bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru) @@ -448,9 +446,6 @@ linux_set_upcall_kse(struct thread *td, register_t stack) return (0); } -#define STACK_SIZE (2 * 1024 * 1024) -#define GUARD_SIZE (4 * PAGE_SIZE) - int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { @@ -489,184 +484,11 @@ linux_mmap(struct thread *td, struct linux_mmap_args *args) (uint32_t)linux_args.pgoff)); } -static int -linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, - l_int flags, l_int fd, l_loff_t pos) -{ - struct proc *p = td->td_proc; - struct mmap_args /* { - caddr_t addr; - size_t len; - int prot; - int flags; - int fd; - long pad; - off_t pos; - } */ bsd_args; - int error; - struct file *fp; - cap_rights_t rights; - - error = 0; - bsd_args.flags = 0; - fp = NULL; - - /* - * Linux mmap(2): - * You must specify exactly one of MAP_SHARED 
and MAP_PRIVATE - */ - if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) - return (EINVAL); - - if (flags & LINUX_MAP_SHARED) - bsd_args.flags |= MAP_SHARED; - if (flags & LINUX_MAP_PRIVATE) - bsd_args.flags |= MAP_PRIVATE; - if (flags & LINUX_MAP_FIXED) - bsd_args.flags |= MAP_FIXED; - if (flags & LINUX_MAP_ANON) { - /* Enforce pos to be on page boundary, then ignore. */ - if ((pos & PAGE_MASK) != 0) - return (EINVAL); - pos = 0; - bsd_args.flags |= MAP_ANON; - } else - bsd_args.flags |= MAP_NOSYNC; - if (flags & LINUX_MAP_GROWSDOWN) - bsd_args.flags |= MAP_STACK; - - /* - * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC - * on Linux/i386. We do this to ensure maximum compatibility. - * Linux/ia64 does the same in i386 emulation mode. - */ - bsd_args.prot = prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - - /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ - bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; - if (bsd_args.fd != -1) { - /* - * Linux follows Solaris mmap(2) description: - * The file descriptor fildes is opened with - * read permission, regardless of the - * protection options specified. - */ - - error = fget(td, bsd_args.fd, - cap_rights_init(&rights, CAP_MMAP), &fp); - if (error != 0) - return (error); - if (fp->f_type != DTYPE_VNODE) { - fdrop(fp, td); - return (EINVAL); - } - - /* Linux mmap() just fails for O_WRONLY files */ - if (!(fp->f_flag & FREAD)) { - fdrop(fp, td); - return (EACCES); - } - - fdrop(fp, td); - } - - if (flags & LINUX_MAP_GROWSDOWN) { - /* - * The Linux MAP_GROWSDOWN option does not limit auto - * growth of the region. Linux mmap with this option - * takes as addr the inital BOS, and as len, the initial - * region size. It can then grow down from addr without - * limit. However, Linux threads has an implicit internal - * limit to stack size of STACK_SIZE. Its just not - * enforced explicitly in Linux. But, here we impose - * a limit of (STACK_SIZE - GUARD_SIZE) on the stack - * region, since we can do this with our mmap. - * - * Our mmap with MAP_STACK takes addr as the maximum - * downsize limit on BOS, and as len the max size of - * the region. It then maps the top SGROWSIZ bytes, - * and auto grows the region down, up to the limit - * in addr. - * - * If we don't use the MAP_STACK option, the effect - * of this code is to allocate a stack region of a - * fixed size of (STACK_SIZE - GUARD_SIZE). - */ - - if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { - /* - * Some Linux apps will attempt to mmap - * thread stacks near the top of their - * address space. If their TOS is greater - * than vm_maxsaddr, vm_map_growstack() - * will confuse the thread stack with the - * process stack and deliver a SEGV if they - * attempt to grow the thread stack past their - * current stacksize rlimit. To avoid this, - * adjust vm_maxsaddr upwards to reflect - * the current stacksize rlimit rather - * than the maximum possible stacksize. - * It would be better to adjust the - * mmap'ed region, but some apps do not check - * mmap's return value. - */ - PROC_LOCK(p); - p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - - lim_cur(p, RLIMIT_STACK); - PROC_UNLOCK(p); - } - - /* - * This gives us our maximum stack size and a new BOS. - * If we're using VM_STACK, then mmap will just map - * the top SGROWSIZ bytes, and let the stack grow down - * to the limit at BOS. 
If we're not using VM_STACK - * we map the full stack, since we don't have a way - * to autogrow it. - */ - if (len > STACK_SIZE - GUARD_SIZE) { - bsd_args.addr = (caddr_t)PTRIN(addr); - bsd_args.len = len; - } else { - bsd_args.addr = (caddr_t)PTRIN(addr) - - (STACK_SIZE - GUARD_SIZE - len); - bsd_args.len = STACK_SIZE - GUARD_SIZE; - } - } else { - bsd_args.addr = (caddr_t)PTRIN(addr); - bsd_args.len = len; - } - bsd_args.pos = pos; - -#ifdef DEBUG - if (ldebug(mmap)) - printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", - __func__, - (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, - bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); -#endif - error = sys_mmap(td, &bsd_args); -#ifdef DEBUG - if (ldebug(mmap)) - printf("-> %s() return: 0x%x (0x%08x)\n", - __func__, error, (u_int)td->td_retval[0]); -#endif - return (error); -} - int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { - struct mprotect_args bsd_args; - - bsd_args.addr = uap->addr; - bsd_args.len = uap->len; - bsd_args.prot = uap->prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - return (sys_mprotect(td, &bsd_args)); + + return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len, uap->prot)); } int diff --git a/sys/amd64/linux32/linux32_proto.h b/sys/amd64/linux32/linux32_proto.h index a960c09..b2d1a7e 100644 --- a/sys/amd64/linux32/linux32_proto.h +++ b/sys/amd64/linux32/linux32_proto.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #ifndef _LINUX32_SYSPROTO_H_ @@ -427,7 +427,7 @@ struct linux_sysfs_args { char arg2_l_[PADL_(l_ulong)]; l_ulong arg2; char arg2_r_[PADR_(l_ulong)]; }; struct linux_personality_args { - char per_l_[PADL_(l_ulong)]; l_ulong per; char per_r_[PADR_(l_ulong)]; + char per_l_[PADL_(l_uint)]; l_uint per; char per_r_[PADR_(l_uint)]; }; struct linux_setfsuid16_args { char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)]; diff --git a/sys/amd64/linux32/linux32_syscall.h b/sys/amd64/linux32/linux32_syscall.h index 25747e4..82764d0 100644 --- a/sys/amd64/linux32/linux32_syscall.h +++ b/sys/amd64/linux32/linux32_syscall.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #define LINUX32_SYS_linux_exit 1 diff --git a/sys/amd64/linux32/linux32_syscalls.c b/sys/amd64/linux32/linux32_syscalls.c index ef965d7..0d995c3 100644 --- a/sys/amd64/linux32/linux32_syscalls.c +++ b/sys/amd64/linux32/linux32_syscalls.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ const char *linux32_syscallnames[] = { diff --git a/sys/amd64/linux32/linux32_sysent.c b/sys/amd64/linux32/linux32_sysent.c index 6eb215e..ee766d7 100644 --- a/sys/amd64/linux32/linux32_sysent.c +++ b/sys/amd64/linux32/linux32_sysent.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. 
* $FreeBSD$ - * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/amd64/linux32/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #include "opt_compat.h" diff --git a/sys/amd64/linux32/linux32_systrace_args.c b/sys/amd64/linux32/linux32_systrace_args.c index cabfab7..f3bde85 100644 --- a/sys/amd64/linux32/linux32_systrace_args.c +++ b/sys/amd64/linux32/linux32_systrace_args.c @@ -910,7 +910,7 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) /* linux_personality */ case 136: { struct linux_personality_args *p = params; - iarg[0] = p->per; /* l_ulong */ + iarg[0] = p->per; /* l_uint */ *n_args = 1; break; } @@ -3715,7 +3715,7 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 136: switch(ndx) { case 0: - p = "l_ulong"; + p = "l_uint"; break; default: break; diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master index 79cd2c8..20aa3c4 100644 --- a/sys/amd64/linux32/syscalls.master +++ b/sys/amd64/linux32/syscalls.master @@ -238,7 +238,7 @@ 134 AUE_BDFLUSH STD { int linux_bdflush(void); } 135 AUE_NULL STD { int linux_sysfs(l_int option, \ l_ulong arg1, l_ulong arg2); } -136 AUE_PERSONALITY STD { int linux_personality(l_ulong per); } +136 AUE_PERSONALITY STD { int linux_personality(l_uint per); } 137 AUE_NULL UNIMPL afs_syscall 138 AUE_SETFSUID STD { int linux_setfsuid16(l_uid16_t uid); } 139 AUE_SETFSGID STD { int linux_setfsgid16(l_gid16_t gid); } diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c index 848007e..4214ca8 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c @@ -89,13 +89,14 @@ traverse(vnode_t **cvpp, int lktype) if (vfsp == NULL) break; error = vfs_busy(vfsp, 0); + /* * tvp is NULL for *cvpp vnode, which we can't unlock. - * At least some callers expect the reference to be - * maintained to the original *cvpp */ if (tvp != NULL) vput(cvp); + else + vrele(cvp); if (error) return (error); diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c index 49becbc..6af1e8b 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c @@ -121,34 +121,39 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, struct ucred *cr; int error; + ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); + + vp = *vpp; + *vpp = NULL; + error = 0; + /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) - return (ENAMETOOLONG); - - vfsp = vfs_byname_kld(fstype, td, &error); - if (vfsp == NULL) - return (ENODEV); - - vp = *vpp; - if (vp->v_type != VDIR) - return (ENOTDIR); + error = ENAMETOOLONG; + if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) + error = ENODEV; + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; /* * We need vnode lock to protect v_mountedhere and vnode interlock * to protect v_iflag. 
*/ - vn_lock(vp, LK_SHARED | LK_RETRY); - VI_LOCK(vp); - if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; VI_UNLOCK(vp); - VOP_UNLOCK(vp, 0); - return (EBUSY); } - vp->v_iflag |= VI_MOUNT; - VI_UNLOCK(vp); + if (error != 0) { + vput(vp); + return (error); + } VOP_UNLOCK(vp, 0); /* @@ -198,7 +203,6 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, vfs_unbusy(mp); vfs_freeopts(mp->mnt_optnew); vfs_mount_destroy(mp); - *vpp = NULL; return (error); } @@ -229,7 +233,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) panic("mount: lost mount"); - vput(vp); + VOP_UNLOCK(vp, 0); vfs_unbusy(mp); *vpp = mvp; return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index ed99c4b..a087e6e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -7288,7 +7288,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, if (pred != NULL) { dtrace_difo_t *dp = pred->dtp_difo; - int rval; + uint64_t rval; rval = dtrace_dif_emulate(dp, &mstate, vstate, state); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index c5c5c00..27d259d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -442,19 +442,9 @@ gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) *vpp = dvp; return (0); } else if (strcmp(nm, "..") == 0) { - if (pvp == NULL) { - ASSERT(dvp->v_flag & VROOT); - VN_HOLD(dvp); - *vpp = dvp; - ASSERT_VOP_ELOCKED(dvp, "gfs_lookup_dot: non-locked dvp"); - } else { - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp, 0); - VN_HOLD(pvp); - *vpp = pvp; - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vn_lock(dvp, ltype | LK_RETRY); - } + ASSERT(pvp != NULL); + VN_HOLD(pvp); + *vpp = pvp; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index f6d19fe..9430037 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -233,10 +233,15 @@ int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; +/* Absolute min for arc min / max is 16MB. 
*/ +static uint64_t arc_abs_min = 16 << 20; + static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); +static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); +static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); -#ifdef _KERNEL +#if defined(__FreeBSD__) && defined(_KERNEL) static void arc_free_target_init(void *unused __unused) { @@ -253,10 +258,10 @@ TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize); TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); SYSCTL_DECL(_vfs_zfs); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, - "Maximum ARC size"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, - "Minimum ARC size"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, + 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, + 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); @@ -882,7 +887,7 @@ struct arc_buf_hdr { l1arc_buf_hdr_t b_l1hdr; }; -#ifdef _KERNEL +#if defined(__FreeBSD__) && defined(_KERNEL) static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) { @@ -900,6 +905,82 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) arc_meta_limit = val; return (0); } + +static int +sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_max; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_max == 0) { + /* Loader tunable so blindly set */ + zfs_arc_max = val; + return (0); + } + + if (val < arc_abs_min || val > kmem_size()) + return (EINVAL); + if (val < arc_c_min) + return (EINVAL); + if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) + return (EINVAL); + + arc_c_max = val; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + if (zfs_arc_meta_limit == 0) { + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + } + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + + zfs_arc_max = arc_c; + + return (0); +} + +static int +sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_min; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_min == 0) { + /* Loader tunable so blindly set */ + zfs_arc_min = val; + return (0); + } + + if (val < arc_abs_min || val > arc_c_max) + return (EINVAL); + + arc_c_min = val; + + if (zfs_arc_meta_min == 0) + arc_meta_min = arc_c_min / 2; + + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + zfs_arc_min = arc_c_min; + + return (0); +} #endif static arc_buf_t *arc_eviction_list; @@ -2251,6 +2332,7 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, hdr->b_l1hdr.b_buf->b_data); ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF); + hdr->b_l1hdr.b_tmp_cdata = NULL; return; } @@ -5320,8 +5402,8 @@ arc_init(void) arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif #endif /* illumos */ - /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ - arc_c_min = MAX(arc_c / 4, 16 << 20); + /* set min cache to 1/32 of all memory, or 
arc_abs_min, whichever is more */ + arc_c_min = MAX(arc_c / 4, arc_abs_min); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1 << 30) arc_c_max = (arc_c * 8) - (1 << 30); @@ -5342,11 +5424,11 @@ arc_init(void) #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are - * reasonable (ie. over 16MB) + * reasonable. */ - if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) + if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) arc_c_max = zfs_arc_max; - if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) + if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 8e14a97..d8d186c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1083,8 +1083,13 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { +#ifdef illumos err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); +#else + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, + tocpy, uio); +#endif } if (err) break; @@ -1178,6 +1183,7 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) else dmu_buf_will_dirty(db, tx); +#ifdef illumos /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function @@ -1186,6 +1192,10 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); +#else + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, + uio); +#endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index d1b8002..9c34f45 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -49,8 +49,8 @@ #include <sys/dsl_userhold.h> #ifdef __FreeBSD__ -#include <sys/sysctl.h> #include <sys/types.h> +#include <sys/sysctl.h> #endif /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h index 1945686..8849003 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -61,7 +61,6 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); #define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_SNAPDIR 0x2 -#define ZFSCTL_INO_SHARES 0x3 #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index 693ba41..cbafa03 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -223,7 +223,7 @@ zfsctl_root_inode_cb(vnode_t *vp, int index) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - ASSERT(index <= 2); + ASSERT(index < 2); if (index == 0) return (ZFSCTL_INO_SNAPDIR); @@ -294,6 +294,22 @@ zfsctl_root(znode_t *zp) return (zp->z_zfsvfs->z_ctldir); } +static int +zfsctl_common_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + gfs_file_t *fp = vp->v_data; + + printf(" parent = %p\n", 
fp->gfs_parent); + printf(" type = %d\n", fp->gfs_type); + printf(" index = %d\n", fp->gfs_index); + printf(" ino = %ju\n", (uintmax_t)fp->gfs_ino); + return (0); +} + /* * Common open routine. Disallow any write access. */ @@ -404,8 +420,6 @@ zfsctl_common_fid(ap) ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } -#else - fidp->fid_len = SHORT_FID_LEN; #endif zfid = (zfid_short_t *)fidp; @@ -454,25 +468,6 @@ zfsctl_shares_fid(ap) return (error); } -static int -zfsctl_common_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); - VI_LOCK(vp); - vp->v_data = NULL; - VI_UNLOCK(vp); - return (0); -} - /* * .zfs inode namespace * @@ -537,9 +532,20 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { +#ifdef illumos err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp); +#else + /* + * NB: can not use VFS_ROOT here as it would acquire + * the vnode lock of the parent (root) vnode while + * holding the child's (.zfs) lock. + */ + znode_t *rootzp; + + err = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (err == 0) - VOP_UNLOCK(*vpp, 0); + *vpp = ZTOV(rootzp); +#endif } else { err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); @@ -550,6 +556,61 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +static int +zfsctl_freebsd_root_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + cred_t *cr = ap->a_cnp->cn_cred; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + char nm[NAME_MAX + 1]; + int err; + + if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) + return (EOPNOTSUPP); + + ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); +relookup: + err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); + if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) { + if (flags & ISDOTDOT) { + VOP_UNLOCK(dvp, 0); + err = vn_lock(*vpp, lkflags); + if (err != 0) { + vrele(*vpp); + *vpp = NULL; + } + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + } else { + err = vn_lock(*vpp, LK_EXCLUSIVE); + if (err != 0) { + VERIFY3S(err, ==, ENOENT); + goto relookup; + } + } + } + return (err); +} + +static int +zfsctl_root_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + printf(" .zfs node\n"); + zfsctl_common_print(ap); + return (0); +} + #ifdef illumos static int zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, @@ -585,49 +646,6 @@ static const fs_operation_def_t zfsctl_tops_root[] = { }; #endif /* illumos */ -/* - * Special case the handling of "..". 
- */ -/* ARGSUSED */ -int -zfsctl_freebsd_root_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - cred_t *cr = ap->a_cnp->cn_cred; - int flags = ap->a_cnp->cn_flags; - int nameiop = ap->a_cnp->cn_nameiop; - char nm[NAME_MAX + 1]; - int err; - int ltype; - - if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) - return (EOPNOTSUPP); - - ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); - if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) { - ltype = VOP_ISLOCKED(dvp); - if (flags & ISDOTDOT) { - VN_HOLD(*vpp); - VOP_UNLOCK(dvp, 0); - } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - if (flags & ISDOTDOT) { - VN_RELE(*vpp); - vn_lock(dvp, ltype| LK_RETRY); - } - } - - return (err); -} - static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, @@ -643,6 +661,7 @@ static struct vop_vector zfsctl_ops_root = { .vop_pathconf = zfsctl_pathconf, #endif .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_root_print, }; /* @@ -987,6 +1006,11 @@ zfsctl_snapdir_lookup(ap) ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + if (nm[0] == '.' && nm[1] == '.' && nm[2] =='\0') { + VOP_UNLOCK(dvp, 0); + VERIFY0(vn_lock(*vpp, LK_EXCLUSIVE)); + VERIFY0(vn_lock(dvp, LK_EXCLUSIVE)); + } ZFS_EXIT(zfsvfs); return (0); } @@ -1011,6 +1035,7 @@ zfsctl_snapdir_lookup(ap) #endif } +relookup: mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { @@ -1018,7 +1043,6 @@ zfsctl_snapdir_lookup(ap) VN_HOLD(*vpp); err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY); if (err != 0) { - VN_RELE(*vpp); *vpp = NULL; } else if (*vpp == sep->se_root) { /* @@ -1027,13 +1051,6 @@ zfsctl_snapdir_lookup(ap) */ VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0); goto domount; - } else { - /* - * VROOT was set during the traverse call. We need - * to clear it since we're pretending to be part - * of our parent's vfs. - */ - (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); @@ -1076,7 +1093,6 @@ zfsctl_snapdir_lookup(ap) sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); - VN_HOLD(*vpp); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_rele(snap, FTAG); @@ -1087,6 +1103,16 @@ domount: (void) snprintf(mountpoint, mountpoint_len, "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, nm); + mutex_exit(&sdp->sd_lock); + + /* + * The vnode may get reclaimed between dropping sd_lock and + * getting the vnode lock. + * */ + err = vn_lock(*vpp, LK_EXCLUSIVE); + if (err == ENOENT) + goto relookup; + VERIFY0(err); err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0); kmem_free(mountpoint, mountpoint_len); if (err == 0) { @@ -1099,8 +1125,8 @@ domount: */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + (*vpp)->v_flag &= ~VROOT; } - mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); #ifdef illumos @@ -1142,6 +1168,11 @@ zfsctl_shares_lookup(ap) strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] =='\0') { + VOP_UNLOCK(dvp, 0); + VERIFY0(vn_lock(*vpp, LK_EXCLUSIVE)); + VERIFY0(vn_lock(dvp, LK_EXCLUSIVE)); + } ZFS_EXIT(zfsvfs); return (0); } @@ -1348,8 +1379,8 @@ zfsctl_snapdir_getattr(ap) /* ARGSUSED */ static int -zfsctl_snapdir_inactive(ap) - struct vop_inactive_args /* { +zfsctl_snapdir_reclaim(ap) + struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; @@ -1358,22 +1389,37 @@ zfsctl_snapdir_inactive(ap) zfsctl_snapdir_t *sdp = vp->v_data; zfs_snapentry_t *sep; - /* - * On forced unmount we have to free snapshots from here. - */ - mutex_enter(&sdp->sd_lock); - while ((sep = avl_first(&sdp->sd_snaps)) != NULL) { - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - } - mutex_exit(&sdp->sd_lock); - gfs_dir_inactive(vp); ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); mutex_destroy(&sdp->sd_lock); avl_destroy(&sdp->sd_snaps); - kmem_free(sdp, sizeof (zfsctl_snapdir_t)); + gfs_vop_reclaim(ap); + + return (0); +} + +static int +zfsctl_shares_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + printf(" .zfs/shares node\n"); + zfsctl_common_print(ap); + return (0); +} +static int +zfsctl_snapdir_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + zfsctl_snapdir_t *sdp = vp->v_data; + + printf(" .zfs/snapshot node\n"); + printf(" number of children = %lu\n", avl_numnodes(&sdp->sd_snaps)); + zfsctl_common_print(ap); return (0); } @@ -1419,9 +1465,10 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_mkdir = zfsctl_freebsd_snapdir_mkdir, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_snapdir_lookup, - .vop_inactive = zfsctl_snapdir_inactive, - .vop_reclaim = zfsctl_common_reclaim, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_snapdir_reclaim, .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_snapdir_print, }; static struct vop_vector zfsctl_ops_shares = { @@ -1436,6 +1483,7 @@ static struct vop_vector zfsctl_ops_shares = { .vop_inactive = VOP_NULL, .vop_reclaim = gfs_vop_reclaim, .vop_fid = zfsctl_shares_fid, + .vop_print = zfsctl_shares_print, }; #endif /* illumos */ @@ -1454,7 +1502,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); - VN_HOLD(vp); zcp = vp->v_data; zcp->zc_id = objset; VOP_UNLOCK(vp, 0); @@ -1462,17 +1509,28 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) return (vp); } +static int +zfsctl_snapshot_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + + vrecycle(vp); + return (0); +} static int zfsctl_snapshot_reclaim(ap) - struct vop_inactive_args /* { + struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; cred_t *cr = ap->a_td->td_ucred; - struct vop_reclaim_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; @@ -1480,7 +1538,6 @@ zfsctl_snapshot_reclaim(ap) VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; - VOP_UNLOCK(dvp, 0); /* this may already have been unmounted */ if (sdp == NULL) { VN_RELE(dvp); @@ -1516,106 +1573,12 @@ zfsctl_snapshot_reclaim(ap) * "active". If we lookup the same name again we will end up * creating a new vnode. 
*/ - iap.a_vp = vp; - gfs_vop_reclaim(&iap); + gfs_vop_reclaim(ap); return (0); } static int -zfsctl_traverse_begin(vnode_t **vpp, int lktype) -{ - - VN_HOLD(*vpp); - /* Snapshot should be already mounted, but just in case. */ - if (vn_mountedvfs(*vpp) == NULL) - return (ENOENT); - return (traverse(vpp, lktype)); -} - -static void -zfsctl_traverse_end(vnode_t *vp, int err) -{ - - if (err == 0) - vput(vp); - else - VN_RELE(vp); -} - -static int -zfsctl_snapshot_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - int err; - - err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY); - if (err == 0) - err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred); - zfsctl_traverse_end(vp, err); - return (err); -} - -static int -zfsctl_snapshot_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - int err; - - err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY); - if (err == 0) - err = VOP_VPTOFH(vp, (void *)ap->a_fid); - zfsctl_traverse_end(vp, err); - return (err); -} - -static int -zfsctl_snapshot_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - struct componentname *cnp = ap->a_cnp; - cred_t *cr = ap->a_cnp->cn_cred; - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int error; - - if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' || - cnp->cn_nameptr[1] != '.') { - return (ENOENT); - } - - ASSERT(dvp->v_type == VDIR); - ASSERT(zfsvfs->z_ctldir != NULL); - - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp, - NULL, 0, NULL, cr, NULL, NULL, NULL); - if (error == 0) { - int ltype = VOP_ISLOCKED(dvp); - VN_HOLD(*vpp); - VOP_UNLOCK(dvp, 0); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - VN_RELE(*vpp); - vn_lock(dvp, ltype | LK_RETRY); - } - - return (error); -} - -static int zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) { zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data; @@ -1657,18 +1620,31 @@ zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) return (error); } +static int +zfsctl_snaphot_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + zfsctl_node_t *zcp = vp->v_data; + + printf(" .zfs/snapshot/<snap> node\n"); + printf(" id = %ju\n", (uintmax_t)zcp->zc_id); + zfsctl_common_print(ap); + return (0); +} + /* * These VP's should never see the light of day. They should always * be covered. 
*/ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, - .vop_inactive = VOP_NULL, - .vop_lookup = zfsctl_snapshot_lookup, + .vop_inactive = zfsctl_snapshot_inactive, .vop_reclaim = zfsctl_snapshot_reclaim, - .vop_getattr = zfsctl_snapshot_getattr, - .vop_fid = zfsctl_snapshot_fid, .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_print = zfsctl_snaphot_print, }; int @@ -1709,16 +1685,15 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) */ error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { - if (vp == sep->se_root) + if (vp == sep->se_root) { + VN_RELE(vp); /* release covered vp */ error = SET_ERROR(EINVAL); - else + } else { *zfsvfsp = VTOZ(vp)->z_zfsvfs; + VN_URELE(vp); /* put snapshot's root vp */ + } } mutex_exit(&sdp->sd_lock); - if (error == 0) - VN_URELE(vp); - else - VN_RELE(vp); } else { error = SET_ERROR(EINVAL); mutex_exit(&sdp->sd_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 651c596..4d6be94 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -1429,6 +1429,7 @@ static int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; + vfs_t *vfsp; int error; error = dmu_objset_hold(dsname, FTAG, &os); @@ -1442,19 +1443,21 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { -#ifdef illumos - VFS_HOLD((*zfvp)->z_vfs); -#else - if (vfs_busy((*zfvp)->z_vfs, 0) != 0) { - *zfvp = NULL; - error = SET_ERROR(ESRCH); - } -#endif + vfsp = (*zfvp)->z_vfs; + vfs_ref(vfsp); } else { error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); dmu_objset_rele(os, FTAG); + if (error == 0) { + error = vfs_busy(vfsp, 0); + vfs_rel(vfsp); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + } return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index fa6eac7..9bb6091 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -1171,6 +1171,7 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ /* * The fsid is 64 bits, composed of an 8-bit fs type, which @@ -1782,12 +1783,11 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) if (error == 0) { error = vn_lock(*vpp, flags); - if (error == 0) - (*vpp)->v_vflag |= VV_ROOT; + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } } - if (error != 0) - *vpp = NULL; - return (error); } @@ -2005,12 +2005,6 @@ zfs_umount(vfs_t *vfsp, int fflag) */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); - if (zfsvfs->z_issnap) { - vnode_t *svp = vfsp->mnt_vnodecovered; - - if (svp->v_count >= 2) - VN_RELE(svp); - } zfs_freevfs(vfsp); return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 2a15cdf..b0f11ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -655,7 +655,11 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio) zfs_vmobject_wunlock(obj); va = 
zfs_map_page(pp, &sf); +#ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); +#else + error = vn_io_fault_uiomove(va + off, bytes, uio); +#endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); @@ -1033,18 +1037,31 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ +#ifdef illumos size_t cbytes; +#endif abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); +#ifdef illumos if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); +#else + ssize_t resid = uio->uio_resid; + error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); + if (error != 0) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + dmu_return_arcbuf(abuf); + break; + } +#endif } /* @@ -1122,8 +1139,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } +#ifdef illumos ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); +#endif } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, @@ -1177,7 +1196,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); +#ifdef illumos ASSERT(error == 0); +#else + ASSERT(error == 0 || error == EFAULT); +#endif } /* * If we are replaying and eof is non zero then force @@ -1187,7 +1210,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); @@ -1214,6 +1240,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) return (error); } +#ifdef __FreeBSD__ + /* + * EFAULT means that at least one page of the source buffer was not + * available. VFS will re-try remaining I/O upon this error. + */ + if (error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } +#endif + if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); @@ -7165,6 +7202,53 @@ zfs_freebsd_aclcheck(ap) return (EOPNOTSUPP); } +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp;; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + uint64_t parent; + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. 
+ */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) { + ZFS_EXIT(zfsvfs); + return (vop_stdvptocnp(ap)); + } + ZFS_EXIT(zfsvfs); + + covered_vp = vp->v_mount->mnt_vnodecovered; + vhold(covered_vp); + ltype = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + error = vget(covered_vp, LK_EXCLUSIVE, curthread); + vdrop(covered_vp); + if (error == 0) { + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); + vput(covered_vp); + } + vn_lock(vp, ltype | LK_RETRY); + if ((vp->v_iflag & VI_DOOMED) != 0) + error = SET_ERROR(ENOENT); + return (error); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; @@ -7210,6 +7294,7 @@ struct vop_vector zfs_vnodeops = { .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, + .vop_vptocnp = zfs_vptocnp, }; struct vop_vector zfs_fifoops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 431a05d..3853838 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -574,9 +574,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* - * Slap on VROOT if we are the root znode + * Slap on VROOT if we are the root znode unless we are the root + * node of a snapshot mounted under .zfs. */ - if (zp->z_id == zfsvfs->z_root) + if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; mutex_exit(&zp->z_lock); diff --git a/sys/compat/linux/linux_emul.c b/sys/compat/linux/linux_emul.c index c2bf3ae..b244eea 100644 --- a/sys/compat/linux/linux_emul.c +++ b/sys/compat/linux/linux_emul.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <compat/linux/linux_emul.h> #include <compat/linux/linux_misc.h> +#include <compat/linux/linux_persona.h> #include <compat/linux/linux_util.h> @@ -127,7 +128,7 @@ linux_proc_init(struct thread *td, struct thread *newtd, int flags) /* epoll should be destroyed in a case of exec. */ pem = pem_find(p); KASSERT(pem != NULL, ("proc_exit: proc emuldata not found.\n")); - + pem->persona = 0; if (pem->epoll != NULL) { emd = pem->epoll; pem->epoll = NULL; @@ -220,6 +221,9 @@ linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) { struct thread *td = curthread; struct thread *othertd; +#if defined(__amd64__) + struct linux_pemuldata *pem; +#endif /* * In a case of execing from linux binary properly detach @@ -243,6 +247,17 @@ linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) linux_proc_init(td, NULL, 0); else linux_proc_init(td, td, 0); +#if defined(__amd64__) + /* + * An IA32 executable which has executable stack will have the + * READ_IMPLIES_EXEC personality flag set automatically. 
+ */ + if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && + imgp->stack_prot & VM_PROT_EXECUTE) { + pem = pem_find(p); + pem->persona |= LINUX_READ_IMPLIES_EXEC; + } +#endif } } diff --git a/sys/compat/linux/linux_emul.h b/sys/compat/linux/linux_emul.h index 7262093..9a5a667 100644 --- a/sys/compat/linux/linux_emul.h +++ b/sys/compat/linux/linux_emul.h @@ -67,6 +67,7 @@ struct linux_pemuldata { uint32_t flags; /* process emuldata flags */ struct sx pem_sx; /* lock for this struct */ void *epoll; /* epoll data */ + uint32_t persona; /* process execution domain */ }; #define LINUX_PEM_XLOCK(p) sx_xlock(&(p)->pem_sx) diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c index cb2379a5..2765dfd 100644 --- a/sys/compat/linux/linux_misc.c +++ b/sys/compat/linux/linux_misc.c @@ -1198,15 +1198,23 @@ linux_mknodat(struct thread *td, struct linux_mknodat_args *args) int linux_personality(struct thread *td, struct linux_personality_args *args) { + struct linux_pemuldata *pem; + struct proc *p = td->td_proc; + uint32_t old; + #ifdef DEBUG if (ldebug(personality)) - printf(ARGS(personality, "%lu"), (unsigned long)args->per); + printf(ARGS(personality, "%u"), args->per); #endif - if (args->per != 0) - return (EINVAL); - /* Yes Jim, it's still a Linux... */ - td->td_retval[0] = 0; + PROC_LOCK(p); + pem = pem_find(p); + old = pem->persona; + if (args->per != 0xffffffff) + pem->persona = args->per; + PROC_UNLOCK(p); + + td->td_retval[0] = old; return (0); } diff --git a/sys/compat/linux/linux_mmap.c b/sys/compat/linux/linux_mmap.c new file mode 100644 index 0000000..0a5557b --- /dev/null +++ b/sys/compat/linux/linux_mmap.c @@ -0,0 +1,257 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins + * Copyright (c) 2002 Doug Rabson + * Copyright (c) 2000 Marcel Moolenaar + * Copyright (c) 1994-1995 Søren Schmidt + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/capsicum.h>
+#include <sys/file.h>
+#include <sys/imgact.h>
+#include <sys/ktr.h>
+#include <sys/mman.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <compat/linux/linux_emul.h>
+#include <compat/linux/linux_mmap.h>
+#include <compat/linux/linux_persona.h>
+#include <compat/linux/linux_util.h>
+
+
+#define STACK_SIZE (2 * 1024 * 1024)
+#define GUARD_SIZE (4 * PAGE_SIZE)
+
+#if defined(__amd64__)
+static void linux_fixup_prot(struct thread *td, int *prot);
+#endif
+
+
+int
+linux_mmap_common(struct thread *td, uintptr_t addr, size_t len, int prot,
+ int flags, int fd, off_t pos)
+{
+ struct proc *p = td->td_proc;
+ struct vmspace *vms = td->td_proc->p_vmspace;
+ struct mmap_args /* {
+ caddr_t addr;
+ size_t len;
+ int prot;
+ int flags;
+ int fd;
+ off_t pos;
+ } */ bsd_args;
+ int error;
+ struct file *fp;
+
+ cap_rights_t rights;
+ LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx",
+ addr, len, prot, flags, fd, pos);
+
+ error = 0;
+ bsd_args.flags = 0;
+ fp = NULL;
+
+ /*
+ * Linux mmap(2):
+ * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
+ */
+ if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
+ return (EINVAL);
+
+ if (flags & LINUX_MAP_SHARED)
+ bsd_args.flags |= MAP_SHARED;
+ if (flags & LINUX_MAP_PRIVATE)
+ bsd_args.flags |= MAP_PRIVATE;
+ if (flags & LINUX_MAP_FIXED)
+ bsd_args.flags |= MAP_FIXED;
+ if (flags & LINUX_MAP_ANON) {
+ /* Enforce pos to be on page boundary, then ignore. */
+ if ((pos & PAGE_MASK) != 0)
+ return (EINVAL);
+ pos = 0;
+ bsd_args.flags |= MAP_ANON;
+ } else
+ bsd_args.flags |= MAP_NOSYNC;
+ if (flags & LINUX_MAP_GROWSDOWN)
+ bsd_args.flags |= MAP_STACK;
+
+ /*
+ * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
+ * on Linux/i386 if the binary requires an executable stack.
+ * We do this only for IA32 emulation, as on native i386 this does not
+ * make sense without PAE.
+ *
+ * XXX. Linux checks that the file system is not mounted with noexec.
+ */
+ bsd_args.prot = prot;
+#if defined(__amd64__)
+ linux_fixup_prot(td, &bsd_args.prot);
+#endif
+
+ /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
+ bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
+ if (bsd_args.fd != -1) {
+ /*
+ * Linux follows Solaris mmap(2) description:
+ * The file descriptor fildes is opened with
+ * read permission, regardless of the
+ * protection options specified.
+ */
+
+ error = fget(td, bsd_args.fd,
+ cap_rights_init(&rights, CAP_MMAP), &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+
+ /* Linux mmap() just fails for O_WRONLY files */
+ if (!(fp->f_flag & FREAD)) {
+ fdrop(fp, td);
+ return (EACCES);
+ }
+
+ fdrop(fp, td);
+ }
+
+ if (flags & LINUX_MAP_GROWSDOWN) {
+ /*
+ * The Linux MAP_GROWSDOWN option does not limit auto
+ * growth of the region. Linux mmap with this option
+ * takes as addr the initial BOS, and as len, the initial
+ * region size. It can then grow down from addr without
+ * limit. However, Linux threads have an implicit internal
+ * limit on stack size of STACK_SIZE. It's just not
+ * enforced explicitly in Linux. But, here we impose
+ * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
+ * region, since we can do this with our mmap.
+ * + * Our mmap with MAP_STACK takes addr as the maximum + * downsize limit on BOS, and as len the max size of + * the region. It then maps the top SGROWSIZ bytes, + * and auto grows the region down, up to the limit + * in addr. + * + * If we don't use the MAP_STACK option, the effect + * of this code is to allocate a stack region of a + * fixed size of (STACK_SIZE - GUARD_SIZE). + */ + + if ((caddr_t)addr + len > vms->vm_maxsaddr) { + /* + * Some Linux apps will attempt to mmap + * thread stacks near the top of their + * address space. If their TOS is greater + * than vm_maxsaddr, vm_map_growstack() + * will confuse the thread stack with the + * process stack and deliver a SEGV if they + * attempt to grow the thread stack past their + * current stacksize rlimit. To avoid this, + * adjust vm_maxsaddr upwards to reflect + * the current stacksize rlimit rather + * than the maximum possible stacksize. + * It would be better to adjust the + * mmap'ed region, but some apps do not check + * mmap's return value. + */ + PROC_LOCK(p); + vms->vm_maxsaddr = (char *)p->p_sysent->sv_usrstack - + lim_cur(p, RLIMIT_STACK); + PROC_UNLOCK(p); + } + + /* + * This gives us our maximum stack size and a new BOS. + * If we're using VM_STACK, then mmap will just map + * the top SGROWSIZ bytes, and let the stack grow down + * to the limit at BOS. If we're not using VM_STACK + * we map the full stack, since we don't have a way + * to autogrow it. + */ + if (len > STACK_SIZE - GUARD_SIZE) { + bsd_args.addr = (caddr_t)addr; + bsd_args.len = len; + } else { + bsd_args.addr = (caddr_t)addr - + (STACK_SIZE - GUARD_SIZE - len); + bsd_args.len = STACK_SIZE - GUARD_SIZE; + } + } else { + bsd_args.addr = (caddr_t)addr; + bsd_args.len = len; + } + bsd_args.pos = pos; + + error = sys_mmap(td, &bsd_args); + + LINUX_CTR2(mmap2, "return: %d (%p)", error, td->td_retval[0]); + + return (error); +} + +int +linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot) +{ + struct mprotect_args bsd_args; + + bsd_args.addr = (void *)addr; + bsd_args.len = len; + bsd_args.prot = prot; + +#if defined(__amd64__) + linux_fixup_prot(td, &bsd_args.prot); +#endif + return (sys_mprotect(td, &bsd_args)); +} + +#if defined(__amd64__) +static void +linux_fixup_prot(struct thread *td, int *prot) +{ + struct linux_pemuldata *pem; + + if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) { + pem = pem_find(td->td_proc); + if (pem->persona & LINUX_READ_IMPLIES_EXEC) + *prot |= PROT_EXEC; + } + +} +#endif diff --git a/sys/compat/linux/linux_mmap.h b/sys/compat/linux/linux_mmap.h new file mode 100644 index 0000000..a27d99d --- /dev/null +++ b/sys/compat/linux/linux_mmap.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins + * Copyright (c) 2002 Doug Rabson + * Copyright (c) 2000 Marcel Moolenaar + * Copyright (c) 1994-1995 Søren Schmidt + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _LINUX_MMAP_H_ +#define _LINUX_MMAP_H_ + +/* mmap options */ +#define LINUX_MAP_SHARED 0x0001 +#define LINUX_MAP_PRIVATE 0x0002 +#define LINUX_MAP_FIXED 0x0010 +#define LINUX_MAP_ANON 0x0020 +#define LINUX_MAP_GROWSDOWN 0x0100 + + +int linux_mmap_common(struct thread *, uintptr_t, size_t, int, int, + int, off_t); +int linux_mprotect_common(struct thread *, uintptr_t, size_t, int); + +#endif /* _LINUX_MMAP_H_ */ diff --git a/sys/compat/linux/linux_persona.h b/sys/compat/linux/linux_persona.h new file mode 100644 index 0000000..8d84134 --- /dev/null +++ b/sys/compat/linux/linux_persona.h @@ -0,0 +1,56 @@ +/* + * $FreeBSD$ + */ + +#ifndef LINUX_PERSONALITY_H +#define LINUX_PERSONALITY_H + +/* + * Flags for bug emulation. + * + * These occupy the top three bytes. + */ +enum { + LINUX_UNAME26 = 0x0020000, + LINUX_ADDR_NO_RANDOMIZE = 0x0040000, /* disable randomization + * of VA space + */ + LINUX_FDPIC_FUNCPTRS = 0x0080000, /* userspace function + * ptrs point to descriptors + * (signal handling) + */ + LINUX_MMAP_PAGE_ZERO = 0x0100000, + LINUX_ADDR_COMPAT_LAYOUT = 0x0200000, + LINUX_READ_IMPLIES_EXEC = 0x0400000, + LINUX_ADDR_LIMIT_32BIT = 0x0800000, + LINUX_SHORT_INODE = 0x1000000, + LINUX_WHOLE_SECONDS = 0x2000000, + LINUX_STICKY_TIMEOUTS = 0x4000000, + LINUX_ADDR_LIMIT_3GB = 0x8000000, +}; + +/* + * Security-relevant compatibility flags that must be + * cleared upon setuid or setgid exec: + */ +#define LINUX_PER_CLEAR_ON_SETID (LINUX_READ_IMPLIES_EXEC | \ + LINUX_ADDR_NO_RANDOMIZE | \ + LINUX_ADDR_COMPAT_LAYOUT | \ + LINUX_MMAP_PAGE_ZERO) + +/* + * Personality types. + * + * These go in the low byte. Avoid using the top bit, it will + * conflict with error returns. 
+ */ +enum { + LINUX_PER_LINUX = 0x0000, + LINUX_PER_LINUX_32BIT = 0x0000 | LINUX_ADDR_LIMIT_32BIT, + LINUX_PER_LINUX_FDPIC = 0x0000 | LINUX_FDPIC_FUNCPTRS, + LINUX_PER_LINUX32 = 0x0008, + LINUX_PER_LINUX32_3GB = 0x0008 | LINUX_ADDR_LIMIT_3GB, + LINUX_PER_MASK = 0x00ff, +}; + +#endif /* LINUX_PERSONALITY_H */ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 798d105..e8d9c7d 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -549,6 +549,7 @@ compat/linux/linux_ioctl.c optional compat_linux32 compat/linux/linux_ipc.c optional compat_linux32 compat/linux/linux_mib.c optional compat_linux32 compat/linux/linux_misc.c optional compat_linux32 +compat/linux/linux_mmap.c optional compat_linux32 compat/linux/linux_signal.c optional compat_linux32 compat/linux/linux_socket.c optional compat_linux32 compat/linux/linux_stats.c optional compat_linux32 diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index 72686d9..19b44ef 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -102,6 +102,7 @@ compat/linux/linux_ioctl.c optional compat_linux compat/linux/linux_ipc.c optional compat_linux compat/linux/linux_mib.c optional compat_linux compat/linux/linux_misc.c optional compat_linux +compat/linux/linux_mmap.c optional compat_linux compat/linux/linux_signal.c optional compat_linux compat/linux/linux_socket.c optional compat_linux compat/linux/linux_stats.c optional compat_linux diff --git a/sys/conf/files.pc98 b/sys/conf/files.pc98 index 1af0721..598743a 100644 --- a/sys/conf/files.pc98 +++ b/sys/conf/files.pc98 @@ -63,6 +63,7 @@ compat/linux/linux_ioctl.c optional compat_linux compat/linux/linux_ipc.c optional compat_linux compat/linux/linux_mib.c optional compat_linux compat/linux/linux_misc.c optional compat_linux +compat/linux/linux_mmap.c optional compat_linux compat/linux/linux_signal.c optional compat_linux compat/linux/linux_socket.c optional compat_linux compat/linux/linux_stats.c optional compat_linux diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c index 821682a..9db1c44 100644 --- a/sys/dev/ahci/ahci.c +++ b/sys/dev/ahci/ahci.c @@ -154,8 +154,8 @@ int ahci_attach(device_t dev) { struct ahci_controller *ctlr = device_get_softc(dev); - int error, i, u, speed, unit; - u_int32_t version; + int error, i, speed, unit; + uint32_t u, version; device_t child; ctlr->dev = dev; diff --git a/sys/dev/ahci/ahci.h b/sys/dev/ahci/ahci.h index a243387..61643c1 100644 --- a/sys/dev/ahci/ahci.h +++ b/sys/dev/ahci/ahci.h @@ -472,7 +472,7 @@ struct ahci_enclosure { uint8_t status[AHCI_MAX_PORTS][4]; /* ArrayDev statuses */ int quirks; int channels; - int ichannels; + uint32_t ichannels; }; /* structure describing a AHCI controller */ @@ -503,7 +503,7 @@ struct ahci_controller { int quirks; int numirqs; int channels; - int ichannels; + uint32_t ichannels; int ccc; /* CCC timeout */ int cccv; /* CCC vector */ int direct; /* Direct command completion */ diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index ab445f6..48ef637 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -1161,10 +1161,27 @@ igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) } } #endif +#if __FreeBSD_version >= 1000000 + /* HW cannot turn these on/off separately */ + if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { + ifp->if_capenable ^= IFCAP_RXCSUM; + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + reinit = 1; + } + if (mask & IFCAP_TXCSUM) { + ifp->if_capenable ^= IFCAP_TXCSUM; + reinit = 1; + } + if (mask & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 
+ reinit = 1; + } +#else if (mask & IFCAP_HWCSUM) { ifp->if_capenable ^= IFCAP_HWCSUM; reinit = 1; } +#endif if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; reinit = 1; @@ -1243,14 +1260,26 @@ igb_init_locked(struct adapter *adapter) /* Set hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) { +#if __FreeBSD_version >= 1000000 + ifp->if_hwassist |= (CSUM_IP_TCP | CSUM_IP_UDP); + if (adapter->hw.mac.type != e1000_82575) + ifp->if_hwassist |= CSUM_IP_SCTP; +#else ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 - if ((adapter->hw.mac.type == e1000_82576) || - (adapter->hw.mac.type == e1000_82580)) + if (adapter->hw.mac.type != e1000_82575) ifp->if_hwassist |= CSUM_SCTP; #endif +#endif } +#if __FreeBSD_version >= 1000000 + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) { + ifp->if_hwassist |= (CSUM_IP6_TCP | CSUM_IP6_UDP); + if (adapter->hw.mac.type != e1000_82575) + ifp->if_hwassist |= CSUM_IP6_SCTP; + } +#endif if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; @@ -3044,6 +3073,9 @@ igb_setup_interface(device_t dev, struct adapter *adapter) ifp->if_capabilities = ifp->if_capenable = 0; ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; +#if __FreeBSD_version >= 1000000 + ifp->if_capabilities |= IFCAP_HWCSUM_IPV6; +#endif ifp->if_capabilities |= IFCAP_TSO; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capenable = ifp->if_capabilities; @@ -3817,17 +3849,29 @@ igb_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, switch (ipproto) { case IPPROTO_TCP: +#if __FreeBSD_version >= 1000000 + if (mp->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) +#else if (mp->m_pkthdr.csum_flags & CSUM_TCP) +#endif type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: +#if __FreeBSD_version >= 1000000 + if (mp->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) +#else if (mp->m_pkthdr.csum_flags & CSUM_UDP) +#endif type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: +#if __FreeBSD_version >= 1000000 + if (mp->m_pkthdr.csum_flags & (CSUM_IP_SCTP | CSUM_IP6_SCTP)) +#else if (mp->m_pkthdr.csum_flags & CSUM_SCTP) +#endif type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP; break; #endif @@ -4514,8 +4558,7 @@ igb_initialize_receive_units(struct adapter *adapter) rxcsum |= E1000_RXCSUM_PCSD; #if __FreeBSD_version >= 800000 /* For SCTP Offload */ - if (((hw->mac.type == e1000_82576) || - (hw->mac.type == e1000_82580)) && + if ((hw->mac.type != e1000_82575) && (ifp->if_capenable & IFCAP_RXCSUM)) rxcsum |= E1000_RXCSUM_CRCOFL; #endif @@ -4524,8 +4567,7 @@ igb_initialize_receive_units(struct adapter *adapter) if (ifp->if_capenable & IFCAP_RXCSUM) { rxcsum |= E1000_RXCSUM_IPPCSE; #if __FreeBSD_version >= 800000 - if ((adapter->hw.mac.type == e1000_82576) || - (adapter->hw.mac.type == e1000_82580)) + if (adapter->hw.mac.type != e1000_82575) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else diff --git a/sys/dev/e1000/if_igb.h b/sys/dev/e1000/if_igb.h index 431437d..f23bb0f 100644 --- a/sys/dev/e1000/if_igb.h +++ b/sys/dev/e1000/if_igb.h @@ -291,7 +291,11 @@ #define ETH_ADDR_LEN 6 /* Offload bits in mbuf flag */ -#if __FreeBSD_version >= 800000 +#if __FreeBSD_version >= 1000000 +#define CSUM_OFFLOAD_IPV4 (CSUM_IP|CSUM_IP_TCP|CSUM_IP_UDP|CSUM_IP_SCTP) +#define CSUM_OFFLOAD_IPV6 (CSUM_IP6_TCP|CSUM_IP6_UDP|CSUM_IP6_SCTP) +#define CSUM_OFFLOAD (CSUM_OFFLOAD_IPV4|CSUM_OFFLOAD_IPV6) +#elif __FreeBSD_version >= 800000 #define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP|CSUM_SCTP) 
#else #define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP) diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index a89a762..d9c29e3 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -805,6 +805,13 @@ hv_storvsc_on_iocompletion(struct storvsc_softc *sc, vm_srb = &vstor_packet->u.vm_srb; + /* + * Copy some fields of the host's response into the request structure, + * because the fields will be used later in storvsc_io_done(). + */ + request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status; + request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len; + if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) { /* Autosense data available */ @@ -1939,62 +1946,24 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) } /* - * Modified based on scsi_print_inquiry which is responsible to - * print the detail information for scsi_inquiry_data. - * + * SCSI Inquiry checks qualifier and type. + * If qualifier is 011b, means the device server is not capable + * of supporting a peripheral device on this logical unit, and + * the type should be set to 1Fh. + * * Return 1 if it is valid, 0 otherwise. */ static inline int is_inquiry_valid(const struct scsi_inquiry_data *inq_data) { uint8_t type; - char vendor[16], product[48], revision[16]; - - /* - * Check device type and qualifier - */ - if (!(SID_QUAL_IS_VENDOR_UNIQUE(inq_data) || - SID_QUAL(inq_data) == SID_QUAL_LU_CONNECTED)) + if (SID_QUAL(inq_data) != SID_QUAL_LU_CONNECTED) { return (0); - + } type = SID_TYPE(inq_data); - switch (type) { - case T_DIRECT: - case T_SEQUENTIAL: - case T_PRINTER: - case T_PROCESSOR: - case T_WORM: - case T_CDROM: - case T_SCANNER: - case T_OPTICAL: - case T_CHANGER: - case T_COMM: - case T_STORARRAY: - case T_ENCLOSURE: - case T_RBC: - case T_OCRW: - case T_OSD: - case T_ADC: - break; - case T_NODEVICE: - default: + if (type == T_NODEVICE) { return (0); } - - /* - * Check vendor, product, and revision - */ - cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor), - sizeof(vendor)); - cam_strvis(product, inq_data->product, sizeof(inq_data->product), - sizeof(product)); - cam_strvis(revision, inq_data->revision, sizeof(inq_data->revision), - sizeof(revision)); - if (strlen(vendor) == 0 || - strlen(product) == 0 || - strlen(revision) == 0) - return (0); - return (1); } @@ -2071,7 +2040,6 @@ storvsc_io_done(struct hv_storvsc_request *reqp) ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (vm_srb->scsi_status == SCSI_STATUS_OK) { const struct scsi_generic *cmd; - /* * Check whether the data for INQUIRY cmd is valid or * not. Windows 10 and Windows 2016 send all zero @@ -2080,23 +2048,59 @@ storvsc_io_done(struct hv_storvsc_request *reqp) cmd = (const struct scsi_generic *) ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); - if (cmd->opcode == INQUIRY && - /* - * XXX: Temporary work around disk hot plugin on win2k12r2, - * only filtering the invalid disk on win10 or 2016 server. - * So, the hot plugin on win10 and 2016 server needs - * to be fixed. + if (cmd->opcode == INQUIRY) { + /* + * The host of Windows 10 or 2016 server will response + * the inquiry request with invalid data for unexisted device: + [0x7f 0x0 0x5 0x2 0x1f ... ] + * But on windows 2012 R2, the response is: + [0x7f 0x0 0x0 0x0 0x0 ] + * That is why here wants to validate the inquiry response. 
+ * The validation will skip the INQUIRY whose response is short, + * which is less than SHORT_INQUIRY_LENGTH (36). + * + * For more information about INQUIRY, please refer to: + * ftp://ftp.avc-pioneer.com/Mtfuji_7/Proposal/Jun09/INQUIRY.pdf */ - vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN10 && - is_inquiry_valid( - (const struct scsi_inquiry_data *)csio->data_ptr) == 0) { + const struct scsi_inquiry_data *inq_data = + (const struct scsi_inquiry_data *)csio->data_ptr; + uint8_t* resp_buf = (uint8_t*)csio->data_ptr; + /* Get the buffer length reported by host */ + int resp_xfer_len = vm_srb->transfer_len; + /* Get the available buffer length */ + int resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; + int data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len; + if (data_len < SHORT_INQUIRY_LENGTH) { + ccb->ccb_h.status |= CAM_REQ_CMP; + if (bootverbose && data_len >= 5) { + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "storvsc skips the validation for short inquiry (%d)" + " [%x %x %x %x %x]\n", + data_len,resp_buf[0],resp_buf[1],resp_buf[2], + resp_buf[3],resp_buf[4]); + mtx_unlock(&sc->hs_lock); + } + } else if (is_inquiry_valid(inq_data) == 0) { ccb->ccb_h.status |= CAM_DEV_NOT_THERE; + if (bootverbose && data_len >= 5) { + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "storvsc uninstalled invalid device" + " [%x %x %x %x %x]\n", + resp_buf[0],resp_buf[1],resp_buf[2],resp_buf[3],resp_buf[4]); + mtx_unlock(&sc->hs_lock); + } + } else { + ccb->ccb_h.status |= CAM_REQ_CMP; if (bootverbose) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, - "storvsc uninstalled device\n"); + "storvsc has passed inquiry response (%d) validation\n", + data_len); mtx_unlock(&sc->hs_lock); } + } } else { ccb->ccb_h.status |= CAM_REQ_CMP; } diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s index 5949f02..a3674c7 100644 --- a/sys/i386/i386/exception.s +++ b/sys/i386/i386/exception.s @@ -235,7 +235,7 @@ IDTVEC(lcall_syscall) pushfl /* save eflags */ popl 8(%esp) /* shuffle into tf_eflags */ pushl $7 /* sizeof "lcall 7,0" */ - subl $4,%esp /* skip over tf_trapno */ + pushl $0 /* tf_trapno */ pushal pushl $0 movw %ds,(%esp) @@ -264,7 +264,7 @@ IDTVEC(lcall_syscall) SUPERALIGN_TEXT IDTVEC(int0x80_syscall) pushl $2 /* sizeof "int 0x80" */ - subl $4,%esp /* skip over tf_trapno */ + pushl $0 /* tf_trapno */ pushal pushl $0 movw %ds,(%esp) diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h index 36b2084..42e836c 100644 --- a/sys/i386/linux/linux.h +++ b/sys/i386/linux/linux.h @@ -140,13 +140,6 @@ struct l_rlimit { l_ulong rlim_max; }; -/* mmap options */ -#define LINUX_MAP_SHARED 0x0001 -#define LINUX_MAP_PRIVATE 0x0002 -#define LINUX_MAP_FIXED 0x0010 -#define LINUX_MAP_ANON 0x0020 -#define LINUX_MAP_GROWSDOWN 0x0100 - struct l_mmap_argv { l_uintptr_t addr; l_size_t len; diff --git a/sys/i386/linux/linux_machdep.c b/sys/i386/linux/linux_machdep.c index c9f969b..4b4b886 100644 --- a/sys/i386/linux/linux_machdep.c +++ b/sys/i386/linux/linux_machdep.c @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <i386/linux/linux_proto.h> #include <compat/linux/linux_ipc.h> #include <compat/linux/linux_misc.h> +#include <compat/linux/linux_mmap.h> #include <compat/linux/linux_signal.h> #include <compat/linux/linux_util.h> #include <compat/linux/linux_emul.h> @@ -95,10 +96,6 @@ struct l_old_select_argv { struct l_timeval *timeout; }; -static int linux_mmap_common(struct thread *td, l_uintptr_t addr, - l_size_t len, l_int prot, l_int flags, l_int fd, - 
l_loff_t pos); - int linux_execve(struct thread *td, struct linux_execve_args *args) @@ -340,9 +337,6 @@ linux_set_upcall_kse(struct thread *td, register_t stack) return (0); } -#define STACK_SIZE (2 * 1024 * 1024) -#define GUARD_SIZE (4 * PAGE_SIZE) - int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { @@ -381,187 +375,11 @@ linux_mmap(struct thread *td, struct linux_mmap_args *args) (uint32_t)linux_args.pgoff)); } -static int -linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, - l_int flags, l_int fd, l_loff_t pos) -{ - struct proc *p = td->td_proc; - struct mmap_args /* { - caddr_t addr; - size_t len; - int prot; - int flags; - int fd; - long pad; - off_t pos; - } */ bsd_args; - int error; - struct file *fp; - cap_rights_t rights; - - error = 0; - bsd_args.flags = 0; - fp = NULL; - - /* - * Linux mmap(2): - * You must specify exactly one of MAP_SHARED and MAP_PRIVATE - */ - if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) - return (EINVAL); - - if (flags & LINUX_MAP_SHARED) - bsd_args.flags |= MAP_SHARED; - if (flags & LINUX_MAP_PRIVATE) - bsd_args.flags |= MAP_PRIVATE; - if (flags & LINUX_MAP_FIXED) - bsd_args.flags |= MAP_FIXED; - if (flags & LINUX_MAP_ANON) { - /* Enforce pos to be on page boundary, then ignore. */ - if ((pos & PAGE_MASK) != 0) - return (EINVAL); - pos = 0; - bsd_args.flags |= MAP_ANON; - } else - bsd_args.flags |= MAP_NOSYNC; - if (flags & LINUX_MAP_GROWSDOWN) - bsd_args.flags |= MAP_STACK; - - /* - * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC - * on Linux/i386. We do this to ensure maximum compatibility. - * Linux/ia64 does the same in i386 emulation mode. - */ - bsd_args.prot = prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - - /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ - bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; - if (bsd_args.fd != -1) { - /* - * Linux follows Solaris mmap(2) description: - * The file descriptor fildes is opened with - * read permission, regardless of the - * protection options specified. - * - * Checking just CAP_MMAP is fine here, since the real work - * is done in the FreeBSD mmap(). - */ - - error = fget(td, bsd_args.fd, - cap_rights_init(&rights, CAP_MMAP), &fp); - if (error != 0) - return (error); - if (fp->f_type != DTYPE_VNODE) { - fdrop(fp, td); - return (EINVAL); - } - - /* Linux mmap() just fails for O_WRONLY files */ - if (!(fp->f_flag & FREAD)) { - fdrop(fp, td); - return (EACCES); - } - - fdrop(fp, td); - } - - if (flags & LINUX_MAP_GROWSDOWN) { - /* - * The Linux MAP_GROWSDOWN option does not limit auto - * growth of the region. Linux mmap with this option - * takes as addr the inital BOS, and as len, the initial - * region size. It can then grow down from addr without - * limit. However, linux threads has an implicit internal - * limit to stack size of STACK_SIZE. Its just not - * enforced explicitly in linux. But, here we impose - * a limit of (STACK_SIZE - GUARD_SIZE) on the stack - * region, since we can do this with our mmap. - * - * Our mmap with MAP_STACK takes addr as the maximum - * downsize limit on BOS, and as len the max size of - * the region. It them maps the top SGROWSIZ bytes, - * and auto grows the region down, up to the limit - * in addr. - * - * If we don't use the MAP_STACK option, the effect - * of this code is to allocate a stack region of a - * fixed size of (STACK_SIZE - GUARD_SIZE). 
- */ - - if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { - /* - * Some linux apps will attempt to mmap - * thread stacks near the top of their - * address space. If their TOS is greater - * than vm_maxsaddr, vm_map_growstack() - * will confuse the thread stack with the - * process stack and deliver a SEGV if they - * attempt to grow the thread stack past their - * current stacksize rlimit. To avoid this, - * adjust vm_maxsaddr upwards to reflect - * the current stacksize rlimit rather - * than the maximum possible stacksize. - * It would be better to adjust the - * mmap'ed region, but some apps do not check - * mmap's return value. - */ - PROC_LOCK(p); - p->p_vmspace->vm_maxsaddr = (char *)USRSTACK - - lim_cur(p, RLIMIT_STACK); - PROC_UNLOCK(p); - } - - /* - * This gives us our maximum stack size and a new BOS. - * If we're using VM_STACK, then mmap will just map - * the top SGROWSIZ bytes, and let the stack grow down - * to the limit at BOS. If we're not using VM_STACK - * we map the full stack, since we don't have a way - * to autogrow it. - */ - if (len > STACK_SIZE - GUARD_SIZE) { - bsd_args.addr = (caddr_t)PTRIN(addr); - bsd_args.len = len; - } else { - bsd_args.addr = (caddr_t)PTRIN(addr) - - (STACK_SIZE - GUARD_SIZE - len); - bsd_args.len = STACK_SIZE - GUARD_SIZE; - } - } else { - bsd_args.addr = (caddr_t)PTRIN(addr); - bsd_args.len = len; - } - bsd_args.pos = pos; - -#ifdef DEBUG - if (ldebug(mmap)) - printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", - __func__, - (void *)bsd_args.addr, bsd_args.len, bsd_args.prot, - bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); -#endif - error = sys_mmap(td, &bsd_args); -#ifdef DEBUG - if (ldebug(mmap)) - printf("-> %s() return: 0x%x (0x%08x)\n", - __func__, error, (u_int)td->td_retval[0]); -#endif - return (error); -} - int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { - struct mprotect_args bsd_args; - - bsd_args.addr = uap->addr; - bsd_args.len = uap->len; - bsd_args.prot = uap->prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - return (sys_mprotect(td, &bsd_args)); + + return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len, uap->prot)); } int diff --git a/sys/i386/linux/linux_proto.h b/sys/i386/linux/linux_proto.h index 3717c9f..1b848cd 100644 --- a/sys/i386/linux/linux_proto.h +++ b/sys/i386/linux/linux_proto.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #ifndef _LINUX_SYSPROTO_H_ @@ -435,7 +435,7 @@ struct linux_sysfs_args { char arg2_l_[PADL_(l_ulong)]; l_ulong arg2; char arg2_r_[PADR_(l_ulong)]; }; struct linux_personality_args { - char per_l_[PADL_(l_ulong)]; l_ulong per; char per_r_[PADR_(l_ulong)]; + char per_l_[PADL_(l_uint)]; l_uint per; char per_r_[PADR_(l_uint)]; }; struct linux_setfsuid16_args { char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)]; diff --git a/sys/i386/linux/linux_syscall.h b/sys/i386/linux/linux_syscall.h index e9d8ef7..aa9e4a0 100644 --- a/sys/i386/linux/linux_syscall.h +++ b/sys/i386/linux/linux_syscall.h @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. 
* $FreeBSD$ - * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #define LINUX_SYS_linux_exit 1 diff --git a/sys/i386/linux/linux_syscalls.c b/sys/i386/linux/linux_syscalls.c index 54d8423..8b43796 100644 --- a/sys/i386/linux/linux_syscalls.c +++ b/sys/i386/linux/linux_syscalls.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ const char *linux_syscallnames[] = { diff --git a/sys/i386/linux/linux_sysent.c b/sys/i386/linux/linux_sysent.c index ce765b5..7218375 100644 --- a/sys/i386/linux/linux_sysent.c +++ b/sys/i386/linux/linux_sysent.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 297300 2016-03-27 06:10:51Z dchagin + * created from FreeBSD: stable/10/sys/i386/linux/syscalls.master 302962 2016-07-17 15:07:33Z dchagin */ #include <sys/param.h> diff --git a/sys/i386/linux/linux_systrace_args.c b/sys/i386/linux/linux_systrace_args.c index f02f34f..0a0bf42 100644 --- a/sys/i386/linux/linux_systrace_args.c +++ b/sys/i386/linux/linux_systrace_args.c @@ -948,7 +948,7 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) /* linux_personality */ case 136: { struct linux_personality_args *p = params; - iarg[0] = p->per; /* l_ulong */ + iarg[0] = p->per; /* l_uint */ *n_args = 1; break; } @@ -3849,7 +3849,7 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 136: switch(ndx) { case 0: - p = "l_ulong"; + p = "l_uint"; break; default: break; diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master index 7ec3154..5899adb 100644 --- a/sys/i386/linux/syscalls.master +++ b/sys/i386/linux/syscalls.master @@ -240,7 +240,7 @@ 134 AUE_BDFLUSH STD { int linux_bdflush(void); } 135 AUE_NULL STD { int linux_sysfs(l_int option, \ l_ulong arg1, l_ulong arg2); } -136 AUE_PERSONALITY STD { int linux_personality(l_ulong per); } +136 AUE_PERSONALITY STD { int linux_personality(l_uint per); } 137 AUE_NULL UNIMPL afs_syscall 138 AUE_SETFSUID STD { int linux_setfsuid16(l_uid16_t uid); } 139 AUE_SETFSGID STD { int linux_setfsgid16(l_gid16_t gid); } diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index fb0b313..908f99a 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -658,8 +658,15 @@ thread_lock_flags_(struct thread *td, int opts, const char *file, int line) i = 0; tid = (uintptr_t)curthread; - if (SCHEDULER_STOPPED()) + if (SCHEDULER_STOPPED()) { + /* + * Ensure that spinlock sections are balanced even when the + * scheduler is stopped, since we may otherwise inadvertently + * re-enable interrupts while dumping core. 
+ */ + spinlock_enter(); return; + } #ifdef KDTRACE_HOOKS spin_time -= lockstat_nsecs(&td->td_lock->lock_object); diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index f5f522e..496b852 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -1315,7 +1315,8 @@ dounmount(struct mount *mp, int flags, struct thread *td) */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { - if (mp->mnt_vnodecovered != NULL) + if (mp->mnt_vnodecovered != NULL && + (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); @@ -1336,7 +1337,8 @@ dounmount(struct mount *mp, int flags, struct thread *td) if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { - if (mp->mnt_vnodecovered != NULL) + if (mp->mnt_vnodecovered != NULL && + (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile index 0e70ce6..124227e 100644 --- a/sys/modules/linux/Makefile +++ b/sys/modules/linux/Makefile @@ -30,7 +30,7 @@ SRCS+= opt_apic.h OBJS= ${VDSO}.so .if ${MACHINE_CPUARCH} == "i386" -SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c \ +SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c linux_mmap.c \ linux_emul.c opt_cpu.h linux.c .endif diff --git a/sys/modules/linux_common/Makefile b/sys/modules/linux_common/Makefile index 91449f7..2301796 100644 --- a/sys/modules/linux_common/Makefile +++ b/sys/modules/linux_common/Makefile @@ -3,7 +3,7 @@ .PATH: ${.CURDIR}/../../compat/linux KMOD= linux_common -SRCS= linux_common.c linux_mib.c linux_util.c linux_emul.c \ +SRCS= linux_common.c linux_mib.c linux_mmap.c linux_util.c linux_emul.c \ linux.c opt_compat.h device_if.h vnode_if.h bus_if.h EXPORT_SYMS= diff --git a/sys/net/mppcc.c b/sys/net/mppcc.c index 01ce3ff..293ce46 100644 --- a/sys/net/mppcc.c +++ b/sys/net/mppcc.c @@ -233,7 +233,7 @@ int MPPC_Compress(u_char **src, u_char **dst, u_long *srcCnt, u_long *dstCnt, ch putbits16(*dst, 0xc000|(off-320), 16, &olen, &l); } else { /* NOTREACHED */ rtn &= ~MPPC_OK; - return rtn; + return (rtn); } /* Encode length of match. */ diff --git a/sys/net/mppcd.c b/sys/net/mppcd.c index c1730e5..d8e663c 100644 --- a/sys/net/mppcd.c +++ b/sys/net/mppcd.c @@ -170,7 +170,7 @@ int MPPC_Decompress(u_char **src, u_char **dst, u_long *srcCnt, u_long *dstCnt, rtn &= ~MPPC_OK; return (rtn); } - } else { /* NOTREACHED */ + } else { /* This shouldn't happen. */ rtn &= ~MPPC_OK; return (rtn); } diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 0398b03..03e8c72 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -683,7 +683,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) * connection when the SYN arrived. If we can't create * the connection, abort it. */ - so = sonewconn(lso, SS_ISCONNECTED); + so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or @@ -922,6 +922,8 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) INP_WUNLOCK(inp); + soisconnected(so); + TCPSTAT_INC(tcps_accepts); return (so); diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 1ccbf9a..8d7e9a0 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -643,13 +643,6 @@ out: /* * Accept a connection. 
Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
- *
- * The rationale for acquiring the tcbinfo lock here is somewhat complicated,
- * and is described in detail in the commit log entry for r175612. Acquiring
- * it delays an accept(2) racing with sonewconn(), which inserts the socket
- * before the inpcb address/port fields are initialized. A better fix would
- * prevent the socket from being placed in the listen queue until all fields
- * are fully initialized.
 */
 static int
 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
@@ -666,7 +659,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
 inp = sotoinpcb(so);
 KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
- INP_INFO_RLOCK(&V_tcbinfo);
 INP_WLOCK(inp);
 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 error = ECONNABORTED;
@@ -686,7 +678,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
 out:
 TCPDEBUG2(PRU_ACCEPT);
 INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
 if (error == 0)
 *nam = in_sockaddr(port, &addr);
 return error;
diff --git a/sys/netpfil/ipfw/dn_sched_fq_pie.c b/sys/netpfil/ipfw/dn_sched_fq_pie.c
index 2883cf8..bfcd6c5 100644
--- a/sys/netpfil/ipfw/dn_sched_fq_pie.c
+++ b/sys/netpfil/ipfw/dn_sched_fq_pie.c
@@ -111,7 +111,7 @@ struct fq_pie_flow {
 int deficit;
 int active; /* 1: flow is active (in a list) */
 struct pie_status pst; /* pie status variables */
- struct fq_pie_si *psi; /* parent scheduler instance */
+ struct fq_pie_si_extra *psi_extra;
 STAILQ_ENTRY(fq_pie_flow) flowchain;
 };

@@ -120,23 +120,30 @@ struct fq_pie_schk {
 struct dn_sch_fq_pie_parms cfg;
 };

+
+/* fq_pie scheduler instance extra state vars.
+ * The purpose of separating this structure is to preserve the number of
+ * active sub-queues and the flows array pointer even after the scheduler
+ * instance is destroyed.
+ * Preserving these variables allows the allocated memory to be freed by
+ * fqpie_callout_cleanup() independently of fq_pie_free_sched().
+ */
+struct fq_pie_si_extra {
+ uint32_t nr_active_q; /* number of active queues */
+ struct fq_pie_flow *flows; /* array of flows (queues) */
+ };
+
 /* fq_pie scheduler instance */
 struct fq_pie_si {
- struct dn_sch_inst _si; /* standard scheduler instance */
+ struct dn_sch_inst _si; /* standard scheduler instance.
SHOULD BE FIRST */ struct dn_queue main_q; /* main queue is after si directly */ - uint32_t nr_active_q; - struct fq_pie_flow *flows; /* array of flows (queues) */ uint32_t perturbation; /* random value */ struct fq_pie_list newflows; /* list of new queues */ struct fq_pie_list oldflows; /* list of old queues */ + struct fq_pie_si_extra *si_extra; /* extra state vars*/ }; -struct mem_to_free { - void *mem_flows; - void *mem_callout; -}; -static struct mtx freemem_mtx; static struct dn_alg fq_pie_desc; /* Default FQ-PIE parameters including PIE */ @@ -371,22 +378,6 @@ fq_calculate_drop_prob(void *x) int64_t p, prob, oldprob; aqm_time_t now; - /* dealing with race condition */ - if (callout_pending(&pst->aqm_pie_callout)) { - /* callout was reset */ - mtx_unlock(&pst->lock_mtx); - return; - } - - if (!callout_active(&pst->aqm_pie_callout)) { - /* callout was stopped */ - mtx_unlock(&pst->lock_mtx); - mtx_destroy(&pst->lock_mtx); - q->psi->nr_active_q--; - return; - } - callout_deactivate(&pst->aqm_pie_callout); - now = AQM_UNOW; pprms = pst->parms; prob = pst->drop_prob; @@ -524,20 +515,17 @@ fq_deactivate_pie(struct pie_status *pst) * Initialize PIE for sub-queue 'q' */ static int -pie_init(struct fq_pie_flow *q) +pie_init(struct fq_pie_flow *q, struct fq_pie_schk *fqpie_schk) { struct pie_status *pst=&q->pst; struct dn_aqm_pie_parms *pprms = pst->parms; - struct fq_pie_schk *fqpie_schk; - - fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1); - int err = 0; + int err = 0; if (!pprms){ D("AQM_PIE is not configured"); err = EINVAL; } else { - q->psi->nr_active_q++; + q->psi_extra->nr_active_q++; /* For speed optimization, we caculate 1/3 queue size once here */ // XXX limit divided by number of queues divided by 3 ??? @@ -553,8 +541,36 @@ pie_init(struct fq_pie_flow *q) } /* + * callout function to destroy PIE lock, and free fq_pie flows and fq_pie si + * extra memory when number of active sub-queues reaches zero. + * 'x' is a fq_pie_flow to be destroyed + */ +static void +fqpie_callout_cleanup(void *x) +{ + struct fq_pie_flow *q = x; + struct pie_status *pst = &q->pst; + struct fq_pie_si_extra *psi_extra; + + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + psi_extra = q->psi_extra; + + DN_BH_WLOCK(); + psi_extra->nr_active_q--; + + /* when all sub-queues are destroyed, free flows fq_pie extra vars memory */ + if (!psi_extra->nr_active_q) { + free(psi_extra->flows, M_DUMMYNET); + free(psi_extra, M_DUMMYNET); + fq_pie_desc.ref_count--; + } + DN_BH_WUNLOCK(); +} + +/* * Clean up PIE status for sub-queue 'q' - * Stop callout timer and destroy mtx + * Stop callout timer and destroy mtx using fqpie_callout_cleanup() callout. 
*/ static int pie_cleanup(struct fq_pie_flow *q) @@ -562,14 +578,9 @@ pie_cleanup(struct fq_pie_flow *q) struct pie_status *pst = &q->pst; mtx_lock(&pst->lock_mtx); - if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { - mtx_unlock(&pst->lock_mtx); - mtx_destroy(&pst->lock_mtx); - q->psi->nr_active_q--; - } else { - mtx_unlock(&pst->lock_mtx); - return EBUSY; - } + callout_reset_sbt(&pst->aqm_pie_callout, + SBT_1US, 0, fqpie_callout_cleanup, q, 0); + mtx_unlock(&pst->lock_mtx); return 0; } @@ -831,10 +842,12 @@ fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, struct fq_pie_schk *schk; struct dn_sch_fq_pie_parms *param; struct dn_queue *mainq; + struct fq_pie_flow *flows; int idx, drop, i, maxidx; mainq = (struct dn_queue *)(_si + 1); si = (struct fq_pie_si *)_si; + flows = si->si_extra->flows; schk = (struct fq_pie_schk *)(si->_si.sched+1); param = &schk->cfg; @@ -844,7 +857,7 @@ fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, /* enqueue packet into appropriate queue using PIE AQM. * Note: 'pie_enqueue' function returns 1 only when it unable to * add timestamp to packet (no limit check)*/ - drop = pie_enqueue(&si->flows[idx], m, si); + drop = pie_enqueue(&flows[idx], m, si); /* pie unable to timestamp a packet */ if (drop) @@ -853,11 +866,11 @@ fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, /* If the flow (sub-queue) is not active ,then add it to tail of * new flows list, initialize and activate it. */ - if (!si->flows[idx].active) { - STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); - si->flows[idx].deficit = param->quantum; - fq_activate_pie(&si->flows[idx]); - si->flows[idx].active = 1; + if (!flows[idx].active) { + STAILQ_INSERT_TAIL(&si->newflows, &flows[idx], flowchain); + flows[idx].deficit = param->quantum; + fq_activate_pie(&flows[idx]); + flows[idx].active = 1; } /* check the limit for all queues and remove a packet from the @@ -866,15 +879,15 @@ fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, if (mainq->ni.length > schk->cfg.limit) { /* find first active flow */ for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) - if (si->flows[maxidx].active) + if (flows[maxidx].active) break; if (maxidx < schk->cfg.flows_cnt) { /* find the largest sub- queue */ for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) - if (si->flows[i].active && si->flows[i].stats.length > - si->flows[maxidx].stats.length) + if (flows[i].active && flows[i].stats.length > + flows[maxidx].stats.length) maxidx = i; - pie_drop_head(&si->flows[maxidx], si); + pie_drop_head(&flows[maxidx], si); drop = 1; } } @@ -974,12 +987,13 @@ fq_pie_new_sched(struct dn_sch_inst *_si) struct fq_pie_si *si; struct dn_queue *q; struct fq_pie_schk *schk; + struct fq_pie_flow *flows; int i; si = (struct fq_pie_si *)_si; schk = (struct fq_pie_schk *)(_si->sched+1); - if(si->flows) { + if(si->si_extra) { D("si already configured!"); return 0; } @@ -990,17 +1004,27 @@ fq_pie_new_sched(struct dn_sch_inst *_si) q->_si = _si; q->fs = _si->sched->fs; + /* allocate memory for scheduler instance extra vars */ + si->si_extra = malloc(sizeof(struct fq_pie_si_extra), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->si_extra == NULL) { + D("cannot allocate memory for fq_pie si extra vars"); + return ENOMEM ; + } /* allocate memory for flows array */ - si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow), + si->si_extra->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow), M_DUMMYNET, M_NOWAIT | M_ZERO); - if (si->flows == NULL) { - 
D("cannot allocate memory for fq_pie configuration parameters"); + flows = si->si_extra->flows; + if (flows == NULL) { + free(si->si_extra, M_DUMMYNET); + si->si_extra = NULL; + D("cannot allocate memory for fq_pie flows"); return ENOMEM ; } /* init perturbation for this si */ si->perturbation = random(); - si->nr_active_q = 0; + si->si_extra->nr_active_q = 0; /* init the old and new flows lists */ STAILQ_INIT(&si->newflows); @@ -1008,45 +1032,16 @@ fq_pie_new_sched(struct dn_sch_inst *_si) /* init the flows (sub-queues) */ for (i = 0; i < schk->cfg.flows_cnt; i++) { - si->flows[i].pst.parms = &schk->cfg.pcfg; - si->flows[i].psi = si; - pie_init(&si->flows[i]); - } - - /* init mtx lock and callout function for free memory */ - if (!fq_pie_desc.ref_count) { - mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF); + flows[i].pst.parms = &schk->cfg.pcfg; + flows[i].psi_extra = si->si_extra; + pie_init(&flows[i], schk); } - mtx_lock(&freemem_mtx); fq_pie_desc.ref_count++; - mtx_unlock(&freemem_mtx); return 0; } -/* - * Free FQ-PIE flows memory callout function. - * This function is scheduled when a flow or more still active and - * the scheduer is about to be destroyed, to prevent memory leak. - */ -static void -free_flows(void *_mem) -{ - struct mem_to_free *mem = _mem; - - free(mem->mem_flows, M_DUMMYNET); - free(mem->mem_callout, M_DUMMYNET); - free(_mem, M_DUMMYNET); - - fq_pie_desc.ref_count--; - if (!fq_pie_desc.ref_count) { - mtx_unlock(&freemem_mtx); - mtx_destroy(&freemem_mtx); - } else - mtx_unlock(&freemem_mtx); - //D("mem freed ok!"); -} /* * Free fq_pie scheduler instance. @@ -1056,61 +1051,17 @@ fq_pie_free_sched(struct dn_sch_inst *_si) { struct fq_pie_si *si; struct fq_pie_schk *schk; + struct fq_pie_flow *flows; int i; si = (struct fq_pie_si *)_si; schk = (struct fq_pie_schk *)(_si->sched+1); - + flows = si->si_extra->flows; for (i = 0; i < schk->cfg.flows_cnt; i++) { - pie_cleanup(&si->flows[i]); - } - - /* if there are still some queues have a callout going to start, - * we cannot free flows memory. If we do so, a panic can happen - * as prob calculate callout function uses flows memory. - */ - if (!si->nr_active_q) { - /* free the flows array */ - free(si->flows , M_DUMMYNET); - si->flows = NULL; - mtx_lock(&freemem_mtx); - fq_pie_desc.ref_count--; - if (!fq_pie_desc.ref_count) { - mtx_unlock(&freemem_mtx); - mtx_destroy(&freemem_mtx); - } else - mtx_unlock(&freemem_mtx); - //D("ok!"); - return 0; - } else { - /* memory leak happens here. So, we register a callout function to free - * flows memory later. 
- */ - D("unable to stop all fq_pie sub-queues!"); - mtx_lock(&freemem_mtx); - - struct callout *mem_callout; - struct mem_to_free *mem; - - mem = malloc(sizeof(*mem), M_DUMMYNET, - M_NOWAIT | M_ZERO); - mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET, - M_NOWAIT | M_ZERO); - - callout_init_mtx(mem_callout, &freemem_mtx, - CALLOUT_RETURNUNLOCKED); - - mem->mem_flows = si->flows; - mem->mem_callout = mem_callout; - callout_reset_sbt(mem_callout, - (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US, - 0, free_flows, mem, 0); - - si->flows = NULL; - mtx_unlock(&freemem_mtx); - - return EBUSY; + pie_cleanup(&flows[i]); } + si->si_extra = NULL; + return 0; } /* diff --git a/sys/ofed/include/linux/linux_idr.c b/sys/ofed/include/linux/linux_idr.c index 8eb7949..c47903e 100644 --- a/sys/ofed/include/linux/linux_idr.c +++ b/sys/ofed/include/linux/linux_idr.c @@ -267,7 +267,8 @@ idr_get(struct idr *idr) return (il); } il = malloc(sizeof(*il), M_IDR, M_ZERO | M_NOWAIT); - bitmap_fill(&il->bitmap, IDR_SIZE); + if (il != NULL) + bitmap_fill(&il->bitmap, IDR_SIZE); return (il); } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c250c5d..7010580 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -981,8 +981,6 @@ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { - vm_pindex_t sidx; - vm_object_t sobj; vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); @@ -1003,8 +1001,6 @@ vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, /* * Record the object/offset pair in this page */ - sobj = m->object; - sidx = m->pindex; m->object = object; m->pindex = pindex; @@ -1012,8 +1008,8 @@ vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { - m->object = sobj; - m->pindex = sidx; + m->object = NULL; + m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); @@ -1640,6 +1636,7 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) } m->object = NULL; m->oflags = VPO_UNMANAGED; + m->busy_lock = VPB_UNBUSIED; vm_page_free(m); return (NULL); } @@ -1842,6 +1839,7 @@ retry: m->object = NULL; m->oflags |= VPO_UNMANAGED; } + m->busy_lock = VPB_UNBUSIED; vm_page_free(m); } return (NULL); |
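
For illustration, a minimal user-space sketch of the Linux personality(2) behaviour that the reworked linux_personality() handler and the new persona field emulate: passing 0xffffffff queries the current persona without changing it, and the previous value is always returned. This sketch is not part of the commit and assumes a Linux host (or a Linuxulator environment) where <sys/personality.h> and READ_IMPLIES_EXEC are available.

#include <stdio.h>
#include <sys/personality.h>

int
main(void)
{
    /* 0xffffffff only queries the current persona; the patched
     * linux_personality() mirrors this by updating pem->persona
     * only when the argument differs from 0xffffffff. */
    int old = personality(0xffffffff);

    printf("old persona: %#x%s\n", (unsigned int)old,
        (old & READ_IMPLIES_EXEC) ? " (READ_IMPLIES_EXEC)" : "");

    /* With READ_IMPLIES_EXEC set, later PROT_READ mappings become
     * implicitly executable; linux_fixup_prot() emulates that for
     * 32-bit Linux processes on amd64. */
    if (personality(old | READ_IMPLIES_EXEC) == -1)
        perror("personality");
    return (0);
}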
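The MAP_GROWSDOWN handling that moved into linux_mmap_common() can likewise be shown with a small stand-alone sketch. The helper name growsdown_fixup() is hypothetical and PAGE_SIZE is assumed to be 4 KiB; the arithmetic mirrors the (STACK_SIZE - GUARD_SIZE) placement in the patch, where Linux passes the initial bottom-of-stack address and length while the BSD MAP_STACK mapping wants the lowest address the region may grow down to plus the maximum region size.

#include <stdint.h>
#include <stdio.h>

/* Constants as in linux_mmap.c; PAGE_SIZE is assumed to be 4 KiB here. */
#define PAGE_SIZE  4096UL
#define STACK_SIZE (2 * 1024 * 1024)
#define GUARD_SIZE (4 * PAGE_SIZE)

/*
 * Hypothetical helper mirroring the LINUX_MAP_GROWSDOWN branch of
 * linux_mmap_common(): small requests are widened to STACK_SIZE - GUARD_SIZE
 * and the base is moved down accordingly, so the MAP_STACK region can
 * auto-grow down to that base.
 */
static void
growsdown_fixup(uintptr_t addr, size_t len, uintptr_t *bsd_addr, size_t *bsd_len)
{
    if (len > STACK_SIZE - GUARD_SIZE) {
        *bsd_addr = addr;
        *bsd_len = len;
    } else {
        *bsd_addr = addr - (STACK_SIZE - GUARD_SIZE - len);
        *bsd_len = STACK_SIZE - GUARD_SIZE;
    }
}

int
main(void)
{
    uintptr_t bsd_addr;
    size_t bsd_len;

    /* A 64 KiB thread stack whose initial bottom-of-stack is 0xbfbf0000. */
    growsdown_fixup(0xbfbf0000UL, 64 * 1024, &bsd_addr, &bsd_len);
    printf("addr %#lx len %#zx\n", (unsigned long)bsd_addr, bsd_len);
    return (0);
}

Running it prints the widened base address and the STACK_SIZE - GUARD_SIZE length, which is what sys_mmap() ultimately receives from the compat layer for small thread stacks.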
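Finally, the INQUIRY response validation added to storvsc_io_done() reduces to a length computation that is easy to show in isolation. The helper below is a hypothetical stand-alone rendition (SHORT_INQUIRY_LENGTH is redefined locally; in the driver it comes from CAM): the usable payload is the smaller of transfer_len and the "additional length" byte 4 plus 5, and responses shorter than 36 bytes skip the qualifier/type check entirely.

#include <stdio.h>

#define SHORT_INQUIRY_LENGTH 36 /* minimum useful INQUIRY payload */

/* Hypothetical helper mirroring the length math in storvsc_io_done(). */
static int
inquiry_resp_is_short(const unsigned char *resp_buf, int resp_xfer_len)
{
    int resp_buf_len, data_len;

    resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
    data_len = resp_buf_len < resp_xfer_len ? resp_buf_len : resp_xfer_len;
    return (data_len < SHORT_INQUIRY_LENGTH);
}

int
main(void)
{
    /* The 5-byte response the patch comment describes for a nonexistent
     * device on Windows 2012 R2: [0x7f 0x00 0x00 0x00 0x00]. */
    unsigned char resp[5] = { 0x7f, 0x00, 0x00, 0x00, 0x00 };

    printf("short inquiry: %d\n", inquiry_resp_is_short(resp, (int)sizeof(resp)));
    return (0);
}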