diff options
author | David S. Miller <davem@davemloft.net> | 2014-03-31 00:45:49 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-03-31 00:45:49 -0400 |
commit | 9109e17f7c3ace48629397b44db5ce06bf168644 (patch) | |
tree | 495b67bcf755829a5409da5b7444ea9b93f60b35 /kernel | |
parent | 64c27237a07129758e33f5f824ba5c33b7f57417 (diff) | |
parent | 9a985cdc5ccb0d557720221d01bd70c19f04bb8c (diff) | |
download | op-kernel-dev-9109e17f7c3ace48629397b44db5ce06bf168644.zip op-kernel-dev-9109e17f7c3ace48629397b44db5ce06bf168644.tar.gz |
Merge branch 'filter-next'
Daniel Borkmann says:
====================
BPF updates
We sat down and have heavily reworked the whole previous patchset
from v10 [1] to address all comments/concerns. This patchset therefore
*replaces* the internal BPF interpreter with the new layout as
discussed in [1], and migrates some exotic callers to properly use the
BPF API for a transparent upgrade. All other callers that already use
the BPF API in a way it should be used, need no further changes to run
the new internals. We also removed the sysctl knob entirely, and do not
expose any structure to userland, so that implementation details only
reside in kernel space. Since we are replacing the interpreter we had
to migrate seccomp in one patch along with the interpreter to not break
anything. When attaching a new filter, the flow can be described as
following: i) test if jit compiler is enabled and can compile the user
BPF, ii) if so, then go for it, iii) if not, then transparently migrate
the filter into the new representation, and run it in the interpreter.
Also, we have scratched the jit flag from the len attribute and made it
as initial patch in this series as Pablo has suggested in the last
feedback, thanks. For details, please refer to the patches themselves.
We did extensive testing of BPF and seccomp on the new interpreter
itself and also on the user ABIs and could not find any issues; new
performance numbers as posted in patch 8 are also still the same.
Please find more details in the patches themselves.
For all the previous history from v1 to v10, see [1]. We have decided
to drop the v11 as we have pedantically reworked the set, but of course,
included all previous feedback.
v3 -> v4:
- Applied feedback from Dave regarding swap insns
- Rebased on net-next
v2 -> v3:
- Rebased to latest net-next (i.e. w/ rxhash->hash rename)
- Fixed patch 8/9 commit message/doc as suggested by Dave
- Rest is unchanged
v1 -> v2:
- Rebased to latest net-next
- Added static to ptp_filter as suggested by Dave
- Fixed a typo in patch 8's commit message
- Rest unchanged
Thanks !
[1] http://thread.gmane.org/gmane.linux.kernel/1665858
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/seccomp.c | 119 |
1 files changed, 58 insertions, 61 deletions
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a1004..4f18e75 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -55,60 +55,33 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sock_filter_int insnsi[]; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. */ - BUG(); + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(task, regs); + + /* Unroll syscall_get_args to help gcc on arm. */ + syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); + syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); + syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); + syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); + syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); + syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); + + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -133,17 +106,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) switch (code) { case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ @@ -185,6 +158,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_K: case BPF_S_JMP_JSET_X: + sk_decode_filter(ftest, ftest); continue; default: return -EINVAL; @@ -202,18 +176,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,6 +208,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) @@ -252,28 +231,43 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. */ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); + if (ret) + goto free_prog; + + /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + if (ret) + goto free_prog; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + + sizeof(struct sock_filter_int) * new_len, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) - goto fail; + goto free_filter; + + atomic_set(&filter->usage, 1); + filter->len = new_len; /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +276,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } |