From 250bbd12c2fe1221ec96d8087d63e982d4f2180a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 24 Apr 2014 19:08:24 +0200 Subject: uprobes/x86: Refuse to attach uprobe to "word-sized" branch insns All branch insns on x86 can be prefixed with the operand-size override prefix, 0x66. It was only ever useful for performing jumps to 32-bit offsets in 16-bit code segments. In 32-bit code, such instructions are useless since they cause IP truncation to 16 bits, and in case of call insns, they save only 16 bits of return address and misalign the stack pointer as a "bonus". In 64-bit code, such instructions are treated differently by Intel and AMD CPUs: Intel ignores the prefix altogether, AMD treats them the same as in 32-bit mode. Before this patch, the emulation code would execute the instructions as if they have no 0x66 prefix. With this patch, we refuse to attach uprobes to such insns. Signed-off-by: Denys Vlasenko Acked-by: Jim Keniston Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov --- arch/x86/kernel/uprobes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ace2291..3cf24a2 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -583,6 +583,7 @@ static struct uprobe_xol_ops branch_xol_ops = { static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); + int i; /* has the side-effect of processing the entire instruction */ insn_get_length(insn); @@ -612,6 +613,16 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return -ENOSYS; } + /* + * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. + * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. + * No one uses these insns, reject any branch insns with such prefix. + */ + for (i = 0; i < insn->prefixes.nbytes; i++) { + if (insn->prefixes.bytes[i] == 0x66) + return -ENOTSUPP; + } + auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; -- cgit v1.1 From 73175d0d19657ec132cc24e8cf0e341e73c54868 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 19 Apr 2014 12:34:02 +0200 Subject: uprobes/x86: Add uprobe_init_insn(), kill validate_insn_{32,64}bits() validate_insn_32bits() and validate_insn_64bits() are very similar, turn them into the single uprobe_init_insn() which has the additional "bool x86_64" argument which can be passed to insn_init() and used to choose between good_insns_64/good_insns_32. Also kill UPROBE_FIX_NONE, it has no users. Note: the current code doesn't use ifdef's consistently, good_insns_64 depends on CONFIG_X86_64 but good_insns_32 is unconditional. This patch removes ifdef around good_insns_64, we will add it back later along with the similar one for good_insns_32. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3cf24a2..b4aff6a 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -32,9 +32,6 @@ /* Post-execution fixups. */ -/* No fixup needed */ -#define UPROBE_FIX_NONE 0x0 - /* Adjust IP back to vicinity of actual insn */ #define UPROBE_FIX_IP 0x1 @@ -114,7 +111,6 @@ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -#ifdef CONFIG_X86_64 /* Good-instruction tables for 64-bit apps */ static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ @@ -138,7 +134,6 @@ static volatile u32 good_insns_64[256 / 32] = { /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -#endif #undef W /* @@ -209,16 +204,22 @@ static bool is_prefix_bad(struct insn *insn) return false; } -static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { - insn_init(insn, auprobe->insn, false); + u32 volatile *good_insns; + + insn_init(insn, auprobe->insn, x86_64); - /* Skip good instruction prefixes; reject "bad" ones. */ insn_get_opcode(insn); if (is_prefix_bad(insn)) return -ENOTSUPP; - if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) + if (x86_64) + good_insns = good_insns_64; + else + good_insns = good_insns_32; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) return 0; if (insn->opcode.nbytes == 2) { @@ -355,30 +356,10 @@ handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long * } } -static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) -{ - insn_init(insn, auprobe->insn, true); - - /* Skip good instruction prefixes; reject "bad" ones. */ - insn_get_opcode(insn); - if (is_prefix_bad(insn)) - return -ENOTSUPP; - - if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) - return 0; - - if (insn->opcode.nbytes == 2) { - if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) - return 0; - } - return -ENOTSUPP; -} - static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { - if (mm->context.ia32_compat) - return validate_insn_32bits(auprobe, insn); - return validate_insn_64bits(auprobe, insn); + bool x86_64 = !mm->context.ia32_compat; + return uprobe_init_insn(auprobe, insn, x86_64); } #else /* 32-bit: */ /* @@ -398,7 +379,7 @@ static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs * static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { - return validate_insn_32bits(auprobe, insn); + return uprobe_init_insn(auprobe, insn, false); } #endif /* CONFIG_X86_64 */ -- cgit v1.1 From 2ae1f49ae1978fedb6ad607e1f8b084aa9752f95 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 19 Apr 2014 14:03:05 +0200 Subject: uprobes/x86: Add is_64bit_mm(), kill validate_insn_bits() 1. Extract the ->ia32_compat check from 64bit validate_insn_bits() into the new helper, is_64bit_mm(), it will have more users. TODO: this checks is actually wrong if mm owner is X32 task, we need another fix which changes set_personality_ia32(). TODO: even worse, the whole 64-or-32-bit logic is very broken and the fix is not simple, we need the nontrivial changes in the core uprobes code. 2. Kill validate_insn_bits() and change its single caller to use uprobe_init_insn(is_64bit_mm(mm). Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b4aff6a..b3b25dd 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -231,6 +231,11 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool } #ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return !config_enabled(CONFIG_IA32_EMULATION) || + !mm->context.ia32_compat; +} /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -355,13 +360,11 @@ handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long * *correction += 4; } } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +#else /* 32-bit: */ +static inline bool is_64bit_mm(struct mm_struct *mm) { - bool x86_64 = !mm->context.ia32_compat; - return uprobe_init_insn(auprobe, insn, x86_64); + return false; } -#else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit */ @@ -376,11 +379,6 @@ static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs * long *correction) { } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) -{ - return uprobe_init_insn(auprobe, insn, false); -} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -625,7 +623,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, bool fix_ip = true, fix_call = false; int ret; - ret = validate_insn_bits(auprobe, mm, &insn); + ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; -- cgit v1.1 From ff261964cfcfe49d73690ca29b0ba2853d9497e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 19 Apr 2014 14:15:27 +0200 Subject: uprobes/x86: Shift "insn_complete" from branch_setup_xol_ops() to uprobe_init_insn() Change uprobe_init_insn() to make insn_complete() == T, this makes other insn_get_*() calls unnecessary. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b3b25dd..98d7db5 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -209,8 +209,11 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool u32 volatile *good_insns; insn_init(insn, auprobe->insn, x86_64); + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; - insn_get_opcode(insn); if (is_prefix_bad(insn)) return -ENOTSUPP; @@ -283,8 +286,6 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) * is the immediate operand. */ cursor = auprobe->insn + insn_offset_modrm(insn); - insn_get_length(insn); - /* * Convert from rip-relative addressing to indirect addressing * via a scratch register. Change the r/m field from 0x5 (%rip) @@ -564,11 +565,6 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) u8 opc1 = OPCODE1(insn); int i; - /* has the side-effect of processing the entire instruction */ - insn_get_length(insn); - if (WARN_ON_ONCE(!insn_complete(insn))) - return -ENOEXEC; - switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ @@ -654,7 +650,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, fix_ip = false; break; case 0xff: - insn_get_modrm(&insn); switch (MODRM_REG(&insn)) { case 2: case 3: /* call or lcall, indirect */ fix_call = true; -- cgit v1.1 From 8dbacad93a2a12adebcc717e6055b1bcc1739ab8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 19 Apr 2014 16:07:15 +0200 Subject: uprobes/x86: Make good_insns_* depend on CONFIG_X86_* Add the suitable ifdef's around good_insns_* arrays. We do not want to add the ugly ifdef's into their only user, uprobe_init_insn(), so the "#else" branch simply defines them as NULL. This doesn't generate the extra code, gcc is smart enough, although the code is fine even if it could not detect that (without CONFIG_IA32_EMULATION) is_64bit_mm() is __builtin_constant_p(). The patch looks more complicated because it also moves good_insns_64 up close to good_insns_32. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 56 +++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 98d7db5..892975b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -64,6 +64,7 @@ * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -86,32 +87,12 @@ static volatile u32 good_insns_32[256 / 32] = { /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; - -/* Using this for both 64-bit and 32-bit apps */ -static volatile u32 good_2byte_insns[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ - /* ---------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; +#else +#define good_insns_32 NULL +#endif /* Good-instruction tables for 64-bit apps */ +#if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -134,6 +115,33 @@ static volatile u32 good_insns_64[256 / 32] = { /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; +#else +#define good_insns_64 NULL +#endif + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; #undef W /* -- cgit v1.1 From b24dc8dace74708fd849312722090169c5da97d3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 19 Apr 2014 18:10:09 +0200 Subject: uprobes/x86: Fix is_64bit_mm() with CONFIG_X86_X32 is_64bit_mm() assumes that mm->context.ia32_compat means the 32-bit instruction set, this is not true if the task is TIF_X32. Change set_personality_ia32() to initialize mm->context.ia32_compat by TIF_X32 or TIF_IA32 instead of 1. This allows to fix is_64bit_mm() without affecting other users, they all treat ia32_compat as "bool". TIF_ in ->ia32_compat looks a bit strange, but this is grep-friendly and avoids the new define's. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/process_64.c | 7 ++++--- arch/x86/kernel/uprobes.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f..9b53940 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -413,12 +413,11 @@ void set_personality_ia32(bool x32) set_thread_flag(TIF_ADDR32); /* Mark the associated mm as containing 32-bit tasks. */ - if (current->mm) - current->mm->context.ia32_compat = 1; - if (x32) { clear_thread_flag(TIF_IA32); set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; /* is_compat_task() uses the presence of the x32 syscall bit flag to determine compat status */ @@ -426,6 +425,8 @@ void set_personality_ia32(bool x32) } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 892975b..ecbffd1 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -245,7 +245,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool static inline bool is_64bit_mm(struct mm_struct *mm) { return !config_enabled(CONFIG_IA32_EMULATION) || - !mm->context.ia32_compat; + !(mm->context.ia32_compat == TIF_IA32); } /* * If arch_uprobe->insn doesn't use rip-relative addressing, return -- cgit v1.1 From dd91016dfc9ba9236cb0149984da3f0434278b49 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Apr 2014 15:20:07 +0200 Subject: uprobes/x86: Don't change the task's state if ->pre_xol() fails Currently this doesn't matter, the only ->pre_xol() hook can't fail, but we need to fix arch_uprobe_pre_xol() anyway. If ->pre_xol() fails we should not change regs->ip/flags, we should just return the error to make restart actually possible. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ecbffd1..f4464b1 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -687,6 +687,12 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; + if (auprobe->ops->pre_xol) { + int err = auprobe->ops->pre_xol(auprobe, regs); + if (err) + return err; + } + regs->ip = utask->xol_vaddr; utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; @@ -696,8 +702,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); - if (auprobe->ops->pre_xol) - return auprobe->ops->pre_xol(auprobe, regs); return 0; } -- cgit v1.1 From 588fbd613c3d8fa73e96720761d49f1d40d34d4c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Apr 2014 16:58:17 +0200 Subject: uprobes/x86: Introduce uprobe_xol_ops->abort() and default_abort_op() arch_uprobe_abort_xol() calls handle_riprel_post_xol() even if auprobe->ops != default_xol_ops. This is fine correctness wise, only default_pre_xol_op() can set UPROBE_FIX_RIP_AX|UPROBE_FIX_RIP_CX and otherwise handle_riprel_post_xol() is nop. But this doesn't look clean and this doesn't allow us to move ->fixups into the union in arch_uprobe. Move this handle_riprel_post_xol() call into the new default_abort_op() hook and change arch_uprobe_abort_xol() accordingly. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index f4464b1..b3c2a92 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -394,6 +394,7 @@ struct uprobe_xol_ops { bool (*emulate)(struct arch_uprobe *, struct pt_regs *); int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); int (*post_xol)(struct arch_uprobe *, struct pt_regs *); + void (*abort)(struct arch_uprobe *, struct pt_regs *); }; static inline int sizeof_long(void) @@ -444,9 +445,15 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs return 0; } +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + handle_riprel_post_xol(auprobe, regs, NULL); +} + static struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, + .abort = default_abort_op, }; static bool branch_is_call(struct arch_uprobe *auprobe) @@ -820,10 +827,11 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - current->thread.trap_nr = utask->autask.saved_trap_nr; - handle_riprel_post_xol(auprobe, regs, NULL); - instruction_pointer_set(regs, utask->vaddr); + if (auprobe->ops->abort) + auprobe->ops->abort(auprobe, regs); + current->thread.trap_nr = utask->autask.saved_trap_nr; + regs->ip = utask->vaddr; /* clear TF if it was set by us in arch_uprobe_pre_xol() */ if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; -- cgit v1.1 From 6ded5f3848bfd3227ee208aa38f8bf8d7209d4e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Apr 2014 18:28:02 +0200 Subject: uprobes/x86: Don't use arch_uprobe_abort_xol() in arch_uprobe_post_xol() 014940bad8e4 "uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails" changed arch_uprobe_post_xol() to use arch_uprobe_abort_xol() if ->post_xol fails. This was correct and helped to avoid the additional complications, we need to clear X86_EFLAGS_TF in this case. However, now that we have uprobe_xol_ops->abort() hook it would be better to avoid arch_uprobe_abort_xol() here. ->post_xol() should likely do what ->abort() does anyway, we should not do the same work twice. Currently only handle_riprel_post_xol() can be called twice, this is unnecessary but safe. Still this is not clean and can lead to the problems in future. Change arch_uprobe_post_xol() to clear X86_EFLAGS_TF and restore ->ip by hand and avoid arch_uprobe_abort_xol(). This temporary uglifies the usage of autask.saved_tf, we will cleanup this later. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b3c2a92..2efb93f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -759,22 +759,24 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + current->thread.trap_nr = utask->autask.saved_trap_nr; if (auprobe->ops->post_xol) { int err = auprobe->ops->post_xol(auprobe, regs); if (err) { - arch_uprobe_abort_xol(auprobe, regs); + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; /* - * Restart the probed insn. ->post_xol() must ensure - * this is really possible if it returns -ERESTART. + * Restore ->ip for restart or post mortem analysis. + * ->post_xol() must not return -ERESTART unless this + * is really possible. */ + regs->ip = utask->vaddr; if (err == -ERESTART) return 0; return err; } } - - current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -819,9 +821,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. - * Reset the instruction pointer to its probed address for the potential - * restart or for post mortem analysis. + * the thread has a fatal signal. Reset the instruction pointer to its + * probed address for the potential restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { -- cgit v1.1 From 220ef8dc9a7a63fe202aacd3fc61e5104f6dd98c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Apr 2014 20:39:56 +0200 Subject: uprobes/x86: Move UPROBE_FIX_SETF logic from arch_uprobe_post_xol() to default_post_xol_op() UPROBE_FIX_SETF is only needed to handle "popf" correctly but it is processed by the generic arch_uprobe_post_xol() code. This doesn't allows us to make ->fixups private for default_xol_ops. 1 Change default_post_xol_op(UPROBE_FIX_SETF) to set ->saved_tf = T. "popf" always reads the flags from stack, it doesn't matter if TF was set or not before single-step. Ignoring the naming, this is even more logical, "saved_tf" means "owned by application" and we do not own this flag after "popf". 2. Change arch_uprobe_post_xol() to save ->saved_tf into the local "bool send_sigtrap" before ->post_xol(). 3. Change arch_uprobe_post_xol() to ignore UPROBE_FIX_SETF and just check ->saved_tf after ->post_xol(). With this patch ->fixups and ->rip_rela_target_address are only used by default_xol_ops hooks, we are ready to remove them from the common part of arch_uprobe. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2efb93f..b2bca29 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -441,6 +441,9 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs return -ERESTART; } } + /* popf; tell the caller to not touch TF */ + if (auprobe->fixups & UPROBE_FIX_SETF) + utask->autask.saved_tf = true; return 0; } @@ -757,15 +760,15 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; + bool send_sigtrap = utask->autask.saved_tf; + int err = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); current->thread.trap_nr = utask->autask.saved_trap_nr; if (auprobe->ops->post_xol) { - int err = auprobe->ops->post_xol(auprobe, regs); + err = auprobe->ops->post_xol(auprobe, regs); if (err) { - if (!utask->autask.saved_tf) - regs->flags &= ~X86_EFLAGS_TF; /* * Restore ->ip for restart or post mortem analysis. * ->post_xol() must not return -ERESTART unless this @@ -773,8 +776,8 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) */ regs->ip = utask->vaddr; if (err == -ERESTART) - return 0; - return err; + err = 0; + send_sigtrap = false; } } /* @@ -782,12 +785,13 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) * so we can get an extra SIGTRAP if we do not clear TF. We need * to examine the opcode to make it right. */ - if (utask->autask.saved_tf) + if (send_sigtrap) send_sig(SIGTRAP, current, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + + if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; - return 0; + return err; } /* callback routine for handling exceptions. */ -- cgit v1.1 From 97aa5cddbe9e01521137f337624469374e3cbde5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Apr 2014 16:20:55 +0200 Subject: uprobes/x86: Move default_xol_ops's data into arch_uprobe->def Finally we can move arch_uprobe->fixups/rip_rela_target_address into the new "def" struct and place this struct in the union, they are only used by default_xol_ops paths. The patch also renames rip_rela_target_address to riprel_target just to make this name shorter. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/include/asm/uprobes.h | 12 +++++++----- arch/x86/kernel/uprobes.c | 43 +++++++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 93bee7b..72caff7 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -41,18 +41,20 @@ struct arch_uprobe { u8 ixol[MAX_UINSN_BYTES]; }; - u16 fixups; const struct uprobe_xol_ops *ops; union { -#ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; -#endif struct { s32 offs; u8 ilen; u8 opc1; - } branch; + } branch; + struct { +#ifdef CONFIG_X86_64 + long riprel_target; +#endif + u16 fixups; + } def; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b2bca29..7824ce2 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -251,10 +251,9 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set - * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address - * accordingly. (The contents of the scratch register will be saved - * before we single-step the modified instruction, and restored - * afterward.) + * def->fixups and def->riprel_target accordingly. (The contents of the + * scratch register will be saved before we single-step the modified + * instruction, and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -308,18 +307,18 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) * is NOT the register operand, so we use %rcx (register * #1) for the scratch register. */ - auprobe->fixups = UPROBE_FIX_RIP_CX; + auprobe->def.fixups = UPROBE_FIX_RIP_CX; /* Change modrm from 00 000 101 to 00 000 001. */ *cursor = 0x1; } else { /* Use %rax (register #0) for the scratch register. */ - auprobe->fixups = UPROBE_FIX_RIP_AX; + auprobe->def.fixups = UPROBE_FIX_RIP_AX; /* Change modrm from 00 xxx 101 to 00 xxx 000 */ *cursor = (reg << 3); } /* Target address = address of next instruction + (signed) offset */ - auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; + auprobe->def.riprel_target = (long)insn->length + insn->displacement.value; /* Displacement field is gone; slide immediate field (if any) over. */ if (insn->immediate.nbytes) { @@ -336,25 +335,25 @@ static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, struct arch_uprobe_task *autask) { - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) { autask->saved_scratch_register = regs->ax; regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + regs->ax += auprobe->def.riprel_target; + } else if (auprobe->def.fixups & UPROBE_FIX_RIP_CX) { autask->saved_scratch_register = regs->cx; regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; + regs->cx += auprobe->def.riprel_target; } } static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) { - if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { struct arch_uprobe_task *autask; autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) + if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) regs->ax = autask->saved_scratch_register; else regs->cx = autask->saved_scratch_register; @@ -432,17 +431,17 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs long correction = (long)(utask->vaddr - utask->xol_vaddr); handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) + if (auprobe->def.fixups & UPROBE_FIX_IP) regs->ip += correction; - if (auprobe->fixups & UPROBE_FIX_CALL) { + if (auprobe->def.fixups & UPROBE_FIX_CALL) { if (adjust_ret_addr(regs->sp, correction)) { regs->sp += sizeof_long(); return -ERESTART; } } /* popf; tell the caller to not touch TF */ - if (auprobe->fixups & UPROBE_FIX_SETF) + if (auprobe->def.fixups & UPROBE_FIX_SETF) utask->autask.saved_tf = true; return 0; @@ -646,13 +645,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return ret; /* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, - * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups - * is either zero or it reflects rip-related fixups. + * Figure out which fixups default_post_xol_op() will need to perform, + * and annotate def->fixups accordingly. To start with, ->fixups is + * either zero or it reflects rip-related fixups. */ switch (OPCODE1(&insn)) { case 0x9d: /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; + auprobe->def.fixups |= UPROBE_FIX_SETF; break; case 0xc3: /* ret or lret -- ip is correct */ case 0xcb: @@ -680,9 +679,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, } if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; + auprobe->def.fixups |= UPROBE_FIX_IP; if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; + auprobe->def.fixups |= UPROBE_FIX_CALL; auprobe->ops = &default_xol_ops; return 0; -- cgit v1.1 From 78d9af4cd375880a574327210eb9dab572618364 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 18:52:37 +0200 Subject: uprobes/x86: Cleanup the usage of arch_uprobe->def.fixups, make it u8 handle_riprel_insn() assumes that nobody else could modify ->fixups before. This is correct but fragile, change it to use "|=". Also make ->fixups u8, we are going to add the new members into the union. It is not clear why UPROBE_FIX_RIP_.X lived in the upper byte, redefine them so that they can fit into u8. Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 72caff7..9ce25ce 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -53,7 +53,7 @@ struct arch_uprobe { #ifdef CONFIG_X86_64 long riprel_target; #endif - u16 fixups; + u8 fixups; } def; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 7824ce2..a8e1d7e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -33,16 +33,16 @@ /* Post-execution fixups. */ /* Adjust IP back to vicinity of actual insn */ -#define UPROBE_FIX_IP 0x1 +#define UPROBE_FIX_IP 0x01 /* Adjust the return address of a call insn */ -#define UPROBE_FIX_CALL 0x2 +#define UPROBE_FIX_CALL 0x02 /* Instruction will modify TF, don't change it */ -#define UPROBE_FIX_SETF 0x4 +#define UPROBE_FIX_SETF 0x04 -#define UPROBE_FIX_RIP_AX 0x8000 -#define UPROBE_FIX_RIP_CX 0x4000 +#define UPROBE_FIX_RIP_AX 0x08 +#define UPROBE_FIX_RIP_CX 0x10 #define UPROBE_TRAP_NR UINT_MAX @@ -307,12 +307,12 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) * is NOT the register operand, so we use %rcx (register * #1) for the scratch register. */ - auprobe->def.fixups = UPROBE_FIX_RIP_CX; + auprobe->def.fixups |= UPROBE_FIX_RIP_CX; /* Change modrm from 00 000 101 to 00 000 001. */ *cursor = 0x1; } else { /* Use %rax (register #0) for the scratch register. */ - auprobe->def.fixups = UPROBE_FIX_RIP_AX; + auprobe->def.fixups |= UPROBE_FIX_RIP_AX; /* Change modrm from 00 xxx 101 to 00 xxx 000 */ *cursor = (reg << 3); } -- cgit v1.1 From 2b82cadffc4154a25c25d88a63c7fb3397cda9d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 19:21:38 +0200 Subject: uprobes/x86: Introduce push_ret_address() Extract the "push return address" code from branch_emulate_op() into the new simple helper, push_ret_address(). It will have more users. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/uprobes.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a8e1d7e..df75913 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -407,6 +407,17 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) return 0; } +static int push_ret_address(struct pt_regs *regs, unsigned long ip) +{ + unsigned long new_sp = regs->sp - sizeof_long(); + + if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + return -EFAULT; + + regs->sp = new_sp; + return 0; +} + /* * Adjust the return address pushed by a call insn executed out of line. */ @@ -517,7 +528,6 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long offs = (long)auprobe->branch.offs; if (branch_is_call(auprobe)) { - unsigned long new_sp = regs->sp - sizeof_long(); /* * If it fails we execute this (mangled, see the comment in * branch_clear_offset) insn out-of-line. In the likely case @@ -527,9 +537,8 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * * But there is corner case, see the comment in ->post_xol(). */ - if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + if (push_ret_address(regs, new_ip)) return false; - regs->sp = new_sp; } else if (!check_jmp_cond(auprobe, regs)) { offs = 0; } -- cgit v1.1 From 1dc76e6eacef271230d9ff6fd0f91824bda03f44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Apr 2014 18:06:19 +0200 Subject: uprobes/x86: Kill adjust_ret_addr(), simplify UPROBE_FIX_CALL logic The only insn which could have both UPROBE_FIX_IP and UPROBE_FIX_CALL was 0xe8 "call relative", and now it is handled by branch_xol_ops. So we can change default_post_xol_op(UPROBE_FIX_CALL) to simply push the address of next insn == utask->vaddr + insn.length, just we need to record insn.length into the new auprobe->def.ilen member. Note: if/when we teach branch_xol_ops to support jcxz/loopz we can remove the "correction" logic, UPROBE_FIX_IP can use the same address. Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/uprobes.h | 1 + arch/x86/kernel/uprobes.c | 24 +++--------------------- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 9ce25ce..a040d49 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -54,6 +54,7 @@ struct arch_uprobe { long riprel_target; #endif u8 fixups; + u8 ilen; } def; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index df75913..5bcce85 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -418,24 +418,6 @@ static int push_ret_address(struct pt_regs *regs, unsigned long ip) return 0; } -/* - * Adjust the return address pushed by a call insn executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) -{ - int rasize = sizeof_long(); - long ra; - - if (copy_from_user(&ra, (void __user *)sp, rasize)) - return -EFAULT; - - ra += correction; - if (copy_to_user((void __user *)sp, &ra, rasize)) - return -EFAULT; - - return 0; -} - static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; @@ -446,10 +428,9 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs regs->ip += correction; if (auprobe->def.fixups & UPROBE_FIX_CALL) { - if (adjust_ret_addr(regs->sp, correction)) { - regs->sp += sizeof_long(); + regs->sp += sizeof_long(); + if (push_ret_address(regs, utask->vaddr + auprobe->def.ilen)) return -ERESTART; - } } /* popf; tell the caller to not touch TF */ if (auprobe->def.fixups & UPROBE_FIX_SETF) @@ -687,6 +668,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, handle_riprel_insn(auprobe, &insn); } + auprobe->def.ilen = insn.length; if (fix_ip) auprobe->def.fixups |= UPROBE_FIX_IP; if (fix_call) -- cgit v1.1 From 83cd591485e558ab70aed45ce7261ce3f5ee8746 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Apr 2014 18:53:32 +0200 Subject: uprobes/x86: Cleanup the usage of UPROBE_FIX_IP/UPROBE_FIX_CALL Now that UPROBE_FIX_IP/UPROBE_FIX_CALL are mutually exclusive we can use a single "fix_ip_or_call" enum instead of 2 fix_* booleans. This way the logic looks more understandable and clean to me. While at it, join "case 0xea" with other "ip is correct" ret/lret cases. Also change default_post_xol_op() to use "else if" for the same reason. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/uprobes.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5bcce85..d2792e8 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -424,10 +424,9 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs long correction = (long)(utask->vaddr - utask->xol_vaddr); handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->def.fixups & UPROBE_FIX_IP) + if (auprobe->def.fixups & UPROBE_FIX_IP) { regs->ip += correction; - - if (auprobe->def.fixups & UPROBE_FIX_CALL) { + } else if (auprobe->def.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(); if (push_ret_address(regs, utask->vaddr + auprobe->def.ilen)) return -ERESTART; @@ -623,7 +622,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { struct insn insn; - bool fix_ip = true, fix_call = false; + u8 fix_ip_or_call = UPROBE_FIX_IP; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); @@ -647,21 +646,20 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, case 0xcb: case 0xc2: case 0xca: - fix_ip = false; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip_or_call = 0; break; case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; + fix_ip_or_call = UPROBE_FIX_CALL; break; case 0xff: switch (MODRM_REG(&insn)) { case 2: case 3: /* call or lcall, indirect */ - fix_call = true; + fix_ip_or_call = UPROBE_FIX_CALL; + break; case 4: case 5: /* jmp or ljmp, indirect */ - fix_ip = false; + fix_ip_or_call = 0; + break; } /* fall through */ default: @@ -669,10 +667,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, } auprobe->def.ilen = insn.length; - if (fix_ip) - auprobe->def.fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->def.fixups |= UPROBE_FIX_CALL; + auprobe->def.fixups |= fix_ip_or_call; auprobe->ops = &default_xol_ops; return 0; -- cgit v1.1 From 1475ee7fadafc6d0c194f2f4cbdae10ed04b9580 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Apr 2014 16:31:59 +0200 Subject: uprobes/x86: Rename *riprel* helpers to make the naming consistent handle_riprel_insn(), pre_xol_rip_insn() and handle_riprel_post_xol() look confusing and inconsistent. Rename them into riprel_analyze(), riprel_pre_xol(), and riprel_post_xol() respectively. No changes in compiled code. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d2792e8..187be0e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -268,8 +268,7 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * - There's never a SIB byte. * - The displacement is always 4 bytes. */ -static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; @@ -331,8 +330,7 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) * If we're emulating a rip-relative instruction, save the contents * of the scratch register and store the target address in that register. */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, struct arch_uprobe_task *autask) { if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) { @@ -346,8 +344,8 @@ pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, } } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) { if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { struct arch_uprobe_task *autask; @@ -376,14 +374,14 @@ static inline bool is_64bit_mm(struct mm_struct *mm) /* * No RIP-relative addressing on 32-bit */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { } -static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, struct arch_uprobe_task *autask) { } -static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) { } @@ -403,7 +401,7 @@ static inline int sizeof_long(void) static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + riprel_pre_xol(auprobe, regs, ¤t->utask->autask); return 0; } @@ -423,7 +421,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs struct uprobe_task *utask = current->utask; long correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); + riprel_post_xol(auprobe, regs, &correction); if (auprobe->def.fixups & UPROBE_FIX_IP) { regs->ip += correction; } else if (auprobe->def.fixups & UPROBE_FIX_CALL) { @@ -440,7 +438,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - handle_riprel_post_xol(auprobe, regs, NULL); + riprel_post_xol(auprobe, regs, NULL); } static struct uprobe_xol_ops default_xol_ops = { @@ -663,7 +661,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, } /* fall through */ default: - handle_riprel_insn(auprobe, &insn); + riprel_analyze(auprobe, &insn); } auprobe->def.ilen = insn.length; -- cgit v1.1 From 7f55e82bacaaa2c41b8e14d6bc78129b096b67b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Apr 2014 17:00:46 +0200 Subject: uprobes/x86: Kill the "autask" arg of riprel_pre_xol() default_pre_xol_op() passes ¤t->utask->autask to riprel_pre_xol() and this is just ugly because it still needs to load current->utask to read ->vaddr. Remove this argument, change riprel_pre_xol() to use current->utask. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 187be0e..5df1bca 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -330,16 +330,17 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) * If we're emulating a rip-relative instruction, save the contents * of the scratch register and store the target address in that register. */ -static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { + struct uprobe_task *utask = current->utask; + if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; + utask->autask.saved_scratch_register = regs->ax; + regs->ax = utask->vaddr; regs->ax += auprobe->def.riprel_target; } else if (auprobe->def.fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; + utask->autask.saved_scratch_register = regs->cx; + regs->cx = utask->vaddr; regs->cx += auprobe->def.riprel_target; } } @@ -377,8 +378,7 @@ static inline bool is_64bit_mm(struct mm_struct *mm) static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { } -static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, @@ -401,7 +401,7 @@ static inline int sizeof_long(void) static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - riprel_pre_xol(auprobe, regs, ¤t->utask->autask); + riprel_pre_xol(auprobe, regs); return 0; } -- cgit v1.1 From c90a6950120a7e45f31a22653fe6543507ae64d0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Apr 2014 18:13:31 +0200 Subject: uprobes/x86: Simplify riprel_{pre,post}_xol() and make them similar Ignoring the "correction" logic riprel_pre_xol() and riprel_post_xol() are very similar but look quite differently. 1. Add the "UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX" check at the start of riprel_pre_xol(), like the same check in riprel_post_xol(). 2. Add the trivial scratch_reg() helper which returns the address of scratch register pre_xol/post_xol need to change. 3. Change these functions to use the new helper and avoid copy-and-paste under if/else branches. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5df1bca..2ebadb2 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -326,22 +326,24 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) } } +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + return (auprobe->def.fixups & UPROBE_FIX_RIP_AX) ? ®s->ax : ®s->cx; +} + /* * If we're emulating a rip-relative instruction, save the contents * of the scratch register and store the target address in that register. */ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask = current->utask; + if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); - if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) { - utask->autask.saved_scratch_register = regs->ax; - regs->ax = utask->vaddr; - regs->ax += auprobe->def.riprel_target; - } else if (auprobe->def.fixups & UPROBE_FIX_RIP_CX) { - utask->autask.saved_scratch_register = regs->cx; - regs->cx = utask->vaddr; - regs->cx += auprobe->def.riprel_target; + utask->autask.saved_scratch_register = *sr; + *sr = utask->vaddr + auprobe->def.riprel_target; } } @@ -349,14 +351,10 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) { if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { - struct arch_uprobe_task *autask; - - autask = ¤t->utask->autask; - if (auprobe->def.fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + *sr = utask->autask.saved_scratch_register; /* * The original instruction includes a displacement, and so * is 4 bytes longer than what we've just single-stepped. -- cgit v1.1 From ce5f36a58fd1d92cb945cf2568751d40e8598508 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:26:01 +0200 Subject: uprobes/tracing: Make uprobe_perf_close() visible to uprobe_perf_open() Preparation. Move uprobe_perf_close() up before uprobe_perf_open() to avoid the forward declaration in the next patch and make it readable. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a74..51bd071 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1009,54 +1009,54 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - /* - * event->parent != NULL means copy_process(), we can avoid - * uprobe_apply(). current->mm must be probed and we can rely - * on dup_mmap() which preserves the already installed bp's. - * - * attr.enable_on_exec means that exec/mmap will install the - * breakpoints we need. - */ + list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - event->parent || event->attr.enable_on_exec || + (event->hw.tp_target->flags & PF_EXITING) || uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { + tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - list_del(&event->hw.tp_list); + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + event->parent || event->attr.enable_on_exec || uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { - tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } -- cgit v1.1 From 927d687480ab7e43d73a003bab58803fc67717d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:33:31 +0200 Subject: uprobes/tracing: Fix uprobe_perf_open() on uprobe_apply() failure uprobe_perf_open()->uprobe_apply() can fail, but this error is wrongly ignored. Change uprobe_perf_open() to do uprobe_perf_close() and return the error code in this case. Change uprobe_perf_close() to propogate the error from uprobe_apply() as well, although it should not fail. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 51bd071..5a7f1a6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1026,7 +1026,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } @@ -1034,6 +1034,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; + int err; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { @@ -1055,10 +1056,13 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) } write_unlock(&tu->filter.rwlock); - if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - - return 0; + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; } static bool uprobe_perf_filter(struct uprobe_consumer *uc, -- cgit v1.1 From 13f59c5e45be59665c11ddde19799b6295543b7d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Apr 2014 20:15:43 +0200 Subject: uprobes: Refuse to insert a probe into MAP_SHARED vma valid_vma() rejects the VM_SHARED vmas, but this still allows to insert a probe into the MAP_SHARED but not VM_MAYWRITE vma. Currently this is fine, such a mapping doesn't really differ from the private read-only mmap except mprotect(PROT_WRITE) won't work. However, get_user_pages(FOLL_WRITE | FOLL_FORCE) doesn't allow to COW in this case, and it would be safer to follow the same conventions as mm even if currently this happens to work. After the recent cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in shared areas" only uprobes can insert an anon page into the shared file-backed area, lets stop this and change valid_vma() to check VM_MAYSHARE instead. Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1edc5e..7716c40 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -127,7 +127,7 @@ struct xol_area { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; -- cgit v1.1 From 29dedee0e693aa113164c820395ce51446a71ace Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 May 2014 16:38:18 +0200 Subject: uprobes: Add mem_cgroup_charge_anon() into uprobe_write_opcode() Hugh says: The one I noticed was that it forgets all about memcg (because it was copied from KSM, and there the replacement page has already been charged to a memcg). See how mm/memory.c do_anonymous_page() does a mem_cgroup_charge_anon(). Hopefully not a big problem, uprobes is a system-wide thing and only root can insert the probes. But I agree, should be fixed anyway. Add mem_cgroup_{un,}charge_anon() into uprobe_write_opcode(). To simplify the error handling (and avoid the new "uncharge" label) the patch also moves anon_vma_prepare() up before we alloc/charge the new page. While at it fix the comment about ->mmap_sem, it is held for write. Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7716c40..a13251e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -279,18 +279,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. - */ - -/* + * * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -310,21 +305,25 @@ retry: if (ret <= 0) goto put_old; + ret = anon_vma_prepare(vma); + if (ret) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; - __SetPageUptodate(new_page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + goto put_new; + __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; - ret = __replace_page(vma, vaddr, old_page, new_page); + if (ret) + mem_cgroup_uncharge_page(new_page); put_new: page_cache_release(new_page); -- cgit v1.1 From 50204c6f6dd01b5bce1b53e0b003d01849455512 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 1 May 2014 16:52:46 +0200 Subject: uprobes/x86: Simplify rip-relative handling It is possible to replace rip-relative addressing mode with addressing mode of the same length: (reg+disp32). This eliminates the need to fix up immediate and correct for changing instruction length. And we can kill arch_uprobe->def.riprel_target. Signed-off-by: Denys Vlasenko Reviewed-by: Jim Keniston Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/uprobes.h | 3 -- arch/x86/kernel/uprobes.c | 71 ++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index a040d49..7be3c07 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -50,9 +50,6 @@ struct arch_uprobe { u8 opc1; } branch; struct { -#ifdef CONFIG_X86_64 - long riprel_target; -#endif u8 fixups; u8 ilen; } def; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ebadb2..31dcb4d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -251,9 +251,9 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set - * def->fixups and def->riprel_target accordingly. (The contents of the - * scratch register will be saved before we single-step the modified - * instruction, and restored afterward). + * def->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -264,9 +264,12 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * * Some useful facts about rip-relative instructions: * - * - There's always a modrm byte. + * - There's always a modrm byte with bit layout "00 reg 101". * - There's never a SIB byte. * - The displacement is always 4 bytes. + * - REX.B=1 bit in REX prefix, which normally extends r/m field, + * has no effect on rip-relative mode. It doesn't make modrm byte + * with r/m=101 refer to register 1101 = R13. */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { @@ -293,9 +296,8 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) */ cursor = auprobe->insn + insn_offset_modrm(insn); /* - * Convert from rip-relative addressing to indirect addressing - * via a scratch register. Change the r/m field from 0x5 (%rip) - * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + * Convert from rip-relative addressing + * to register-relative addressing via a scratch register. */ reg = MODRM_REG(insn); if (reg == 0) { @@ -307,22 +309,21 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) * #1) for the scratch register. */ auprobe->def.fixups |= UPROBE_FIX_RIP_CX; - /* Change modrm from 00 000 101 to 00 000 001. */ - *cursor = 0x1; + /* + * Change modrm from "00 000 101" to "10 000 001". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 81 disp32 mov %eax,disp32(%rcx) + */ + *cursor = 0x81; } else { /* Use %rax (register #0) for the scratch register. */ auprobe->def.fixups |= UPROBE_FIX_RIP_AX; - /* Change modrm from 00 xxx 101 to 00 xxx 000 */ - *cursor = (reg << 3); - } - - /* Target address = address of next instruction + (signed) offset */ - auprobe->def.riprel_target = (long)insn->length + insn->displacement.value; - - /* Displacement field is gone; slide immediate field (if any) over. */ - if (insn->immediate.nbytes) { - cursor++; - memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); + /* + * Change modrm from "00 reg 101" to "10 reg 000". Example: + * 89 1d disp32 mov %edx,disp32(%rip) becomes + * 89 98 disp32 mov %edx,disp32(%rax) + */ + *cursor = (reg << 3) | 0x80; } } @@ -343,26 +344,17 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long *sr = scratch_reg(auprobe, regs); utask->autask.saved_scratch_register = *sr; - *sr = utask->vaddr + auprobe->def.riprel_target; + *sr = utask->vaddr + auprobe->def.ilen; } } -static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); *sr = utask->autask.saved_scratch_register; - /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Caller may need to apply other fixups to handle stuff - * like "jmpq *...(%rip)" and "callq *...(%rip)". - */ - if (correction) - *correction += 4; } } #else /* 32-bit: */ @@ -379,8 +371,7 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } -static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } #endif /* CONFIG_X86_64 */ @@ -417,10 +408,10 @@ static int push_ret_address(struct pt_regs *regs, unsigned long ip) static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - long correction = (long)(utask->vaddr - utask->xol_vaddr); - riprel_post_xol(auprobe, regs, &correction); + riprel_post_xol(auprobe, regs); if (auprobe->def.fixups & UPROBE_FIX_IP) { + long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->def.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(); @@ -436,7 +427,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - riprel_post_xol(auprobe, regs, NULL); + riprel_post_xol(auprobe, regs); } static struct uprobe_xol_ops default_xol_ops = { @@ -732,11 +723,9 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * * If the original instruction was a rip-relative instruction such as * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,(%rax)". - * We need to restore the contents of the scratch register and adjust - * the ip, keeping in mind that the instruction we executed is 4 bytes - * shorter than the original instruction (since we squeezed out the offset - * field). (FIX_RIP_AX or FIX_RIP_CX) + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rax)". + * We need to restore the contents of the scratch register + * (FIX_RIP_AX or FIX_RIP_CX). */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { -- cgit v1.1 From 1ea30fb64598bd3a6ba43d874bb53c55878eaef5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 May 2014 17:04:00 +0200 Subject: uprobes/x86: Fix scratch register selection for rip-relative fixups Before this patch, instructions such as div, mul, shifts with count in CL, cmpxchg are mishandled. This patch adds vex prefix handling. In particular, it avoids colliding with register operand encoded in vex.vvvv field. Since we need to avoid two possible register operands, the selection of scratch register needs to be from at least three registers. After looking through a lot of CPU docs, it looks like the safest choice is SI,DI,BX. Selecting BX needs care to not collide with implicit use of BX by cmpxchg8b. Test-case: #include static const char *const pass[] = { "FAIL", "pass" }; long two = 2; void test1(void) { long ax = 0, dx = 0; asm volatile("\n" " xor %%edx,%%edx\n" " lea 2(%%edx),%%eax\n" // We divide 2 by 2. Result (in eax) should be 1: " probe1: .globl probe1\n" " divl two(%%rip)\n" // If we have a bug (eax mangled on entry) the result will be 2, // because eax gets restored by probe machinery. : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[ax == 1] ); } long val2 = 0; void test2(void) { long old_val = val2; long ax = 0, dx = 0; asm volatile("\n" " mov val2,%%eax\n" // eax := val2 " lea 1(%%eax),%%edx\n" // edx := eax+1 // eax is equal to val2. cmpxchg should store edx to val2: " probe2: .globl probe2\n" " cmpxchg %%edx,val2(%%rip)\n" // If we have a bug (eax mangled on entry), val2 will stay unchanged : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[val2 == old_val + 1] ); } long val3[2] = {0,0}; void test3(void) { long old_val = val3[0]; long ax = 0, dx = 0; asm volatile("\n" " mov val3,%%eax\n" // edx:eax := val3 " mov val3+4,%%edx\n" " mov %%eax,%%ebx\n" // ecx:ebx := edx:eax + 1 " mov %%edx,%%ecx\n" " add $1,%%ebx\n" " adc $0,%%ecx\n" // edx:eax is equal to val3. cmpxchg8b should store ecx:ebx to val3: " probe3: .globl probe3\n" " cmpxchg8b val3(%%rip)\n" // If we have a bug (edx:eax mangled on entry), val3 will stay unchanged. // If ecx:edx in mangled, val3 will get wrong value. : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "cx", "bx", "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[val3[0] == old_val + 1 && val3[1] == 0] ); } int main(int argc, char **argv) { test1(); test2(); test3(); return 0; } Before this change all tests fail if probe{1,2,3} are probed. Signed-off-by: Denys Vlasenko Reviewed-by: Jim Keniston Signed-off-by: Oleg Nesterov --- arch/x86/kernel/uprobes.c | 176 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 125 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 31dcb4d..159ca52 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -41,8 +41,11 @@ /* Instruction will modify TF, don't change it */ #define UPROBE_FIX_SETF 0x04 -#define UPROBE_FIX_RIP_AX 0x08 -#define UPROBE_FIX_RIP_CX 0x10 +#define UPROBE_FIX_RIP_SI 0x08 +#define UPROBE_FIX_RIP_DI 0x10 +#define UPROBE_FIX_RIP_BX 0x20 +#define UPROBE_FIX_RIP_MASK \ + (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX @@ -275,20 +278,109 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; + u8 reg2; if (!insn_rip_relative(insn)) return; /* - * insn_rip_relative() would have decoded rex_prefix, modrm. + * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. * Clear REX.b bit (extension of MODRM.rm field): - * we want to encode rax/rcx, not r8/r9. + * we want to encode low numbered reg, not r8+. */ if (insn->rex_prefix.nbytes) { cursor = auprobe->insn + insn_offset_rex_prefix(insn); - *cursor &= 0xfe; /* Clearing REX.B bit */ + /* REX byte has 0100wrxb layout, clearing REX.b bit */ + *cursor &= 0xfe; } + /* + * Similar treatment for VEX3 prefix. + * TODO: add XOP/EVEX treatment when insn decoder supports them + */ + if (insn->vex_prefix.nbytes == 3) { + /* + * vex2: c5 rvvvvLpp (has no b bit) + * vex3/xop: c4/8f rxbmmmmm wvvvvLpp + * evex: 62 rxbR00mm wvvvv1pp zllBVaaa + * (evex will need setting of both b and x since + * in non-sib encoding evex.x is 4th bit of MODRM.rm) + * Setting VEX3.b (setting because it has inverted meaning): + */ + cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; + *cursor |= 0x20; + } + + /* + * Convert from rip-relative addressing to register-relative addressing + * via a scratch register. + * + * This is tricky since there are insns with modrm byte + * which also use registers not encoded in modrm byte: + * [i]div/[i]mul: implicitly use dx:ax + * shift ops: implicitly use cx + * cmpxchg: implicitly uses ax + * cmpxchg8/16b: implicitly uses dx:ax and bx:cx + * Encoding: 0f c7/1 modrm + * The code below thinks that reg=1 (cx), chooses si as scratch. + * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. + * First appeared in Haswell (BMI2 insn). It is vex-encoded. + * Example where none of bx,cx,dx can be used as scratch reg: + * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx + * [v]pcmpistri: implicitly uses cx, xmm0 + * [v]pcmpistrm: implicitly uses xmm0 + * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 + * [v]pcmpestrm: implicitly uses ax, dx, xmm0 + * Evil SSE4.2 string comparison ops from hell. + * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. + * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. + * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). + * AMD says it has no 3-operand form (vex.vvvv must be 1111) + * and that it can have only register operands, not mem + * (its modrm byte must have mode=11). + * If these restrictions will ever be lifted, + * we'll need code to prevent selection of di as scratch reg! + * + * Summary: I don't know any insns with modrm byte which + * use SI register implicitly. DI register is used only + * by one insn (maskmovq) and BX register is used + * only by one too (cmpxchg8b). + * BP is stack-segment based (may be a problem?). + * AX, DX, CX are off-limits (many implicit users). + * SP is unusable (it's stack pointer - think about "pop mem"; + * also, rsp+disp32 needs sib encoding -> insn length change). + */ + reg = MODRM_REG(insn); /* Fetch modrm.reg */ + reg2 = 0xff; /* Fetch vex.vvvv */ + if (insn->vex_prefix.nbytes == 2) + reg2 = insn->vex_prefix.bytes[1]; + else if (insn->vex_prefix.nbytes == 3) + reg2 = insn->vex_prefix.bytes[2]; + /* + * TODO: add XOP, EXEV vvvv reading. + * + * vex.vvvv field is in bits 6-3, bits are inverted. + * But in 32-bit mode, high-order bit may be ignored. + * Therefore, let's consider only 3 low-order bits. + */ + reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; + /* + * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. + * + * Choose scratch reg. Order is important: must not select bx + * if we can use si (cmpxchg8b case!) + */ + if (reg != 6 && reg2 != 6) { + reg2 = 6; + auprobe->def.fixups |= UPROBE_FIX_RIP_SI; + } else if (reg != 7 && reg2 != 7) { + reg2 = 7; + auprobe->def.fixups |= UPROBE_FIX_RIP_DI; + /* TODO (paranoia): force maskmovq to not use di */ + } else { + reg2 = 3; + auprobe->def.fixups |= UPROBE_FIX_RIP_BX; + } /* * Point cursor at the modrm byte. The next 4 bytes are the * displacement. Beyond the displacement, for some instructions, @@ -296,41 +388,21 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) */ cursor = auprobe->insn + insn_offset_modrm(insn); /* - * Convert from rip-relative addressing - * to register-relative addressing via a scratch register. + * Change modrm from "00 reg 101" to "10 reg reg2". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 86 disp32 mov %eax,disp32(%rsi) */ - reg = MODRM_REG(insn); - if (reg == 0) { - /* - * The register operand (if any) is either the A register - * (%rax, %eax, etc.) or (if the 0x4 bit is set in the - * REX prefix) %r8. In any case, we know the C register - * is NOT the register operand, so we use %rcx (register - * #1) for the scratch register. - */ - auprobe->def.fixups |= UPROBE_FIX_RIP_CX; - /* - * Change modrm from "00 000 101" to "10 000 001". Example: - * 89 05 disp32 mov %eax,disp32(%rip) becomes - * 89 81 disp32 mov %eax,disp32(%rcx) - */ - *cursor = 0x81; - } else { - /* Use %rax (register #0) for the scratch register. */ - auprobe->def.fixups |= UPROBE_FIX_RIP_AX; - /* - * Change modrm from "00 reg 101" to "10 reg 000". Example: - * 89 1d disp32 mov %edx,disp32(%rip) becomes - * 89 98 disp32 mov %edx,disp32(%rax) - */ - *cursor = (reg << 3) | 0x80; - } + *cursor = 0x80 | (reg << 3) | reg2; } static inline unsigned long * scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return (auprobe->def.fixups & UPROBE_FIX_RIP_AX) ? ®s->ax : ®s->cx; + if (auprobe->def.fixups & UPROBE_FIX_RIP_SI) + return ®s->si; + if (auprobe->def.fixups & UPROBE_FIX_RIP_DI) + return ®s->di; + return ®s->bx; } /* @@ -339,7 +411,7 @@ scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) */ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); @@ -350,7 +422,7 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); @@ -405,6 +477,23 @@ static int push_ret_address(struct pt_regs *regs, unsigned long ip) return 0; } +/* + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg). + */ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; @@ -711,21 +800,6 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. - * We have to fix things up as follows: - * - * Typically, the new ip is relative to the copied instruction. We need - * to make it relative to the original instruction (FIX_IP). Exceptions - * are return instructions and absolute or indirect jump or call instructions. - * - * If the single-stepped instruction was a call, the return address that - * is atop the stack is the address following the copied instruction. We - * need to make it the address following the original instruction (FIX_CALL). - * - * If the original instruction was a rip-relative instruction such as - * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rax)". - * We need to restore the contents of the scratch register - * (FIX_RIP_AX or FIX_RIP_CX). */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { -- cgit v1.1 From 5e1b05beeca8139204324581a5b1ffb53d057f96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:34:00 +0200 Subject: x86/traps: Make math_error() static Trivial, make math_error() static. Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/traps.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 58d66fe..a7b212d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -98,7 +98,6 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; -void math_error(struct pt_regs *, int, int); void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6..8eddd62 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -488,7 +488,7 @@ exit: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; siginfo_t info; -- cgit v1.1 From 38cad57be9800e46c52a3612fb9d963eee4fd9c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 16:47:09 +0200 Subject: x86/traps: Use SEND_SIG_PRIV instead of force_sig() force_sig() is just force_sig_info(SEND_SIG_PRIV). Imho it should die, we have too many ugly "send signal" helpers. And do_trap() looks just ugly because it uses force_sig_info() or force_sig() depending on info != NULL. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8eddd62..2cd4291 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -168,10 +168,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, } #endif - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); + force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -305,7 +302,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_cont("\n"); } - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); exit: exception_exit(prev_state); } @@ -645,7 +642,7 @@ void math_state_restore(void) */ if (unlikely(restore_fpu_checking(tsk))) { drop_init_fpu(tsk); - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); return; } -- cgit v1.1 From dff0796e53c29147c9bd1f5567a261dcf0e528bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 17:21:34 +0200 Subject: x86/traps: Introduce do_error_trap() Move the common code from DO_ERROR() and DO_ERROR_INFO() into the new helper, do_error_trap(). This simplifies define's and shaves 527 bytes from traps.o. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2cd4291..ab8dad7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -171,41 +171,37 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, + unsigned long trapnr, int signr, siginfo_t *info) +{ + enum ctx_state prev_state = exception_enter(); + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != + NOTIFY_STOP) { + conditional_sti(regs); + do_trap(trapnr, signr, str, regs, error_code, info); + } + + exception_exit(prev_state); +} + #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - enum ctx_state prev_state; \ - \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ - exception_exit(prev_state); \ + do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ - enum ctx_state prev_state; \ \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ - exception_exit(prev_state); \ + \ + do_error_trap(regs, error_code, str, trapnr, signr, &info); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) -- cgit v1.1 From 958d3d729802f7d741cbe8400e69b89baae580ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 17:59:39 +0200 Subject: x86/traps: Introduce fill_trap_info(), simplify DO_ERROR_INFO() Extract the fill-siginfo code from DO_ERROR_INFO() into the new helper, fill_trap_info(). It can calculate si_code and si_addr looking at trapnr, so we can remove these arguments from DO_ERROR_INFO() and simplify the source code. The generated code is the same, __builtin_constant_p(trapnr) == T. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 53 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ab8dad7..1cf8a4c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,6 +136,33 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } +static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) +{ + unsigned long siaddr; + int sicode; + + switch (trapnr) { + case X86_TRAP_DE: + sicode = FPE_INTDIV; + siaddr = regs->ip; + break; + case X86_TRAP_UD: + sicode = ILL_ILLOPN; + siaddr = regs->ip; + break; + case X86_TRAP_AC: + sicode = BUS_ADRALN; + siaddr = 0; + break; + } + + info->si_signo = signr; + info->si_errno = 0; + info->si_code = sicode; + info->si_addr = (void __user *)siaddr; +} + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -191,30 +218,26 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - \ + fill_trap_info(regs, signr, trapnr, &info); \ do_error_trap(regs, error_code, str, trapnr, signr, &info); \ } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) -DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) -DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) -DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) -DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) +DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -- cgit v1.1 From 1c326c4dfe182a1c4c1e39f2c00f04c380d11692 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:04:11 +0200 Subject: x86/traps: Shift fill_trap_info() from DO_ERROR_INFO() to do_error_trap() Move the callsite of fill_trap_info() into do_error_trap() and remove the "siginfo_t *info" argument. This obviously breaks DO_ERROR() which passed info == NULL, we simply change fill_trap_info() to return "siginfo_t *" and add the "default" case which returns SEND_SIG_PRIV. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1cf8a4c..c9dd741 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,13 +136,16 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } -static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, - siginfo_t *info) +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) { unsigned long siaddr; int sicode; switch (trapnr) { + default: + return SEND_SIG_PRIV; + case X86_TRAP_DE: sicode = FPE_INTDIV; siaddr = regs->ip; @@ -161,6 +164,7 @@ static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, info->si_errno = 0; info->si_code = sicode; info->si_addr = (void __user *)siaddr; + return info; } static void __kprobes @@ -199,14 +203,16 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, } static void do_error_trap(struct pt_regs *regs, long error_code, char *str, - unsigned long trapnr, int signr, siginfo_t *info) + unsigned long trapnr, int signr) { enum ctx_state prev_state = exception_enter(); + siginfo_t info; if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { conditional_sti(regs); - do_trap(trapnr, signr, str, regs, error_code, info); + do_trap(trapnr, signr, str, regs, error_code, + fill_trap_info(regs, signr, trapnr, &info)); } exception_exit(prev_state); @@ -215,16 +221,13 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ + do_error_trap(regs, error_code, str, trapnr, signr); \ } #define DO_ERROR_INFO(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - siginfo_t info; \ - \ - fill_trap_info(regs, signr, trapnr, &info); \ - do_error_trap(regs, error_code, str, trapnr, signr, &info); \ + do_error_trap(regs, error_code, str, trapnr, signr); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) -- cgit v1.1 From 0eb14833d5b1ea1accfeffb71be5de5929f85da9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:12:24 +0200 Subject: x86/traps: Kill DO_ERROR_INFO() Now that DO_ERROR_INFO() doesn't differ from DO_ERROR() we can remove it and use DO_ERROR() instead. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c9dd741..73b3ea3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -224,23 +224,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr); \ -} - -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) -DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) -DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -- cgit v1.1 From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer(). Test-case: #include #include #include extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to cleanup them first. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/traps.c | 7 ++++--- include/linux/uprobes.h | 4 ++++ kernel/events/uprobes.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 73b3ea3..3fdb205 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -148,11 +149,11 @@ static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, case X86_TRAP_DE: sicode = FPE_INTDIV; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_UD: sicode = ILL_ILLOPN; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_AC: sicode = BUS_ADRALN; @@ -531,7 +532,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b9..88c3b7e 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r #else /* !CONFIG_UPROBES */ struct uprobes_state { }; + +#define uprobe_get_trap_addr(regs) instruction_pointer(regs) + static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e..3b02c72 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. -- cgit v1.1