From 61365e132ef987f7719af5d2e434db4465957637 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 7 Dec 2009 12:51:42 +0100
Subject: [S390] Improve address space check.

A data access in access-register mode is always a user mode access,
so the code that inspects the access registers can be removed. The
second change is to use a different test to check for a no-execute
fault. The third change is to pass the translation exception
identification as a parameter; in theory the trans_exc_code in the
lowcore could have been overwritten by the time check_space is called
from do_no_context.

Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/fault.c | 99 ++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 54 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 6d50746..3df5b91 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -100,39 +100,28 @@ void bust_spinlocks(int yes)
 
 /*
  * Returns the address space associated with the fault.
- * Returns 0 for kernel space, 1 for user space and
- * 2 for code execution in user space with noexec=on.
+ * Returns 0 for kernel space and 1 for user space.
  */
-static inline int check_space(struct task_struct *tsk)
+static inline int user_space_fault(unsigned long trans_exc_code)
 {
 	/*
-	 * The lowest two bits of S390_lowcore.trans_exc_code
-	 * indicate which paging table was used.
+	 * The lowest two bits of the translation exception
+	 * identification indicate which paging table was used.
 	 */
-	int desc = S390_lowcore.trans_exc_code & 3;
-
-	if (desc == 3)	/* Home Segment Table Descriptor */
-		return switch_amode == 0;
-	if (desc == 2)	/* Secondary Segment Table Descriptor */
-		return tsk->thread.mm_segment.ar4;
-#ifdef CONFIG_S390_SWITCH_AMODE
-	if (unlikely(desc == 1)) { /* STD determined via access register */
-		/* %a0 always indicates primary space. */
-		if (S390_lowcore.exc_access_id != 0) {
-			save_access_regs(tsk->thread.acrs);
-			/*
-			 * An alet of 0 indicates primary space.
-			 * An alet of 1 indicates secondary space.
-			 * Any other alet values generate an
-			 * alen-translation exception.
-			 */
-			if (tsk->thread.acrs[S390_lowcore.exc_access_id])
-				return tsk->thread.mm_segment.ar4;
-		}
-	}
-#endif
-	/* Primary Segment Table Descriptor */
-	return switch_amode << s390_noexec;
+	trans_exc_code &= 3;
+	if (trans_exc_code == 2)
+		/* Access via secondary space, set_fs setting decides */
+		return current->thread.mm_segment.ar4;
+	if (!switch_amode)
+		/* User space if the access has been done via home space. */
+		return trans_exc_code == 3;
+	/*
+	 * If the user space is not the home space the kernel runs in home
+	 * space. Access via secondary space has already been covered,
+	 * access via primary space or access register is from user space
+	 * and access via home space is from the kernel.
+	 */
+	return trans_exc_code != 3;
 }
 
 /*
@@ -162,9 +151,10 @@ static void do_sigsegv(struct pt_regs *regs, unsigned long error_code,
 }
 
 static void do_no_context(struct pt_regs *regs, unsigned long error_code,
-			   unsigned long address)
+			   unsigned long trans_exc_code)
 {
 	const struct exception_table_entry *fixup;
+	unsigned long address;
 
 	/* Are we prepared to handle this kernel fault?  */
 	fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK);
 	if (fixup) {
 		regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE;
 		return;
@@ -177,7 +167,8 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code,
 	 * Oops. The kernel tried to access some bad page. We'll have to
 	 * terminate things with extreme prejudice.
*/ - if (check_space(current) == 0) + address = trans_exc_code & __FAIL_ADDR_MASK; + if (user_space_fault(trans_exc_code) == 0) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else @@ -188,7 +179,8 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, do_exit(SIGKILL); } -static void do_low_address(struct pt_regs *regs, unsigned long error_code) +static void do_low_address(struct pt_regs *regs, unsigned long error_code, + unsigned long trans_exc_code) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ @@ -198,11 +190,11 @@ static void do_low_address(struct pt_regs *regs, unsigned long error_code) do_exit(SIGKILL); } - do_no_context(regs, error_code, 0); + do_no_context(regs, error_code, trans_exc_code); } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long trans_exc_code) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; @@ -212,13 +204,13 @@ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - tsk->thread.prot_addr = address; + tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; tsk->thread.trap_no = error_code; force_sig(SIGBUS, tsk); /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) - do_no_context(regs, error_code, address); + do_no_context(regs, error_code, trans_exc_code); } #ifdef CONFIG_S390_EXEC_PROTECT @@ -272,13 +264,13 @@ static int signal_return(struct mm_struct *mm, struct pt_regs *regs, * 3b Region third trans. -> Not present (nullification) */ static inline void -do_exception(struct pt_regs *regs, unsigned long error_code, int write) +do_exception(struct pt_regs *regs, unsigned long error_code, int write, + unsigned long trans_exc_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int space; int si_code; int fault; @@ -288,18 +280,15 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) tsk = current; mm = tsk->mm; - /* get the failing address and the affected space */ - address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; - space = check_space(tsk); - /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ - if (unlikely(space == 0 || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; + address = trans_exc_code & __FAIL_ADDR_MASK; /* * When we get here, the fault happened in the current * task's user address space, so we can switch on the @@ -315,7 +304,8 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) goto bad_area; #ifdef CONFIG_S390_EXEC_PROTECT - if (unlikely((space == 2) && !(vma->vm_flags & VM_EXEC))) + if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC))) if (!signal_return(mm, regs, address, error_code)) /* * signal_return() has done an up_read(&mm->mmap_sem) @@ -397,12 +387,14 @@ bad_area: } no_context: - do_no_context(regs, error_code, address); + do_no_context(regs, error_code, trans_exc_code); } void __kprobes do_protection_exception(struct pt_regs *regs, long error_code) { + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + /* Protection exception is supressing, decrement psw address. 
 */
	regs->psw.addr -= (error_code >> 16);
	/*
@@ -410,31 +402,30 @@ void __kprobes do_protection_exception(struct pt_regs *regs,
 	 * as a special case because the translation exception code
 	 * field is not guaranteed to contain valid data in this case.
 	 */
-	if (unlikely(!(S390_lowcore.trans_exc_code & 4))) {
-		do_low_address(regs, error_code);
+	if (unlikely(!(trans_exc_code & 4))) {
+		do_low_address(regs, error_code, trans_exc_code);
 		return;
 	}
-	do_exception(regs, 4, 1);
+	do_exception(regs, 4, 1, trans_exc_code);
 }
 
 void __kprobes do_dat_exception(struct pt_regs *regs, long error_code)
 {
-	do_exception(regs, error_code & 0xff, 0);
+	do_exception(regs, error_code & 0xff, 0, S390_lowcore.trans_exc_code);
 }
 
 #ifdef CONFIG_64BIT
 void __kprobes
 do_asce_exception(struct pt_regs *regs, unsigned long error_code)
 {
+	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	unsigned long address;
-	int space;
 
 	mm = current->mm;
-	address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK;
-	space = check_space(current);
+	address = trans_exc_code & __FAIL_ADDR_MASK;
 
-	if (unlikely(space == 0 || in_atomic() || !mm))
+	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
		goto no_context;
 
	local_irq_enable();
@@ -457,7 +448,7 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code)
 	}
 
 no_context:
-	do_no_context(regs, error_code, address);
+	do_no_context(regs, error_code, trans_exc_code);
 }
 #endif
 
--
cgit v1.1
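
The new user_space_fault() check boils down to a two-bit decision table
on the translation exception identification. The stand-alone C sketch
below mirrors the logic of the patch above so it can be exercised in
user space; switch_amode and the set_fs() state are modeled as plain
variables, and main() is scaffolding that is not part of the kernel
code:

#include <stdio.h>

/* Stand-ins for the kernel state the real function consults. */
static int switch_amode;	/* 0: user space runs in home space */
static int mm_segment_ar4;	/* set_fs() state for secondary space */

/* Mirrors user_space_fault() above: returns 1 for a user space fault. */
static int user_space_fault(unsigned long trans_exc_code)
{
	/* Bits 0-1: 0 primary, 1 access register, 2 secondary, 3 home. */
	trans_exc_code &= 3;
	if (trans_exc_code == 2)
		/* Secondary space: the set_fs() setting decides. */
		return mm_segment_ar4;
	if (!switch_amode)
		/* Kernel and user share home space: home accesses are user. */
		return trans_exc_code == 3;
	/* Kernel owns home space: everything except home is user. */
	return trans_exc_code != 3;
}

int main(void)
{
	unsigned long code;

	for (switch_amode = 0; switch_amode <= 1; switch_amode++)
		for (code = 0; code < 4; code++)
			printf("amode=%d code=%lu -> %s\n", switch_amode, code,
			       user_space_fault(code) ? "user" : "kernel");
	return 0;
}

With switch_amode == 0 the table reports "user" only for home-space
accesses; with switch_amode == 1 it inverts, which is exactly the
distinction the next patch encodes in the user_mode variable.
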
From b11b53342773361f3353b285eb6a3fd6074e7997 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 7 Dec 2009 12:51:43 +0100
Subject: [S390] Improve address space mode selection.

Introduce user_mode to replace the two variables switch_amode and
s390_noexec. There are three valid combinations of the old values:
  1) switch_amode == 0 && s390_noexec == 0
  2) switch_amode == 1 && s390_noexec == 0
  3) switch_amode == 1 && s390_noexec == 1
They get replaced by
  1) user_mode == HOME_SPACE_MODE
  2) user_mode == PRIMARY_SPACE_MODE
  3) user_mode == SECONDARY_SPACE_MODE
The new kernel parameter user_mode=[primary,secondary,home] lets you
choose the address space mode the user space processes should use.
In addition, the CONFIG_S390_SWITCH_AMODE config option is removed.

Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/fault.c   | 4 ++--
 arch/s390/mm/pgtable.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 3df5b91..77108e3 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -112,7 +112,7 @@ static inline int user_space_fault(unsigned long trans_exc_code)
 	if (trans_exc_code == 2)
 		/* Access via secondary space, set_fs setting decides */
 		return current->thread.mm_segment.ar4;
-	if (!switch_amode)
+	if (user_mode == HOME_SPACE_MODE)
 		/* User space if the access has been done via home space. */
 		return trans_exc_code == 3;
 	/*
@@ -168,7 +168,7 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code,
 	 * terminate things with extreme prejudice.
 	 */
 	address = trans_exc_code & __FAIL_ADDR_MASK;
-	if (user_space_fault(trans_exc_code) == 0)
+	if (!user_space_fault(trans_exc_code))
 		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
 		       " at virtual kernel address %p\n", (void *)address);
 	else
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2757c56..ad621e0 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -269,7 +269,7 @@ int s390_enable_sie(void)
 	struct mm_struct *mm, *old_mm;
 
 	/* Do we have switched amode? If no, we cannot do sie */
-	if (!switch_amode)
+	if (user_mode == HOME_SPACE_MODE)
 		return -EINVAL;
 
 	/* Do we have pgstes? if yes, we are done */
--
cgit v1.1


From 7ecb344ae80bc03397ded3b004e06ecfe32becf9 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 7 Dec 2009 12:51:44 +0100
Subject: [S390] Improve notify_page_fault implementation.

notify_page_fault does a preempt_disable/preempt_enable for each
fault generated by a kernel access to user space. If kprobes is not
active, that is unnecessary, since interrupts have not been re-enabled
yet. To play it safe, repeat the kprobe_running check after
preempt_disable().

Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/fault.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 77108e3..fd72c26 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -52,11 +52,11 @@ extern int sysctl_userprocess_debug;
 #endif
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, long err)
+static inline int notify_page_fault(struct pt_regs *regs)
 {
 	int ret = 0;
 
+#ifdef CONFIG_KPROBES
 	/* kprobe_running() needs smp_processor_id() */
 	if (!user_mode(regs)) {
 		preempt_disable();
@@ -64,15 +64,9 @@ static inline int notify_page_fault(struct pt_regs *regs, long err)
 			ret = 1;
 		preempt_enable();
 	}
-
+#endif
 	return ret;
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs, long err)
-{
-	return 0;
-}
-#endif
 
 
 /*
@@ -274,7 +268,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write,
 	int si_code;
 	int fault;
 
-	if (notify_page_fault(regs, error_code))
+	if (notify_page_fault(regs))
 		return;
 
 	tsk = current;
--
cgit v1.1


From 50d7280d430484a890ddcadc7f738b5b6dd28bf1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 7 Dec 2009 12:51:45 +0100
Subject: [S390] fault handler performance optimization.

Slim down the do_exception function to handle only the fast path of a
fault and move the exceptional cases into a new function. That
slightly increases the performance of the fault handling.

Build fix for !CONFIG_COMPAT by Kamalesh Babulal Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 258 +++++++++++++++++++++++++-------------------------- 1 file changed, 129 insertions(+), 129 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fd72c26..0dcfcfb 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -34,16 +34,15 @@ #include #include #include +#include #include "../kernel/entry.h" #ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 -#define __FIXUP_MASK 0x7fffffff #define __SUBCODE_MASK 0x0200 #define __PF_RES_FIELD 0ULL #else /* CONFIG_64BIT */ #define __FAIL_ADDR_MASK -4096L -#define __FIXUP_MASK ~0L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL #endif /* CONFIG_64BIT */ @@ -52,6 +51,10 @@ extern int sysctl_userprocess_debug; #endif +#define VM_FAULT_BADCONTEXT 0x010000 +#define VM_FAULT_BADMAP 0x020000 +#define VM_FAULT_BADACCESS 0x040000 + static inline int notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -122,18 +125,22 @@ static inline int user_space_fault(unsigned long trans_exc_code) * Send SIGSEGV to task. This is an external routine * to keep the stack usage of do_page_fault small. */ -static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, - int si_code, unsigned long address) +static noinline void do_sigsegv(struct pt_regs *regs, long int_code, + int si_code, unsigned long trans_exc_code) { struct siginfo si; + unsigned long address; + address = trans_exc_code & __FAIL_ADDR_MASK; + current->thread.prot_addr = address; + current->thread.trap_no = int_code; #if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) #if defined(CONFIG_SYSCTL) if (sysctl_userprocess_debug) #endif { printk("User process fault: interruption code 0x%lX\n", - error_code); + int_code); printk("failing address: %lX\n", address); show_regs(regs); } @@ -144,14 +151,14 @@ static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, force_sig_info(SIGSEGV, &si, current); } -static void do_no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_no_context(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { const struct exception_table_entry *fixup; unsigned long address; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK); + fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); if (fixup) { regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; return; @@ -169,107 +176,127 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_ALERT "Unable to handle kernel paging request" " at virtual user address %p\n", (void *)address); - die("Oops", regs, error_code); + die("Oops", regs, int_code); do_exit(SIGKILL); } -static void do_low_address(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_low_address(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ if (regs->psw.mask & PSW_MASK_PSTATE) { /* Low-address protection hit in user mode 'cannot happen'. 
*/ - die ("Low-address protection", regs, error_code); + die ("Low-address protection", regs, int_code); do_exit(SIGKILL); } - do_no_context(regs, error_code, trans_exc_code); + do_no_context(regs, int_code, trans_exc_code); } -static void do_sigbus(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_sigbus(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; - up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; - tsk->thread.trap_no = error_code; + tsk->thread.trap_no = int_code; force_sig(SIGBUS, tsk); - - /* Kernel mode? Handle exceptions or die */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) - do_no_context(regs, error_code, trans_exc_code); } #ifdef CONFIG_S390_EXEC_PROTECT -static int signal_return(struct mm_struct *mm, struct pt_regs *regs, - unsigned long address, unsigned long error_code) +static noinline int signal_return(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { u16 instruction; int rc; -#ifdef CONFIG_COMPAT - int compat; -#endif - pagefault_disable(); rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - pagefault_enable(); - if (rc) - return -EFAULT; - up_read(&mm->mmap_sem); - clear_tsk_thread_flag(current, TIF_SINGLE_STEP); -#ifdef CONFIG_COMPAT - compat = is_compat_task(); - if (compat && instruction == 0x0a77) - sys32_sigreturn(); - else if (compat && instruction == 0x0aad) - sys32_rt_sigreturn(); - else -#endif - if (instruction == 0x0a77) - sys_sigreturn(); - else if (instruction == 0x0aad) - sys_rt_sigreturn(); - else { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); - } + if (!rc && instruction == 0x0a77) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_sigreturn(); + else + sys_sigreturn(); + } else if (!rc && instruction == 0x0aad) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_rt_sigreturn(); + else + sys_rt_sigreturn(); + } else + do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); return 0; } #endif /* CONFIG_S390_EXEC_PROTECT */ +static noinline void do_fault_error(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code, int fault) +{ + int si_code; + + switch (fault) { + case VM_FAULT_BADACCESS: +#ifdef CONFIG_S390_EXEC_PROTECT + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0) { + signal_return(regs, int_code, trans_exc_code); + break; + } +#endif /* CONFIG_S390_EXEC_PROTECT */ + case VM_FAULT_BADMAP: + /* Bad memory access. Check if it is kernel or user space. */ + if (regs->psw.mask & PSW_MASK_PSTATE) { + /* User mode accesses just cause a SIGSEGV */ + si_code = (fault == VM_FAULT_BADMAP) ? + SEGV_MAPERR : SEGV_ACCERR; + do_sigsegv(regs, int_code, si_code, trans_exc_code); + return; + } + case VM_FAULT_BADCONTEXT: + do_no_context(regs, int_code, trans_exc_code); + break; + default: /* fault & VM_FAULT_ERROR */ + if (fault & VM_FAULT_OOM) + pagefault_out_of_memory(); + else if (fault & VM_FAULT_SIGBUS) { + do_sigbus(regs, int_code, trans_exc_code); + /* Kernel mode? Handle exceptions or die */ + if (!(regs->psw.mask & PSW_MASK_PSTATE)) + do_no_context(regs, int_code, trans_exc_code); + } else + BUG(); + break; + } +} + /* * This routine handles page faults. 
It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * - * error_code: + * interruption code (int_code): * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ -static inline void -do_exception(struct pt_regs *regs, unsigned long error_code, int write, - unsigned long trans_exc_code) +static inline int do_exception(struct pt_regs *regs, int write, + unsigned long trans_exc_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int si_code; int fault; if (notify_page_fault(regs)) - return; + return 0; tsk = current; mm = tsk->mm; @@ -279,8 +306,9 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write, * we are not in an interrupt and that there is a * user context. */ + fault = VM_FAULT_BADCONTEXT; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto no_context; + goto out; address = trans_exc_code & __FAIL_ADDR_MASK; /* @@ -292,41 +320,35 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); - si_code = SEGV_MAPERR; + fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto bad_area; + goto out_up; + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; + if (expand_stack(vma, address)) + goto out_up; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + fault = VM_FAULT_BADACCESS; #ifdef CONFIG_S390_EXEC_PROTECT if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC))) - if (!signal_return(mm, regs, address, error_code)) - /* - * signal_return() has done an up_read(&mm->mmap_sem) - * if it returns 0. - */ - return; + goto out_up; #endif - - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; if (!write) { /* page not present, check vm flags */ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + goto out_up; } else { if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; + goto out_up; } if (is_vm_hugetlb_page(vma)) @@ -337,17 +359,9 @@ good_area: * the fault. */ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) { - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - } else if (fault & VM_FAULT_SIGBUS) { - do_sigbus(regs, error_code, address); - return; - } - BUG(); - } + if (unlikely(fault & VM_FAULT_ERROR)) + goto out_up; + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, @@ -357,67 +371,55 @@ good_area: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } - up_read(&mm->mmap_sem); /* * The instruction that caused the program check will * be repeated. Don't signal single step via SIGTRAP. */ clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. 
- */ -bad_area: + fault = 0; +out_up: up_read(&mm->mmap_sem); - - /* User mode accesses just cause a SIGSEGV */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - tsk->thread.prot_addr = address; - tsk->thread.trap_no = error_code; - do_sigsegv(regs, error_code, si_code, address); - return; - } - -no_context: - do_no_context(regs, error_code, trans_exc_code); +out: + return fault; } -void __kprobes do_protection_exception(struct pt_regs *regs, - long error_code) +void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) { unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int fault; /* Protection exception is supressing, decrement psw address. */ - regs->psw.addr -= (error_code >> 16); + regs->psw.addr -= (int_code >> 16); /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs, error_code, trans_exc_code); + do_low_address(regs, int_code, trans_exc_code); return; } - do_exception(regs, 4, 1, trans_exc_code); + fault = do_exception(regs, 1, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, 4, trans_exc_code, fault); } -void __kprobes do_dat_exception(struct pt_regs *regs, long error_code) +void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) { - do_exception(regs, error_code & 0xff, 0, S390_lowcore.trans_exc_code); + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int fault; + + fault = do_exception(regs, 0, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, int_code & 255, trans_exc_code, fault); } #ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) +void __kprobes do_asce_exception(struct pt_regs *regs, long int_code) { unsigned long trans_exc_code = S390_lowcore.trans_exc_code; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long address; - - mm = current->mm; - address = trans_exc_code & __FAIL_ADDR_MASK; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; @@ -425,7 +427,7 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); up_read(&mm->mmap_sem); if (vma) { @@ -435,14 +437,12 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) /* User mode accesses just cause a SIGSEGV */ if (regs->psw.mask & PSW_MASK_PSTATE) { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); + do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); return; } no_context: - do_no_context(regs, error_code, trans_exc_code); + do_no_context(regs, int_code, trans_exc_code); } #endif @@ -507,7 +507,7 @@ void pfault_fini(void) : : "a" (&refbk), "m" (refbk) : "cc"); } -static void pfault_interrupt(__u16 error_code) +static void pfault_interrupt(__u16 int_code) { struct task_struct *tsk; __u16 subcode; -- cgit v1.1 From 1ab947de293f43812276b60cf9fa21127e7a5bb2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:46 +0100 Subject: [S390] fault handler access flags check. Simplify the check of the vma->flags in do_exception for the different fault types. 
Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/fault.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 0dcfcfb..5a9e9a7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -286,7 +286,7 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
  * 11 Page translation -> Not present (nullification)
  * 3b Region third trans. -> Not present (nullification)
  */
-static inline int do_exception(struct pt_regs *regs, int write,
+static inline int do_exception(struct pt_regs *regs, int access,
 			       unsigned long trans_exc_code)
 {
 	struct task_struct *tsk;
@@ -337,19 +337,8 @@ static inline int do_exception(struct pt_regs *regs, int write,
 	 * we can handle it..
 	 */
 	fault = VM_FAULT_BADACCESS;
-#ifdef CONFIG_S390_EXEC_PROTECT
-	if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-		     (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC)))
+	if (unlikely(!(vma->vm_flags & access)))
 		goto out_up;
-#endif
-	if (!write) {
-		/* page not present, check vm flags */
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto out_up;
-	} else {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto out_up;
-	}
 
 	if (is_vm_hugetlb_page(vma))
 		address &= HPAGE_MASK;
@@ -358,7 +347,8 @@ static inline int do_exception(struct pt_regs *regs, int write,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, address,
+				(access == VM_WRITE) ? FAULT_FLAG_WRITE : 0);
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
@@ -399,7 +389,7 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
 		do_low_address(regs, int_code, trans_exc_code);
 		return;
 	}
-	fault = do_exception(regs, 1, trans_exc_code);
+	fault = do_exception(regs, VM_WRITE, trans_exc_code);
 	if (unlikely(fault))
 		do_fault_error(regs, 4, trans_exc_code, fault);
 }
@@ -407,9 +397,15 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
 void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
 {
 	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
-	int fault;
+	int access, fault;
 
-	fault = do_exception(regs, 0, trans_exc_code);
+	access = VM_READ | VM_EXEC | VM_WRITE;
+#ifdef CONFIG_S390_EXEC_PROTECT
+	if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
+	    (trans_exc_code & 3) == 0)
+		access = VM_EXEC;
+#endif
+	fault = do_exception(regs, access, trans_exc_code);
 	if (unlikely(fault))
 		do_fault_error(regs, int_code & 255, trans_exc_code, fault);
 }
--
cgit v1.1
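
The simplification above turns the per-fault-type branching into a
single bitmask test: each exception type is described by the set of
vm_flags that would satisfy it. A toy illustration with made-up flag
values (the real VM_READ/VM_WRITE/VM_EXEC definitions live in the
kernel's mm headers; this is not kernel code):

#include <stdio.h>

/* Placeholder flag values, mirroring the vm_flags semantics only. */
#define VM_READ  0x1UL
#define VM_WRITE 0x2UL
#define VM_EXEC  0x4UL

/* One AND replaces the old branches: a protection fault needs VM_WRITE,
 * an execute-protection fault needs VM_EXEC, and an ordinary
 * not-present fault accepts any of the three flags. */
static int is_bad_access(unsigned long vm_flags, unsigned long access)
{
	return !(vm_flags & access);	/* the VM_FAULT_BADACCESS case */
}

int main(void)
{
	/* A read-only vma satisfies a read fault but not a write fault. */
	unsigned long ro_vma = VM_READ;

	printf("read fault on r/o vma:  %s\n",
	       is_bad_access(ro_vma, VM_READ | VM_EXEC | VM_WRITE) ? "bad" : "ok");
	printf("write fault on r/o vma: %s\n",
	       is_bad_access(ro_vma, VM_WRITE) ? "bad" : "ok");
	return 0;
}

This is why do_protection_exception() now passes VM_WRITE while
do_dat_exception() passes all three flags, narrowed to VM_EXEC for an
execute-protection fault.
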
From 6c1e3e79430615d0472dbf9f8fed89c571e66423 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Mon, 7 Dec 2009 12:51:47 +0100
Subject: [S390] Use do_exception() in pagetable walk usercopy functions.

The pagetable walk usercopy functions have used a modified copy of the
do_exception() function for fault handling. This led to inconsistencies
with recent changes to do_exception(), e.g. performance counters. This
patch changes the pagetable walk usercopy code to call do_exception()
directly, eliminating the redundancy. A new parameter is added to
do_exception() to specify the fault address.

Signed-off-by: Gerald Schaefer
Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/fault.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 5a9e9a7..fc102e7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -442,6 +442,29 @@ no_context:
 }
 #endif
 
+int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
+{
+	struct pt_regs regs;
+	int access, fault;
+
+	regs.psw.mask = psw_kernel_bits;
+	if (!irqs_disabled())
+		regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT;
+	regs.psw.addr = (unsigned long) __builtin_return_address(0);
+	regs.psw.addr |= PSW_ADDR_AMODE;
+	uaddr &= PAGE_MASK;
+	access = write_user ? VM_WRITE : VM_READ;
+	fault = do_exception(&regs, access, uaddr | 2);
+	if (unlikely(fault)) {
+		if (fault & VM_FAULT_OOM) {
+			pagefault_out_of_memory();
+			fault = 0;
+		} else if (fault & VM_FAULT_SIGBUS)
+			do_sigbus(&regs, int_code, uaddr);
+	}
+	return fault ? -EFAULT : 0;
+}
+
 #ifdef CONFIG_PFAULT
 /*
  * 'pfault' pseudo page faults routines.
--
cgit v1.1
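
__handle_fault() above can reuse do_exception() because the fault path
needs only one fact besides the registers: a translation exception
identification. The sketch below models how the patch fabricates one,
with the failing address in the high bits and 2 in the low bits, the
"secondary space" pattern that user_space_fault() treats as a user
access. The helper names here are hypothetical; only the bit layout is
taken from the patch:

#include <assert.h>

#define PAGE_MASK	(~4095UL)	/* assumes 4K pages, as in the patch */

/* Synthesize a translation exception identification for address uaddr. */
static unsigned long fake_trans_exc_code(unsigned long uaddr)
{
	return (uaddr & PAGE_MASK) | 2;	/* low bits 2 = secondary space */
}

/* Recover the failing address, the __FAIL_ADDR_MASK analogue. */
static unsigned long fail_address(unsigned long trans_exc_code)
{
	return trans_exc_code & PAGE_MASK;
}

int main(void)
{
	unsigned long code = fake_trans_exc_code(0x20001234UL);

	assert((code & 3) == 2);			/* secondary space */
	assert(fail_address(code) == 0x20001000UL);	/* page-aligned */
	return 0;
}
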
From 52b169c864ea8622c4755172844fd24168c81195 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 7 Dec 2009 12:52:06 +0100
Subject: [S390] cmm: free pages on hibernate.

The pages allocated by the cmm memory balloon should be freed before
the hibernation image is created. Otherwise the memory reserved by the
balloon gets written to the swap device, but there is no content in
these pages that needs to be preserved.

Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/cmm.c | 61 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 11 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index b201135..5e5f384 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -44,6 +45,7 @@ static volatile long cmm_pages_target;
 static volatile long cmm_timed_pages_target;
 static long cmm_timeout_pages;
 static long cmm_timeout_seconds;
+static int cmm_suspended;
 
 static struct cmm_page_array *cmm_page_list;
 static struct cmm_page_array *cmm_timed_page_list;
@@ -147,9 +149,9 @@ cmm_thread(void *dummy)
 
 	while (1) {
 		rc = wait_event_interruptible(cmm_thread_wait,
-			(cmm_pages != cmm_pages_target ||
-			 cmm_timed_pages != cmm_timed_pages_target ||
-			 kthread_should_stop()));
+			(!cmm_suspended && (cmm_pages != cmm_pages_target ||
+			 cmm_timed_pages != cmm_timed_pages_target)) ||
+			 kthread_should_stop());
 		if (kthread_should_stop() || rc == -ERESTARTSYS) {
 			cmm_pages_target = cmm_pages;
 			cmm_timed_pages_target = cmm_timed_pages;
@@ -411,6 +413,38 @@ cmm_smsg_target(char *from, char *msg)
 
 static struct ctl_table_header *cmm_sysctl_header;
 
+static int cmm_suspend(void)
+{
+	cmm_suspended = 1;
+	cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
+	cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
+	return 0;
+}
+
+static int cmm_resume(void)
+{
+	cmm_suspended = 0;
+	cmm_kick_thread();
+	return 0;
+}
+
+static int cmm_power_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	switch (event) {
+	case PM_POST_HIBERNATION:
+		return cmm_resume();
+	case PM_HIBERNATION_PREPARE:
+		return cmm_suspend();
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block cmm_power_notifier = {
+	.notifier_call = cmm_power_event,
+};
+
 static int
 cmm_init (void)
 {
@@ -419,7 +453,7 @@ cmm_init (void)
 #ifdef CONFIG_CMM_PROC
 	cmm_sysctl_header = register_sysctl_table(cmm_dir_table);
 	if (!cmm_sysctl_header)
-		goto out;
+		goto out_sysctl;
 #endif
 #ifdef CONFIG_CMM_IUCV
 	rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
 	if (rc < 0)
 		goto out_smsg;
 #endif
 	rc = register_oom_notifier(&cmm_oom_nb);
 	if (rc < 0)
 		goto out_oom_notify;
+	rc = register_pm_notifier(&cmm_power_notifier);
+	if (rc)
+		goto out_pm;
 	init_waitqueue_head(&cmm_thread_wait);
 	init_timer(&cmm_timer);
 	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
 	rc = IS_ERR(cmm_thread_ptr) ? PTR_ERR(cmm_thread_ptr) : 0;
-	if (!rc)
-		goto out;
-	/*
-	 * kthread_create failed. undo all the stuff from above again.
-	 */
-	unregister_oom_notifier(&cmm_oom_nb);
+	if (rc)
+		goto out_kthread;
+	return 0;
 
+out_kthread:
+	unregister_pm_notifier(&cmm_power_notifier);
+out_pm:
+	unregister_oom_notifier(&cmm_oom_nb);
 out_oom_notify:
 #ifdef CONFIG_CMM_IUCV
 	smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
 out_smsg:
 #endif
 #ifdef CONFIG_CMM_PROC
 	unregister_sysctl_table(cmm_sysctl_header);
+out_sysctl:
 #endif
-out:
 	return rc;
 }
 
@@ -456,6 +494,7 @@ static void
 cmm_exit(void)
 {
 	kthread_stop(cmm_thread_ptr);
+	unregister_pm_notifier(&cmm_power_notifier);
 	unregister_oom_notifier(&cmm_oom_nb);
 	cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
 	cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
--
cgit v1.1


From 6a985c6194017de2c062916ad1cd00dee0302c40 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Mon, 7 Dec 2009 12:52:11 +0100
Subject: [S390] s390: use change recording override for kernel mapping

We don't need the dirty bit if a write access is done via the kernel
mapping. In that case SetPageDirty and friends are used anyway; there
is no need to do that a second time. We can use the change-recording
override function for the kernel mapping, if available.

Signed-off-by: Christian Borntraeger
Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/vmem.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/s390/mm')

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 5f91a38..300ab01 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -70,8 +70,12 @@ static pte_t __ref *vmem_pte_alloc(void)
 		pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t));
 	if (!pte)
 		return NULL;
-	clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY,
-		    PTRS_PER_PTE * sizeof(pte_t));
+	if (MACHINE_HAS_HPAGE)
+		clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY | _PAGE_CO,
+			    PTRS_PER_PTE * sizeof(pte_t));
+	else
+		clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY,
+			    PTRS_PER_PTE * sizeof(pte_t));
 	return pte;
 }
 
@@ -112,7 +116,8 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 		if (MACHINE_HAS_HPAGE && !(address & ~HPAGE_MASK) &&
 		    (address + HPAGE_SIZE <= start + size) &&
 		    (address >= HPAGE_SIZE)) {
-			pte_val(pte) |= _SEGMENT_ENTRY_LARGE;
+			pte_val(pte) |= _SEGMENT_ENTRY_LARGE |
+					_SEGMENT_ENTRY_CO;
 			pmd_val(*pm_dir) = pte_val(pte);
 			address += HPAGE_SIZE - PAGE_SIZE;
 			continue;
--
cgit v1.1
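
The effect of this last patch is easiest to see as a PTE-template
decision: when the machine offers the change-recording override,
kernel-mapping page table entries get the _PAGE_CO bit, so the hardware
no longer records change state that SetPageDirty and friends already
track in software. A compact model follows; the bit values are
placeholders rather than the real s390 definitions, and
machine_has_hpage stands in for the MACHINE_HAS_HPAGE facility test:

#include <stdio.h>

/* Placeholder bit values for illustration only. */
#define _PAGE_TYPE_EMPTY	0x400UL
#define _PAGE_CO		0x100UL	/* change-recording override */

static int machine_has_hpage = 1;	/* assumed facility check */

/* Mirrors the vmem_pte_alloc() idea: OR in _PAGE_CO when available. */
static unsigned long empty_pte_template(void)
{
	if (machine_has_hpage)
		return _PAGE_TYPE_EMPTY | _PAGE_CO;
	return _PAGE_TYPE_EMPTY;
}

int main(void)
{
	printf("empty pte template: 0x%lx\n", empty_pte_template());
	return 0;
}
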