From be5201b0d1c733f2bfb8fa4bef0225259f9acb2c Mon Sep 17 00:00:00 2001
From: alc
Date: Mon, 20 Dec 2010 22:49:31 +0000
Subject: Introduce vm_fault_hold() and use it to (1) eliminate a long-standing
 race condition in proc_rwmem() and to (2) simplify the implementation of the
 cxgb driver's vm_fault_hold_user_pages().  Specifically, in proc_rwmem() the
 requested read or write could fail because the targeted page could be
 reclaimed between the calls to vm_fault() and vm_page_hold().

In collaboration with:	kib@
MFC after:	6 weeks
---
 sys/kern/sys_process.c | 80 +++++++++++---------------------------------------
 1 file changed, 17 insertions(+), 63 deletions(-)

(limited to 'sys/kern/sys_process.c')

diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index e5d9842..3733de3 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -237,10 +237,9 @@ int
 proc_rwmem(struct proc *p, struct uio *uio)
 {
         vm_map_t map;
-        vm_object_t backing_object, object;
         vm_offset_t pageno;             /* page number */
         vm_prot_t reqprot;
-        int error, writing;
+        int error, fault_flags, page_offset, writing;
 
         /*
          * Assert that someone has locked this vmspace.  (Should be
@@ -255,26 +254,24 @@ proc_rwmem(struct proc *p, struct uio *uio)
          */
         map = &p->p_vmspace->vm_map;
 
+        /*
+         * If we are writing, then we request vm_fault() to create a private
+         * copy of each page.  Since these copies will not be writeable by the
+         * process, we must explicitly request that they be dirtied.
+         */
         writing = uio->uio_rw == UIO_WRITE;
         reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
+        fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
 
         /*
          * Only map in one page at a time.  We don't have to, but it
          * makes things easier.  This way is trivial - right?
          */
         do {
-                vm_map_t tmap;
                 vm_offset_t uva;
-                int page_offset;        /* offset into page */
-                vm_map_entry_t out_entry;
-                vm_prot_t out_prot;
-                boolean_t wired;
-                vm_pindex_t pindex;
                 u_int len;
                 vm_page_t m;
 
-                object = NULL;
-
                 uva = (vm_offset_t)uio->uio_offset;
 
                 /*
@@ -289,10 +286,10 @@ proc_rwmem(struct proc *p, struct uio *uio)
                 len = min(PAGE_SIZE - page_offset, uio->uio_resid);
 
                 /*
-                 * Fault the page on behalf of the process
+                 * Fault and hold the page on behalf of the process.
                  */
-                error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
-                if (error) {
+                error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
+                if (error != KERN_SUCCESS) {
                         if (error == KERN_RESOURCE_SHORTAGE)
                                 error = ENOMEM;
                         else
@@ -301,61 +298,18 @@ proc_rwmem(struct proc *p, struct uio *uio)
                 }
 
                 /*
-                 * Now we need to get the page.  out_entry and wired
-                 * aren't used.  One would think the vm code
-                 * would be a *bit* nicer...  We use tmap because
-                 * vm_map_lookup() can change the map argument.
-                 */
-                tmap = map;
-                error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
-                    &object, &pindex, &out_prot, &wired);
-                if (error) {
-                        error = EFAULT;
-                        break;
-                }
-                VM_OBJECT_LOCK(object);
-                while ((m = vm_page_lookup(object, pindex)) == NULL &&
-                    !writing &&
-                    (backing_object = object->backing_object) != NULL) {
-                        /*
-                         * Allow fallback to backing objects if we are reading.
-                         */
-                        VM_OBJECT_LOCK(backing_object);
-                        pindex += OFF_TO_IDX(object->backing_object_offset);
-                        VM_OBJECT_UNLOCK(object);
-                        object = backing_object;
-                }
-                if (writing && m != NULL) {
-                        vm_page_dirty(m);
-                        vm_pager_page_unswapped(m);
-                }
-                VM_OBJECT_UNLOCK(object);
-                if (m == NULL) {
-                        vm_map_lookup_done(tmap, out_entry);
-                        error = EFAULT;
-                        break;
-                }
-
-                /*
-                 * Hold the page in memory.
-                 */
-                vm_page_lock(m);
-                vm_page_hold(m);
-                vm_page_unlock(m);
-
-                /*
-                 * We're done with tmap now.
-                 */
-                vm_map_lookup_done(tmap, out_entry);
-
-                /*
                  * Now do the i/o move.
                  */
                 error = uiomove_fromphys(&m, page_offset, len, uio);
 
                 /* Make the I-cache coherent for breakpoints. */
-                if (!error && writing && (out_prot & VM_PROT_EXECUTE))
-                        vm_sync_icache(map, uva, len);
+                if (writing && error == 0) {
+                        vm_map_lock_read(map);
+                        if (vm_map_check_protection(map, pageno, pageno +
+                            PAGE_SIZE, VM_PROT_EXECUTE))
+                                vm_sync_icache(map, uva, len);
+                        vm_map_unlock_read(map);
+                }
 
                 /*
                  * Release the page.
--
cgit v1.1
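
The core of the change is that vm_fault_hold() returns the faulted page already held, so there is no longer a window in which the page can be reclaimed between the fault and the hold, which is exactly the gap the old vm_fault()/vm_map_lookup()/vm_page_hold() sequence left open.  The fragment below is a minimal sketch of that fault-and-hold pattern for reference, not part of the patch: copy_one_page() is an invented name, the usual sys_process.c headers are assumed, non-shortage fault failures are assumed to map to EFAULT as in the pre-patch code, and the hold is assumed to be dropped with vm_page_unhold() under the page lock, matching the elided "Release the page" code.

/*
 * Hypothetical helper illustrating the vm_fault_hold() pattern used by
 * proc_rwmem() after this patch.  The caller is assumed to have computed
 * pageno (page-aligned user VA), page_offset, len, reqprot and fault_flags
 * exactly as proc_rwmem() does.
 */
static int
copy_one_page(vm_map_t map, vm_offset_t pageno, vm_prot_t reqprot,
    int fault_flags, int page_offset, u_int len, struct uio *uio)
{
        vm_page_t m;
        int error;

        /* Fault the page in and get it back already held. */
        error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
        if (error != KERN_SUCCESS)
                return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT);

        /* The hold keeps the page from being reclaimed during the copy. */
        error = uiomove_fromphys(&m, page_offset, len, uio);

        /* Drop the hold now that the copy is done (assumed release step). */
        vm_page_lock(m);
        vm_page_unhold(m);
        vm_page_unlock(m);
        return (error);
}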