From b2d0994b1301fc3a6a89e1889578dac9227840e3 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:37 -0700 Subject: futex: update futex commentary Impact: cleanup The futex_hash_bucket can be a bit confusing when first looking at the code as it is a shared queue (and futex_q isn't a queue at all, but rather an element on the queue). The mmap_sem is no longer held outside of the futex_handle_fault() routine, yet numerous comments refer to it. The fshared argument is no an integer. I left some of these comments along as they are simply removed in future patches. Some of the commentary refering to futexes by virtual page mappings was not very clear, and completely accurate (as for shared futexes both the page and the offset are used to determine the key). For the purposes of the function description, just referring to "the futex" seems sufficient. With hashed futexes we now access the page after the hash-bucket is locked, and not only after it is enqueued. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075537.9856.29954.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 438701a..e6a4d72 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -114,7 +114,9 @@ struct futex_q { }; /* - * Split the global futex_lock into every hash list lock. + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task + * waiting on a futex. */ struct futex_hash_bucket { spinlock_t lock; @@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key) /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex - * @shared: NULL for a PROCESS_PRIVATE futex, - * ¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. * * Returns a negative error code or 0 @@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key) * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * - * fshared is NULL for PROCESS_PRIVATE futexes - * For other futexes, it points to ¤t->mm->mmap_sem and - * caller must have taken the reader lock. but NOT any spinlocks. + * lock_page() might sleep, the caller should not hold a spinlock. */ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { @@ -589,10 +588,9 @@ static void wake_futex(struct futex_q *q) * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. * - * A memory barrier is required here to prevent the following store - * to lock_ptr from getting ahead of the wakeup. Clearing the lock - * at the end of wake_up_all() does not prevent this store from - * moving. + * A memory barrier is required here to prevent the following store to + * lock_ptr from getting ahead of the wakeup. Clearing the lock at the + * end of wake_up() does not prevent this store from moving. */ smp_wmb(); q->lock_ptr = NULL; @@ -693,8 +691,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } /* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: + * Wake up waiters matching bitset queued on this futex (uaddr). */ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { @@ -1076,11 +1073,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the user space variable. This must be atomic as we have * to preserve the owner died bit here. * - * Note: We write the user space value _before_ changing the - * pi_state because we can fault here. Imagine swapped out - * pages or a fork, which was running right before we acquired - * mmap_sem, that marked all the anonymous memory readonly for - * cow. + * Note: We write the user space value _before_ changing the pi_state + * because we can fault here. Imagine swapped out pages or a fork + * that marked all the anonymous memory readonly for cow. * * Modifying pi_state _before_ the user space value would * leave the pi_state in an inconsistent state when we fault @@ -1188,7 +1183,7 @@ retry: hb = queue_lock(&q); /* - * Access the page AFTER the futex is queued. + * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); @@ -1204,7 +1199,7 @@ retry: * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. * - * for shared futexes, we hold the mmap semaphore, so the mapping + * For shared futexes, we hold the mmap semaphore, so the mapping * cannot have changed since we looked it up in get_futex_key. */ ret = get_futex_value_locked(&uval, uaddr); -- cgit v1.1 From de87fcc124a5d4a171aa32707b3265608ebda6e7 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:46 -0700 Subject: futex: additional (get|put)_futex_key() fixes Impact: fix races futex_requeue and futex_lock_pi still had some bad (get|put)_futex_key() usage. This patch adds the missing put_futex_keys() and corrects a goto in futex_lock_pi() to avoid a double get. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075545.9856.75152.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e6a4d72..4000454 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -802,8 +802,10 @@ retry: ret = get_user(dummy, uaddr2); if (ret) - return ret; + goto out_put_keys; + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); goto retryfull; } @@ -878,6 +880,9 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval, uaddr1); if (!ret) @@ -1453,6 +1458,7 @@ retry_locked: * exit to complete. */ queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); cond_resched(); goto retry; @@ -1595,13 +1601,12 @@ uaddr_faulted: ret = get_user(uval, uaddr); if (!ret) - goto retry; + goto retry_unlocked; - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret; + goto out_put_key; } + /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1705,6 +1710,7 @@ pi_faulted: } ret = get_user(uval, uaddr); + put_futex_key(fshared, &key); if (!ret) goto retry; -- cgit v1.1 From 5eb3dc62fc5986e85715041c23dcf3832812be4b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:52 -0700 Subject: futex: add double_unlock_hb() Impact: cleanup The futex code uses double_lock_hb() which locks the hb->lock's in pointer value order. There is no parallel unlock routine, and the code unlocks them in name order, ignoring pointer value. This patch adds double_unlock_hb() to refactor the duplicated code segments. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075552.9856.48021.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 4000454..e149545 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -690,6 +690,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } } +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 <= hb2) { + spin_unlock(&hb2->lock); + if (hb1 < hb2) + spin_unlock(&hb1->lock); + } else { /* hb1 > hb2 */ + spin_unlock(&hb1->lock); + spin_unlock(&hb2->lock); + } +} + /* * Wake up waiters matching bitset queued on this futex (uaddr). */ @@ -767,9 +780,7 @@ retry: if (unlikely(op_ret < 0)) { u32 dummy; - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); #ifndef CONFIG_MMU /* @@ -833,9 +844,7 @@ retry: ret += op_ret; } - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(fshared, &key2); out_put_key1: @@ -876,9 +885,7 @@ retry: ret = get_futex_value_locked(&curval, uaddr1); if (unlikely(ret)) { - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); @@ -925,9 +932,7 @@ retry: } out_unlock: - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); /* drop_futex_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) -- cgit v1.1 From 16f4993f4e9860715918efd4eeac928f8de1218b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:59 -0700 Subject: futex: use current->time_slack_ns for rt tasks too RT tasks should set their timer slack to 0 on their own. This patch removes the 'if (rt_task()) slack = 0;' block in futex_wait. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Arjan van de Ven LKML-Reference: <20090312075559.9856.28822.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e149545..6579912 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1253,16 +1253,13 @@ retry: if (!abs_time) schedule(); else { - unsigned long slack; - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; hrtimer_init_on_stack(&t.timer, clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); + hrtimer_set_expires_range_ns(&t.timer, *abs_time, + current->timer_slack_ns); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) -- cgit v1.1 From e8f6386c01a5699c115bdad10271a24076364c97 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:56:06 -0700 Subject: futex: unlock before returning -EFAULT Impact: rt-mutex failure case fix futex_lock_pi can potentially return -EFAULT with the rt_mutex held. This seems like the wrong thing to do as userspace should assume -EFAULT means the lock was not taken. Even if it could figure this out, we'd be leaving the pi_state->owner in an inconsistent state. This patch unlocks the rt_mutex prior to returning -EFAULT to userspace. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075606.9856.88729.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 6579912..c980a55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1567,6 +1567,13 @@ retry_locked: } } + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock it and return the fault to userspace. + */ + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) + rt_mutex_unlock(&q.pi_state->pi_mutex); + /* Unqueue and drop the lock */ unqueue_me_pi(&q); -- cgit v1.1 From e4dc5b7a36a49eff97050894cf1b3a9a02523717 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:56:13 -0700 Subject: futex: clean up fault logic Impact: cleanup Older versions of the futex code held the mmap_sem which had to be dropped in order to call get_user(), so a two-pronged fault handling mechanism was employed to handle faults of the atomic operations. The mmap_sem is no longer held, so get_user() should be adequate. This patch greatly simplifies the logic and improves legibility. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075612.9856.48612.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 126 +++++++++++++++++---------------------------------------- 1 file changed, 36 insertions(+), 90 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index c980a55..9c97f67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -298,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) return ret ? -EFAULT : 0; } -/* - * Fault handling. - */ -static int futex_handle_fault(unsigned long address, int attempt) -{ - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - int ret = -EFAULT; - - if (attempt > 2) - return ret; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - if (vma && address >= vma->vm_start && - (vma->vm_flags & VM_WRITE)) { - int fault; - fault = handle_mm_fault(mm, vma, address, 1); - if (unlikely((fault & VM_FAULT_ERROR))) { -#if 0 - /* XXX: let's do this when we verify it is OK */ - if (ret & VM_FAULT_OOM) - ret = -ENOMEM; -#endif - } else { - ret = 0; - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - } - } - up_read(&mm->mmap_sem); - return ret; -} /* * PI code: @@ -760,9 +725,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; - int ret, op_ret, attempt = 0; + int ret, op_ret; -retryfull: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -773,9 +738,8 @@ retryfull: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); -retry: double_lock_hb(hb1, hb2); - +retry_private: op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { u32 dummy; @@ -796,28 +760,16 @@ retry: goto out_put_keys; } - /* - * futex_atomic_op_inuser needs to both read and write - * *(int __user *)uaddr2, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - */ - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr2, - attempt); - if (ret) - goto out_put_keys; - goto retry; - } - ret = get_user(dummy, uaddr2); if (ret) goto out_put_keys; + if (!fshared) + goto retry_private; + put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - goto retryfull; + goto retry; } head = &hb1->chain; @@ -877,6 +829,7 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); +retry_private: double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -887,15 +840,16 @@ retry: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); - ret = get_user(curval, uaddr1); + if (ret) + goto out_put_keys; - if (!ret) - goto retry; + if (!fshared) + goto retry_private; - goto out_put_keys; + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + goto retry; } if (curval != *cmpval) { ret = -EAGAIN; @@ -1070,7 +1024,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret, attempt = 0; + int ret; /* Owner died? */ if (!pi_state->owner) @@ -1141,7 +1095,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, attempt++); + ret = get_user(uval, uaddr); spin_lock(q->lock_ptr); @@ -1190,6 +1144,7 @@ retry: if (unlikely(ret != 0)) goto out; +retry_private: hb = queue_lock(&q); /* @@ -1216,13 +1171,16 @@ retry: if (unlikely(ret)) { queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); + if (ret) + goto out_put_key; - if (!ret) - goto retry; - goto out; + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q.key); + goto retry; } ret = -EWOULDBLOCK; if (unlikely(uval != val)) { @@ -1356,7 +1314,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, lock_taken, ownerdied = 0, attempt = 0; + int ret, lock_taken, ownerdied = 0; if (refill_pi_state_cache()) return -ENOMEM; @@ -1376,7 +1334,7 @@ retry: if (unlikely(ret != 0)) goto out; -retry_unlocked: +retry_private: hb = queue_lock(&q); retry_locked: @@ -1601,18 +1559,15 @@ uaddr_faulted: */ queue_unlock(&q, hb); - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, attempt); - if (ret) - goto out_put_key; - goto retry_unlocked; - } - ret = get_user(uval, uaddr); - if (!ret) - goto retry_unlocked; + if (ret) + goto out_put_key; - goto out_put_key; + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q.key); + goto retry; } @@ -1628,7 +1583,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared) u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; - int ret, attempt = 0; + int ret; retry: if (get_user(uval, uaddr)) @@ -1644,7 +1599,6 @@ retry: goto out; hb = hash_futex(&key); -retry_unlocked: spin_lock(&hb->lock); /* @@ -1709,17 +1663,9 @@ pi_faulted: * we have to drop the mmap_sem in order to call get_user(). */ spin_unlock(&hb->lock); - - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, attempt); - if (ret) - goto out; - uval = 0; - goto retry_unlocked; - } + put_futex_key(fshared, &key); ret = get_user(uval, uaddr); - put_futex_key(fshared, &key); if (!ret) goto retry; -- cgit v1.1 From f061d35150003b7fd5b133d14d66a74500fdaa60 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 15:11:18 -0700 Subject: futex: remove the pointer math from double_unlock_hb Impact: simplify code I mistakenly included the pointer value ordering in the double_unlock_hb() in my previous patch. It's only necessary in the double_lock_hb() function. This patch removes it. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312221118.11146.68610.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 9c97f67..2331b73 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -658,14 +658,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - if (hb1 <= hb2) { - spin_unlock(&hb2->lock); - if (hb1 < hb2) - spin_unlock(&hb1->lock); - } else { /* hb1 > hb2 */ - spin_unlock(&hb1->lock); - spin_unlock(&hb2->lock); - } + spin_unlock(&hb1->lock); + spin_unlock(&hb2->lock); } /* -- cgit v1.1 From 88f502fedba82eff252b6420e8b8328e4ae25c67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:32:07 +0100 Subject: futex: remove the pointer math from double_unlock_hb, fix Impact: fix double unlock crash Thomas Gleixner noticed that the simplified double_unlock_hb() became ... too unsophisticated: in the hb1 == hb2 case it will do a double unlock. Reported-by: Thomas Gleixner Cc: Darren Hart LKML-Reference: <20090312221118.11146.68610.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 2331b73..6b50a02 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -659,7 +659,8 @@ static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); - spin_unlock(&hb2->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); } /* -- cgit v1.1