From 6f4e16f8338923e9fd89009ec9cb4a5a3d770983 Mon Sep 17 00:00:00 2001
From: kib
Date: Wed, 30 May 2012 16:06:38 +0000
Subject: Add a rangelock implementation, intended to be used for
 range-locking the i/o regions of the vnode data space.

The implementation is quite simple-minded: it uses a list of lock
requests, ordered by arrival time.  Each request may be for read or
for write.  The implementation is a fair FIFO.

MFC after:  2 months
---
 sys/conf/files            |   1 +
 sys/kern/kern_rangelock.c | 246 ++++++++++++++++++++++++++++++++++++++++++++++
 sys/kern/kern_thread.c    |   3 +
 sys/kern/vfs_subr.c       |   2 +
 sys/sys/proc.h            |   1 +
 sys/sys/rangelock.h       |  78 +++++++++++++++
 sys/sys/vnode.h           |  11 +++
 7 files changed, 342 insertions(+)
 create mode 100644 sys/kern/kern_rangelock.c
 create mode 100644 sys/sys/rangelock.h

diff --git a/sys/conf/files b/sys/conf/files
index da90069..ad460ba 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2562,6 +2562,7 @@ kern/kern_priv.c		standard
 kern/kern_proc.c		standard
 kern/kern_prot.c		standard
 kern/kern_racct.c		standard
+kern/kern_rangelock.c		standard
 kern/kern_rctl.c		standard
 kern/kern_resource.c		standard
 kern/kern_rmlock.c		standard
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
new file mode 100644
index 0000000..1b4dfd6
--- /dev/null
+++ b/sys/kern/kern_rangelock.c
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+struct rl_q_entry {
+        TAILQ_ENTRY(rl_q_entry) rl_q_link;
+        off_t   rl_q_start, rl_q_end;
+        int     rl_q_flags;
+};
+
+static uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+        rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+static struct rl_q_entry *
+rlqentry_alloc(void)
+{
+
+        return (uma_zalloc(rl_entry_zone, M_WAITOK));
+}
+
+void
+rlqentry_free(struct rl_q_entry *rleq)
+{
+
+        uma_zfree(rl_entry_zone, rleq);
+}
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+        TAILQ_INIT(&lock->rl_waiters);
+        lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+        KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+/*
+ * Verifies the supplied rl_q_entries for compatibility.  Returns true
+ * if the rangelock queue entries are not compatible, false if they are.
+ *
+ * Two entries are compatible if their ranges do not overlap, or both
+ * entries are for read.
+ */
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+        if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+            (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+                return (0);
+        if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+                return (1);
+        return (0);
+}
+
+/*
+ * Recalculate the lock->rl_currdep after an unlock.
+ */
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+        struct rl_q_entry *entry, *entry1, *whead;
+
+        if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+            lock->rl_currdep != NULL)
+                lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+        for (entry = lock->rl_currdep; entry != NULL;
+            entry = TAILQ_NEXT(entry, rl_q_link)) {
+                TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+                        if (rangelock_incompatible(entry, entry1))
+                                goto out;
+                        if (entry1 == entry)
+                                break;
+                }
+        }
+out:
+        lock->rl_currdep = entry;
+        TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+                if (whead == lock->rl_currdep)
+                        break;
+                if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+                        whead->rl_q_flags |= RL_LOCK_GRANTED;
+                        wakeup(whead);
+                }
+        }
+}
+
+static void
+rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
+    struct mtx *ilk)
+{
+
+        MPASS(lock != NULL && entry != NULL && ilk != NULL);
+        mtx_assert(ilk, MA_OWNED);
+        KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
+
+        TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+        rangelock_calc_block(lock);
+        mtx_unlock(ilk);
+        if (curthread->td_rlqe == NULL)
+                curthread->td_rlqe = entry;
+        else
+                rlqentry_free(entry);
+}
+
+void
+rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+{
+
+        MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+
+        mtx_lock(ilk);
+        rangelock_unlock_locked(lock, cookie, ilk);
+}
+
+/*
+ * Unlock the sub-range of granted lock.
+ */
+void *
+rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
+    off_t end, struct mtx *ilk)
+{
+        struct rl_q_entry *entry;
+
+        MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+        entry = cookie;
+        KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
+            ("Unlocking non-granted lock"));
+        KASSERT(entry->rl_q_start == start, ("wrong start"));
+        KASSERT(entry->rl_q_end >= end, ("wrong end"));
+
+        mtx_lock(ilk);
+        if (entry->rl_q_end == end) {
+                rangelock_unlock_locked(lock, cookie, ilk);
+                return (NULL);
+        }
+        entry->rl_q_end = end;
+        rangelock_calc_block(lock);
+        mtx_unlock(ilk);
+        return (cookie);
+}
+
+/*
+ * Add the lock request to the queue of the pending requests for
+ * rangelock.  Sleep until the request can be granted.
+ */
+static void *
+rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
+    struct mtx *ilk)
+{
+        struct rl_q_entry *entry;
+        struct thread *td;
+
+        MPASS(lock != NULL && ilk != NULL);
+
+        td = curthread;
+        if (td->td_rlqe != NULL) {
+                entry = td->td_rlqe;
+                td->td_rlqe = NULL;
+        } else
+                entry = rlqentry_alloc();
+        MPASS(entry != NULL);
+        entry->rl_q_flags = mode;
+        entry->rl_q_start = start;
+        entry->rl_q_end = end;
+
+        mtx_lock(ilk);
+        /*
+         * XXXKIB TODO. Check that a thread does not try to enqueue a
+         * lock that is incompatible with another request from the same
+         * thread.
+         */
+
+        TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
+        if (lock->rl_currdep == NULL)
+                lock->rl_currdep = entry;
+        rangelock_calc_block(lock);
+        while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+                msleep(entry, ilk, 0, "range", 0);
+        mtx_unlock(ilk);
+        return (entry);
+}
+
+void *
+rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+        return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+        return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk));
+}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 3fbe96f..69a416e 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/rangelock.h>
 #include
 #include
 #include
@@ -205,6 +206,7 @@ thread_init(void *mem, int size, int flags)
 
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
+	td->td_rlqe = NULL;
 	EVENTHANDLER_INVOKE(thread_init, td);
 	td->td_sched = (struct td_sched *)&td[1];
 	umtx_thread_init(td);
@@ -222,6 +224,7 @@ thread_fini(void *mem, int size)
 
 	td = (struct thread *)mem;
 	EVENTHANDLER_INVOKE(thread_fini, td);
+	rlqentry_free(td->td_rlqe);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a06ba31..8d999c3 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1027,6 +1027,7 @@ alloc:
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
+	rangelock_init(&vp->v_rl);
 
 	*vpp = vp;
 	return (0);
@@ -2468,6 +2469,7 @@ vdropl(struct vnode *vp)
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
+	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	mtx_destroy(BO_MTX(bo));
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 389bf14..8e54073 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -213,6 +213,7 @@ struct thread {
 	struct seltd	*td_sel;	/* Select queue/channel. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
+	struct rl_q_entry *td_rlqe;	/* (k) Associated range lock entry. */
 	struct umtx_q	*td_umtxq;	/* (c?) Link for when we're blocked. */
 	lwpid_t		td_tid;		/* (b) Thread ID. */
 	sigqueue_t	td_sigqueue;	/* (c) Sigs arrived, not delivered. */
diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h
new file mode 100644
index 0000000..fb0d252
--- /dev/null
+++ b/sys/sys/rangelock.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_RANGELOCK_H
+#define _SYS_RANGELOCK_H
+
+#include <sys/queue.h>
+
+#define RL_LOCK_READ		0x0001
+#define RL_LOCK_WRITE		0x0002
+#define RL_LOCK_TYPE_MASK	0x0003
+#define RL_LOCK_GRANTED		0x0004
+
+struct rl_q_entry;
+
+/*
+ * The structure representing the range lock.  Caller may request
+ * read or write access to the range of bytes.  Access is granted if
+ * all existing lock owners are compatible with the request.  Two lock
+ * owners are compatible if their ranges do not overlap, or both
+ * owners are for read.
+ *
+ * Access to the structure itself is synchronized with the externally
+ * supplied mutex.
+ *
+ * rl_waiters is the queue of lock requests in the order of arrival.
+ * rl_currdep is the first lock request that cannot be granted now due
+ * to the preceding requests conflicting with it.
+ */
+struct rangelock {
+        TAILQ_HEAD(, rl_q_entry) rl_waiters;
+        struct rl_q_entry *rl_currdep;
+};
+
+#ifdef _KERNEL
+
+struct mtx;
+
+void     rangelock_init(struct rangelock *lock);
+void     rangelock_destroy(struct rangelock *lock);
+void     rangelock_unlock(struct rangelock *lock, void *cookie,
+            struct mtx *ilk);
+void    *rangelock_unlock_range(struct rangelock *lock, void *cookie,
+            off_t start, off_t end, struct mtx *ilk);
+void    *rangelock_rlock(struct rangelock *lock, off_t start, off_t end,
+            struct mtx *ilk);
+void    *rangelock_wlock(struct rangelock *lock, off_t start, off_t end,
+            struct mtx *ilk);
+void     rlqentry_free(struct rl_q_entry *rlqe);
+
+#endif  /* _KERNEL */
+
+#endif  /* _SYS_RANGELOCK_H */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index db22d47..7480bfc 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include <sys/rangelock.h>
 #include
 #include
 #include
@@ -165,6 +166,7 @@ struct vnode {
 	struct vpollinfo *v_pollinfo;	/* i Poll events, p for *v_pi */
 	struct label *v_label;		/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
+	struct rangelock v_rl;		/* Byte-range lock */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
@@ -679,6 +681,15 @@ int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 
+#define vn_rangelock_unlock(vp, cookie)				\
+	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
+#define vn_rangelock_unlock_range(vp, cookie, start, end)	\
+	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end),	\
+	    VI_MTX(vp))
+#define vn_rangelock_rlock(vp, start, end)			\
+	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+#define vn_rangelock_wlock(vp, start, end)			\
+	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 void	vfs_timestamp(struct timespec *);
--
cgit v1.1
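
For context, here is a minimal usage sketch of the vnode wrappers added above; it is not part of the commit.  The function foo_read() and its uio handling are hypothetical, invented only for illustration; what comes from the patch itself are the vn_rangelock_rlock()/vn_rangelock_unlock() macros, the [start, end) byte range they take, and the opaque cookie the lock routines return.

/*
 * Hypothetical read path of a filesystem, sketching how the new vnode
 * range lock is intended to bracket an i/o on the vnode data space.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/vnode.h>

static int
foo_read(struct vnode *vp, struct uio *uio)
{
        void *rl_cookie;
        off_t start, end;
        int error;

        /* The locked range is [start, end); end is exclusive. */
        start = uio->uio_offset;
        end = uio->uio_offset + uio->uio_resid;

        /* Sleeps until no conflicting (overlapping write) request precedes us. */
        rl_cookie = vn_rangelock_rlock(vp, start, end);

        error = 0;              /* ... perform the actual i/o here ... */

        /* Drop the range lock; the cookie identifies the granted entry. */
        vn_rangelock_unlock(vp, rl_cookie);
        return (error);
}

The cookie is the granted queue entry itself (rangelock_enqueue() returns the entry, and the unlock routines cast it back), so it must be passed back to the unlock macro exactly as returned.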