Diffstat (limited to 'contrib/bind9/lib/dns/rbtdb.c')
-rw-r--r-- | contrib/bind9/lib/dns/rbtdb.c | 1950
1 file changed, 1445 insertions, 505 deletions
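
One of the larger additions in this patch is adjust_quantum(), which works out how many red-black-tree nodes can be deleted in the time between two incoming queries and smooths that estimate from one cleanup pass to the next. The standalone sketch below reproduces just that arithmetic; adjust_quantum_sketch() is a hypothetical rework for illustration, with the ISC-specific dns_pps global and isc_time_t measurements replaced by plain parameters.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the smoothing logic added in adjust_quantum(): given the
 * previous quantum and how long the last deletion pass took, estimate
 * how many nodes fit into one inter-query interval and smooth the
 * result.  ISC types are replaced with plain parameters here.
 */
static unsigned int
adjust_quantum_sketch(unsigned int old, unsigned int pps, uint64_t usecs) {
	unsigned int interval, new;

	if (pps < 100)
		pps = 100;
	interval = 1000000 / pps;	/* microseconds between queries */
	if (interval == 0)
		interval = 1;

	if (usecs == 0) {
		/* Couldn't measure the pass; try twice as many next time. */
		old *= 2;
		if (old > 1000)
			old = 1000;
		return (old);
	}

	new = (unsigned int)((uint64_t)old * interval / usecs);
	if (new == 0)
		new = 1;
	else if (new > 1000)
		new = 1000;

	/* Smooth: weight the previous estimate 3:1 over the new one. */
	return ((new + old * 3) / 4);
}

int
main(void) {
	/* E.g. previous quantum 100, 2000 qps, last pass took 800 usec. */
	printf("%u\n", adjust_quantum_sketch(100, 2000, 800));
	return (0);
}

The 3:1 weighting toward the previous value keeps the per-pass deletion quantum from oscillating when individual timing samples are noisy, while the 1..1000 clamp bounds the work done in any single pass of dns_rbt_destroy2().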
diff --git a/contrib/bind9/lib/dns/rbtdb.c b/contrib/bind9/lib/dns/rbtdb.c index 8930d35..cd25608 100644 --- a/contrib/bind9/lib/dns/rbtdb.c +++ b/contrib/bind9/lib/dns/rbtdb.c @@ -15,7 +15,9 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.168.2.11.2.26 2006/03/02 23:18:20 marka Exp $ */ +/* $Id: rbtdb.c,v 1.196.18.41 2006/10/26 06:04:29 marka Exp $ */ + +/*! \file */ /* * Principal Author: Bob Halley @@ -32,12 +34,15 @@ #include <isc/rwlock.h> #include <isc/string.h> #include <isc/task.h> +#include <isc/time.h> #include <isc/util.h> +#include <dns/acache.h> #include <dns/db.h> #include <dns/dbiterator.h> #include <dns/events.h> #include <dns/fixedname.h> +#include <dns/lib.h> #include <dns/log.h> #include <dns/masterdump.h> #include <dns/rbt.h> @@ -46,6 +51,8 @@ #include <dns/rdatasetiter.h> #include <dns/rdataslab.h> #include <dns/result.h> +#include <dns/view.h> +#include <dns/zone.h> #include <dns/zonekey.h> #ifdef DNS_RBTDB_VERSION64 @@ -60,7 +67,7 @@ #define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4') #endif -/* +/*% * Note that "impmagic" is not the first four bytes of the struct, so * ISC_MAGIC_VALID cannot be used. */ @@ -69,7 +76,7 @@ #ifdef DNS_RBTDB_VERSION64 typedef isc_uint64_t rbtdb_serial_t; -/* +/*% * Make casting easier in symbolic debuggers by using different names * for the 64 bit version. */ @@ -98,6 +105,81 @@ typedef isc_uint32_t rbtdb_rdatatype_t; RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any) /* + * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0. + * Using rwlock is effective with regard to lookup performance only when + * it is implemented in an efficient way. + * Otherwise, it is generally wise to stick to the simple locking since rwlock + * would require more memory or can even make lookups slower due to its own + * overhead (when it internally calls mutex locks). + */ +#ifdef ISC_RWLOCK_USEATOMIC +#define DNS_RBTDB_USERWLOCK 1 +#else +#define DNS_RBTDB_USERWLOCK 0 +#endif + +#if DNS_RBTDB_USERWLOCK +#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define RBTDB_LOCK(l, t) RWLOCK((l), (t)) +#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) +#else +#define RBTDB_INITLOCK(l) isc_mutex_init(l) +#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l) +#define RBTDB_LOCK(l, t) LOCK(l) +#define RBTDB_UNLOCK(l, t) UNLOCK(l) +#endif + +/* + * Since node locking is sensitive to both performance and memory footprint, + * we need some trick here. If we have both high-performance rwlock and + * high performance and small-memory reference counters, we use rwlock for + * node lock and isc_refcount for node references. In this case, we don't have + * to protect the access to the counters by locks. + * Otherwise, we simply use ordinary mutex lock for node locking, and use + * simple integers as reference counters which is protected by the lock. + * In most cases, we can simply use wrapper macros such as NODE_LOCK and + * NODE_UNLOCK. In some other cases, however, we need to protect reference + * counters first and then protect other parts of a node as read-only data. + * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also + * provided for these special cases. When we can use the efficient backend + * routines, we should only protect the "other members" by NODE_WEAKLOCK(read). + * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical + * section including the access to the reference counter. 
+ * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected + * section is also protected by NODE_STRONGLOCK(). + */ +#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT) +typedef isc_rwlock_t nodelock_t; + +#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define NODE_LOCK(l, t) RWLOCK((l), (t)) +#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) +#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) + +#define NODE_STRONGLOCK(l) ((void)0) +#define NODE_STRONGUNLOCK(l) ((void)0) +#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t) +#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t) +#define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l) +#else +typedef isc_mutex_t nodelock_t; + +#define NODE_INITLOCK(l) isc_mutex_init(l) +#define NODE_DESTROYLOCK(l) DESTROYLOCK(l) +#define NODE_LOCK(l, t) LOCK(l) +#define NODE_UNLOCK(l, t) UNLOCK(l) +#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS + +#define NODE_STRONGLOCK(l) LOCK(l) +#define NODE_STRONGUNLOCK(l) UNLOCK(l) +#define NODE_WEAKLOCK(l, t) ((void)0) +#define NODE_WEAKUNLOCK(l, t) ((void)0) +#define NODE_WEAKDOWNGRADE(l) ((void)0) +#endif + +/* * Allow clients with a virtual time of upto 5 minutes in the past to see * records that would have otherwise have expired. */ @@ -109,8 +191,10 @@ struct noqname { void * nsecsig; }; +typedef struct acachectl acachectl_t; + typedef struct rdatasetheader { - /* + /*% * Locked by the owning node's lock. */ rbtdb_serial_t serial; @@ -119,13 +203,13 @@ typedef struct rdatasetheader { isc_uint16_t attributes; dns_trust_t trust; struct noqname *noqname; - /* + /*%< * We don't use the LIST macros, because the LIST structure has * both head and tail pointers, and is doubly linked. */ struct rdatasetheader *next; - /* + /*%< * If this is the top header for an rdataset, 'next' points * to the top header for the next rdataset (i.e., the next type). * Otherwise, it points up to the header whose down pointer points @@ -133,19 +217,22 @@ typedef struct rdatasetheader { */ struct rdatasetheader *down; - /* + /*%< * Points to the header for the next older version of * this rdataset. */ isc_uint32_t count; - /* + /*%< * Monotonously increased every time this rdataset is bound so that * it is used as the base of the starting point in DNS responses * when the "cyclic" rrset-order is required. Since the ordering * should not be so crucial, no lock is set for the counter for * performance reasons. */ + + acachectl_t *additional_auth; + acachectl_t *additional_glue; } rdatasetheader_t; #define RDATASET_ATTR_NONEXISTENT 0x0001 @@ -154,6 +241,19 @@ typedef struct rdatasetheader { #define RDATASET_ATTR_RETAIN 0x0008 #define RDATASET_ATTR_NXDOMAIN 0x0010 +typedef struct acache_cbarg { + dns_rdatasetadditional_t type; + unsigned int count; + dns_db_t *db; + dns_dbnode_t *node; + rdatasetheader_t *header; +} acache_cbarg_t; + +struct acachectl { + dns_acacheentry_t *entry; + acache_cbarg_t *cbarg; +}; + /* * XXX * When the cache will pre-expire data (due to memory low or other @@ -175,12 +275,14 @@ typedef struct rdatasetheader { #define NXDOMAIN(header) \ (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0) -#define DEFAULT_NODE_LOCK_COUNT 7 /* Should be prime. */ +#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ +#define DEFAULT_CACHE_NODE_LOCK_COUNT 1009 /*%< Should be prime. */ typedef struct { - isc_mutex_t lock; + nodelock_t lock; + /* Protected in the refcount routines. */ + isc_refcount_t references; /* Locked by lock. 
*/ - unsigned int references; isc_boolean_t exiting; } rbtdb_nodelock_t; @@ -195,9 +297,14 @@ typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; typedef struct rbtdb_version { /* Not locked */ rbtdb_serial_t serial; + /* + * Protected in the refcount routines. + * XXXJT: should we change the lock policy based on the refcount + * performance? + */ + isc_refcount_t references; /* Locked by database lock. */ isc_boolean_t writer; - unsigned int references; isc_boolean_t commit_ok; rbtdb_changedlist_t changed_list; ISC_LINK(struct rbtdb_version) link; @@ -208,7 +315,11 @@ typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; typedef struct { /* Unlocked. */ dns_db_t common; +#if DNS_RBTDB_USERWLOCK + isc_rwlock_t lock; +#else isc_mutex_t lock; +#endif isc_rwlock_t tree_lock; unsigned int node_lock_count; rbtdb_nodelock_t * node_locks; @@ -225,15 +336,20 @@ typedef struct { rbtdb_versionlist_t open_versions; isc_boolean_t overmem; isc_task_t * task; + dns_dbnode_t *soanode; + dns_dbnode_t *nsnode; /* Locked by tree_lock. */ dns_rbt_t * tree; isc_boolean_t secure; + + /* Unlocked */ + unsigned int quantum; } dns_rbtdb_t; #define RBTDB_ATTR_LOADED 0x01 #define RBTDB_ATTR_LOADING 0x02 -/* +/*% * Search Context */ typedef struct { @@ -252,7 +368,7 @@ typedef struct { isc_stdtime_t now; } rbtdb_search_t; -/* +/*% * Load Context */ typedef struct { @@ -270,6 +386,30 @@ static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name, dns_rdataset_t *nsec, dns_rdataset_t *nsecsig); +static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset, + dns_rdatasetadditional_t type, + dns_rdatatype_t qtype, + dns_acache_t *acache, + dns_zone_t **zonep, + dns_db_t **dbp, + dns_dbversion_t **versionp, + dns_dbnode_t **nodep, + dns_name_t *fname, + dns_message_t *msg, + isc_stdtime_t now); +static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset, + dns_rdatasetadditional_t type, + dns_rdatatype_t qtype, + dns_acache_t *acache, + dns_zone_t *zone, + dns_db_t *db, + dns_dbversion_t *version, + dns_dbnode_t *node, + dns_name_t *fname); +static isc_result_t rdataset_putadditional(dns_acache_t *acache, + dns_rdataset_t *rdataset, + dns_rdatasetadditional_t type, + dns_rdatatype_t qtype); static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate, @@ -279,7 +419,10 @@ static dns_rdatasetmethods_t rdataset_methods = { rdataset_clone, rdataset_count, NULL, - rdataset_getnoqname + rdataset_getnoqname, + rdataset_getadditional, + rdataset_setadditional, + rdataset_putadditional }; static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp); @@ -403,25 +546,87 @@ free_rbtdb_callback(isc_task_t *task, isc_event_t *event) { free_rbtdb(rbtdb, ISC_TRUE, event); } +/*% + * Work out how many nodes can be deleted in the time between two + * requests to the nameserver. Smooth the resulting number and use it + * as a estimate for the number of nodes to be deleted in the next + * iteration. + */ +static unsigned int +adjust_quantum(unsigned int old, isc_time_t *start) { + unsigned int pps = dns_pps; /* packets per second */ + unsigned int interval; + isc_uint64_t usecs; + isc_time_t end; + unsigned int new; + + if (pps < 100) + pps = 100; + isc_time_now(&end); + + interval = 1000000 / pps; /* interval in usec */ + if (interval == 0) + interval = 1; + usecs = isc_time_microdiff(&end, start); + if (usecs == 0) { + /* + * We were unable to measure the amount of time taken. + * Double the nodes deleted next time. 
+ */ + old *= 2; + if (old > 1000) + old = 1000; + return (old); + } + new = old * interval; + new /= (unsigned int)usecs; + if (new == 0) + new = 1; + else if (new > 1000) + new = 1000; + + /* Smooth */ + new = (new + old * 3) / 4; + + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, + ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new); + + return (new); +} + static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { unsigned int i; isc_ondestroy_t ondest; isc_result_t result; char buf[DNS_NAME_FORMATSIZE]; + isc_time_t start; - REQUIRE(EMPTY(rbtdb->open_versions)); + REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions)); REQUIRE(rbtdb->future_version == NULL); - if (rbtdb->current_version != NULL) + if (rbtdb->current_version != NULL) { + unsigned int refs; + + isc_refcount_decrement(&rbtdb->current_version->references, + &refs); + INSIST(refs == 0); + UNLINK(rbtdb->open_versions, rbtdb->current_version, link); + isc_refcount_destroy(&rbtdb->current_version->references); isc_mem_put(rbtdb->common.mctx, rbtdb->current_version, sizeof(rbtdb_version_t)); + } + if (event == NULL) + rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0; again: if (rbtdb->tree != NULL) { - result = dns_rbt_destroy2(&rbtdb->tree, - (rbtdb->task != NULL) ? 1000 : 0); + isc_time_now(&start); + result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum); if (result == ISC_R_QUOTA) { INSIST(rbtdb->task != NULL); + if (rbtdb->quantum != 0) + rbtdb->quantum = adjust_quantum(rbtdb->quantum, + &start); if (event == NULL) event = isc_event_allocate(rbtdb->common.mctx, NULL, @@ -450,15 +655,17 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { } if (dns_name_dynamic(&rbtdb->common.origin)) dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx); - for (i = 0; i < rbtdb->node_lock_count; i++) - DESTROYLOCK(&rbtdb->node_locks[i].lock); + for (i = 0; i < rbtdb->node_lock_count; i++) { + isc_refcount_destroy(&rbtdb->node_locks[i].references); + NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); + } isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); isc_rwlock_destroy(&rbtdb->tree_lock); isc_refcount_destroy(&rbtdb->references); if (rbtdb->task != NULL) isc_task_detach(&rbtdb->task); - DESTROYLOCK(&rbtdb->lock); + RBTDB_DESTROYLOCK(&rbtdb->lock); rbtdb->common.magic = 0; rbtdb->common.impmagic = 0; ondest = rbtdb->common.ondest; @@ -474,24 +681,31 @@ maybe_free_rbtdb(dns_rbtdb_t *rbtdb) { /* XXX check for open versions here */ + if (rbtdb->soanode != NULL) + dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode); + if (rbtdb->nsnode != NULL) + dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode); + /* * Even though there are no external direct references, there still * may be nodes in use. 
*/ for (i = 0; i < rbtdb->node_lock_count; i++) { - LOCK(&rbtdb->node_locks[i].lock); + NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write); rbtdb->node_locks[i].exiting = ISC_TRUE; - if (rbtdb->node_locks[i].references == 0) + NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write); + if (isc_refcount_current(&rbtdb->node_locks[i].references) + == 0) { inactive++; - UNLOCK(&rbtdb->node_locks[i].lock); + } } if (inactive != 0) { - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); rbtdb->active -= inactive; if (rbtdb->active == 0) want_free = ISC_TRUE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (want_free) { char buf[DNS_NAME_FORMATSIZE]; if (dns_name_dynamic(&rbtdb->common.origin)) @@ -526,15 +740,14 @@ static void currentversion(dns_db_t *db, dns_dbversion_t **versionp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_version_t *version; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); version = rbtdb->current_version; - if (version->references == 0) - PREPEND(rbtdb->open_versions, version, link); - version->references++; - UNLOCK(&rbtdb->lock); + isc_refcount_increment(&version->references, &refs); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); *versionp = (dns_dbversion_t *)version; } @@ -543,13 +756,18 @@ static inline rbtdb_version_t * allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial, unsigned int references, isc_boolean_t writer) { + isc_result_t result; rbtdb_version_t *version; version = isc_mem_get(mctx, sizeof(*version)); if (version == NULL) return (NULL); version->serial = serial; - version->references = references; + result = isc_refcount_init(&version->references, references); + if (result != ISC_R_SUCCESS) { + isc_mem_put(mctx, version, sizeof(*version)); + return (NULL); + } version->writer = writer; version->commit_ok = ISC_FALSE; ISC_LIST_INIT(version->changed_list); @@ -567,7 +785,7 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { REQUIRE(versionp != NULL && *versionp == NULL); REQUIRE(rbtdb->future_version == NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */ version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1, ISC_TRUE); @@ -576,7 +794,7 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { rbtdb->next_serial++; rbtdb->future_version = version; } - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (version == NULL) return (ISC_R_NOMEMORY); @@ -592,16 +810,12 @@ attachversion(dns_db_t *db, dns_dbversion_t *source, { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_version_t *rbtversion = source; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); - - INSIST(rbtversion->references > 0); - rbtversion->references++; - INSIST(rbtversion->references != 0); - - UNLOCK(&rbtdb->lock); + isc_refcount_increment(&rbtversion->references, &refs); + INSIST(refs > 1); *targetp = rbtversion; } @@ -611,32 +825,62 @@ add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) { rbtdb_changed_t *changed; + unsigned int refs; /* - * Caller must be holding the node lock. + * Caller must be holding the node lock if its reference must be + * protected by the lock. 
*/ changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed)); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE(version->writer); if (changed != NULL) { - INSIST(node->references > 0); - node->references++; - INSIST(node->references != 0); + dns_rbtnode_refincrement(node, &refs); + INSIST(refs != 0); changed->node = node; changed->dirty = ISC_FALSE; ISC_LIST_INITANDAPPEND(version->changed_list, changed, link); } else version->commit_ok = ISC_FALSE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); return (changed); } +static void +free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header, + acachectl_t *array) +{ + unsigned int count; + unsigned int i; + unsigned char *raw; /* RDATASLAB */ + + /* + * The caller must be holding the corresponding node lock. + */ + + if (array == NULL) + return; + + raw = (unsigned char *)header + sizeof(*header); + count = raw[0] * 256 + raw[1]; + + /* + * Sanity check: since an additional cache entry has a reference to + * the original DB node (in the callback arg), there should be no + * acache entries when the node can be freed. + */ + for (i = 0; i < count; i++) + INSIST(array[i].entry == NULL && array[i].cbarg == NULL); + + isc_mem_put(mctx, array, count * sizeof(acachectl_t)); +} + static inline void free_noqname(isc_mem_t *mctx, struct noqname **noqname) { @@ -658,7 +902,10 @@ free_rdataset(isc_mem_t *mctx, rdatasetheader_t *rdataset) { if (rdataset->noqname != NULL) free_noqname(mctx, &rdataset->noqname); - + + free_acachearray(mctx, rdataset, rdataset->additional_auth); + free_acachearray(mctx, rdataset, rdataset->additional_glue); + if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0) size = sizeof(*rdataset); else @@ -700,8 +947,19 @@ rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) { } static inline void +clean_stale_headers(isc_mem_t *mctx, rdatasetheader_t *top) { + rdatasetheader_t *d, *down_next; + + for (d = top->down; d != NULL; d = down_next) { + down_next = d->down; + free_rdataset(mctx, d); + } + top->down = NULL; +} + +static inline void clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { - rdatasetheader_t *current, *dcurrent, *top_prev, *top_next, *down_next; + rdatasetheader_t *current, *top_prev, *top_next; isc_mem_t *mctx = rbtdb->common.mctx; /* @@ -711,15 +969,7 @@ clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { top_prev = NULL; for (current = node->data; current != NULL; current = top_next) { top_next = current->next; - dcurrent = current->down; - if (dcurrent != NULL) { - do { - down_next = dcurrent->down; - free_rdataset(mctx, dcurrent); - dcurrent = down_next; - } while (dcurrent != NULL); - current->down = NULL; - } + clean_stale_headers(mctx, current); /* * If current is nonexistent or stale, we can clean it up. */ @@ -862,31 +1112,72 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, node->dirty = 0; } +/* + * Caller must be holding the node lock if its reference must be protected + * by the lock. 
+ */ static inline void new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { - if (node->references == 0) { - rbtdb->node_locks[node->locknum].references++; - INSIST(rbtdb->node_locks[node->locknum].references != 0); + unsigned int lockrefs, noderefs; + isc_refcount_t *lockref; + + dns_rbtnode_refincrement0(node, &noderefs); + if (noderefs == 1) { /* this is the first reference to the node */ + lockref = &rbtdb->node_locks[node->locknum].references; + isc_refcount_increment0(lockref, &lockrefs); + INSIST(lockrefs != 0); } - node->references++; - INSIST(node->references != 0); + INSIST(noderefs != 0); } -static void -no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, - rbtdb_serial_t least_serial, isc_rwlocktype_t lock) +/* + * Caller must be holding the node lock; either the "strong", read or write + * lock. Note that the lock must be held even when node references are + * atomically modified; in that case the decrement operation itself does not + * have to be protected, but we must avoid a race condition where multiple + * threads are decreasing the reference to zero simultaneously and at least + * one of them is going to free the node. + * This function returns ISC_TRUE if and only if the node reference decreases + * to zero. + */ +static isc_boolean_t +decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, + rbtdb_serial_t least_serial, + isc_rwlocktype_t nlock, isc_rwlocktype_t tlock) { isc_result_t result; isc_boolean_t write_locked; - unsigned int locknum; - - /* - * Caller must be holding the node lock. - */ + rbtdb_nodelock_t *nodelock; + unsigned int refs, nrefs; + + nodelock = &rbtdb->node_locks[node->locknum]; + + /* Handle easy and typical case first. */ + if (!node->dirty && (node->data != NULL || node->down != NULL)) { + dns_rbtnode_refdecrement(node, &nrefs); + INSIST((int)nrefs >= 0); + if (nrefs == 0) { + isc_refcount_decrement(&nodelock->references, &refs); + INSIST((int)refs >= 0); + } + return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE); + } - REQUIRE(node->references == 0); + /* Upgrade the lock? */ + if (nlock == isc_rwlocktype_read) { + NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read); + NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write); + } + dns_rbtnode_refdecrement(node, &nrefs); + INSIST((int)nrefs >= 0); + if (nrefs > 0) { + /* Restore the lock? */ + if (nlock == isc_rwlocktype_read) + NODE_WEAKDOWNGRADE(&nodelock->lock); + return (ISC_FALSE); + } - if (node->dirty) { + if (node->dirty && dns_rbtnode_refcurrent(node) == 0) { if (IS_CACHE(rbtdb)) clean_cache_node(rbtdb, node); else { @@ -895,35 +1186,38 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * Caller doesn't know the least serial. * Get it. */ - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); least_serial = rbtdb->least_serial; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, + isc_rwlocktype_read); } clean_zone_node(rbtdb, node, least_serial); } } - locknum = node->locknum; - - INSIST(rbtdb->node_locks[locknum].references > 0); - rbtdb->node_locks[locknum].references--; + isc_refcount_decrement(&nodelock->references, &refs); + INSIST((int)refs >= 0); /* * XXXDCL should this only be done for cache zones? */ - if (node->data != NULL || node->down != NULL) - return; + if (node->data != NULL || node->down != NULL) { + /* Restore the lock? */ + if (nlock == isc_rwlocktype_read) + NODE_WEAKDOWNGRADE(&nodelock->lock); + return (ISC_TRUE); + } /* * XXXDCL need to add a deferred delete method for ISC_R_LOCKBUSY. 
*/ - if (lock != isc_rwlocktype_write) { + if (tlock != isc_rwlocktype_write) { /* * Locking hierarchy notwithstanding, we don't need to free * the node lock before acquiring the tree write lock because * we only do a trylock. */ - if (lock == isc_rwlocktype_read) + if (tlock == isc_rwlocktype_read) result = isc_rwlock_tryupgrade(&rbtdb->tree_lock); else result = isc_rwlock_trylock(&rbtdb->tree_lock, @@ -935,13 +1229,21 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, } else write_locked = ISC_TRUE; - if (write_locked) { + if (write_locked && dns_rbtnode_refcurrent(node) == 0) { + /* + * We can now delete the node if the reference counter is + * zero. This should be typically the case, but a different + * thread may still gain a (new) reference just before the + * current thread locks the tree (e.g., in findnode()). + */ + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { char printname[DNS_NAME_FORMATSIZE]; isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), - "no_references: delete from rbt: %p %s", + "decrement_reference: " + "delete from rbt: %p %s", node, dns_rbt_formatnodename(node, printname, sizeof(printname))); @@ -951,20 +1253,27 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (result != ISC_R_SUCCESS) isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, - "no_references: dns_rbt_deletenode: %s", + "decrement_reference: " + "dns_rbt_deletenode: %s", isc_result_totext(result)); } + /* Restore the lock? */ + if (nlock == isc_rwlocktype_read) + NODE_WEAKDOWNGRADE(&nodelock->lock); + /* * Relock a read lock, or unlock the write lock if no lock was held. */ - if (lock == isc_rwlocktype_none) + if (tlock == isc_rwlocktype_none) if (write_locked) RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); - if (lock == isc_rwlocktype_read) + if (tlock == isc_rwlocktype_read) if (write_locked) isc_rwlock_downgrade(&rbtdb->tree_lock); + + return (ISC_TRUE); } static inline void @@ -1061,7 +1370,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { rbtdb_changed_t *changed, *next_changed; rbtdb_serial_t serial, least_serial; dns_rbtnode_t *rbtnode; - isc_mutex_t *lock; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); version = (rbtdb_version_t *)*versionp; @@ -1069,113 +1378,146 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { cleanup_version = NULL; ISC_LIST_INIT(cleanup_list); - LOCK(&rbtdb->lock); - INSIST(version->references > 0); - INSIST(!version->writer || !(commit && version->references > 1)); - version->references--; + isc_refcount_decrement(&version->references, &refs); + if (refs > 0) { /* typical and easy case first */ + if (commit) { + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); + INSIST(!version->writer); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); + } + goto end; + } + + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); serial = version->serial; - if (version->references == 0) { - if (version->writer) { - if (commit) { - INSIST(version->commit_ok); - INSIST(version == rbtdb->future_version); - if (EMPTY(rbtdb->open_versions)) { - /* - * We're going to become the least open - * version. - */ - make_least_version(rbtdb, version, - &cleanup_list); - } else { - /* - * Some other open version is the - * least version. We can't cleanup - * records that were changed in this - * version because the older versions - * may still be in use by an open - * version. 
- * - * We can, however, discard the - * changed records for things that - * we've added that didn't exist in - * prior versions. - */ - cleanup_nondirty(version, - &cleanup_list); - } - /* - * If the (soon to be former) current version - * isn't being used by anyone, we can clean - * it up. - */ - if (rbtdb->current_version->references == 0) { - cleanup_version = - rbtdb->current_version; - APPENDLIST(version->changed_list, - cleanup_version->changed_list, - link); - } + if (version->writer) { + if (commit) { + unsigned cur_ref; + rbtdb_version_t *cur_version; + + INSIST(version->commit_ok); + INSIST(version == rbtdb->future_version); + /* + * The current version is going to be replaced. + * Release the (likely last) reference to it from the + * DB itself and unlink it from the open list. + */ + cur_version = rbtdb->current_version; + isc_refcount_decrement(&cur_version->references, + &cur_ref); + if (cur_ref == 0) { + if (cur_version->serial == rbtdb->least_serial) + INSIST(EMPTY(cur_version->changed_list)); + UNLINK(rbtdb->open_versions, + cur_version, link); + } + if (EMPTY(rbtdb->open_versions)) { /* - * Become the current version. + * We're going to become the least open + * version. */ - version->writer = ISC_FALSE; - rbtdb->current_version = version; - rbtdb->current_serial = version->serial; - rbtdb->future_version = NULL; + make_least_version(rbtdb, version, + &cleanup_list); } else { /* - * We're rolling back this transaction. + * Some other open version is the + * least version. We can't cleanup + * records that were changed in this + * version because the older versions + * may still be in use by an open + * version. + * + * We can, however, discard the + * changed records for things that + * we've added that didn't exist in + * prior versions. */ - cleanup_list = version->changed_list; - ISC_LIST_INIT(version->changed_list); - rollback = ISC_TRUE; - cleanup_version = version; - rbtdb->future_version = NULL; + cleanup_nondirty(version, &cleanup_list); } + /* + * If the (soon to be former) current version + * isn't being used by anyone, we can clean + * it up. + */ + if (cur_ref == 0) { + cleanup_version = cur_version; + APPENDLIST(version->changed_list, + cleanup_version->changed_list, + link); + } + /* + * Become the current version. + */ + version->writer = ISC_FALSE; + rbtdb->current_version = version; + rbtdb->current_serial = version->serial; + rbtdb->future_version = NULL; + + /* + * Keep the current version in the open list, and + * gain a reference for the DB itself (see the DB + * creation function below). This must be the only + * case where we need to increment the counter from + * zero and need to use isc_refcount_increment0(). + */ + isc_refcount_increment0(&version->references, + &cur_ref); + INSIST(cur_ref == 1); + PREPEND(rbtdb->open_versions, + rbtdb->current_version, link); } else { - if (version != rbtdb->current_version) { - /* - * There are no external or internal references - * to this version and it can be cleaned up. - */ - cleanup_version = version; + /* + * We're rolling back this transaction. + */ + cleanup_list = version->changed_list; + ISC_LIST_INIT(version->changed_list); + rollback = ISC_TRUE; + cleanup_version = version; + rbtdb->future_version = NULL; + } + } else { + if (version != rbtdb->current_version) { + /* + * There are no external or internal references + * to this version and it can be cleaned up. + */ + cleanup_version = version; + /* + * Find the version with the least serial + * number greater than ours. 
+ */ + least_greater = PREV(version, link); + if (least_greater == NULL) + least_greater = rbtdb->current_version; + + INSIST(version->serial < least_greater->serial); + /* + * Is this the least open version? + */ + if (version->serial == rbtdb->least_serial) { /* - * Find the version with the least serial - * number greater than ours. + * Yes. Install the new least open + * version. */ - least_greater = PREV(version, link); - if (least_greater == NULL) - least_greater = rbtdb->current_version; - - INSIST(version->serial < least_greater->serial); + make_least_version(rbtdb, + least_greater, + &cleanup_list); + } else { /* - * Is this the least open version? + * Add any unexecuted cleanups to + * those of the least greater version. */ - if (version->serial == rbtdb->least_serial) { - /* - * Yes. Install the new least open - * version. - */ - make_least_version(rbtdb, - least_greater, - &cleanup_list); - } else { - /* - * Add any unexecuted cleanups to - * those of the least greater version. - */ - APPENDLIST(least_greater->changed_list, - version->changed_list, - link); - } - } else if (version->serial == rbtdb->least_serial) - INSIST(EMPTY(version->changed_list)); - UNLINK(rbtdb->open_versions, version, link); - } + APPENDLIST(least_greater->changed_list, + version->changed_list, + link); + } + } else if (version->serial == rbtdb->least_serial) + INSIST(EMPTY(version->changed_list)); + UNLINK(rbtdb->open_versions, version, link); } least_serial = rbtdb->least_serial; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); /* * Update the zone's secure status. @@ -1193,28 +1535,26 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { for (changed = HEAD(cleanup_list); changed != NULL; changed = next_changed) { + nodelock_t *lock; + next_changed = NEXT(changed, link); rbtnode = changed->node; lock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(lock); - - INSIST(rbtnode->references > 0); - rbtnode->references--; + NODE_LOCK(lock, isc_rwlocktype_write); if (rollback) rollback_node(rbtnode, serial); - - if (rbtnode->references == 0) - no_references(rbtdb, rbtnode, least_serial, - isc_rwlocktype_none); - - UNLOCK(lock); + decrement_reference(rbtdb, rbtnode, least_serial, + isc_rwlocktype_write, + isc_rwlocktype_none); + NODE_UNLOCK(lock, isc_rwlocktype_write); isc_mem_put(rbtdb->common.mctx, changed, sizeof(*changed)); } } + end: *versionp = NULL; } @@ -1287,7 +1627,6 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node = NULL; dns_name_t nodename; - unsigned int locknum; isc_result_t result; isc_rwlocktype_t locktype = isc_rwlocktype_read; @@ -1334,10 +1673,9 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, return (result); } } - locknum = node->locknum; - LOCK(&rbtdb->node_locks[locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); RWUNLOCK(&rbtdb->tree_lock, locktype); *nodep = (dns_dbnode_t *)node; @@ -1366,7 +1704,8 @@ zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { result = DNS_R_CONTINUE; onode = search->rbtdb->origin_node; - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); /* * Look for an NS or DNAME rdataset active in our version. 
@@ -1477,7 +1816,8 @@ zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { search->wild = ISC_TRUE; } - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); return (result); } @@ -1487,10 +1827,14 @@ bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdatasetheader_t *header, isc_stdtime_t now, dns_rdataset_t *rdataset) { - unsigned char *raw; + unsigned char *raw; /* RDATASLAB */ /* - * Caller must be holding the node lock. + * Caller must be holding the node reader lock. + * XXXJT: technically, we need a writer lock, since we'll increment + * the header count below. However, since the actual counter value + * doesn't matter, we prioritize performance here. (We may want to + * use atomic increment when available). */ if (rdataset == NULL) @@ -1570,14 +1914,16 @@ setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep, search->need_cleanup = ISC_FALSE; } if (rdataset != NULL) { - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); bind_rdataset(search->rbtdb, node, search->zonecut_rdataset, search->now, rdataset); if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL) bind_rdataset(search->rbtdb, node, search->zonecut_sigrdataset, search->now, sigrdataset); - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); } if (type == dns_rdatatype_dname) @@ -1589,7 +1935,7 @@ static inline isc_boolean_t valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type, dns_rbtnode_t *node) { - unsigned char *raw; + unsigned char *raw; /* RDATASLAB */ unsigned int count, size; dns_name_t ns_name; isc_boolean_t valid = ISC_FALSE; @@ -1618,12 +1964,12 @@ valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type, header = search->zonecut_rdataset; raw = (unsigned char *)header + sizeof(*header); count = raw[0] * 256 + raw[1]; - raw += 2; + raw += 2 + (4 * count); while (count > 0) { count--; size = raw[0] * 256 + raw[1]; - raw += 2; + raw += 4; region.base = raw; region.length = size; raw += size; @@ -1672,7 +2018,8 @@ activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain, origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1680,7 +2027,8 @@ activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain, !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_next(chain, NULL, NULL); @@ -1737,7 +2085,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1745,7 +2094,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + 
isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_prev(&chain, NULL, NULL); @@ -1762,7 +2112,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1770,7 +2121,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_next(&chain, NULL, NULL); @@ -1838,7 +2190,8 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, done = ISC_FALSE; node = *nodep; do { - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); /* * First we try to figure out if this node is active in @@ -1863,7 +2216,8 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, else wild = ISC_FALSE; - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (wild) { /* @@ -1896,33 +2250,38 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, DNS_RBTFIND_EMPTYDATA, NULL, NULL); if (result == ISC_R_SUCCESS) { - /* - * We have found the wildcard node. If it - * is active in the search's version, we're - * done. - */ - LOCK(&(rbtdb->node_locks[wnode->locknum].lock)); - for (header = wnode->data; - header != NULL; - header = header->next) { - if (header->serial <= search->serial && - !IGNORE(header) && EXISTS(header)) - break; - } - UNLOCK(&(rbtdb->node_locks[wnode->locknum].lock)); - if (header != NULL || - activeempty(search, &wchain, wname)) { - if (activeemtpynode(search, qname, wname)) + nodelock_t *lock; + + /* + * We have found the wildcard node. If it + * is active in the search's version, we're + * done. + */ + lock = &rbtdb->node_locks[wnode->locknum].lock; + NODE_LOCK(lock, isc_rwlocktype_read); + for (header = wnode->data; + header != NULL; + header = header->next) { + if (header->serial <= search->serial && + !IGNORE(header) && EXISTS(header)) + break; + } + NODE_UNLOCK(lock, isc_rwlocktype_read); + if (header != NULL || + activeempty(search, &wchain, wname)) { + if (activeemtpynode(search, qname, + wname)) { return (ISC_R_NOTFOUND); - /* - * The wildcard node is active! - * - * Note: result is still ISC_R_SUCCESS - * so we don't have to set it. - */ - *nodep = wnode; - break; - } + } + /* + * The wildcard node is active! + * + * Note: result is still ISC_R_SUCCESS + * so we don't have to set it. 
+ */ + *nodep = wnode; + break; + } } else if (result != ISC_R_NOTFOUND && result != DNS_R_PARTIALMATCH) { /* @@ -1974,7 +2333,8 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, origin, &node); if (result != ISC_R_SUCCESS) return (result); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); found = NULL; foundsig = NULL; empty_node = ISC_TRUE; @@ -2074,7 +2434,8 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, result = dns_rbtnodechain_prev(&search->chain, NULL, NULL); } - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); } while (empty_node && result == ISC_R_SUCCESS); /* @@ -2103,12 +2464,12 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, isc_boolean_t at_zonecut = ISC_FALSE; isc_boolean_t wild; isc_boolean_t empty_node; - isc_mutex_t *lock; rdatasetheader_t *header, *header_next, *found, *nsecheader; rdatasetheader_t *foundsig, *cnamesig, *nsecsig; rbtdb_rdatatype_t sigtype; isc_boolean_t active; dns_rbtnodechain_t chain; + nodelock_t *lock; search.rbtdb = (dns_rbtdb_t *)db; @@ -2243,7 +2604,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * We now go looking for rdata... */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); found = NULL; foundsig = NULL; @@ -2391,7 +2753,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * we really have a partial match. */ if (!wild) { - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); goto partial_match; } } @@ -2401,16 +2764,17 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, */ if (found == NULL) { if (search.zonecut != NULL) { - /* - * We were trying to find glue at a node beneath a - * zone cut, but didn't. - * - * Return the delegation. - */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); - result = setup_delegation(&search, nodep, foundname, - rdataset, sigrdataset); - goto tree_exit; + /* + * We were trying to find glue at a node beneath a + * zone cut, but didn't. + * + * Return the delegation. + */ + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); + result = setup_delegation(&search, nodep, foundname, + rdataset, sigrdataset); + goto tree_exit; } /* * The desired type doesn't exist. 
@@ -2426,11 +2790,12 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, result = DNS_R_BADDB; goto node_exit; } - - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); result = find_closest_nsec(&search, nodep, foundname, - rdataset, sigrdataset, - search.rbtdb->secure); + rdataset, sigrdataset, + search.rbtdb->secure); if (result == ISC_R_SUCCESS) result = DNS_R_EMPTYWILD; goto tree_exit; @@ -2508,9 +2873,10 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, if (result == DNS_R_GLUE && (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 && !valid_glue(&search, foundname, type, node)) { - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); - result = setup_delegation(&search, nodep, foundname, - rdataset, sigrdataset); + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); + result = setup_delegation(&search, nodep, foundname, + rdataset, sigrdataset); goto tree_exit; } } else { @@ -2539,7 +2905,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, foundname->attributes |= DNS_NAMEATTR_WILDCARD; node_exit: - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -2552,14 +2919,10 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, node = search.zonecut; lock = &(search.rbtdb->node_locks[node->locknum].lock); - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) - no_references(search.rbtdb, node, 0, - isc_rwlocktype_none); - - UNLOCK(lock); + NODE_LOCK(lock, isc_rwlocktype_read); + decrement_reference(search.rbtdb, node, 0, + isc_rwlocktype_read, isc_rwlocktype_none); + NODE_UNLOCK(lock, isc_rwlocktype_read); } if (close_version) @@ -2596,6 +2959,8 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *dname_header, *sigdname_header; isc_result_t result; + nodelock_t *lock; + isc_rwlocktype_t locktype; /* XXX comment */ @@ -2606,7 +2971,9 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { */ UNUSED(name); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + lock = &(search->rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); /* * Look for a DNAME or RRSIG DNAME rdataset. @@ -2624,21 +2991,47 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { * the node as dirty, so it will get cleaned * up later. */ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(search->rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if ((header->ttl <= search->now - RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only when we + * can get write access; otherwise, we leave + * others to this work. Periodical cleaning + * will eventually take the job as the last + * resort. + * We won't downgrade the lock, since other + * rdatasets are probably stale, too. 
+ */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + /* + * header->down can be non-NULL if the + * refcount has just decremented to 0 + * but decrement_reference() has not + * performed clean_cache_node(), in + * which case we need to purge the + * stale headers first. + */ + mctx = search->rbtdb->common.mctx; + clean_stale_headers(mctx, header); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + free_rdataset(mctx, header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (header->type == dns_rdatatype_dname && EXISTS(header)) { dname_header = header; @@ -2667,7 +3060,7 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { } else result = DNS_R_CONTINUE; - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); return (result); } @@ -2685,6 +3078,8 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, dns_name_t name; dns_rbtdb_t *rbtdb; isc_boolean_t done; + nodelock_t *lock; + isc_rwlocktype_t locktype; /* * Caller must be holding the tree lock. @@ -2694,7 +3089,9 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, i = search->chain.level_matches; done = ISC_FALSE; do { - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + locktype = isc_rwlocktype_read; + lock = &rbtdb->node_locks[node->locknum].lock; + NODE_LOCK(lock, locktype); /* * Look for NS and RRSIG NS rdatasets. @@ -2714,21 +3111,37 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, * the node as dirty, so it will get cleaned * up later. */ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if ((header->ttl <= search->now - + RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only + * when we can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) + == 0) { + isc_mem_t *m; + + m = search->rbtdb->common.mctx; + clean_stale_headers(m, header); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = + header->next; + free_rdataset(m, header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * We've found an extant rdataset. 
See if @@ -2792,7 +3205,7 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, } node_exit: - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); if (found == NULL && i > 0) { i--; @@ -2818,6 +3231,8 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, dns_fixedname_t fname, forigin; dns_name_t *name, *origin; rbtdb_rdatatype_t matchtype, sigmatchtype; + nodelock_t *lock; + isc_rwlocktype_t locktype; matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0); sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, @@ -2833,7 +3248,9 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, origin, &node); if (result != ISC_R_SUCCESS) return (result); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + locktype = isc_rwlocktype_read; + lock = &(search->rbtdb->node_locks[node->locknum].lock); + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; empty_node = ISC_TRUE; @@ -2850,23 +3267,35 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, * node as dirty, so it will get cleaned up * later. */ - if (header->ttl > search->now - RBTDB_VIRTUAL) - header_prev = header; - else if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(search->rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if ((header->ttl <= now - RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only + * when we can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) + == 0) { + isc_mem_t *m; + + m = search->rbtdb->common.mctx; + clean_stale_headers(m, header); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + free_rdataset(m, header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } continue; } if (NONEXISTENT(header) || NXDOMAIN(header)) { @@ -2899,7 +3328,7 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, result = dns_rbtnodechain_prev(&search->chain, NULL, NULL); unlock_node: - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); } while (empty_node && result == ISC_R_SUCCESS); return (result); } @@ -2915,7 +3344,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, rbtdb_search_t search; isc_boolean_t cname_ok = ISC_TRUE; isc_boolean_t empty_node; - isc_mutex_t *lock; + nodelock_t *lock; + isc_rwlocktype_t locktype; rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *found, *nsheader; rdatasetheader_t *foundsig, *nssig, *cnamesig; @@ -2989,7 +3419,9 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * We now go looking for rdata... */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &(search.rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3009,21 +3441,34 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * mark it as stale, and the node as dirty, so it will * get cleaned up later. 
*/ - if (header->ttl > now - RBTDB_VIRTUAL) - header_prev = header; - else if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = header->next; - else - node->data = header->next; - free_rdataset(search.rbtdb->common.mctx, - header); - } else { - header->attributes |= RDATASET_ATTR_STALE; - node->dirty = 1; + if ((header->ttl <= now - RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only when we + * can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + mctx = search.rbtdb->common.mctx; + clean_stale_headers(mctx, header); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + free_rdataset(mctx, header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * We now know that there is at least one active @@ -3103,7 +3548,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * extant rdatasets. That means that this node doesn't * meaningfully exist, and that we really have a partial match. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3136,7 +3581,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, /* * Go find the deepest zone cut. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3183,7 +3628,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, } node_exit: - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -3196,13 +3641,10 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, node = search.zonecut; lock = &(search.rbtdb->node_locks[node->locknum].lock); - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) - no_references(search.rbtdb, node, 0, - isc_rwlocktype_none); - UNLOCK(lock); + NODE_LOCK(lock, isc_rwlocktype_read); + decrement_reference(search.rbtdb, node, 0, + isc_rwlocktype_read, isc_rwlocktype_none); + NODE_UNLOCK(lock, isc_rwlocktype_read); } dns_rbtnodechain_reset(&search.chain); @@ -3217,11 +3659,13 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) { dns_rbtnode_t *node = NULL; + nodelock_t *lock; isc_result_t result; rbtdb_search_t search; rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *found, *foundsig; unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA; + isc_rwlocktype_t locktype; search.rbtdb = (dns_rbtdb_t *)db; @@ -3264,7 +3708,9 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, * We now go looking for an NS rdataset at the node. */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &(search.rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3278,21 +3724,34 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, * mark it as stale, and the node as dirty, so it will * get cleaned up later. 
*/ - if (header->ttl > now - RBTDB_VIRTUAL) - header_prev = header; - else if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = header->next; - else - node->data = header->next; - free_rdataset(search.rbtdb->common.mctx, - header); - } else { - header->attributes |= RDATASET_ATTR_STALE; - node->dirty = 1; + if ((header->ttl <= now - RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only when we + * can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + mctx = search.rbtdb->common.mctx; + clean_stale_headers(mctx, header); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + free_rdataset(mctx, header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * If we found a type we were looking for, remember @@ -3321,7 +3780,7 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, /* * No NS records here. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3335,7 +3794,7 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, bind_rdataset(search.rbtdb, node, foundsig, search.now, sigrdataset); - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -3354,15 +3813,15 @@ static void attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node = (dns_rbtnode_t *)source; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp == NULL); - LOCK(&rbtdb->node_locks[node->locknum].lock); - INSIST(node->references > 0); - node->references++; - INSIST(node->references != 0); /* Catch overflow. 
*/ - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); + dns_rbtnode_refincrement(node, &refs); + INSIST(refs != 0); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); *targetp = source; } @@ -3373,35 +3832,34 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { dns_rbtnode_t *node; isc_boolean_t want_free = ISC_FALSE; isc_boolean_t inactive = ISC_FALSE; - unsigned int locknum; + rbtdb_nodelock_t *nodelock; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp != NULL); node = (dns_rbtnode_t *)(*targetp); - locknum = node->locknum; + nodelock = &rbtdb->node_locks[node->locknum]; - LOCK(&rbtdb->node_locks[locknum].lock); + NODE_LOCK(&nodelock->lock, isc_rwlocktype_read); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) { - no_references(rbtdb, node, 0, isc_rwlocktype_none); - if (rbtdb->node_locks[locknum].references == 0 && - rbtdb->node_locks[locknum].exiting) + if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, + isc_rwlocktype_none)) { + if (isc_refcount_current(&nodelock->references) == 0 && + nodelock->exiting) { inactive = ISC_TRUE; + } } - UNLOCK(&rbtdb->node_locks[locknum].lock); + NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read); *targetp = NULL; if (inactive) { - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); rbtdb->active--; if (rbtdb->active == 0) want_free = ISC_TRUE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (want_free) { char buf[DNS_NAME_FORMATSIZE]; if (dns_name_dynamic(&rbtdb->common.origin)) @@ -3465,14 +3923,19 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { sizeof(printname))); } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + /* + * We may not need write access, but this code path is not performance + * sensitive, so it should be okay to always lock as a writer. + */ + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); for (header = rbtnode->data; header != NULL; header = header->next) if (header->ttl <= now - RBTDB_VIRTUAL) { /* - * We don't check if rbtnode->references == 0 and try + * We don't check if refcurrent(rbtnode) == 0 and try * to free like we do in cache_find(), because - * rbtnode->references must be non-zero. This is so + * refcurrent(rbtnode) must be non-zero. This is so * because 'node' is an argument to the function. 
*/ header->attributes |= RDATASET_ATTR_STALE; @@ -3496,7 +3959,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { isc_log_write(dns_lctx, category, module, level, "overmem cache: saved %s", printname); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (ISC_R_SUCCESS); } @@ -3518,10 +3982,12 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); fprintf(out, "node %p, %u references, locknum = %u\n", - rbtnode, rbtnode->references, rbtnode->locknum); + rbtnode, dns_rbtnode_refcurrent(rbtnode), + rbtnode->locknum); if (rbtnode->data != NULL) { rdatasetheader_t *current, *top_next; @@ -3547,7 +4013,8 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { } else fprintf(out, "(empty)\n"); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); } static isc_result_t @@ -3608,7 +4075,8 @@ zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, serial = rbtversion->serial; now = 0; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); found = NULL; foundsig = NULL; @@ -3656,7 +4124,8 @@ zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, sigrdataset); } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); if (close_version) closeversion(db, (dns_dbversion_t **) (void *)(&rbtversion), @@ -3679,6 +4148,8 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, rdatasetheader_t *header, *header_next, *found, *foundsig; rbtdb_rdatatype_t matchtype, sigmatchtype, negtype; isc_result_t result; + nodelock_t *lock; + isc_rwlocktype_t locktype; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(type != dns_rdatatype_any); @@ -3690,7 +4161,9 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if (now == 0) isc_stdtime_get(&now); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + lock = &rbtdb->node_locks[rbtnode->locknum].lock; + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3704,13 +4177,22 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, for (header = rbtnode->data; header != NULL; header = header_next) { header_next = header->next; if (header->ttl <= now) { - /* - * We don't check if rbtnode->references == 0 and try - * to free like we do in cache_find(), because - * rbtnode->references must be non-zero. This is so - * because 'node' is an argument to the function. - */ - if (header->ttl <= now - RBTDB_VIRTUAL) { + if ((header->ttl <= now - RBTDB_VIRTUAL) && + (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { + /* + * We update the node's status only when we + * can get write access. + */ + locktype = isc_rwlocktype_write; + + /* + * We don't check if refcurrent(rbtnode) == 0 + * and try to free like we do in cache_find(), + * because refcurrent(rbtnode) must be + * non-zero. This is so because 'node' is an + * argument to the function. 
+ */ header->attributes |= RDATASET_ATTR_STALE; rbtnode->dirty = 1; } @@ -3731,7 +4213,7 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, sigrdataset); } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(lock, locktype); if (found == NULL) return (ISC_R_NOTFOUND); @@ -3757,6 +4239,7 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; rbtdb_version_t *rbtversion = version; rbtdb_rdatasetiter_t *iterator; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); @@ -3770,11 +4253,11 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, currentversion(db, (dns_dbversion_t **) (void *)(&rbtversion)); else { - LOCK(&rbtdb->lock); - INSIST(rbtversion->references > 0); - rbtversion->references++; - INSIST(rbtversion->references != 0); - UNLOCK(&rbtdb->lock); + unsigned int refs; + + isc_refcount_increment(&rbtversion->references, + &refs); + INSIST(refs > 1); } } else { if (now == 0) @@ -3789,14 +4272,14 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, iterator->common.version = (dns_dbversion_t *)rbtversion; iterator->common.now = now; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + + dns_rbtnode_refincrement(rbtnode, &refs); + INSIST(refs != 0); - INSIST(rbtnode->references > 0); - rbtnode->references++; - INSIST(rbtnode->references != 0); iterator->current = NULL; - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); *iteratorp = (dns_rdatasetiter_t *)iterator; @@ -3987,6 +4470,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * The NXDOMAIN/NODATA(QTYPE=ANY) * is more trusted. 
*/ + free_rdataset(rbtdb->common.mctx, newheader); if (addedrdataset != NULL) @@ -4343,6 +4827,8 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->noqname = NULL; newheader->count = 0; newheader->trust = rdataset->trust; + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; if (rbtversion != NULL) { newheader->serial = rbtversion->serial; now = 0; @@ -4371,14 +4857,16 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, } else delegating = ISC_FALSE; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE, addedrdataset, now); if (result == ISC_R_SUCCESS && delegating) rbtnode->find_callback = 1; - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); if (delegating) RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); @@ -4423,13 +4911,17 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->trust = 0; newheader->noqname = NULL; newheader->count = 0; + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { free_rdataset(rbtdb->common.mctx, newheader); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (ISC_R_NOMEMORY); } @@ -4476,6 +4968,13 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * header, not newheader. */ newheader->serial = rbtversion->serial; + /* + * XXXJT: dns_rdataslab_subtract() copied the pointers + * to additional info. We need to clear these fields + * to avoid having duplicated references. + */ + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; } else if (result == DNS_R_NXRRSET) { /* * This subtraction would remove all of the rdata; @@ -4495,6 +4994,8 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->serial = rbtversion->serial; newheader->noqname = NULL; newheader->count = 0; + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; } else { free_rdataset(rbtdb->common.mctx, newheader); goto unlock; @@ -4530,7 +5031,8 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, bind_rdataset(rbtdb, rbtnode, newheader, 0, newrdataset); unlock: - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); /* * Update the zone's secure status. 
If version is non-NULL @@ -4567,18 +5069,22 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->attributes = RDATASET_ATTR_NONEXISTENT; newheader->trust = 0; newheader->noqname = NULL; + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; if (rbtversion != NULL) newheader->serial = rbtversion->serial; else newheader->serial = 0; newheader->count = 0; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); result = add(rbtdb, rbtnode, rbtversion, newheader, DNS_DBADD_FORCE, ISC_FALSE, NULL, 0); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); /* * Update the zone's secure status. If version is non-NULL @@ -4656,6 +5162,8 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { newheader->serial = 1; newheader->noqname = NULL; newheader->count = 0; + newheader->additional_auth = NULL; + newheader->additional_glue = NULL; result = add(rbtdb, node, rbtdb->current_version, newheader, DNS_DBADD_MERGE, ISC_TRUE, NULL, 0); @@ -4687,13 +5195,13 @@ beginload(dns_db_t *db, dns_addrdatasetfunc_t *addp, dns_dbload_t **dbloadp) { else loadctx->now = 0; - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED|RBTDB_ATTR_LOADING)) == 0); rbtdb->attributes |= RBTDB_ATTR_LOADING; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); *addp = loading_addrdataset; *dbloadp = loadctx; @@ -4711,7 +5219,7 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { loadctx = *dbloadp; REQUIRE(loadctx->rbtdb == rbtdb); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0); REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0); @@ -4719,7 +5227,7 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { rbtdb->attributes &= ~RBTDB_ATTR_LOADING; rbtdb->attributes |= RBTDB_ATTR_LOADED; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); /* * If there's a KEY rdataset at the zone origin containing a @@ -4736,16 +5244,17 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { } static isc_result_t -dump(dns_db_t *db, dns_dbversion_t *version, const char *filename) { +dump(dns_db_t *db, dns_dbversion_t *version, const char *filename, + dns_masterformat_t masterformat) { dns_rbtdb_t *rbtdb; rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); - return (dns_master_dump(rbtdb->common.mctx, db, version, - &dns_master_style_default, - filename)); + return (dns_master_dump2(rbtdb->common.mctx, db, version, + &dns_master_style_default, + filename, masterformat)); } static void @@ -4799,12 +5308,12 @@ settask(dns_db_t *db, isc_task_t *task) { REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); if (rbtdb->task != NULL) isc_task_detach(&rbtdb->task); if (task != NULL) isc_task_attach(task, &rbtdb->task); - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); } static isc_boolean_t @@ -4813,6 +5322,31 @@ ispersistent(dns_db_t *db) { return (ISC_FALSE); } +static isc_result_t +getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) { + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + dns_rbtnode_t *onode; + isc_result_t result = ISC_R_SUCCESS; + + REQUIRE(VALID_RBTDB(rbtdb)); + REQUIRE(nodep != NULL && *nodep == NULL); + + /* Note that the access to origin_node doesn't require a 
DB lock */ + onode = (dns_rbtnode_t *)rbtdb->origin_node; + if (onode != NULL) { + NODE_STRONGLOCK(&rbtdb->node_locks[onode->locknum].lock); + new_reference(rbtdb, onode); + NODE_STRONGUNLOCK(&rbtdb->node_locks[onode->locknum].lock); + + *nodep = rbtdb->origin_node; + } else { + INSIST(!IS_CACHE(rbtdb)); + result = ISC_R_NOTFOUND; + } + + return (result); +} + static dns_dbmethods_t zone_methods = { attach, detach, @@ -4840,7 +5374,8 @@ static dns_dbmethods_t zone_methods = { nodecount, ispersistent, overmem, - settask + settask, + getoriginnode }; static dns_dbmethods_t cache_methods = { @@ -4870,7 +5405,8 @@ static dns_dbmethods_t cache_methods = { nodecount, ispersistent, overmem, - settask + settask, + getoriginnode }; isc_result_t @@ -4896,6 +5432,7 @@ dns_rbtdb_create rbtdb = isc_mem_get(mctx, sizeof(*rbtdb)); if (rbtdb == NULL) return (ISC_R_NOMEMORY); + memset(rbtdb, '\0', sizeof(*rbtdb)); dns_name_init(&rbtdb->common.origin, NULL); rbtdb->common.attributes = 0; @@ -4910,55 +5447,48 @@ dns_rbtdb_create rbtdb->common.rdclass = rdclass; rbtdb->common.mctx = NULL; - result = isc_mutex_init(&rbtdb->lock); - if (result != ISC_R_SUCCESS) { - isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "isc_mutex_init() failed: %s", - isc_result_totext(result)); - return (ISC_R_UNEXPECTED); - } + result = RBTDB_INITLOCK(&rbtdb->lock); + if (result != ISC_R_SUCCESS) + goto cleanup_rbtdb; result = isc_rwlock_init(&rbtdb->tree_lock, 0, 0); - if (result != ISC_R_SUCCESS) { - DESTROYLOCK(&rbtdb->lock); - isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "isc_rwlock_init() failed: %s", - isc_result_totext(result)); - return (ISC_R_UNEXPECTED); - } + if (result != ISC_R_SUCCESS) + goto cleanup_lock; + if (rbtdb->node_lock_count == 0) { + if (IS_CACHE(rbtdb)) + rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT; + else + rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT; + } INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH)); - - if (rbtdb->node_lock_count == 0) - rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT; rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); + if (rbtdb->node_locks == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_tree_lock; + } + rbtdb->active = rbtdb->node_lock_count; + for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { - result = isc_mutex_init(&rbtdb->node_locks[i].lock); + result = NODE_INITLOCK(&rbtdb->node_locks[i].lock); + if (result == ISC_R_SUCCESS) { + result = isc_refcount_init(&rbtdb->node_locks[i].references, 0); + if (result != ISC_R_SUCCESS) + NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); + } if (result != ISC_R_SUCCESS) { - i--; - while (i >= 0) { - DESTROYLOCK(&rbtdb->node_locks[i].lock); - i--; + while (i-- > 0) { + NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); + isc_refcount_decrement(&rbtdb->node_locks[i].references, NULL); + isc_refcount_destroy(&rbtdb->node_locks[i].references); } - isc_mem_put(mctx, rbtdb->node_locks, - rbtdb->node_lock_count * - sizeof(rbtdb_nodelock_t)); - isc_rwlock_destroy(&rbtdb->tree_lock); - DESTROYLOCK(&rbtdb->lock); - isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "isc_mutex_init() failed: %s", - isc_result_totext(result)); - return (ISC_R_UNEXPECTED); + goto cleanup_node_locks; } - rbtdb->node_locks[i].references = 0; rbtdb->node_locks[i].exiting = ISC_FALSE; } - + /* * Attach to the mctx. 
The database will persist so long as there * are references to it, and attaching to the mctx ensures that our @@ -5001,7 +5531,7 @@ dns_rbtdb_create * the top-of-zone node can never be deleted, nor can its address * change. */ - if (! IS_CACHE(rbtdb)) { + if (!IS_CACHE(rbtdb)) { rbtdb->origin_node = NULL; result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin, &rbtdb->origin_node); @@ -5029,7 +5559,11 @@ dns_rbtdb_create /* * Misc. Initialization. */ - isc_refcount_init(&rbtdb->references, 1); + result = isc_refcount_init(&rbtdb->references, 1); + if (result != ISC_R_SUCCESS) { + free_rbtdb(rbtdb, ISC_FALSE, NULL); + return (result); + } rbtdb->attributes = 0; rbtdb->secure = ISC_FALSE; rbtdb->overmem = ISC_FALSE; @@ -5041,13 +5575,20 @@ dns_rbtdb_create rbtdb->current_serial = 1; rbtdb->least_serial = 1; rbtdb->next_serial = 2; - rbtdb->current_version = allocate_version(mctx, 1, 0, ISC_FALSE); + rbtdb->current_version = allocate_version(mctx, 1, 1, ISC_FALSE); if (rbtdb->current_version == NULL) { + isc_refcount_decrement(&rbtdb->references, NULL); + isc_refcount_destroy(&rbtdb->references); free_rbtdb(rbtdb, ISC_FALSE, NULL); return (ISC_R_NOMEMORY); } rbtdb->future_version = NULL; ISC_LIST_INIT(rbtdb->open_versions); + /* + * Keep the current version in the open list so that list operation + * won't happen in normal lookup operations. + */ + PREPEND(rbtdb->open_versions, rbtdb->current_version, link); rbtdb->common.magic = DNS_DB_MAGIC; rbtdb->common.impmagic = RBTDB_MAGIC; @@ -5055,6 +5596,20 @@ dns_rbtdb_create *dbp = (dns_db_t *)rbtdb; return (ISC_R_SUCCESS); + + cleanup_node_locks: + isc_mem_put(mctx, rbtdb->node_locks, + rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); + + cleanup_tree_lock: + isc_rwlock_destroy(&rbtdb->tree_lock); + + cleanup_lock: + RBTDB_DESTROYLOCK(&rbtdb->lock); + + cleanup_rbtdb: + isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); + return (result); } @@ -5072,7 +5627,7 @@ rdataset_disassociate(dns_rdataset_t *rdataset) { static isc_result_t rdataset_first(dns_rdataset_t *rdataset) { - unsigned char *raw = rdataset->private3; + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; @@ -5080,11 +5635,20 @@ rdataset_first(dns_rdataset_t *rdataset) { rdataset->private5 = NULL; return (ISC_R_NOMORE); } - raw += 2; + + if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) + raw += 2 + (4 * count); + else + raw += 2; + /* - * The privateuint4 field is the number of rdata beyond the cursor - * position, so we decrement the total count by one before storing - * it. + * The privateuint4 field is the number of rdata beyond the + * cursor position, so we decrement the total count by one + * before storing it. + * + * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the + * first record. If DNS_RDATASETATTR_LOADORDER is set 'raw' points + * to the first entry in the offset table. */ count--; rdataset->privateuint4 = count; @@ -5097,30 +5661,47 @@ static isc_result_t rdataset_next(dns_rdataset_t *rdataset) { unsigned int count; unsigned int length; - unsigned char *raw; + unsigned char *raw; /* RDATASLAB */ count = rdataset->privateuint4; if (count == 0) return (ISC_R_NOMORE); count--; rdataset->privateuint4 = count; + + /* + * Skip forward one record (length + 4) or one offset (4). 
+ */ raw = rdataset->private5; - length = raw[0] * 256 + raw[1]; - raw += length + 2; - rdataset->private5 = raw; + if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) { + length = raw[0] * 256 + raw[1]; + raw += length; + } + rdataset->private5 = raw + 4; return (ISC_R_SUCCESS); } static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) { - unsigned char *raw = rdataset->private5; + unsigned char *raw = rdataset->private5; /* RDATASLAB */ + unsigned int offset; isc_region_t r; REQUIRE(raw != NULL); + /* + * Find the start of the record if not already in private5 + * then skip the length and order fields. + */ + if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) { + offset = (raw[0] << 24) + (raw[1] << 16) + + (raw[2] << 8) + raw[3]; + raw = rdataset->private3; + raw += offset; + } r.length = raw[0] * 256 + raw[1]; - raw += 2; + raw += 4; r.base = raw; dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r); } @@ -5143,7 +5724,7 @@ rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) { static unsigned int rdataset_count(dns_rdataset_t *rdataset) { - unsigned char *raw = rdataset->private3; + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; @@ -5233,7 +5814,8 @@ rdatasetiter_first(dns_rdatasetiter_t *iterator) { now = 0; } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); for (header = rbtnode->data; header != NULL; header = top_next) { top_next = header->next; @@ -5260,7 +5842,8 @@ rdatasetiter_first(dns_rdatasetiter_t *iterator) { break; } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); rbtiterator->current = header; @@ -5294,7 +5877,8 @@ rdatasetiter_next(dns_rdatasetiter_t *iterator) { now = 0; } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); type = header->type; rdtype = RBTDB_RDATATYPE_BASE(header->type); @@ -5335,7 +5919,8 @@ rdatasetiter_next(dns_rdatasetiter_t *iterator) { } } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); rbtiterator->current = header; @@ -5355,12 +5940,14 @@ rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) { header = rbtiterator->current; REQUIRE(header != NULL); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now, rdataset); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); } @@ -5377,26 +5964,25 @@ reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { return; INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none); - LOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); } static inline void dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; dns_rbtnode_t *node = rbtdbiter->node; - isc_mutex_t *lock; + nodelock_t *lock; if (node == NULL) return; lock = &rbtdb->node_locks[node->locknum].lock; - LOCK(lock); - 
INSIST(rbtdbiter->node->references > 0); - if (--node->references == 0) - no_references(rbtdb, node, 0, rbtdbiter->tree_locked); - UNLOCK(lock); + NODE_LOCK(lock, isc_rwlocktype_read); + decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, + rbtdbiter->tree_locked); + NODE_UNLOCK(lock, isc_rwlocktype_read); rbtdbiter->node = NULL; } @@ -5406,7 +5992,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtnode_t *node; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; isc_boolean_t was_read_locked = ISC_FALSE; - isc_mutex_t *lock; + nodelock_t *lock; int i; if (rbtdbiter->delete != 0) { @@ -5433,13 +6019,11 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { node = rbtdbiter->deletions[i]; lock = &rbtdb->node_locks[node->locknum].lock; - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) - no_references(rbtdb, node, 0, - rbtdbiter->tree_locked); - UNLOCK(lock); + NODE_LOCK(lock, isc_rwlocktype_read); + decrement_reference(rbtdb, node, 0, + isc_rwlocktype_read, + rbtdbiter->tree_locked); + NODE_UNLOCK(lock, isc_rwlocktype_read); } rbtdbiter->delete = 0; @@ -5707,9 +6291,9 @@ dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, } else result = ISC_R_SUCCESS; - LOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); *nodep = rbtdbiter->node; @@ -5730,10 +6314,13 @@ dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, * expirenode() currently always returns success. */ if (expire_result == ISC_R_SUCCESS && node->down == NULL) { + unsigned int refs; + rbtdbiter->deletions[rbtdbiter->delete++] = node; - LOCK(&rbtdb->node_locks[node->locknum].lock); - node->references++; - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); + dns_rbtnode_refincrement(node, &refs); + INSIST(refs != 0); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); } } @@ -5775,3 +6362,356 @@ dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) { return (dns_name_copy(origin, name, NULL)); } + +/*% + * Additional cache routines. 
+ */ +static isc_result_t +rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, + dns_rdatatype_t qtype, dns_acache_t *acache, + dns_zone_t **zonep, dns_db_t **dbp, + dns_dbversion_t **versionp, dns_dbnode_t **nodep, + dns_name_t *fname, dns_message_t *msg, + isc_stdtime_t now) +{ + dns_rbtdb_t *rbtdb = rdataset->private1; + dns_rbtnode_t *rbtnode = rdataset->private2; + unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned int current_count = rdataset->privateuint4; + unsigned int count; + rdatasetheader_t *header; + nodelock_t *nodelock; + unsigned int total_count; + acachectl_t *acarray; + dns_acacheentry_t *entry; + isc_result_t result; + + UNUSED(qtype); /* we do not use this value at least for now */ + UNUSED(acache); + + header = (struct rdatasetheader *)(raw - sizeof(*header)); + + total_count = raw[0] * 256 + raw[1]; + INSIST(total_count > current_count); + count = total_count - current_count - 1; + + acarray = NULL; + + nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; + NODE_LOCK(nodelock, isc_rwlocktype_read); + + switch (type) { + case dns_rdatasetadditional_fromauth: + acarray = header->additional_auth; + break; + case dns_rdatasetadditional_fromcache: + acarray = NULL; + break; + case dns_rdatasetadditional_fromglue: + acarray = header->additional_glue; + break; + default: + INSIST(0); + } + + if (acarray == NULL) { + if (type != dns_rdatasetadditional_fromcache) + dns_acache_countquerymiss(acache); + NODE_UNLOCK(nodelock, isc_rwlocktype_read); + return (ISC_R_NOTFOUND); + } + + if (acarray[count].entry == NULL) { + dns_acache_countquerymiss(acache); + NODE_UNLOCK(nodelock, isc_rwlocktype_read); + return (ISC_R_NOTFOUND); + } + + entry = NULL; + dns_acache_attachentry(acarray[count].entry, &entry); + + NODE_UNLOCK(nodelock, isc_rwlocktype_read); + + result = dns_acache_getentry(entry, zonep, dbp, versionp, + nodep, fname, msg, now); + + dns_acache_detachentry(&entry); + + return (result); +} + +static void +acache_callback(dns_acacheentry_t *entry, void **arg) { + dns_rbtdb_t *rbtdb; + dns_rbtnode_t *rbtnode; + nodelock_t *nodelock; + acachectl_t *acarray = NULL; + acache_cbarg_t *cbarg; + unsigned int count; + + REQUIRE(arg != NULL); + cbarg = *arg; + + /* + * The caller must hold the entry lock. 
+ */ + + rbtdb = (dns_rbtdb_t *)cbarg->db; + rbtnode = (dns_rbtnode_t *)cbarg->node; + + nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; + NODE_LOCK(nodelock, isc_rwlocktype_write); + + switch (cbarg->type) { + case dns_rdatasetadditional_fromauth: + acarray = cbarg->header->additional_auth; + break; + case dns_rdatasetadditional_fromglue: + acarray = cbarg->header->additional_glue; + break; + default: + INSIST(0); + } + + count = cbarg->count; + if (acarray[count].entry == entry) + acarray[count].entry = NULL; + INSIST(acarray[count].cbarg != NULL); + isc_mem_put(rbtdb->common.mctx, acarray[count].cbarg, + sizeof(acache_cbarg_t)); + acarray[count].cbarg = NULL; + + dns_acache_detachentry(&entry); + + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + + dns_db_detachnode((dns_db_t *)rbtdb, (dns_dbnode_t **)(void*)&rbtnode); + dns_db_detach((dns_db_t **)(void*)&rbtdb); + + *arg = NULL; +} + +static void +acache_cancelentry(isc_mem_t *mctx, dns_acacheentry_t *entry, + acache_cbarg_t **cbargp) +{ + acache_cbarg_t *cbarg; + + REQUIRE(mctx != NULL); + REQUIRE(entry != NULL); + REQUIRE(cbargp != NULL && *cbargp != NULL); + + cbarg = *cbargp; + + dns_acache_cancelentry(entry); + dns_db_detachnode(cbarg->db, &cbarg->node); + dns_db_detach(&cbarg->db); + + isc_mem_put(mctx, cbarg, sizeof(acache_cbarg_t)); + + *cbargp = NULL; +} + +static isc_result_t +rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, + dns_rdatatype_t qtype, dns_acache_t *acache, + dns_zone_t *zone, dns_db_t *db, + dns_dbversion_t *version, dns_dbnode_t *node, + dns_name_t *fname) +{ + dns_rbtdb_t *rbtdb = rdataset->private1; + dns_rbtnode_t *rbtnode = rdataset->private2; + unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned int current_count = rdataset->privateuint4; + rdatasetheader_t *header; + unsigned int total_count, count; + nodelock_t *nodelock; + isc_result_t result; + acachectl_t *acarray; + dns_acacheentry_t *newentry, *oldentry = NULL; + acache_cbarg_t *newcbarg, *oldcbarg = NULL; + + UNUSED(qtype); + + if (type == dns_rdatasetadditional_fromcache) + return (ISC_R_SUCCESS); + + header = (struct rdatasetheader *)(raw - sizeof(*header)); + + total_count = raw[0] * 256 + raw[1]; + INSIST(total_count > current_count); + count = total_count - current_count - 1; /* should be private data */ + + newcbarg = isc_mem_get(rbtdb->common.mctx, sizeof(*newcbarg)); + if (newcbarg == NULL) + return (ISC_R_NOMEMORY); + newcbarg->type = type; + newcbarg->count = count; + newcbarg->header = header; + newcbarg->db = NULL; + dns_db_attach((dns_db_t *)rbtdb, &newcbarg->db); + newcbarg->node = NULL; + dns_db_attachnode((dns_db_t *)rbtdb, (dns_dbnode_t *)rbtnode, + &newcbarg->node); + newentry = NULL; + result = dns_acache_createentry(acache, (dns_db_t *)rbtdb, + acache_callback, newcbarg, &newentry); + if (result != ISC_R_SUCCESS) + goto fail; + /* Set cache data in the new entry. 
*/ + result = dns_acache_setentry(acache, newentry, zone, db, + version, node, fname); + if (result != ISC_R_SUCCESS) + goto fail; + + nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; + NODE_LOCK(nodelock, isc_rwlocktype_write); + + acarray = NULL; + switch (type) { + case dns_rdatasetadditional_fromauth: + acarray = header->additional_auth; + break; + case dns_rdatasetadditional_fromglue: + acarray = header->additional_glue; + break; + default: + INSIST(0); + } + + if (acarray == NULL) { + unsigned int i; + + acarray = isc_mem_get(rbtdb->common.mctx, total_count * + sizeof(acachectl_t)); + + if (acarray == NULL) { + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + goto fail; + } + + for (i = 0; i < total_count; i++) { + acarray[i].entry = NULL; + acarray[i].cbarg = NULL; + } + } + switch (type) { + case dns_rdatasetadditional_fromauth: + header->additional_auth = acarray; + break; + case dns_rdatasetadditional_fromglue: + header->additional_glue = acarray; + break; + default: + INSIST(0); + } + + if (acarray[count].entry != NULL) { + /* + * Swap the entry. Delay cleaning-up the old entry since + * it would require a node lock. + */ + oldentry = acarray[count].entry; + INSIST(acarray[count].cbarg != NULL); + oldcbarg = acarray[count].cbarg; + } + acarray[count].entry = newentry; + acarray[count].cbarg = newcbarg; + + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + + if (oldentry != NULL) { + if (oldcbarg != NULL) + acache_cancelentry(rbtdb->common.mctx, oldentry, + &oldcbarg); + dns_acache_detachentry(&oldentry); + } + + return (ISC_R_SUCCESS); + + fail: + if (newcbarg != NULL) { + if (newentry != NULL) { + acache_cancelentry(rbtdb->common.mctx, newentry, + &newcbarg); + dns_acache_detachentry(&newentry); + } else { + dns_db_detachnode((dns_db_t *)rbtdb, &newcbarg->node); + dns_db_detach(&newcbarg->db); + isc_mem_put(rbtdb->common.mctx, newcbarg, + sizeof(*newcbarg)); + } + } + + return (result); +} + +static isc_result_t +rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, + dns_rdatasetadditional_t type, dns_rdatatype_t qtype) +{ + dns_rbtdb_t *rbtdb = rdataset->private1; + dns_rbtnode_t *rbtnode = rdataset->private2; + unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned int current_count = rdataset->privateuint4; + rdatasetheader_t *header; + nodelock_t *nodelock; + unsigned int total_count, count; + acachectl_t *acarray; + dns_acacheentry_t *entry; + acache_cbarg_t *cbarg; + + UNUSED(qtype); /* we do not use this value at least for now */ + UNUSED(acache); + + if (type == dns_rdatasetadditional_fromcache) + return (ISC_R_SUCCESS); + + header = (struct rdatasetheader *)(raw - sizeof(*header)); + + total_count = raw[0] * 256 + raw[1]; + INSIST(total_count > current_count); + count = total_count - current_count - 1; + + acarray = NULL; + entry = NULL; + + nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; + NODE_LOCK(nodelock, isc_rwlocktype_write); + + switch (type) { + case dns_rdatasetadditional_fromauth: + acarray = header->additional_auth; + break; + case dns_rdatasetadditional_fromglue: + acarray = header->additional_glue; + break; + default: + INSIST(0); + } + + if (acarray == NULL) { + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + return (ISC_R_NOTFOUND); + } + + entry = acarray[count].entry; + if (entry == NULL) { + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + return (ISC_R_NOTFOUND); + } + + acarray[count].entry = NULL; + cbarg = acarray[count].cbarg; + acarray[count].cbarg = NULL; + + NODE_UNLOCK(nodelock, isc_rwlocktype_write); + + if 
(entry != NULL) { + if (cbarg != NULL) + acache_cancelentry(rbtdb->common.mctx, entry, &cbarg); + dns_acache_detachentry(&entry); + } + + return (ISC_R_SUCCESS); +}
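The cache lookup paths in this change (cache_find, cache_findzonecut, cache_findrdataset) all purge expired headers opportunistically: the node lock is taken as a reader, and only when a header is older than the RBTDB_VIRTUAL grace period does the code try to upgrade to a write lock; if the upgrade cannot be obtained, the header is simply skipped and, as the in-line comments note, gets cleaned up later. Below is a minimal, self-contained sketch of that control flow; the types, the tryupgrade() helper and VIRTUAL_TTL are stand-ins for illustration, not the BIND9 API.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef enum { LOCK_READ, LOCK_WRITE } locktype_t;

struct header {
	uint32_t ttl;
	bool stale;
	struct header *next;
};

struct node {
	struct header *data;
	int dirty;
	unsigned int references;
};

#define VIRTUAL_TTL 300	/* grace period, in the spirit of RBTDB_VIRTUAL */

/* Stand-in for NODE_TRYUPGRADE(): promote read -> write without blocking. */
static bool
tryupgrade(void *lock)
{
	(void)lock;
	return (true);	/* a real implementation may fail and return false */
}

/*
 * Walk the header list under a read lock; headers past the grace period
 * are unlinked (if the node is unreferenced) or flagged stale, but only
 * if write access can be obtained without blocking.
 */
static void
purge_stale(void *lock, struct node *node, uint32_t now, locktype_t *locktype)
{
	struct header *header, *next, *prev = NULL;

	for (header = node->data; header != NULL; header = next) {
		next = header->next;
		if (header->ttl + VIRTUAL_TTL <= now &&
		    (*locktype == LOCK_WRITE || tryupgrade(lock))) {
			*locktype = LOCK_WRITE;
			if (node->references == 0) {
				/* Nothing else can reach it: unlink and free. */
				if (prev != NULL)
					prev->next = next;
				else
					node->data = next;
				free(header);
			} else {
				/* Still in use: flag it for later cleanup. */
				header->stale = true;
				node->dirty = 1;
				prev = header;
			}
		} else {
			/* Too fresh, or the upgrade failed: skip it. */
			prev = header;
		}
	}
}

The upgrade is deliberately non-blocking: a failed attempt costs nothing on the lookup path, and the stale header will be caught by a later pass.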
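attachnode() and detachnode() no longer manipulate a plain node->references integer under a mutex; they go through dns_rbtnode_refincrement() and decrement_reference(), and decrement_reference() reports whether the last reference was dropped so the caller can handle the node-lock bookkeeping (the references/exiting check). A rough sketch of that reference-counting shape using C11 atomics; the names and semantics here are illustrative only, not the isc_refcount/dns_rbtnode API.

#include <stdatomic.h>
#include <stdbool.h>

typedef struct node {
	atomic_uint references;
	/* ... node payload ... */
} node_t;

/* Take a new reference; returns the count after the increment. */
static unsigned int
node_ref(node_t *node)
{
	return (atomic_fetch_add_explicit(&node->references, 1,
					  memory_order_relaxed) + 1);
}

/*
 * Drop a reference; returns true when the caller released the last one
 * and therefore owns any follow-up cleanup, much like decrement_reference()
 * returning true in the patch above.
 */
static bool
node_unref(node_t *node)
{
	unsigned int prev = atomic_fetch_sub_explicit(&node->references, 1,
						      memory_order_acq_rel);
	return (prev == 1);
}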
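Throughout the patch, per-node locking goes through rbtdb->node_locks[node->locknum].lock: the node-lock table is an array of node_lock_count buckets and each node is pinned to one bucket by its locknum, so contention is spread across buckets, and the creation path now picks a larger default bucket count for a cache DB than for a zone DB. A minimal sketch of that lock-striping idea, with an invented bucket count and hash:

#include <pthread.h>

#define NBUCKETS 17	/* illustrative; the real defaults differ for cache vs. zone */

struct stripe {
	pthread_mutex_t lock;
	/* per-bucket bookkeeping (references, exiting flag, ...) */
};

static struct stripe node_locks[NBUCKETS];

struct node {
	unsigned int locknum;	/* assigned once, when the node is created */
	/* ... */
};

static void
stripes_init(void)
{
	for (int i = 0; i < NBUCKETS; i++)
		pthread_mutex_init(&node_locks[i].lock, NULL);
}

static void
node_init(struct node *n, unsigned long hash)
{
	n->locknum = hash % NBUCKETS;
}

static void
with_node_locked(struct node *n, void (*fn)(struct node *))
{
	pthread_mutex_lock(&node_locks[n->locknum].lock);
	fn(n);
	pthread_mutex_unlock(&node_locks[n->locknum].lock);
}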
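getoriginnode() is appended to both zone_methods and cache_methods; it hands back a referenced origin node without taking the tree lock, which is safe because, as the creation code notes, the top-of-zone node can never be deleted nor change address. A hypothetical caller, assuming the usual dns_db_getoriginnode()/dns_db_detachnode() wrappers sit on top of the method table:

#include <isc/result.h>
#include <dns/db.h>

/*
 * Hypothetical caller: grab the zone apex, look at it, release it.
 * A cache DB is expected to return ISC_R_NOTFOUND here.
 */
static isc_result_t
inspect_apex(dns_db_t *db)
{
	dns_dbnode_t *node = NULL;
	isc_result_t result;

	result = dns_db_getoriginnode(db, &node);
	if (result != ISC_R_SUCCESS)
		return (result);

	/* ... examine rdatasets at the origin here ... */

	dns_db_detachnode(db, &node);
	return (ISC_R_SUCCESS);
}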
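dns_rbtdb_create() now zero-fills the structure up front and unwinds failures through cleanup labels (cleanup_node_locks, cleanup_tree_lock, cleanup_lock, cleanup_rbtdb) instead of repeating the teardown at every error check. A generic sketch of that goto-cleanup idiom, with invented resource names:

#include <stdlib.h>

struct widget {
	void *lock;
	void *table;
};

static int  lock_init(void **l)   { *l = malloc(1); return (*l != NULL ? 0 : -1); }
static void lock_destroy(void *l) { free(l); }

int
widget_create(struct widget **wp)
{
	struct widget *w;
	int result;

	w = calloc(1, sizeof(*w));	/* zero-fill up front, like the memset() */
	if (w == NULL)
		return (-1);

	result = lock_init(&w->lock);
	if (result != 0)
		goto cleanup_widget;

	w->table = malloc(64);
	if (w->table == NULL) {
		result = -1;
		goto cleanup_lock;
	}

	*wp = w;
	return (0);

	/* Unwind in reverse order of acquisition. */
 cleanup_lock:
	lock_destroy(w->lock);
 cleanup_widget:
	free(w);
	return (result);
}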
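Judging from the reworked rdataset_first()/rdataset_next()/rdataset_current(), a rdataslab now begins with a 16-bit record count, followed by a table of count 32-bit offsets (relative to the start of the slab) and then the records themselves, each prefixed by a 16-bit length and a 16-bit field the comments call the load order; DNS_RDATASETATTR_LOADORDER selects whether iteration walks the records sequentially or through the offset table. A small decoder for that assumed layout, not the dns_rdataslab implementation itself:

#include <stdio.h>

/* Big-endian 16/32-bit reads, matching raw[0] * 256 + raw[1] etc. above. */
static unsigned int
get16(const unsigned char *p)
{
	return ((p[0] << 8) | p[1]);
}

static unsigned long
get32(const unsigned char *p)
{
	return (((unsigned long)p[0] << 24) | ((unsigned long)p[1] << 16) |
		((unsigned long)p[2] << 8) | (unsigned long)p[3]);
}

/*
 * Walk every record through the offset table, the way rdataset_current()
 * does when DNS_RDATASETATTR_LOADORDER is set.  'slab' points at the
 * 16-bit record count (what private3 points at in the patch).
 */
static void
walk_slab_loadorder(const unsigned char *slab)
{
	unsigned int count = get16(slab);
	const unsigned char *offsets = slab + 2;

	for (unsigned int i = 0; i < count; i++) {
		const unsigned char *rec = slab + get32(offsets + 4 * i);
		unsigned int length = get16(rec);	/* rdata length */
		unsigned int order = get16(rec + 2);	/* load order (assumed) */
		const unsigned char *rdata = rec + 4;	/* rdata bytes */

		printf("record %u: order %u, %u bytes at %p\n",
		       i, order, length, (const void *)rdata);
	}
}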
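rdataset_setadditional() creates and fills the new acache entry before touching the node, swaps it into the per-record array slot under a write node lock, and defers cancelling and detaching the displaced entry until after the lock is dropped, since the comment notes that cleaning it up "would require a node lock". A stripped-down sketch of that swap-then-cleanup pattern using pthreads and placeholder types, not the dns_acache interfaces:

#include <pthread.h>
#include <stdlib.h>

struct entry {
	int refs;
	/* ... cached additional-section data ... */
};

struct slot {
	pthread_rwlock_t lock;
	struct entry *entry;
};

static void
entry_release(struct entry *e)
{
	free(e);	/* stands in for cancelling and detaching the old entry */
}

/*
 * Install 'newe' into the slot.  The displaced entry is remembered while
 * the write lock is held but cleaned up only after the lock is dropped,
 * mirroring the swap in rdataset_setadditional().
 */
static void
slot_set(struct slot *s, struct entry *newe)
{
	struct entry *old;

	pthread_rwlock_wrlock(&s->lock);
	old = s->entry;
	s->entry = newe;
	pthread_rwlock_unlock(&s->lock);

	if (old != NULL)
		entry_release(old);
}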