diff options
Diffstat (limited to 'contrib/bind9/lib/dns/rbtdb.c')
-rw-r--r-- | contrib/bind9/lib/dns/rbtdb.c | 2651 |
1 files changed, 2237 insertions, 414 deletions
diff --git a/contrib/bind9/lib/dns/rbtdb.c b/contrib/bind9/lib/dns/rbtdb.c index 462a718..9741c15 100644 --- a/contrib/bind9/lib/dns/rbtdb.c +++ b/contrib/bind9/lib/dns/rbtdb.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004-2008 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004-2009 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.196.18.53 2008/01/31 23:46:05 tbox Exp $ */ +/* $Id: rbtdb.c,v 1.270.12.6 2009/05/06 23:34:30 jinmei Exp $ */ /*! \file */ @@ -25,13 +25,18 @@ #include <config.h> +/* #define inline */ + #include <isc/event.h> +#include <isc/heap.h> #include <isc/mem.h> -#include <isc/print.h> #include <isc/mutex.h> +#include <isc/platform.h> +#include <isc/print.h> #include <isc/random.h> #include <isc/refcount.h> #include <isc/rwlock.h> +#include <isc/serial.h> #include <isc/string.h> #include <isc/task.h> #include <isc/time.h> @@ -45,12 +50,16 @@ #include <dns/lib.h> #include <dns/log.h> #include <dns/masterdump.h> +#include <dns/nsec.h> +#include <dns/nsec3.h> #include <dns/rbt.h> #include <dns/rdata.h> #include <dns/rdataset.h> #include <dns/rdatasetiter.h> #include <dns/rdataslab.h> +#include <dns/rdatastruct.h> #include <dns/result.h> +#include <dns/stats.h> #include <dns/view.h> #include <dns/zone.h> #include <dns/zonekey.h> @@ -62,20 +71,20 @@ #endif #ifdef DNS_RBTDB_VERSION64 -#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '8') +#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '8') #else -#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4') +#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4') #endif /*% * Note that "impmagic" is not the first four bytes of the struct, so * ISC_MAGIC_VALID cannot be used. 
*/ -#define VALID_RBTDB(rbtdb) ((rbtdb) != NULL && \ +#define VALID_RBTDB(rbtdb) ((rbtdb) != NULL && \ (rbtdb)->common.impmagic == RBTDB_MAGIC) #ifdef DNS_RBTDB_VERSION64 -typedef isc_uint64_t rbtdb_serial_t; +typedef isc_uint64_t rbtdb_serial_t; /*% * Make casting easier in symbolic debuggers by using different names * for the 64 bit version. @@ -84,17 +93,19 @@ typedef isc_uint64_t rbtdb_serial_t; #define rdatasetheader_t rdatasetheader64_t #define rbtdb_version_t rbtdb_version64_t #else -typedef isc_uint32_t rbtdb_serial_t; +typedef isc_uint32_t rbtdb_serial_t; #endif -typedef isc_uint32_t rbtdb_rdatatype_t; +typedef isc_uint32_t rbtdb_rdatatype_t; -#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF)) -#define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16)) -#define RBTDB_RDATATYPE_VALUE(b, e) (((e) << 16) | (b)) +#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF)) +#define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16)) +#define RBTDB_RDATATYPE_VALUE(b, e) ((rbtdb_rdatatype_t)((e) << 16) | (b)) #define RBTDB_RDATATYPE_SIGNSEC \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec) +#define RBTDB_RDATATYPE_SIGNSEC3 \ + RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3) #define RBTDB_RDATATYPE_SIGNS \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns) #define RBTDB_RDATATYPE_SIGCNAME \ @@ -119,15 +130,15 @@ typedef isc_uint32_t rbtdb_rdatatype_t; #endif #if DNS_RBTDB_USERWLOCK -#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) -#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) -#define RBTDB_LOCK(l, t) RWLOCK((l), (t)) -#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) +#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define RBTDB_LOCK(l, t) RWLOCK((l), (t)) +#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) #else -#define RBTDB_INITLOCK(l) isc_mutex_init(l) -#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l) -#define 
RBTDB_LOCK(l, t) LOCK(l) -#define RBTDB_UNLOCK(l, t) UNLOCK(l) +#define RBTDB_INITLOCK(l) isc_mutex_init(l) +#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l) +#define RBTDB_LOCK(l, t) LOCK(l) +#define RBTDB_UNLOCK(l, t) UNLOCK(l) #endif /* @@ -152,47 +163,53 @@ typedef isc_uint32_t rbtdb_rdatatype_t; #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT) typedef isc_rwlock_t nodelock_t; -#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) -#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) -#define NODE_LOCK(l, t) RWLOCK((l), (t)) -#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) -#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) - -#define NODE_STRONGLOCK(l) ((void)0) -#define NODE_STRONGUNLOCK(l) ((void)0) -#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t) -#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t) -#define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l) +#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define NODE_LOCK(l, t) RWLOCK((l), (t)) +#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) +#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) + +#define NODE_STRONGLOCK(l) ((void)0) +#define NODE_STRONGUNLOCK(l) ((void)0) +#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t) +#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t) +#define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l) #else typedef isc_mutex_t nodelock_t; -#define NODE_INITLOCK(l) isc_mutex_init(l) -#define NODE_DESTROYLOCK(l) DESTROYLOCK(l) -#define NODE_LOCK(l, t) LOCK(l) -#define NODE_UNLOCK(l, t) UNLOCK(l) -#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS - -#define NODE_STRONGLOCK(l) LOCK(l) -#define NODE_STRONGUNLOCK(l) UNLOCK(l) -#define NODE_WEAKLOCK(l, t) ((void)0) -#define NODE_WEAKUNLOCK(l, t) ((void)0) -#define NODE_WEAKDOWNGRADE(l) ((void)0) +#define NODE_INITLOCK(l) isc_mutex_init(l) +#define NODE_DESTROYLOCK(l) DESTROYLOCK(l) +#define NODE_LOCK(l, t) LOCK(l) +#define NODE_UNLOCK(l, t) UNLOCK(l) +#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS + +#define 
NODE_STRONGLOCK(l) LOCK(l) +#define NODE_STRONGUNLOCK(l) UNLOCK(l) +#define NODE_WEAKLOCK(l, t) ((void)0) +#define NODE_WEAKUNLOCK(l, t) ((void)0) +#define NODE_WEAKDOWNGRADE(l) ((void)0) #endif -#ifndef DNS_RDATASET_FIXED -#define DNS_RDATASET_FIXED 1 +/*% + * Whether to rate-limit updating the LRU to avoid possible thread contention. + * Our performance measurement has shown the cost is marginal, so it's defined + * to be 0 by default either with or without threads. + */ +#ifndef DNS_RBTDB_LIMITLRUUPDATE +#define DNS_RBTDB_LIMITLRUUPDATE 0 #endif /* - * Allow clients with a virtual time of upto 5 minutes in the past to see + * Allow clients with a virtual time of up to 5 minutes in the past to see * records that would have otherwise have expired. */ #define RBTDB_VIRTUAL 300 struct noqname { - dns_name_t name; - void * nsec; - void * nsecsig; + dns_name_t name; + void * neg; + void * negsig; + dns_rdatatype_t type; }; typedef struct acachectl acachectl_t; @@ -201,18 +218,19 @@ typedef struct rdatasetheader { /*% * Locked by the owning node's lock. */ - rbtdb_serial_t serial; - dns_ttl_t ttl; - rbtdb_rdatatype_t type; - isc_uint16_t attributes; - dns_trust_t trust; - struct noqname *noqname; + rbtdb_serial_t serial; + dns_ttl_t rdh_ttl; + rbtdb_rdatatype_t type; + isc_uint16_t attributes; + dns_trust_t trust; + struct noqname *noqname; + struct noqname *closest; /*%< * We don't use the LIST macros, because the LIST structure has * both head and tail pointers, and is doubly linked. */ - struct rdatasetheader *next; + struct rdatasetheader *next; /*%< * If this is the top header for an rdataset, 'next' points * to the top header for the next rdataset (i.e., the next type). @@ -220,13 +238,13 @@ typedef struct rdatasetheader { * at this header. */ - struct rdatasetheader *down; + struct rdatasetheader *down; /*%< * Points to the header for the next older version of * this rdataset. 
*/ - isc_uint32_t count; + isc_uint32_t count; /*%< * Monotonously increased every time this rdataset is bound so that * it is used as the base of the starting point in DNS responses @@ -235,27 +253,56 @@ typedef struct rdatasetheader { * performance reasons. */ - acachectl_t *additional_auth; - acachectl_t *additional_glue; + acachectl_t *additional_auth; + acachectl_t *additional_glue; + + dns_rbtnode_t *node; + isc_stdtime_t last_used; + ISC_LINK(struct rdatasetheader) lru_link; + /*%< + * Used for LRU-based cache management. We should probably make + * these cache-DB specific. We might also make it a pointer and + * ensure only the top header has a valid link to save memory. + * The linked-list is locked by the rbtdb->lrulock. + */ + + /* + * It's possible this should not be here anymore, but instead + * referenced from the bucket's heap directly. + */ +#if 0 + isc_heap_t *heap; +#endif + unsigned int heap_index; + /*%< + * Used for TTL-based cache cleaning. + */ + isc_stdtime_t resign; } rdatasetheader_t; -#define RDATASET_ATTR_NONEXISTENT 0x0001 -#define RDATASET_ATTR_STALE 0x0002 -#define RDATASET_ATTR_IGNORE 0x0004 -#define RDATASET_ATTR_RETAIN 0x0008 -#define RDATASET_ATTR_NXDOMAIN 0x0010 +typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t; +typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t; + +#define RDATASET_ATTR_NONEXISTENT 0x0001 +#define RDATASET_ATTR_STALE 0x0002 +#define RDATASET_ATTR_IGNORE 0x0004 +#define RDATASET_ATTR_RETAIN 0x0008 +#define RDATASET_ATTR_NXDOMAIN 0x0010 +#define RDATASET_ATTR_RESIGN 0x0020 +#define RDATASET_ATTR_STATCOUNT 0x0040 +#define RDATASET_ATTR_OPTOUT 0x0080 typedef struct acache_cbarg { - dns_rdatasetadditional_t type; - unsigned int count; - dns_db_t *db; - dns_dbnode_t *node; - rdatasetheader_t *header; + dns_rdatasetadditional_t type; + unsigned int count; + dns_db_t *db; + dns_dbnode_t *node; + rdatasetheader_t *header; } acache_cbarg_t; struct acachectl { - dns_acacheentry_t *entry; - acache_cbarg_t *cbarg; + 
dns_acacheentry_t *entry; + acache_cbarg_t *cbarg; }; /* @@ -266,7 +313,7 @@ struct acachectl { * expired. */ -#undef IGNORE /* WIN32 winbase.h defines this. */ +#undef IGNORE /* WIN32 winbase.h defines this. */ #define EXISTS(header) \ (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0) @@ -278,106 +325,164 @@ struct acachectl { (((header)->attributes & RDATASET_ATTR_RETAIN) != 0) #define NXDOMAIN(header) \ (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0) +#define RESIGN(header) \ + (((header)->attributes & RDATASET_ATTR_RESIGN) != 0) +#define OPTOUT(header) \ + (((header)->attributes & RDATASET_ATTR_OPTOUT) != 0) + +#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ -#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ -#define DEFAULT_CACHE_NODE_LOCK_COUNT 1009 /*%< Should be prime. */ +/*% + * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps). + * There is a tradeoff issue about configuring this value: if this is too + * small, it may cause heavier contention between threads; if this is too large, + * LRU purge algorithm won't work well (entries tend to be purged prematurely). + * The default value should work well for most environments, but this can + * also be configurable at compilation time via the + * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable. This value must be larger than + * 1 due to the assumption of overmem_purge(). + */ +#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT +#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 +#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1" +#else +#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT +#endif +#else +#define DEFAULT_CACHE_NODE_LOCK_COUNT 16 +#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */ typedef struct { - nodelock_t lock; + nodelock_t lock; /* Protected in the refcount routines. */ - isc_refcount_t references; + isc_refcount_t references; /* Locked by lock. 
*/ - isc_boolean_t exiting; + isc_boolean_t exiting; } rbtdb_nodelock_t; typedef struct rbtdb_changed { - dns_rbtnode_t * node; - isc_boolean_t dirty; - ISC_LINK(struct rbtdb_changed) link; + dns_rbtnode_t * node; + isc_boolean_t dirty; + ISC_LINK(struct rbtdb_changed) link; } rbtdb_changed_t; -typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; +typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; + +typedef enum { + dns_db_insecure, + dns_db_partial, + dns_db_secure +} dns_db_secure_t; typedef struct rbtdb_version { /* Not locked */ - rbtdb_serial_t serial; + rbtdb_serial_t serial; /* * Protected in the refcount routines. * XXXJT: should we change the lock policy based on the refcount * performance? */ - isc_refcount_t references; + isc_refcount_t references; /* Locked by database lock. */ - isc_boolean_t writer; - isc_boolean_t commit_ok; - rbtdb_changedlist_t changed_list; - ISC_LINK(struct rbtdb_version) link; + isc_boolean_t writer; + isc_boolean_t commit_ok; + rbtdb_changedlist_t changed_list; + rdatasetheaderlist_t resigned_list; + ISC_LINK(struct rbtdb_version) link; + dns_db_secure_t secure; + isc_boolean_t havensec3; + /* NSEC3 parameters */ + dns_hash_t hash; + isc_uint8_t flags; + isc_uint16_t iterations; + isc_uint8_t salt_length; + unsigned char salt[NSEC3_MAX_HASH_LENGTH]; } rbtdb_version_t; -typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; +typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; typedef struct { /* Unlocked. */ - dns_db_t common; + dns_db_t common; #if DNS_RBTDB_USERWLOCK - isc_rwlock_t lock; + isc_rwlock_t lock; #else - isc_mutex_t lock; + isc_mutex_t lock; #endif - isc_rwlock_t tree_lock; - unsigned int node_lock_count; - rbtdb_nodelock_t * node_locks; - dns_rbtnode_t * origin_node; + isc_rwlock_t tree_lock; + unsigned int node_lock_count; + rbtdb_nodelock_t * node_locks; + dns_rbtnode_t * origin_node; + dns_stats_t * rrsetstats; /* cache DB only */ /* Locked by lock. 
*/ - unsigned int active; - isc_refcount_t references; - unsigned int attributes; - rbtdb_serial_t current_serial; - rbtdb_serial_t least_serial; - rbtdb_serial_t next_serial; - rbtdb_version_t * current_version; - rbtdb_version_t * future_version; - rbtdb_versionlist_t open_versions; - isc_boolean_t overmem; - isc_task_t * task; - dns_dbnode_t *soanode; - dns_dbnode_t *nsnode; + unsigned int active; + isc_refcount_t references; + unsigned int attributes; + rbtdb_serial_t current_serial; + rbtdb_serial_t least_serial; + rbtdb_serial_t next_serial; + rbtdb_version_t * current_version; + rbtdb_version_t * future_version; + rbtdb_versionlist_t open_versions; + isc_boolean_t overmem; + isc_task_t * task; + dns_dbnode_t *soanode; + dns_dbnode_t *nsnode; + + /* + * This is a linked list used to implement the LRU cache. There will + * be node_lock_count linked lists here. Nodes in bucket 1 will be + * placed on the linked list rdatasets[1]. + */ + rdatasetheaderlist_t *rdatasets; + + /*% + * Temporary storage for stale cache nodes and dynamically deleted + * nodes that await being cleaned up. + */ + rbtnodelist_t *deadnodes; + + /* + * Heaps. Each of these is used for TTL based expiry. + */ + isc_heap_t **heaps; + /* Locked by tree_lock. 
*/ - dns_rbt_t * tree; - isc_boolean_t secure; + dns_rbt_t * tree; + dns_rbt_t * nsec3; /* Unlocked */ - unsigned int quantum; + unsigned int quantum; } dns_rbtdb_t; -#define RBTDB_ATTR_LOADED 0x01 -#define RBTDB_ATTR_LOADING 0x02 +#define RBTDB_ATTR_LOADED 0x01 +#define RBTDB_ATTR_LOADING 0x02 /*% * Search Context */ typedef struct { - dns_rbtdb_t * rbtdb; - rbtdb_version_t * rbtversion; - rbtdb_serial_t serial; - unsigned int options; - dns_rbtnodechain_t chain; - isc_boolean_t copy_name; - isc_boolean_t need_cleanup; - isc_boolean_t wild; - dns_rbtnode_t * zonecut; - rdatasetheader_t * zonecut_rdataset; - rdatasetheader_t * zonecut_sigrdataset; - dns_fixedname_t zonecut_name; - isc_stdtime_t now; + dns_rbtdb_t * rbtdb; + rbtdb_version_t * rbtversion; + rbtdb_serial_t serial; + unsigned int options; + dns_rbtnodechain_t chain; + isc_boolean_t copy_name; + isc_boolean_t need_cleanup; + isc_boolean_t wild; + dns_rbtnode_t * zonecut; + rdatasetheader_t * zonecut_rdataset; + rdatasetheader_t * zonecut_sigrdataset; + dns_fixedname_t zonecut_name; + isc_stdtime_t now; } rbtdb_search_t; /*% * Load Context */ typedef struct { - dns_rbtdb_t * rbtdb; - isc_stdtime_t now; + dns_rbtdb_t * rbtdb; + isc_stdtime_t now; } rbtdb_load_t; static void rdataset_disassociate(dns_rdataset_t *rdataset); @@ -388,8 +493,12 @@ static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target); static unsigned int rdataset_count(dns_rdataset_t *rdataset); static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name, - dns_rdataset_t *nsec, - dns_rdataset_t *nsecsig); + dns_rdataset_t *neg, + dns_rdataset_t *negsig); +static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset, + dns_name_t *name, + dns_rdataset_t *neg, + dns_rdataset_t *negsig); static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, dns_rdatatype_t qtype, @@ -414,6 +523,17 @@ static isc_result_t rdataset_putadditional(dns_acache_t *acache, 
dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, dns_rdatatype_t qtype); +static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header, + isc_stdtime_t now); +static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_stdtime_t now); +static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_boolean_t tree_locked); +static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, + isc_stdtime_t now, isc_boolean_t tree_locked); +static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx, + rdatasetheader_t *newheader); +static void prune_tree(isc_task_t *task, isc_event_t *event); static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate, @@ -424,6 +544,8 @@ static dns_rdatasetmethods_t rdataset_methods = { rdataset_count, NULL, rdataset_getnoqname, + NULL, + rdataset_getclosest, rdataset_getadditional, rdataset_setadditional, rdataset_putadditional @@ -443,22 +565,22 @@ static dns_rdatasetitermethods_t rdatasetiter_methods = { }; typedef struct rbtdb_rdatasetiter { - dns_rdatasetiter_t common; - rdatasetheader_t * current; + dns_rdatasetiter_t common; + rdatasetheader_t * current; } rbtdb_rdatasetiter_t; -static void dbiterator_destroy(dns_dbiterator_t **iteratorp); -static isc_result_t dbiterator_first(dns_dbiterator_t *iterator); -static isc_result_t dbiterator_last(dns_dbiterator_t *iterator); -static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator, +static void dbiterator_destroy(dns_dbiterator_t **iteratorp); +static isc_result_t dbiterator_first(dns_dbiterator_t *iterator); +static isc_result_t dbiterator_last(dns_dbiterator_t *iterator); +static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name); -static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator); -static isc_result_t dbiterator_next(dns_dbiterator_t *iterator); -static isc_result_t dbiterator_current(dns_dbiterator_t *iterator, +static isc_result_t 
dbiterator_prev(dns_dbiterator_t *iterator); +static isc_result_t dbiterator_next(dns_dbiterator_t *iterator); +static isc_result_t dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, dns_name_t *name); -static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator); -static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator, +static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator); +static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name); static dns_dbiteratormethods_t dbiterator_methods = { @@ -479,17 +601,21 @@ static dns_dbiteratormethods_t dbiterator_methods = { * If 'paused' is ISC_TRUE, then the tree lock is not being held. */ typedef struct rbtdb_dbiterator { - dns_dbiterator_t common; - isc_boolean_t paused; - isc_boolean_t new_origin; - isc_rwlocktype_t tree_locked; - isc_result_t result; - dns_fixedname_t name; - dns_fixedname_t origin; - dns_rbtnodechain_t chain; - dns_rbtnode_t *node; - dns_rbtnode_t *deletions[DELETION_BATCH_MAX]; - int delete; + dns_dbiterator_t common; + isc_boolean_t paused; + isc_boolean_t new_origin; + isc_rwlocktype_t tree_locked; + isc_result_t result; + dns_fixedname_t name; + dns_fixedname_t origin; + dns_rbtnodechain_t chain; + dns_rbtnodechain_t nsec3chain; + dns_rbtnodechain_t *current; + dns_rbtnode_t *node; + dns_rbtnode_t *deletions[DELETION_BATCH_MAX]; + int delete; + isc_boolean_t nsec3only; + isc_boolean_t nonsec3; } rbtdb_dbiterator_t; @@ -498,17 +624,20 @@ typedef struct rbtdb_dbiterator { static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event); +static void overmem(dns_db_t *db, isc_boolean_t overmem); +static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version, + isc_boolean_t *nsec3createflag); /*% * 'init_count' is used to initialize 'newheader->count' which inturn * is used to determine where in the cycle rrset-order cyclic starts. - * We don't lock this as we don't care about simultanious updates. 
+ * We don't lock this as we don't care about simultaneous updates. * * Note: - * Both init_count and header->count can be ISC_UINT32_MAX. + * Both init_count and header->count can be ISC_UINT32_MAX. * The count on the returned rdataset however can't be as - * that indicates that the database does not implement cyclic - * processing. + * that indicates that the database does not implement cyclic + * processing. */ static unsigned int init_count; @@ -518,12 +647,12 @@ static unsigned int init_count; * If a routine is going to lock more than one lock in this module, then * the locking must be done in the following order: * - * Tree Lock + * Tree Lock * - * Node Lock (Only one from the set may be locked at one time by - * any caller) + * Node Lock (Only one from the set may be locked at one time by + * any caller) * - * Database Lock + * Database Lock * * Failure to follow this hierarchy can result in deadlock. */ @@ -531,11 +660,7 @@ static unsigned int init_count; /* * Deleting Nodes * - * Currently there is no deletion of nodes from the database, except when - * the database is being destroyed. - * - * If node deletion is added in the future, then for zone databases the node - * for the origin of the zone MUST NOT be deleted. + * For zone databases the node for the origin of the zone MUST NOT be deleted. 
*/ @@ -563,6 +688,96 @@ free_rbtdb_callback(isc_task_t *task, isc_event_t *event) { free_rbtdb(rbtdb, ISC_TRUE, event); } +static void +update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_boolean_t increment) +{ + dns_rdatastatstype_t statattributes = 0; + dns_rdatastatstype_t base = 0; + dns_rdatastatstype_t type; + + /* At the moment we count statistics only for cache DB */ + INSIST(IS_CACHE(rbtdb)); + + if (NXDOMAIN(header)) + statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN; + else if (RBTDB_RDATATYPE_BASE(header->type) == 0) { + statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET; + base = RBTDB_RDATATYPE_EXT(header->type); + } else + base = RBTDB_RDATATYPE_BASE(header->type); + + type = DNS_RDATASTATSTYPE_VALUE(base, statattributes); + if (increment) + dns_rdatasetstats_increment(rbtdb->rrsetstats, type); + else + dns_rdatasetstats_decrement(rbtdb->rrsetstats, type); +} + +static void +set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) { + int idx; + isc_heap_t *heap; + dns_ttl_t oldttl; + + oldttl = header->rdh_ttl; + header->rdh_ttl = newttl; + + if (!IS_CACHE(rbtdb)) + return; + + /* + * It's possible the rbtdb is not a cache. If this is the case, + * we will not have a heap, and we move on. If we do, though, + * we might need to adjust things. + */ + if (header->heap_index == 0 || newttl == oldttl) + return; + idx = header->node->locknum; + if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) + return; + heap = rbtdb->heaps[idx]; + + if (newttl < oldttl) + isc_heap_increased(heap, header->heap_index); + else + isc_heap_decreased(heap, header->heap_index); +} + +/*% + * These functions allow the heap code to rank the priority of each + * element. It returns ISC_TRUE if v1 happens "sooner" than v2. 
+ */ +static isc_boolean_t +ttl_sooner(void *v1, void *v2) { + rdatasetheader_t *h1 = v1; + rdatasetheader_t *h2 = v2; + + if (h1->rdh_ttl < h2->rdh_ttl) + return (ISC_TRUE); + return (ISC_FALSE); +} + +static isc_boolean_t +resign_sooner(void *v1, void *v2) { + rdatasetheader_t *h1 = v1; + rdatasetheader_t *h2 = v2; + + if (h1->resign < h2->resign) + return (ISC_TRUE); + return (ISC_FALSE); +} + +/*% + * This function sets the heap index into the header. + */ +static void +set_index(void *what, unsigned int index) { + rdatasetheader_t *h = what; + + h->heap_index = index; +} + /*% * Work out how many nodes can be deleted in the time between two * requests to the nameserver. Smooth the resulting number and use it @@ -571,7 +786,7 @@ free_rbtdb_callback(isc_task_t *task, isc_event_t *event) { */ static unsigned int adjust_quantum(unsigned int old, isc_time_t *start) { - unsigned int pps = dns_pps; /* packets per second */ + unsigned int pps = dns_pps; /* packets per second */ unsigned int interval; isc_uint64_t usecs; isc_time_t end; @@ -581,7 +796,7 @@ adjust_quantum(unsigned int old, isc_time_t *start) { pps = 100; isc_time_now(&end); - interval = 1000000 / pps; /* interval in usec */ + interval = 1000000 / pps; /* interval in usec */ if (interval == 0) interval = 1; usecs = isc_time_microdiff(&end, start); @@ -619,6 +834,9 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { char buf[DNS_NAME_FORMATSIZE]; isc_time_t start; + if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) + overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1); + REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions)); REQUIRE(rbtdb->future_version == NULL); @@ -633,6 +851,21 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { isc_mem_put(rbtdb->common.mctx, rbtdb->current_version, sizeof(rbtdb_version_t)); } + + /* + * We assume the number of remaining dead nodes is reasonably small; + * the overhead of unlinking all nodes 
here should be negligible. + */ + for (i = 0; i < rbtdb->node_lock_count; i++) { + dns_rbtnode_t *node; + + node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); + while (node != NULL) { + ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink); + node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); + } + } + if (event == NULL) rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0; again: @@ -658,6 +891,30 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { } INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL); } + + if (rbtdb->nsec3 != NULL) { + isc_time_now(&start); + result = dns_rbt_destroy2(&rbtdb->nsec3, rbtdb->quantum); + if (result == ISC_R_QUOTA) { + INSIST(rbtdb->task != NULL); + if (rbtdb->quantum != 0) + rbtdb->quantum = adjust_quantum(rbtdb->quantum, + &start); + if (event == NULL) + event = isc_event_allocate(rbtdb->common.mctx, + NULL, + DNS_EVENT_FREESTORAGE, + free_rbtdb_callback, + rbtdb, + sizeof(isc_event_t)); + if (event == NULL) + goto again; + isc_task_send(rbtdb->task, &event); + return; + } + INSIST(result == ISC_R_SUCCESS && rbtdb->nsec3 == NULL); + } + if (event != NULL) isc_event_free(&event); if (log) { @@ -676,12 +933,47 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { isc_refcount_destroy(&rbtdb->node_locks[i].references); NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); } + + /* + * Clean up LRU / re-signing order lists. + */ + if (rbtdb->rdatasets != NULL) { + for (i = 0; i < rbtdb->node_lock_count; i++) + INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i])); + isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets, + rbtdb->node_lock_count * + sizeof(rdatasetheaderlist_t)); + } + /* + * Clean up dead node buckets. + */ + if (rbtdb->deadnodes != NULL) { + for (i = 0; i < rbtdb->node_lock_count; i++) + INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i])); + isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes, + rbtdb->node_lock_count * sizeof(rbtnodelist_t)); + } + /* + * Clean up heap objects. 
+ */ + if (rbtdb->heaps != NULL) { + for (i = 0; i < rbtdb->node_lock_count; i++) + isc_heap_destroy(&rbtdb->heaps[i]); + isc_mem_put(rbtdb->common.mctx, rbtdb->heaps, + rbtdb->node_lock_count * + sizeof(isc_heap_t *)); + } + + if (rbtdb->rrsetstats != NULL) + dns_stats_detach(&rbtdb->rrsetstats); + isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); isc_rwlock_destroy(&rbtdb->tree_lock); isc_refcount_destroy(&rbtdb->references); if (rbtdb->task != NULL) isc_task_detach(&rbtdb->task); + RBTDB_DESTROYLOCK(&rbtdb->lock); rbtdb->common.magic = 0; rbtdb->common.impmagic = 0; @@ -788,6 +1080,7 @@ allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial, version->writer = writer; version->commit_ok = ISC_FALSE; ISC_LIST_INIT(version->changed_list); + ISC_LIST_INIT(version->resigned_list); ISC_LINK_INIT(version, link); return (version); @@ -803,11 +1096,29 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { REQUIRE(rbtdb->future_version == NULL); RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); - RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */ + RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? 
*/ version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1, ISC_TRUE); if (version != NULL) { version->commit_ok = ISC_TRUE; + version->secure = rbtdb->current_version->secure; + version->havensec3 = rbtdb->current_version->havensec3; + if (version->havensec3) { + version->flags = rbtdb->current_version->flags; + version->iterations = + rbtdb->current_version->iterations; + version->hash = rbtdb->current_version->hash; + version->salt_length = + rbtdb->current_version->salt_length; + memcpy(version->salt, rbtdb->current_version->salt, + version->salt_length); + } else { + version->flags = 0; + version->iterations = 0; + version->hash = 0; + version->salt_length = 0; + memset(version->salt, 0, sizeof(version->salt)); + } rbtdb->next_serial++; rbtdb->future_version = version; } @@ -875,7 +1186,7 @@ free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header, { unsigned int count; unsigned int i; - unsigned char *raw; /* RDATASLAB */ + unsigned char *raw; /* RDATASLAB */ /* * The caller must be holding the corresponding node lock. 
@@ -903,22 +1214,69 @@ free_noqname(isc_mem_t *mctx, struct noqname **noqname) { if (dns_name_dynamic(&(*noqname)->name)) dns_name_free(&(*noqname)->name, mctx); - if ((*noqname)->nsec != NULL) - isc_mem_put(mctx, (*noqname)->nsec, - dns_rdataslab_size((*noqname)->nsec, 0)); - if ((*noqname)->nsecsig != NULL) - isc_mem_put(mctx, (*noqname)->nsecsig, - dns_rdataslab_size((*noqname)->nsecsig, 0)); + if ((*noqname)->neg != NULL) + isc_mem_put(mctx, (*noqname)->neg, + dns_rdataslab_size((*noqname)->neg, 0)); + if ((*noqname)->negsig != NULL) + isc_mem_put(mctx, (*noqname)->negsig, + dns_rdataslab_size((*noqname)->negsig, 0)); isc_mem_put(mctx, *noqname, sizeof(**noqname)); *noqname = NULL; } static inline void -free_rdataset(isc_mem_t *mctx, rdatasetheader_t *rdataset) { +init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h) +{ + ISC_LINK_INIT(h, lru_link); + h->heap_index = 0; + +#if TRACE_HEADER + if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) + fprintf(stderr, "initialized header: %p\n", h); +#else + UNUSED(rbtdb); +#endif +} + +static inline rdatasetheader_t * +new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx) +{ + rdatasetheader_t *h; + + h = isc_mem_get(mctx, sizeof(*h)); + if (h == NULL) + return (NULL); + +#if TRACE_HEADER + if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) + fprintf(stderr, "allocated header: %p\n", h); +#endif + init_rdataset(rbtdb, h); + return (h); +} + +static inline void +free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset) +{ unsigned int size; + int idx; + + if (EXISTS(rdataset) && + (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) { + update_rrsetstats(rbtdb, rdataset, ISC_FALSE); + } + + idx = rdataset->node->locknum; + if (ISC_LINK_LINKED(rdataset, lru_link)) + ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link); + if (rdataset->heap_index != 0) + isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index); + rdataset->heap_index = 0; if (rdataset->noqname != 
NULL) free_noqname(mctx, &rdataset->noqname); + if (rdataset->closest != NULL) + free_noqname(mctx, &rdataset->closest); free_acachearray(mctx, rdataset, rdataset->additional_auth); free_acachearray(mctx, rdataset, rdataset->additional_glue); @@ -964,12 +1322,13 @@ rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) { } static inline void -clean_stale_headers(isc_mem_t *mctx, rdatasetheader_t *top) { +clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top) +{ rdatasetheader_t *d, *down_next; for (d = top->down; d != NULL; d = down_next) { down_next = d->down; - free_rdataset(mctx, d); + free_rdataset(rbtdb, mctx, d); } top->down = NULL; } @@ -986,7 +1345,7 @@ clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { top_prev = NULL; for (current = node->data; current != NULL; current = top_next) { top_next = current->next; - clean_stale_headers(mctx, current); + clean_stale_headers(rbtdb, mctx, current); /* * If current is nonexistent or stale, we can clean it up. */ @@ -996,7 +1355,7 @@ clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { top_prev->next = current->next; else node->data = current->next; - free_rdataset(mctx, current); + free_rdataset(rbtdb, mctx, current); } else top_prev = current; } @@ -1037,7 +1396,7 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (down_next != NULL) down_next->next = dparent; dparent->down = down_next; - free_rdataset(mctx, dcurrent); + free_rdataset(rbtdb, mctx, dcurrent); } else dparent = dcurrent; } @@ -1053,7 +1412,7 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, top_prev->next = current->next; else node->data = current->next; - free_rdataset(mctx, current); + free_rdataset(rbtdb, mctx, current); /* * current no longer exists, so we can * just continue with the loop. 
@@ -1069,7 +1428,7 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, else node->data = down_next; down_next->next = top_next; - free_rdataset(mctx, current); + free_rdataset(rbtdb, mctx, current); current = down_next; } } @@ -1096,7 +1455,7 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, do { down_next = dcurrent->down; INSIST(dcurrent->serial <= least_serial); - free_rdataset(mctx, dcurrent); + free_rdataset(rbtdb, mctx, dcurrent); dcurrent = down_next; } while (dcurrent != NULL); dparent->down = NULL; @@ -1120,7 +1479,7 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, top_prev->next = current->next; else node->data = current->next; - free_rdataset(mctx, current); + free_rdataset(rbtdb, mctx, current); } else top_prev = current; } @@ -1129,6 +1488,49 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, node->dirty = 0; } +/*% + * Clean up dead nodes. These are nodes which have no references, and + * have no data. They are dead but we could not or chose not to delete + * them when we deleted all the data at that node because we did not want + * to wait for the tree write lock. + * + * The caller must hold a tree write lock and bucketnum'th node (write) lock. + */ +static void +cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) { + dns_rbtnode_t *node; + isc_result_t result; + int count = 10; /* XXXJT: should be adjustable */ + + node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]); + while (node != NULL && count > 0) { + ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink); + + /* + * Since we're holding a tree write lock, it should be + * impossible for this node to be referenced by others. 
+ */ + INSIST(dns_rbtnode_refcurrent(node) == 0 && + node->data == NULL); + + INSIST(!ISC_LINK_LINKED(node, deadlink)); + if (node->nsec3) + result = dns_rbt_deletenode(rbtdb->nsec3, node, + ISC_FALSE); + else + result = dns_rbt_deletenode(rbtdb->tree, node, + ISC_FALSE); + if (result != ISC_R_SUCCESS) + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, + "cleanup_dead_nodes: " + "dns_rbt_deletenode: %s", + isc_result_totext(result)); + node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]); + count--; + } +} + /* * Caller must be holding the node lock if its reference must be protected * by the lock. @@ -1139,7 +1541,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { isc_refcount_t *lockref; dns_rbtnode_refincrement0(node, &noderefs); - if (noderefs == 1) { /* this is the first reference to the node */ + if (noderefs == 1) { /* this is the first reference to the node */ lockref = &rbtdb->node_locks[node->locknum].references; isc_refcount_increment0(lockref, &lockrefs); INSIST(lockrefs != 0); @@ -1148,6 +1550,49 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { } /* + * This function is assumed to be called when a node is newly referenced + * and can be in the deadnode list. In that case the node must be retrieved + * from the list because it is going to be used. In addition, if the caller + * happens to hold a write lock on the tree, it's a good chance to purge dead + * nodes. + * Note: while a new reference is gained in multiple places, there are only very + * few cases where the node can be in the deadnode list (only empty nodes can + * have been added to the list). 
+ */ +static inline void +reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, + isc_rwlocktype_t treelocktype) +{ + isc_boolean_t need_relock = ISC_FALSE; + + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); + new_reference(rbtdb, node); + + NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_read); + if (ISC_LINK_LINKED(node, deadlink)) + need_relock = ISC_TRUE; + else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) && + treelocktype == isc_rwlocktype_write) + need_relock = ISC_TRUE; + NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_read); + if (need_relock) { + NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_write); + if (ISC_LINK_LINKED(node, deadlink)) + ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], + node, deadlink); + if (treelocktype == isc_rwlocktype_write) + cleanup_dead_nodes(rbtdb, node->locknum); + NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_write); + } + + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); +} + +/* * Caller must be holding the node lock; either the "strong", read or write * lock. Note that the lock must be held even when node references are * atomically modified; in that case the decrement operation itself does not @@ -1160,14 +1605,17 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { static isc_boolean_t decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rbtdb_serial_t least_serial, - isc_rwlocktype_t nlock, isc_rwlocktype_t tlock) + isc_rwlocktype_t nlock, isc_rwlocktype_t tlock, + isc_boolean_t pruning) { isc_result_t result; isc_boolean_t write_locked; rbtdb_nodelock_t *nodelock; unsigned int refs, nrefs; + int bucket = node->locknum; + isc_boolean_t no_reference; - nodelock = &rbtdb->node_locks[node->locknum]; + nodelock = &rbtdb->node_locks[bucket]; /* Handle easy and typical case first. 
*/ if (!node->dirty && (node->data != NULL || node->down != NULL)) { @@ -1226,7 +1674,9 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, } /* - * XXXDCL need to add a deferred delete method for ISC_R_LOCKBUSY. + * Attempt to switch to a write lock on the tree. If this fails, + * we will add this node to a linked list of nodes in this locking + * bucket which we will free later. */ if (tlock != isc_rwlocktype_write) { /* @@ -1246,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, } else write_locked = ISC_TRUE; + no_reference = ISC_TRUE; if (write_locked && dns_rbtnode_refcurrent(node) == 0) { /* * We can now delete the node if the reference counter is @@ -1254,26 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * current thread locks the tree (e.g., in findnode()). */ - if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { - char printname[DNS_NAME_FORMATSIZE]; + /* + * If this node is the only one in the level it's in, deleting + * this node may recursively make its parent the only node in + * the parent level; if so, and if no one is currently using + * the parent node, this is almost the only opportunity to + * clean it up. But the recursive cleanup is not that trivial + * since the child and parent may be in different lock buckets, + * which would cause a lock order reversal problem. To avoid + * the trouble, we'll dispatch a separate event for batch + * cleaning. We need to check whether we're deleting the node + * as a result of pruning to avoid infinite dispatching. + * Note: pruning happens only when a task has been set for the + * rbtdb. If the user of the rbtdb chooses not to set a task, + * it's their responsibility to purge stale leaves (e.g. by + * periodic walk-through). 
+ */ + if (!pruning && node->parent != NULL && + node->parent->down == node && node->left == NULL && + node->right == NULL && rbtdb->task != NULL) { + isc_event_t *ev; + dns_db_t *db; + + ev = isc_event_allocate(rbtdb->common.mctx, NULL, + DNS_EVENT_RBTPRUNE, + prune_tree, node, + sizeof(isc_event_t)); + if (ev != NULL) { + new_reference(rbtdb, node); + db = NULL; + attach((dns_db_t *)rbtdb, &db); + ev->ev_sender = db; + isc_task_send(rbtdb->task, &ev); + no_reference = ISC_FALSE; + } else { + /* + * XXX: this is a weird situation. We could + * ignore this error case, but then the stale + * node will unlikely be purged except via a + * rare condition such as manual cleanup. So + * we queue it in the deadnodes list, hoping + * the memory shortage is temporary and the node + * will be deleted later. + */ + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_INFO, + "decrement_reference: failed to " + "allocate pruning event"); + INSIST(!ISC_LINK_LINKED(node, deadlink)); + ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, + deadlink); + } + } else { + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { + char printname[DNS_NAME_FORMATSIZE]; + + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_DEBUG(1), + "decrement_reference: " + "delete from rbt: %p %s", + node, + dns_rbt_formatnodename(node, + printname, + sizeof(printname))); + } - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), - "decrement_reference: " - "delete from rbt: %p %s", - node, - dns_rbt_formatnodename(node, printname, - sizeof(printname))); + INSIST(!ISC_LINK_LINKED(node, deadlink)); + if (node->nsec3) + result = dns_rbt_deletenode(rbtdb->nsec3, node, + ISC_FALSE); + else + result = dns_rbt_deletenode(rbtdb->tree, node, + ISC_FALSE); + if (result != ISC_R_SUCCESS) { + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_WARNING, + "decrement_reference: " + 
"dns_rbt_deletenode: %s", + isc_result_totext(result)); + } } - - result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE); - if (result != ISC_R_SUCCESS) - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, - "decrement_reference: " - "dns_rbt_deletenode: %s", - isc_result_totext(result)); - } + } else if (dns_rbtnode_refcurrent(node) == 0) { + INSIST(!ISC_LINK_LINKED(node, deadlink)); + ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink); + } else + no_reference = ISC_FALSE; /* Restore the lock? */ if (nlock == isc_rwlocktype_read) @@ -1290,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (write_locked) isc_rwlock_downgrade(&rbtdb->tree_lock); - return (ISC_TRUE); + return (no_reference); +} + +/* + * Prune the tree by recursively cleaning-up single leaves. In the worst + * case, the number of iteration is the number of tree levels, which is at + * most the maximum number of domain name labels, i.e, 127. In practice, this + * should be much smaller (only a few times), and even the worst case would be + * acceptable for a single event. + */ +static void +prune_tree(isc_task_t *task, isc_event_t *event) { + dns_rbtdb_t *rbtdb = event->ev_sender; + dns_rbtnode_t *node = event->ev_arg; + dns_rbtnode_t *parent; + unsigned int locknum; + + UNUSED(task); + + isc_event_free(&event); + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + locknum = node->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + do { + parent = node->parent; + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, ISC_TRUE); + + if (parent != NULL && parent->down == NULL) { + /* + * node was the only down child of the parent and has + * just been removed. We'll then need to examine the + * parent. Keep the lock if possible; otherwise, + * release the old lock and acquire one for the parent. 
+ */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + locknum = parent->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } + + /* + * We need to gain a reference to the node before + * decrementing it in the next iteration. In addition, + * if the node is in the dead-nodes list, extract it + * from the list beforehand as we do in + * reactivate_node(). + */ + new_reference(rbtdb, parent); + if (ISC_LINK_LINKED(parent, deadlink)) { + ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], + parent, deadlink); + } + } else + parent = NULL; + + node = parent; + } while (node != NULL); + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + + detach((dns_db_t **)&rbtdb); } static inline void @@ -1337,17 +1923,20 @@ cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) { } } -static isc_boolean_t -iszonesecure(dns_db_t *db, dns_dbnode_t *origin) { +static void +iszonesecure(dns_db_t *db, rbtdb_version_t *version, dns_dbnode_t *origin) { dns_rdataset_t keyset; dns_rdataset_t nsecset, signsecset; + dns_rdata_t rdata = DNS_RDATA_INIT; isc_boolean_t haszonekey = ISC_FALSE; isc_boolean_t hasnsec = ISC_FALSE; + isc_boolean_t hasoptbit = ISC_FALSE; + isc_boolean_t nsec3createflag = ISC_FALSE; isc_result_t result; dns_rdataset_init(&keyset); - result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0, - 0, &keyset, NULL); + result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey, + 0, 0, &keyset, NULL); if (result == ISC_R_SUCCESS) { dns_rdata_t keyrdata = DNS_RDATA_INIT; result = dns_rdataset_first(&keyset); @@ -1361,21 +1950,153 @@ iszonesecure(dns_db_t *db, dns_dbnode_t *origin) { } dns_rdataset_disassociate(&keyset); } - if (!haszonekey) - return (ISC_FALSE); + if (!haszonekey) { + version->secure = dns_db_insecure; + version->havensec3 = ISC_FALSE; + return; + } 
dns_rdataset_init(&nsecset); dns_rdataset_init(&signsecset); - result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0, - 0, &nsecset, &signsecset); + result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, + 0, 0, &nsecset, &signsecset); if (result == ISC_R_SUCCESS) { if (dns_rdataset_isassociated(&signsecset)) { hasnsec = ISC_TRUE; + result = dns_rdataset_first(&nsecset); + if (result == ISC_R_SUCCESS) { + dns_rdataset_current(&nsecset, &rdata); + hasoptbit = dns_nsec_typepresent(&rdata, + dns_rdatatype_opt); + } dns_rdataset_disassociate(&signsecset); } dns_rdataset_disassociate(&nsecset); } - return (hasnsec); + + setnsec3parameters(db, version, &nsec3createflag); + + /* + * Do we have a valid NSEC/NSEC3 chain? + */ + if (version->havensec3 || (hasnsec && !hasoptbit)) + version->secure = dns_db_secure; + /* + * Do we have a NSEC/NSEC3 chain under creation? + */ + else if (hasoptbit || nsec3createflag) + version->secure = dns_db_partial; + else + version->secure = dns_db_insecure; +} + +/*%< + * Walk the origin node looking for NSEC3PARAM records. + * Cache the nsec3 parameters. 
+ */ +static void +setnsec3parameters(dns_db_t *db, rbtdb_version_t *version, + isc_boolean_t *nsec3createflag) +{ + dns_rbtnode_t *node; + dns_rdata_nsec3param_t nsec3param; + dns_rdata_t rdata = DNS_RDATA_INIT; + isc_region_t region; + isc_result_t result; + rdatasetheader_t *header, *header_next; + unsigned char *raw; /* RDATASLAB */ + unsigned int count, length; + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); + version->havensec3 = ISC_FALSE; + node = rbtdb->origin_node; + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); + for (header = node->data; + header != NULL; + header = header_next) { + header_next = header->next; + do { + if (header->serial <= version->serial && + !IGNORE(header)) { + if (NONEXISTENT(header)) + header = NULL; + break; + } else + header = header->down; + } while (header != NULL); + + if (header != NULL && + header->type == dns_rdatatype_nsec3param) { + /* + * Find A NSEC3PARAM with a supported algorithm. 
+ */ + raw = (unsigned char *)header + sizeof(*header); + count = raw[0] * 256 + raw[1]; /* count */ +#if DNS_RDATASET_FIXED + raw += count * 4 + 2; +#else + raw += 2; +#endif + while (count-- > 0U) { + length = raw[0] * 256 + raw[1]; +#if DNS_RDATASET_FIXED + raw += 4; +#else + raw += 2; +#endif + region.base = raw; + region.length = length; + raw += length; + dns_rdata_fromregion(&rdata, + rbtdb->common.rdclass, + dns_rdatatype_nsec3param, + &region); + result = dns_rdata_tostruct(&rdata, + &nsec3param, + NULL); + INSIST(result == ISC_R_SUCCESS); + dns_rdata_reset(&rdata); + + if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG && + !dns_nsec3_supportedhash(nsec3param.hash)) + continue; + +#ifdef RFC5155_STRICT + if (nsec3param.flags != 0) + continue; +#else + if ((nsec3param.flags & DNS_NSEC3FLAG_CREATE) + != 0) + *nsec3createflag = ISC_TRUE; + if ((nsec3param.flags & ~DNS_NSEC3FLAG_OPTOUT) + != 0) + continue; +#endif + + INSIST(nsec3param.salt_length <= + sizeof(version->salt)); + memcpy(version->salt, nsec3param.salt, + nsec3param.salt_length); + version->hash = nsec3param.hash; + version->salt_length = nsec3param.salt_length; + version->iterations = nsec3param.iterations; + version->flags = nsec3param.flags; + version->havensec3 = ISC_TRUE; + /* + * Look for a better algorithm than the + * unknown test algorithm. 
+ */ + if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) + goto unlock; + } + } + } + unlock: + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); } static void @@ -1384,10 +2105,12 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { rbtdb_version_t *version, *cleanup_version, *least_greater; isc_boolean_t rollback = ISC_FALSE; rbtdb_changedlist_t cleanup_list; + rdatasetheaderlist_t resigned_list; rbtdb_changed_t *changed, *next_changed; rbtdb_serial_t serial, least_serial; dns_rbtnode_t *rbtnode; unsigned int refs; + rdatasetheader_t *header; isc_boolean_t writer; REQUIRE(VALID_RBTDB(rbtdb)); @@ -1395,9 +2118,10 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { cleanup_version = NULL; ISC_LIST_INIT(cleanup_list); + ISC_LIST_INIT(resigned_list); isc_refcount_decrement(&version->references, &refs); - if (refs > 0) { /* typical and easy case first */ + if (refs > 0) { /* typical and easy case first */ if (commit) { RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); INSIST(!version->writer); @@ -1484,12 +2208,16 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { INSIST(cur_ref == 1); PREPEND(rbtdb->open_versions, rbtdb->current_version, link); + resigned_list = version->resigned_list; + ISC_LIST_INIT(version->resigned_list); } else { /* * We're rolling back this transaction. */ cleanup_list = version->changed_list; ISC_LIST_INIT(version->changed_list); + resigned_list = version->resigned_list; + ISC_LIST_INIT(version->resigned_list); rollback = ISC_TRUE; cleanup_version = version; rbtdb->future_version = NULL; @@ -1542,7 +2270,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { * Update the zone's secure status. 
*/ if (writer && commit && !IS_CACHE(rbtdb)) - rbtdb->secure = iszonesecure(db, rbtdb->origin_node); + iszonesecure(db, version, rbtdb->origin_node); if (cleanup_version != NULL) { INSIST(EMPTY(cleanup_version->changed_list)); @@ -1550,7 +2278,35 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { sizeof(*cleanup_version)); } + /* + * Commit/rollback re-signed headers. + */ + for (header = HEAD(resigned_list); + header != NULL; + header = HEAD(resigned_list)) { + ISC_LIST_UNLINK(resigned_list, header, lru_link); + if (rollback) { + nodelock_t *lock; + lock = &rbtdb->node_locks[header->node->locknum].lock; + NODE_LOCK(lock, isc_rwlocktype_write); + resign_insert(rbtdb, header->node->locknum, header); + NODE_UNLOCK(lock, isc_rwlocktype_write); + } + decrement_reference(rbtdb, header->node, least_serial, + isc_rwlocktype_write, isc_rwlocktype_none, + ISC_FALSE); + } + if (!EMPTY(cleanup_list)) { + /* + * We acquire a tree write lock here in order to make sure + * that stale nodes will be removed in decrement_reference(). + * If we didn't have the lock, those nodes could miss the + * chance to be removed until the server stops. The write lock + * is expensive, but this event should be rare enough to justify + * the cost. + */ + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); for (changed = HEAD(cleanup_list); changed != NULL; changed = next_changed) { @@ -1561,19 +2317,27 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { lock = &rbtdb->node_locks[rbtnode->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_write); + /* + * This is a good opportunity to purge any dead nodes, + * so use it. 
+ */ + cleanup_dead_nodes(rbtdb, rbtnode->locknum); + if (rollback) rollback_node(rbtnode, serial); decrement_reference(rbtdb, rbtnode, least_serial, isc_rwlocktype_write, - isc_rwlocktype_none); + isc_rwlocktype_write, ISC_FALSE); + NODE_UNLOCK(lock, isc_rwlocktype_write); isc_mem_put(rbtdb->common.mctx, changed, sizeof(*changed)); } + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } - end: + end: *versionp = NULL; } @@ -1606,6 +2370,7 @@ add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) { result = dns_rbt_addnode(rbtdb->tree, &foundname, &node); if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) return (result); + node->nsec3 = 0; node->find_callback = 1; node->wild = 1; return (ISC_R_SUCCESS); @@ -1623,7 +2388,7 @@ add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) { l = dns_name_countlabels(&rbtdb->common.origin); i = l + 1; while (i < n) { - dns_rbtnode_t *node = NULL; /* dummy */ + dns_rbtnode_t *node = NULL; /* dummy */ dns_name_getlabelsequence(name, n - i, i, &foundname); if (dns_name_iswildcard(&foundname)) { result = add_wildcard_magic(rbtdb, &foundname); @@ -1633,6 +2398,7 @@ add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) { &node); if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) return (result); + node->nsec3 = 0; } i++; } @@ -1678,6 +2444,7 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, node->locknum = dns_name_hash(&nodename, ISC_TRUE) % rbtdb->node_lock_count; #endif + node->nsec3 = 0; add_empty_wildcards(rbtdb, name); if (dns_name_iswildcard(name)) { @@ -1692,6 +2459,60 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, return (result); } } + reactivate_node(rbtdb, node, locktype); + RWUNLOCK(&rbtdb->tree_lock, locktype); + + *nodep = (dns_dbnode_t *)node; + + return (ISC_R_SUCCESS); +} + +static isc_result_t +findnsec3node(dns_db_t *db, dns_name_t *name, isc_boolean_t create, + dns_dbnode_t **nodep) +{ + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + dns_rbtnode_t *node = NULL; + 
dns_name_t nodename; + isc_result_t result; + isc_rwlocktype_t locktype = isc_rwlocktype_read; + + REQUIRE(VALID_RBTDB(rbtdb)); + + dns_name_init(&nodename, NULL); + RWLOCK(&rbtdb->tree_lock, locktype); + result = dns_rbt_findnode(rbtdb->nsec3, name, NULL, &node, NULL, + DNS_RBTFIND_EMPTYDATA, NULL, NULL); + if (result != ISC_R_SUCCESS) { + RWUNLOCK(&rbtdb->tree_lock, locktype); + if (!create) { + if (result == DNS_R_PARTIALMATCH) + result = ISC_R_NOTFOUND; + return (result); + } + /* + * It would be nice to try to upgrade the lock instead of + * unlocking then relocking. + */ + locktype = isc_rwlocktype_write; + RWLOCK(&rbtdb->tree_lock, locktype); + node = NULL; + result = dns_rbt_addnode(rbtdb->nsec3, name, &node); + if (result == ISC_R_SUCCESS) { + dns_rbt_namefromnode(node, &nodename); +#ifdef DNS_RBT_USEHASH + node->locknum = node->hashval % rbtdb->node_lock_count; +#else + node->locknum = dns_name_hash(&nodename, ISC_TRUE) % + rbtdb->node_lock_count; +#endif + node->nsec3 = 1U; + } else if (result != ISC_R_EXISTS) { + RWUNLOCK(&rbtdb->tree_lock, locktype); + return (result); + } + } else + INSIST(node->nsec3); NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); @@ -1846,7 +2667,7 @@ bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdatasetheader_t *header, isc_stdtime_t now, dns_rdataset_t *rdataset) { - unsigned char *raw; /* RDATASLAB */ + unsigned char *raw; /* RDATASLAB */ /* * Caller must be holding the node reader lock. @@ -1861,16 +2682,18 @@ bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, new_reference(rbtdb, node); - INSIST(rdataset->methods == NULL); /* We must be disassociated. */ + INSIST(rdataset->methods == NULL); /* We must be disassociated. 
*/ rdataset->methods = &rdataset_methods; rdataset->rdclass = rbtdb->common.rdclass; rdataset->type = RBTDB_RDATATYPE_BASE(header->type); rdataset->covers = RBTDB_RDATATYPE_EXT(header->type); - rdataset->ttl = header->ttl - now; + rdataset->ttl = header->rdh_ttl - now; rdataset->trust = header->trust; if (NXDOMAIN(header)) rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN; + if (OPTOUT(header)) + rdataset->attributes |= DNS_RDATASETATTR_OPTOUT; rdataset->private1 = rbtdb; rdataset->private2 = node; raw = (unsigned char *)header + sizeof(*header); @@ -1891,6 +2714,18 @@ bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdataset->private6 = header->noqname; if (rdataset->private6 != NULL) rdataset->attributes |= DNS_RDATASETATTR_NOQNAME; + rdataset->private7 = header->closest; + if (rdataset->private7 != NULL) + rdataset->attributes |= DNS_RDATASETATTR_CLOSEST; + + /* + * Copy out re-signing information. + */ + if (RESIGN(header)) { + rdataset->attributes |= DNS_RDATASETATTR_RESIGN; + rdataset->resign = header->resign; + } else + rdataset->resign = 0; } static inline isc_result_t @@ -1954,7 +2789,7 @@ static inline isc_boolean_t valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type, dns_rbtnode_t *node) { - unsigned char *raw; /* RDATASLAB */ + unsigned char *raw; /* RDATASLAB */ unsigned int count, size; dns_name_t ns_name; isc_boolean_t valid = ISC_FALSE; @@ -2338,10 +3173,55 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, return (result); } +static isc_boolean_t +matchparams(rdatasetheader_t *header, rbtdb_search_t *search) +{ + dns_rdata_t rdata = DNS_RDATA_INIT; + dns_rdata_nsec3_t nsec3; + unsigned char *raw; /* RDATASLAB */ + unsigned int rdlen, count; + isc_region_t region; + isc_result_t result; + + REQUIRE(header->type == dns_rdatatype_nsec3); + + raw = (unsigned char *)header + sizeof(*header); + count = raw[0] * 256 + raw[1]; /* count */ +#if DNS_RDATASET_FIXED + raw += count * 4 + 2; +#else + raw += 2; +#endif + 
while (count-- > 0) { + rdlen = raw[0] * 256 + raw[1]; +#if DNS_RDATASET_FIXED + raw += 4; +#else + raw += 2; +#endif + region.base = raw; + region.length = rdlen; + dns_rdata_fromregion(&rdata, search->rbtdb->common.rdclass, + dns_rdatatype_nsec3, &region); + raw += rdlen; + result = dns_rdata_tostruct(&rdata, &nsec3, NULL); + INSIST(result == ISC_R_SUCCESS); + if (nsec3.hash == search->rbtversion->hash && + nsec3.iterations == search->rbtversion->iterations && + nsec3.salt_length == search->rbtversion->salt_length && + memcmp(nsec3.salt, search->rbtversion->salt, + nsec3.salt_length) == 0) + return (ISC_TRUE); + dns_rdata_reset(&rdata); + } + return (ISC_FALSE); +} + static inline isc_result_t find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, dns_name_t *foundname, dns_rdataset_t *rdataset, - dns_rdataset_t *sigrdataset, isc_boolean_t need_sig) + dns_rdataset_t *sigrdataset, dns_rbt_t *tree, + dns_db_secure_t secure) { dns_rbtnode_t *node; rdatasetheader_t *header, *header_next, *found, *foundsig; @@ -2349,7 +3229,22 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, isc_result_t result; dns_fixedname_t fname, forigin; dns_name_t *name, *origin; + dns_rdatatype_t type; + rbtdb_rdatatype_t sigtype; + isc_boolean_t wraps; + isc_boolean_t need_sig = ISC_TF(secure == dns_db_secure); + if (tree == search->rbtdb->nsec3) { + type = dns_rdatatype_nsec3; + sigtype = RBTDB_RDATATYPE_SIGNSEC3; + wraps = ISC_TRUE; + } else { + type = dns_rdatatype_nsec; + sigtype = RBTDB_RDATATYPE_SIGNSEC; + wraps = ISC_FALSE; + } + + again: do { node = NULL; dns_fixedname_init(&fname); @@ -2391,12 +3286,11 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, * active rdataset at this node. 
*/ empty_node = ISC_FALSE; - if (header->type == dns_rdatatype_nsec) { + if (header->type == type) { found = header; if (foundsig != NULL) break; - } else if (header->type == - RBTDB_RDATATYPE_SIGNSEC) { + } else if (header->type == sigtype) { foundsig = header; if (found != NULL) break; @@ -2404,11 +3298,19 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, } } if (!empty_node) { - if (found != NULL && - (foundsig != NULL || !need_sig)) + if (found != NULL && search->rbtversion->havensec3 && + found->type == dns_rdatatype_nsec3 && + !matchparams(found, search)) { + empty_node = ISC_TRUE; + found = NULL; + foundsig = NULL; + result = dns_rbtnodechain_prev(&search->chain, + NULL, NULL); + } else if (found != NULL && + (foundsig != NULL || !need_sig)) { /* - * We've found the right NSEC record. + * We've found the right NSEC/NSEC3 record. * * Note: for this to really be the right * NSEC record, it's essential that the NSEC @@ -2465,6 +3367,15 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, isc_rwlocktype_read); } while (empty_node && result == ISC_R_SUCCESS); + if (result == ISC_R_NOMORE && wraps) { + result = dns_rbtnodechain_last(&search->chain, tree, + NULL, NULL); + if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) { + wraps = ISC_FALSE; + goto again; + } + } + /* * If the result is ISC_R_NOMORE, then we got to the beginning of * the database and didn't find a NSEC record. This shouldn't @@ -2497,7 +3408,7 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, isc_boolean_t active; dns_rbtnodechain_t chain; nodelock_t *lock; - + dns_rbt_t *tree; search.rbtdb = (dns_rbtdb_t *)db; @@ -2540,7 +3451,9 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * encounter a callback node, zone_zonecut_callback() will search the * rdatasets at the zone cut for active DNAME or NS rdatasets. 
*/ - result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node, + tree = (options & DNS_DBFIND_FORCENSEC3) != 0 ? search.rbtdb->nsec3 : + search.rbtdb->tree; + result = dns_rbt_findnode(tree, name, foundname, &node, &search.chain, DNS_RBTFIND_EMPTYDATA, zone_zonecut_callback, &search); @@ -2578,12 +3491,14 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * If we're here, then the name does not exist, is not * beneath a zonecut, and there's no matching wildcard. */ - if (search.rbtdb->secure || - (search.options & DNS_DBFIND_FORCENSEC) != 0) + if ((search.rbtversion->secure == dns_db_secure && + !search.rbtversion->havensec3) || + (search.options & DNS_DBFIND_FORCENSEC) != 0 || + (search.options & DNS_DBFIND_FORCENSEC3) != 0) { result = find_closest_nsec(&search, nodep, foundname, - rdataset, sigrdataset, - search.rbtdb->secure); + rdataset, sigrdataset, tree, + search.rbtversion->secure); if (result == ISC_R_SUCCESS) result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN; @@ -2704,6 +3619,14 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, break; } + + /* + * If the NSEC3 record doesn't match the chain + * we are using behave as if it isn't here. + */ + if (header->type == dns_rdatatype_nsec3 && + !matchparams(header, &search)) + goto partial_match; /* * If we found a type we were looking for, * remember it. @@ -2748,14 +3671,16 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, */ if (!maybe_zonecut && found != NULL) break; - } else if (header->type == dns_rdatatype_nsec) { + } else if (header->type == dns_rdatatype_nsec && + !search.rbtversion->havensec3) { /* * Remember a NSEC rdataset even if we're * not specifically looking for it, because * we might need it later. 
*/ nsecheader = header; - } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) { + } else if (header->type == RBTDB_RDATATYPE_SIGNSEC && + !search.rbtversion->havensec3) { /* * If we need the NSEC rdataset, we'll also * need its signature. @@ -2807,7 +3732,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * The desired type doesn't exist. */ result = DNS_R_NXRRSET; - if (search.rbtdb->secure && + if (search.rbtversion->secure == dns_db_secure && + !search.rbtversion->havensec3 && (nsecheader == NULL || nsecsig == NULL)) { /* * The zone is secure but there's no NSEC, @@ -2822,7 +3748,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_UNLOCK(lock, isc_rwlocktype_read); result = find_closest_nsec(&search, nodep, foundname, rdataset, sigrdataset, - search.rbtdb->secure); + search.rbtdb->tree, + search.rbtversion->secure); if (result == ISC_R_SUCCESS) result = DNS_R_EMPTYWILD; goto tree_exit; @@ -2841,7 +3768,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, new_reference(search.rbtdb, node); *nodep = node; } - if (search.rbtdb->secure || + if ((search.rbtversion->secure == dns_db_secure && + !search.rbtversion->havensec3) || (search.options & DNS_DBFIND_FORCENSEC) != 0) { bind_rdataset(search.rbtdb, node, nsecheader, @@ -2882,6 +3810,7 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * validated updates. 
*/ if (type == dns_rdatatype_nsec || + type == dns_rdatatype_nsec3 || type == dns_rdatatype_key) result = ISC_R_SUCCESS; else if (type == dns_rdatatype_any) @@ -2948,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -3010,7 +3940,7 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { header_prev = NULL; for (header = node->data; header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= search->now) { + if (header->rdh_ttl <= search->now) { /* * This rdataset is stale. If no one else is * using the node, we can clean it up right @@ -3018,7 +3948,7 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { * the node as dirty, so it will get cleaned * up later. */ - if ((header->ttl <= search->now - RBTDB_VIRTUAL) && + if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* @@ -3044,13 +3974,16 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { * stale headers first. */ mctx = search->rbtdb->common.mctx; - clean_stale_headers(mctx, header); + clean_stale_headers(search->rbtdb, + mctx, + header); if (header_prev != NULL) header_prev->next = header->next; else node->data = header->next; - free_rdataset(mctx, header); + free_rdataset(search->rbtdb, mctx, + header); } else { header->attributes |= RDATASET_ATTR_STALE; @@ -3079,6 +4012,7 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { * search->zonecut_rdataset will still be valid later. 
*/ new_reference(search->rbtdb, node); + INSIST(!ISC_LINK_LINKED(node, deadlink)); search->zonecut = node; search->zonecut_rdataset = dname_header; search->zonecut_sigrdataset = sigdname_header; @@ -3130,7 +4064,7 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= search->now) { + if (header->rdh_ttl <= search->now) { /* * This rdataset is stale. If no one else is * using the node, we can clean it up right @@ -3138,7 +4072,7 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, * the node as dirty, so it will get cleaned * up later. */ - if ((header->ttl <= search->now - + if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { @@ -3153,14 +4087,17 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, isc_mem_t *m; m = search->rbtdb->common.mctx; - clean_stale_headers(m, header); + clean_stale_headers( + search->rbtdb, + m, header); if (header_prev != NULL) header_prev->next = header->next; else node->data = header->next; - free_rdataset(m, header); + free_rdataset(rbtdb, m, + header); } else { header->attributes |= RDATASET_ATTR_STALE; @@ -3229,6 +4166,23 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, if (foundsig != NULL) bind_rdataset(search->rbtdb, node, foundsig, search->now, sigrdataset); + if (need_headerupdate(found, search->now) || + (foundsig != NULL && + need_headerupdate(foundsig, search->now))) { + if (locktype != isc_rwlocktype_write) { + NODE_UNLOCK(lock, locktype); + NODE_LOCK(lock, isc_rwlocktype_write); + locktype = isc_rwlocktype_write; + } + if (need_headerupdate(found, search->now)) + update_header(search->rbtdb, found, + search->now); + if (foundsig != NULL && + need_headerupdate(foundsig, search->now)) { + update_header(search->rbtdb, foundsig, + search->now); + } + } } node_exit: @@ -3286,7 +4240,7 @@ 
find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= now) { + if (header->rdh_ttl <= now) { /* * This rdataset is stale. If no one else is * using the node, we can clean it up right @@ -3294,7 +4248,7 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, * node as dirty, so it will get cleaned up * later. */ - if ((header->ttl <= now - RBTDB_VIRTUAL) && + if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* @@ -3308,13 +4262,16 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, isc_mem_t *m; m = search->rbtdb->common.mctx; - clean_stale_headers(m, header); + clean_stale_headers( + search->rbtdb, + m, header); if (header_prev != NULL) header_prev->next = header->next; else node->data = header->next; - free_rdataset(m, header); + free_rdataset(search->rbtdb, m, + header); } else { header->attributes |= RDATASET_ATTR_STALE; @@ -3377,6 +4334,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *found, *nsheader; rdatasetheader_t *foundsig, *nssig, *cnamesig; + rdatasetheader_t *update, *updatesig; rbtdb_rdatatype_t sigtype, negtype; UNUSED(version); @@ -3399,6 +4357,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, dns_fixedname_init(&search.zonecut_name); dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx); search.now = now; + update = NULL; + updatesig = NULL; RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -3462,14 +4422,14 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, header_prev = NULL; for (header = node->data; header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= now) { + if (header->rdh_ttl <= now) { /* * This rdataset is stale. 
If no one else is using the * node, we can clean it up right now, otherwise we * mark it as stale, and the node as dirty, so it will * get cleaned up later. */ - if ((header->ttl <= now - RBTDB_VIRTUAL) && + if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* @@ -3482,13 +4442,15 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, isc_mem_t *mctx; mctx = search.rbtdb->common.mctx; - clean_stale_headers(mctx, header); + clean_stale_headers(search.rbtdb, mctx, + header); if (header_prev != NULL) header_prev->next = header->next; else node->data = header->next; - free_rdataset(mctx, header); + free_rdataset(search.rbtdb, mctx, + header); } else { header->attributes |= RDATASET_ATTR_STALE; @@ -3595,13 +4557,19 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, if (nsheader != NULL) { if (nodep != NULL) { new_reference(search.rbtdb, node); + INSIST(!ISC_LINK_LINKED(node, deadlink)); *nodep = node; } bind_rdataset(search.rbtdb, node, nsheader, search.now, rdataset); - if (nssig != NULL) + if (need_headerupdate(nsheader, search.now)) + update = nsheader; + if (nssig != NULL) { bind_rdataset(search.rbtdb, node, nssig, search.now, sigrdataset); + if (need_headerupdate(nssig, search.now)) + updatesig = nssig; + } result = DNS_R_DELEGATION; goto node_exit; } @@ -3619,6 +4587,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, if (nodep != NULL) { new_reference(search.rbtdb, node); + INSIST(!ISC_LINK_LINKED(node, deadlink)); *nodep = node; } @@ -3650,12 +4619,28 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, result == DNS_R_NCACHENXRRSET) { bind_rdataset(search.rbtdb, node, found, search.now, rdataset); - if (foundsig != NULL) + if (need_headerupdate(found, search.now)) + update = found; + if (foundsig != NULL) { bind_rdataset(search.rbtdb, node, foundsig, search.now, sigrdataset); + if 
(need_headerupdate(foundsig, search.now)) + updatesig = foundsig; + } } node_exit: + if ((update != NULL || updatesig != NULL) && + locktype != isc_rwlocktype_write) { + NODE_UNLOCK(lock, locktype); + NODE_LOCK(lock, isc_rwlocktype_write); + locktype = isc_rwlocktype_write; + } + if (update != NULL && need_headerupdate(update, search.now)) + update_header(search.rbtdb, update, search.now); + if (updatesig != NULL && need_headerupdate(updatesig, search.now)) + update_header(search.rbtdb, updatesig, search.now); + NODE_UNLOCK(lock, locktype); tree_exit: @@ -3671,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -3745,14 +4731,14 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, header_prev = NULL; for (header = node->data; header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= now) { + if (header->rdh_ttl <= now) { /* * This rdataset is stale. If no one else is using the * node, we can clean it up right now, otherwise we * mark it as stale, and the node as dirty, so it will * get cleaned up later. 
*/ - if ((header->ttl <= now - RBTDB_VIRTUAL) && + if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* @@ -3765,13 +4751,15 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, isc_mem_t *mctx; mctx = search.rbtdb->common.mctx; - clean_stale_headers(mctx, header); + clean_stale_headers(search.rbtdb, mctx, + header); if (header_prev != NULL) header_prev->next = header->next; else node->data = header->next; - free_rdataset(mctx, header); + free_rdataset(search.rbtdb, mctx, + header); } else { header->attributes |= RDATASET_ATTR_STALE; @@ -3814,6 +4802,7 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, if (nodep != NULL) { new_reference(search.rbtdb, node); + INSIST(!ISC_LINK_LINKED(node, deadlink)); *nodep = node; } @@ -3822,6 +4811,21 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, bind_rdataset(search.rbtdb, node, foundsig, search.now, sigrdataset); + if (need_headerupdate(found, search.now) || + (foundsig != NULL && need_headerupdate(foundsig, search.now))) { + if (locktype != isc_rwlocktype_write) { + NODE_UNLOCK(lock, locktype); + NODE_LOCK(lock, isc_rwlocktype_write); + locktype = isc_rwlocktype_write; + } + if (need_headerupdate(found, search.now)) + update_header(search.rbtdb, found, search.now); + if (foundsig != NULL && + need_headerupdate(foundsig, search.now)) { + update_header(search.rbtdb, foundsig, search.now); + } + } + NODE_UNLOCK(lock, locktype); tree_exit: @@ -3871,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { NODE_LOCK(&nodelock->lock, isc_rwlocktype_read); if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - isc_rwlocktype_none)) { + isc_rwlocktype_none, ISC_FALSE)) { if (isc_refcount_current(&nodelock->references) == 0 && nodelock->exiting) { inactive = ISC_TRUE; @@ -3938,8 +4942,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { /* * Note that 
'log' can be true IFF rbtdb->overmem is also true. - * rbtdb->ovemem can currently only be true for cache databases - * -- hence all of the "overmem cache" log strings. + * rbtdb->overmem can currently only be true for cache + * databases -- hence all of the "overmem cache" log strings. */ log = ISC_TF(isc_log_wouldlog(dns_lctx, level)); if (log) @@ -3959,7 +4963,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { isc_rwlocktype_write); for (header = rbtnode->data; header != NULL; header = header->next) - if (header->ttl <= now - RBTDB_VIRTUAL) { + if (header->rdh_ttl <= now - RBTDB_VIRTUAL) { /* * We don't check if refcurrent(rbtnode) == 0 and try * to free like we do in cache_find(), because @@ -3974,7 +4978,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { printname); } else if (force_expire) { if (! RETAIN(header)) { - header->ttl = 0; + set_ttl(rbtdb, header, 0); header->attributes |= RDATASET_ATTR_STALE; rbtnode->dirty = 1; } else if (log) { @@ -3997,9 +5001,8 @@ static void overmem(dns_db_t *db, isc_boolean_t overmem) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; - if (IS_CACHE(rbtdb)) { + if (IS_CACHE(rbtdb)) rbtdb->overmem = overmem; - } } static void @@ -4030,11 +5033,13 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { first = ISC_FALSE; fprintf(out, "\tserial = %lu, ttl = %u, " - "trust = %u, attributes = %u\n", + "trust = %u, attributes = %u, " + "resign = %u\n", (unsigned long)current->serial, - current->ttl, + current->rdh_ttl, current->trust, - current->attributes); + current->attributes, + current->resign); current = current->down; } while (current != NULL); } @@ -4046,8 +5051,7 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { } static isc_result_t -createiterator(dns_db_t *db, isc_boolean_t relative_names, - dns_dbiterator_t **iteratorp) +createiterator(dns_db_t *db, unsigned int options, dns_dbiterator_t **iteratorp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_dbiterator_t *rbtdbiter; 
@@ -4061,7 +5065,8 @@ createiterator(dns_db_t *db, isc_boolean_t relative_names, rbtdbiter->common.methods = &dbiterator_methods; rbtdbiter->common.db = NULL; dns_db_attach(db, &rbtdbiter->common.db); - rbtdbiter->common.relative_names = relative_names; + rbtdbiter->common.relative_names = + ISC_TF((options & DNS_DB_RELATIVENAMES) != 0); rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC; rbtdbiter->common.cleaning = ISC_FALSE; rbtdbiter->paused = ISC_TRUE; @@ -4071,8 +5076,15 @@ createiterator(dns_db_t *db, isc_boolean_t relative_names, dns_fixedname_init(&rbtdbiter->origin); rbtdbiter->node = NULL; rbtdbiter->delete = 0; + rbtdbiter->nsec3only = ISC_TF((options & DNS_DB_NSEC3ONLY) != 0); + rbtdbiter->nonsec3 = ISC_TF((options & DNS_DB_NONSEC3) != 0); memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions)); dns_rbtnodechain_init(&rbtdbiter->chain, db->mctx); + dns_rbtnodechain_init(&rbtdbiter->nsec3chain, db->mctx); + if (rbtdbiter->nsec3only) + rbtdbiter->current = &rbtdbiter->nsec3chain; + else + rbtdbiter->current = &rbtdbiter->chain; *iteratorp = (dns_dbiterator_t *)rbtdbiter; @@ -4204,8 +5216,8 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, for (header = rbtnode->data; header != NULL; header = header_next) { header_next = header->next; - if (header->ttl <= now) { - if ((header->ttl <= now - RBTDB_VIRTUAL) && + if (header->rdh_ttl <= now) { + if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* @@ -4355,19 +5367,15 @@ cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) { * Look for active extant "other data". * * "Other data" is any rdataset whose type is not - * KEY, RRSIG KEY, NSEC, RRSIG NSEC or RRSIG CNAME. + * KEY, NSEC, SIG or RRSIG. 
*/ rdtype = RBTDB_RDATATYPE_BASE(header->type); - if (rdtype == dns_rdatatype_rrsig || - rdtype == dns_rdatatype_sig) - rdtype = RBTDB_RDATATYPE_EXT(header->type); - if (rdtype != dns_rdatatype_nsec && - rdtype != dns_rdatatype_key && - rdtype != dns_rdatatype_cname) { + if (rdtype != dns_rdatatype_key && + rdtype != dns_rdatatype_sig && + rdtype != dns_rdatatype_nsec && + rdtype != dns_rdatatype_rrsig) { /* - * We've found a type that isn't - * NSEC, KEY, CNAME, or one of their - * signatures. Is it active and extant? + * Is it active and extant? */ do { if (header->serial <= serial && @@ -4395,6 +5403,16 @@ cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) { } static isc_result_t +resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader) { + isc_result_t result; + + INSIST(newheader->heap_index == 0); + INSIST(!ISC_LINK_LINKED(newheader, lru_link)); + result = isc_heap_insert(rbtdb->heaps[idx], newheader); + return (result); +} + +static isc_result_t add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, rdatasetheader_t *newheader, unsigned int options, isc_boolean_t loading, dns_rdataset_t *addedrdataset, isc_stdtime_t now) @@ -4409,6 +5427,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, dns_rdatatype_t rdtype, covers; rbtdb_rdatatype_t negtype; dns_trust_t trust; + int idx; /* * Add an rdatasetheader_t to a node. 
@@ -4437,7 +5456,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, */ changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (ISC_R_NOMEMORY); } } @@ -4466,7 +5485,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, for (topheader = rbtnode->data; topheader != NULL; topheader = topheader->next) { - topheader->ttl = 0; + set_ttl(rbtdb, topheader, 0); topheader->attributes |= RDATASET_ATTR_STALE; } @@ -4489,7 +5508,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, break; } if (topheader != NULL && EXISTS(topheader) && - topheader->ttl > now) { + topheader->rdh_ttl > now) { /* * Found one. */ @@ -4498,8 +5517,8 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * The NXDOMAIN/NODATA(QTYPE=ANY) * is more trusted. */ - - free_rdataset(rbtdb->common.mctx, + free_rdataset(rbtdb, + rbtdb->common.mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, @@ -4511,7 +5530,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * The new rdataset is better. Expire the * NXDOMAIN/NODATA(QTYPE=ANY). */ - topheader->ttl = 0; + set_ttl(rbtdb, topheader, 0); topheader->attributes |= RDATASET_ATTR_STALE; rbtnode->dirty = 1; topheader = NULL; @@ -4546,7 +5565,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * Deleting an already non-existent rdataset has no effect. */ if (header_nx && newheader_nx) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (DNS_R_UNCHANGED); } @@ -4555,8 +5574,8 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * has no effect, provided that the cache data isn't stale. 
*/ if (rbtversion == NULL && trust < header->trust && - (header->ttl > now || header_nx)) { - free_rdataset(rbtdb->common.mctx, newheader); + (header->rdh_ttl > now || header_nx)) { + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); @@ -4582,9 +5601,9 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, if ((options & DNS_DBADD_EXACT) != 0) flags |= DNS_RDATASLAB_EXACT; if ((options & DNS_DBADD_EXACTTTL) != 0 && - newheader->ttl != header->ttl) + newheader->rdh_ttl != header->rdh_ttl) result = DNS_R_NOTEXACT; - else if (newheader->ttl != header->ttl) + else if (newheader->rdh_ttl != header->rdh_ttl) flags |= DNS_RDATASLAB_FORCE; if (result == ISC_R_SUCCESS) result = dns_rdataslab_merge( @@ -4604,10 +5623,16 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * alone. It will get cleaned up when * clean_zone_node() runs. */ - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, + newheader); newheader = (rdatasetheader_t *)merged; + if (loading && RESIGN(newheader) && + RESIGN(header) && + header->resign < newheader->resign) + newheader->resign = header->resign; } else { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, + newheader); return (result); } } @@ -4618,7 +5643,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * Don't lower trust of existing record if the * update is forced. */ - if (IS_CACHE(rbtdb) && header->ttl > now && + if (IS_CACHE(rbtdb) && header->rdh_ttl > now && header->type == dns_rdatatype_ns && !header_nx && !newheader_nx && header->trust >= newheader->trust && @@ -4631,20 +5656,25 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * Honour the new ttl if it is less than the * older one. 
*/ - if (header->ttl > newheader->ttl) - header->ttl = newheader->ttl; + if (header->rdh_ttl > newheader->rdh_ttl) + set_ttl(rbtdb, header, newheader->rdh_ttl); if (header->noqname == NULL && newheader->noqname != NULL) { header->noqname = newheader->noqname; newheader->noqname = NULL; } - free_rdataset(rbtdb->common.mctx, newheader); + if (header->closest == NULL && + newheader->closest != NULL) { + header->closest = newheader->closest; + newheader->closest = NULL; + } + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); return (ISC_R_SUCCESS); } - if (IS_CACHE(rbtdb) && header->ttl > now && + if (IS_CACHE(rbtdb) && header->rdh_ttl > now && (header->type == dns_rdatatype_a || header->type == dns_rdatatype_aaaa) && !header_nx && !newheader_nx && @@ -4656,14 +5686,19 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * Honour the new ttl if it is less than the * older one. */ - if (header->ttl > newheader->ttl) - header->ttl = newheader->ttl; + if (header->rdh_ttl > newheader->rdh_ttl) + set_ttl(rbtdb, header, newheader->rdh_ttl); if (header->noqname == NULL && newheader->noqname != NULL) { header->noqname = newheader->noqname; newheader->noqname = NULL; } - free_rdataset(rbtdb->common.mctx, newheader); + if (header->closest == NULL && + newheader->closest != NULL) { + header->closest = newheader->closest; + newheader->closest = NULL; + } + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); @@ -4684,7 +5719,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * loading, we MUST clean up 'header' now. 
*/ newheader->down = NULL; - free_rdataset(rbtdb->common.mctx, header); + free_rdataset(rbtdb, rbtdb->common.mctx, header); } else { newheader->down = topheader; topheader->next = newheader; @@ -4692,9 +5727,23 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, if (changed != NULL) changed->dirty = ISC_TRUE; if (rbtversion == NULL) { - header->ttl = 0; + set_ttl(rbtdb, header, 0); header->attributes |= RDATASET_ATTR_STALE; } + idx = newheader->node->locknum; + if (IS_CACHE(rbtdb)) { + ISC_LIST_PREPEND(rbtdb->rdatasets[idx], + newheader, lru_link); + /* + * XXXMLG We don't check the return value + * here. If it fails, we will not do TTL + * based expiry on this node. However, we + * will do it on the LRU side, so memory + * will not leak... for long. + */ + isc_heap_insert(rbtdb->heaps[idx], newheader); + } else if (RESIGN(newheader)) + resign_insert(rbtdb, idx, newheader); } } else { /* @@ -4706,7 +5755,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * If we're trying to delete the type, don't bother. 
*/ if (newheader_nx) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (DNS_R_UNCHANGED); } @@ -4740,6 +5789,14 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, newheader->down = NULL; rbtnode->data = newheader; } + idx = newheader->node->locknum; + if (IS_CACHE(rbtdb)) { + ISC_LIST_PREPEND(rbtdb->rdatasets[idx], + newheader, lru_link); + isc_heap_insert(rbtdb->heaps[idx], newheader); + } else if (RESIGN(newheader)) { + resign_insert(rbtdb, idx, newheader); + } } /* @@ -4778,15 +5835,15 @@ addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader, struct noqname *noqname; isc_mem_t *mctx = rbtdb->common.mctx; dns_name_t name; - dns_rdataset_t nsec, nsecsig; + dns_rdataset_t neg, negsig; isc_result_t result; isc_region_t r; dns_name_init(&name, NULL); - dns_rdataset_init(&nsec); - dns_rdataset_init(&nsecsig); + dns_rdataset_init(&neg); + dns_rdataset_init(&negsig); - result = dns_rdataset_getnoqname(rdataset, &name, &nsec, &nsecsig); + result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig); RUNTIME_CHECK(result == ISC_R_SUCCESS); noqname = isc_mem_get(mctx, sizeof(*noqname)); @@ -4795,31 +5852,84 @@ addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader, goto cleanup; } dns_name_init(&noqname->name, NULL); - noqname->nsec = NULL; - noqname->nsecsig = NULL; + noqname->neg = NULL; + noqname->negsig = NULL; + noqname->type = neg.type; result = dns_name_dup(&name, mctx, &noqname->name); if (result != ISC_R_SUCCESS) goto cleanup; - result = dns_rdataslab_fromrdataset(&nsec, mctx, &r, 0); + result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0); if (result != ISC_R_SUCCESS) goto cleanup; - noqname->nsec = r.base; - result = dns_rdataslab_fromrdataset(&nsecsig, mctx, &r, 0); + noqname->neg = r.base; + result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0); if (result != ISC_R_SUCCESS) goto cleanup; - noqname->nsecsig = r.base; - dns_rdataset_disassociate(&nsec); - 
dns_rdataset_disassociate(&nsecsig); + noqname->negsig = r.base; + dns_rdataset_disassociate(&neg); + dns_rdataset_disassociate(&negsig); newheader->noqname = noqname; return (ISC_R_SUCCESS); cleanup: - dns_rdataset_disassociate(&nsec); - dns_rdataset_disassociate(&nsecsig); + dns_rdataset_disassociate(&neg); + dns_rdataset_disassociate(&negsig); free_noqname(mctx, &noqname); return(result); } +static inline isc_result_t +addclosest(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader, + dns_rdataset_t *rdataset) +{ + struct noqname *closest; + isc_mem_t *mctx = rbtdb->common.mctx; + dns_name_t name; + dns_rdataset_t neg, negsig; + isc_result_t result; + isc_region_t r; + + dns_name_init(&name, NULL); + dns_rdataset_init(&neg); + dns_rdataset_init(&negsig); + + result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + closest = isc_mem_get(mctx, sizeof(*closest)); + if (closest == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup; + } + dns_name_init(&closest->name, NULL); + closest->neg = NULL; + closest->negsig = NULL; + closest->type = neg.type; + result = dns_name_dup(&name, mctx, &closest->name); + if (result != ISC_R_SUCCESS) + goto cleanup; + result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0); + if (result != ISC_R_SUCCESS) + goto cleanup; + closest->neg = r.base; + result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0); + if (result != ISC_R_SUCCESS) + goto cleanup; + closest->negsig = r.base; + dns_rdataset_disassociate(&neg); + dns_rdataset_disassociate(&negsig); + newheader->closest = closest; + return (ISC_R_SUCCESS); + + cleanup: + dns_rdataset_disassociate(&neg); + dns_rdataset_disassociate(&negsig); + free_noqname(mctx, &closest); + return(result); +} + +static dns_dbmethods_t zone_methods; + static isc_result_t addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options, @@ -4830,11 +5940,21 @@ addrdataset(dns_db_t 
*db, dns_dbnode_t *node, dns_dbversion_t *version, rbtdb_version_t *rbtversion = version; isc_region_t region; rdatasetheader_t *newheader; + rdatasetheader_t *header; isc_result_t result; isc_boolean_t delegating; + isc_boolean_t tree_locked = ISC_FALSE; REQUIRE(VALID_RBTDB(rbtdb)); + if (rbtdb->common.methods == &zone_methods) + REQUIRE(((rbtnode->nsec3 && + (rdataset->type == dns_rdatatype_nsec3 || + rdataset->covers == dns_rdatatype_nsec3)) || + (!rbtnode->nsec3 && + rdataset->type != dns_rdatatype_nsec3 && + rdataset->covers != dns_rdatatype_nsec3))); + if (rbtversion == NULL) { if (now == 0) isc_stdtime_get(&now); @@ -4848,26 +5968,48 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, return (result); newheader = (rdatasetheader_t *)region.base; - newheader->ttl = rdataset->ttl + now; + init_rdataset(rbtdb, newheader); + set_ttl(rbtdb, newheader, rdataset->ttl + now); newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type, rdataset->covers); newheader->attributes = 0; newheader->noqname = NULL; + newheader->closest = NULL; newheader->count = init_count++; newheader->trust = rdataset->trust; newheader->additional_auth = NULL; newheader->additional_glue = NULL; + newheader->last_used = now; + newheader->node = rbtnode; if (rbtversion != NULL) { newheader->serial = rbtversion->serial; now = 0; + + if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) { + newheader->attributes |= RDATASET_ATTR_RESIGN; + newheader->resign = rdataset->resign; + } else + newheader->resign = 0; } else { newheader->serial = 1; + newheader->resign = 0; if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) newheader->attributes |= RDATASET_ATTR_NXDOMAIN; + if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) + newheader->attributes |= RDATASET_ATTR_OPTOUT; if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) { result = addnoqname(rbtdb, newheader, rdataset); if (result != ISC_R_SUCCESS) { - free_rdataset(rbtdb->common.mctx, newheader); + 
free_rdataset(rbtdb, rbtdb->common.mctx, + newheader); + return (result); + } + } + if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) { + result = addclosest(rbtdb, newheader, rdataset); + if (result != ISC_R_SUCCESS) { + free_rdataset(rbtdb, rbtdb->common.mctx, + newheader); return (result); } } @@ -4876,18 +6018,54 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, /* * If we're adding a delegation type (e.g. NS or DNAME for a zone, * just DNAME for the cache), then we need to set the callback bit - * on the node, and to do that we must be holding an exclusive lock - * on the tree. + * on the node. */ - if (delegating_type(rbtdb, rbtnode, rdataset->type)) { + if (delegating_type(rbtdb, rbtnode, rdataset->type)) delegating = ISC_TRUE; - RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); - } else + else delegating = ISC_FALSE; + /* + * If we're adding a delegation type or the DB is a cache in an overmem + * state, hold an exclusive lock on the tree. In the latter case + * the lock does not necessarily have to be acquired but it will help + * purge stale entries more effectively. + */ + if (delegating || (IS_CACHE(rbtdb) && rbtdb->overmem)) { + tree_locked = ISC_TRUE; + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + } + + if (IS_CACHE(rbtdb) && rbtdb->overmem) + overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); + if (rbtdb->rrsetstats != NULL) { + newheader->attributes |= RDATASET_ATTR_STATCOUNT; + update_rrsetstats(rbtdb, newheader, ISC_TRUE); + } + + if (IS_CACHE(rbtdb)) { + if (tree_locked) + cleanup_dead_nodes(rbtdb, rbtnode->locknum); + + header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1); + if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) + expire_header(rbtdb, header, tree_locked); + + /* + * If we've been holding a write lock on the tree just for + * cleaning, we can release it now. However, we still need the + * node lock. 
+ */ + if (tree_locked && !delegating) { + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + tree_locked = ISC_FALSE; + } + } + result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE, addedrdataset, now); if (result == ISC_R_SUCCESS && delegating) @@ -4896,15 +6074,15 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); - if (delegating) + if (tree_locked) RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); /* * Update the zone's secure status. If version is non-NULL - * this is defered until closeversion() is called. + * this is deferred until closeversion() is called. */ if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) - rbtdb->secure = iszonesecure(db, rbtdb->origin_node); + iszonesecure(db, version, rbtdb->origin_node); return (result); } @@ -4925,29 +6103,46 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, REQUIRE(VALID_RBTDB(rbtdb)); + if (rbtdb->common.methods == &zone_methods) + REQUIRE(((rbtnode->nsec3 && + (rdataset->type == dns_rdatatype_nsec3 || + rdataset->covers == dns_rdatatype_nsec3)) || + (!rbtnode->nsec3 && + rdataset->type != dns_rdatatype_nsec3 && + rdataset->covers != dns_rdatatype_nsec3))); + result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx, ®ion, sizeof(rdatasetheader_t)); if (result != ISC_R_SUCCESS) return (result); newheader = (rdatasetheader_t *)region.base; - newheader->ttl = rdataset->ttl; + init_rdataset(rbtdb, newheader); + set_ttl(rbtdb, newheader, rdataset->ttl); newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type, rdataset->covers); newheader->attributes = 0; newheader->serial = rbtversion->serial; newheader->trust = 0; newheader->noqname = NULL; + newheader->closest = NULL; newheader->count = init_count++; newheader->additional_auth = NULL; newheader->additional_glue = NULL; + newheader->last_used = 0; + newheader->node = rbtnode; + if 
((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) { + newheader->attributes |= RDATASET_ATTR_RESIGN; + newheader->resign = rdataset->resign; + } else + newheader->resign = 0; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); return (ISC_R_NOMEMORY); @@ -4975,7 +6170,7 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, result = ISC_R_SUCCESS; if ((options & DNS_DBSUB_EXACT) != 0) { flags |= DNS_RDATASLAB_EXACT; - if (newheader->ttl != header->ttl) + if (newheader->rdh_ttl != header->rdh_ttl) result = DNS_R_NOTEXACT; } if (result == ISC_R_SUCCESS) @@ -4988,8 +6183,9 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, (dns_rdatatype_t)header->type, flags, &subresult); if (result == ISC_R_SUCCESS) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); newheader = (rdatasetheader_t *)subresult; + init_rdataset(rbtdb, newheader); /* * We have to set the serial since the rdataslab * subtraction routine copies the reserved portion of @@ -5008,24 +6204,27 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * This subtraction would remove all of the rdata; * add a nonexistent header instead. 
*/ - free_rdataset(rbtdb->common.mctx, newheader); - newheader = isc_mem_get(rbtdb->common.mctx, - sizeof(*newheader)); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); + newheader = new_rdataset(rbtdb, rbtdb->common.mctx); if (newheader == NULL) { result = ISC_R_NOMEMORY; goto unlock; } - newheader->ttl = 0; + set_ttl(rbtdb, newheader, 0); newheader->type = topheader->type; newheader->attributes = RDATASET_ATTR_NONEXISTENT; newheader->trust = 0; newheader->serial = rbtversion->serial; newheader->noqname = NULL; + newheader->closest = NULL; newheader->count = 0; newheader->additional_auth = NULL; newheader->additional_glue = NULL; + newheader->node = rbtnode; + newheader->resign = 0; + newheader->last_used = 0; } else { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); goto unlock; } @@ -5048,7 +6247,7 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * The rdataset doesn't exist, so we don't need to do anything * to satisfy the deletion request. */ - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if ((options & DNS_DBSUB_EXACT) != 0) result = DNS_R_NOTEXACT; else @@ -5064,10 +6263,10 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, /* * Update the zone's secure status. If version is non-NULL - * this is defered until closeversion() is called. + * this is deferred until closeversion() is called. 
*/ if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) - rbtdb->secure = iszonesecure(db, rbtdb->origin_node); + iszonesecure(db, rbtdb->current_version, rbtdb->origin_node); return (result); } @@ -5089,14 +6288,15 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if (type == dns_rdatatype_rrsig && covers == 0) return (ISC_R_NOTIMPLEMENTED); - newheader = isc_mem_get(rbtdb->common.mctx, sizeof(*newheader)); + newheader = new_rdataset(rbtdb, rbtdb->common.mctx); if (newheader == NULL) return (ISC_R_NOMEMORY); - newheader->ttl = 0; + set_ttl(rbtdb, newheader, 0); newheader->type = RBTDB_RDATATYPE_VALUE(type, covers); newheader->attributes = RDATASET_ATTR_NONEXISTENT; newheader->trust = 0; newheader->noqname = NULL; + newheader->closest = NULL; newheader->additional_auth = NULL; newheader->additional_glue = NULL; if (rbtversion != NULL) @@ -5104,6 +6304,8 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, else newheader->serial = 0; newheader->count = 0; + newheader->last_used = 0; + newheader->node = rbtnode; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); @@ -5116,10 +6318,10 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, /* * Update the zone's secure status. If version is non-NULL - * this is defered until closeversion() is called. + * this is deferred until closeversion() is called. 
*/ if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) - rbtdb->secure = iszonesecure(db, rbtdb->origin_node); + iszonesecure(db, rbtdb->current_version, rbtdb->origin_node); return (result); } @@ -5147,7 +6349,9 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { !IS_CACHE(rbtdb) && !dns_name_equal(name, &rbtdb->common.origin)) return (DNS_R_NOTZONETOP); - add_empty_wildcards(rbtdb, name); + if (rdataset->type != dns_rdatatype_nsec3 && + rdataset->covers != dns_rdatatype_nsec3) + add_empty_wildcards(rbtdb, name); if (dns_name_iswildcard(name)) { /* @@ -5155,13 +6359,27 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { */ if (rdataset->type == dns_rdatatype_ns) return (DNS_R_INVALIDNS); + /* + * NSEC3 record owners cannot legally be wild cards. + */ + if (rdataset->type == dns_rdatatype_nsec3) + return (DNS_R_INVALIDNSEC3); result = add_wildcard_magic(rbtdb, name); if (result != ISC_R_SUCCESS) return (result); } node = NULL; - result = dns_rbt_addnode(rbtdb->tree, name, &node); + if (rdataset->type == dns_rdatatype_nsec3 || + rdataset->covers == dns_rdatatype_nsec3) { + result = dns_rbt_addnode(rbtdb->nsec3, name, &node); + if (result == ISC_R_SUCCESS) + node->nsec3 = 1; + } else { + result = dns_rbt_addnode(rbtdb->tree, name, &node); + if (result == ISC_R_SUCCESS) + node->nsec3 = 0; + } if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) return (result); if (result != ISC_R_EXISTS) { @@ -5182,16 +6400,26 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { if (result != ISC_R_SUCCESS) return (result); newheader = (rdatasetheader_t *)region.base; - newheader->ttl = rdataset->ttl + loadctx->now; /* XXX overflow check */ + init_rdataset(rbtdb, newheader); + set_ttl(rbtdb, newheader, + rdataset->ttl + loadctx->now); /* XXX overflow check */ newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type, rdataset->covers); newheader->attributes = 0; newheader->trust = 
rdataset->trust; newheader->serial = 1; newheader->noqname = NULL; + newheader->closest = NULL; newheader->count = init_count++; newheader->additional_auth = NULL; newheader->additional_glue = NULL; + newheader->last_used = 0; + newheader->node = node; + if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) { + newheader->attributes |= RDATASET_ATTR_RESIGN; + newheader->resign = rdataset->resign; + } else + newheader->resign = 0; result = add(rbtdb, node, rbtdb->current_version, newheader, DNS_DBADD_MERGE, ISC_TRUE, NULL, 0); @@ -5262,7 +6490,7 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { * zone key, we consider the zone secure. */ if (! IS_CACHE(rbtdb)) - rbtdb->secure = iszonesecure(db, rbtdb->origin_node); + iszonesecure(db, rbtdb->current_version, rbtdb->origin_node); *dbloadp = NULL; @@ -5292,7 +6520,7 @@ delete_callback(void *data, void *arg) { for (current = data; current != NULL; current = next) { next = current->next; - free_rdataset(rbtdb->common.mctx, current); + free_rdataset(rbtdb, rbtdb->common.mctx, current); } } @@ -5306,12 +6534,28 @@ issecure(dns_db_t *db) { REQUIRE(VALID_RBTDB(rbtdb)); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); - secure = rbtdb->secure; + secure = ISC_TF(rbtdb->current_version->secure == dns_db_secure); RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); return (secure); } +static isc_boolean_t +isdnssec(dns_db_t *db) { + dns_rbtdb_t *rbtdb; + isc_boolean_t dnssec; + + rbtdb = (dns_rbtdb_t *)db; + + REQUIRE(VALID_RBTDB(rbtdb)); + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); + dnssec = ISC_TF(rbtdb->current_version->secure != dns_db_insecure); + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); + + return (dnssec); +} + static unsigned int nodecount(dns_db_t *db) { dns_rbtdb_t *rbtdb; @@ -5368,13 +6612,180 @@ getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) { *nodep = rbtdb->origin_node; } else { - INSIST(!IS_CACHE(rbtdb)); + INSIST(IS_CACHE(rbtdb)); result = ISC_R_NOTFOUND; } return (result); } +static 
isc_result_t +getnsec3parameters(dns_db_t *db, dns_dbversion_t *version, dns_hash_t *hash, + isc_uint8_t *flags, isc_uint16_t *iterations, + unsigned char *salt, size_t *salt_length) +{ + dns_rbtdb_t *rbtdb; + isc_result_t result = ISC_R_NOTFOUND; + rbtdb_version_t *rbtversion = version; + + rbtdb = (dns_rbtdb_t *)db; + + REQUIRE(VALID_RBTDB(rbtdb)); + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); + + if (rbtversion == NULL) + rbtversion = rbtdb->current_version; + + if (rbtversion->havensec3) { + if (hash != NULL) + *hash = rbtversion->hash; + if (salt != NULL && salt_length != 0) { + REQUIRE(*salt_length > rbtversion->salt_length); + memcpy(salt, rbtversion->salt, rbtversion->salt_length); + } + if (salt_length != NULL) + *salt_length = rbtversion->salt_length; + if (iterations != NULL) + *iterations = rbtversion->iterations; + if (flags != NULL) + *flags = rbtversion->flags; + result = ISC_R_SUCCESS; + } + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); + + return (result); +} + +static isc_result_t +setsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, isc_stdtime_t resign) { + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + isc_stdtime_t oldresign; + isc_result_t result = ISC_R_SUCCESS; + rdatasetheader_t *header; + + REQUIRE(VALID_RBTDB(rbtdb)); + REQUIRE(!IS_CACHE(rbtdb)); + REQUIRE(rdataset != NULL); + + header = rdataset->private3; + header--; + + NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock, + isc_rwlocktype_write); + + oldresign = header->resign; + header->resign = resign; + if (header->heap_index != 0) { + INSIST(RESIGN(header)); + if (resign == 0) { + isc_heap_delete(rbtdb->heaps[header->node->locknum], + header->heap_index); + header->heap_index = 0; + } else if (resign < oldresign) + isc_heap_increased(rbtdb->heaps[header->node->locknum], + header->heap_index); + else + isc_heap_decreased(rbtdb->heaps[header->node->locknum], + header->heap_index); + } else if (resign && header->heap_index == 0) { + header->attributes |= 
RDATASET_ATTR_RESIGN; + result = resign_insert(rbtdb, header->node->locknum, header); + } + NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock, + isc_rwlocktype_write); + return (result); +} + +static isc_result_t +getsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, + dns_name_t *foundname) +{ + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + rdatasetheader_t *header = NULL, *this; + unsigned int i; + isc_result_t result = ISC_R_NOTFOUND; + + REQUIRE(VALID_RBTDB(rbtdb)); + + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); + + for (i = 0; i < rbtdb->node_lock_count; i++) { + this = isc_heap_element(rbtdb->heaps[i], 1); + if (this == NULL) + continue; + if (header == NULL) + header = this; + else if (isc_serial_lt(this->resign, header->resign)) + header = this; + } + + if (header == NULL) + goto unlock; + + NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock, + isc_rwlocktype_read); + + bind_rdataset(rbtdb, header->node, header, 0, rdataset); + + if (foundname != NULL) + dns_rbt_fullnamefromnode(header->node, foundname); + + NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock, + isc_rwlocktype_read); + + result = ISC_R_SUCCESS; + + unlock: + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); + + return (result); +} + +static void +resigned(dns_db_t *db, dns_rdataset_t *rdataset, dns_dbversion_t *version) +{ + rbtdb_version_t *rbtversion = (rbtdb_version_t *)version; + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + dns_rbtnode_t *node; + rdatasetheader_t *header; + + REQUIRE(VALID_RBTDB(rbtdb)); + REQUIRE(rdataset != NULL); + REQUIRE(rbtdb->future_version == rbtversion); + REQUIRE(rbtversion->writer); + + node = rdataset->private2; + header = rdataset->private3; + header--; + + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); + NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_write); + /* + * Delete from heap and save to re-signed list so that it can + * be restored if we backout of this change. 
+ */ + new_reference(rbtdb, node); + isc_heap_delete(rbtdb->heaps[node->locknum], header->heap_index); + header->heap_index = 0; + ISC_LIST_APPEND(rbtversion->resigned_list, header, lru_link); + + NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock, + isc_rwlocktype_write); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); +} + +static dns_stats_t * +getrrsetstats(dns_db_t *db) { + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + + REQUIRE(VALID_RBTDB(rbtdb)); + REQUIRE(IS_CACHE(rbtdb)); /* current restriction */ + + return (rbtdb->rrsetstats); +} + static dns_dbmethods_t zone_methods = { attach, detach, @@ -5403,7 +6814,15 @@ static dns_dbmethods_t zone_methods = { ispersistent, overmem, settask, - getoriginnode + getoriginnode, + NULL, + getnsec3parameters, + findnsec3node, + setsigningtime, + getsigningtime, + resigned, + isdnssec, + NULL }; static dns_dbmethods_t cache_methods = { @@ -5434,7 +6853,15 @@ static dns_dbmethods_t cache_methods = { ispersistent, overmem, settask, - getoriginnode + getoriginnode, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + isdnssec, + getrrsetstats }; isc_result_t @@ -5451,6 +6878,7 @@ dns_rbtdb_create isc_result_t result; int i; dns_name_t name; + isc_boolean_t (*sooner)(void *, void *); /* Keep the compiler happy. */ UNUSED(argc); @@ -5483,11 +6911,20 @@ dns_rbtdb_create if (result != ISC_R_SUCCESS) goto cleanup_lock; + /* + * Initialize node_lock_count in a generic way to support future + * extension which allows the user to specify this value on creation. + * Note that when specified for a cache DB it must be larger than 1 + * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT. 
+ */ if (rbtdb->node_lock_count == 0) { if (IS_CACHE(rbtdb)) rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT; else rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT; + } else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) { + result = ISC_R_RANGE; + goto cleanup_tree_lock; } INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH)); rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count * @@ -5497,6 +6934,53 @@ dns_rbtdb_create goto cleanup_tree_lock; } + rbtdb->rrsetstats = NULL; + if (IS_CACHE(rbtdb)) { + result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats); + if (result != ISC_R_SUCCESS) + goto cleanup_node_locks; + rbtdb->rdatasets = isc_mem_get(mctx, rbtdb->node_lock_count * + sizeof(rdatasetheaderlist_t)); + if (rbtdb->rdatasets == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_rrsetstats; + } + for (i = 0; i < (int)rbtdb->node_lock_count; i++) + ISC_LIST_INIT(rbtdb->rdatasets[i]); + } else + rbtdb->rdatasets = NULL; + + /* + * Create the heaps. + */ + rbtdb->heaps = isc_mem_get(mctx, rbtdb->node_lock_count * + sizeof(isc_heap_t *)); + if (rbtdb->heaps == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_rdatasets; + } + for (i = 0; i < (int)rbtdb->node_lock_count; i++) + rbtdb->heaps[i] = NULL; + sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner; + for (i = 0; i < (int)rbtdb->node_lock_count; i++) { + result = isc_heap_create(mctx, sooner, set_index, 0, + &rbtdb->heaps[i]); + if (result != ISC_R_SUCCESS) + goto cleanup_heaps; + } + + /* + * Create deadnode lists. 
+ */ + rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count * + sizeof(rbtnodelist_t)); + if (rbtdb->deadnodes == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_heaps; + } + for (i = 0; i < (int)rbtdb->node_lock_count; i++) + ISC_LIST_INIT(rbtdb->deadnodes[i]); + rbtdb->active = rbtdb->node_lock_count; for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { @@ -5512,7 +6996,7 @@ dns_rbtdb_create isc_refcount_decrement(&rbtdb->node_locks[i].references, NULL); isc_refcount_destroy(&rbtdb->node_locks[i].references); } - goto cleanup_node_locks; + goto cleanup_deadnodes; } rbtdb->node_locks[i].exiting = ISC_FALSE; } @@ -5525,7 +7009,7 @@ dns_rbtdb_create isc_mem_attach(mctx, &rbtdb->common.mctx); /* - * Must be initalized before free_rbtdb() is called. + * Must be initialized before free_rbtdb() is called. */ isc_ondestroy_init(&rbtdb->common.ondest); @@ -5539,13 +7023,20 @@ dns_rbtdb_create } /* - * Make the Red-Black Tree. + * Make the Red-Black Trees. */ result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, ISC_FALSE, NULL); return (result); } + + result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3); + if (result != ISC_R_SUCCESS) { + free_rbtdb(rbtdb, ISC_FALSE, NULL); + return (result); + } + /* * In order to set the node callback bit correctly in zone databases, * we need to know if the node has the origin name of the zone. @@ -5568,6 +7059,7 @@ dns_rbtdb_create free_rbtdb(rbtdb, ISC_FALSE, NULL); return (result); } + rbtdb->origin_node->nsec3 = 0; /* * We need to give the origin node the right locknum. 
*/ @@ -5593,7 +7085,6 @@ dns_rbtdb_create return (result); } rbtdb->attributes = 0; - rbtdb->secure = ISC_FALSE; rbtdb->overmem = ISC_FALSE; rbtdb->task = NULL; @@ -5610,6 +7101,14 @@ dns_rbtdb_create free_rbtdb(rbtdb, ISC_FALSE, NULL); return (ISC_R_NOMEMORY); } + rbtdb->current_version->secure = dns_db_insecure; + rbtdb->current_version->havensec3 = ISC_FALSE; + rbtdb->current_version->flags = 0; + rbtdb->current_version->iterations = 0; + rbtdb->current_version->hash = 0; + rbtdb->current_version->salt_length = 0; + memset(rbtdb->current_version->salt, 0, + sizeof(rbtdb->current_version->salt)); rbtdb->future_version = NULL; ISC_LIST_INIT(rbtdb->open_versions); /* @@ -5625,6 +7124,27 @@ dns_rbtdb_create return (ISC_R_SUCCESS); + cleanup_deadnodes: + isc_mem_put(mctx, rbtdb->deadnodes, + rbtdb->node_lock_count * sizeof(rbtnodelist_t)); + + cleanup_heaps: + if (rbtdb->heaps != NULL) { + for (i = 0 ; i < (int)rbtdb->node_lock_count ; i++) + if (rbtdb->heaps[i] != NULL) + isc_heap_destroy(&rbtdb->heaps[i]); + isc_mem_put(mctx, rbtdb->heaps, + rbtdb->node_lock_count * sizeof(isc_heap_t *)); + } + + cleanup_rdatasets: + if (rbtdb->rdatasets != NULL) + isc_mem_put(mctx, rbtdb->rdatasets, rbtdb->node_lock_count * + sizeof(rdatasetheaderlist_t)); + cleanup_rrsetstats: + if (rbtdb->rrsetstats != NULL) + dns_stats_detach(&rbtdb->rrsetstats); + cleanup_node_locks: isc_mem_put(mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); @@ -5655,7 +7175,7 @@ rdataset_disassociate(dns_rdataset_t *rdataset) { static isc_result_t rdataset_first(dns_rdataset_t *rdataset) { - unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; @@ -5691,7 +7211,7 @@ static isc_result_t rdataset_next(dns_rdataset_t *rdataset) { unsigned int count; unsigned int length; - unsigned char *raw; /* RDATASLAB */ + unsigned char *raw; /* RDATASLAB */ count = 
rdataset->privateuint4; if (count == 0) @@ -5710,9 +7230,9 @@ rdataset_next(dns_rdataset_t *rdataset) { raw += length; #if DNS_RDATASET_FIXED } - rdataset->private5 = raw + 4; /* length(2) + order(2) */ + rdataset->private5 = raw + 4; /* length(2) + order(2) */ #else - rdataset->private5 = raw + 2; /* length(2) */ + rdataset->private5 = raw + 2; /* length(2) */ #endif return (ISC_R_SUCCESS); @@ -5720,11 +7240,13 @@ rdataset_next(dns_rdataset_t *rdataset) { static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) { - unsigned char *raw = rdataset->private5; /* RDATASLAB */ + unsigned char *raw = rdataset->private5; /* RDATASLAB */ #if DNS_RDATASET_FIXED unsigned int offset; #endif + unsigned int length; isc_region_t r; + unsigned int flags = 0; REQUIRE(raw != NULL); @@ -5740,15 +7262,22 @@ rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) { raw += offset; } #endif - r.length = raw[0] * 256 + raw[1]; - + length = raw[0] * 256 + raw[1]; #if DNS_RDATASET_FIXED raw += 4; #else raw += 2; #endif + if (rdataset->type == dns_rdatatype_rrsig) { + if (*raw & DNS_RDATASLAB_OFFLINE) + flags |= DNS_RDATA_OFFLINE; + length--; + raw++; + } + r.length = length; r.base = raw; dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r); + rdata->flags |= flags; } static void @@ -5769,7 +7298,7 @@ rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) { static unsigned int rdataset_count(dns_rdataset_t *rdataset) { - unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; @@ -5790,37 +7319,85 @@ rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name, attachnode(db, node, &cloned_node); nsec->methods = &rdataset_methods; nsec->rdclass = db->rdclass; - nsec->type = dns_rdatatype_nsec; + nsec->type = noqname->type; nsec->covers = 0; nsec->ttl = rdataset->ttl; nsec->trust = rdataset->trust; nsec->private1 = rdataset->private1; 
nsec->private2 = rdataset->private2; - nsec->private3 = noqname->nsec; + nsec->private3 = noqname->neg; nsec->privateuint4 = 0; nsec->private5 = NULL; nsec->private6 = NULL; + nsec->private7 = NULL; cloned_node = NULL; attachnode(db, node, &cloned_node); nsecsig->methods = &rdataset_methods; nsecsig->rdclass = db->rdclass; nsecsig->type = dns_rdatatype_rrsig; - nsecsig->covers = dns_rdatatype_nsec; + nsecsig->covers = noqname->type; nsecsig->ttl = rdataset->ttl; nsecsig->trust = rdataset->trust; nsecsig->private1 = rdataset->private1; nsecsig->private2 = rdataset->private2; - nsecsig->private3 = noqname->nsecsig; + nsecsig->private3 = noqname->negsig; nsecsig->privateuint4 = 0; nsecsig->private5 = NULL; nsec->private6 = NULL; + nsec->private7 = NULL; dns_name_clone(&noqname->name, name); return (ISC_R_SUCCESS); } +static isc_result_t +rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name, + dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) +{ + dns_db_t *db = rdataset->private1; + dns_dbnode_t *node = rdataset->private2; + dns_dbnode_t *cloned_node; + struct noqname *closest = rdataset->private7; + + cloned_node = NULL; + attachnode(db, node, &cloned_node); + nsec->methods = &rdataset_methods; + nsec->rdclass = db->rdclass; + nsec->type = closest->type; + nsec->covers = 0; + nsec->ttl = rdataset->ttl; + nsec->trust = rdataset->trust; + nsec->private1 = rdataset->private1; + nsec->private2 = rdataset->private2; + nsec->private3 = closest->neg; + nsec->privateuint4 = 0; + nsec->private5 = NULL; + nsec->private6 = NULL; + nsec->private7 = NULL; + + cloned_node = NULL; + attachnode(db, node, &cloned_node); + nsecsig->methods = &rdataset_methods; + nsecsig->rdclass = db->rdclass; + nsecsig->type = dns_rdatatype_rrsig; + nsecsig->covers = closest->type; + nsecsig->ttl = rdataset->ttl; + nsecsig->trust = rdataset->trust; + nsecsig->private1 = rdataset->private1; + nsecsig->private2 = rdataset->private2; + nsecsig->private3 = closest->negsig; + nsecsig->privateuint4 
= 0; + nsecsig->private5 = NULL; + nsec->private6 = NULL; + nsec->private7 = NULL; + + dns_name_clone(&closest->name, name); + + return (ISC_R_SUCCESS); +} + /* * Rdataset Iterator Methods */ @@ -5871,13 +7448,13 @@ rdatasetiter_first(dns_rdatasetiter_t *iterator) { * record? Or is it too old in the cache? * * Note: unlike everywhere else, we - * check for now > header->ttl instead - * of now >= header->ttl. This allows + * check for now > header->rdh_ttl instead + * of now >= header->rdh_ttl. This allows * ANY and RRSIG queries for 0 TTL * rdatasets to work. */ if (NONEXISTENT(header) || - (now != 0 && now > header->ttl)) + (now != 0 && now > header->rdh_ttl)) header = NULL; break; } else @@ -5953,7 +7530,7 @@ rdatasetiter_next(dns_rdatasetiter_t *iterator) { */ if ((header->attributes & RDATASET_ATTR_NONEXISTENT) != 0 || - (now != 0 && now > header->ttl)) + (now != 0 && now > header->rdh_ttl)) header = NULL; break; } else @@ -6009,9 +7586,7 @@ reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { return; INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none); - NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); - new_reference(rbtdb, node); - NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); + reactivate_node(rbtdb, node, rbtdbiter->tree_locked); } static inline void @@ -6026,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { lock = &rbtdb->node_locks[node->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); rbtdbiter->node = NULL; @@ -6067,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -6117,6 +7692,7 @@ dbiterator_destroy(dns_dbiterator_t **iteratorp) { 
dns_db_detach(&rbtdbiter->common.db); dns_rbtnodechain_reset(&rbtdbiter->chain); + dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter)); dns_db_detach(&db); @@ -6142,12 +7718,25 @@ dbiterator_first(dns_dbiterator_t *iterator) { name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); + dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); - result = dns_rbtnodechain_first(&rbtdbiter->chain, rbtdb->tree, name, - origin); - + if (rbtdbiter->nsec3only) { + rbtdbiter->current = &rbtdbiter->nsec3chain; + result = dns_rbtnodechain_first(rbtdbiter->current, + rbtdb->nsec3, name, origin); + } else { + rbtdbiter->current = &rbtdbiter->chain; + result = dns_rbtnodechain_first(rbtdbiter->current, + rbtdb->tree, name, origin); + if (!rbtdbiter->nonsec3 && result == ISC_R_NOTFOUND) { + rbtdbiter->current = &rbtdbiter->nsec3chain; + result = dns_rbtnodechain_first(rbtdbiter->current, + rbtdb->nsec3, name, + origin); + } + } if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) { - result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL, + result = dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); if (result == ISC_R_SUCCESS) { rbtdbiter->new_origin = ISC_TRUE; @@ -6182,11 +7771,21 @@ dbiterator_last(dns_dbiterator_t *iterator) { name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); + dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); - result = dns_rbtnodechain_last(&rbtdbiter->chain, rbtdb->tree, name, - origin); + result = ISC_R_NOTFOUND; + if (rbtdbiter->nsec3only && !rbtdbiter->nonsec3) { + rbtdbiter->current = &rbtdbiter->nsec3chain; + result = dns_rbtnodechain_last(rbtdbiter->current, + rbtdb->nsec3, name, origin); + } + if (!rbtdbiter->nsec3only && result == ISC_R_NOTFOUND) { + rbtdbiter->current = &rbtdbiter->chain; + result = 
dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree, + name, origin); + } if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) { - result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL, + result = dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); if (result == ISC_R_SUCCESS) { rbtdbiter->new_origin = ISC_TRUE; @@ -6210,6 +7809,7 @@ dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name) { dns_name_t *iname, *origin; if (rbtdbiter->result != ISC_R_SUCCESS && + rbtdbiter->result != ISC_R_NOTFOUND && rbtdbiter->result != ISC_R_NOMORE) return (rbtdbiter->result); @@ -6221,22 +7821,74 @@ dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name) { iname = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); + dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); + + if (rbtdbiter->nsec3only) { + rbtdbiter->current = &rbtdbiter->nsec3chain; + result = dns_rbt_findnode(rbtdb->nsec3, name, NULL, + &rbtdbiter->node, + rbtdbiter->current, + DNS_RBTFIND_EMPTYDATA, NULL, NULL); + } else if (rbtdbiter->nonsec3) { + rbtdbiter->current = &rbtdbiter->chain; + result = dns_rbt_findnode(rbtdb->tree, name, NULL, + &rbtdbiter->node, + rbtdbiter->current, + DNS_RBTFIND_EMPTYDATA, NULL, NULL); + } else { + /* + * Stay on main chain if not found on either chain. 
+ */ + rbtdbiter->current = &rbtdbiter->chain; + result = dns_rbt_findnode(rbtdb->tree, name, NULL, + &rbtdbiter->node, + rbtdbiter->current, + DNS_RBTFIND_EMPTYDATA, NULL, NULL); + if (result == DNS_R_PARTIALMATCH) { + dns_rbtnode_t *node = NULL; + result = dns_rbt_findnode(rbtdb->nsec3, name, NULL, + &node, &rbtdbiter->nsec3chain, + DNS_RBTFIND_EMPTYDATA, + NULL, NULL); + if (result == ISC_R_SUCCESS) { + rbtdbiter->node = node; + rbtdbiter->current = &rbtdbiter->nsec3chain; + } + } + } - result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node, - &rbtdbiter->chain, DNS_RBTFIND_EMPTYDATA, - NULL, NULL); +#if 1 if (result == ISC_R_SUCCESS) { - result = dns_rbtnodechain_current(&rbtdbiter->chain, iname, + result = dns_rbtnodechain_current(rbtdbiter->current, iname, origin, NULL); if (result == ISC_R_SUCCESS) { rbtdbiter->new_origin = ISC_TRUE; reference_iter_node(rbtdbiter); } - - } else if (result == DNS_R_PARTIALMATCH) + } else if (result == DNS_R_PARTIALMATCH) { result = ISC_R_NOTFOUND; + rbtdbiter->node = NULL; + } rbtdbiter->result = result; +#else + if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) { + isc_result_t tresult; + tresult = dns_rbtnodechain_current(rbtdbiter->current, iname, + origin, NULL); + if (tresult == ISC_R_SUCCESS) { + rbtdbiter->new_origin = ISC_TRUE; + reference_iter_node(rbtdbiter); + } else { + result = tresult; + rbtdbiter->node = NULL; + } + } else + rbtdbiter->node = NULL; + + rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? 
+ ISC_R_SUCCESS : result; +#endif return (result); } @@ -6246,6 +7898,7 @@ dbiterator_prev(dns_dbiterator_t *iterator) { isc_result_t result; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_name_t *name, *origin; + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; REQUIRE(rbtdbiter->node != NULL); @@ -6257,13 +7910,23 @@ dbiterator_prev(dns_dbiterator_t *iterator) { name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); - result = dns_rbtnodechain_prev(&rbtdbiter->chain, name, origin); + result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin); + if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only && + !rbtdbiter->nonsec3 && + &rbtdbiter->nsec3chain == rbtdbiter->current) { + rbtdbiter->current = &rbtdbiter->chain; + dns_rbtnodechain_reset(rbtdbiter->current); + result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree, + name, origin); + if (result == ISC_R_NOTFOUND) + result = ISC_R_NOMORE; + } dereference_iter_node(rbtdbiter); if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) { rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN); - result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL, + result = dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); } @@ -6280,6 +7943,7 @@ dbiterator_next(dns_dbiterator_t *iterator) { isc_result_t result; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_name_t *name, *origin; + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; REQUIRE(rbtdbiter->node != NULL); @@ -6291,13 +7955,22 @@ dbiterator_next(dns_dbiterator_t *iterator) { name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); - result = dns_rbtnodechain_next(&rbtdbiter->chain, name, origin); + result = dns_rbtnodechain_next(rbtdbiter->current, name, origin); + if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only && + !rbtdbiter->nonsec3 && &rbtdbiter->chain == rbtdbiter->current) { + rbtdbiter->current = 
&rbtdbiter->nsec3chain; + dns_rbtnodechain_reset(rbtdbiter->current); + result = dns_rbtnodechain_first(rbtdbiter->current, + rbtdb->nsec3, name, origin); + if (result == ISC_R_NOTFOUND) + result = ISC_R_NOMORE; + } dereference_iter_node(rbtdbiter); if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) { rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN); - result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL, + result = dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); } if (result == ISC_R_SUCCESS) @@ -6421,7 +8094,7 @@ rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; - unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int current_count = rdataset->privateuint4; unsigned int count; rdatasetheader_t *header; @@ -6567,7 +8240,7 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; - unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int current_count = rdataset->privateuint4; rdatasetheader_t *header; unsigned int total_count, count; @@ -6673,7 +8346,7 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, return (ISC_R_SUCCESS); - fail: + fail: if (newcbarg != NULL) { if (newentry != NULL) { acache_cancelentry(rbtdb->common.mctx, newentry, @@ -6696,7 +8369,7 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; - unsigned char *raw = rdataset->private3; /* RDATASLAB */ + unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int current_count = rdataset->privateuint4; rdatasetheader_t *header; 
nodelock_t *nodelock; @@ -6705,7 +8378,7 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, dns_acacheentry_t *entry; acache_cbarg_t *cbarg; - UNUSED(qtype); /* we do not use this value at least for now */ + UNUSED(qtype); /* we do not use this value at least for now */ UNUSED(acache); if (type == dns_rdatasetadditional_fromcache) @@ -6752,9 +8425,159 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, NODE_UNLOCK(nodelock, isc_rwlocktype_write); if (entry != NULL) { - acache_cancelentry(rbtdb->common.mctx, entry, &cbarg); + if (cbarg != NULL) + acache_cancelentry(rbtdb->common.mctx, entry, &cbarg); dns_acache_detachentry(&entry); } return (ISC_R_SUCCESS); } + +/*% + * Routines for LRU-based cache management. + */ + +/*% + * See if a given cache entry that is being reused needs to be updated + * in the LRU-list. From the LRU management point of view, this function is + * expected to return true for almost all cases. When used with threads, + * however, this may cause a non-negligible performance penalty because a + * writer lock will have to be acquired before updating the list. + * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this + * function returns true if the entry has not been updated for some period of + * time. We differentiate the NS or glue address case and the others since + * experiments have shown that the former tends to be accessed relatively + * infrequently and the cost of cache miss is higher (e.g., a missing NS records + * may cause external queries at a higher level zone, involving more + * transactions). + * + * Caller must hold the node (read or write) lock. 
+ */ +static inline isc_boolean_t +need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) { + if ((header->attributes & + (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) + return (ISC_FALSE); + +#if DNS_RBTDB_LIMITLRUUPDATE + if (header->type == dns_rdatatype_ns || + (header->trust == dns_trust_glue && + (header->type == dns_rdatatype_a || + header->type == dns_rdatatype_aaaa))) { + /* + * Glue records are updated if at least 60 seconds have passed + * since the previous update time. + */ + return (header->last_used + 60 <= now); + } + + /* Other records are updated if 5 minutes have passed. */ + return (header->last_used + 300 <= now); +#else + UNUSED(now); + + return (ISC_TRUE); +#endif +} + +/*% + * Update the timestamp of a given cache entry and move it to the head + * of the corresponding LRU list. + * + * Caller must hold the node (write) lock. + * + * Note that the we do NOT touch the heap here, as the TTL has not changed. + */ +static void +update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_stdtime_t now) +{ + INSIST(IS_CACHE(rbtdb)); + + /* To be checked: can we really assume this? XXXMLG */ + INSIST(ISC_LINK_LINKED(header, lru_link)); + + ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum], + header, lru_link); + header->last_used = now; + ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], + header, lru_link); +} + +/*% + * Purge some expired and/or stale (i.e. unused for some period) cache entries + * under an overmem condition. To recover from this condition quickly, up to + * 2 entries will be purged. This process is triggered while adding a new + * entry, and we specifically avoid purging entries in the same LRU bucket as + * the one to which the new entry will belong. Otherwise, we might purge + * entries of the same name of different RR types while adding RRsets from a + * single response (consider the case where we're adding A and AAAA glue records + * of the same NS name). 
+ */ +static void +overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, + isc_stdtime_t now, isc_boolean_t tree_locked) +{ + rdatasetheader_t *header, *header_prev; + unsigned int locknum; + int purgecount = 2; + + for (locknum = (locknum_start + 1) % rbtdb->node_lock_count; + locknum != locknum_start && purgecount > 0; + locknum = (locknum + 1) % rbtdb->node_lock_count) { + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + + header = isc_heap_element(rbtdb->heaps[locknum], 1); + if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) { + expire_header(rbtdb, header, tree_locked); + purgecount--; + } + + for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]); + header != NULL && purgecount > 0; + header = header_prev) { + header_prev = ISC_LIST_PREV(header, lru_link); + /* + * Unlink the entry at this point to avoid checking it + * again even if it's currently used someone else and + * cannot be purged at this moment. This entry won't be + * referenced any more (so unlinking is safe) since the + * TTL was reset to 0. + */ + ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, + lru_link); + expire_header(rbtdb, header, tree_locked); + purgecount--; + } + + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } +} + +static void +expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_boolean_t tree_locked) +{ + set_ttl(rbtdb, header, 0); + header->attributes |= RDATASET_ATTR_STALE; + header->node->dirty = 1; + + /* + * Caller must hold the node (write) lock. + */ + + if (dns_rbtnode_refcurrent(header->node) == 0) { + /* + * If no one else is using the node, we can clean it up now. + * We first need to gain a new reference to the node to meet a + * requirement of decrement_reference(). + */ + new_reference(rbtdb, header->node); + decrement_reference(rbtdb, header->node, 0, + isc_rwlocktype_write, + tree_locked ? isc_rwlocktype_write : + isc_rwlocktype_none, ISC_FALSE); + } +} |