merge code from ipfw3-head to reduce contention on the ipfw lock

and remove all O(N) sequences from kernel critical sections in ipfw. In detail: 1. introduce a IPFW_UH_LOCK to arbitrate requests from the upper half of the kernel. Some things, such as 'ipfw show', can be done holding this lock in read mode, whereas insert and delete require IPFW_UH_WLOCK. 2. introduce a mapping structure to keep rules together. This replaces the 'next' chain currently used in ipfw rules. At the moment the map is a simple array (sorted by rule number and then rule_id), so we can find a rule quickly instead of having to scan the list. This reduces many expensive lookups from O(N) to O(log N). 3. when an expensive operation (such as insert or delete) is done by userland, we grab IPFW_UH_WLOCK, create a new copy of the map without blocking the bottom half of the kernel, then acquire IPFW_WLOCK and quickly update pointers to the map and related info. After dropping IPFW_LOCK we can then continue the cleanup protected by IPFW_UH_LOCK. So userland still costs O(N) but the kernel side is only blocked for O(1). 4. do not pass pointers to rules through dummynet, netgraph, divert etc, but rather pass a <slot, chain_id, rulenum, rule_id> tuple. We validate the slot index (in the array of #2) with chain_id, and if successful do a O(1) dereference; otherwise, we can find the rule in O(log N) through <rulenum, rule_id> All the above does not change the userland/kernel ABI, though there are some disgusting casts between pointers and uint32_t Operation costs now are as follows: Function Old Now Planned ------------------------------------------------------------------- + skipto X, non cached O(N) O(log N) + skipto X, cached O(1) O(1) XXX dynamic rule lookup O(1) O(log N) O(1) + skipto tablearg O(N) O(1) + reinject, non cached O(N) O(log N) + reinject, cached O(1) O(1) + kernel blocked during setsockopt() O(N) O(1) ------------------------------------------------------------------- The only (very small) regression is on dynamic rule lookup and this will be fixed in a day or two, without changing the userland/kernel ABI Supported by: Valeria Paoli MFC after: 1 month
author: luigi <luigi@FreeBSD.org> 2009-12-22 19:01:47 +0000
committer: luigi <luigi@FreeBSD.org> 2009-12-22 19:01:47 +0000
commit: 2043aec45642350832212819a4c7cf828ba2dffa (patch)
tree: 212a476e3a032fe33a57f72f211a5e84e676bbd4
parent: 69eb1537ebfe9e513651b7095899759cee1b96aa (diff)
download: FreeBSD-src-2043aec45642350832212819a4c7cf828ba2dffa.zip
FreeBSD-src-2043aec45642350832212819a4c7cf828ba2dffa.tar.gz
12 files changed, 415 insertions, 402 deletions
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index d7de38e..b380ef0 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -3039,20 +3039,23 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
 			goto bad;
 	}
 
+	/* XXX this section is also in if_ethersubr.c */
 	if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) {
 		struct dn_pkt_tag *dn_tag;
 
 		error = -1;
 		dn_tag = ip_dn_claim_tag(*mp);
-		if (dn_tag != NULL) {
-			if (dn_tag->rule != NULL && V_fw_one_pass)
+		if (dn_tag == NULL) {
+			args.slot = 0;
+		} else {
+			if (dn_tag->slot != 0 && V_fw_one_pass)
 				/* packet already partially processed */
 				goto ipfwpass;
-			args.rule = dn_tag->rule; /* matching rule to restart */
-			args.rule_id = dn_tag->rule_id;
+			args.slot = dn_tag->slot; /* next rule to use */
 			args.chain_id = dn_tag->chain_id;
-		} else
-			args.rule = NULL;
+			args.rulenum = dn_tag->rulenum;
+			args.rule_id = dn_tag->rule_id;
+		}
 
 		args.m = *mp;
 		args.oif = ifp;
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 8d06926..95554ca 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -471,15 +471,17 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared)
 
 	dn_tag = ip_dn_claim_tag(*m0);
 
-	if (dn_tag != NULL) {
-		if (dn_tag->rule != NULL && V_fw_one_pass)
+	if (dn_tag == NULL) {
+		args.slot = 0;
+	} else {
+		if (dn_tag->slot != 0 && V_fw_one_pass)
 			/* dummynet packet, already partially processed */
 			return (1);
-		args.rule = dn_tag->rule;	/* matching rule to restart */
+		args.slot = dn_tag->slot;	/* matching rule to restart */
+		args.rulenum = dn_tag->rulenum;
 		args.rule_id = dn_tag->rule_id;
 		args.chain_id = dn_tag->chain_id;
-	} else
-		args.rule = NULL;
+	}
 
 	/*
 	 * I need some amt of data to be contiguous, and in case others need
diff --git a/sys/netgraph/ng_ipfw.c b/sys/netgraph/ng_ipfw.c
index 2f80945..974f48f 100644
--- a/sys/netgraph/ng_ipfw.c
+++ b/sys/netgraph/ng_ipfw.c
@@ -293,7 +293,8 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee)
 			m_freem(m);
 			return (ENOMEM);
 		}
-		ngit->rule = fwa->rule;
+		ngit->slot = fwa->slot;
+		ngit->rulenum = fwa->rulenum;
 		ngit->rule_id = fwa->rule_id;
 		ngit->chain_id = fwa->chain_id;
 		ngit->dir = dir;
diff --git a/sys/netgraph/ng_ipfw.h b/sys/netgraph/ng_ipfw.h
index 29039f2..9fb4cf7 100644
--- a/sys/netgraph/ng_ipfw.h
+++ b/sys/netgraph/ng_ipfw.h
@@ -37,7 +37,8 @@ extern	ng_ipfw_input_t	*ng_ipfw_input_p;
 
 struct ng_ipfw_tag {
 	struct m_tag	mt;		/* tag header */
-	struct ip_fw	*rule;		/* matching rule */
+	uint32_t	slot;		/* slot for next rule */
+	uint32_t	rulenum;	/* matching rule number */
 	uint32_t	rule_id;	/* matching rule id */
 	uint32_t	chain_id;	/* ruleset id */
 	struct ifnet	*ifp;		/* interface, for ip_output */
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index b5ef19e..9d5223f 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -112,7 +112,8 @@ struct dn_heap {
  * processing requirements.
  */
 struct dn_pkt_tag {
-    struct ip_fw *rule;		/* matching rule */
+    uint32_t slot;		/* slot of next rule to use */
+    uint32_t rulenum;		/* matching rule number */
     uint32_t rule_id;		/* matching rule id */
     uint32_t chain_id;		/* ruleset id */
     int dn_dir;			/* action when packet comes out. */
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 34540fd..21a79ec 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -461,7 +461,7 @@ typedef struct _ipfw_insn_icmp6 {
  */
 
 struct ip_fw {
-	struct ip_fw	*next;		/* linked list of rules		*/
+	struct ip_fw	*x_next;	/* linked list of rules		*/
 	struct ip_fw	*next_rule;	/* ptr to next [skipto] rule	*/
 	/* 'next_rule' is used to pass up 'set_disable' status		*/
 
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
index 6f8dc4d..a4f9668 100644
--- a/sys/netinet/ipfw/ip_dummynet.c
+++ b/sys/netinet/ipfw/ip_dummynet.c
@@ -1367,20 +1367,11 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
 	struct dn_pipe *pipe;
 	uint64_t len = m->m_pkthdr.len;
 	struct dn_flow_queue *q = NULL;
-	int is_pipe;
-	ipfw_insn *cmd = ACTION_PTR(fwa->rule);
+	int is_pipe = fwa->cookie & 0x8000000 ? 0 : 1;
 
 	KASSERT(m->m_nextpkt == NULL,
 	    ("dummynet_io: mbuf queue passed to dummynet"));
 
-	if (cmd->opcode == O_LOG)
-		cmd += F_LEN(cmd);
-	if (cmd->opcode == O_ALTQ)
-		cmd += F_LEN(cmd);
-	if (cmd->opcode == O_TAG)
-		cmd += F_LEN(cmd);
-	is_pipe = (cmd->opcode == O_PIPE);
-
 	DUMMYNET_LOCK();
 	io_pkt++;
 	/*
@@ -1390,11 +1381,11 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
 	 * below can be simplified.
 	 */
 	if (is_pipe) {
-		pipe = locate_pipe(fwa->cookie);
+		pipe = locate_pipe(fwa->cookie & 0xffff);
 		if (pipe != NULL)
 			fs = &(pipe->fs);
 	} else
-		fs = locate_flowset(fwa->cookie);
+		fs = locate_flowset(fwa->cookie & 0xffff);
 
 	if (fs == NULL)
 		goto dropit;	/* This queue/pipe does not exist! */
@@ -1440,7 +1431,8 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
 	 * Ok, i can handle the pkt now...
 	 * Build and enqueue packet + parameters.
 	 */
-	pkt->rule = fwa->rule;
+	pkt->slot = fwa->slot;
+	pkt->rulenum = fwa->rulenum;
 	pkt->rule_id = fwa->rule_id;
 	pkt->chain_id = fwa->chain_id;
 	pkt->dn_dir = dir;
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index ca998c1..b19ece4 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -628,31 +628,6 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
 	args->m = NULL;
 }
 
-/**
- * Return the pointer to the skipto target.
- *
- * IMPORTANT: this should only be called on SKIPTO rules, and the
- * jump target is taken from the 'rulenum' argument, which may come
- * from the rule itself (direct skipto) or not (tablearg)
- *
- * The function never returns NULL: if the requested rule is not
- * present, it returns the next rule in the chain.
- * This also happens in case of a bogus argument > 65535
- */
-static struct ip_fw *
-lookup_next_rule(struct ip_fw *me, uint32_t rulenum)
-{
-	struct ip_fw *rule;
-
-	for (rule = me->next; rule ; rule = rule->next) {
-		if (rule->rulenum >= rulenum)
-			break;
-	}
-	if (rule == NULL)	/* failure or not a skipto */
-		rule = me->next ? me->next : me;
-	return rule;
-}
-
 /*
  * Support for uid/gid/jail lookup. These tests are expensive
  * (because we may need to look into the list of active sockets)
@@ -741,11 +716,13 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
  * Helper function to write the matching rule into args
  */
 static inline void
-set_match(struct ip_fw_args *args, struct ip_fw *f, struct ip_fw_chain *chain)
+set_match(struct ip_fw_args *args, int slot,
+	struct ip_fw_chain *chain)
 {
-	args->rule = f;
-	args->rule_id = f->id;
 	args->chain_id = chain->id;
+	args->slot = slot + 1; /* we use 0 as a marker */
+	args->rule_id = chain->map[slot]->id;
+	args->rulenum = chain->map[slot]->rulenum;
 }
 
 /*
@@ -839,7 +816,7 @@ ipfw_chk(struct ip_fw_args *args)
 	 */
 	struct ifnet *oif = args->oif;
 
-	struct ip_fw *f = NULL;		/* matching rule */
+	int f_pos = 0;		/* index of current rule in the array */
 	int retval = 0;
 
 	/*
@@ -1168,31 +1145,21 @@ do {								\
 		return (IP_FW_PASS);	/* accept */
 	}
 	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
-	if (args->rule) {
+	if (args->slot) {
 		/*
 		 * Packet has already been tagged as a result of a previous
 		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
 		 * REASS, NETGRAPH and similar, never a skipto).
-		 * Validate the pointer and continue from args->rule->next
-		 * if still present, otherwise use the default rule.
-		 * XXX If fw_one_pass != 0 then just accept it, though
-		 * the caller should never pass us such packets.
+		 * Validate the slot and continue from the next one
+		 * if still present, otherwise do a lookup.
 		 */
 		if (V_fw_one_pass) {
 			IPFW_RUNLOCK(chain);
 			return (IP_FW_PASS);
 		}
-		if (chain->id == args->chain_id) { /* pointer still valid */
-			f = args->rule->next;
-		} else { /* must revalidate the pointer */
-			for (f = chain->rules; f != NULL; f = f->next)
-				if (f == args->rule && f->id == args->rule_id) {
-					f = args->rule->next;
-					break;
-				}
-		}
-		if (f == NULL) /* in case of errors, use default; */
-			f = chain->default_rule;
+		f_pos = (args->chain_id == chain->id) ?
+		    args->slot /* already incremented */ :
+		    ipfw_find_rule(chain, args->rulenum, args->rule_id+1);
 	} else {
 		/*
 		 * Find the starting rule. It can be either the first
@@ -1200,18 +1167,13 @@ do {								\
 		 */
 		int skipto = mtag ? divert_cookie(mtag) : 0;
 
-		f = chain->rules;
+		f_pos = 0;
 		if (args->eh == NULL && skipto != 0) {
 			if (skipto >= IPFW_DEFAULT_RULE) {
 				IPFW_RUNLOCK(chain);
 				return (IP_FW_DENY); /* invalid */
 			}
-			while (f && f->rulenum <= skipto)
-				f = f->next;
-			if (f == NULL) {	/* drop packet */
-				IPFW_RUNLOCK(chain);
-				return (IP_FW_DENY);
-			}
+			f_pos = ipfw_find_rule(chain, skipto, 0);
 		}
 	}
 	/* reset divert rule to avoid confusion later */
@@ -1227,7 +1189,7 @@ do {								\
 	 * need to break out of one or both loops, or re-enter one of
 	 * the loops with updated variables. Loop variables are:
 	 *
-	 *	f (outer loop) points to the current rule.
+	 *	f_pos (outer loop) points to the current rule.
 	 *		On output it points to the matching rule.
 	 *	done (outer loop) is used as a flag to break the loop.
 	 *	l (inner loop)	residual length of current rule.
@@ -1236,15 +1198,16 @@ do {								\
 	 * We break the inner loop by setting l=0 and possibly
 	 * cmdlen=0 if we don't want to advance cmd.
 	 * We break the outer loop by setting done=1
-	 * We can restart the inner loop by setting l>0 and f, cmd
+	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
 	 * as needed.
 	 */
-	for (; f; f = f->next) {
+	for (; f_pos < chain->n_rules; f_pos++) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
+		struct ip_fw *f;
 
-/* again: */
+		f = chain->map[f_pos];
 		if (V_set_disable & (1 << f->set) )
 			continue;
 
@@ -1864,8 +1827,8 @@ do {								\
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
-			 *   (setting l=0), or to the SKIPTO target (by
-			 *   setting f, cmd and l as needed), respectively.
+			 *   (setting l=0), or to the SKIPTO target (setting
+			 *   f/f_len, cmd and l as needed), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
@@ -1926,7 +1889,14 @@ do {								\
 					 */
 					q->pcnt++;
 					q->bcnt += pktlen;
+					/* XXX we would like to have f_pos
+					 * readily accessible in the dynamic
+				         * rule, instead of having to
+					 * lookup q->rule.
+					 */
 					f = q->rule;
+					f_pos = ipfw_find_rule(chain,
+						f->rulenum, f->id);
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					ipfw_dyn_unlock();
@@ -1952,9 +1922,11 @@ do {								\
 
 			case O_PIPE:
 			case O_QUEUE:
-				set_match(args, f, chain);
+				set_match(args, f_pos, chain);
 				args->cookie = (cmd->arg1 == IP_FW_TABLEARG) ?
 					tablearg : cmd->arg1;
+				if (cmd->opcode == O_QUEUE)
+					args->cookie |= 0x80000000;
 				retval = IP_FW_DUMMYNET;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
@@ -1987,38 +1959,53 @@ do {								\
 				break;
 
 			case O_COUNT:
-			case O_SKIPTO:
 				f->pcnt++;	/* update stats */
 				f->bcnt += pktlen;
 				f->timestamp = time_uptime;
-				if (cmd->opcode == O_COUNT) {
 					l = 0;		/* exit inner loop */
 					break;
-				}
-				/* skipto: */
-				if (cmd->arg1 == IP_FW_TABLEARG) {
-				    f = lookup_next_rule(f, tablearg);
-				} else { /* direct skipto */
-				    /* update f->next_rule if not set */
-				    if (f->next_rule == NULL)
+
+			case O_SKIPTO:
+			    f->pcnt++;	/* update stats */
+			    f->bcnt += pktlen;
+			    f->timestamp = time_uptime;
+			    /* If possible use cached f_pos (in f->next_rule),
+			     * whose version is written in f->next_rule
+			     * (horrible hacks to avoid changing the ABI).
+			     */
+			    if (cmd->arg1 != IP_FW_TABLEARG &&
+				    (uint32_t)f->x_next == chain->id) {
+				f_pos = (uint32_t)f->next_rule;
+			    } else {
+				int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+					tablearg : cmd->arg1;
+				/* make sure we do not jump backward */
+				if (i <= f->rulenum)
+				    i = f->rulenum + 1;
+				f_pos = ipfw_find_rule(chain, i, 0);
+				/* update the cache */
+				if (cmd->arg1 != IP_FW_TABLEARG) {
 					f->next_rule =
-					    lookup_next_rule(f, cmd->arg1);
-				    f = f->next_rule;
+					(void *)(uintptr_t)f_pos;
+				    f->x_next =
+					(void *)(uintptr_t)chain->id;
+				}
 				}
 				/*
-				 * Skip disabled rules, and
-				 * re-enter the inner loop
-				 * with the correct f, l and cmd.
+			     * Skip disabled rules, and re-enter
+			     * the inner loop with the correct
+			     * f_pos, f, l and cmd.
 				 * Also clear cmdlen and skip_or
 				 */
-				while (f && (V_set_disable & (1 << f->set)))
-					f = f->next;
-				if (f) { /* found a valid rule */
+			    for (; f_pos < chain->n_rules - 1 &&
+				    (V_set_disable &
+				     (1 << chain->map[f_pos]->set));
+				    f_pos++)
+				;
+			    /* prepare to enter the inner loop */
+			    f = chain->map[f_pos];
 					l = f->cmd_len;
 					cmd = f->cmd;
-				} else { /* should not happen */
-					l = 0;  /* exit inner loop */
-				}
 				match = 1;
 				cmdlen = 0;
 				skip_or = 0;
@@ -2083,7 +2070,7 @@ do {								\
 
 			case O_NETGRAPH:
 			case O_NGTEE:
-				set_match(args, f, chain);
+				set_match(args, f_pos, chain);
 				args->cookie = (cmd->arg1 == IP_FW_TABLEARG) ?
 					tablearg : cmd->arg1;
 				retval = (cmd->opcode == O_NETGRAPH) ?
@@ -2108,7 +2095,7 @@ do {								\
 				    struct cfg_nat *t;
 				    int nat_id;
 
-				    set_match(args, f, chain);
+				    set_match(args, f_pos, chain);
 				    t = ((ipfw_insn_nat *)cmd)->nat;
 				    if (t == NULL) {
 					nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
@@ -2175,7 +2162,7 @@ do {								\
 				    else
 					ip->ip_sum = in_cksum(m, hlen);
 				    retval = IP_FW_REASS;
-				    set_match(args, f, chain);
+				    set_match(args, f_pos, chain);
 				}
 				done = 1;	/* exit outer loop */
 				break;
@@ -2209,10 +2196,11 @@ do {								\
 	}		/* end of outer for, scan rules */
 
 	if (done) {
+		struct ip_fw *rule = chain->map[f_pos];
 		/* Update statistics */
-		f->pcnt++;
-		f->bcnt += pktlen;
-		f->timestamp = time_uptime;
+		rule->pcnt++;
+		rule->bcnt += pktlen;
+		rule->timestamp = time_uptime;
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
@@ -2308,7 +2296,7 @@ static int
 vnet_ipfw_init(const void *unused)
 {
 	int error;
-	struct ip_fw default_rule;
+	struct ip_fw *rule = NULL;
 	struct ip_fw_chain *chain;
 
 	chain = &V_layer3_chain;
@@ -2322,38 +2310,39 @@ vnet_ipfw_init(const void *unused)
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
+#ifdef IPFIREWALL_NAT
+	LIST_INIT(&chain->nat);
+#endif
 
+	/* insert the default rule and create the initial map */
+	chain->n_rules = 1;
+	chain->static_len = sizeof(struct ip_fw);
+	chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+	if (chain->map)
+		rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+	if (rule == NULL) {
+		if (chain->map)
+			free(chain->map, M_IPFW);
+		printf("ipfw2: ENOSPC initializing default rule "
+			"(support disabled)\n");
+		return (ENOSPC);
+	}
 	error = ipfw_init_tables(chain);
 	if (error) {
 		panic("init_tables"); /* XXX Marko fix this ! */
 	}
-#ifdef IPFIREWALL_NAT
-	LIST_INIT(&chain->nat);
-#endif
 
+	/* fill and insert the default rule */
+	rule->act_ofs = 0;
+	rule->rulenum = IPFW_DEFAULT_RULE;
+	rule->cmd_len = 1;
+	rule->set = RESVD_SET;
+	rule->cmd[0].len = 1;
+	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+	chain->rules = chain->default_rule = chain->map[0] = rule;
+	chain->id = rule->id = 1;
 
-	chain->rules = NULL;
 	IPFW_LOCK_INIT(chain);
-
-	bzero(&default_rule, sizeof default_rule);
-	default_rule.act_ofs = 0;
-	default_rule.rulenum = IPFW_DEFAULT_RULE;
-	default_rule.cmd_len = 1;
-	default_rule.set = RESVD_SET;
-	default_rule.cmd[0].len = 1;
-	default_rule.cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
-	error = ipfw_add_rule(chain, &default_rule);
-
-	if (error != 0) {
-		printf("ipfw2: error %u initializing default rule "
-			"(support disabled)\n", error);
-		IPFW_LOCK_DESTROY(chain);
-		printf("leaving ipfw_iattach (1) with error %d\n", error);
-		return (error);
-	}
-
-	chain->default_rule = chain->rules;
-
 	ipfw_dyn_init();
 
 	/* First set up some values that are compile time options */
@@ -2385,8 +2374,9 @@ vnet_ipfw_init(const void *unused)
 static int
 vnet_ipfw_uninit(const void *unused)
 {
-	struct ip_fw *reap;
+	struct ip_fw *reap, *rule;
 	struct ip_fw_chain *chain = &V_layer3_chain;
+	int i;
 
 	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
 	/*
@@ -2400,19 +2390,26 @@ vnet_ipfw_uninit(const void *unused)
 #endif
 	V_ip_fw_chk_ptr = NULL;
 	V_ip_fw_ctl_ptr = NULL;
+	IPFW_UH_WLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+	IPFW_UH_WLOCK(chain);
 
 	IPFW_WLOCK(chain);
-	/* We wait on the wlock here until the last user leaves */
 	IPFW_WUNLOCK(chain);
 	IPFW_WLOCK(chain);
 
 	ipfw_dyn_uninit(0);	/* run the callout_drain */
 	ipfw_flush_tables(chain);
-	chain->reap = NULL;
-	ipfw_free_chain(chain, 1 /* kill default rule */);
-	reap = chain->reap;
-	chain->reap = NULL;
+	reap = NULL;
+	for (i = 0; i < chain->n_rules; i++) {
+		rule = chain->map[i];
+		rule->x_next = reap;
+		reap = rule;
+	}
+	if (chain->map)
+		free(chain->map, M_IPFW);
 	IPFW_WUNLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
 	if (reap != NULL)
 		ipfw_reap_rules(reap);
 	IPFW_LOCK_DESTROY(chain);
diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c
index cd404d4..e3515af 100644
--- a/sys/netinet/ipfw/ip_fw_log.c
+++ b/sys/netinet/ipfw/ip_fw_log.c
@@ -26,9 +26,6 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define        DEB(x)
-#define        DDB(x) x
-
 /*
  * Logging support for ipfw
  */
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
index e1dcf16..35ee157 100644
--- a/sys/netinet/ipfw/ip_fw_pfil.c
+++ b/sys/netinet/ipfw/ip_fw_pfil.c
@@ -125,7 +125,8 @@ ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
 	if (ng_tag != NULL) {
 		KASSERT(ng_tag->dir == NG_IPFW_IN,
 		    ("ng_ipfw tag with wrong direction"));
-		args.rule = ng_tag->rule;
+		args.slot = ng_tag->slot;
+		args.rulenum = ng_tag->rulenum;
 		args.rule_id = ng_tag->rule_id;
 		args.chain_id = ng_tag->chain_id;
 		m_tag_delete(*m0, (struct m_tag *)ng_tag);
@@ -137,10 +138,10 @@ again:
 		struct dn_pkt_tag *dt;
 
 		dt = (struct dn_pkt_tag *)(dn_tag+1);
-		args.rule = dt->rule;
+		args.slot = dt->slot;
+		args.rulenum = dt->rulenum;
 		args.rule_id = dt->rule_id;
 		args.chain_id = dt->chain_id;
-
 		m_tag_delete(*m0, dn_tag);
 	}
 
@@ -148,7 +149,7 @@ again:
 	args.inp = inp;
 	tee = 0;
 
-	if (V_fw_one_pass == 0 || args.rule == NULL) {
+	if (V_fw_one_pass == 0 || args.slot == 0) {
 		ipfw = ipfw_chk(&args);
 		*m0 = args.m;
 	} else
@@ -201,7 +202,7 @@ again:
 			*m0 = NULL;
 			return 0;	/* packet consumed */
 		} else {
-			args.rule = NULL;
+			args.slot = 0;
 			goto again;	/* continue with packet */
 		}
 
@@ -258,7 +259,8 @@ ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
 	if (ng_tag != NULL) {
 		KASSERT(ng_tag->dir == NG_IPFW_OUT,
 		    ("ng_ipfw tag with wrong direction"));
-		args.rule = ng_tag->rule;
+		args.slot = ng_tag->slot;
+		args.rulenum = ng_tag->rulenum;
 		args.rule_id = ng_tag->rule_id;
 		args.chain_id = ng_tag->chain_id;
 		m_tag_delete(*m0, (struct m_tag *)ng_tag);
@@ -270,10 +272,10 @@ again:
 		struct dn_pkt_tag *dt;
 
 		dt = (struct dn_pkt_tag *)(dn_tag+1);
-		args.rule = dt->rule;
+		args.slot = dt->slot;
+		args.rulenum = dt->rulenum;
 		args.rule_id = dt->rule_id;
 		args.chain_id = dt->chain_id;
-
 		m_tag_delete(*m0, dn_tag);
 	}
 
@@ -282,7 +284,7 @@ again:
 	args.inp = inp;
 	tee = 0;
 
-	if (V_fw_one_pass == 0 || args.rule == NULL) {
+	if (V_fw_one_pass == 0 || args.slot == 0) {
 		ipfw = ipfw_chk(&args);
 		*m0 = args.m;
 	} else
@@ -341,7 +343,7 @@ again:
 			*m0 = NULL;
 			return 0;	/* packet consumed */
 		} else {
-			args.rule = NULL;
+			args.slot = 0;
 			goto again;	/* continue with packet */
 		}
 
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 9c67d20..da5db8e 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -79,7 +79,12 @@ struct ip_fw_args {
 	struct ifnet	*oif;		/* output interface		*/
 	struct sockaddr_in *next_hop;	/* forward address		*/
 
-	struct ip_fw	*rule;		/* matching rule		*/
+	/* chain_id validates 'slot', the location of the pointer to
+	 * a matching rule.
+	 * If invalid, we can lookup the rule using rule_id and rulenum
+	 */
+	uint32_t	slot;		/* slot for matching rule	*/
+	uint32_t	rulenum;	/* matching rule number		*/
 	uint32_t	rule_id;	/* matching rule id		*/
 	uint32_t	chain_id;	/* ruleset id			*/
 
@@ -178,9 +183,11 @@ struct ip_fw_chain {
 	struct ip_fw	*default_rule;
 	int		n_rules;	/* number of static rules */
 	int		static_len;	/* total len of static rules */
+	struct ip_fw    **map;	/* array of rule ptrs to ease lookup */
 	LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
 	struct radix_node_head *tables[IPFW_TABLES_MAX];
 	struct rwlock	rwmtx;
+	struct rwlock	uh_lock;	/* lock for upper half */
 	uint32_t	id;		/* ruleset id */
 };
 
@@ -191,9 +198,16 @@ struct sockopt;	/* used by tcp_var.h */
  * so the variable and the macros must be here.
  */
 
-#define	IPFW_LOCK_INIT(_chain) \
-	rw_init(&(_chain)->rwmtx, "IPFW static rules")
-#define	IPFW_LOCK_DESTROY(_chain)	rw_destroy(&(_chain)->rwmtx)
+#define	IPFW_LOCK_INIT(_chain) do {			\
+	rw_init(&(_chain)->rwmtx, "IPFW static rules");	\
+	rw_init(&(_chain)->uh_lock, "IPFW UH lock");	\
+	} while (0)
+
+#define	IPFW_LOCK_DESTROY(_chain) do {			\
+	rw_destroy(&(_chain)->rwmtx);			\
+	rw_destroy(&(_chain)->uh_lock);			\
+	} while (0)
+
 #define	IPFW_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
 
 #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
@@ -201,12 +215,17 @@ struct sockopt;	/* used by tcp_var.h */
 #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
 #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
 
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
 /* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
 int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
 int ipfw_ctl(struct sockopt *sopt);
 int ipfw_chk(struct ip_fw_args *args);
 void ipfw_reap_rules(struct ip_fw *head);
-void ipfw_free_chain(struct ip_fw_chain *chain, int kill_default);
 
 /* In ip_fw_table.c */
 struct radix_node;
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 49618da..061c3f9 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -1,5 +1,7 @@
 /*-
- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -26,9 +28,6 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define        DEB(x)
-#define        DDB(x) x
-
 /*
  * Sockopt support for ipfw. The routines here implement
  * the upper half of the ipfw code.
@@ -81,136 +80,143 @@ MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
  */
 
 /*
- * When a rule is added/deleted, clear the next_rule pointers in all rules.
- * These will be reconstructed on the fly as packets are matched.
+ * Find the smallest rule >= key, id.
+ * We could use bsearch but it is so simple that we code it directly
  */
-static void
-flush_rule_ptrs(struct ip_fw_chain *chain)
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
 {
-	struct ip_fw *rule;
+	int i, lo, hi;
+	struct ip_fw *r;
+
+  	for (lo = 0, hi = chain->n_rules - 1; lo < hi;) {
+		i = (lo + hi) / 2;
+		r = chain->map[i];
+		if (r->rulenum < key)
+			lo = i + 1;	/* continue from the next one */
+		else if (r->rulenum > key)
+			hi = i;		/* this might be good */
+		else if (r->id < id)
+			lo = i + 1;	/* continue from the next one */
+		else /* r->id >= id */
+			hi = i;		/* this might be good */
+	};
+	return hi;
+}
 
-	IPFW_WLOCK_ASSERT(chain);
+/*
+ * allocate a new map, returns the chain locked. extra is the number
+ * of entries to add or delete.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
 
-	chain->id++;
+	for (;;) {
+		struct ip_fw **map;
+		int i;
 
-	for (rule = chain->rules; rule; rule = rule->next)
-		rule->next_rule = NULL;
+		i = chain->n_rules + extra;
+		map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK);
+		if (map == NULL) {
+			printf("%s: cannot allocate map\n", __FUNCTION__);
+			return NULL;
+		}
+		if (!locked)
+			IPFW_UH_WLOCK(chain);
+		if (i >= chain->n_rules + extra) /* good */
+			return map;
+		/* otherwise we lost the race, free and retry */
+		if (!locked)
+			IPFW_UH_WUNLOCK(chain);
+		free(map, M_IPFW);
+	}
+}
+
+/*
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+	struct ip_fw **old_map;
+
+	IPFW_WLOCK(chain);
+	chain->id++;
+	chain->n_rules = new_len;
+	old_map = chain->map;
+	chain->map = new_map;
+	IPFW_WUNLOCK(chain);
+	return old_map;
 }
 
 /*
  * Add a new rule to the list. Copy the rule into a malloc'ed area, then
  * possibly create a rule number and add the rule to the list.
  * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
  */
 int
 ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 {
-	struct ip_fw *rule, *f, *prev;
-	int l = RULESIZE(input_rule);
+	struct ip_fw *rule;
+	int i, l, insert_before;
+	struct ip_fw **map;	/* the new array of pointers */
 
-	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
+	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
 		return (EINVAL);
 
-	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
+	l = RULESIZE(input_rule);
+	rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
 	if (rule == NULL)
 		return (ENOSPC);
+	/* get_map returns with IPFW_UH_WLOCK if successful */
+	map = get_map(chain, 1, 0 /* not locked */);
+	if (map == NULL) {
+		free(rule, M_IPFW);
+		return ENOSPC;
+	}
 
 	bcopy(input_rule, rule, l);
-
-	rule->next = NULL;
+	/* clear fields not settable from userland */
+	rule->x_next = NULL;
 	rule->next_rule = NULL;
-
 	rule->pcnt = 0;
 	rule->bcnt = 0;
 	rule->timestamp = 0;
 
-	IPFW_WLOCK(chain);
-
-	if (chain->rules == NULL) {	/* default rule */
-		chain->rules = rule;
-		rule->id = ++chain->id;
-		goto done;
-        }
-
-	/*
-	 * If rulenum is 0, find highest numbered rule before the
-	 * default rule, and add autoinc_step
-	 */
 	if (V_autoinc_step < 1)
 		V_autoinc_step = 1;
 	else if (V_autoinc_step > 1000)
 		V_autoinc_step = 1000;
+	/* find the insertion point, we will insert before */
+	insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+	i = ipfw_find_rule(chain, insert_before, 0);
+	/* duplicate first part */
+	if (i > 0)
+		bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+	map[i] = rule;
+	/* duplicate remaining part, we always have the default rule */
+	bcopy(chain->map + i, map + i + 1,
+		sizeof(struct ip_fw *) *(chain->n_rules - i));
 	if (rule->rulenum == 0) {
-		/*
-		 * locate the highest numbered rule before default
-		 */
-		for (f = chain->rules; f; f = f->next) {
-			if (f->rulenum == IPFW_DEFAULT_RULE)
-				break;
-			rule->rulenum = f->rulenum;
-		}
+		/* write back the number */
+		rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
 		if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
 			rule->rulenum += V_autoinc_step;
 		input_rule->rulenum = rule->rulenum;
 	}
 
-	/*
-	 * Now insert the new rule in the right place in the sorted list.
-	 */
-	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
-		if (f->rulenum > rule->rulenum) { /* found the location */
-			if (prev) {
-				rule->next = f;
-				prev->next = rule;
-			} else { /* head insert */
-				rule->next = chain->rules;
-				chain->rules = rule;
-			}
-			break;
-		}
-	}
-	flush_rule_ptrs(chain);
-	/* chain->id incremented inside flush_rule_ptrs() */
-	rule->id = chain->id;
-done:
-	chain->n_rules++;
+	rule->id = chain->id + 1;
+	map = swap_map(chain, map, chain->n_rules + 1);
 	chain->static_len += l;
-	IPFW_WUNLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+	if (map)
+		free(map, M_IPFW);
 	return (0);
 }
 
-/**
- * Remove a static rule (including derived * dynamic rules)
- * and place it on the ``reap list'' for later reclamation.
- * The caller is in charge of clearing rule pointers to avoid
- * dangling pointers.
- * @return a pointer to the next entry.
- * Arguments are not checked, so they better be correct.
- */
-static struct ip_fw *
-remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
-    struct ip_fw *prev)
-{
-	struct ip_fw *n;
-	int l = RULESIZE(rule);
-
-	IPFW_WLOCK_ASSERT(chain);
-
-	n = rule->next;
-	ipfw_remove_dyn_children(rule);
-	if (prev == NULL)
-		chain->rules = n;
-	else
-		prev->next = n;
-	chain->n_rules--;
-	chain->static_len -= l;
-
-	rule->next = chain->reap;
-	chain->reap = rule;
-
-	return n;
-}
-
 /*
  * Reclaim storage associated with a list of rules.  This is
  * typically the list created using remove_rule.
@@ -222,34 +228,11 @@ ipfw_reap_rules(struct ip_fw *head)
 	struct ip_fw *rule;
 
 	while ((rule = head) != NULL) {
-		head = head->next;
+		head = head->x_next;
 		free(rule, M_IPFW);
 	}
 }
 
-/*
- * Remove all rules from a chain (except rules in set RESVD_SET
- * unless kill_default = 1).  The caller is responsible for
- * reclaiming storage for the rules left in chain->reap.
- */
-void
-ipfw_free_chain(struct ip_fw_chain *chain, int kill_default)
-{
-	struct ip_fw *prev, *rule;
-
-	IPFW_WLOCK_ASSERT(chain);
-
-	chain->reap = NULL;
-	flush_rule_ptrs(chain); /* more efficient to do outside the loop */
-	for (prev = NULL, rule = chain->rules; rule ; )
-		if (kill_default || rule->set != RESVD_SET)
-			rule = remove_rule(chain, rule, prev);
-		else {
-			prev = rule;
-			rule = rule->next;
-		}
-}
-
 /**
  * Remove all rules with given number, and also do set manipulation.
  * Assumes chain != NULL && *chain != NULL.
@@ -267,9 +250,12 @@ ipfw_free_chain(struct ip_fw_chain *chain, int kill_default)
 static int
 del_entry(struct ip_fw_chain *chain, u_int32_t arg)
 {
-	struct ip_fw *prev = NULL, *rule;
-	u_int16_t rulenum;	/* rule or old_set */
-	u_int8_t cmd, new_set;
+	struct ip_fw *rule;
+	uint32_t rulenum;	/* rule or old_set */
+	uint8_t cmd, new_set;
+	int start, end = 0, i, ofs, n;
+	struct ip_fw **map = NULL;
+	int error = 0;
 
 	rulenum = arg & 0xffff;
 	cmd = (arg >> 24) & 0xff;
@@ -285,96 +271,122 @@ del_entry(struct ip_fw_chain *chain, u_int32_t arg)
 			return EINVAL;
 	}
 
-	IPFW_WLOCK(chain);
-	rule = chain->rules;	/* common starting point */
+	IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */
 	chain->reap = NULL;	/* prepare for deletions */
+
 	switch (cmd) {
-	case 0:	/* delete rules with given number */
-		/*
-		 * locate first rule to delete
+	case 0:	/* delete rules with given number (0 is special means all) */
+	case 1:	/* delete all rules with given set number, rule->set == rulenum */
+	case 5: /* delete rules with given number and with given set number.
+		 * rulenum - given rule number;
+		 * new_set - given set number.
 		 */
-		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
-			;
-		if (rule->rulenum != rulenum) {
-			IPFW_WUNLOCK(chain);
-			return EINVAL;
-		}
-
-		/*
-		 * flush pointers outside the loop, then delete all matching
-		 * rules. prev remains the same throughout the cycle.
+		/* locate first rule to delete (start), the one after the
+		 * last one (end), and count how many rules to delete (n)
 		 */
-		flush_rule_ptrs(chain);
-		while (rule->rulenum == rulenum)
-			rule = remove_rule(chain, rule, prev);
+		n = 0;
+		if (cmd == 1) { /* look for a specific set, must scan all */
+			for (start = -1, i = 0; i < chain->n_rules; i++) {
+				if (chain->map[start]->set != rulenum)
+					continue;
+				if (start < 0)
+					start = i;
+				end = i;
+				n++;
+			}
+			end++;	/* first non-matching */
+		} else {
+			start = ipfw_find_rule(chain, rulenum, 0);
+			for (end = start; end < chain->n_rules; end++) {
+				rule = chain->map[end];
+				if (rulenum > 0 && rule->rulenum != rulenum)
+					break;
+				if (rule->set != RESVD_SET &&
+				    (cmd == 0 || rule->set == new_set) )
+					n++;
+			}
+		}
+		/* allocate the map, if needed */
+		if (n > 0)
+			map = get_map(chain, -n, 1 /* locked */);
+		if (n == 0 || map == NULL) {
+			error = EINVAL;
 		break;
-
-	case 1:	/* delete all rules with given set number */
-		flush_rule_ptrs(chain);
-		while (rule->rulenum < IPFW_DEFAULT_RULE) {
-			if (rule->set == rulenum)
-				rule = remove_rule(chain, rule, prev);
-			else {
-				prev = rule;
-				rule = rule->next;
+		}
+		/* copy the initial part of the map */
+		if (start > 0)
+			bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+		/* copy active rules between start and end */
+		for (i = ofs = start; i < end; i++) {
+			rule = chain->map[i];
+			if (!(rule->set != RESVD_SET &&
+			    (cmd == 0 || rule->set == new_set) ))
+				map[ofs++] = chain->map[i];
+		}
+		/* finally the tail */
+		bcopy(chain->map + end, map + ofs,
+			(chain->n_rules - end) * sizeof(struct ip_fw *));
+		map = swap_map(chain, map, chain->n_rules - n);
+		/* now remove the rules deleted */
+		for (i = start; i < end; i++) {
+			rule = map[i];
+			if (rule->set != RESVD_SET &&
+			    (cmd == 0 || rule->set == new_set) ) {
+				int l = RULESIZE(rule);
+
+				chain->static_len -= l;
+				ipfw_remove_dyn_children(rule);
+				rule->x_next = chain->reap;
+				chain->reap = rule;
 			}
 		}
 		break;
 
 	case 2:	/* move rules with given number to new set */
-		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+		IPFW_UH_WLOCK(chain);
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
 			if (rule->rulenum == rulenum)
 				rule->set = new_set;
+		}
+		IPFW_UH_WUNLOCK(chain);
 		break;
 
 	case 3: /* move rules with given set number to new set */
-		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+		IPFW_UH_WLOCK(chain);
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
 			if (rule->set == rulenum)
 				rule->set = new_set;
+		}
+		IPFW_UH_WUNLOCK(chain);
 		break;
 
 	case 4: /* swap two sets */
-		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+		IPFW_UH_WLOCK(chain);
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
 			if (rule->set == rulenum)
 				rule->set = new_set;
 			else if (rule->set == new_set)
 				rule->set = rulenum;
-		break;
-
-	case 5: /* delete rules with given number and with given set number.
-		 * rulenum - given rule number;
-		 * new_set - given set number.
-		 */
-		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
-			;
-		if (rule->rulenum != rulenum) {
-			IPFW_WUNLOCK(chain);
-			return (EINVAL);
-		}
-		flush_rule_ptrs(chain);
-		while (rule->rulenum == rulenum) {
-			if (rule->set == new_set)
-				rule = remove_rule(chain, rule, prev);
-			else {
-				prev = rule;
-				rule = rule->next;
-			}
 		}
+		IPFW_UH_WUNLOCK(chain);
+		break;
 	}
-	/*
-	 * Look for rules to reclaim.  We grab the list before
-	 * releasing the lock then reclaim them w/o the lock to
-	 * avoid a LOR with dummynet.
-	 */
 	rule = chain->reap;
-	IPFW_WUNLOCK(chain);
+	chain->reap = NULL;
+	IPFW_UH_WUNLOCK(chain);
 	ipfw_reap_rules(rule);
-	return 0;
+	if (map)
+		free(map, M_IPFW);
+	return error;
 }
 
 /*
  * Clear counters for a specific rule.
- * The enclosing "table" is assumed locked.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
  */
 static void
 clear_counters(struct ip_fw *rule, int log_only)
@@ -403,6 +415,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 {
 	struct ip_fw *rule;
 	char *msg;
+	int i;
 
 	uint16_t rulenum = arg & 0xffff;
 	uint8_t set = (arg >> 16) & 0xff;
@@ -413,11 +426,12 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 	if (cmd == 1 && set > RESVD_SET)
 		return (EINVAL);
 
-	IPFW_WLOCK(chain);
+	IPFW_UH_RLOCK(chain);
 	if (rulenum == 0) {
 		V_norule_counter = 0;
-		for (rule = chain->rules; rule; rule = rule->next) {
-			/* Skip rules from another set. */
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			/* Skip rules not in our set. */
 			if (cmd == 1 && rule->set != set)
 				continue;
 			clear_counters(rule, log_only);
@@ -426,18 +440,14 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 		    "Accounting cleared";
 	} else {
 		int cleared = 0;
-		/*
-		 * We can have multiple rules with the same number, so we
-		 * need to clear them all.
-		 */
-		for (rule = chain->rules; rule; rule = rule->next)
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
 			if (rule->rulenum == rulenum) {
-				while (rule && rule->rulenum == rulenum) {
 					if (cmd == 0 || rule->set == set)
 						clear_counters(rule, log_only);
-					rule = rule->next;
-				}
 				cleared = 1;
+			}
+			if (rule->rulenum > rulenum)
 				break;
 			}
 		if (!cleared) {	/* we did not find any matching rules */
@@ -446,7 +456,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 		}
 		msg = log_only ? "logging count reset" : "cleared";
 	}
-	IPFW_WUNLOCK(chain);
+	IPFW_UH_RUNLOCK(chain);
 
 	if (V_fw_verbose) {
 		int lev = LOG_SECURITY | LOG_NOTICE;
@@ -497,7 +507,6 @@ check_ipfw_struct(struct ip_fw *rule, int size)
 			    cmd->opcode);
 			return EINVAL;
 		}
-		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
 		switch (cmd->opcode) {
 		case O_PROBE_STATE:
 		case O_KEEP_STATE:
@@ -771,43 +780,37 @@ bad_size:
 /*
  * Copy the static and dynamic rules to the supplied buffer
  * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK
  */
 static size_t
 ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 {
 	char *bp = buf;
 	char *ep = bp + space;
-	struct ip_fw *rule;
-	int i;
+	struct ip_fw *rule, *dst;
+	int l, i;
 	time_t	boot_seconds;
 
         boot_seconds = boottime.tv_sec;
-	/* XXX this can take a long time and locking will block packet flow */
-	IPFW_RLOCK(chain);
-	for (rule = chain->rules; rule ; rule = rule->next) {
-		/*
-		 * Verify the entry fits in the buffer in case the
-		 * rules changed between calculating buffer space and
-		 * now.  This would be better done using a generation
-		 * number but should suffice for now.
-		 */
-		i = RULESIZE(rule);
-		if (bp + i <= ep) {
-			bcopy(rule, bp, i);
+	for (i = 0; i < chain->n_rules; i++) {
+		rule = chain->map[i];
+		l = RULESIZE(rule);
+		if (bp + l > ep) { /* should not happen */
+			printf("overflow dumping static rules\n");
+			break;
+		}
+		dst = (struct ip_fw *)bp;
+		bcopy(rule, dst, l);
 			/*
 			 * XXX HACK. Store the disable mask in the "next"
 			 * pointer in a wild attempt to keep the ABI the same.
 			 * Why do we do this on EVERY rule?
 			 */
-			bcopy(&V_set_disable,
-			    &(((struct ip_fw *)bp)->next_rule),
-			    sizeof(V_set_disable));
-			if (((struct ip_fw *)bp)->timestamp)
-				((struct ip_fw *)bp)->timestamp += boot_seconds;
-			bp += i;
+		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+		if (dst->timestamp)
+			dst->timestamp += boot_seconds;
+		bp += l;
 		}
-	}
-	IPFW_RUNLOCK(chain);
 	ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
 	return (bp - (char *)buf);
 }
@@ -857,41 +860,33 @@ ipfw_ctl(struct sockopt *sopt)
 		 * change between calculating the size and returning the
 		 * data in which case we'll just return what fits.
 		 */
-		size = chain->static_len;	/* size of static rules */
-		size += ipfw_dyn_len();
+		for (;;) {
+			int len = 0, want;
 
+			size = chain->static_len;
+			size += ipfw_dyn_len();
 		if (size >= sopt->sopt_valsize)
 			break;
-		/*
-		 * XXX todo: if the user passes a short length just to know
-		 * how much room is needed, do not bother filling up the
-		 * buffer, just jump to the sooptcopyout.
-		 */
 		buf = malloc(size, M_TEMP, M_WAITOK);
-		error = sooptcopyout(sopt, buf,
-				ipfw_getrules(chain, buf, size));
+			if (buf == NULL)
+				break;
+			IPFW_UH_RLOCK(chain);
+			/* check again how much space we need */
+			want = chain->static_len + ipfw_dyn_len();
+			if (size >= want)
+				len = ipfw_getrules(chain, buf, size);
+			IPFW_UH_RUNLOCK(chain);
+			if (size >= want)
+				error = sooptcopyout(sopt, buf, len);
 		free(buf, M_TEMP);
+			if (size >= want)
+				break;
+		}
 		break;
 
 	case IP_FW_FLUSH:
-		/*
-		 * Normally we cannot release the lock on each iteration.
-		 * We could do it here only because we start from the head all
-		 * the times so there is no risk of missing some entries.
-		 * On the other hand, the risk is that we end up with
-		 * a very inconsistent ruleset, so better keep the lock
-		 * around the whole cycle.
-		 *
-		 * XXX this code can be improved by resetting the head of
-		 * the list to point to the default rule, and then freeing
-		 * the old list without the need for a lock.
-		 */
-
-		IPFW_WLOCK(chain);
-		ipfw_free_chain(chain, 0 /* keep default rule */);
-		rule = chain->reap;
-		IPFW_WUNLOCK(chain);
-		ipfw_reap_rules(rule);
+		/* locking is done within del_entry() */
+		error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
 		break;
 
 	case IP_FW_ADD:
@@ -901,6 +896,7 @@ ipfw_ctl(struct sockopt *sopt)
 		if (error == 0)
 			error = check_ipfw_struct(rule, sopt->sopt_valsize);
 		if (error == 0) {
+			/* locking is done within ipfw_add_rule() */
 			error = ipfw_add_rule(chain, rule);
 			size = RULESIZE(rule);
 			if (!error && sopt->sopt_dir == SOPT_GET)
@@ -931,9 +927,11 @@ ipfw_ctl(struct sockopt *sopt)
 			/* delete or reassign, locking done in del_entry() */
 			error = del_entry(chain, rulenum[0]);
 		} else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+			IPFW_UH_WLOCK(chain);
 			V_set_disable =
 			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
 			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+			IPFW_UH_WUNLOCK(chain);
 		} else
 			error = EINVAL;
 		break;
author	luigi <luigi@FreeBSD.org>	2009-12-22 19:01:47 +0000
committer	luigi <luigi@FreeBSD.org>	2009-12-22 19:01:47 +0000
commit	2043aec45642350832212819a4c7cf828ba2dffa (patch)
tree	212a476e3a032fe33a57f72f211a5e84e676bbd4
parent	69eb1537ebfe9e513651b7095899759cee1b96aa (diff)
download	FreeBSD-src-2043aec45642350832212819a4c7cf828ba2dffa.zip FreeBSD-src-2043aec45642350832212819a4c7cf828ba2dffa.tar.gz