summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorqingli <qingli@FreeBSD.org>2010-03-09 01:11:45 +0000
committerqingli <qingli@FreeBSD.org>2010-03-09 01:11:45 +0000
commit93013817b06170ed5ed76ef83e4bc75efe85b100 (patch)
treefea4c8dfdec74a7dea073968c9742940e2408c8d
parentfe5f1f57b8fcac02898f1fa59b4c49c028009495 (diff)
downloadFreeBSD-src-93013817b06170ed5ed76ef83e4bc75efe85b100.zip
FreeBSD-src-93013817b06170ed5ed76ef83e4bc75efe85b100.tar.gz
One of the advantages of enabling ECMP (a.k.a. RADIX_MPATH) is to
allow for connection load balancing across interfaces. Currently the address alias handling method collides with the ECMP code. For example, when two interfaces are configured on the same prefix, only one prefix route is installed, so connection load balancing among the available interfaces is not possible. The other advantage of ECMP is failover. The issue with the current code is that the interface link state is not reflected in the route entry. For example, if there are two interfaces on the same prefix and the cable on one interface is unplugged, new and existing connections should switch over to the other interface. This is not done today, and packets go into a black hole. Also, there is a small bug in the kernel where deleting ECMP routes from userland always returns an error even though the command is successfully executed. MFC after: 5 days
-rw-r--r--sys/net/flowtable.c3
-rw-r--r--sys/net/radix.c2
-rw-r--r--sys/net/radix_mpath.c3
-rw-r--r--sys/net/route.c38
-rw-r--r--sys/net/route.h2
-rw-r--r--sys/netinet/in.c8
-rw-r--r--sys/netinet/ip_output.c6
7 files changed, 51 insertions, 11 deletions
diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c
index ab42e68..b7ec578 100644
--- a/sys/net/flowtable.c
+++ b/sys/net/flowtable.c
@@ -472,7 +472,8 @@ flow_stale(struct flowtable *ft, struct flentry *fle)
|| ((fle->f_rt->rt_flags & RTF_HOST) &&
((fle->f_rt->rt_flags & (RTF_UP))
!= (RTF_UP)))
- || (fle->f_rt->rt_ifp == NULL))
+ || (fle->f_rt->rt_ifp == NULL)
+ || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
return (1);
idle_time = time_uptime - fle->f_uptime;
diff --git a/sys/net/radix.c b/sys/net/radix.c
index 9f2383d..33fcf82 100644
--- a/sys/net/radix.c
+++ b/sys/net/radix.c
@@ -761,8 +761,10 @@ on2:
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
if (tt->rn_flags & RNF_NORMAL) {
+#if !defined(RADIX_MPATH)
log(LOG_ERR,
"Non-unique normal route, mask not entered\n");
+#endif
return tt;
}
} else
diff --git a/sys/net/radix_mpath.c b/sys/net/radix_mpath.c
index 9be01d2..ea84e5c 100644
--- a/sys/net/radix_mpath.c
+++ b/sys/net/radix_mpath.c
@@ -270,7 +270,8 @@ rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
* XXX we don't attempt to lookup cached route again; what should
* be done for sendto(3) case?
*/
- if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
+ if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP)
+ && RT_LINK_IS_UP(ro->ro_rt->rt_ifp))
return;
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum);
diff --git a/sys/net/route.c b/sys/net/route.c
index a938c9c..e500ed1 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -830,7 +830,13 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
int
rtexpunge(struct rtentry *rt)
{
+#if !defined(RADIX_MPATH)
struct radix_node *rn;
+#else
+ struct rt_addrinfo info;
+ int fib;
+ struct rtentry *rt0;
+#endif
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
@@ -843,14 +849,26 @@ rtexpunge(struct rtentry *rt)
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
-#if 0
- /*
- * We cannot assume anything about the reference count
- * because protocols call us in many situations; often
- * before unwinding references to the table entry.
- */
- KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt));
-#endif
+
+#ifdef RADIX_MPATH
+ fib = rt->rt_fibnum;
+ bzero(&info, sizeof(info));
+ info.rti_ifp = rt->rt_ifp;
+ info.rti_flags = RTF_RNH_LOCKED;
+ info.rti_info[RTAX_DST] = rt_key(rt);
+ info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
+
+ RT_UNLOCK(rt);
+ error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
+
+ if (error == 0 && rt0 != NULL) {
+ rt = rt0;
+ RT_LOCK(rt);
+ } else if (error != 0) {
+ RT_LOCK(rt);
+ return (error);
+ }
+#else
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
@@ -864,6 +882,7 @@ rtexpunge(struct rtentry *rt)
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
+#endif /* RADIX_MPATH */
rt->rt_flags &= ~RTF_UP;
@@ -886,7 +905,9 @@ rtexpunge(struct rtentry *rt)
* linked to the routing table.
*/
V_rttrash++;
+#if !defined(RADIX_MPATH)
bad:
+#endif
return (error);
}
@@ -1044,6 +1065,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
*/
if (error != ENOENT)
goto bad;
+ error = 0;
}
#endif
/*
diff --git a/sys/net/route.h b/sys/net/route.h
index a8ae867..b337f32 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -319,6 +319,8 @@ struct rt_addrinfo {
#ifdef _KERNEL
+#define RT_LINK_IS_UP(ifp) ((ifp)->if_link_state == LINK_STATE_UP)
+
#define RT_LOCK_INIT(_rt) \
mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK)
#define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx)
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
index b1c51a2..319ec95 100644
--- a/sys/netinet/in.c
+++ b/sys/netinet/in.c
@@ -34,6 +34,7 @@
__FBSDID("$FreeBSD$");
#include "opt_carp.h"
+#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -1040,6 +1041,13 @@ in_addprefix(struct in_ifaddr *target, int flags)
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
+#ifdef RADIX_MPATH
+ if (ia->ia_addr.sin_addr.s_addr ==
+ target->ia_addr.sin_addr.s_addr)
+ return (EEXIST);
+ else
+ break;
+#endif
if (V_sameprefixcarponly &&
target->ia_ifp->if_type != IFT_CARP &&
ia->ia_ifp->if_type != IFT_CARP) {
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index d6f361d..e238e41 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -199,6 +199,8 @@ again:
*/
rte = ro->ro_rt;
if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
+ rte->rt_ifp == NULL ||
+ !RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
if (!nortfree)
@@ -270,7 +272,9 @@ again:
#endif
rte = ro->ro_rt;
}
- if (rte == NULL) {
+ if (rte == NULL ||
+ rte->rt_ifp == NULL ||
+ !RT_LINK_IS_UP(rte->rt_ifp)) {
#ifdef IPSEC
/*
* There is no route for this packet, but it is
OpenPOWER on IntegriCloud