summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author kmacy <kmacy@FreeBSD.org> 2009-04-14 23:05:36 +0000
committer kmacy <kmacy@FreeBSD.org> 2009-04-14 23:05:36 +0000
commit 8149bfaed6bb48e24c81b4fe830665f412bc7e32 (patch)
tree 2c468e4e81b86a0a66401b94c6626251cbbccf6a
parent 7500c86f06626d19d1c6296cfdb9f781600f1bf0 (diff)
download FreeBSD-src-8149bfaed6bb48e24c81b4fe830665f412bc7e32.zip
FreeBSD-src-8149bfaed6bb48e24c81b4fe830665f412bc7e32.tar.gz
Extend route command:
- add show as alias for get
- add weights to allow mpath to do more than equal cost
- add sticky / nostick to disable / re-enable per-connection load balancing

This adds a field to rt_metrics_lite so network bits of world will need to be re-built.

Reviewed by: jeli & qingli
-rw-r--r-- UPDATING 7
-rw-r--r-- sbin/route/keywords 4
-rw-r--r-- sbin/route/route.c 31
-rw-r--r-- sys/net/radix_mpath.c 43
-rw-r--r-- sys/net/route.c 161
-rw-r--r-- sys/net/route.h 16
-rw-r--r-- sys/net/rtsock.c 14
-rw-r--r-- sys/sys/param.h 2
8 files changed, 180 insertions, 98 deletions
diff --git a/UPDATING b/UPDATING
index e5110f0..aef112d 100644
--- a/UPDATING
+++ b/UPDATING
@@ -22,6 +22,13 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW:
to maximize performance. (To disable malloc debugging, run
ln -s aj /etc/malloc.conf.)
+20090414:
+ The size of rt_metrics_lite and by extension rtentry has changed.
+ Networking administration apps will need to be recompiled.
+ The route command now supports show as an alias for get, weighting
+ of routes, sticky and nostick flags to alter the behavior of stateful
+ load balancing.
+ Bump __FreeBSD_version to 800078.
20090408:
Do not use Giant for kbdmux(4) locking. This is wrong and
apparently causing more problems than it solves. This will
diff --git a/sbin/route/keywords b/sbin/route/keywords
index 130fcd1..8817f30 100644
--- a/sbin/route/keywords
+++ b/sbin/route/keywords
@@ -33,6 +33,7 @@ mtu
net
netmask
nostatic
+nostick
osi
prefixlen
proto1
@@ -44,8 +45,11 @@ rtt
rttvar
sa
sendpipe
+show
ssthresh
static
+sticky
+weight
x25
xns
xresolve
diff --git a/sbin/route/route.c b/sbin/route/route.c
index 1b0985e..e20cedb 100644
--- a/sbin/route/route.c
+++ b/sbin/route/route.c
@@ -169,6 +169,7 @@ main(argc, argv)
if (*argv)
switch (keyword(*argv)) {
case K_GET:
+ case K_SHOW:
uid = 0;
/* FALLTHROUGH */
@@ -548,6 +549,7 @@ set_metric(value, key)
caseof(K_SSTHRESH, RTV_SSTHRESH, rmx_ssthresh);
caseof(K_RTT, RTV_RTT, rmx_rtt);
caseof(K_RTTVAR, RTV_RTTVAR, rmx_rttvar);
+ caseof(K_WEIGHT, RTV_WEIGHT, rmx_weight);
}
rtm_inits |= flag;
if (lockrest || locking)
@@ -571,8 +573,9 @@ newroute(argc, argv)
errx(EX_NOPERM, "must be root to alter routing table");
}
cmd = argv[0];
- if (*cmd != 'g')
+ if (*cmd != 'g' && *cmd != 's')
shutdown(s, SHUT_RD); /* Don't want to read back our messages */
+
while (--argc > 0) {
if (**(++argv)== '-') {
switch (key = keyword(1 + *argv)) {
@@ -635,6 +638,12 @@ newroute(argc, argv)
case K_STATIC:
flags |= RTF_STATIC;
break;
+ case K_STICKY:
+ flags |= RTF_STICKY;
+ break;
+ case K_NOSTICK:
+ flags &= ~RTF_STICKY;
+ break;
case K_IFA:
if (!--argc)
usage((char *)NULL);
@@ -688,6 +697,7 @@ newroute(argc, argv)
case K_SSTHRESH:
case K_RTT:
case K_RTTVAR:
+ case K_WEIGHT:
if (!--argc)
usage((char *)NULL);
set_metric(*++argv, key);
@@ -741,7 +751,7 @@ newroute(argc, argv)
} else
break;
}
- if (*cmd == 'g')
+ if (*cmd == 'g' || *cmd == 's')
exit(ret != 0);
if (!qflag) {
oerrno = errno;
@@ -1193,7 +1203,7 @@ rtmsg(cmd, flags)
cmd = RTM_ADD;
else if (cmd == 'c')
cmd = RTM_CHANGE;
- else if (cmd == 'g') {
+ else if (cmd == 'g' || cmd == 's') {
cmd = RTM_GET;
if (so_ifp.sa.sa_family == 0) {
so_ifp.sa.sa_family = AF_LINK;
@@ -1297,13 +1307,13 @@ char *msgtypes[] = {
};
char metricnames[] =
-"\011pksent\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire\2hopcount"
+"\011weight\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire"
"\1mtu";
char routeflags[] =
-"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT"
-"\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE\016b016"
-"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3\024CHAINDELETE"
-"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST";
+"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE"
+"\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE"
+"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3"
+"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST\035STICKY";
char ifnetflags[] =
"\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6b6\7RUNNING\010NOARP"
"\011PPROMISC\012ALLMULTI\013OACTIVE\014SIMPLEX\015LINK0\016LINK1"
@@ -1466,14 +1476,13 @@ print_getmsg(rtm, msglen)
#define msec(u) (((u) + 500) / 1000) /* usec to msec */
(void) printf("\n%s\n", "\
- recvpipe sendpipe ssthresh rtt,msec rttvar hopcount mtu expire");
+ recvpipe sendpipe ssthresh rtt,msec mtu weight expire");
printf("%8ld%c ", rtm->rtm_rmx.rmx_recvpipe, lock(RPIPE));
printf("%8ld%c ", rtm->rtm_rmx.rmx_sendpipe, lock(SPIPE));
printf("%8ld%c ", rtm->rtm_rmx.rmx_ssthresh, lock(SSTHRESH));
printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rtt), lock(RTT));
- printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rttvar), lock(RTTVAR));
- printf("%8ld%c ", rtm->rtm_rmx.rmx_hopcount, lock(HOPCOUNT));
printf("%8ld%c ", rtm->rtm_rmx.rmx_mtu, lock(MTU));
+ printf("%8ld%c ", rtm->rtm_rmx.rmx_weight, lock(WEIGHT));
if (rtm->rtm_rmx.rmx_expire)
rtm->rtm_rmx.rmx_expire -= time(0);
printf("%8ld%c\n", rtm->rtm_rmx.rmx_expire, lock(EXPIRE));
diff --git a/sys/net/radix_mpath.c b/sys/net/radix_mpath.c
index 8d94d01..9be01d2 100644
--- a/sys/net/radix_mpath.c
+++ b/sys/net/radix_mpath.c
@@ -77,15 +77,18 @@ rn_mpath_next(struct radix_node *rn)
return NULL;
}
-u_int32_t
+uint32_t
rn_mpath_count(struct radix_node *rn)
{
- u_int32_t i;
-
- i = 1;
- while ((rn = rn_mpath_next(rn)) != NULL)
- i++;
- return i;
+ uint32_t i = 0;
+ struct rtentry *rt;
+
+ while (rn != NULL) {
+ rt = (struct rtentry *)rn;
+ i += rt->rt_rmx.rmx_weight;
+ rn = rn_mpath_next(rn);
+ }
+ return (i);
}
struct rtentry *
@@ -256,10 +259,12 @@ different:
}
void
-rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
+rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
{
struct radix_node *rn0, *rn;
u_int32_t n;
+ struct rtentry *rt;
+ int64_t weight;
/*
* XXX we don't attempt to lookup cached route again; what should
@@ -284,25 +289,31 @@ rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
hash += hashjitter;
hash %= n;
- while (hash-- > 0 && rn) {
+ for (weight = abs((int32_t)hash), rt = ro->ro_rt;
+ weight >= rt->rt_rmx.rmx_weight && rn;
+ weight -= rt->rt_rmx.rmx_weight) {
+
/* stay within the multipath routes */
if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
break;
rn = rn->rn_dupedkey;
+ rt = (struct rtentry *)rn;
}
-
/* XXX try filling rt_gwroute and avoid unreachable gw */
- /* if gw selection fails, use the first match (default) */
+ /* gw selection has failed - there must be only zero weight routes */
if (!rn) {
RT_UNLOCK(ro->ro_rt);
+ ro->ro_rt = NULL;
return;
}
-
- RTFREE_LOCKED(ro->ro_rt);
- ro->ro_rt = (struct rtentry *)rn;
- RT_LOCK(ro->ro_rt);
- RT_ADDREF(ro->ro_rt);
+ if (ro->ro_rt != rt) {
+ RTFREE_LOCKED(ro->ro_rt);
+ ro->ro_rt = (struct rtentry *)rn;
+ RT_LOCK(ro->ro_rt);
+ RT_ADDREF(ro->ro_rt);
+
+ }
RT_UNLOCK(ro->ro_rt);
}
diff --git a/sys/net/route.c b/sys/net/route.c
index f1e13ad..5294975 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -826,6 +826,103 @@ bad:
return (error);
}
+#ifdef RADIX_MPATH
+static int
+rn_mpath_update(int req, struct rt_addrinfo *info,
+ struct radix_node_head *rnh, struct rtentry **ret_nrt)
+{
+ /*
+ * if we got multipath routes, we require users to specify
+ * a matching RTAX_GATEWAY.
+ */
+ struct rtentry *rt, *rto = NULL;
+ register struct radix_node *rn;
+ int error = 0;
+
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn == NULL)
+ return (ESRCH);
+ rto = rt = RNTORT(rn);
+ rt = rt_mpath_matchgate(rt, gateway);
+ if (rt == NULL)
+ return (ESRCH);
+ /*
+ * this is the first entry in the chain
+ */
+ if (rto == rt) {
+ rn = rn_mpath_next((struct radix_node *)rt);
+ /*
+ * there is another entry, now it's active
+ */
+ if (rn) {
+ rto = RNTORT(rn);
+ RT_LOCK(rto);
+ rto->rt_flags |= RTF_UP;
+ RT_UNLOCK(rto);
+ } else if (rt->rt_flags & RTF_GATEWAY) {
+ /*
+ * For gateway routes, we need to
+ * make sure that we are deleting
+ * the correct gateway.
+ * rt_mpath_matchgate() does not
+ * check the case when there is only
+ * one route in the chain.
+ */
+ if (gateway &&
+ (rt->rt_gateway->sa_len != gateway->sa_len ||
+ memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
+ error = ESRCH;
+ goto done;
+ }
+ /*
+ * use the normal delete code to remove
+ * the first entry
+ */
+ if (req != RTM_DELETE)
+ goto nondelete;
+
+ error = ENOENT;
+ goto done;
+ }
+
+ /*
+ * if the entry is 2nd and on up
+ */
+ if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
+ panic ("rtrequest1: rt_mpath_deldup");
+ RT_LOCK(rt);
+ RT_ADDREF(rt);
+ if (req == RTM_DELETE) {
+ rt->rt_flags &= ~RTF_UP;
+ /*
+ * One more rtentry floating around that is not
+ * linked to the routing table. rttrash will be decremented
+ * when RTFREE(rt) is eventually called.
+ */
+ V_rttrash++;
+
+ }
+
+nondelete:
+ if (req != RTM_DELETE)
+ panic("unrecognized request %d", req);
+
+
+ /*
+ * If the caller wants it, then it can have it,
+ * but it's up to it to free the rtentry as we won't be
+ * doing it.
+ */
+ if (ret_nrt) {
+ *ret_nrt = rt;
+ RT_UNLOCK(rt);
+ } else
+ RTFREE_LOCKED(rt);
+done:
+ return (error);
+}
+#endif
+
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
@@ -864,65 +961,15 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
- /*
- * if we got multipath routes, we require users to specify
- * a matching RTAX_GATEWAY.
- */
if (rn_mpath_capable(rnh)) {
- struct rtentry *rto = NULL;
-
- rn = rnh->rnh_matchaddr(dst, rnh);
- if (rn == NULL)
- senderr(ESRCH);
- rto = rt = RNTORT(rn);
- rt = rt_mpath_matchgate(rt, gateway);
- if (!rt)
- senderr(ESRCH);
- /*
- * this is the first entry in the chain
- */
- if (rto == rt) {
- rn = rn_mpath_next((struct radix_node *)rt);
- /*
- * there is another entry, now it's active
- */
- if (rn) {
- rto = RNTORT(rn);
- RT_LOCK(rto);
- rto->rt_flags |= RTF_UP;
- RT_UNLOCK(rto);
- } else if (rt->rt_flags & RTF_GATEWAY) {
- /*
- * For gateway routes, we need to
- * make sure that we we are deleting
- * the correct gateway.
- * rt_mpath_matchgate() does not
- * check the case when there is only
- * one route in the chain.
- */
- if (gateway &&
- (rt->rt_gateway->sa_len != gateway->sa_len ||
- memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
- senderr(ESRCH);
- }
- /*
- * use the normal delete code to remove
- * the first entry
- */
- goto normal_rtdel;
- }
+ error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
- * if the entry is 2nd and on up
+ * "bad" holds true for the success case
+ * as well
*/
- if (!rt_mpath_deldup(rto, rt))
- panic ("rtrequest1: rt_mpath_deldup");
- RT_LOCK(rt);
- RT_ADDREF(rt);
- rt->rt_flags &= ~RTF_UP;
- goto deldone; /* done with the RTM_DELETE command */
+ if (error != ENOENT)
+ goto bad;
}
-
-normal_rtdel:
#endif
/*
* Remove the item from the tree and return it.
@@ -944,9 +991,6 @@ normal_rtdel:
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
-#ifdef RADIX_MPATH
-deldone:
-#endif
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
@@ -1019,6 +1063,7 @@ deldone:
IFAREF(ifa);
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
+ rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
diff --git a/sys/net/route.h b/sys/net/route.h
index 44b04ac..2624788 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -58,6 +58,7 @@ struct rt_metrics_lite {
u_long rmx_mtu; /* MTU for this path */
u_long rmx_expire; /* lifetime for route, e.g. redirect */
u_long rmx_pksent; /* packets sent using this route */
+ u_long rmx_weight; /* absolute weight */
};
struct rt_metrics {
@@ -71,7 +72,8 @@ struct rt_metrics {
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_pksent; /* packets sent using this route */
- u_long rmx_filler[4]; /* will be used for T/TCP later */
+ u_long rmx_weight; /* route weight */
+ u_long rmx_filler[3]; /* will be used for T/TCP later */
};
/*
@@ -193,13 +195,15 @@ struct ortentry {
#define RTF_LOCAL 0x200000 /* route represents a local address */
#define RTF_BROADCAST 0x400000 /* route represents a bcast address */
#define RTF_MULTICAST 0x800000 /* route represents a mcast address */
- /* 0x1000000 and up unassigned */
-#define RTF_RNH_LOCKED 0x40000000 /* radix node head locked by caller */
+ /* 0x8000000 and up unassigned */
+#define RTF_STICKY 0x10000000 /* always route dst->src */
+
+#define RTF_RNH_LOCKED 0x40000000 /* radix node head is locked */
/* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
#define RTF_FMASK \
(RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
- RTF_REJECT | RTF_STATIC)
+ RTF_REJECT | RTF_STATIC | RTF_STICKY)
/*
* Routing statistics.
@@ -225,12 +229,11 @@ struct rt_msghdr {
int rtm_seq; /* for sender to identify action */
int rtm_errno; /* why failed */
int rtm_fmask; /* bitmask used in RTM_CHANGE message */
-#define rtm_use rtm_fmask /* deprecated, use rtm_rmx->rmx_pksent */
u_long rtm_inits; /* which metrics we are initializing */
struct rt_metrics rtm_rmx; /* metrics themselves */
};
-#define RTM_VERSION 5 /* Up the ante and ignore older versions */
+#define RTM_VERSION 6 /* Up the ante and ignore older versions */
/*
* Message types.
@@ -265,6 +268,7 @@ struct rt_msghdr {
#define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */
#define RTV_RTT 0x40 /* init or lock _rtt */
#define RTV_RTTVAR 0x80 /* init or lock _rttvar */
+#define RTV_WEIGHT 0x100 /* init or lock _weight */
/*
* Bitmask values for rtm_addrs.
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 91aec20..6b7c29b 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -637,7 +637,6 @@ route_output(struct mbuf *m, struct socket *so)
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = 0;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@@ -691,10 +690,8 @@ route_output(struct mbuf *m, struct socket *so)
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
- if (rtm->rtm_fmask & RTF_FMASK)
- rt->rt_flags = (rt->rt_flags &
- ~rtm->rtm_fmask) |
- (rtm->rtm_flags & rtm->rtm_fmask);
+ rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) |
+ (rtm->rtm_flags & RTF_FMASK);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
@@ -773,6 +770,7 @@ rt_setmetrics(u_long which, const struct rt_metrics *in,
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
+ metric(RTV_WEIGHT, rmx_weight);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
@@ -786,6 +784,7 @@ rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
+ metric(rmx_weight);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
@@ -1257,7 +1256,10 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = rt->rt_rmx.rmx_pksent;
+ /*
+ * Admittedly a hack: rtm_fmask is reused here to carry rmx_pksent,
+ * as the removed rtm_use alias used to do.
+ */
+ rtm->rtm_fmask = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
diff --git a/sys/sys/param.h b/sys/sys/param.h
index f02853c..8703c30 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -57,7 +57,7 @@
* is created, otherwise 1.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 800077 /* Master, propagated to newvers */
+#define __FreeBSD_version 800078 /* Master, propagated to newvers */
#ifndef LOCORE
#include <sys/types.h>
OpenPOWER on IntegriCloud