Refactor the NFS over UDP retransmit timeout estimation logic to allow

the estimator to be more easily tuned and maintained. There should be no functional change except there is now a lower limit on the retransmit timeout to prevent the client from retransmitting faster than the server's disks can fill requests, and an upper limit to prevent the estimator from taking too long to retransmit during a server outage. Reviewed by: mohan, kris, silby Sponsored by: Network Appliance, Incorporated
author: cel <cel@FreeBSD.org> 2006-05-23 18:33:58 +0000
committer: cel <cel@FreeBSD.org> 2006-05-23 18:33:58 +0000
commit: ec80996e6b2ebfba48c19d243ad3fd4f810cd75c (patch)
tree: 8e9eadb2175472785f8fc6b2f9477037efeed9f7
parent: 04be51fc90e96399fdb0fbc4352ca5b7b52e4ad6 (diff)
download: FreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.zip
FreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.tar.gz
3 files changed, 158 insertions, 62 deletions
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index c3c54b2..9dc34a7 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -257,6 +257,31 @@ extern int nfs_debug;
 
 #endif
 
+/*
+ * On fast networks, the estimator will try to reduce the
+ * timeout lower than the latency of the server's disks,
+ * which results in too many timeouts, so cap the lower
+ * bound.
+ */
+#define NFS_MINRTO	(NFS_HZ >> 2)
+
+/*
+ * Keep the RTO from increasing to unreasonably large values
+ * when a server is not responding.
+ */
+#define NFS_MAXRTO	(20 * NFS_HZ)
+
+enum nfs_rto_timer_t {
+	NFS_DEFAULT_TIMER,	/* 0: no estimator; RTO is nm_timeo */
+	NFS_GETATTR_TIMER,	/* 1: uses nm_srtt[0] / nm_sdrtt[0] */
+	NFS_LOOKUP_TIMER,	/* 2: uses nm_srtt[1] / nm_sdrtt[1] */
+	NFS_READ_TIMER,		/* 3: uses nm_srtt[2] / nm_sdrtt[2] */
+	NFS_WRITE_TIMER,	/* 4: uses nm_srtt[3] / nm_sdrtt[3] */
+};
+#define NFS_MAX_TIMER	(NFS_WRITE_TIMER)	/* count of estimated timers */
+
+#define NFS_INITRTT	(NFS_HZ << 3)	/* initial nm_srtt[] value */
+
 vfs_init_t nfs_init;
 vfs_uninit_t nfs_uninit;
 int	nfs_mountroot(struct mount *mp, struct thread *td);
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index d2fd025..5e12939 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -79,38 +79,6 @@ __FBSDID("$FreeBSD$");
 
 extern u_int32_t nfs_xid;
 
-/*
- * Estimate rto for an nfs rpc sent via. an unreliable datagram.
- * Use the mean and mean deviation of rtt for the appropriate type of rpc
- * for the frequent rpcs and a default for the others.
- * The justification for doing "other" this way is that these rpcs
- * happen so infrequently that timer est. would probably be stale.
- * Also, since many of these rpcs are
- * non-idempotent, a conservative timeout is desired.
- * getattr, lookup - A+2D
- * read, write     - A+4D
- * other           - nm_timeo
- */
-#define	NFS_RTO(n, t) \
-	((t) == 0 ? (n)->nm_timeo : \
-	 ((t) < 3 ? \
-	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
-	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
-#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
-#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
-
-/*
- * Defines which timer to use for the procnum.
- * 0 - default
- * 1 - getattr
- * 2 - lookup
- * 3 - read
- * 4 - write
- */
-static int proct[NFS_NPROCS] = {
-	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-};
-
 static int	nfs_realign_test;
 static int	nfs_realign_count;
 static int	nfs_bufpackets = 4;
@@ -157,6 +125,132 @@ static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
 extern struct mtx nfs_reqq_mtx;
 
 /*
+ * RTT estimator
+ */
+
+static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {	/* by procnum */
+	NFS_DEFAULT_TIMER,	/* NULL */
+	NFS_GETATTR_TIMER,	/* GETATTR */
+	NFS_DEFAULT_TIMER,	/* SETATTR */
+	NFS_LOOKUP_TIMER,	/* LOOKUP */
+	NFS_GETATTR_TIMER,	/* ACCESS */
+	NFS_READ_TIMER,		/* READLINK */
+	NFS_READ_TIMER,		/* READ */
+	NFS_WRITE_TIMER,	/* WRITE */
+	NFS_DEFAULT_TIMER,	/* CREATE */
+	NFS_DEFAULT_TIMER,	/* MKDIR */
+	NFS_DEFAULT_TIMER,	/* SYMLINK */
+	NFS_DEFAULT_TIMER,	/* MKNOD */
+	NFS_DEFAULT_TIMER,	/* REMOVE */
+	NFS_DEFAULT_TIMER,	/* RMDIR */
+	NFS_DEFAULT_TIMER,	/* RENAME */
+	NFS_DEFAULT_TIMER,	/* LINK */
+	NFS_READ_TIMER,		/* READDIR */
+	NFS_READ_TIMER,		/* READDIRPLUS */
+	NFS_DEFAULT_TIMER,	/* FSSTAT */
+	NFS_DEFAULT_TIMER,	/* FSINFO */
+	NFS_DEFAULT_TIMER,	/* PATHCONF */
+	NFS_DEFAULT_TIMER,	/* COMMIT */
+	NFS_DEFAULT_TIMER,	/* NOOP */
+};
+
+/*
+ * Map an NFS procedure number to the RTT timer it is accounted under.
+ */
+static inline enum nfs_rto_timer_t
+nfs_rto_timer(u_int32_t procnum)
+{
+	return nfs_proct[procnum];	/* unchecked: assumes procnum < NFS_NPROCS */
+}
+
+/*
+ * Reset all of a mount point's RTT estimators to their initial state.
+ */
+static void
+nfs_init_rtt(struct nfsmount *nmp)
+{
+	int i;
+
+	for (i = 0; i < NFS_MAX_TIMER; i++)	/* smoothed mean */
+		nmp->nm_srtt[i] = NFS_INITRTT;
+	for (i = 0; i < NFS_MAX_TIMER; i++)	/* smoothed deviation */
+		nmp->nm_sdrtt[i] = 0;
+}
+
+/*
+ * Update a mount point's RTT estimator state using data from the
+ * passed-in request, whose procedure must not use NFS_DEFAULT_TIMER
+ * (that would index the per-timer arrays at -1).
+ * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
+ *
+ * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
+ * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
+ * between N + dt and N + 2 - dt ticks, add 1 before calculating the
+ * update values.
+ */
+static void
+nfs_update_rtt(struct nfsreq *rep)
+{
+	int t1 = rep->r_rtt + 1;	/* measured RTT, rounded up a tick */
+	int index = nfs_rto_timer(rep->r_procnum) - 1;	/* timer N -> slot N-1 */
+	int *srtt = &rep->r_nmp->nm_srtt[index];
+	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
+
+	t1 -= *srtt >> 3;	/* t1 = sample - mean (mean stored x8) */
+	*srtt += t1;		/* mean moves 1/8 of the way to the sample */
+	if (t1 < 0)
+		t1 = -t1;	/* deviation uses |error| */
+	t1 -= *sdrtt >> 2;	/* deviation is stored x4 */
+	*sdrtt += t1;		/* deviation moves 1/4 of the way to |error| */
+}
+
+/*
+ * Estimate RTO, in ticks, for an NFS RPC sent via an unreliable datagram.
+ *
+ * Use the mean (A) and mean deviation (D) of RTT for the appropriate
+ * type of RPC for the frequent RPCs and a default for the others.
+ * nm_srtt[] holds A scaled by 8 and nm_sdrtt[] holds D scaled by 4.
+ * The "other" RPCs happen so infrequently that a timer estimate would
+ * probably be stale, and many are non-idempotent, so they get the
+ * conservative nm_timeo, returned without clamping.
+ *
+ * getattr, lookup - A+2D	} clamped to
+ * read, write     - A+4D	} [NFS_MINRTO, NFS_MAXRTO]
+ * other           - nm_timeo
+ */
+static int
+nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
+{
+	enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
+	int index = timer - 1;	/* estimator slot; unused for the default */
+	int rto;
+
+	switch (timer) {
+	case NFS_GETATTR_TIMER:
+	case NFS_LOOKUP_TIMER:		/* (2A + 4D) / 2 = A + 2D */
+		rto = (((nmp->nm_srtt[index] + 3) >> 2) +
+				nmp->nm_sdrtt[index] + 1) >> 1;
+		break;
+	case NFS_READ_TIMER:
+	case NFS_WRITE_TIMER:		/* A + 4D */
+		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
+				(nmp->nm_sdrtt[index] + 1);
+		break;
+	default:			/* NFS_DEFAULT_TIMER: no estimate */
+		rto = nmp->nm_timeo;
+		return (rto);
+	}
+
+	if (rto < NFS_MINRTO)
+		rto = NFS_MINRTO;
+	else if (rto > NFS_MAXRTO)
+		rto = NFS_MAXRTO;
+
+	return (rto);
+}
+
+
+/*
  * Initialize sockets and congestion for a new NFS connection.
  * We do not free the sockaddr if error.
  */
@@ -357,10 +451,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 
 	mtx_lock(&nmp->nm_mtx);
 	/* Initialize other non-zero congestion variables */
-	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
-		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
-	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
-		nmp->nm_sdrtt[3] = 0;
+	nfs_init_rtt(nmp);
 	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
 	nmp->nm_sent = 0;
 	nmp->nm_timeouts = 0;
@@ -685,7 +776,6 @@ nfs_clnt_match_xid(struct socket *so,
 	caddr_t dpos;
 	u_int32_t rxid, *tl;
 	struct nfsreq *rep;
-	register int32_t t1;
 	int error;
 	
 	/*
@@ -743,27 +833,8 @@ nfsmout:
 				rep->r_flags &= ~R_SENT;
 				nmp->nm_sent -= NFS_CWNDSCALE;
 			}
-			/*
-			 * Update rtt using a gain of 0.125 on the mean
-			 * and a gain of 0.25 on the deviation.
-			 */
-			if (rep->r_flags & R_TIMING) {
-				/*
-				 * Since the timer resolution of
-				 * NFS_HZ is so course, it can often
-				 * result in r_rtt == 0. Since
-				 * r_rtt == N means that the actual
-				 * rtt is between N+dt and N+2-dt ticks,
-				 * add 1.
-				 */
-				t1 = rep->r_rtt + 1;
-				t1 -= (NFS_SRTT(rep) >> 3);
-				NFS_SRTT(rep) += t1;
-				if (t1 < 0)
-					t1 = -t1;
-				t1 -= (NFS_SDRTT(rep) >> 2);
-				NFS_SDRTT(rep) += t1;
-			}
+			if (rep->r_flags & R_TIMING)
+				nfs_update_rtt(rep);
 			nmp->nm_timeouts = 0;
 			wakeup((caddr_t)rep);
 			mtx_unlock(&rep->r_mtx);
@@ -1073,7 +1144,7 @@ tryagain:
 	else
 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
 	rep->r_rtt = rep->r_rexmit = 0;
-	if (proct[procnum] > 0)
+	if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
 		rep->r_flags = R_TIMING;
 	else
 		rep->r_flags = 0;
@@ -1328,7 +1399,7 @@ nfs_timer(void *arg)
 			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
 				timeo = nmp->nm_timeo;
 			else
-				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
 			if (nmp->nm_timeouts > 0)
 				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
 			if (rep->r_rtt <= timeo) {
diff --git a/sys/nfsclient/nfsmount.h b/sys/nfsclient/nfsmount.h
index e7b9d0f..4fd2afe 100644
--- a/sys/nfsclient/nfsmount.h
+++ b/sys/nfsclient/nfsmount.h
@@ -64,8 +64,8 @@ struct	nfsmount {
 	struct	sockaddr *nm_nam;	/* Addr of server */
 	int	nm_timeo;		/* Init timer for NFSMNT_DUMBTIMR */
 	int	nm_retry;		/* Max retries */
-	int	nm_srtt[4];		/* Timers for rpcs */
-	int	nm_sdrtt[4];
+	int	nm_srtt[NFS_MAX_TIMER],	/* RTT Timers for rpcs */
+		nm_sdrtt[NFS_MAX_TIMER];
 	int	nm_sent;		/* Request send count */
 	int	nm_cwnd;		/* Request send window */
 	int	nm_timeouts;		/* Request timeouts */
author	cel <cel@FreeBSD.org>	2006-05-23 18:33:58 +0000
committer	cel <cel@FreeBSD.org>	2006-05-23 18:33:58 +0000
commit	ec80996e6b2ebfba48c19d243ad3fd4f810cd75c (patch)
tree	8e9eadb2175472785f8fc6b2f9477037efeed9f7
parent	04be51fc90e96399fdb0fbc4352ca5b7b52e4ad6 (diff)
download	FreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.zip FreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.tar.gz