summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorcel <cel@FreeBSD.org>2006-05-23 18:33:58 +0000
committercel <cel@FreeBSD.org>2006-05-23 18:33:58 +0000
commitec80996e6b2ebfba48c19d243ad3fd4f810cd75c (patch)
tree8e9eadb2175472785f8fc6b2f9477037efeed9f7
parent04be51fc90e96399fdb0fbc4352ca5b7b52e4ad6 (diff)
downloadFreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.zip
FreeBSD-src-ec80996e6b2ebfba48c19d243ad3fd4f810cd75c.tar.gz
Refactor the NFS over UDP retransmit timeout estimation logic to allow
the estimator to be more easily tuned and maintained. There should be no functional change except there is now a lower limit on the retransmit timeout to prevent the client from retransmitting faster than the server's disks can fill requests, and an upper limit to prevent the estimator from taking too long to retransmit during a server outage. Reviewed by: mohan, kris, silby Sponsored by: Network Appliance, Incorporated
-rw-r--r--sys/nfsclient/nfs.h25
-rw-r--r--sys/nfsclient/nfs_socket.c191
-rw-r--r--sys/nfsclient/nfsmount.h4
3 files changed, 158 insertions, 62 deletions
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index c3c54b2..9dc34a7 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -257,6 +257,31 @@ extern int nfs_debug;
#endif
+/*
+ * On fast networks, the estimator will try to reduce the
+ * timeout lower than the latency of the server's disks,
+ * which results in too many timeouts, so cap the lower
+ * bound.
+ */
+#define NFS_MINRTO (NFS_HZ >> 2)
+
+/*
+ * Keep the RTO from increasing to unreasonably large values
+ * when a server is not responding.
+ */
+#define NFS_MAXRTO (20 * NFS_HZ)
+
+enum nfs_rto_timer_t {
+ NFS_DEFAULT_TIMER,
+ NFS_GETATTR_TIMER,
+ NFS_LOOKUP_TIMER,
+ NFS_READ_TIMER,
+ NFS_WRITE_TIMER,
+};
+#define NFS_MAX_TIMER (NFS_WRITE_TIMER)
+
+#define NFS_INITRTT (NFS_HZ << 3)
+
vfs_init_t nfs_init;
vfs_uninit_t nfs_uninit;
int nfs_mountroot(struct mount *mp, struct thread *td);
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index d2fd025..5e12939 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -79,38 +79,6 @@ __FBSDID("$FreeBSD$");
extern u_int32_t nfs_xid;
-/*
- * Estimate rto for an nfs rpc sent via. an unreliable datagram.
- * Use the mean and mean deviation of rtt for the appropriate type of rpc
- * for the frequent rpcs and a default for the others.
- * The justification for doing "other" this way is that these rpcs
- * happen so infrequently that timer est. would probably be stale.
- * Also, since many of these rpcs are
- * non-idempotent, a conservative timeout is desired.
- * getattr, lookup - A+2D
- * read, write - A+4D
- * other - nm_timeo
- */
-#define NFS_RTO(n, t) \
- ((t) == 0 ? (n)->nm_timeo : \
- ((t) < 3 ? \
- (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
- ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
-#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
-#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
-
-/*
- * Defines which timer to use for the procnum.
- * 0 - default
- * 1 - getattr
- * 2 - lookup
- * 3 - read
- * 4 - write
- */
-static int proct[NFS_NPROCS] = {
- 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-};
-
static int nfs_realign_test;
static int nfs_realign_count;
static int nfs_bufpackets = 4;
@@ -157,6 +125,132 @@ static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
extern struct mtx nfs_reqq_mtx;
/*
+ * RTT estimator
+ */
+
+static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
+ NFS_DEFAULT_TIMER, /* NULL */
+ NFS_GETATTR_TIMER, /* GETATTR */
+ NFS_DEFAULT_TIMER, /* SETATTR */
+ NFS_LOOKUP_TIMER, /* LOOKUP */
+ NFS_GETATTR_TIMER, /* ACCESS */
+ NFS_READ_TIMER, /* READLINK */
+ NFS_READ_TIMER, /* READ */
+ NFS_WRITE_TIMER, /* WRITE */
+ NFS_DEFAULT_TIMER, /* CREATE */
+ NFS_DEFAULT_TIMER, /* MKDIR */
+ NFS_DEFAULT_TIMER, /* SYMLINK */
+ NFS_DEFAULT_TIMER, /* MKNOD */
+ NFS_DEFAULT_TIMER, /* REMOVE */
+ NFS_DEFAULT_TIMER, /* RMDIR */
+ NFS_DEFAULT_TIMER, /* RENAME */
+ NFS_DEFAULT_TIMER, /* LINK */
+ NFS_READ_TIMER, /* READDIR */
+ NFS_READ_TIMER, /* READDIRPLUS */
+ NFS_DEFAULT_TIMER, /* FSSTAT */
+ NFS_DEFAULT_TIMER, /* FSINFO */
+ NFS_DEFAULT_TIMER, /* PATHCONF */
+ NFS_DEFAULT_TIMER, /* COMMIT */
+ NFS_DEFAULT_TIMER, /* NOOP */
+};
+
+/*
+ * Choose the correct RTT timer for this NFS procedure.
+ */
+static inline enum nfs_rto_timer_t
+nfs_rto_timer(u_int32_t procnum)
+{
+ return nfs_proct[procnum];
+}
+
+/*
+ * Initialize the RTT estimator state for a new mount point.
+ */
+static void
+nfs_init_rtt(struct nfsmount *nmp)
+{
+ int i;
+
+ for (i = 0; i < NFS_MAX_TIMER; i++)
+ nmp->nm_srtt[i] = NFS_INITRTT;
+ for (i = 0; i < NFS_MAX_TIMER; i++)
+ nmp->nm_sdrtt[i] = 0;
+}
+
+/*
+ * Update a mount point's RTT estimator state using data from the
+ * passed-in request.
+ *
+ * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
+ *
+ * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
+ * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
+ * between N + dt and N + 2 - dt ticks, add 1 before calculating the
+ * update values.
+ */
+static void
+nfs_update_rtt(struct nfsreq *rep)
+{
+ int t1 = rep->r_rtt + 1;
+ int index = nfs_rto_timer(rep->r_procnum) - 1;
+ int *srtt = &rep->r_nmp->nm_srtt[index];
+ int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
+
+ t1 -= *srtt >> 3;
+ *srtt += t1;
+ if (t1 < 0)
+ t1 = -t1;
+ t1 -= *sdrtt >> 2;
+ *sdrtt += t1;
+}
+
+/*
+ * Estimate RTO for an NFS RPC sent via an unreliable datagram.
+ *
+ * Use the mean and mean deviation of RTT for the appropriate type
+ * of RPC for the frequent RPCs and a default for the others.
+ * The justification for doing "other" this way is that these RPCs
+ * happen so infrequently that timer est. would probably be stale.
+ * Also, since many of these RPCs are non-idempotent, a conservative
+ * timeout is desired.
+ *
+ * getattr, lookup - A+2D
+ * read, write - A+4D
+ * other - nm_timeo
+ */
+static int
+nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
+{
+ enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
+ int index = timer - 1;
+ int rto;
+
+ switch (timer) {
+ case NFS_GETATTR_TIMER:
+ case NFS_LOOKUP_TIMER:
+ rto = ((nmp->nm_srtt[index] + 3) >> 2) +
+ ((nmp->nm_sdrtt[index] + 1) >> 1);
+ break;
+ case NFS_READ_TIMER:
+ case NFS_WRITE_TIMER:
+ rto = ((nmp->nm_srtt[index] + 7) >> 3) +
+ (nmp->nm_sdrtt[index] + 1);
+ break;
+ default:
+ rto = nmp->nm_timeo;
+ return (rto);
+ }
+
+ if (rto < NFS_MINRTO)
+ rto = NFS_MINRTO;
+ else if (rto > NFS_MAXRTO)
+ rto = NFS_MAXRTO;
+
+ return (rto);
+}
+
+
+/*
* Initialize sockets and congestion for a new NFS connection.
* We do not free the sockaddr if error.
*/
@@ -357,10 +451,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
mtx_lock(&nmp->nm_mtx);
/* Initialize other non-zero congestion variables */
- nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
- nmp->nm_srtt[3] = (NFS_TIMEO << 3);
- nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
- nmp->nm_sdrtt[3] = 0;
+ nfs_init_rtt(nmp);
nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
nmp->nm_sent = 0;
nmp->nm_timeouts = 0;
@@ -685,7 +776,6 @@ nfs_clnt_match_xid(struct socket *so,
caddr_t dpos;
u_int32_t rxid, *tl;
struct nfsreq *rep;
- register int32_t t1;
int error;
/*
@@ -743,27 +833,8 @@ nfsmout:
rep->r_flags &= ~R_SENT;
nmp->nm_sent -= NFS_CWNDSCALE;
}
- /*
- * Update rtt using a gain of 0.125 on the mean
- * and a gain of 0.25 on the deviation.
- */
- if (rep->r_flags & R_TIMING) {
- /*
- * Since the timer resolution of
- * NFS_HZ is so course, it can often
- * result in r_rtt == 0. Since
- * r_rtt == N means that the actual
- * rtt is between N+dt and N+2-dt ticks,
- * add 1.
- */
- t1 = rep->r_rtt + 1;
- t1 -= (NFS_SRTT(rep) >> 3);
- NFS_SRTT(rep) += t1;
- if (t1 < 0)
- t1 = -t1;
- t1 -= (NFS_SDRTT(rep) >> 2);
- NFS_SDRTT(rep) += t1;
- }
+ if (rep->r_flags & R_TIMING)
+ nfs_update_rtt(rep);
nmp->nm_timeouts = 0;
wakeup((caddr_t)rep);
mtx_unlock(&rep->r_mtx);
@@ -1073,7 +1144,7 @@ tryagain:
else
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
rep->r_rtt = rep->r_rexmit = 0;
- if (proct[procnum] > 0)
+ if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
rep->r_flags = R_TIMING;
else
rep->r_flags = 0;
@@ -1328,7 +1399,7 @@ nfs_timer(void *arg)
if (nmp->nm_flag & NFSMNT_DUMBTIMR)
timeo = nmp->nm_timeo;
else
- timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+ timeo = nfs_estimate_rto(nmp, rep->r_procnum);
if (nmp->nm_timeouts > 0)
timeo *= nfs_backoff[nmp->nm_timeouts - 1];
if (rep->r_rtt <= timeo) {
diff --git a/sys/nfsclient/nfsmount.h b/sys/nfsclient/nfsmount.h
index e7b9d0f..4fd2afe 100644
--- a/sys/nfsclient/nfsmount.h
+++ b/sys/nfsclient/nfsmount.h
@@ -64,8 +64,8 @@ struct nfsmount {
struct sockaddr *nm_nam; /* Addr of server */
int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */
int nm_retry; /* Max retries */
- int nm_srtt[4]; /* Timers for rpcs */
- int nm_sdrtt[4];
+ int nm_srtt[NFS_MAX_TIMER], /* RTT Timers for rpcs */
+ nm_sdrtt[NFS_MAX_TIMER];
int nm_sent; /* Request send count */
int nm_cwnd; /* Request send window */
int nm_timeouts; /* Request timeouts */
OpenPOWER on IntegriCloud