diff options
Diffstat (limited to 'sys')
-rw-r--r-- | sys/conf/NOTES | 2 | ||||
-rw-r--r-- | sys/conf/files | 6 | ||||
-rw-r--r-- | sys/dev/filemon/filemon.c | 4 | ||||
-rw-r--r-- | sys/dev/xen/control/control.c | 3 | ||||
-rw-r--r-- | sys/fs/nfs/nfs_var.h | 1 | ||||
-rw-r--r-- | sys/fs/nfsclient/nfs_clrpcops.c | 38 | ||||
-rw-r--r-- | sys/fs/nfsserver/nfs_nfsdkrpc.c | 1 | ||||
-rw-r--r-- | sys/fs/nfsserver/nfs_nfsdstate.c | 55 | ||||
-rw-r--r-- | sys/kern/vfs_subr.c | 16 | ||||
-rw-r--r-- | sys/modules/Makefile.inc | 1 | ||||
-rw-r--r-- | sys/modules/drm2/Makefile | 2 | ||||
-rw-r--r-- | sys/modules/drm2/radeonkmsfw/Makefile | 2 | ||||
-rw-r--r-- | sys/modules/netgraph/Makefile | 2 | ||||
-rw-r--r-- | sys/netinet/igmp.c | 10 | ||||
-rw-r--r-- | sys/netinet6/mld6.c | 10 | ||||
-rw-r--r-- | sys/netpfil/ipfw/ip_fw_dynamic.c | 3 | ||||
-rw-r--r-- | sys/vm/device_pager.c | 28 | ||||
-rw-r--r-- | sys/vm/vm_page.h | 1 | ||||
-rw-r--r-- | sys/vm/vm_pageout.c | 112 | ||||
-rw-r--r-- | sys/x86/xen/hvm.c | 21 |
20 files changed, 237 insertions, 81 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES index e05e366..9c8b9c9 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -2706,6 +2706,8 @@ device uvisor # USB serial support for DDI pocket's PHS device uvscom # +# USB ethernet support +device uether # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus diff --git a/sys/conf/files b/sys/conf/files index e124bf4..f6456a7 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2529,9 +2529,9 @@ dev/usb/net/if_udav.c optional udav dev/usb/net/if_usie.c optional usie dev/usb/net/if_urndis.c optional urndis dev/usb/net/ruephy.c optional rue -dev/usb/net/usb_ethernet.c optional aue | axe | axge | cdce | cue | kue | \ - mos | rue | smsc | udav | ipheth | \ - urndis +dev/usb/net/usb_ethernet.c optional uether | aue | axe | axge | cdce | \ + cue | ipheth | kue | mos | rue | \ + smsc | udav | urndis dev/usb/net/uhso.c optional uhso # # USB WLAN drivers diff --git a/sys/dev/filemon/filemon.c b/sys/dev/filemon/filemon.c index f8a698f..b302de9 100644 --- a/sys/dev/filemon/filemon.c +++ b/sys/dev/filemon/filemon.c @@ -43,7 +43,6 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> -#include <sys/mutex.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/queue.h> @@ -195,9 +194,6 @@ filemon_open(struct cdev *dev, int oflags __unused, int devtype __unused, if (filemon == NULL) { filemon = malloc(sizeof(struct filemon), M_FILEMON, M_WAITOK | M_ZERO); - - filemon->fp = NULL; - sx_init(&filemon->lock, "filemon"); } diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c index bc0609d..60e448a 100644 --- a/sys/dev/xen/control/control.c +++ b/sys/dev/xen/control/control.c @@ -127,6 +127,7 @@ __FBSDID("$FreeBSD$"); #include <machine/_inttypes.h> #include <machine/intr_machdep.h> +#include <machine/apicvar.h> #include <vm/vm.h> #include 
<vm/vm_extern.h> @@ -403,6 +404,8 @@ xctrl_suspend() gnttab_resume(); #ifdef SMP + /* Send an IPI_BITMAP in case there are pending bitmap IPIs. */ + lapic_ipi_vectored(IPI_BITMAP_VECTOR, APIC_IPI_DEST_ALL); if (smp_started && !CPU_EMPTY(&cpu_suspend_map)) { /* * Now that event channels have been initialized, diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 2abd7e4..d540dc9 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -135,6 +135,7 @@ int nfsrv_checksequence(struct nfsrv_descript *, uint32_t, uint32_t *, uint32_t *, int, uint32_t *, NFSPROC_T *); int nfsrv_checkreclaimcomplete(struct nfsrv_descript *); void nfsrv_cache_session(uint8_t *, uint32_t, int, struct mbuf **); +void nfsrv_freeallbackchannel_xprts(void); /* nfs_nfsdserv.c */ int nfsrvd_access(struct nfsrv_descript *, int, diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index bfe6fa3..429cfcc 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -3087,6 +3087,25 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, *eofp = eof; } + /* + * Add extra empty records to any remaining DIRBLKSIZ chunks. + */ + while (uio_uio_resid(uiop) > 0 && ((size_t)(uio_uio_resid(uiop))) != tresid) { + dp = (struct dirent *) CAST_DOWN(caddr_t, uio_iov_base(uiop)); + dp->d_type = DT_UNKNOWN; + dp->d_fileno = 0; + dp->d_namlen = 0; + dp->d_name[0] = '\0'; + tl = (u_int32_t *)&dp->d_name[4]; + *tl++ = cookie.lval[0]; + *tl = cookie.lval[1]; + dp->d_reclen = DIRBLKSIZ; + uio_iov_base_add(uiop, DIRBLKSIZ); + uio_iov_len_add(uiop, -(DIRBLKSIZ)); + uio_uio_resid_add(uiop, -(DIRBLKSIZ)); + uiop->uio_offset += DIRBLKSIZ; + } + nfsmout: if (nd->nd_mrep != NULL) mbuf_freem(nd->nd_mrep); @@ -3561,6 +3580,25 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, *eofp = eof; } + /* + * Add extra empty records to any remaining DIRBLKSIZ chunks. 
+ */ + while (uio_uio_resid(uiop) > 0 && uio_uio_resid(uiop) != tresid) { + dp = (struct dirent *)uio_iov_base(uiop); + dp->d_type = DT_UNKNOWN; + dp->d_fileno = 0; + dp->d_namlen = 0; + dp->d_name[0] = '\0'; + tl = (u_int32_t *)&dp->d_name[4]; + *tl++ = cookie.lval[0]; + *tl = cookie.lval[1]; + dp->d_reclen = DIRBLKSIZ; + uio_iov_base_add(uiop, DIRBLKSIZ); + uio_iov_len_add(uiop, -(DIRBLKSIZ)); + uio_uio_resid_add(uiop, -(DIRBLKSIZ)); + uiop->uio_offset += DIRBLKSIZ; + } + nfsmout: if (nd->nd_mrep != NULL) mbuf_freem(nd->nd_mrep); diff --git a/sys/fs/nfsserver/nfs_nfsdkrpc.c b/sys/fs/nfsserver/nfs_nfsdkrpc.c index e68a18b..7326038 100644 --- a/sys/fs/nfsserver/nfs_nfsdkrpc.c +++ b/sys/fs/nfsserver/nfs_nfsdkrpc.c @@ -547,6 +547,7 @@ nfsrvd_init(int terminating) if (terminating) { nfsd_master_proc = NULL; NFSD_UNLOCK(); + nfsrv_freeallbackchannel_xprts(); svcpool_destroy(nfsrvd_pool); nfsrvd_pool = NULL; NFSD_LOCK(); diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c index c6d9448..37fb3b6 100644 --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -4188,10 +4188,23 @@ nfsrv_docallback(struct nfsclient *clp, int procnum, if (!error) { if ((nd->nd_flag & ND_NFSV41) != 0) { KASSERT(sep != NULL, ("sep NULL")); - error = newnfs_request(nd, NULL, clp, &clp->lc_req, - NULL, NULL, cred, clp->lc_program, - clp->lc_req.nr_vers, NULL, 1, NULL, - &sep->sess_cbsess); + if (sep->sess_cbsess.nfsess_xprt != NULL) + error = newnfs_request(nd, NULL, clp, + &clp->lc_req, NULL, NULL, cred, + clp->lc_program, clp->lc_req.nr_vers, NULL, + 1, NULL, &sep->sess_cbsess); + else { + /* + * This should probably never occur, but if a + * client somehow does an RPC without a + * SequenceID Op that causes a callback just + * after the nfsd threads have been terminated + * and restarted we could conceivably get here + * without a backchannel xprt.
+ */ + printf("nfsrv_docallback: no xprt\n"); + error = ECONNREFUSED; + } nfsrv_freesession(sep, NULL); } else error = newnfs_request(nd, NULL, clp, &clp->lc_req, @@ -5776,14 +5789,16 @@ nfsrv_checksequence(struct nfsrv_descript *nd, uint32_t sequenceid, * If this session handles the backchannel, save the nd_xprt for this * RPC, since this is the one being used. */ - if (sep->sess_cbsess.nfsess_xprt != NULL && + if (sep->sess_clp->lc_req.nr_client != NULL && (sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0) { savxprt = sep->sess_cbsess.nfsess_xprt; SVC_ACQUIRE(nd->nd_xprt); - nd->nd_xprt->xp_p2 = savxprt->xp_p2; + nd->nd_xprt->xp_p2 = + sep->sess_clp->lc_req.nr_client->cl_private; nd->nd_xprt->xp_idletimeout = 0; /* Disable timeout. */ sep->sess_cbsess.nfsess_xprt = nd->nd_xprt; - SVC_RELEASE(savxprt); + if (savxprt != NULL) + SVC_RELEASE(savxprt); } *sflagsp = 0; @@ -6042,3 +6057,29 @@ nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp) return (0); } +/* + * Free up all backchannel xprts. This needs to be done when the nfsd threads + * exit, since those transports will all be going away. + * This is only called after all the nfsd threads are done performing RPCs, + * so locking shouldn't be an issue. + */ +APPLESTATIC void +nfsrv_freeallbackchannel_xprts(void) +{ + struct nfsdsession *sep; + struct nfsclient *clp; + SVCXPRT *xprt; + int i; + + for (i = 0; i < nfsrv_clienthashsize; i++) { + LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) { + LIST_FOREACH(sep, &clp->lc_session, sess_list) { + xprt = sep->sess_cbsess.nfsess_xprt; + sep->sess_cbsess.nfsess_xprt = NULL; + if (xprt != NULL) + SVC_RELEASE(xprt); + } + } + } +} + diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index b5f5b42..2df1e25 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2425,6 +2425,10 @@ vdrop(struct vnode *vp) * Drop the hold count of the vnode. 
If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. + * + * Because the vnode vm object keeps a hold reference on the vnode if + * there is at least one resident non-cached page, the vnode cannot + * leave the active list without the page cleanup done. */ void vdropl(struct vnode *vp) @@ -2540,11 +2544,13 @@ vinactive(struct vnode *vp, struct thread *td) VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any - * modified pages are on the vnode's dirty list since these will - * no longer be checked once the vnode is on the inactive list. - * Because the vnode vm object keeps a hold reference on the vnode - * if there is at least one resident non-cached page, the vnode - * cannot leave the active list without the page cleanup done. + * modified pages are converted into the vnode's dirty + * buffers, since these will no longer be checked once the + * vnode is on the inactive list. + * + * The write-out of the dirty pages is asynchronous. At the + * point that VOP_INACTIVE() is called, there could still be + * pending I/O and dirty pages in the object. 
*/ obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { diff --git a/sys/modules/Makefile.inc b/sys/modules/Makefile.inc index 9dc38af..b20b99b 100644 --- a/sys/modules/Makefile.inc +++ b/sys/modules/Makefile.inc @@ -4,3 +4,4 @@ CFLAGS+= -DPC98 .endif +SUBDIR_PARALLEL= yes diff --git a/sys/modules/drm2/Makefile b/sys/modules/drm2/Makefile index 3671c1a..e4e19fb 100644 --- a/sys/modules/drm2/Makefile +++ b/sys/modules/drm2/Makefile @@ -2,8 +2,6 @@ .include <bsd.own.mk> -SUBDIR_PARALLEL= - .if ${MACHINE_CPUARCH} == "amd64" _i915kms= i915kms _radeonkms= radeonkms diff --git a/sys/modules/drm2/radeonkmsfw/Makefile b/sys/modules/drm2/radeonkmsfw/Makefile index f885da9..167743c 100644 --- a/sys/modules/drm2/radeonkmsfw/Makefile +++ b/sys/modules/drm2/radeonkmsfw/Makefile @@ -1,7 +1,5 @@ # $FreeBSD$ -SUBDIR_PARALLEL= - SUBDIR= \ ARUBA_me \ ARUBA_pfp \ diff --git a/sys/modules/netgraph/Makefile b/sys/modules/netgraph/Makefile index 03873e4..dc44ac7 100644 --- a/sys/modules/netgraph/Makefile +++ b/sys/modules/netgraph/Makefile @@ -62,6 +62,4 @@ _bluetooth= bluetooth _mppc= mppc .endif -SUBDIR_PARALLEL= - .include <bsd.subdir.mk> diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index c138f14..34a60a8 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -3327,6 +3327,15 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); + /* + * Check that there are some packets queued. If so, send them first. + * For large number of groups the reply to general query can take + * many packets, we should finish sending them before starting of + * queuing the new reply. + */ + if (igi->igi_gq.ifq_head != NULL) + goto send; + ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); @@ -3362,6 +3371,7 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) } IF_ADDR_RUNLOCK(ifp); +send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 77b19bf..6e0f95d 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -2989,6 +2989,15 @@ mld_v2_dispatch_general_query(struct mld_ifinfo *mli) KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); + /* + * Check that there are some packets queued. If so, send them first. + * For large number of groups the reply to general query can take + * many packets, we should finish sending them before starting of + * queuing the new reply. + */ + if (mli->mli_gq.ifq_head != NULL) + goto send; + ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); @@ -3024,6 +3033,7 @@ mld_v2_dispatch_general_query(struct mld_ifinfo *mli) } IF_ADDR_RUNLOCK(ifp); +send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* diff --git a/sys/netpfil/ipfw/ip_fw_dynamic.c b/sys/netpfil/ipfw/ip_fw_dynamic.c index 694362a..b6cfa62 100644 --- a/sys/netpfil/ipfw/ip_fw_dynamic.c +++ b/sys/netpfil/ipfw/ip_fw_dynamic.c @@ -715,6 +715,9 @@ ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, id.fib = M_GETFIB(args->m); if (IS_IP6_FLOW_ID (&(args->f_id))) { + bzero(&id.src_ip6, sizeof(id.src_ip6)); + bzero(&id.dst_ip6, sizeof(id.dst_ip6)); + if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index 60c1beb..fd110c7 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -60,10 +60,8 @@ static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dev_pager_dealloc(vm_object_t); static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int); -static void dev_pager_putpages(vm_object_t, vm_page_t *, int, - boolean_t, int *); -static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, - int *); +static void dev_pager_putpages(vm_object_t, vm_page_t 
*, int, int, int *); +static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dev_pager_free_page(vm_object_t object, vm_page_t m); /* list of device pager objects */ @@ -101,8 +99,9 @@ static struct cdev_pager_ops old_dev_pager_ops = { }; static void -dev_pager_init() +dev_pager_init(void) { + TAILQ_INIT(&dev_pager_object_list); mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF); } @@ -231,8 +230,7 @@ dev_pager_free_page(vm_object_t object, vm_page_t m) } static void -dev_pager_dealloc(object) - vm_object_t object; +dev_pager_dealloc(vm_object_t object) { vm_page_t m; @@ -362,24 +360,18 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, } static void -dev_pager_putpages(object, m, count, sync, rtvals) - vm_object_t object; - vm_page_t *m; - int count; - boolean_t sync; - int *rtvals; +dev_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, + int *rtvals) { panic("dev_pager_putpage called"); } static boolean_t -dev_pager_haspage(object, pindex, before, after) - vm_object_t object; - vm_pindex_t pindex; - int *before; - int *after; +dev_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, + int *after) { + if (before != NULL) *before = 0; if (after != NULL) diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index accf517..7ecb6c7 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -227,6 +227,7 @@ struct vm_domain { long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_pass; /* local pagedaemon pass */ + int vmd_oom_seq; int vmd_last_active_scan; struct vm_page vmd_marker; /* marker for pagedaemon private use */ }; diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 2cc738d..156d11a 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -121,7 +121,8 @@ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(struct vm_domain *vmd, int pass); -static void 
vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); +static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, + int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); @@ -158,6 +159,7 @@ int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ int vm_pageout_wakeup_thresh; +static int vm_pageout_oom_seq = 12; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ @@ -217,6 +219,10 @@ static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); +SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, + CTLFLAG_RW, &vm_pageout_oom_seq, 0, + "back-to-back calls to oom detector to start OOM"); + #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; @@ -941,7 +947,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) long min_scan; int act_delta, addl_page_shortage, deficit, maxscan, page_shortage; int vnodes_skipped = 0; - int maxlaunder, scan_tick, scanned; + int maxlaunder, scan_tick, scanned, starting_page_shortage; int lockmode; boolean_t queues_locked; @@ -981,6 +987,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; + starting_page_shortage = page_shortage; /* * maxlaunder limits the number of dirty pages we flush per scan. @@ -1358,6 +1365,12 @@ relock_queues: (void)speedup_syncer(); /* + * If the inactive queue scan fails repeatedly to meet its + * target, kill the largest process. + */ + vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); + + /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. 
*/ @@ -1469,15 +1482,6 @@ relock_queues: } } #endif - - /* - * If we are critically low on one of RAM or swap and low on - * the other, kill the largest process. However, we avoid - * doing this on the first pass in order to give ourselves a - * chance to flush out dirty vnode-backed pages and to allow - * active pages to be moved to the inactive queue and reclaimed. - */ - vm_pageout_mightbe_oom(vmd, pass); } static int vm_pageout_oom_vote; @@ -1488,12 +1492,17 @@ static int vm_pageout_oom_vote; * failed to reach free target is premature. */ static void -vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) +vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, + int starting_page_shortage) { int old_vote; - if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) || - (swap_pager_full && vm_paging_target() > 0))) { + if (starting_page_shortage <= 0 || starting_page_shortage != + page_shortage) + vmd->vmd_oom_seq = 0; + else + vmd->vmd_oom_seq++; + if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); @@ -1501,6 +1510,12 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) return; } + /* + * Do not follow the call sequence until OOM condition is + * cleared. + */ + vmd->vmd_oom_seq = 0; + if (vmd->vmd_oom) return; @@ -1526,6 +1541,65 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) atomic_subtract_int(&vm_pageout_oom_vote, 1); } +/* + * The OOM killer is the page daemon's action of last resort when + * memory allocation requests have been stalled for a prolonged period + * of time because it cannot reclaim memory. This function computes + * the approximate number of physical pages that could be reclaimed if + * the specified address space is destroyed. + * + * Private, anonymous memory owned by the address space is the + * principal resource that we expect to recover after an OOM kill. 
+ * Since the physical pages mapped by the address space's COW entries + * are typically shared pages, they are unlikely to be released and so + * they are not counted. + * + * To get to the point where the page daemon runs the OOM killer, its + * efforts to write-back vnode-backed pages may have stalled. This + * could be caused by a memory allocation deadlock in the write path + * that might be resolved by an OOM kill. Therefore, physical pages + * belonging to vnode-backed objects are counted, because they might + * be freed without being written out first if the address space holds + * the last reference to an unlinked vnode. + * + * Similarly, physical pages belonging to OBJT_PHYS objects are + * counted because the address space might hold the last reference to + * the object. + */ +static long +vm_pageout_oom_pagecount(struct vmspace *vmspace) +{ + vm_map_t map; + vm_map_entry_t entry; + vm_object_t obj; + long res; + + map = &vmspace->vm_map; + KASSERT(!map->system_map, ("system map")); + sx_assert(&map->lock, SA_LOCKED); + res = 0; + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) + continue; + obj = entry->object.vm_object; + if (obj == NULL) + continue; + if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && + obj->ref_count != 1) + continue; + switch (obj->type) { + case OBJT_DEFAULT: + case OBJT_SWAP: + case OBJT_PHYS: + case OBJT_VNODE: + res += obj->resident_page_count; + break; + } + } + return (res); +} + void vm_pageout_oom(int shortage) { @@ -1570,7 +1644,8 @@ vm_pageout_oom(int shortage) if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && - !TD_IS_SUSPENDED(td)) { + !TD_IS_SUSPENDED(td) && + !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = 1; break; @@ -1598,12 +1673,13 @@ vm_pageout_oom(int shortage) } PROC_UNLOCK(p); size = vmspace_swap_count(vm); - vm_map_unlock_read(&vm->vm_map); if (shortage == VM_OOM_MEM) - size += vmspace_resident_count(vm); 
+ size += vm_pageout_oom_pagecount(vm); + vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); + /* - * if the this process is bigger than the biggest one + * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { diff --git a/sys/x86/xen/hvm.c b/sys/x86/xen/hvm.c index 1986f6f..6c6f153 100644 --- a/sys/x86/xen/hvm.c +++ b/sys/x86/xen/hvm.c @@ -72,7 +72,6 @@ static driver_filter_t xen_cpustop_handler; static driver_filter_t xen_cpususpend_handler; static driver_filter_t xen_cpustophard_handler; static void xen_ipi_vectored(u_int vector, int dest); -static void xen_hvm_cpu_resume(void); #endif static void xen_hvm_cpu_init(void); @@ -84,9 +83,6 @@ extern void pmap_lazyfix_action(void); extern int pmap_pcid_enabled; #endif -/* Variables used by mp_machdep to perform the bitmap IPI */ -extern volatile u_int cpu_ipi_pending[MAXCPU]; - /*---------------------------------- Macros ----------------------------------*/ #define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) @@ -110,7 +106,7 @@ enum xen_domain_type xen_domain_type = XEN_NATIVE; struct cpu_ops xen_hvm_cpu_ops = { .ipi_vectored = lapic_ipi_vectored, .cpu_init = xen_hvm_cpu_init, - .cpu_resume = xen_hvm_cpu_resume + .cpu_resume = xen_hvm_cpu_init }; #endif @@ -312,21 +308,6 @@ xen_ipi_vectored(u_int vector, int dest) /*---------------------- XEN diverged cpu operations -------------------------*/ static void -xen_hvm_cpu_resume(void) -{ - u_int cpuid = PCPU_GET(cpuid); - - /* - * Reset pending bitmap IPIs, because Xen doesn't preserve pending - * event channels on migration. - */ - cpu_ipi_pending[cpuid] = 0; - - /* register vcpu_info area */ - xen_hvm_cpu_init(); -} - -static void xen_cpu_ipi_init(int cpu) { xen_intr_handle_t *ipi_handle; |