author     peter <peter@FreeBSD.org>  2002-02-25 23:49:51 +0000
committer  peter <peter@FreeBSD.org>  2002-02-25 23:49:51 +0000
commit     748d0e116728aaecf95d1e3ca10bfe40045b88b8 (patch)
tree       0754b996bbf402ca335dd8c6d902bac23f681df8 /sys/amd64
parent     06f86e63e411dfaa5655cb6527e06c115aa3e97d (diff)
download   FreeBSD-src-748d0e116728aaecf95d1e3ca10bfe40045b88b8.zip
           FreeBSD-src-748d0e116728aaecf95d1e3ca10bfe40045b88b8.tar.gz
Work-in-progress commit syncing up pmap cleanups that I have been working
on for a while:
- fine grained TLB shootdown for SMP on i386
- ranged TLB shootdowns, e.g. specify a range of pages to shoot down with
  a single IPI, since the IPI is very expensive.  Adjust some callers
  that used to trigger this inside tight loops to do a ranged shootdown
  at the end instead.
- PG_G support for SMP on i386 (options ENABLE_PG_G)
- defer PG_G activation till after we decide what we are going to do with
  PSE and the 4MB pages at the start of the kernel.  This should solve
  some rumored strangeness about stale PG_G entries getting stuck
  underneath the 4MB pages.
- add some instrumentation for the fine TLB shootdown
- convert some asm instruction wrappers from functions to inlines.  gcc
  seems to do a fair bit better with this.
- [temporarily!] pessimize the TLB shootdown IPI handlers.  I will fix
  this again shortly.

This has been working fairly well for me for a while, but I have tweaked
it again prior to commit since my last major testing round.  The only
outstanding problem that I know of is PG_G related, which is why there is
an option for it (not on by default for SMP).  I have seen world speedups
of a few percent (as much as 4 or 5% in one case) but I have *not*
accurately measured this - I am a bit sceptical of these numbers.
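
The caller-side pattern described above (batching a tight loop of per-page
shootdowns into a single ranged shootdown at the end) looks roughly like the
sketch below.  This is illustrative only: invlpg_range() is the interface
added by this commit, while the caller name and loop body are invented for
the example.

/* Hypothetical caller; invlpg_range() is defined later in this diff. */
void
update_mapping_range(u_int sva, u_int eva)
{
	u_int va;

	for (va = sva; va < eva; va += PAGE_SIZE) {
		/* ... rewrite the page table entry for va ... */
	}
	/*
	 * One ranged invalidation - and, under SMP, a single Xinvlrng
	 * IPI - instead of one shootdown per iteration of the loop.
	 */
	invlpg_range(sva, eva);
}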
Diffstat (limited to 'sys/amd64')
-rw-r--r--   sys/amd64/amd64/apic_vector.S    94
-rw-r--r--   sys/amd64/amd64/locore.S         23
-rw-r--r--   sys/amd64/amd64/locore.s         23
-rw-r--r--   sys/amd64/amd64/mp_machdep.c    211
-rw-r--r--   sys/amd64/amd64/mptable.c       211
-rw-r--r--   sys/amd64/amd64/pmap.c          372
-rw-r--r--   sys/amd64/amd64/support.S        36
-rw-r--r--   sys/amd64/amd64/support.s        36
-rw-r--r--   sys/amd64/include/cpufunc.h     244
-rw-r--r--   sys/amd64/include/mptable.h     211
-rw-r--r--   sys/amd64/include/pmap.h          2
-rw-r--r--   sys/amd64/include/smp.h           3
-rw-r--r--   sys/amd64/isa/intr_machdep.c      8
-rw-r--r--   sys/amd64/isa/intr_machdep.h     20
-rw-r--r--   sys/amd64/isa/nmi.c               8
15 files changed, 1024 insertions, 478 deletions
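
For orientation before the diff itself: the handshake that replaces the old
fire-and-forget TLB IPI works roughly as outlined below.  This mirrors the
smp_tlb_shootdown() added in mp_machdep.c further down; it is a simplified
summary for the reader, not additional code from the commit.

static void
smp_tlb_shootdown(u_int vector, u_int addr1, u_int addr2)
{
	u_int ncpu = mp_ncpus - 1;	/* never shoot down ourselves */

	if (ncpu < 1)
		return;			/* no other cpus */
	mtx_lock_spin(&smp_tlb_mtx);	/* serialize use of the globals */
	smp_tlb_addr1 = addr1;		/* page (or range start) for handlers */
	smp_tlb_addr2 = addr2;		/* range end, used by Xinvlrng */
	smp_tlb_wait = 0;
	ipi_all_but_self(vector);	/* Xinvltlb, Xinvlpg or Xinvlrng */
	/*
	 * Each handler invalidates, writes the local APIC EOI and then
	 * does a locked increment of smp_tlb_wait; spin here until every
	 * other cpu has checked in.
	 */
	while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
		;			/* XXX could cpu_pause() here */
	mtx_unlock_spin(&smp_tlb_mtx);
}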
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 95c9133..e3a37e1 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -181,30 +181,108 @@ Xspuriousint:
iret
/*
- * Handle TLB shootdowns.
+ * Global address space TLB shootdown.
*/
.text
SUPERALIGN_TEXT
.globl Xinvltlb
Xinvltlb:
pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
#ifdef COUNT_XINVLTLB_HITS
pushl %fs
- movl $KPSEL, %eax
+ movl $KPSEL, %eax /* Private space selector */
mov %ax, %fs
movl PCPU(CPUID), %eax
popl %fs
- ss
- incl _xhits(,%eax,4)
+ incl xhits_gbl(,%eax,4)
#endif /* COUNT_XINVLTLB_HITS */
movl %cr3, %eax /* invalidate the TLB */
movl %eax, %cr3
- ss /* stack segment, avoid %ds load */
movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Single page TLB shootdown
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlpg
+Xinvlpg:
+ pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ ss
+ incl xhits_pg(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %eax
+ invlpg (%eax) /* invalidate single page */
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Page range TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlrng
+Xinvlrng:
+ pushl %eax
+ pushl %edx
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_rng(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %edx
+ movl smp_tlb_addr2, %eax
+1: invlpg (%edx) /* invalidate single page */
+ addl $PAGE_SIZE, %edx
+ cmpl %edx, %eax
+ jb 1b
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %edx
popl %eax
iret
@@ -443,12 +521,6 @@ Xrendezvous:
.data
-#ifdef COUNT_XINVLTLB_HITS
- .globl _xhits
-_xhits:
- .space (NCPU * 4), 0
-#endif /* COUNT_XINVLTLB_HITS */
-
.globl apic_pin_trigger
apic_pin_trigger:
.long 0
diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
index 4fff220..299bc3e 100644
--- a/sys/amd64/amd64/locore.S
+++ b/sys/amd64/amd64/locore.S
@@ -381,12 +381,6 @@ begin:
movl IdlePTD,%esi
movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-1:
pushl physfree /* value of first for init386(first) */
call init386 /* wire 386 chip for unix operation */
@@ -809,14 +803,7 @@ no_kernend:
jne map_read_write
#endif
xorl %edx,%edx
-
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- orl $PG_G,%edx
-#endif
-
-2: movl $R(etext),%ecx
+ movl $R(etext),%ecx
addl $PAGE_MASK,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
@@ -827,13 +814,7 @@ no_kernend:
andl $~PAGE_MASK, %eax
map_read_write:
movl $PG_RW,%edx
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- orl $PG_G,%edx
-#endif
-
-1: movl R(KERNend),%ecx
+ movl R(KERNend),%ecx
subl %eax,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s
index 4fff220..299bc3e 100644
--- a/sys/amd64/amd64/locore.s
+++ b/sys/amd64/amd64/locore.s
@@ -381,12 +381,6 @@ begin:
movl IdlePTD,%esi
movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-1:
pushl physfree /* value of first for init386(first) */
call init386 /* wire 386 chip for unix operation */
@@ -809,14 +803,7 @@ no_kernend:
jne map_read_write
#endif
xorl %edx,%edx
-
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- orl $PG_G,%edx
-#endif
-
-2: movl $R(etext),%ecx
+ movl $R(etext),%ecx
addl $PAGE_MASK,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
@@ -827,13 +814,7 @@ no_kernend:
andl $~PAGE_MASK, %eax
map_read_write:
movl $PG_RW,%edx
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- orl $PG_G,%edx
-#endif
-
-1: movl R(KERNend),%ecx
+ movl R(KERNend),%ecx
subl %eax,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 27ee7ae..008dfc5 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -287,6 +287,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+u_int smp_tlb_addr1;
+u_int smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -335,6 +343,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", MTX_SPIN);
+#endif
}
/*
@@ -604,6 +615,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2186,42 +2201,198 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
+#endif
+
/*
* Flush the TLB on all other CPU's
- *
- * XXX: Needs to handshake and wait for completion before proceding.
*/
+static void
+smp_tlb_shootdown(u_int vector, u_int addr1, u_int addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ ipi_all_but_self(vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, u_int addr1, u_int addr2)
+{
+ u_int m;
+ int i, ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ /* Count the target cpus */
+ ncpu = 0;
+ m = mask;
+ while ((i = ffs(m)) != 0) {
+ m >>= i;
+ ncpu++;
+ }
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(u_int addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, u_int addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2280,6 +2451,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2312,7 +2486,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c
index 27ee7ae..008dfc5 100644
--- a/sys/amd64/amd64/mptable.c
+++ b/sys/amd64/amd64/mptable.c
@@ -287,6 +287,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+u_int smp_tlb_addr1;
+u_int smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -335,6 +343,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", MTX_SPIN);
+#endif
}
/*
@@ -604,6 +615,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2186,42 +2201,198 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
+#endif
+
/*
* Flush the TLB on all other CPU's
- *
- * XXX: Needs to handshake and wait for completion before proceding.
*/
+static void
+smp_tlb_shootdown(u_int vector, u_int addr1, u_int addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ ipi_all_but_self(vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, u_int addr1, u_int addr2)
+{
+ u_int m;
+ int i, ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ /* Count the target cpus */
+ ncpu = 0;
+ m = mask;
+ while ((i = ffs(m)) != 0) {
+ m >>= i;
+ ncpu++;
+ }
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(u_int addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, u_int addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2280,6 +2451,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2312,7 +2486,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index f12cb0b..ba3ee22 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -85,6 +85,9 @@
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
+#if defined(SMP)
+#include <sys/smp.h>
+#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -101,7 +104,6 @@
#include <machine/md_var.h>
#include <machine/specialreg.h>
#if defined(SMP) || defined(APIC_IO)
-#include <machine/smp.h>
#include <machine/apic.h>
#include <machine/segments.h>
#include <machine/tss.h>
@@ -259,10 +261,10 @@ static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
vm_offset_t newaddr = addr;
+
#ifndef DISABLE_PSE
- if (cpu_feature & CPUID_PSE) {
+ if (cpu_feature & CPUID_PSE)
newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
- }
#endif
return newaddr;
}
@@ -367,10 +369,9 @@ pmap_bootstrap(firstaddr, loadaddr)
PTD[i] = 0;
pgeflag = 0;
-#if !defined(SMP) /* XXX - see also mp_machdep.c */
- if (cpu_feature & CPUID_PGE) {
+#if !defined(SMP) || defined(ENABLE_PG_G)
+ if (cpu_feature & CPUID_PGE)
pgeflag = PG_G;
- }
#endif
/*
@@ -383,7 +384,7 @@ pmap_bootstrap(firstaddr, loadaddr)
*/
pdir4mb = 0;
-#if !defined(DISABLE_PSE)
+#ifndef DISABLE_PSE
if (cpu_feature & CPUID_PSE) {
pd_entry_t ptditmp;
/*
@@ -394,57 +395,64 @@ pmap_bootstrap(firstaddr, loadaddr)
ptditmp &= ~(NBPDR - 1);
ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
pdir4mb = ptditmp;
-
-#if !defined(SMP)
- /*
- * Enable the PSE mode.
- */
- load_cr4(rcr4() | CR4_PSE);
-
- /*
- * We can do the mapping here for the single processor
- * case. We simply ignore the old page table page from
- * now on.
- */
- /*
- * For SMP, we still need 4K pages to bootstrap APs,
- * PSE will be enabled as soon as all APs are up.
- */
- PTD[KPTDI] = (pd_entry_t) ptditmp;
- kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
- invltlb();
-#endif
}
#endif
-
+#ifndef SMP
+ /*
+ * Turn on PGE/PSE. SMP does this later on since the
+ * 4K page tables are required for AP boot (for now).
+ * XXX fixme.
+ */
+ pmap_set_opt();
+#endif
#ifdef SMP
if (cpu_apic_address == 0)
panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
-
/* local apic is mapped on last page */
SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
(cpu_apic_address & PG_FRAME));
#endif
-
- invltlb();
+ cpu_invltlb();
}
-#ifdef SMP
/*
- * Set 4mb pdir for mp startup
+ * Enable 4MB page mode for MP startup. Turn on PG_G support.
+ * BSP will run this after all the AP's have started up.
*/
void
pmap_set_opt(void)
{
- if (pseflag && (cpu_feature & CPUID_PSE)) {
+ pt_entry_t *pte;
+ vm_offset_t va;
+
+ if (pgeflag && (cpu_feature & CPUID_PGE))
+ load_cr4(rcr4() | CR4_PGE);
+#ifndef DISABLE_PSE
+ if (pseflag && (cpu_feature & CPUID_PSE))
load_cr4(rcr4() | CR4_PSE);
- if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */
+#endif
+ if (PCPU_GET(cpuid) == 0) {
+#ifndef DISABLE_PSE
+ if (pdir4mb)
kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
- cpu_invltlb();
+#endif
+ if (pgeflag) {
+ /* XXX see earlier comments about virtual_avail */
+ for (va = KERNBASE; va < virtual_avail; va += PAGE_SIZE)
+ {
+ pte = vtopte(va);
+ if (*pte)
+ *pte |= pgeflag;
+ }
}
- }
+ /*
+ * for SMP, this will cause all cpus to reload again, which
+ * is actually what we want since they now have CR4_PGE on.
+ */
+ invltlb();
+ } else
+ cpu_invltlb();
}
-#endif
/*
* Initialize the pmap module.
@@ -552,27 +560,37 @@ pmap_track_modified(vm_offset_t va)
return 0;
}
-static PMAP_INLINE void
-invltlb_1pg(vm_offset_t va)
-{
-#ifdef I386_CPU
- invltlb();
-#else
- invlpg(va);
-#endif
-}
-
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
+ u_int cpumask;
+ u_int other_cpus;
+ struct thread *td;
+
+ td = curthread;
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (td->td_critnest == 1)
+ cpu_critical_exit(td->td_savecrit);
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invlpg(va); /* global */
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ cpu_invlpg(va);
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invlpg(pmap->pm_active & other_cpus, va);
+ }
+ critical_exit();
#else
if (pmap->pm_active)
- invltlb_1pg(va);
+ cpu_invlpg(va);
#endif
}
@@ -580,10 +598,30 @@ static __inline void
pmap_invalidate_all(pmap_t pmap)
{
#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invltlb();
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
+ u_int cpumask;
+ u_int other_cpus;
+ struct thread *td;
+
+ td = curthread;
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (td->td_critnest == 1)
+ cpu_critical_exit(td->td_savecrit);
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invltlb(); /* global */
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ cpu_invltlb();
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invltlb(pmap->pm_active & other_cpus);
+ }
+ critical_exit();
#else
if (pmap->pm_active)
invltlb();
@@ -609,12 +647,7 @@ get_ptbase(pmap)
/* otherwise, we are alternate address space */
if (frame != (APTDpde & PG_FRAME)) {
APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
invltlb();
-#endif
}
return APTmap;
}
@@ -643,7 +676,7 @@ pmap_pte_quick(pmap, va)
newpf = pde & PG_FRAME;
if (((*PMAP1) & PG_FRAME) != newpf) {
*PMAP1 = newpf | PG_RW | PG_V;
- invltlb_1pg((vm_offset_t) PADDR1);
+ pmap_invalidate_page(pmap, (vm_offset_t) PADDR1);
}
return PADDR1 + (index & (NPTEPG - 1));
}
@@ -689,20 +722,17 @@ pmap_extract(pmap, va)
/*
* add a wired page to the kva
- * note that in order for the mapping to take effect -- you
- * should do a invltlb after doing the pmap_kenter...
*/
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_offset_t pa)
{
pt_entry_t *pte;
- pt_entry_t npte, opte;
+ pt_entry_t npte;
npte = pa | PG_RW | PG_V | pgeflag;
pte = vtopte(va);
- opte = *pte;
*pte = npte;
- invltlb_1pg(va);
+ invlpg(va);
}
/*
@@ -715,7 +745,7 @@ pmap_kremove(vm_offset_t va)
pte = vtopte(va);
*pte = 0;
- invltlb_1pg(va);
+ invlpg(va);
}
/*
@@ -733,13 +763,17 @@ pmap_kremove(vm_offset_t va)
vm_offset_t
pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
{
- vm_offset_t sva = *virt;
- vm_offset_t va = sva;
+ vm_offset_t va, sva;
+ pt_entry_t *pte;
+
+ va = sva = *virt;
while (start < end) {
- pmap_kenter(va, start);
+ pte = vtopte(va);
+ *pte = start | PG_RW | PG_V | pgeflag;
va += PAGE_SIZE;
start += PAGE_SIZE;
}
+ invlpg_range(sva, end);
*virt = va;
return (sva);
}
@@ -754,28 +788,21 @@ pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
* over. The page *must* be wired.
*/
void
-pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
+pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
{
- vm_offset_t end_va;
+ vm_offset_t va, end_va;
+ pt_entry_t *pte;
+ va = sva;
end_va = va + count * PAGE_SIZE;
-
- while (va < end_va) {
- pt_entry_t *pte;
+ while (va < end_va) {
pte = vtopte(va);
*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
va += PAGE_SIZE;
m++;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ invlpg_range(sva, end_va);
}
/*
@@ -783,27 +810,20 @@ pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
* kernel -- it is meant only for temporary mappings.
*/
void
-pmap_qremove(vm_offset_t va, int count)
+pmap_qremove(vm_offset_t sva, int count)
{
- vm_offset_t end_va;
+ pt_entry_t *pte;
+ vm_offset_t va, end_va;
- end_va = va + count*PAGE_SIZE;
+ va = sva;
+ end_va = va + count * PAGE_SIZE;
while (va < end_va) {
- pt_entry_t *pte;
-
pte = vtopte(va);
*pte = 0;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
va += PAGE_SIZE;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ invlpg_range(sva, end_va);
}
static vm_page_t
@@ -824,9 +844,6 @@ retry:
void
pmap_new_proc(struct proc *p)
{
-#ifdef I386_CPU
- int updateneeded = 0;
-#endif
int i;
vm_object_t upobj;
vm_offset_t up;
@@ -870,23 +887,14 @@ pmap_new_proc(struct proc *p)
* Enter the page into the kernel address space.
*/
*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
+ if (oldpte)
invlpg(up + i * PAGE_SIZE);
-#endif
- }
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
-#ifdef I386_CPU
- if (updateneeded)
- invltlb();
-#endif
}
/*
@@ -901,7 +909,7 @@ pmap_dispose_proc(p)
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
- pt_entry_t *ptek, oldpte;
+ pt_entry_t *ptek;
upobj = p->p_upages_obj;
up = (vm_offset_t)p->p_uarea;
@@ -911,17 +919,11 @@ pmap_dispose_proc(p)
if (m == NULL)
panic("pmap_dispose_proc: upage already missing?");
vm_page_busy(m);
- oldpte = *(ptek + i);
*(ptek + i) = 0;
-#ifndef I386_CPU
invlpg(up + i * PAGE_SIZE);
-#endif
vm_page_unwire(m, 0);
vm_page_free(m);
}
-#ifdef I386_CPU
- invltlb();
-#endif
}
/*
@@ -986,9 +988,6 @@ pmap_swapin_proc(p)
void
pmap_new_thread(struct thread *td)
{
-#ifdef I386_CPU
- int updateneeded = 0;
-#endif
int i;
vm_object_t ksobj;
vm_page_t m;
@@ -1019,13 +1018,8 @@ pmap_new_thread(struct thread *td)
ptek = vtopte(ks - PAGE_SIZE);
oldpte = *ptek;
*ptek = 0;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
+ if (oldpte)
invlpg(ks - PAGE_SIZE);
-#endif
- }
ptek++;
#else
/* get a kernel virtual address for the kstack for this thread */
@@ -1055,23 +1049,14 @@ pmap_new_thread(struct thread *td)
* Enter the page into the kernel address space.
*/
*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
+ if (oldpte)
invlpg(ks + i * PAGE_SIZE);
-#endif
- }
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
-#ifdef I386_CPU
- if (updateneeded)
- invltlb();
-#endif
}
/*
@@ -1086,7 +1071,7 @@ pmap_dispose_thread(td)
vm_object_t ksobj;
vm_offset_t ks;
vm_page_t m;
- pt_entry_t *ptek, oldpte;
+ pt_entry_t *ptek;
ksobj = td->td_kstack_obj;
ks = td->td_kstack;
@@ -1096,17 +1081,11 @@ pmap_dispose_thread(td)
if (m == NULL)
panic("pmap_dispose_thread: kstack already missing?");
vm_page_busy(m);
- oldpte = *(ptek + i);
*(ptek + i) = 0;
-#ifndef I386_CPU
invlpg(ks + i * PAGE_SIZE);
-#endif
vm_page_unwire(m, 0);
vm_page_free(m);
}
-#ifdef I386_CPU
- invltlb();
-#endif
}
/*
@@ -2207,13 +2186,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
if ((origpte & PG_RW) == 0) {
*pte |= PG_RW;
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
return;
}
@@ -2281,13 +2254,7 @@ validate:
if ((origpte & ~(PG_M|PG_A)) != newpte) {
*pte = newpte | PG_A;
/*if (origpte)*/ {
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
}
}
@@ -2710,7 +2677,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t pdnxt;
pd_entry_t src_frame, dst_frame;
vm_page_t m;
- pd_entry_t saved_pde;
if (dst_addr != src_addr)
return;
@@ -2720,17 +2686,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
return;
dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
- if (dst_frame != (APTDpde & PG_FRAME)) {
- APTDpde = dst_frame | PG_RW | PG_V;
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
- saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
- for(addr = src_addr; addr < end_addr; addr = pdnxt) {
+ for (addr = src_addr; addr < end_addr; addr = pdnxt) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpte, srcmpte;
pd_entry_t srcptepaddr;
@@ -2771,6 +2727,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
if (pdnxt > end_addr)
pdnxt = end_addr;
+ /*
+ * Have to recheck this before every avtopte() call below
+ * in case we have blocked and something else used APTDpde.
+ */
+ if (dst_frame != (APTDpde & PG_FRAME)) {
+ APTDpde = dst_frame | PG_RW | PG_V;
+ invltlb();
+ }
src_pte = vtopte(addr);
dst_pte = avtopte(addr);
while (addr < pdnxt) {
@@ -2786,16 +2750,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
* block.
*/
dstmpte = pmap_allocpte(dst_pmap, addr);
- if ((APTDpde & PG_FRAME) !=
- (saved_pde & PG_FRAME)) {
- APTDpde = saved_pde;
-printf ("IT HAPPENNED!");
-#if defined(SMP)
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
/*
* Clear the modified and
@@ -2839,12 +2793,15 @@ void
pmap_zero_page(vm_offset_t phys)
{
+#ifdef SMP
+ /* XXX overkill, we only want to disable migration here */
+ /* XXX or maybe not. down the track we have reentrancy issues */
+ critical_enter();
+#endif
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ cpu_invlpg((vm_offset_t)CADDR2); /* SMP: local cpu only */
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686)
i686_pagezero(CADDR2);
@@ -2852,6 +2809,9 @@ pmap_zero_page(vm_offset_t phys)
#endif
bzero(CADDR2, PAGE_SIZE);
*CMAP2 = 0;
+#ifdef SMP
+ critical_exit();
+#endif
}
/*
@@ -2864,12 +2824,15 @@ void
pmap_zero_page_area(vm_offset_t phys, int off, int size)
{
+#ifdef SMP
+ /* XXX overkill, we only want to disable migration here */
+ /* XXX or maybe not. down the track we have reentrancy issues */
+ critical_enter();
+#endif
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ cpu_invlpg((vm_offset_t)CADDR2); /* SMP: local cpu only */
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
i686_pagezero(CADDR2);
@@ -2877,6 +2840,9 @@ pmap_zero_page_area(vm_offset_t phys, int off, int size)
#endif
bzero((char *)CADDR2 + off, size);
*CMAP2 = 0;
+#ifdef SMP
+ critical_exit();
+#endif
}
/*
@@ -2889,6 +2855,11 @@ void
pmap_copy_page(vm_offset_t src, vm_offset_t dst)
{
+#ifdef SMP
+ /* XXX overkill, we only want to disable migration here */
+ /* XXX or maybe not. down the track we have reentrancy issues */
+ critical_enter();
+#endif
if (*CMAP1)
panic("pmap_copy_page: CMAP1 busy");
if (*CMAP2)
@@ -2896,17 +2867,14 @@ pmap_copy_page(vm_offset_t src, vm_offset_t dst)
*CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
*CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
-#ifdef I386_CPU
- invltlb();
-#else
- invlpg((u_int)CADDR1);
- invlpg((u_int)CADDR2);
-#endif
-
+ cpu_invlpg((u_int)CADDR1); /* SMP: local only */
+ cpu_invlpg((u_int)CADDR2); /* SMP: local only */
bcopy(CADDR1, CADDR2, PAGE_SIZE);
-
*CMAP1 = 0;
*CMAP2 = 0;
+#ifdef SMP
+ critical_exit();
+#endif
}
@@ -3322,14 +3290,13 @@ pmap_mapdev(pa, size)
panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
pa = pa & PG_FRAME;
- for (tmpva = va; size > 0;) {
+ for (tmpva = va; size > 0; ) {
pte = vtopte(tmpva);
*pte = pa | PG_RW | PG_V | pgeflag;
size -= PAGE_SIZE;
tmpva += PAGE_SIZE;
- pa += PAGE_SIZE;
}
- invltlb();
+ invlpg_range(va, tmpva);
return ((void *)(va + offset));
}
@@ -3339,11 +3306,20 @@ pmap_unmapdev(va, size)
vm_offset_t va;
vm_size_t size;
{
- vm_offset_t base, offset;
+ vm_offset_t base, offset, tmpva;
+ pt_entry_t *pte;
base = va & PG_FRAME;
offset = va & PAGE_MASK;
size = roundup(offset + size, PAGE_SIZE);
+
+ for (tmpva = base; size > 0; ) {
+ pte = vtopte(tmpva);
+ *pte = 0;
+ size -= PAGE_SIZE;
+ tmpva += PAGE_SIZE;
+ }
+ invlpg_range(va, tmpva);
kmem_free(kernel_map, base, size);
}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index bc58672..0649009 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1591,42 +1591,6 @@ ENTRY(ssdtosd)
popl %ebx
ret
-/* load_cr0(cr0) */
-ENTRY(load_cr0)
- movl 4(%esp),%eax
- movl %eax,%cr0
- ret
-
-/* rcr0() */
-ENTRY(rcr0)
- movl %cr0,%eax
- ret
-
-/* rcr3() */
-ENTRY(rcr3)
- movl %cr3,%eax
- ret
-
-/* void load_cr3(caddr_t cr3) */
-ENTRY(load_cr3)
-#ifdef SWTCH_OPTIM_STATS
- incl tlb_flush_count
-#endif
- movl 4(%esp),%eax
- movl %eax,%cr3
- ret
-
-/* rcr4() */
-ENTRY(rcr4)
- movl %cr4,%eax
- ret
-
-/* void load_cr4(caddr_t cr4) */
-ENTRY(load_cr4)
- movl 4(%esp),%eax
- movl %eax,%cr4
- ret
-
/* void reset_dbregs() */
ENTRY(reset_dbregs)
movl $0,%eax
diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s
index bc58672..0649009 100644
--- a/sys/amd64/amd64/support.s
+++ b/sys/amd64/amd64/support.s
@@ -1591,42 +1591,6 @@ ENTRY(ssdtosd)
popl %ebx
ret
-/* load_cr0(cr0) */
-ENTRY(load_cr0)
- movl 4(%esp),%eax
- movl %eax,%cr0
- ret
-
-/* rcr0() */
-ENTRY(rcr0)
- movl %cr0,%eax
- ret
-
-/* rcr3() */
-ENTRY(rcr3)
- movl %cr3,%eax
- ret
-
-/* void load_cr3(caddr_t cr3) */
-ENTRY(load_cr3)
-#ifdef SWTCH_OPTIM_STATS
- incl tlb_flush_count
-#endif
- movl 4(%esp),%eax
- movl %eax,%cr3
- ret
-
-/* rcr4() */
-ENTRY(rcr4)
- movl %cr4,%eax
- ret
-
-/* void load_cr4(caddr_t cr4) */
-ENTRY(load_cr4)
- movl 4(%esp),%eax
- movl %eax,%cr4
- ret
-
/* void reset_dbregs() */
ENTRY(reset_dbregs)
movl $0,%eax
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index 969541f..94d5c3a 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -227,62 +227,6 @@ invd(void)
__asm __volatile("invd");
}
-#if defined(SMP) && defined(_KERNEL)
-
-/*
- * When using APIC IPI's, invlpg() is not simply the invlpg instruction
- * (this is a bug) and the inlining cost is prohibitive since the call
- * executes into the IPI transmission system.
- */
-void invlpg __P((u_int addr));
-void invltlb __P((void));
-
-static __inline void
-cpu_invlpg(void *addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-cpu_invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#if defined(SWTCH_OPTIM_STATS)
- ++tlb_flush_count;
-#endif
-}
-
-#else /* !(SMP && _KERNEL) */
-
-static __inline void
-invlpg(u_int addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#ifdef SWTCH_OPTIM_STATS
- ++tlb_flush_count;
-#endif
-}
-
-#endif /* SMP && _KERNEL */
-
static __inline u_short
inw(u_int port)
{
@@ -348,15 +292,6 @@ outw(u_int port, u_short data)
}
static __inline u_int
-rcr2(void)
-{
- u_int data;
-
- __asm __volatile("movl %%cr2,%0" : "=r" (data));
- return (data);
-}
-
-static __inline u_int
read_eflags(void)
{
u_int ef;
@@ -420,6 +355,162 @@ wrmsr(u_int msr, u_int64_t newval)
__asm __volatile("wrmsr" : : "A" (newval), "c" (msr));
}
+static __inline void
+load_cr0(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_int
+rcr0(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_int
+rcr2(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr2,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr3(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory");
+#if defined(SWTCH_OPTIM_STATS)
+ ++tlb_flush_count;
+#endif
+}
+
+static __inline u_int
+rcr3(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr4(u_int data)
+{
+ __asm __volatile("movl %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_int
+rcr4(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr4,%0" : "=r" (data));
+ return (data);
+}
+
+/*
+ * Global TLB flush (except for thise for pages marked PG_G)
+ */
+static __inline void
+cpu_invltlb(void)
+{
+
+ load_cr3(rcr3());
+}
+
+/*
+ * TLB flush for an individual page (even if it has PG_G).
+ * Only works on 486+ CPUs (i386 does not have PG_G).
+ */
+static __inline void
+cpu_invlpg(u_int addr)
+{
+
+#ifndef I386_CPU
+ __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
+#else
+ cpu_invltlb();
+#endif
+}
+
+#ifdef PAGE_SIZE /* Avoid this file depending on sys/param.h */
+/*
+ * Same as above but for a range of pages.
+ */
+static __inline void
+cpu_invlpg_range(u_int startva, u_int endva)
+{
+#ifndef I386_CPU
+ u_int addr;
+
+ for (addr = startva; addr < endva; addr += PAGE_SIZE)
+ __asm __volatile("invlpg %0" : : "m" (*(char *)addr));
+ __asm __volatile("" : : : "memory");
+#else
+ cpu_invltlb();
+#endif
+}
+#endif
+
+#ifdef SMP
+extern void smp_invlpg(u_int addr);
+extern void smp_masked_invlpg(u_int mask, u_int addr);
+#ifdef PAGE_SIZE /* Avoid this file depending on sys/param.h */
+extern void smp_invlpg_range(u_int startva, u_int endva);
+extern void smp_masked_invlpg_range(u_int mask, u_int startva, u_int endva);
+#endif
+extern void smp_invltlb(void);
+extern void smp_masked_invltlb(u_int mask);
+#endif
+
+/*
+ * Generic page TLB flush. Takes care of SMP.
+ */
+static __inline void
+invlpg(u_int addr)
+{
+
+ cpu_invlpg(addr);
+#ifdef SMP
+ smp_invlpg(addr);
+#endif
+}
+
+#ifdef PAGE_SIZE /* Avoid this file depending on sys/param.h */
+/*
+ * Generic TLB flush for a range of pages. Takes care of SMP.
+ * Saves many IPIs for SMP mode.
+ */
+static __inline void
+invlpg_range(u_int startva, u_int endva)
+{
+
+ cpu_invlpg_range(startva, endva);
+#ifdef SMP
+ smp_invlpg_range(startva, endva);
+#endif
+}
+#endif
+
+/*
+ * Generic global TLB flush (except for thise for pages marked PG_G)
+ */
+static __inline void
+invltlb(void)
+{
+
+ cpu_invltlb();
+#ifdef SMP
+ smp_invltlb();
+#endif
+}
+
static __inline u_int
rfs(void)
{
@@ -581,6 +672,8 @@ cpu_critical_exit(critical_t eflags)
int breakpoint __P((void));
u_int bsfl __P((u_int mask));
u_int bsrl __P((u_int mask));
+void cpu_invlpg __P((u_int addr));
+void cpu_invlpg_range __P((u_int start, u_int end));
void disable_intr __P((void));
void do_cpuid __P((u_int ax, u_int *p));
void enable_intr __P((void));
@@ -591,15 +684,26 @@ void insl __P((u_int port, void *addr, size_t cnt));
void insw __P((u_int port, void *addr, size_t cnt));
void invd __P((void));
void invlpg __P((u_int addr));
+void invlpg_range __P((u_int start, u_int end));
void invltlb __P((void));
u_short inw __P((u_int port));
+void load_cr0 __P((u_int cr0));
+void load_cr3 __P((u_int cr3));
+void load_cr4 __P((u_int cr4));
+void load_fs __P((u_int sel));
+void load_gs __P((u_int sel));
void outb __P((u_int port, u_char data));
void outl __P((u_int port, u_int data));
void outsb __P((u_int port, void *addr, size_t cnt));
void outsl __P((u_int port, void *addr, size_t cnt));
void outsw __P((u_int port, void *addr, size_t cnt));
void outw __P((u_int port, u_short data));
+u_int rcr0 __P((void));
u_int rcr2 __P((void));
+u_int rcr3 __P((void));
+u_int rcr4 __P((void));
+u_int rfs __P((void));
+u_int rgs __P((void));
u_int64_t rdmsr __P((u_int msr));
u_int64_t rdpmc __P((u_int pmc));
u_int64_t rdtsc __P((void));
@@ -607,22 +711,12 @@ u_int read_eflags __P((void));
void wbinvd __P((void));
void write_eflags __P((u_int ef));
void wrmsr __P((u_int msr, u_int64_t newval));
-u_int rfs __P((void));
-u_int rgs __P((void));
-void load_fs __P((u_int sel));
-void load_gs __P((u_int sel));
critical_t cpu_critical_enter __P((void));
void cpu_critical_exit __P((critical_t eflags));
#endif /* __GNUC__ */
-void load_cr0 __P((u_int cr0));
-void load_cr3 __P((u_int cr3));
-void load_cr4 __P((u_int cr4));
void ltr __P((u_short sel));
-u_int rcr0 __P((void));
-u_int rcr3 __P((void));
-u_int rcr4 __P((void));
void reset_dbregs __P((void));
__END_DECLS
diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h
index 27ee7ae..008dfc5 100644
--- a/sys/amd64/include/mptable.h
+++ b/sys/amd64/include/mptable.h
@@ -287,6 +287,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+u_int smp_tlb_addr1;
+u_int smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -335,6 +343,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", MTX_SPIN);
+#endif
}
/*
@@ -604,6 +615,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2186,42 +2201,198 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
+#endif
+
/*
* Flush the TLB on all other CPU's
- *
- * XXX: Needs to handshake and wait for completion before proceding.
*/
+static void
+smp_tlb_shootdown(u_int vector, u_int addr1, u_int addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ ipi_all_but_self(vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, u_int addr1, u_int addr2)
+{
+ u_int m;
+ int i, ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ /* Count the target cpus */
+ ncpu = 0;
+ m = mask;
+ while ((i = ffs(m)) != 0) {
+ m >>= i;
+ ncpu++;
+ }
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_wait = 0;
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (atomic_load_acq_int(&smp_tlb_wait) < ncpu)
+ /* XXX cpu_pause() */ ;
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(u_int addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, u_int addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, u_int addr1, u_int addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2280,6 +2451,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2312,7 +2486,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index cb5a24d..618bb3f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -267,9 +267,7 @@ void *pmap_mapdev __P((vm_offset_t, vm_size_t));
void pmap_unmapdev __P((vm_offset_t, vm_size_t));
pt_entry_t *pmap_pte __P((pmap_t, vm_offset_t)) __pure2;
vm_page_t pmap_use_pt __P((pmap_t, vm_offset_t));
-#ifdef SMP
void pmap_set_opt __P((void));
-#endif
#endif /* _KERNEL */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 34228e2..4136c20 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -51,6 +51,8 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */
* Interprocessor interrupts for SMP.
*/
#define IPI_INVLTLB XINVLTLB_OFFSET
+#define IPI_INVLPG XINVLPG_OFFSET
+#define IPI_INVLRNG XINVLRNG_OFFSET
#define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET
#define IPI_AST XCPUAST_OFFSET
#define IPI_STOP XCPUSTOP_OFFSET
@@ -107,7 +109,6 @@ void assign_apic_irq __P((int apic, int intpin, int irq));
void revoke_apic_irq __P((int irq));
void bsp_apic_configure __P((void));
void init_secondary __P((void));
-void smp_invltlb __P((void));
void forward_statclock __P((void));
void forwarded_statclock __P((struct trapframe frame));
void forward_hardclock __P((void));
diff --git a/sys/amd64/isa/intr_machdep.c b/sys/amd64/isa/intr_machdep.c
index cfc162b..92bf581 100644
--- a/sys/amd64/isa/intr_machdep.c
+++ b/sys/amd64/isa/intr_machdep.c
@@ -499,14 +499,6 @@ icu_setup(int intr, driver_intr_t *handler, void *arg, int flags)
}
else {
vector = TPR_SLOW_INTS + intr;
-#ifdef APIC_INTR_REORDER
-#ifdef APIC_INTR_HIGHPRI_CLOCK
- /* XXX: Hack (kludge?) for more accurate clock. */
- if (intr == apic_8254_intr || intr == 8) {
- vector = TPR_FAST_INTS + intr;
- }
-#endif
-#endif
setidt(vector, slowintr[intr],
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
}
diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h
index 1726635..789b02b 100644
--- a/sys/amd64/isa/intr_machdep.h
+++ b/sys/amd64/isa/intr_machdep.h
@@ -88,6 +88,7 @@
/* IDT vector base for regular (aka. slow) and fast interrupts */
#define TPR_SLOW_INTS 0x20
#define TPR_FAST_INTS 0x60
+/* XXX note that the AST interrupt is at 0x50 */
/* blocking values for local APIC Task Priority Register */
#define TPR_BLOCK_HWI 0x4f /* hardware INTs */
@@ -104,20 +105,23 @@
#endif /** TEST_TEST1 */
/* TLB shootdowns */
-#define XINVLTLB_OFFSET (ICU_OFFSET + 112)
+#define XINVLTLB_OFFSET (ICU_OFFSET + 112) /* 0x90 */
+#define XINVLPG_OFFSET (ICU_OFFSET + 113) /* 0x91 */
+#define XINVLRNG_OFFSET (ICU_OFFSET + 114) /* 0x92 */
/* inter-cpu clock handling */
-#define XHARDCLOCK_OFFSET (ICU_OFFSET + 113)
-#define XSTATCLOCK_OFFSET (ICU_OFFSET + 114)
+#define XHARDCLOCK_OFFSET (ICU_OFFSET + 120) /* 0x98 */
+#define XSTATCLOCK_OFFSET (ICU_OFFSET + 121) /* 0x99 */
/* inter-CPU rendezvous */
-#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 115)
+#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */
/* IPI to generate an additional software trap at the target CPU */
-#define XCPUAST_OFFSET (ICU_OFFSET + 48)
+/* XXX in the middle of the interrupt range, overlapping IRQ48 */
+#define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */
/* IPI to signal CPUs to stop and wait for another CPU to restart them */
-#define XCPUSTOP_OFFSET (ICU_OFFSET + 128)
+#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) /* 0xA0 */
/*
* Note: this vector MUST be xxxx1111, 32 + 223 = 255 = 0xff:
@@ -181,7 +185,9 @@ inthand_t
IDTVEC(intr28), IDTVEC(intr29), IDTVEC(intr30), IDTVEC(intr31);
inthand_t
- Xinvltlb, /* TLB shootdowns */
+ Xinvltlb, /* TLB shootdowns - global */
+ Xinvlpg, /* TLB shootdowns - 1 page */
+ Xinvlrng, /* TLB shootdowns - page range */
Xhardclock, /* Forward hardclock() */
Xstatclock, /* Forward statclock() */
Xcpuast, /* Additional software trap on other cpu */
diff --git a/sys/amd64/isa/nmi.c b/sys/amd64/isa/nmi.c
index cfc162b..92bf581 100644
--- a/sys/amd64/isa/nmi.c
+++ b/sys/amd64/isa/nmi.c
@@ -499,14 +499,6 @@ icu_setup(int intr, driver_intr_t *handler, void *arg, int flags)
}
else {
vector = TPR_SLOW_INTS + intr;
-#ifdef APIC_INTR_REORDER
-#ifdef APIC_INTR_HIGHPRI_CLOCK
- /* XXX: Hack (kludge?) for more accurate clock. */
- if (intr == apic_8254_intr || intr == 8) {
- vector = TPR_FAST_INTS + intr;
- }
-#endif
-#endif
setidt(vector, slowintr[intr],
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
}