diff options
Diffstat (limited to 'sys/amd64/amd64')
-rw-r--r-- | sys/amd64/amd64/apic_vector.S | 276 | ||||
-rw-r--r-- | sys/amd64/amd64/autoconf.c | 25 | ||||
-rw-r--r-- | sys/amd64/amd64/cpu_switch.S | 27 | ||||
-rw-r--r-- | sys/amd64/amd64/db_interface.c | 37 | ||||
-rw-r--r-- | sys/amd64/amd64/db_trace.c | 5 | ||||
-rw-r--r-- | sys/amd64/amd64/exception.S | 17 | ||||
-rw-r--r-- | sys/amd64/amd64/fpu.c | 3 | ||||
-rw-r--r-- | sys/amd64/amd64/genassym.c | 20 | ||||
-rw-r--r-- | sys/amd64/amd64/identcpu.c | 3 | ||||
-rw-r--r-- | sys/amd64/amd64/io_apic.c | 5 | ||||
-rw-r--r-- | sys/amd64/amd64/local_apic.c | 26 | ||||
-rw-r--r-- | sys/amd64/amd64/machdep.c | 141 | ||||
-rw-r--r-- | sys/amd64/amd64/mem.c | 9 | ||||
-rw-r--r-- | sys/amd64/amd64/mp_machdep.c | 473 | ||||
-rw-r--r-- | sys/amd64/amd64/mpboot.S | 398 | ||||
-rw-r--r-- | sys/amd64/amd64/mptable.c | 11 | ||||
-rw-r--r-- | sys/amd64/amd64/nexus.c | 12 | ||||
-rw-r--r-- | sys/amd64/amd64/pmap.c | 233 | ||||
-rw-r--r-- | sys/amd64/amd64/support.S | 14 | ||||
-rw-r--r-- | sys/amd64/amd64/trap.c | 17 | ||||
-rw-r--r-- | sys/amd64/amd64/tsc.c | 16 | ||||
-rw-r--r-- | sys/amd64/amd64/vm_machdep.c | 65 |
22 files changed, 903 insertions, 930 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index ecc4c67..2160dc4 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -42,7 +42,6 @@ #include <machine/asmacros.h> #include <machine/apicreg.h> -#include <machine/smptests.h> #include "assym.s" @@ -50,19 +49,48 @@ * Macros to create and destroy a trap frame. */ #define PUSH_FRAME \ - pushl $0 ; /* dummy error code */ \ - pushl $0 ; /* dummy trap type */ \ - pushal ; /* 8 ints */ \ - pushl %ds ; /* save data and extra segments ... */ \ - pushl %es ; \ - pushl %fs + subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \ + testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ + jz 1f ; /* Yes, dont swapgs again */ \ + swapgs ; \ +1: movq %rdi,TF_RDI(%rsp) ; \ + movq %rsi,TF_RSI(%rsp) ; \ + movq %rdx,TF_RDX(%rsp) ; \ + movq %rcx,TF_RCX(%rsp) ; \ + movq %r8,TF_R8(%rsp) ; \ + movq %r9,TF_R9(%rsp) ; \ + movq %rax,TF_RAX(%rsp) ; \ + movq %rbx,TF_RBX(%rsp) ; \ + movq %rbp,TF_RBP(%rsp) ; \ + movq %r10,TF_R10(%rsp) ; \ + movq %r11,TF_R11(%rsp) ; \ + movq %r12,TF_R12(%rsp) ; \ + movq %r13,TF_R13(%rsp) ; \ + movq %r14,TF_R14(%rsp) ; \ + movq %r15,TF_R15(%rsp) #define POP_FRAME \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp + movq TF_RDI(%rsp),%rdi ; \ + movq TF_RSI(%rsp),%rsi ; \ + movq TF_RDX(%rsp),%rdx ; \ + movq TF_RCX(%rsp),%rcx ; \ + movq TF_R8(%rsp),%r8 ; \ + movq TF_R9(%rsp),%r9 ; \ + movq TF_RAX(%rsp),%rax ; \ + movq TF_RBX(%rsp),%rbx ; \ + movq TF_RBP(%rsp),%rbp ; \ + movq TF_R10(%rsp),%r10 ; \ + movq TF_R11(%rsp),%r11 ; \ + movq TF_R12(%rsp),%r12 ; \ + movq TF_R13(%rsp),%r13 ; \ + movq TF_R14(%rsp),%r14 ; \ + movq TF_R15(%rsp),%r15 ; \ + testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ + jz 1f ; /* keep kernel GS.base */ \ + cli ; \ + swapgs ; \ +1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ + /* * I/O Interrupt Entry Point. 
Rather than having one entry point for @@ -76,21 +104,15 @@ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ - movl $KDSEL, %eax ; /* reload with kernel's data segment */ \ - mov %ax, %ds ; \ - mov %ax, %es ; \ - movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ - mov %ax, %fs ; \ - movl lapic, %edx ; /* pointer to local APIC */ \ - movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ + movq lapic, %rdx ; /* pointer to local APIC */ \ + movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ bsrl %eax, %eax ; /* index of highset set bit in ISR */ \ jz 2f ; \ addl $(32 * index),%eax ; \ 1: ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX avoid double count */ \ - pushl %eax ; /* pass the IRQ */ \ + movq %rax, %rdi ; /* pass the IRQ */ \ call lapic_handle_intr ; \ - addl $4, %esp ; /* discard parameter */ \ MEXITCOUNT ; \ jmp doreti ; \ 2: movl $-1, %eax ; /* send a vector of -1 */ \ @@ -109,7 +131,7 @@ IDTVEC(spuriousint) /* No EOI cycle used here */ - iret + iretq MCOUNT_LABEL(bintr2) ISR_VEC(1, apic_isr1) @@ -128,32 +150,19 @@ MCOUNT_LABEL(eintr2) .text SUPERALIGN_TEXT IDTVEC(invltlb) - pushl %eax - pushl %ds - movl $KDSEL, %eax /* Kernel data selector */ - mov %ax, %ds - -#ifdef COUNT_XINVLTLB_HITS - pushl %fs - movl $KPSEL, %eax /* Private space selector */ - mov %ax, %fs - movl PCPU(CPUID), %eax - popl %fs - incl xhits_gbl(,%eax,4) -#endif /* COUNT_XINVLTLB_HITS */ + pushq %rax - movl %cr3, %eax /* invalidate the TLB */ - movl %eax, %cr3 + movq %cr3, %rax /* invalidate the TLB */ + movq %rax, %cr3 - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait - popl %ds - popl %eax - iret + popq %rax + iretq /* * Single page TLB shootdown @@ -161,32 +170,19 @@ IDTVEC(invltlb) .text SUPERALIGN_TEXT IDTVEC(invlpg) - pushl %eax - pushl %ds - movl $KDSEL, %eax /* Kernel data selector */ - mov %ax, %ds - -#ifdef COUNT_XINVLTLB_HITS - pushl %fs - 
movl $KPSEL, %eax /* Private space selector */ - mov %ax, %fs - movl PCPU(CPUID), %eax - popl %fs - incl xhits_pg(,%eax,4) -#endif /* COUNT_XINVLTLB_HITS */ + pushq %rax - movl smp_tlb_addr1, %eax - invlpg (%eax) /* invalidate single page */ + movq smp_tlb_addr1, %rax + invlpg (%rax) /* invalidate single page */ - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait - popl %ds - popl %eax - iret + popq %rax + iretq /* * Page range TLB shootdown. @@ -194,38 +190,25 @@ IDTVEC(invlpg) .text SUPERALIGN_TEXT IDTVEC(invlrng) - pushl %eax - pushl %edx - pushl %ds - movl $KDSEL, %eax /* Kernel data selector */ - mov %ax, %ds - -#ifdef COUNT_XINVLTLB_HITS - pushl %fs - movl $KPSEL, %eax /* Private space selector */ - mov %ax, %fs - movl PCPU(CPUID), %eax - popl %fs - incl xhits_rng(,%eax,4) -#endif /* COUNT_XINVLTLB_HITS */ - - movl smp_tlb_addr1, %edx - movl smp_tlb_addr2, %eax -1: invlpg (%edx) /* invalidate single page */ - addl $PAGE_SIZE, %edx - cmpl %eax, %edx + pushq %rax + pushq %rdx + + movq smp_tlb_addr1, %rdx + movq smp_tlb_addr2, %rax +1: invlpg (%rdx) /* invalidate single page */ + addq $PAGE_SIZE, %rdx + cmpq %rax, %rdx jb 1b - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait - popl %ds - popl %edx - popl %eax - iret + popq %rdx + popq %rax + iretq /* * Forward hardclock to another CPU. 
Pushes a clockframe and calls @@ -235,18 +218,11 @@ IDTVEC(invlrng) SUPERALIGN_TEXT IDTVEC(hardclock) PUSH_FRAME - movl $KDSEL, %eax /* reload with kernel's data segment */ - mov %ax, %ds - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - movl lapic, %edx - movl $0, LA_EOI(%edx) /* End Of Interrupt to APIC */ + movq lapic, %rdx + movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ - pushl $0 /* XXX convert trapframe to clockframe */ call forwarded_hardclock - addl $4, %esp /* XXX convert clockframe to trapframe */ MEXITCOUNT jmp doreti @@ -258,20 +234,13 @@ IDTVEC(hardclock) SUPERALIGN_TEXT IDTVEC(statclock) PUSH_FRAME - movl $KDSEL, %eax /* reload with kernel's data segment */ - mov %ax, %ds - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - movl lapic, %edx - movl $0, LA_EOI(%edx) /* End Of Interrupt to APIC */ + movq lapic, %rdx + movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ FAKE_MCOUNT(13*4(%esp)) - pushl $0 /* XXX convert trapframe to clockframe */ call forwarded_statclock - addl $4, %esp /* XXX convert clockframe to trapframe */ MEXITCOUNT jmp doreti @@ -287,14 +256,9 @@ IDTVEC(statclock) SUPERALIGN_TEXT IDTVEC(cpuast) PUSH_FRAME - movl $KDSEL, %eax - mov %ax, %ds /* use KERNEL data segment */ - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - movl lapic, %edx - movl $0, LA_EOI(%edx) /* End Of Interrupt to APIC */ + movq lapic, %rdx + movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ FAKE_MCOUNT(13*4(%esp)) @@ -311,63 +275,41 @@ IDTVEC(cpuast) .text SUPERALIGN_TEXT IDTVEC(cpustop) - pushl %ebp - movl %esp, %ebp - pushl %eax - pushl %ecx - pushl %edx - pushl %ds /* save current data segment */ - pushl %es - pushl %fs - - movl $KDSEL, %eax - mov %ax, %ds /* use KERNEL data segment */ - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + PUSH_FRAME + + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ movl PCPU(CPUID), %eax imull $PCB_SIZE, %eax - leal 
CNAME(stoppcbs)(%eax), %eax - pushl %eax - call CNAME(savectx) /* Save process context */ - addl $4, %esp + leaq stoppcbs(%rax), %rdi + call savectx /* Save process context */ movl PCPU(CPUID), %eax lock - btsl %eax, CNAME(stopped_cpus) /* stopped_cpus |= (1<<id) */ + btsl %eax, stopped_cpus /* stopped_cpus |= (1<<id) */ 1: - btl %eax, CNAME(started_cpus) /* while (!(started_cpus & (1<<id))) */ + btl %eax, started_cpus /* while (!(started_cpus & (1<<id))) */ jnc 1b lock - btrl %eax, CNAME(started_cpus) /* started_cpus &= ~(1<<id) */ + btrl %eax, started_cpus /* started_cpus &= ~(1<<id) */ lock - btrl %eax, CNAME(stopped_cpus) /* stopped_cpus &= ~(1<<id) */ + btrl %eax, stopped_cpus /* stopped_cpus &= ~(1<<id) */ test %eax, %eax jnz 2f - movl CNAME(cpustop_restartfunc), %eax - test %eax, %eax + movq cpustop_restartfunc, %rax + testq %rax, %rax jz 2f - movl $0, CNAME(cpustop_restartfunc) /* One-shot */ + movq $0, cpustop_restartfunc /* One-shot */ - call *%eax + call *%rax 2: - popl %fs - popl %es - popl %ds /* restore previous data segment */ - popl %edx - popl %ecx - popl %eax - movl %ebp, %esp - popl %ebp - iret + POP_FRAME + iretq /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. @@ -378,19 +320,13 @@ IDTVEC(cpustop) SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME - movl $KDSEL, %eax - mov %ax, %ds /* use KERNEL data segment */ - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - call smp_rendezvous_action - - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ - POP_FRAME - iret + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + POP_FRAME /* Why not doreti? */ + iretq +#ifdef LAZY_SWITCH /* * Clean up when we lose out on the lazy context switch optimization. * ie: when we are about to release a PTD but a cpu is still borrowing it. 
@@ -398,16 +334,10 @@ IDTVEC(rendezvous) SUPERALIGN_TEXT IDTVEC(lazypmap) PUSH_FRAME - movl $KDSEL, %eax - mov %ax, %ds /* use KERNEL data segment */ - mov %ax, %es - movl $KPSEL, %eax - mov %ax, %fs - call pmap_lazyfix_action - - movl lapic, %eax - movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ - POP_FRAME - iret + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + POP_FRAME /* Why not doreti? */ + iretq +#endif #endif /* SMP */ diff --git a/sys/amd64/amd64/autoconf.c b/sys/amd64/amd64/autoconf.c index d6ce6b6..adec2e0 100644 --- a/sys/amd64/amd64/autoconf.c +++ b/sys/amd64/amd64/autoconf.c @@ -76,7 +76,6 @@ __FBSDID("$FreeBSD$"); #include <nfsclient/nfsdiskless.h> #include <machine/md_var.h> -#include <amd64/isa/icu.h> #ifdef DEV_ISA #include <isa/isavar.h> @@ -109,23 +108,11 @@ configure(dummy) { /* - * Activate the ICU's. Note that we are explicitly at splhigh() - * at present as we have no way to disable stray PCI level triggered - * interrupts until the devices have had a driver attached. This - * is particularly a problem when the interrupts are shared. For - * example, if IRQ 10 is shared between a disk and network device - * and the disk device generates an interrupt, if we "activate" - * IRQ 10 when the network driver is set up, then we will get - * recursive interrupt 10's as nothing will know how to turn off - * the disk device's interrupt. - * - * Having the ICU's active means we can probe interrupt routing to - * see if a device causes the corresponding pending bit to be set. - * - * This is all rather inconvenient. + * Enable interrupts on the processor. The interrupts are still + * disabled in the interrupt controllers until interrupt handlers + * are registered. 
*/ enable_intr(); - INTREN(IRQ_SLAVE); /* nexus0 is the top of the i386 device tree */ device_add_child(root_bus, "nexus", 0); @@ -141,12 +128,6 @@ configure(dummy) if (isa_bus_device) isa_probe_children(isa_bus_device); #endif - - /* - * Now we're ready to handle (pending) interrupts. - * XXX this is slightly misplaced. - */ - spl0(); } static void diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index 56f0c84..3bfcfc8 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -59,14 +59,16 @@ * %rsi = newtd */ ENTRY(cpu_throw) - xorq %rax, %rax movl PCPU(CPUID), %eax testq %rdi,%rdi /* no thread? */ jz 1f /* release bit from old pm_active */ movq TD_PROC(%rdi), %rdx /* oldtd->td_proc */ movq P_VMSPACE(%rdx), %rdx /* proc->p_vmspace */ - btrq %rax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ +#ifdef SMP + lock +#endif + btrl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ 1: movq TD_PCB(%rsi),%rdx /* newtd->td_proc */ movq PCB_CR3(%rdx),%rdx @@ -74,7 +76,10 @@ ENTRY(cpu_throw) /* set bit in new pm_active */ movq TD_PROC(%rsi),%rdx movq P_VMSPACE(%rdx), %rdx - btsq %rax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ +#ifdef SMP + lock +#endif + btsl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ jmp sw1 /* @@ -143,7 +148,6 @@ ENTRY(cpu_switch) jz badsw3 /* no, panic */ #endif movq TD_PCB(%rsi),%r8 - xorq %rax, %rax movl PCPU(CPUID), %eax /* switch address space */ @@ -153,12 +157,18 @@ ENTRY(cpu_switch) /* Release bit from old pmap->pm_active */ movq TD_PROC(%rdi), %rdx /* oldproc */ movq P_VMSPACE(%rdx), %rdx - btrq %rax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ +#ifdef SMP + lock +#endif + btrl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ /* Set bit in new pmap->pm_active */ movq TD_PROC(%rsi),%rdx /* newproc */ movq P_VMSPACE(%rdx), %rdx - btsq %rax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ +#ifdef SMP + lock +#endif + btsl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ sw1: /* @@ -191,8 +201,11 @@ sw1: wrmsr /* Update the TSS_RSP0 pointer 
for the next interrupt */ + movq PCPU(TSSP), %rax + addq $COMMON_TSS_RSP0, %rax leaq -16(%r8), %rbx - movq %rbx, common_tss + COMMON_TSS_RSP0 + movq %rbx, (%rax) + movq %rbx, PCPU(RSP0) /* Restore context. */ movq PCB_RBX(%r8),%rbx diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c index 3dd6a8a..077c914 100644 --- a/sys/amd64/amd64/db_interface.c +++ b/sys/amd64/amd64/db_interface.c @@ -98,6 +98,22 @@ kdb_trap(int type, int code, struct amd64_saved_state *regs) ef = read_rflags(); disable_intr(); +#ifdef SMP + +#if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid), + PCPU_GET(other_cpus)); +#endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ + + /* We stop all CPUs except ourselves (obviously) */ + stop_cpus(PCPU_GET(other_cpus)); + +#if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + db_printf(" stopped.\n"); +#endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ + +#endif /* SMP */ + switch (type) { case T_BPTFLT: /* breakpoint */ case T_TRCTRAP: /* debug exception */ @@ -192,6 +208,27 @@ kdb_trap(int type, int code, struct amd64_saved_state *regs) regs->tf_ds = ddb_regs.tf_ds & 0xffff; #endif +#ifdef SMP + +#if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid), + stopped_cpus); +#endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ + + /* Restart all the CPUs we previously stopped */ + if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) { + db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n", + PCPU_GET(other_cpus), stopped_cpus); + panic("stop_cpus() failed"); + } + restart_cpus(stopped_cpus); + +#if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + db_printf(" restarted.\n"); +#endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ + +#endif /* SMP */ + write_rflags(ef); return (1); diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c index a05348a..7dba9bb 100644 --- a/sys/amd64/amd64/db_trace.c +++ b/sys/amd64/amd64/db_trace.c @@ -245,8 +245,9 @@ db_nextframe(fp, 
ip, p) if (strcmp(name, "calltrap") == 0 || strcmp(name, "fork_trampoline") == 0) frame_type = TRAP; - else if (strncmp(name, "Xintr", 5) == 0 || - strncmp(name, "Xfastintr", 9) == 0) + else if (strncmp(name, "Xatpic_intr", 11) == 0 || + strncmp(name, "Xatpic_fastintr", 15) == 0 || + strncmp(name, "Xapic_isr", 9) == 0) frame_type = INTERRUPT; else if (strcmp(name, "Xfast_syscall") == 0) frame_type = SYSCALL; diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 3d2eaa6..972f19c 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -35,14 +35,11 @@ */ #include <machine/asmacros.h> -#include <sys/mutex.h> #include <machine/psl.h> #include <machine/trap.h> #include "assym.s" -#define SEL_RPL_MASK 0x0003 - .text /*****************************************************************************/ @@ -72,8 +69,6 @@ * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we * must load them with appropriate values for supervisor mode operation. */ -#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ - .type __CONCAT(X,name),@function; __CONCAT(X,name): MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) @@ -223,7 +218,7 @@ IDTVEC(page) IDTVEC(fast_syscall) swapgs movq %rsp,PCPU(SCRATCH_RSP) - movq common_tss+COMMON_TSS_RSP0,%rsp + movq PCPU(RSP0),%rsp /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */ subq $TF_SIZE,%rsp /* defer TF_RSP till we have a spare register */ @@ -297,14 +292,6 @@ ENTRY(fork_trampoline) call fork_exit jmp doreti /* Handle any ASTs */ - -/* - * Include what was once config+isa-dependent code. - * XXX it should be in a stand-alone file. It's still icu-dependent and - * belongs in i386/isa. 
- */ -#include "amd64/isa/vector.S" - .data ALIGN_DATA @@ -406,5 +393,3 @@ doreti_iret_fault: movq $T_PROTFLT,TF_TRAPNO(%rsp) movq $0,TF_ERR(%rsp) /* XXX should be the error code */ jmp alltraps_with_regs_pushed - -#include "amd64/isa/icu_ipl.S" diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index 1e4890c..1acb931 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <machine/cputypes.h> #include <machine/frame.h> +#include <machine/intr_machdep.h> #include <machine/md_var.h> #include <machine/pcb.h> #include <machine/psl.h> @@ -63,8 +64,6 @@ __FBSDID("$FreeBSD$"); #include <machine/segments.h> #include <machine/ucontext.h> -#include <amd64/isa/intr_machdep.h> - /* * Floating point support. */ diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index 6a017e5..27a1a12 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -69,10 +69,12 @@ __FBSDID("$FreeBSD$"); #include <nfs/rpcv2.h> #include <nfsclient/nfs.h> #include <nfsclient/nfsdiskless.h> +#include <machine/apicreg.h> #include <machine/cpu.h> #include <machine/sigframe.h> #include <machine/proc.h> #include <machine/specialreg.h> +#include <machine/segments.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -83,11 +85,6 @@ ASSYM(P_UAREA, offsetof(struct proc, p_uarea)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); -ASSYM(TD_INTR_NESTING_LEVEL, offsetof(struct thread, td_intr_nesting_level)); -ASSYM(TD_CRITNEST, offsetof(struct thread, td_critnest)); -ASSYM(TD_MD, offsetof(struct thread, td_md)); - -ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); @@ -180,6 +177,7 @@ ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); 
ASSYM(ENAMETOOLONG, ENAMETOOLONG); +ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); @@ -189,12 +187,24 @@ ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); +ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); +ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); +ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); + +ASSYM(LA_VER, offsetof(struct LAPIC, version)); +ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); +ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); +ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); +ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); +ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); +ASSYM(LA_ISR, offsetof(struct LAPIC, isr0)); ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL)); ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL)); ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL)); +ASSYM(SEL_RPL_MASK, SEL_RPL_MASK); ASSYM(MSR_FSBASE, MSR_FSBASE); ASSYM(MSR_GSBASE, MSR_GSBASE); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index ba8e58e..f3d70c2 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -55,12 +55,13 @@ __FBSDID("$FreeBSD$"); #include <machine/asmacros.h> #include <machine/clock.h> #include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> #include <machine/segments.h> #include <machine/specialreg.h> #include <machine/md_var.h> #include <amd64/isa/icu.h> -#include <amd64/isa/intr_machdep.h> /* XXX - should be in header file: */ void printcpuinfo(void); diff --git a/sys/amd64/amd64/io_apic.c b/sys/amd64/amd64/io_apic.c index 4af70fa..b620440 100644 --- a/sys/amd64/amd64/io_apic.c +++ b/sys/amd64/amd64/io_apic.c @@ -30,6 +30,7 @@ #include 
<sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_atpic.h" #include "opt_isa.h" #include "opt_no_mixed_mode.h" @@ -50,8 +51,8 @@ __FBSDID("$FreeBSD$"); #include <machine/apicvar.h> #include <machine/segments.h> -#if defined(DEV_ISA) && !defined(NO_MIXED_MODE) -#define MIXED_MODE +#if defined(DEV_ISA) && defined(DEV_ATPIC) && !defined(NO_MIXED_MODE) +#define MIXED_MODE #endif #define IOAPIC_ISA_INTS 16 diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 6f942bf..bdff518 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bus.h> #include <sys/kernel.h> #include <sys/pcpu.h> +#include <sys/proc.h> #include <vm/vm.h> #include <vm/pmap.h> @@ -171,8 +172,7 @@ lapic_init(uintptr_t addr) KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic = (lapic_t *)pmap_mapdev(addr, sizeof(lapic_t)); - setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); /* Perform basic initialization of the BSP's local APIC. 
*/ value = lapic->svr; @@ -242,8 +242,7 @@ lapic_enable_intr(u_int irq) KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for IRQ %u", irq)); - setidt(vector, ioint_handlers[vector / 32], SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + setidt(vector, ioint_handlers[vector / 32], SDT_SYSIGT, SEL_KPL, 0); } void @@ -478,13 +477,14 @@ lapic_eoi(void) } void -lapic_handle_intr(struct intrframe frame) +lapic_handle_intr(void *cookie, struct intrframe frame) { struct intsrc *isrc; + int vec = (uintptr_t)cookie; - if (frame.if_vec == -1) + if (vec == -1) panic("Couldn't get vector from ISR!"); - isrc = intr_lookup_source(apic_idt_to_irq(frame.if_vec)); + isrc = intr_lookup_source(apic_idt_to_irq(vec)); intr_execute_handlers(isrc, &frame); } @@ -589,21 +589,9 @@ static void apic_setup_local(void *dummy __unused) { int retval; - uint64_t apic_base; if (best_enum == NULL) return; - /* - * To work around an errata, we disable the local APIC on some - * CPUs during early startup. We need to turn the local APIC back - * on on such CPUs now. 
- */ - if (cpu == CPU_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && - (cpu_id & 0xff0) == 0x610) { - apic_base = rdmsr(MSR_APICBASE); - apic_base |= APICBASE_ENABLED; - wrmsr(MSR_APICBASE, apic_base); - } retval = best_enum->apic_setup_local(); if (retval != 0) printf("%s: Failed to setup the local APIC: returned %d\n", diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index e32d82a..2140d7a 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include "opt_atalk.h" +#include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" @@ -101,6 +102,7 @@ __FBSDID("$FreeBSD$"); #include <machine/reg.h> #include <machine/clock.h> #include <machine/specialreg.h> +#include <machine/intr_machdep.h> #include <machine/md_var.h> #include <machine/metadata.h> #include <machine/proc.h> @@ -108,9 +110,13 @@ __FBSDID("$FreeBSD$"); #include <machine/perfmon.h> #endif #include <machine/tss.h> +#ifdef SMP +#include <machine/smp.h> +#endif #include <amd64/isa/icu.h> -#include <amd64/isa/intr_machdep.h> + +#include <isa/isareg.h> #include <isa/rtc.h> #include <sys/ptrace.h> #include <machine/sigframe.h> @@ -146,7 +152,9 @@ vm_paddr_t phys_avail[10]; struct kva_md_info kmi; static struct trapframe proc0_tf; -static struct pcpu __pcpu; +struct region_descriptor r_gdt, r_idt; + +struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; @@ -196,7 +204,6 @@ cpu_startup(dummy) bufinit(); vm_pager_bufferinit(); - /* For SMP, we delay the cpu_setregs() until after SMP startup. 
*/ cpu_setregs(); } @@ -589,13 +596,13 @@ SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, * Initialize segments & interrupt table */ -struct user_segment_descriptor gdt[NGDT];/* global descriptor table */ +struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); -struct amd64tss common_tss; +struct amd64tss common_tss[MAXCPU]; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { @@ -755,6 +762,15 @@ ssdtosyssd(ssd, sd) sd->sd_gran = ssd->ssd_gran; } +#if !defined(DEV_ATPIC) && defined(DEV_ISA) +#include <isa/isavar.h> +u_int +isa_irq_pending(void) +{ + + return (0); +} +#endif #define PHYSMAP_SIZE (2 * 8) @@ -783,7 +799,6 @@ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx; - u_int extmem; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; char *cp; @@ -802,12 +817,9 @@ getmemsize(caddr_t kmdp, u_int64_t first) * ie: an int32_t immediately precedes smap. 
*/ smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); - if (smapbase == 0) - smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | 0x0009); /* Old value for MODINFOMD_SMAP */ - if (smapbase == 0) { + if (smapbase == NULL) panic("No BIOS smap info from loader!"); - goto deep_shit; - } + smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); @@ -816,14 +828,11 @@ getmemsize(caddr_t kmdp, u_int64_t first) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); - if (smap->type != 0x01) { + if (smap->type != 0x01) continue; - } - if (smap->length == 0) { -next_run: + if (smap->length == 0) continue; - } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { @@ -836,6 +845,7 @@ next_run: if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; +next_run: continue; } @@ -850,69 +860,23 @@ next_run: } /* - * Perform "base memory" related probes & setup based on SMAP + * Find the 'base memory' segment for SMP */ -deep_shit: - if (basemem == 0) { - for (i = 0; i <= physmap_idx; i += 2) { - if (physmap[i] == 0x00000000) { - basemem = physmap[i + 1] / 1024; - break; - } - } - - if (basemem == 0) { - basemem = rtcin(RTC_BASELO) + (rtcin(RTC_BASEHI) << 8); - } - - if (basemem == 0) { - basemem = 640; - } - - if (basemem > 640) { - printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", - basemem); - basemem = 640; + basemem = 0; + for (i = 0; i <= physmap_idx; i += 2) { + if (physmap[i] == 0x00000000) { + basemem = physmap[i + 1] / 1024; + break; } - -#if 0 - for (pa = trunc_page(basemem * 1024); - pa < ISA_HOLE_START; pa += PAGE_SIZE) - pmap_kenter(KERNBASE + pa, pa); -#endif } + if (basemem == 0) + panic("BIOS smap did not include a basemem segment!"); - if (physmap[1] != 0) - goto physmap_done; - - /* - * Prefer the RTC value for extended memory. 
- */ - extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); - - /* - * Special hack for chipsets that still remap the 384k hole when - * there's 16MB of memory - this really confuses people that - * are trying to use bus mastering ISA controllers with the - * "16MB limit"; they only have 16MB, but the remapping puts - * them beyond the limit. - * - * If extended memory is between 15-16MB (16-17MB phys address range), - * chop it to 15MB. - */ - if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) - extmem = 15 * 1024; - - physmap[0] = 0; - physmap[1] = basemem * 1024; - physmap_idx = 2; - physmap[physmap_idx] = 0x100000; - physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; +#ifdef SMP + /* make hole for AP bootstrap code */ + physmap[1] = mp_bootaddress(physmap[1] / 1024); +#endif -physmap_done: - /* - * Now, physmap contains a map of physical memory. - */ /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be @@ -929,7 +893,8 @@ physmap_done: * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. 
*/ - if ((cp = getenv("hw.physmem")) != NULL) { + cp = getenv("hw.physmem"); + if (cp != NULL) { u_int64_t AllowMem, sanity; char *ep; @@ -1106,11 +1071,18 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, off, x; - struct region_descriptor r_gdt, r_idt; struct pcpu *pc; u_int64_t msr; char *env; +#ifdef DEV_ISA + /* Preemptively mask the atpics and leave them shut down */ + outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); + outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); +#else +#error "have you forgotten the isa device?"; +#endif + /* Turn on PTE NX (no execute) bit */ msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); @@ -1146,7 +1118,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) /* * make gdt memory segments */ - gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss; + gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) @@ -1157,7 +1129,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); - pc = &__pcpu; + pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); @@ -1166,6 +1138,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); + PCPU_SET(tssp, &common_tss[0]); /* * Initialize mutexes. @@ -1211,8 +1184,8 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) */ cninit(); -#ifdef DEV_ISA - isa_defaultirq(); +#ifdef DEV_ATPIC + atpic_startup(); #endif #ifdef DDB @@ -1225,12 +1198,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ - common_tss.tss_rsp0 = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb); + common_tss[0].tss_rsp0 = thread0.td_kstack + \ + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb); /* Ensure the stack is aligned to 16 bytes */ - common_tss.tss_rsp0 &= ~0xF; + common_tss[0].tss_rsp0 &= ~0xF; + PCPU_SET(rsp0, common_tss[0].tss_rsp0); /* doublefault stack space, runs on ist1 */ - common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; + common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c index 56d268c..aeb2e35 100644 --- a/sys/amd64/amd64/mem.c +++ b/sys/amd64/amd64/mem.c @@ -323,6 +323,15 @@ mem_range_attr_set(struct mem_range_desc *mrd, int *arg) return (mem_range_softc.mr_op->set(&mem_range_softc, mrd, arg)); } +#ifdef SMP +void +mem_range_AP_init(void) +{ + if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) + (mem_range_softc.mr_op->initAP(&mem_range_softc)); +} +#endif + static int mem_modevent(module_t mod, int type, void *data) { diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index f58a94f..fcd478b 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1,5 +1,6 @@ /*- * Copyright (c) 1996, by Steve Passe + * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,30 +27,12 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" -#if !defined(lint) -#if !defined(SMP) -#error How did you get here? -#endif - -#if defined(I386_CPU) && !defined(COMPILING_LINT) -#error SMP not supported with I386_CPU -#endif -#ifndef DEV_APIC -#error The apic device is required for SMP, add "device apic" to your config file. 
-#endif -#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) -#error SMP not supported with CPU_DISABLE_CMPXCHG -#endif -#endif /* not lint */ - #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> -#include <sys/cons.h> /* cngetc() */ #ifdef GPROF #include <sys/gmon.h> #endif @@ -75,9 +58,8 @@ __FBSDID("$FreeBSD$"); #include <machine/md_var.h> #include <machine/pcb.h> #include <machine/smp.h> -#include <machine/smptests.h> /** COUNT_XINVLTLB_HITS */ #include <machine/specialreg.h> -#include <machine/privatespace.h> +#include <machine/tss.h> #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) @@ -88,67 +70,9 @@ __FBSDID("$FreeBSD$"); #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) -/* - * this code MUST be enabled here and in mpboot.s. - * it follows the very early stages of AP boot by placing values in CMOS ram. - * it NORMALLY will never be needed and thus the primitive method for enabling. - * -#define CHECK_POINTS - */ - -#if defined(CHECK_POINTS) && !defined(PC98) -#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) -#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) - -#define CHECK_INIT(D); \ - CHECK_WRITE(0x34, (D)); \ - CHECK_WRITE(0x35, (D)); \ - CHECK_WRITE(0x36, (D)); \ - CHECK_WRITE(0x37, (D)); \ - CHECK_WRITE(0x38, (D)); \ - CHECK_WRITE(0x39, (D)); - -#define CHECK_PRINT(S); \ - printf("%s: %d, %d, %d, %d, %d, %d\n", \ - (S), \ - CHECK_READ(0x34), \ - CHECK_READ(0x35), \ - CHECK_READ(0x36), \ - CHECK_READ(0x37), \ - CHECK_READ(0x38), \ - CHECK_READ(0x39)); - -#else /* CHECK_POINTS */ - -#define CHECK_INIT(D) -#define CHECK_PRINT(S) -#define CHECK_WRITE(A, D) - -#endif /* CHECK_POINTS */ - -/* - * Values to send to the POST hardware. 
- */ -#define MP_BOOTADDRESS_POST 0x10 -#define MP_PROBE_POST 0x11 -#define MPTABLE_PASS1_POST 0x12 - -#define MP_START_POST 0x13 -#define MP_ENABLE_POST 0x14 -#define MPTABLE_PASS2_POST 0x15 - -#define START_ALL_APS_POST 0x16 -#define INSTALL_AP_TRAMP_POST 0x17 -#define START_AP_POST 0x18 - -#define MP_ANNOUNCE_POST 0x19 - /* lock region used by kernel profiling */ int mcount_lock; -/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ -int current_postcode; - int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern int nkpt; @@ -164,6 +88,9 @@ struct cpu_top *smp_topology; char *bootSTK; static int bootAP; +/* Free these after use */ +void *bootstacks[MAXCPU]; + /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; @@ -178,6 +105,8 @@ vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; struct mtx smp_tlb_mtx; +extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); + /* * Local data and functions. */ @@ -201,17 +130,17 @@ struct cpu_info { } static cpu_info[MAXCPU]; static int cpu_apic_ids[MAXCPU]; -static u_int boot_address; +static u_int boot_address; static void set_logical_apic_ids(void); static int start_all_aps(void); -static void install_ap_tramp(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static int hlt_cpus_mask; static int hlt_logical_cpus; static struct sysctl_ctx_list logical_cpu_clist; +static u_int bootMP_size; /* * Calculate usable address in base memory for AP trampoline code. 
@@ -219,13 +148,15 @@ static struct sysctl_ctx_list logical_cpu_clist; u_int mp_bootaddress(u_int basemem) { - POSTCODE(MP_BOOTADDRESS_POST); - boot_address = trunc_page(basemem); /* round down to 4k boundary */ + bootMP_size = mptramp_end - mptramp_start; + boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ if ((basemem - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ + /* 3 levels of page table pages */ + mptramp_pagetables = boot_address - (PAGE_SIZE * 3); - return boot_address; + return mptramp_pagetables; } void @@ -302,43 +233,34 @@ cpu_mp_start(void) { int i; - POSTCODE(MP_START_POST); - /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) cpu_apic_ids[i] = -1; /* Install an inter-CPU IPI for TLB invalidation */ - setidt(IPI_INVLTLB, IDTVEC(invltlb), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); - setidt(IPI_INVLPG, IDTVEC(invlpg), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); - setidt(IPI_INVLRNG, IDTVEC(invlrng), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for forwarding hardclock() */ - setidt(IPI_HARDCLOCK, IDTVEC(hardclock), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_HARDCLOCK, IDTVEC(hardclock), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for forwarding statclock() */ - setidt(IPI_STATCLOCK, IDTVEC(statclock), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_STATCLOCK, IDTVEC(statclock), SDT_SYSIGT, SEL_KPL, 0); +#ifdef LAZY_SWITCH /* Install an inter-CPU IPI for lazy pmap release */ - setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), SDT_SYSIGT, SEL_KPL, 0); +#endif /* Install an inter-CPU IPI for 
all-CPU rendezvous */ - setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for forcing an additional software trap */ - setidt(IPI_AST, IDTVEC(cpuast), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_AST, IDTVEC(cpuast), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ - setidt(IPI_STOP, IDTVEC(cpustop), - SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); @@ -371,8 +293,6 @@ cpu_mp_announce(void) { int i, x; - POSTCODE(MP_ANNOUNCE_POST); - /* List CPUs */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1, x = 0; x < MAXCPU; x++) { @@ -390,38 +310,41 @@ cpu_mp_announce(void) void init_secondary(void) { - int gsel_tss; - int x, myid; - u_int cr0; - - /* bootAP is set in start_ap() to our ID. 
*/ - myid = bootAP; - gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid]; - gdt_segs[GPROC0_SEL].ssd_base = - (int) &SMP_prvspace[myid].pcpu.pc_common_tss; - SMP_prvspace[myid].pcpu.pc_prvspace = - &SMP_prvspace[myid].pcpu; - - for (x = 0; x < NGDT; x++) { - ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); - } + struct pcpu *pc; + u_int64_t msr, cr0; + int cpu, gsel_tss; + + /* Set by the startup code for us to use */ + cpu = bootAP; + + /* Init tss */ + common_tss[cpu] = common_tss[0]; + common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ + + gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); - r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; - r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ - lidt(&r_idt); + /* Get per-cpu data */ + pc = &__pcpu[cpu]; + + /* prime data page for it to use */ + pcpu_init(pc, cpu, sizeof(struct pcpu)); + pc->pc_apic_id = cpu_apic_ids[cpu]; + pc->pc_prvspace = pc; + pc->pc_curthread = 0; + pc->pc_tssp = &common_tss[cpu]; + pc->pc_rsp0 = 0; - lldt(_default_ldt); - PCPU_SET(currentldt, _default_ldt); + wrmsr(MSR_FSBASE, 0); /* User value */ + wrmsr(MSR_GSBASE, (u_int64_t)pc); + wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ + + lidt(&r_idt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); - gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; - PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); - PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); - PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); /* @@ -432,32 +355,32 @@ init_secondary(void) cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); - CHECK_WRITE(0x38, 5); - - /* Disable local APIC just to be sure. 
*/ + + /* Set up the fast syscall stuff */ + msr = rdmsr(MSR_EFER) | EFER_SCE; + wrmsr(MSR_EFER, msr); + wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); + wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); + msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | + ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); + wrmsr(MSR_STAR, msr); + wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); + + /* Disable local apic just to be sure. */ lapic_disable(); /* signal our startup to the BSP. */ mp_naps++; - CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); - /* BSP may have changed PTD while we were waiting */ - invltlb(); - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); - -#if defined(I586_CPU) && !defined(NO_F00F_HACK) - lidt(&r_idt); -#endif - /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ - npxinit(__INITIAL_NPXCW__); + fpuinit(); /* set up SSE registers */ enable_sse(); @@ -467,7 +390,6 @@ init_secondary(void) printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]); panic("cpuid mismatch! 
boom!!"); } @@ -559,39 +481,51 @@ set_logical_apic_ids(void) static int start_all_aps(void) { -#ifndef PC98 u_char mpbiosreason; -#endif - u_long mpbioswarmvec; - struct pcpu *pc; - char *stack; - uintptr_t kptbase; - int i, pg, apic_id, cpu; - - POSTCODE(START_ALL_APS_POST); + u_int32_t mpbioswarmvec; + int apic_id, cpu, i; + u_int64_t *pt4, *pt3, *pt2; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ - install_ap_tramp(); + pmap_kenter(boot_address + KERNBASE, boot_address); + bcopy(mptramp_start, (void *)((uintptr_t)boot_address + KERNBASE), bootMP_size); + + /* Locate the page tables, they'll be below the trampoline */ + pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); + pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); + pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); + + /* Create the initial 1GB replicated page tables */ + for (i = 0; i < 512; i++) { + /* Each slot of the level 4 pages points to the same level 3 page */ + pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); + pt4[i] |= PG_V | PG_RW | PG_U; + + /* Each slot of the level 3 pages points to the same level 2 page */ + pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); + pt3[i] |= PG_V | PG_RW | PG_U; + + /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ + pt2[i] = i * (2 * 1024 * 1024); + pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; + } /* save the current value of the warm-start vector */ - mpbioswarmvec = *((u_long *) WARMBOOT_OFF); -#ifndef PC98 + mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); -#endif - /* set up temporary P==V mapping for AP boot */ - /* XXX this is a hack, we should boot the AP on its own stack/PTD */ - kptbase = (uintptr_t)(void *)KPTphys; - for (i = 0; i < NKPT; i++) - PTD[i] = (pd_entry_t)(PG_V | PG_RW | - ((kptbase + i * PAGE_SIZE) & PG_FRAME)); - invltlb(); + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* start each AP */ - for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) { + cpu = 0; + for (apic_id = 0; apic_id < MAXCPU; apic_id++) { if (!cpu_info[apic_id].cpu_present || cpu_info[apic_id].cpu_bsp) continue; @@ -600,48 +534,18 @@ start_all_aps(void) /* save APIC ID for this logical ID */ cpu_apic_ids[cpu] = apic_id; - /* first page of AP's private space */ - pg = cpu * i386_btop(sizeof(struct privatespace)); - - /* allocate a new private data page */ - pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE); - - /* wire it into the private page table page */ - SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc)); - /* allocate and set up an idle stack data page */ - stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */ - for (i = 0; i < KSTACK_PAGES; i++) - SMPpt[pg + 1 + i] = (pt_entry_t) - (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); - - /* prime data page for it to use */ - pcpu_init(pc, cpu, sizeof(struct pcpu)); - pc->pc_apic_id = apic_id; - - /* setup a vector to our boot code */ - *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; - *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); 
-#ifndef PC98 - outb(CMOS_REG, BIOS_RESET); - outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ -#endif + bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); - bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES * - PAGE_SIZE]; + bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; /* attempt to start the Application Processor */ - CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { - printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); - CHECK_PRINT("trace"); /* show checkpoints */ - /* better panic as the AP may be running loose */ - printf("panic y/n? [y] "); - if (cngetc() != 'n') - panic("bye-bye"); + /* restore the warmstart vector */ + *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; + panic("AP #%d (PHY# %d) failed!", cpu, apic_id); } - CHECK_PRINT("trace"); /* show checkpoints */ all_cpus |= (1 << cpu); /* record AP in CPU map */ } @@ -650,92 +554,15 @@ start_all_aps(void) PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); /* restore the warmstart vector */ - *(u_long *) WARMBOOT_OFF = mpbioswarmvec; -#ifndef PC98 + *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; + outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); -#endif - - /* - * Set up the idle context for the BSP. Similar to above except - * that some was done by locore, some by pmap.c and some is implicit - * because the BSP is cpu#0 and the page is initially zero and also - * because we can refer to variables by name on the BSP.. - */ - - /* Allocate and setup BSP idle stack */ - stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); - for (i = 0; i < KSTACK_PAGES; i++) - SMPpt[1 + i] = (pt_entry_t) - (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); - - for (i = 0; i < NKPT; i++) - PTD[i] = 0; - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } -/* - * load the 1st level AP boot code into base memory. 
- */ - -/* targets for relocation */ -extern void bigJump(void); -extern void bootCodeSeg(void); -extern void bootDataSeg(void); -extern void MPentry(void); -extern u_int MP_GDT; -extern u_int mp_gdtbase; - -static void -install_ap_tramp(void) -{ - int x; - int size = *(int *) ((u_long) & bootMP_size); - u_char *src = (u_char *) ((u_long) bootMP); - u_char *dst = (u_char *) boot_address + KERNBASE; - u_int boot_base = (u_int) bootMP; - u_int8_t *dst8; - u_int16_t *dst16; - u_int32_t *dst32; - - POSTCODE(INSTALL_AP_TRAMP_POST); - - pmap_kenter(boot_address + KERNBASE, boot_address); - for (x = 0; x < size; ++x) - *dst++ = *src++; - - /* - * modify addresses in code we just moved to basemem. unfortunately we - * need fairly detailed info about mpboot.s for this to work. changes - * to mpboot.s might require changes here. - */ - - /* boot code is located in KERNEL space */ - dst = (u_char *) boot_address + KERNBASE; - - /* modify the lgdt arg */ - dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); - *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); - - /* modify the ljmp target for MPentry() */ - dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); - *dst32 = ((u_int) MPentry - KERNBASE); - - /* modify the target for boot code segment */ - dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); - dst8 = (u_int8_t *) (dst16 + 1); - *dst16 = (u_int) boot_address & 0xffff; - *dst8 = ((u_int) boot_address >> 16) & 0xff; - - /* modify the target for boot data segment */ - dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); - dst8 = (u_int8_t *) (dst16 + 1); - *dst16 = (u_int) boot_address & 0xffff; - *dst8 = ((u_int) boot_address >> 16) & 0xff; -} /* * This function starts the AP (application processor) identified @@ -750,8 +577,6 @@ start_ap(int apic_id) int vector, ms; int cpus; - POSTCODE(START_AP_POST); - /* calculate the vector */ vector = (boot_address >> 12) & 0xff; @@ -810,50 +635,14 @@ start_ap(int apic_id) 
DELAY(200); /* wait ~200uS */ /* Wait up to 5 seconds for it to start. */ - for (ms = 0; ms < 5000; ms++) { + for (ms = 0; ms < 50; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ - DELAY(1000); + DELAY(100000); } return 0; /* return FAILURE */ } -#ifdef COUNT_XINVLTLB_HITS -u_int xhits_gbl[MAXCPU]; -u_int xhits_pg[MAXCPU]; -u_int xhits_rng[MAXCPU]; -SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, - sizeof(xhits_gbl), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, - sizeof(xhits_pg), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, - sizeof(xhits_rng), "IU", ""); - -u_int ipi_global; -u_int ipi_page; -u_int ipi_range; -u_int ipi_range_size; -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, - 0, ""); - -u_int ipi_masked_global; -u_int ipi_masked_page; -u_int ipi_masked_range; -u_int ipi_masked_range_size; -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, - &ipi_masked_global, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, - &ipi_masked_page, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, - &ipi_masked_range, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, - &ipi_masked_range_size, 0, ""); -#endif /* COUNT_XINVLTLB_HITS */ - /* * Flush the TLB on all other CPU's */ @@ -966,69 +755,49 @@ smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offse void smp_invltlb(void) { - if (smp_started) { + + if (smp_started) smp_tlb_shootdown(IPI_INVLTLB, 0, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_global++; -#endif - } } void smp_invlpg(vm_offset_t 
addr) { - if (smp_started) { + + if (smp_started) smp_tlb_shootdown(IPI_INVLPG, addr, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_page++; -#endif - } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { - if (smp_started) { + + if (smp_started) smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); -#ifdef COUNT_XINVLTLB_HITS - ipi_range++; - ipi_range_size += (addr2 - addr1) / PAGE_SIZE; -#endif - } } void smp_masked_invltlb(u_int mask) { - if (smp_started) { + + if (smp_started) smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_masked_global++; -#endif - } } void smp_masked_invlpg(u_int mask, vm_offset_t addr) { - if (smp_started) { + + if (smp_started) smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_masked_page++; -#endif - } } void smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) { - if (smp_started) { + + if (smp_started) smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); -#ifdef COUNT_XINVLTLB_HITS - ipi_masked_range++; - ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; -#endif - } } diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S index 8f42f6b..ca53a87 100644 --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -1,5 +1,5 @@ -/* - * Copyright (c) 1995, Jack F. Vogel +/*- + * Copyright (c) 2003 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -10,16 +10,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Jack F. Vogel - * 4. 
The name of the developer may be used to endorse or promote products - * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -28,257 +23,214 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * mpboot.s: FreeBSD machine support for the Intel MP Spec - * multiprocessor systems. - * * $FreeBSD$ */ -#include "opt_pmap.h" - #include <machine/asmacros.h> /* miscellaneous asm macros */ -#include <machine/apicreg.h> #include <machine/specialreg.h> #include "assym.s" -#define R(x) ((x)-KERNBASE) - -/* - * this code MUST be enabled here and in mp_machdep.c - * it follows the very early stages of AP boot by placing values in CMOS ram. - * it NORMALLY will never be needed and thus the primitive method for enabling. - * -#define CHECK_POINTS - */ - -#if defined(CHECK_POINTS) && !defined(PC98) - -#define CMOS_REG (0x70) -#define CMOS_DATA (0x71) - -#define CHECKPOINT(A,D) \ - movb $(A),%al ; \ - outb %al,$CMOS_REG ; \ - movb $(D),%al ; \ - outb %al,$CMOS_DATA + .data /* So we can modify it */ -#else - -#define CHECKPOINT(A,D) - -#endif /* CHECK_POINTS */ - - -/* - * the APs enter here from their trampoline code (bootMP, below) - */ - .p2align 4 - -NON_GPROF_ENTRY(MPentry) - CHECKPOINT(0x36, 3) + .p2align 4,0 + .globl mptramp_start +mptramp_start: + .code16 /* - * Enable features on this processor. 
We don't support SMP on - * CPUs older than a Pentium, so we know that we can use the cpuid - * instruction. + * The AP enters here in response to the startup IPI. + * We are in real mode. %cs is the only segment register set. */ - movl $1,%eax - cpuid /* Retrieve features */ - movl %cr4,%eax -#ifndef DISABLE_PSE - testl $CPUID_PSE,%edx - jz 1f - orl $CR4_PSE,%eax /* Enable PSE */ -1: -#endif -#ifndef DISABLE_PG_G - testl $CPUID_PGE,%edx - jz 1f - orl $CR4_PGE,%eax /* Enable PGE */ -1: -#endif - testl $CPUID_VME,%edx - jz 1f - orl $CR4_VME,%eax /* Enable VME */ -1: - movl %eax,%cr4 - - /* Now enable paging mode */ -#ifdef PAE - movl R(IdlePDPT), %eax - movl %eax, %cr3 - movl %cr4, %eax - orl $CR4_PAE, %eax - movl %eax, %cr4 -#else - movl R(IdlePTD), %eax - movl %eax,%cr3 -#endif - movl %cr0,%eax - orl $CR0_PE|CR0_PG,%eax /* enable paging */ - movl %eax,%cr0 /* let the games begin! */ - movl bootSTK,%esp /* boot stack end loc. */ - - pushl $mp_begin /* jump to high mem */ - ret + cli /* make sure no interrupts */ + mov %cs, %ax /* copy %cs to %ds. Remember these */ + mov %ax, %ds /* are offsets rather than selectors */ + mov %ax, %ss /* - * Wait for the booting CPU to signal startup + * Find relocation base and patch the gdt descriptor and ljmp targets */ -mp_begin: /* now running relocated at KERNBASE */ - CHECKPOINT(0x37, 4) - call init_secondary /* load i386 tables */ - -/* - * This is the embedded trampoline or bootstrap that is - * copied into 'real-mode' low memory, it is where the - * secondary processor "wakes up". When it is executed - * the processor will eventually jump into the routine - * MPentry, which resides in normal kernel text above - * 1Meg. -jackv - */ + xorl %ebx,%ebx + mov %cs, %bx + sall $4, %ebx /* %ebx is now our relocation base */ + orl %ebx, lgdt_desc-mptramp_start+2 + orl %ebx, jmp_32-mptramp_start+2 + orl %ebx, jmp_64-mptramp_start+1 - .data - ALIGN_DATA /* just to be sure */ + /* + * Load the descriptor table pointer. 
We'll need it when running + * in 16 bit protected mode. + */ + lgdt lgdt_desc-mptramp_start -BOOTMP1: + /* Enable protected mode */ + movl $CR0_PE, %eax + mov %eax, %cr0 -NON_GPROF_ENTRY(bootMP) - .code16 - cli - CHECKPOINT(0x34, 1) - /* First guarantee a 'clean slate' */ - xorl %eax, %eax - movl %eax, %ebx - movl %eax, %ecx - movl %eax, %edx - movl %eax, %esi - movl %eax, %edi + /* + * Now execute a far jump to turn on protected mode. This + * causes the segment registers to turn into selectors and causes + * %cs to be loaded from the gdt. + * + * The following instruction is: + * ljmpl $bootcode-gdt, $protmode-mptramp_start + * but gas cannot assemble that. And besides, we patch the targets + * in early startup and it's a little clearer what we are patching. + */ +jmp_32: + .byte 0x66 /* size override to 32 bits */ + .byte 0xea /* opcode for far jump */ + .long protmode-mptramp_start /* offset in segment */ + .word bootcode-gdt /* index in gdt for 32 bit code */ - /* set up data segments */ - mov %cs, %ax + /* + * At this point, we are running in 32 bit legacy protected mode. + */ + .code32 +protmode: + mov $bootdata-gdt, %eax mov %ax, %ds - mov %ax, %es - mov %ax, %fs - mov %ax, %gs - mov %ax, %ss - mov $(boot_stk-bootMP), %esp - /* Now load the global descriptor table */ - lgdt MP_GDTptr-bootMP + /* Turn on the PAE and PSE bits for when paging is enabled */ + mov %cr4, %eax + orl $(CR4_PAE | CR4_PSE), %eax + mov %eax, %cr4 - /* Enable protected mode */ - movl %cr0, %eax - orl $CR0_PE, %eax - movl %eax, %cr0 + /* + * Enable EFER.LME so that we get long mode when all the prereqs are + * in place. In this case, it turns on when CR0_PG is finally enabled. + * Pick up a few other EFER bits that we'll need while we're here. + */ + movl $MSR_EFER, %ecx + rdmsr + orl $EFER_LME | EFER_SCE | EFER_NXE, %eax + wrmsr /* - * make intrasegment jump to flush the processor pipeline and - * reload CS register + * Point to the embedded page tables for startup. 
Note that this + * only gets accessed after we're actually in 64 bit mode, however + * we can only set the bottom 32 bits of %cr3 in this state. This + * means we are required to use a temporary page table that is below + * the 4GB limit. %ebx is still our relocation base. We could just + * subtract 3 * PAGE_SIZE, but that would be too easy. */ - pushl $0x18 - pushl $(protmode-bootMP) - lretl + leal mptramp_pagetables-mptramp_start(%ebx),%eax + movl (%eax), %eax + mov %eax, %cr3 - .code32 -protmode: - CHECKPOINT(0x35, 2) + /* + * Finally, switch to long mode by enabling paging. We have + * to be very careful here because all the segmentation disappears + * out from underneath us. The spec says we can depend on the + * subsequent pipelined branch to execute, but *only if* everything + * is still identity mapped. If any mappings change, the pipeline + * will flush. + */ + mov %cr0, %eax + orl $CR0_PG, %eax + mov %eax, %cr0 /* - * we are NOW running for the first time with %eip - * having the full physical address, BUT we still - * are using a segment descriptor with the origin - * not matching the booting kernel. + * At this point paging is enabled, and we are in "compatibility" mode. + * We do another far jump to reload %cs with the 64 bit selector. + * %cr3 points to a 4-level page table page. + * We cannot yet jump all the way to the kernel because we can only + * specify a 32 bit linear address. So, yet another trampoline. * - * SO NOW... for the BIG Jump into kernel's segment - * and physical text above 1 Meg. + * The following instruction is: + * ljmp $kernelcode-gdt, $tramp_64-mptramp_start + * but gas cannot assemble that. And besides, we patch the targets + * in early startup and it's a little clearer what we are patching. 
*/ - mov $0x10, %ebx - movw %bx, %ds - movw %bx, %es - movw %bx, %fs - movw %bx, %gs - movw %bx, %ss - - .globl bigJump -bigJump: - /* this will be modified by mpInstallTramp() */ - ljmp $0x08, $0 /* far jmp to MPentry() */ - -dead: hlt /* We should never get here */ - jmp dead - -/* - * MP boot strap Global Descriptor Table - */ - .p2align 4 - .globl MP_GDT - .globl bootCodeSeg - .globl bootDataSeg -MP_GDT: - -nulldesc: /* offset = 0x0 */ +jmp_64: + .byte 0xea /* opcode for far jump */ + .long tramp_64-mptramp_start /* offset in segment */ + .word kernelcode-gdt /* index in gdt for 64 bit code */ - .word 0x0 - .word 0x0 - .byte 0x0 - .byte 0x0 - .byte 0x0 - .byte 0x0 - -kernelcode: /* offset = 0x08 */ - - .word 0xffff /* segment limit 0..15 */ - .word 0x0000 /* segment base 0..15 */ - .byte 0x0 /* segment base 16..23; set for 0K */ - .byte 0x9f /* flags; Type */ - .byte 0xcf /* flags; Limit */ - .byte 0x0 /* segment base 24..32 */ - -kerneldata: /* offset = 0x10 */ - - .word 0xffff /* segment limit 0..15 */ - .word 0x0000 /* segment base 0..15 */ - .byte 0x0 /* segment base 16..23; set for 0k */ - .byte 0x93 /* flags; Type */ - .byte 0xcf /* flags; Limit */ - .byte 0x0 /* segment base 24..32 */ + /* + * Yeehar! We're running in 64 bit mode! We can mostly ignore our + * segment registers, and get on with it. + * Note that we are running at the correct virtual address, but with + * a 1:1 1GB mirrored mapping over entire address space. We had better + * switch to a real %cr3 promptly so that we can get to the direct map + * space. Remember that jmp is relative and that we've been relocated, + * so use an indirect jump. 
+ */ + .code64 +tramp_64: + movabsq $entry_64,%rax /* 64 bit immediate load */ + jmp *%rax -bootcode: /* offset = 0x18 */ + .p2align 4,0 +gdt: + /* + * All segment descriptor tables start with a null descriptor + */ + .long 0x00000000 + .long 0x00000000 - .word 0xffff /* segment limit 0..15 */ -bootCodeSeg: /* this will be modified by mpInstallTramp() */ - .word 0x0000 /* segment base 0..15 */ - .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ - .byte 0x9e /* flags; Type */ - .byte 0xcf /* flags; Limit */ - .byte 0x0 /*segment base 24..32 */ + /* + * This is the 64 bit long mode code descriptor. There is no + * 64 bit data descriptor. + */ +kernelcode: + .long 0x00000000 + .long 0x00209800 -bootdata: /* offset = 0x20 */ + /* + * This is the descriptor for the 32 bit boot code. + * %cs: +A, +R, -C, DPL=0, +P, +D, +G + * Accessed, Readable, Present, 32 bit, 4G granularity + */ +bootcode: + .long 0x0000ffff + .long 0x00cf9b00 - .word 0xffff -bootDataSeg: /* this will be modified by mpInstallTramp() */ - .word 0x0000 /* segment base 0..15 */ - .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ - .byte 0x92 - .byte 0xcf - .byte 0x0 + /* + * This is the descriptor for the 32 bit boot data. + * We load it into %ds and %ss. The bits for each selector + * are interpreted slightly differently. + * %ds: +A, +W, -E, DPL=0, +P, +D, +G + * %ss: +A, +W, -E, DPL=0, +P, +B, +G + * Accessed, Writeable, Expand up, Present, 32 bit, 4GB + * For %ds, +D means 'default operand size is 32 bit'. + * For %ss, +B means the stack register is %esp rather than %sp. + */ +bootdata: + .long 0x0000ffff + .long 0x00cf9300 -/* - * GDT pointer for the lgdt call - */ - .globl mp_gdtbase +gdtend: -MP_GDTptr: -mp_gdtlimit: - .word 0x0028 -mp_gdtbase: /* this will be modified by mpInstallTramp() */ + /* + * The address of our page table pages that the boot code + * uses to trampoline up to kernel address space. 
+ */ + .globl mptramp_pagetables +mptramp_pagetables: .long 0 - .space 0x100 /* space for boot_stk - 1st temporary stack */ -boot_stk: + /* + * The pseudo descriptor for lgdt to use. + */ +lgdt_desc: + .word gdtend-gdt /* Length */ + .long gdt-mptramp_start /* Offset plus %ds << 4 */ + + .globl mptramp_end +mptramp_end: -BOOTMP2: - .globl bootMP_size -bootMP_size: - .long BOOTMP2 - BOOTMP1 + /* + * From here on down is executed in the kernel .text section. + * + * Load a real %cr3 that has all the direct map stuff and switches + * off the 1GB replicated mirror. Load a stack pointer and jump + * into AP startup code in C. + */ + .text + .code64 + .p2align 4,0 +entry_64: + movq KPML4phys, %rax + movq %rax, %cr3 + movq bootSTK, %rsp + jmp init_secondary diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 908e65a..f0a9883 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -27,7 +27,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_mptable_force_htt.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> @@ -57,13 +56,8 @@ __FBSDID("$FreeBSD$"); #define NAPICID 32 /* Max number of I/O APIC's */ -#ifdef PC98 -#define BIOS_BASE (0xe8000) -#define BIOS_SIZE (0x18000) -#else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) -#endif #define BIOS_COUNT (BIOS_SIZE/4) typedef void mptable_entry_handler(u_char *entry, void *arg); @@ -226,11 +220,12 @@ static int mptable_probe(void) { int x; - u_long segment; + u_int32_t segment; u_int32_t target; /* see if EBDA exists */ - if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { + segment = (u_int32_t) *(u_short *)(KERNBASE + 0x40e); + if (segment != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) diff --git a/sys/amd64/amd64/nexus.c b/sys/amd64/amd64/nexus.c index 952ceaf..1fab16b 100644 --- a/sys/amd64/amd64/nexus.c +++ b/sys/amd64/amd64/nexus.c @@ -50,6 +50,7 @@ 
__FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/module.h> #include <machine/bus.h> +#include <machine/intr_machdep.h> #include <sys/rman.h> #include <sys/interrupt.h> @@ -64,8 +65,6 @@ __FBSDID("$FreeBSD$"); #include <isa/isavar.h> #include <amd64/isa/isa.h> #endif -#include <amd64/isa/icu.h> -#include <amd64/isa/intr_machdep.h> #include <sys/rtprio.h> static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device"); @@ -156,14 +155,11 @@ nexus_probe(device_t dev) * multi-ISA-bus systems. PCI interrupts are routed to the ISA * component, so in a way, PCI can be a partial child of an ISA bus(!). * APIC interrupts are global though. - * - * XXX We depend on the AT PIC driver correctly claiming IRQ 2 - * to prevent its reuse elsewhere. */ irq_rman.rm_start = 0; irq_rman.rm_type = RMAN_ARRAY; irq_rman.rm_descr = "Interrupt request lines"; - irq_rman.rm_end = 15; + irq_rman.rm_end = NUM_IO_INTS - 1; if (rman_init(&irq_rman) || rman_manage_region(&irq_rman, irq_rman.rm_start, irq_rman.rm_end)) @@ -428,7 +424,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, if (error) return (error); - error = inthand_add(device_get_nameunit(child), irq->r_start, + error = intr_add_handler(device_get_nameunit(child), irq->r_start, ihand, arg, flags, cookiep); return (error); @@ -437,7 +433,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, static int nexus_teardown_intr(device_t dev, device_t child, struct resource *r, void *ih) { - return (inthand_remove(ih)); + return (intr_remove_handler(ih)); } static int diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 4e4c124..b2f0c18 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -118,6 +118,9 @@ __FBSDID("$FreeBSD$"); #include <sys/user.h> #include <sys/vmmeter.h> #include <sys/sysctl.h> +#ifdef SMP +#include <sys/smp.h> +#endif #include <vm/vm.h> #include <vm/vm_param.h> @@ -134,6 +137,9 @@ __FBSDID("$FreeBSD$"); #include <machine/cputypes.h> #include 
<machine/md_var.h> #include <machine/specialreg.h> +#ifdef SMP +#include <machine/smp.h> +#endif #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC @@ -163,6 +169,11 @@ struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; +#ifdef LAZY_SWITCH +#ifdef SMP +static struct mtx lazypmap_lock; +#endif +#endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ @@ -477,6 +488,11 @@ pmap_bootstrap(firstaddr) kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); +#ifdef LAZY_SWITCH +#ifdef SMP + mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); +#endif +#endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); @@ -630,8 +646,121 @@ pmap_track_modified(vm_offset_t va) return 0; } +#ifdef SMP /* - * Normal invalidation functions. + * For SMP, these functions have to use the IPI mechanism for coherence. + */ +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + u_int cpumask; + u_int other_cpus; + + if (smp_started) { + if (!(read_rflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_tlb_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. 
+ * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invlpg(va); + smp_invlpg(va); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + if (smp_started) + mtx_unlock_spin(&smp_tlb_mtx); + else + critical_exit(); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + if (smp_started) { + if (!(read_rflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_tlb_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + if (smp_started) + mtx_unlock_spin(&smp_tlb_mtx); + else + critical_exit(); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + u_int cpumask; + u_int other_cpus; + + if (smp_started) { + if (!(read_rflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_tlb_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. 
+ * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invltlb(); + smp_invltlb(); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + if (smp_started) + mtx_unlock_spin(&smp_tlb_mtx); + else + critical_exit(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void @@ -659,6 +788,7 @@ pmap_invalidate_all(pmap_t pmap) if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } +#endif /* !SMP */ /* * Are we current address space or kernel? @@ -1208,6 +1338,93 @@ retry: * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef LAZY_SWITCH +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
+ */ +static u_int *lazymask; +static register_t lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&lazypmap_lock); + lazyptd = vtophys(pmap->pm_pml4); + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_long((u_long *)&lazymask, + (u_long)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&lazypmap_lock); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_long cr3; + + cr3 = vtophys(pmap->pm_pml4); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); + } +} +#endif /* SMP */ +#endif + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
@@ -1222,6 +1439,9 @@ pmap_release(pmap_t pmap) ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); +#ifdef LAZY_SWITCH + pmap_lazyfix(pmap); +#endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); @@ -2777,12 +2997,21 @@ void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; - pmap_t pmap; + pmap_t pmap, oldpmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); + oldpmap = PCPU_GET(curpmap); +#ifdef SMP +if (oldpmap) /* XXX FIXME */ + atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); +#else +if (oldpmap) /* XXX FIXME */ + oldpmap->pm_active &= ~PCPU_GET(cpumask); pmap->pm_active |= PCPU_GET(cpumask); +#endif cr3 = vtophys(pmap->pm_pml4); /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 94d7bba..2c0ddf1 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -37,10 +37,21 @@ #include "opt_ddb.h" #include <machine/asmacros.h> +#include <machine/intr_machdep.h> #include <machine/pmap.h> #include "assym.s" + ALIGN_DATA + .globl intrcnt, eintrcnt +intrcnt: + .space INTRCNT_COUNT * 4 +eintrcnt: + + .globl intrnames, eintrnames +intrnames: + .space INTRCNT_COUNT * (MAXCOMLEN + 1) +eintrnames: .text @@ -302,6 +313,9 @@ ENTRY(casuptr) ja fusufault movq %rsi, %rax /* old */ +#ifdef SMP + lock +#endif cmpxchgq %rdx, (%rdi) /* new = %rdx */ /* diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 65ae8f9..0e6b95b 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -80,13 +80,14 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_extern.h> #include <machine/cpu.h> +#include <machine/intr_machdep.h> #include <machine/md_var.h> #include <machine/pcb.h> +#ifdef SMP +#include <machine/smp.h> +#endif #include <machine/tss.h> -#include <amd64/isa/icu.h> -#include 
<amd64/isa/intr_machdep.h> - #include <ddb/ddb.h> extern void trap(struct trapframe frame); @@ -564,6 +565,11 @@ trap_fatal(frame, eva) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); +#ifdef SMP + /* two separate prints in case of a trap on an unmapped page */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); + printf("apic id = %02x\n", PCPU_GET(apic_id)); +#endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s, %s\n", @@ -631,6 +637,11 @@ void dblfault_handler() { printf("\nFatal double fault\n"); +#ifdef SMP + /* two separate prints in case of a trap on an unmapped page */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); + printf("apic id = %02x\n", PCPU_GET(apic_id)); +#endif panic("double fault"); } diff --git a/sys/amd64/amd64/tsc.c b/sys/amd64/amd64/tsc.c index 5485511..6a5b17c 100644 --- a/sys/amd64/amd64/tsc.c +++ b/sys/amd64/amd64/tsc.c @@ -77,14 +77,26 @@ init_TSC(void) tsc_freq = tscval[1] - tscval[0]; if (bootverbose) printf("TSC clock: %lu Hz\n", tsc_freq); - - return; } + void init_TSC_tc(void) { +#ifdef SMP + /* + * We can not use the TSC in SMP mode unless the TSCs on all CPUs + * are somehow synchronized. Some hardware configurations do + * this, but we have no way of determining whether this is the + * case, so we do not use the TSC in multi-processor systems + * unless the user indicated (by setting kern.timecounter.smp_tsc + * to 1) that he believes that his TSCs are synchronized. 
+ */ + if (mp_ncpus > 1 && !smp_tsc) + tsc_timecounter.tc_quality = -100; +#endif + if (tsc_freq != 0 && !tsc_is_broken) { tsc_timecounter.tc_frequency = tsc_freq; tc_init(&tsc_timecounter); diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 9b6bc1f..c4f583f 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/mutex.h> #include <sys/sf_buf.h> +#include <sys/smp.h> #include <sys/sysctl.h> #include <sys/unistd.h> @@ -80,6 +81,11 @@ __FBSDID("$FreeBSD$"); #include <amd64/isa/isa.h> static void cpu_reset_real(void); +#ifdef SMP +static void cpu_reset_proxy(void); +static u_int cpu_reset_proxyid; +static volatile u_int cpu_reset_proxy_active; +#endif static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) @@ -336,10 +342,69 @@ cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku) * Force reset the processor by invalidating the entire address space! 
*/ +#ifdef SMP +static void +cpu_reset_proxy() +{ + + cpu_reset_proxy_active = 1; + while (cpu_reset_proxy_active == 1) + ; /* Wait for other cpu to see that we've started */ + stop_cpus((1<<cpu_reset_proxyid)); + printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); + DELAY(1000000); + cpu_reset_real(); +} +#endif + void cpu_reset() { +#ifdef SMP + if (smp_active == 0) { + cpu_reset_real(); + /* NOTREACHED */ + } else { + + u_int map; + int cnt; + printf("cpu_reset called on cpu#%d\n", PCPU_GET(cpuid)); + + map = PCPU_GET(other_cpus) & ~ stopped_cpus; + + if (map != 0) { + printf("cpu_reset: Stopping other CPUs\n"); + stop_cpus(map); /* Stop all other CPUs */ + } + + if (PCPU_GET(cpuid) == 0) { + DELAY(1000000); + cpu_reset_real(); + /* NOTREACHED */ + } else { + /* We are not BSP (CPU #0) */ + + cpu_reset_proxyid = PCPU_GET(cpuid); + cpustop_restartfunc = cpu_reset_proxy; + cpu_reset_proxy_active = 0; + printf("cpu_reset: Restarting BSP\n"); + started_cpus = (1<<0); /* Restart CPU #0 */ + + cnt = 0; + while (cpu_reset_proxy_active == 0 && cnt < 10000000) + cnt++; /* Wait for BSP to announce restart */ + if (cpu_reset_proxy_active == 0) + printf("cpu_reset: Failed to restart BSP\n"); + enable_intr(); + cpu_reset_proxy_active = 2; + + while (1); + /* NOTREACHED */ + } + } +#else cpu_reset_real(); +#endif } static void |