diff options
author | jasone <jasone@FreeBSD.org> | 2000-09-07 01:33:02 +0000 |
---|---|---|
committer | jasone <jasone@FreeBSD.org> | 2000-09-07 01:33:02 +0000 |
commit | 769e0f974d8929599ba599ac496510fffc90ff34 (patch) | |
tree | 9387522900085835de81e7830e570ef3f6b3ea80 /sys/amd64/amd64 | |
parent | acf1927de02afda4855ec278b1128fd9446405ea (diff) | |
download | FreeBSD-src-769e0f974d8929599ba599ac496510fffc90ff34.zip FreeBSD-src-769e0f974d8929599ba599ac496510fffc90ff34.tar.gz |
Major update to the way synchronization is done in the kernel. Highlights
include:

* Mutual exclusion is used instead of spl*(). See mutex(9). (Note: The
  alpha port is still in transition and currently uses both.)
* Per-CPU idle processes.
* Interrupts are run in their own separate kernel threads and can be
  preempted (i386 only).

Partially contributed by: BSDi (BSD/OS)
Submissions by (at least): cp, dfr, dillon, grog, jake, jhb, sheldonh
Diffstat (limited to 'sys/amd64/amd64')
-rw-r--r-- | sys/amd64/amd64/amd64-gdbstub.c | 10 | ||||
-rw-r--r-- | sys/amd64/amd64/apic_vector.S | 132 | ||||
-rw-r--r-- | sys/amd64/amd64/autoconf.c | 8 | ||||
-rw-r--r-- | sys/amd64/amd64/cpu_switch.S | 269 | ||||
-rw-r--r-- | sys/amd64/amd64/exception.S | 41 | ||||
-rw-r--r-- | sys/amd64/amd64/exception.s | 41 | ||||
-rw-r--r-- | sys/amd64/amd64/fpu.c | 18 | ||||
-rw-r--r-- | sys/amd64/amd64/genassym.c | 27 | ||||
-rw-r--r-- | sys/amd64/amd64/identcpu.c | 3 | ||||
-rw-r--r-- | sys/amd64/amd64/initcpu.c | 6 | ||||
-rw-r--r-- | sys/amd64/amd64/legacy.c | 32 | ||||
-rw-r--r-- | sys/amd64/amd64/locore.S | 3 | ||||
-rw-r--r-- | sys/amd64/amd64/locore.s | 3 | ||||
-rw-r--r-- | sys/amd64/amd64/machdep.c | 37 | ||||
-rw-r--r-- | sys/amd64/amd64/mp_machdep.c | 88 | ||||
-rw-r--r-- | sys/amd64/amd64/mpboot.S | 36 | ||||
-rw-r--r-- | sys/amd64/amd64/mptable.c | 88 | ||||
-rw-r--r-- | sys/amd64/amd64/nexus.c | 32 | ||||
-rw-r--r-- | sys/amd64/amd64/pmap.c | 2 | ||||
-rw-r--r-- | sys/amd64/amd64/swtch.s | 269 | ||||
-rw-r--r-- | sys/amd64/amd64/trap.c | 391 | ||||
-rw-r--r-- | sys/amd64/amd64/tsc.c | 155 | ||||
-rw-r--r-- | sys/amd64/amd64/vm_machdep.c | 51 |
23 files changed, 737 insertions, 1005 deletions
diff --git a/sys/amd64/amd64/amd64-gdbstub.c b/sys/amd64/amd64/amd64-gdbstub.c index 986b8d4..b442a37 100644 --- a/sys/amd64/amd64/amd64-gdbstub.c +++ b/sys/amd64/amd64/amd64-gdbstub.c @@ -188,7 +188,8 @@ getpacket (char *buffer) unsigned char ch; int s; - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* wait around for the start character, ignore all other characters */ @@ -239,7 +240,7 @@ getpacket (char *buffer) } } while (checksum != xmitcsum); - splx (s); + write_eflags(s); } /* send the packet in buffer. */ @@ -253,7 +254,8 @@ putpacket (char *buffer) int s; /* $<packet info>#<checksum>. */ - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* @@ -285,7 +287,7 @@ putpacket (char *buffer) putDebugChar (hexchars[checksum & 0xf]); } while ((getDebugChar () & 0x7f) != '+'); - splx (s); + write_eflags(s); } static char remcomInBuffer[BUFMAX]; diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 2a7559d..54bf003 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -17,7 +17,7 @@ /* - * Macros for interrupt interrupt entry, call to handler, and exit. + * Macros for interrupt entry, call to handler, and exit. */ #define FAST_INTR(irq_num, vec_name) \ @@ -121,7 +121,7 @@ IDTVEC(vec_name) ; \ /* - * Test to see if the source is currntly masked, clear if so. + * Test to see if the source is currently masked, clear if so. */ #define UNMASK_IRQ(irq_num) \ IMASK_LOCK ; /* into critical reg */ \ @@ -200,7 +200,16 @@ log_intr_event: #else #define APIC_ITRACE(name, irq_num, id) #endif - + +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. 
+ */ #define INTR(irq_num, vec_name, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -216,87 +225,24 @@ IDTVEC(vec_name) ; \ maybe_extra_ipending ; \ ; \ APIC_ITRACE(apic_itrace_enter, irq_num, APIC_ITRACE_ENTER) ; \ - lock ; /* MP-safe */ \ - btsl $(irq_num), iactive ; /* lazy masking */ \ - jc 1f ; /* already active */ \ ; \ MASK_LEVEL_IRQ(irq_num) ; \ EOI_IRQ(irq_num) ; \ 0: ; \ - APIC_ITRACE(apic_itrace_tryisrlock, irq_num, APIC_ITRACE_TRYISRLOCK) ;\ - MP_TRYLOCK ; /* XXX this is going away... */ \ - testl %eax, %eax ; /* did we get it? */ \ - jz 3f ; /* no */ \ -; \ - APIC_ITRACE(apic_itrace_gotisrlock, irq_num, APIC_ITRACE_GOTISRLOCK) ;\ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 2f ; /* this INT masked */ \ -; \ incb _intr_nesting_level ; \ ; \ /* entry point used by doreti_unpend for HWIs. */ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX avoid dbl cnt */ \ - lock ; incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4, %eax ; \ - lock ; incl (%eax) ; \ -; \ - movl _cpl, %eax ; \ - pushl %eax ; \ - orl _intr_mask + (irq_num) * 4, %eax ; \ - movl %eax, _cpl ; \ - lock ; \ - andl $~IRQ_BIT(irq_num), _ipending ; \ -; \ - pushl _intr_unit + (irq_num) * 4 ; \ + pushl $irq_num; /* pass the IRQ */ \ APIC_ITRACE(apic_itrace_enter2, irq_num, APIC_ITRACE_ENTER2) ; \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ APIC_ITRACE(apic_itrace_leave, irq_num, APIC_ITRACE_LEAVE) ; \ ; \ - lock ; andl $~IRQ_BIT(irq_num), iactive ; \ - UNMASK_IRQ(irq_num) ; \ - APIC_ITRACE(apic_itrace_unmask, irq_num, APIC_ITRACE_UNMASK) ; \ - sti ; /* doreti repeats cli/sti */ \ MEXITCOUNT ; \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -1: ; /* active */ \ - APIC_ITRACE(apic_itrace_active, irq_num, APIC_ITRACE_ACTIVE) ; \ - MASK_IRQ(irq_num) ; \ - EOI_IRQ(irq_num) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - lock ; \ - btsl $(irq_num), iactive ; /* 
still active */ \ - jnc 0b ; /* retry */ \ - POP_FRAME ; \ - iret ; /* XXX: iactive bit might be 0 now */ \ - ALIGN_TEXT ; \ -2: ; /* masked by cpl, leave iactive set */ \ - APIC_ITRACE(apic_itrace_masked, irq_num, APIC_ITRACE_MASKED) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - MP_RELLOCK ; \ - POP_FRAME ; \ - iret ; \ - ALIGN_TEXT ; \ -3: ; /* other cpu has isr lock */ \ - APIC_ITRACE(apic_itrace_noisrlock, irq_num, APIC_ITRACE_NOISRLOCK) ;\ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 4f ; /* this INT masked */ \ - call forward_irq ; /* forward irq to lock holder */ \ - POP_FRAME ; /* and return */ \ - iret ; \ - ALIGN_TEXT ; \ -4: ; /* blocked */ \ - APIC_ITRACE(apic_itrace_masked2, irq_num, APIC_ITRACE_MASKED2) ;\ - POP_FRAME ; /* and return */ \ - iret + jmp doreti_next /* * Handle "spurious INTerrupts". @@ -434,20 +380,10 @@ _Xcpuast: FAKE_MCOUNT(13*4(%esp)) - /* - * Giant locks do not come cheap. - * A lot of cycles are going to be wasted here. - */ - call _get_mplock - - movl _cpl, %eax - pushl %eax orl $AST_PENDING, _astpending /* XXX */ incb _intr_nesting_level sti - pushl $0 - movl _cpuid, %eax lock btrl %eax, _checkstate_pending_ast @@ -461,7 +397,7 @@ _Xcpuast: lock incl CNAME(cpuast_cnt) MEXITCOUNT - jmp _doreti + jmp doreti_next 1: /* We are already in the process of delivering an ast for this CPU */ POP_FRAME @@ -487,40 +423,24 @@ _Xforward_irq: FAKE_MCOUNT(13*4(%esp)) - MP_TRYLOCK - testl %eax,%eax /* Did we get the lock ? 
*/ - jz 1f /* No */ - lock incl CNAME(forward_irq_hitcnt) cmpb $4, _intr_nesting_level - jae 2f + jae 1f - movl _cpl, %eax - pushl %eax incb _intr_nesting_level sti - pushl $0 - MEXITCOUNT - jmp _doreti /* Handle forwarded interrupt */ + jmp doreti_next /* Handle forwarded interrupt */ 1: lock - incl CNAME(forward_irq_misscnt) - call forward_irq /* Oops, we've lost the isr lock */ - MEXITCOUNT - POP_FRAME - iret -2: - lock incl CNAME(forward_irq_toodeepcnt) -3: - MP_RELLOCK MEXITCOUNT POP_FRAME iret +#if 0 /* * */ @@ -532,9 +452,11 @@ forward_irq: cmpl $0, CNAME(forward_irq_enabled) jz 4f +/* XXX - this is broken now, because mp_lock doesn't exist movl _mp_lock,%eax cmpl $FREE_LOCK,%eax jne 1f + */ movl $0, %eax /* Pick CPU #0 if noone has lock */ 1: shrl $24,%eax @@ -559,6 +481,7 @@ forward_irq: jnz 3b 4: ret +#endif /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, @@ -654,6 +577,7 @@ MCOUNT_LABEL(bintr) FAST_INTR(22,fastintr22) FAST_INTR(23,fastintr23) #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, CLKINTR_PENDING) INTR(1,intr1,) INTR(2,intr2,) @@ -728,15 +652,11 @@ _ihandlers: .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - +#if 0 /* active flag for lazy masking */ iactive: .long 0 +#endif #ifdef COUNT_XINVLTLB_HITS .globl _xhits diff --git a/sys/amd64/amd64/autoconf.c b/sys/amd64/amd64/autoconf.c index b209065..4edda4b 100644 --- a/sys/amd64/amd64/autoconf.c +++ b/sys/amd64/amd64/autoconf.c @@ -163,14 +163,6 @@ configure(dummy) * XXX this is slightly misplaced. */ spl0(); - - /* - * Allow lowering of the ipl to the lowest kernel level if we - * panic (or call tsleep() before clearing `cold'). 
No level is - * completely safe (since a panic may occur in a critical region - * at splhigh()), but we want at least bio interrupts to work. - */ - safepri = cpl; } static void diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index c895fef..db56a1b 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -73,189 +73,6 @@ _tlb_flush_count: .long 0 .text -/* - * When no processes are on the runq, cpu_switch() branches to _idle - * to wait for something to come ready. - */ - ALIGN_TEXT - .type _idle,@function -_idle: - xorl %ebp,%ebp - movl %ebp,_switchtime - -#ifdef SMP - - /* when called, we have the mplock, intr disabled */ - /* use our idleproc's "context" */ - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: - /* Keep space for nonexisting return addr, or profiling bombs */ - movl $gd_idlestack_top-4, %ecx - addl %fs:0, %ecx - movl %ecx, %esp - - /* update common_tss.tss_esp0 pointer */ - movl %ecx, _common_tss + TSS_ESP0 - - movl _cpuid, %esi - btrl %esi, _private_tss - jae 1f - - movl $gd_common_tssd, %edi - addl %fs:0, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - * - * NOTE: spl*() may only be called while we hold the MP lock (which - * we do). - */ - call _spl0 - - cli - - /* - * _REALLY_ free the lock, no matter how deep the prior nesting. - * We will recover the nesting on the way out when we have a new - * proc to load. - * - * XXX: we had damn well better be sure we had it before doing this! 
- */ - movl $FREE_LOCK, %eax - movl %eax, _mp_lock - - /* do NOT have lock, intrs disabled */ - .globl idle_loop -idle_loop: - - cmpl $0,_smp_active - jne 1f - cmpl $0,_cpuid - je 1f - jmp 2f - -1: - call _procrunnable - testl %eax,%eax - jnz 3f - - /* - * Handle page-zeroing in the idle loop. Called with interrupts - * disabled and the MP lock released. Inside vm_page_zero_idle - * we enable interrupts and grab the mplock as required. - */ - cmpl $0,_do_page_zero_idle - je 2f - - call _vm_page_zero_idle /* internal locking */ - testl %eax, %eax - jnz idle_loop -2: - - /* enable intrs for a halt */ - movl $0, lapic_tpr /* 1st candidate for an INT */ - call *_hlt_vector /* wait for interrupt */ - cli - jmp idle_loop - - /* - * Note that interrupts must be enabled while obtaining the MP lock - * in order to be able to take IPI's while blocked. - */ -3: - movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ - sti - call _get_mplock - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _rel_mplock - jmp idle_loop - -#else /* !SMP */ - - movl $HIDENAME(tmpstk),%esp -#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) -#if defined(SWTCH_OPTIM_STATS) - incl _swtch_optim_stats -#endif - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: -#endif - - /* update common_tss.tss_esp0 pointer */ - movl %esp, _common_tss + TSS_ESP0 - - movl $0, %esi - btrl %esi, _private_tss - jae 1f - - movl $_common_tssd, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). 
- */ - call _spl0 - - ALIGN_TEXT -idle_loop: - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _vm_page_zero_idle - testl %eax, %eax - jnz idle_loop - call *_hlt_vector /* wait for interrupt */ - jmp idle_loop - -#endif /* SMP */ - -CROSSJUMPTARGET(_idle) - ENTRY(default_halt) sti #ifndef SMP @@ -264,16 +81,23 @@ ENTRY(default_halt) ret /* + * cpu_throw() + */ +ENTRY(cpu_throw) + jmp sw1 + +/* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx + movl %ecx,_prevproc /* if no process to save, don't bother */ testl %ecx,%ecx - je sw1 + jz sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ @@ -299,7 +123,7 @@ ENTRY(cpu_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) - /* test if debug regisers should be saved */ + /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ @@ -319,15 +143,12 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: + /* save sched_lock recursion count */ + movl _sched_lock+MTX_RECURSE,%eax + movl %eax,PCB_SCHEDNEST(%edx) + #ifdef SMP - movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ -#ifdef DIAGNOSTIC - cmpl $FREE_LOCK, %eax /* is it free? */ - je badsw4 /* yes, bad medicine! */ -#endif /* DIAGNOSTIC */ - andl $COUNT_FIELD, %eax /* clear CPU portion */ - movl %eax, PCB_MPNEST(%edx) /* store it */ #endif /* SMP */ #if NNPX > 0 @@ -341,25 +162,33 @@ ENTRY(cpu_switch) 1: #endif /* NNPX > 0 */ - movl $0,_curproc /* out of process */ - - /* save is done, now choose a new process or idle */ + /* save is done, now choose a new process */ sw1: - cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid - CROSSJUMP(je, _idle, jne) /* wind down */ + je 1f + + movl _idleproc, %eax + jmp sw1b 1: #endif + /* + * Choose a new process to schedule. 
chooseproc() returns idleproc + * if it cannot find another process to run. + */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ - testl %eax,%eax - CROSSJUMP(je, _idle, jne) /* if no proc, idle */ + +#ifdef DIAGNOSTIC + testl %eax,%eax /* no process? */ + jz badsw3 /* no, panic */ +#endif +sw1b: movl %eax,%ecx xorl %eax,%eax @@ -456,9 +285,6 @@ sw1a: movl %ecx, _curproc /* into next process */ #ifdef SMP - movl _cpu_lockid, %eax - orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ - movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ @@ -500,7 +326,22 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti + /* + * restore sched_lock recursion count and transfer ownership to + * new process + */ + movl PCB_SCHEDNEST(%edx),%eax + movl %eax,_sched_lock+MTX_RECURSE + + movl _curproc,%eax + movl %eax,_sched_lock+MTX_LOCK + +#ifdef DIAGNOSTIC + pushfl + popl %ecx + testl $0x200, %ecx /* interrupts enabled? */ + jnz badsw6 /* that way madness lies */ +#endif ret CROSSJUMPTARGET(sw1a) @@ -517,15 +358,27 @@ badsw2: call _panic sw0_2: .asciz "cpu_switch: not SRUN" + +badsw3: + pushl $sw0_3 + call _panic + +sw0_3: .asciz "cpu_switch: chooseproc returned NULL" + #endif -#if defined(SMP) && defined(DIAGNOSTIC) -badsw4: - pushl $sw0_4 +#ifdef DIAGNOSTIC +badsw5: + pushl $sw0_5 + call _panic + +sw0_5: .asciz "cpu_switch: interrupts enabled (again)" +badsw6: + pushl $sw0_6 call _panic -sw0_4: .asciz "cpu_switch: do not have lock" -#endif /* SMP && DIAGNOSTIC */ +sw0_6: .asciz "cpu_switch: interrupts enabled" +#endif /* * savectx(pcb) diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index acb8b40..9e77114 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -38,6 +38,7 @@ #include <machine/asmacros.h> #include <machine/ipl.h> #include <machine/lock.h> +#include <machine/mutex.h> #include <machine/psl.h> #include <machine/trap.h> #ifdef SMP @@ -175,20 +176,12 @@ 
IDTVEC(fpu) mov %ax,%fs FAKE_MCOUNT(13*4(%esp)) -#ifdef SMP MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%eax - pushl %eax /* save original cpl */ - pushl $0 /* dummy unit to finish intr frame */ -#else /* SMP */ - movl _cpl,%eax - pushl %eax pushl $0 /* dummy unit to finish intr frame */ - incl _cnt+V_TRAP -#endif /* SMP */ + call __mtx_enter_giant_def call _npx_intr + call __mtx_exit_giant_def incb _intr_nesting_level MEXITCOUNT @@ -205,9 +198,6 @@ IDTVEC(align) * gate (TGT), else disabled if this was an interrupt gate (IGT). * Note that int0x80_syscall is a trap gate. Only page faults * use an interrupt gate. - * - * Note that all calls to MP_LOCK must occur with interrupts enabled - * in order to be able to take IPI's while waiting for the lock. */ SUPERALIGN_TEXT @@ -227,16 +217,12 @@ alltraps_with_regs_pushed: FAKE_MCOUNT(13*4(%esp)) calltrap: FAKE_MCOUNT(_btrap) /* init "from" _btrap -> calltrap */ - MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%ebx /* keep orig. cpl here during trap() */ call _trap /* * Return via _doreti to handle ASTs. Have to change trap frame * to interrupt frame. */ - pushl %ebx /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ incb _intr_nesting_level MEXITCOUNT @@ -274,16 +260,11 @@ IDTVEC(syscall) movl %eax,TF_EFLAGS(%esp) movl $7,TF_ERR(%esp) /* sizeof "lcall 7,0" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? 
*/ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti @@ -312,21 +293,18 @@ IDTVEC(int0x80_syscall) mov %ax,%fs movl $2,TF_ERR(%esp) /* sizeof "int 0x80" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti ENTRY(fork_trampoline) + MTX_EXIT(_sched_lock, %ecx) + sti call _spl0 #ifdef SMP @@ -355,7 +333,6 @@ ENTRY(fork_trampoline) /* * Return via _doreti to handle ASTs. */ - pushl $0 /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ movb $1,_intr_nesting_level MEXITCOUNT diff --git a/sys/amd64/amd64/exception.s b/sys/amd64/amd64/exception.s index acb8b40..9e77114 100644 --- a/sys/amd64/amd64/exception.s +++ b/sys/amd64/amd64/exception.s @@ -38,6 +38,7 @@ #include <machine/asmacros.h> #include <machine/ipl.h> #include <machine/lock.h> +#include <machine/mutex.h> #include <machine/psl.h> #include <machine/trap.h> #ifdef SMP @@ -175,20 +176,12 @@ IDTVEC(fpu) mov %ax,%fs FAKE_MCOUNT(13*4(%esp)) -#ifdef SMP MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%eax - pushl %eax /* save original cpl */ - pushl $0 /* dummy unit to finish intr frame */ -#else /* SMP */ - movl _cpl,%eax - pushl %eax pushl $0 /* dummy unit to finish intr frame */ - incl _cnt+V_TRAP -#endif /* SMP */ + call __mtx_enter_giant_def call _npx_intr + call __mtx_exit_giant_def incb _intr_nesting_level MEXITCOUNT @@ -205,9 +198,6 @@ IDTVEC(align) * gate (TGT), else disabled if this was an interrupt gate (IGT). * Note that int0x80_syscall is a trap gate. Only page faults * use an interrupt gate. 
- * - * Note that all calls to MP_LOCK must occur with interrupts enabled - * in order to be able to take IPI's while waiting for the lock. */ SUPERALIGN_TEXT @@ -227,16 +217,12 @@ alltraps_with_regs_pushed: FAKE_MCOUNT(13*4(%esp)) calltrap: FAKE_MCOUNT(_btrap) /* init "from" _btrap -> calltrap */ - MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%ebx /* keep orig. cpl here during trap() */ call _trap /* * Return via _doreti to handle ASTs. Have to change trap frame * to interrupt frame. */ - pushl %ebx /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ incb _intr_nesting_level MEXITCOUNT @@ -274,16 +260,11 @@ IDTVEC(syscall) movl %eax,TF_EFLAGS(%esp) movl $7,TF_ERR(%esp) /* sizeof "lcall 7,0" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti @@ -312,21 +293,18 @@ IDTVEC(int0x80_syscall) mov %ax,%fs movl $2,TF_ERR(%esp) /* sizeof "int 0x80" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti ENTRY(fork_trampoline) + MTX_EXIT(_sched_lock, %ecx) + sti call _spl0 #ifdef SMP @@ -355,7 +333,6 @@ ENTRY(fork_trampoline) /* * Return via _doreti to handle ASTs. 
*/ - pushl $0 /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ movb $1,_intr_nesting_level MEXITCOUNT diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index 637853e..8610e35 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -245,6 +245,12 @@ npx_probe(dev) setidt(16, probetrap, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(npx_intrno, probeintr, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); npx_idt_probeintr = idt[npx_intrno]; + + /* + * XXX This looks highly bogus, but it appears that npc_probe1 + * needs interrupts enabled. Does this make any difference + * here? + */ enable_intr(); result = npx_probe1(dev); disable_intr(); @@ -797,7 +803,7 @@ npxdna() /* * Record new context early in case frstor causes an IRQ13. */ - npxproc = curproc; + PCPU_SET(npxproc, CURPROC); curpcb->pcb_savefpu.sv_ex_sw = 0; /* * The following frstor may cause an IRQ13 when the state being @@ -834,16 +840,18 @@ npxsave(addr) fnsave(addr); /* fnop(); */ start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); #else /* SMP */ + int intrstate; u_char icu1_mask; u_char icu2_mask; u_char old_icu1_mask; u_char old_icu2_mask; struct gate_descriptor save_idt_npxintr; + intrstate = save_intr(); disable_intr(); old_icu1_mask = inb(IO_ICU1 + 1); old_icu2_mask = inb(IO_ICU2 + 1); @@ -851,12 +859,12 @@ npxsave(addr) outb(IO_ICU1 + 1, old_icu1_mask & ~(IRQ_SLAVE | npx0_imask)); outb(IO_ICU2 + 1, old_icu2_mask & ~(npx0_imask >> 8)); idt[npx_intrno] = npx_idt_probeintr; - enable_intr(); + write_eflags(intrstate); stop_emulating(); fnsave(addr); fnop(); start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); disable_intr(); icu1_mask = inb(IO_ICU1 + 1); /* masks may have changed */ icu2_mask = inb(IO_ICU2 + 1); @@ -866,7 +874,7 @@ npxsave(addr) (icu2_mask & ~(npx0_imask >> 8)) | (old_icu2_mask & (npx0_imask >> 8))); idt[npx_intrno] = save_idt_npxintr; - enable_intr(); /* back to usual state */ + restore_intr(intrstate); /* 
back to previous state */ #endif /* SMP */ } diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index 60accd1..78c6075 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -51,6 +51,10 @@ #include <sys/mount.h> #include <sys/socket.h> #include <sys/resourcevar.h> +/* XXX */ +#ifdef KTR_PERCPU +#include <sys/ktr.h> +#endif #include <machine/frame.h> #include <machine/bootinfo.h> #include <machine/tss.h> @@ -73,6 +77,7 @@ #include <machine/sigframe.h> #include <machine/globaldata.h> #include <machine/vm86.h> +#include <machine/mutex.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -127,9 +132,7 @@ ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); -#ifdef SMP -ASSYM(PCB_MPNEST, offsetof(struct pcb, pcb_mpnest)); -#endif +ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); @@ -170,7 +173,9 @@ ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); +ASSYM(GD_PREVPROC, offsetof(struct globaldata, gd_prevproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); +ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); @@ -178,11 +183,21 @@ ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); ASSYM(GD_ASTPENDING, offsetof(struct globaldata, gd_astpending)); +ASSYM(GD_INTR_NESTING_LEVEL, 
offsetof(struct globaldata, gd_intr_nesting_level)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif +ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); + +/* XXX */ +#ifdef KTR_PERCPU +ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); +ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); +ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); +#endif + #ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); ASSYM(GD_CPU_LOCKID, offsetof(struct globaldata, gd_cpu_lockid)); @@ -211,3 +226,9 @@ ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSE, offsetof(struct mtx, mtx_recurse)); +ASSYM(MTX_SAVEFL, offsetof(struct mtx, mtx_savefl)); + +ASSYM(MTX_UNOWNED, MTX_UNOWNED); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 0e11e2b..71ecd63 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -42,6 +42,7 @@ #include "opt_cpu.h" #include <sys/param.h> +#include <sys/bus.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/sysctl.h> @@ -53,6 +54,8 @@ #include <machine/specialreg.h> #include <machine/md_var.h> +#include <sys/proc.h> +#include <i386/isa/icu.h> #include <i386/isa/intr_machdep.h> #define IDENTBLUE_CYRIX486 0 diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index be86c65..b9395bf 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -607,12 +607,14 @@ void enable_K5_wt_alloc(void) { u_int64_t msr; + int intrstate; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. 
*/ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { + intrstate = save_intr(); disable_intr(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & !(0x10)); @@ -645,7 +647,7 @@ enable_K5_wt_alloc(void) msr=rdmsr(0x83); wrmsr(0x83, msr|0x10); /* enable write allocate */ - enable_intr(); + restore_intr(intrstate); } } @@ -708,7 +710,6 @@ enable_K6_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } void @@ -770,7 +771,6 @@ enable_K6_2_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } #endif /* I585_CPU && CPU_WT_ALLOC */ diff --git a/sys/amd64/amd64/legacy.c b/sys/amd64/amd64/legacy.c index 8a30770..5b6cdbc 100644 --- a/sys/amd64/amd64/legacy.c +++ b/sys/amd64/amd64/legacy.c @@ -68,7 +68,10 @@ #else #include <i386/isa/isa.h> #endif +#include <sys/proc.h> +#include <i386/isa/icu.h> #include <i386/isa/intr_machdep.h> +#include <sys/rtprio.h> static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. 
+ */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. */ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? */ diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index bddd7d5..fa95fb0 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -862,9 +862,6 @@ map_read_write: movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ movl $1, %ecx /* one private pt coming right up */ fillkpt(R(SMPptpa), $PG_RW) - -/* Initialize mp lock to allow early traps */ - movl $1, R(_mp_lock) #endif /* SMP */ /* install a pde for temporary double map of bottom of VA */ diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s index bddd7d5..fa95fb0 100644 --- a/sys/amd64/amd64/locore.s +++ b/sys/amd64/amd64/locore.s @@ -862,9 +862,6 @@ map_read_write: movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ movl $1, %ecx /* one private pt coming right up */ fillkpt(R(SMPptpa), $PG_RW) - -/* Initialize mp lock to allow early traps */ - movl $1, R(_mp_lock) #endif /* SMP */ /* install a pde for temporary double map of bottom of VA */ diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 6edecf0..875c9d5 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -58,6 +58,7 @@ #include <sys/sysproto.h> #include <sys/signalvar.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/linker.h> #include <sys/malloc.h> #include <sys/proc.h> @@ 
-98,10 +99,12 @@ #include <machine/bootinfo.h> #include <machine/ipl.h> #include <machine/md_var.h> +#include <machine/mutex.h> #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */ +#include <machine/globaldata.h> +#include <machine/globals.h> #ifdef SMP #include <machine/smp.h> -#include <machine/globaldata.h> #endif #ifdef PERFMON #include <machine/perfmon.h> @@ -110,6 +113,7 @@ #ifdef OLD_BUS_ARCH #include <i386/isa/isa_device.h> #endif +#include <i386/isa/icu.h> #include <i386/isa/intr_machdep.h> #include <isa/rtc.h> #include <machine/vm86.h> @@ -247,6 +251,11 @@ vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; static struct trapframe proc0_tf; +struct cpuhead cpuhead; + +mtx_t sched_lock; +mtx_t Giant; + #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void @@ -431,6 +440,11 @@ again: bufinit(); vm_pager_bufferinit(); + SLIST_INIT(&cpuhead); + SLIST_INSERT_HEAD(&cpuhead, GLOBALDATA, gd_allcpu); + + mtx_init(&sched_lock, "sched lock", MTX_SPIN); + #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! @@ -1817,11 +1831,6 @@ init386(first) #endif int off; - /* - * Prevent lowering of the ipl if we call tsleep() early. - */ - safepri = cpl; - proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; @@ -1871,6 +1880,10 @@ init386(first) r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); + /* setup curproc so that mutexes work */ + PCPU_SET(curproc, &proc0); + PCPU_SET(prevproc, &proc0); + /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we @@ -1953,7 +1966,7 @@ init386(first) /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; - common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; + common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; @@ -1974,6 +1987,12 @@ init386(first) dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + /* + * We grab Giant during the vm86bios routines, so we need to ensure + * that it is up and running before we use vm86. + */ + mtx_init(&Giant, "Giant", MTX_DEF); + vm86_initialize(); getmemsize(first); @@ -2009,9 +2028,7 @@ init386(first) /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; -#ifdef SMP - proc0.p_addr->u_pcb.pcb_mpnest = 1; -#endif + proc0.p_addr->u_pcb.pcb_schednest = 0; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 61c5ecf..95b5759 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -36,6 +36,7 @@ #endif #include <sys/param.h> +#include <sys/bus.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> @@ -65,6 +66,7 @@ #include <machine/apic.h> #include <machine/atomic.h> #include <machine/cpufunc.h> +#include <machine/mutex.h> #include <machine/mpapic.h> #include <machine/psl.h> #include <machine/segments.h> @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? 
*/ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ found: /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,11 +1915,9 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ -} - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); + s_lock_init(&ap_boot_lock); +} /* * start each AP in our list @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something 
unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. */ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S index d3602d2..9ede02c 100644 --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -114,43 +114,9 @@ mp_begin: /* now running relocated at KERNBASE */ CHECKPOINT(0x39, 6) - /* wait till we can get into the kernel */ - call _boot_get_mplock - - /* Now, let's prepare for some REAL WORK :-) */ + /* Now, let's prepare for some REAL WORK :-) This doesn't return. 
*/ call _ap_init - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,-4(%esp) -2: - cmpl $0, CNAME(smp_started) /* Wait for last AP to be ready */ - jz 2b - call _get_mplock - - /* let her rip! (loads new stack) */ - jmp _cpu_switch - -NON_GPROF_ENTRY(wait_ap) - pushl %ebp - movl %esp, %ebp - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,0(%esp) - movl %eax, 8(%ebp) -1: - cmpl $0, CNAME(smp_started) - jnz 2f - decl %eax - cmpl $0, %eax - jge 1b -2: - call _get_mplock - movl %ebp, %esp - popl %ebp - ret - - /* * This is the embedded trampoline or bootstrap that is * copied into 'real-mode' low memory, it is where the diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 61c5ecf..95b5759 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -36,6 +36,7 @@ #endif #include <sys/param.h> +#include <sys/bus.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> @@ -65,6 +66,7 @@ #include <machine/apic.h> #include <machine/atomic.h> #include <machine/cpufunc.h> +#include <machine/mutex.h> #include <machine/mpapic.h> #include <machine/psl.h> #include <machine/segments.h> @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ found: /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. 
*/ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. - */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,11 +1915,9 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ -} - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); + s_lock_init(&ap_boot_lock); +} /* * start each AP in our list @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; 
gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and 
maximums. */ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/amd64/amd64/nexus.c b/sys/amd64/amd64/nexus.c index 8a30770..5b6cdbc 100644 --- a/sys/amd64/amd64/nexus.c +++ b/sys/amd64/amd64/nexus.c @@ -68,7 +68,10 @@ #else #include <i386/isa/isa.h> #endif +#include <sys/proc.h> +#include <i386/isa/icu.h> #include <i386/isa/intr_machdep.h> +#include <sys/rtprio.h> static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. 
*/ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? */ diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index edae292..7ce9120 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -668,7 +668,7 @@ pmap_pte_quick(pmap, va) * (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V; cpu_invlpg(prv_PADDR1); } - return prv_PADDR1 + ((unsigned) index & (NPTEPG - 1)); + return (unsigned *)(prv_PADDR1 + (index & (NPTEPG - 1))); #else if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; diff --git a/sys/amd64/amd64/swtch.s b/sys/amd64/amd64/swtch.s index c895fef..db56a1b 100644 --- a/sys/amd64/amd64/swtch.s +++ b/sys/amd64/amd64/swtch.s @@ -73,189 +73,6 @@ _tlb_flush_count: .long 0 .text -/* - * When no processes are on the runq, cpu_switch() branches to _idle - * to wait for something to come ready. 
- */ - ALIGN_TEXT - .type _idle,@function -_idle: - xorl %ebp,%ebp - movl %ebp,_switchtime - -#ifdef SMP - - /* when called, we have the mplock, intr disabled */ - /* use our idleproc's "context" */ - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: - /* Keep space for nonexisting return addr, or profiling bombs */ - movl $gd_idlestack_top-4, %ecx - addl %fs:0, %ecx - movl %ecx, %esp - - /* update common_tss.tss_esp0 pointer */ - movl %ecx, _common_tss + TSS_ESP0 - - movl _cpuid, %esi - btrl %esi, _private_tss - jae 1f - - movl $gd_common_tssd, %edi - addl %fs:0, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - * - * NOTE: spl*() may only be called while we hold the MP lock (which - * we do). - */ - call _spl0 - - cli - - /* - * _REALLY_ free the lock, no matter how deep the prior nesting. - * We will recover the nesting on the way out when we have a new - * proc to load. - * - * XXX: we had damn well better be sure we had it before doing this! - */ - movl $FREE_LOCK, %eax - movl %eax, _mp_lock - - /* do NOT have lock, intrs disabled */ - .globl idle_loop -idle_loop: - - cmpl $0,_smp_active - jne 1f - cmpl $0,_cpuid - je 1f - jmp 2f - -1: - call _procrunnable - testl %eax,%eax - jnz 3f - - /* - * Handle page-zeroing in the idle loop. Called with interrupts - * disabled and the MP lock released. Inside vm_page_zero_idle - * we enable interrupts and grab the mplock as required. 
- */ - cmpl $0,_do_page_zero_idle - je 2f - - call _vm_page_zero_idle /* internal locking */ - testl %eax, %eax - jnz idle_loop -2: - - /* enable intrs for a halt */ - movl $0, lapic_tpr /* 1st candidate for an INT */ - call *_hlt_vector /* wait for interrupt */ - cli - jmp idle_loop - - /* - * Note that interrupts must be enabled while obtaining the MP lock - * in order to be able to take IPI's while blocked. - */ -3: - movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ - sti - call _get_mplock - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _rel_mplock - jmp idle_loop - -#else /* !SMP */ - - movl $HIDENAME(tmpstk),%esp -#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) -#if defined(SWTCH_OPTIM_STATS) - incl _swtch_optim_stats -#endif - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: -#endif - - /* update common_tss.tss_esp0 pointer */ - movl %esp, _common_tss + TSS_ESP0 - - movl $0, %esi - btrl %esi, _private_tss - jae 1f - - movl $_common_tssd, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - */ - call _spl0 - - ALIGN_TEXT -idle_loop: - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _vm_page_zero_idle - testl %eax, %eax - jnz idle_loop - call *_hlt_vector /* wait for interrupt */ - jmp idle_loop - -#endif /* SMP */ - -CROSSJUMPTARGET(_idle) - ENTRY(default_halt) sti #ifndef SMP @@ -264,16 +81,23 @@ ENTRY(default_halt) ret /* + * cpu_throw() + */ +ENTRY(cpu_throw) + jmp sw1 + +/* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl _curproc,%ecx + movl %ecx,_prevproc /* if no process to save, don't bother */ testl %ecx,%ecx - je sw1 + jz sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ @@ -299,7 +123,7 @@ ENTRY(cpu_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) - /* test if debug regisers should be saved */ + /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ @@ -319,15 +143,12 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: + /* save sched_lock recursion count */ + movl _sched_lock+MTX_RECURSE,%eax + movl %eax,PCB_SCHEDNEST(%edx) + #ifdef SMP - movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ -#ifdef DIAGNOSTIC - cmpl $FREE_LOCK, %eax /* is it free? */ - je badsw4 /* yes, bad medicine! */ -#endif /* DIAGNOSTIC */ - andl $COUNT_FIELD, %eax /* clear CPU portion */ - movl %eax, PCB_MPNEST(%edx) /* store it */ #endif /* SMP */ #if NNPX > 0 @@ -341,25 +162,33 @@ ENTRY(cpu_switch) 1: #endif /* NNPX > 0 */ - movl $0,_curproc /* out of process */ - - /* save is done, now choose a new process or idle */ + /* save is done, now choose a new process */ sw1: - cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid - CROSSJUMP(je, _idle, jne) /* wind down */ + je 1f + + movl _idleproc, %eax + jmp sw1b 1: #endif + /* + * Choose a new process to schedule. chooseproc() returns idleproc + * if it cannot find another process to run. + */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ - testl %eax,%eax - CROSSJUMP(je, _idle, jne) /* if no proc, idle */ + +#ifdef DIAGNOSTIC + testl %eax,%eax /* no process? 
*/ + jz badsw3 /* no, panic */ +#endif +sw1b: movl %eax,%ecx xorl %eax,%eax @@ -456,9 +285,6 @@ sw1a: movl %ecx, _curproc /* into next process */ #ifdef SMP - movl _cpu_lockid, %eax - orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ - movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ @@ -500,7 +326,22 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti + /* + * restore sched_lock recursion count and transfer ownership to + * new process + */ + movl PCB_SCHEDNEST(%edx),%eax + movl %eax,_sched_lock+MTX_RECURSE + + movl _curproc,%eax + movl %eax,_sched_lock+MTX_LOCK + +#ifdef DIAGNOSTIC + pushfl + popl %ecx + testl $0x200, %ecx /* interrupts enabled? */ + jnz badsw6 /* that way madness lies */ +#endif ret CROSSJUMPTARGET(sw1a) @@ -517,15 +358,27 @@ badsw2: call _panic sw0_2: .asciz "cpu_switch: not SRUN" + +badsw3: + pushl $sw0_3 + call _panic + +sw0_3: .asciz "cpu_switch: chooseproc returned NULL" + #endif -#if defined(SMP) && defined(DIAGNOSTIC) -badsw4: - pushl $sw0_4 +#ifdef DIAGNOSTIC +badsw5: + pushl $sw0_5 + call _panic + +sw0_5: .asciz "cpu_switch: interrupts enabled (again)" +badsw6: + pushl $sw0_6 call _panic -sw0_4: .asciz "cpu_switch: do not have lock" -#endif /* SMP && DIAGNOSTIC */ +sw0_6: .asciz "cpu_switch: interrupts enabled" +#endif /* * savectx(pcb) diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 51de1ac..f32dfae 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -49,10 +49,12 @@ #include "opt_trap.h" #include <sys/param.h> +#include <sys/bus.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/pioctl.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/syscall.h> @@ -76,12 +78,14 @@ #include <machine/cpu.h> #include <machine/ipl.h> #include <machine/md_var.h> +#include <machine/mutex.h> #include <machine/pcb.h> #ifdef SMP #include <machine/smp.h> #endif #include 
<machine/tss.h> +#include <i386/isa/icu.h> #include <i386/isa/intr_machdep.h> #ifdef POWERFAIL_NMI @@ -96,11 +100,14 @@ #include "isa.h" #include "npx.h" +#include <sys/sysctl.h> + int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall2 __P((struct trapframe frame)); +extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); @@ -142,7 +149,7 @@ static char *trap_msg[] = { }; static __inline int userret __P((struct proc *p, struct trapframe *frame, - u_quad_t oticks, int have_mplock)); + u_quad_t oticks, int have_giant)); #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; @@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); static __inline int -userret(p, frame, oticks, have_mplock) +userret(p, frame, oticks, have_giant) struct proc *p; struct trapframe *frame; u_quad_t oticks; - int have_mplock; + int have_giant; { int sig, s; while ((sig = CURSIG(p)) != 0) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } postsig(sig); } @@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock) * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; - } s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); - while ((sig = CURSIG(p)) != 0) + while ((sig = CURSIG(p)) != 0) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } postsig(sig); + } } /* * Charge system time if profiling. 
*/ if (p->p_flag & P_PROFIL) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } addupc_task(p, frame->tf_eip, (u_int)(p->p_sticks - oticks) * psratio); } curpriority = p->p_priority; - return(have_mplock); + return(have_giant); } /* @@ -226,13 +236,20 @@ trap(frame) u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif - if (!(frame.tf_eflags & PSL_I)) { + atomic_add_int(&cnt.v_trap, 1); + + if ((frame.tf_eflags & PSL_I) == 0) { /* - * Buggy application or kernel code has disabled interrupts - * and then trapped. Enabling interrupts now is wrong, but - * it is better than running with interrupts disabled until - * they are accidentally enabled later. + * Buggy application or kernel code has disabled + * interrupts and then trapped. Enabling interrupts + * now is wrong, but it is better than running with + * interrupts disabled until they are accidentally + * enabled later. XXX Consider whether is this still + * correct. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) @@ -252,54 +269,27 @@ trap(frame) eva = 0; if (frame.tf_trapno == T_PAGEFLT) { /* - * For some Cyrix CPUs, %cr2 is clobbered by interrupts. - * This problem is worked around by using an interrupt - * gate for the pagefault handler. We are finally ready - * to read %cr2 and then must reenable interrupts. - * - * XXX this should be in the switch statement, but the - * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the - * flow of control too much for this to be obviously - * correct. + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); - } + } + + mtx_enter(&Giant, MTX_DEF); #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif + type = frame.tf_trapno; code = frame.tf_err; - if (in_vm86call) { - if (frame.tf_eflags & PSL_VM && - (type == T_PROTFLT || type == T_STKFLT)) { - i = vm86_emulate((struct vm86frame *)&frame); - if (i != 0) - /* - * returns to original process - */ - vm86_trap((struct vm86frame *)&frame); - return; - } - switch (type) { - /* - * these traps want either a process context, or - * assume a normal userspace trap. - */ - case T_PROTFLT: - case T_SEGNPFLT: - trap_fatal(&frame, eva); - return; - case T_TRCTRAP: - type = T_BPTFLT; /* kernel breakpoint */ - /* FALL THROUGH */ - } - goto kernel_trap; /* normal kernel trap handling */ - } - - if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + if ((ISPL(frame.tf_cs) == SEL_UPL) || + ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ sticks = p->p_sticks; @@ -322,16 +312,6 @@ restart: i = SIGFPE; break; - case T_ASTFLT: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if (p->p_flag & P_OWEUPC) { - p->p_flag &= ~P_OWEUPC; - addupc_task(p, p->p_stats->p_prof.pr_addr, - p->p_stats->p_prof.pr_ticks); - } - goto out; - /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle @@ -342,7 +322,7 @@ restart: if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) - goto out; + goto user; break; } /* FALL THROUGH */ @@ -357,14 +337,20 @@ restart: case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); - if (i == -1) - return; #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if (i == -2) + if (i == -2) { + /* + * f00f hack workaround has triggered, treat + * as illegal instruction not page fault. 
+ */ + frame.tf_trapno = T_PRIVINFLT; goto restart; + } #endif - if (i == 0) + if (i == -1) goto out; + if (i == 0) + goto user; ucode = T_PAGEFLT; break; @@ -377,7 +363,15 @@ restart: #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI - goto handle_powerfail; +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -391,7 +385,7 @@ restart: kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; @@ -410,9 +404,9 @@ restart: case T_DNA: #if NNPX > 0 - /* if a transparent fault (due to context switch "late") */ + /* transparent fault (due to context switch "late") */ if (npxdna()) - return; + goto out; #endif if (!pmath_emulate) { i = SIGFPE; @@ -422,7 +416,7 @@ restart: i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) - return; + goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } @@ -435,13 +429,12 @@ restart: break; } } else { -kernel_trap: /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); - return; + goto out; case T_DNA: #if NNPX > 0 @@ -451,31 +444,35 @@ kernel_trap: * registered such use. */ if (npxdna()) - return; + goto out; #endif break; - case T_PROTFLT: /* general protection fault */ - case T_SEGNPFLT: /* segment not present fault */ /* - * Invalid segment selectors and out of bounds - * %eip's and %esp's can be set up in user mode. - * This causes a fault in kernel mode when the - * kernel tries to return to user mode. We want - * to get this fault so that we can fix the - * problem here and not have to check all the - * selectors and pointers when the user changes - * them. 
+ * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. */ -#define MAYBE_DORETI_FAULT(where, whereto) \ - do { \ - if (frame.tf_eip == (int)where) { \ - frame.tf_eip = (int)whereto; \ - return; \ - } \ - } while (0) - - if (intr_nesting_level == 0) { + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (in_vm86call) + break; + + if (intr_nesting_level != 0) + break; + /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the @@ -488,20 +485,38 @@ kernel_trap: if (frame.tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; psignal(p, SIGBUS); - return; + goto out; + } + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; } - MAYBE_DORETI_FAULT(doreti_iret, - doreti_iret_fault); - MAYBE_DORETI_FAULT(doreti_popl_ds, - doreti_popl_ds_fault); - MAYBE_DORETI_FAULT(doreti_popl_es, - doreti_popl_es_fault); - MAYBE_DORETI_FAULT(doreti_popl_fs, - doreti_popl_fs_fault); + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; - return; - } + goto out; } break; @@ -517,7 +532,7 @@ kernel_trap: */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; - return; + goto out; } break; @@ -529,7 +544,7 @@ kernel_trap: * silently until the syscall handler has * saved the flags. */ - return; + goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* @@ -537,7 +552,7 @@ kernel_trap: * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; - return; + goto out; } /* * Ignore debug register trace traps due to @@ -549,13 +564,13 @@ kernel_trap: * in kernel space because that is useful when * debugging the kernel. 
*/ - if (user_dbreg_trap()) { + if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); - return; + goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) @@ -567,28 +582,19 @@ kernel_trap: */ #ifdef DDB if (kdb_trap (type, 0, &frame)) - return; + goto out; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI -#ifndef TIMER_FREQ -# define TIMER_FREQ 1193182 -#endif - handle_powerfail: - { - static unsigned lastalert = 0; - - if(time_second - lastalert > 10) - { + if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; - } - return; } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -602,16 +608,16 @@ kernel_trap: kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi == 0) - return; + goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame, eva); - return; + goto out; } /* Translate fault for emulators (e.g. Linux) */ @@ -630,8 +636,10 @@ kernel_trap: } #endif -out: +user: userret(p, &frame, sticks, 1); +out: + mtx_exit(&Giant, MTX_DEF); } #ifdef notyet @@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva) * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { - frame->tf_trapno = T_PRIVINFLT; + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; - } #endif if (usermode) goto nogo; @@ -869,8 +875,7 @@ trap_fatal(frame, eva) frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? 
"user" : "kernel"); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -917,26 +922,6 @@ trap_fatal(frame, eva) } else { printf("Idle\n"); } - printf("interrupt mask = "); - if ((cpl & net_imask) == net_imask) - printf("net "); - if ((cpl & tty_imask) == tty_imask) - printf("tty "); - if ((cpl & bio_imask) == bio_imask) - printf("bio "); - if ((cpl & cam_imask) == cam_imask) - printf("cam "); - if (cpl == 0) - printf("none"); -#ifdef SMP -/** - * XXX FIXME: - * we probably SHOULD have stopped the other CPUs before now! - * another CPU COULD have been touching cpl at this moment... - */ - printf(" <- SMP: XXX"); -#endif - printf("\n"); #ifdef KDB if (kdb_trap(&psl)) @@ -973,8 +958,7 @@ dblfault_handler() printf("esp = 0x%x\n", common_tss.tss_esp); printf("ebp = 0x%x\n", common_tss.tss_ebp); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -1048,12 +1032,14 @@ syscall2(frame) int error; int narg; int args[8]; - int have_mplock = 0; + int have_giant = 0; u_int code; + atomic_add_int(&cnt.v_syscall, 1); + #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { - get_mplock(); + mtx_enter(&Giant, MTX_DEF); panic("syscall"); /* NOT REACHED */ } @@ -1075,9 +1061,9 @@ syscall2(frame) /* * The prep code is not MP aware. */ - get_mplock(); + mtx_enter(&Giant, MTX_DEF); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. 
@@ -1114,8 +1100,8 @@ syscall2(frame) */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); @@ -1129,15 +1115,15 @@ syscall2(frame) * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsyscall(p->p_tracep, code, narg, args); } @@ -1192,9 +1178,9 @@ bad: * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); @@ -1203,13 +1189,13 @@ bad: /* * Handle reschedule and other end-of-syscall issues */ - have_mplock = userret(p, &frame, sticks, have_mplock); + have_giant = userret(p, &frame, sticks, have_giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } @@ -1225,27 +1211,66 @@ bad: /* * Release the MP lock if we had to get it */ - if (have_mplock) - rel_mplock(); + if (have_giant) + mtx_exit(&Giant, MTX_DEF); + + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + +void +ast(frame) + struct trapframe frame; +{ + struct proc *p = CURPROC; + u_quad_t sticks; + + /* + * handle atomicy by looping since interrupts are enabled and the + * MP lock is not held. 
+ */ + sticks = ((volatile struct proc *)p)->p_sticks; + while (sticks != ((volatile struct proc *)p)->p_sticks) + sticks = ((volatile struct proc *)p)->p_sticks; + + astoff(); + atomic_add_int(&cnt.v_soft, 1); + if (p->p_flag & P_OWEUPC) { + mtx_enter(&Giant, MTX_DEF); + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); +} + if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0) + mtx_exit(&Giant, MTX_DEF); } /* * Simplified back end of syscall(), used when returning from fork() - * directly into user mode. MP lock is held on entry and should be - * held on return. + * directly into user mode. Giant is not held on entry, and must not + * be held on return. */ void fork_return(p, frame) struct proc *p; struct trapframe frame; { + int have_giant; + frame.tf_eax = 0; /* Child returns zero */ frame.tf_eflags &= ~PSL_C; /* success */ frame.tf_edx = 1; - userret(p, &frame, 0, 1); + have_giant = userret(p, &frame, 0, mtx_owned(&Giant)); #ifdef KTRACE - if (KTRPOINT(p, KTR_SYSRET)) + if (KTRPOINT(p, KTR_SYSRET)) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } ktrsysret(p->p_tracep, SYS_fork, 0, 0); + } #endif + if (have_giant) + mtx_exit(&Giant, MTX_DEF); } diff --git a/sys/amd64/amd64/tsc.c b/sys/amd64/amd64/tsc.c index 15044ab..724f3c2 100644 --- a/sys/amd64/amd64/tsc.c +++ b/sys/amd64/amd64/tsc.c @@ -54,6 +54,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> +#include <sys/proc.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/kernel.h> @@ -93,10 +94,6 @@ #include <i386/isa/mca_machdep.h> #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include <i386/isa/intr_machdep.h> /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ 
/* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); 
timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. */ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ fail: static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + 
restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. + */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ + if (apic_8254_trial) { + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. 
+ */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); - writertc(RTC_STATUSB, rtc_statusb); + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); -#ifdef APIC_IO - if (apic_8254_trial) { - printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long 
ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. */ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index cfb6cee..831ab3b 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -57,12 +57,14 @@ #include <sys/vnode.h> #include <sys/vmmeter.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/sysctl.h> #include <sys/unistd.h> #include <machine/clock.h> #include <machine/cpu.h> #include <machine/md_var.h> +#include <machine/mutex.h> #ifdef SMP #include <machine/smp.h> #endif @@ -177,9 +179,8 @@ cpu_fork(p1, p2, flags) * pcb2->pcb_onfault: cloned above (always NULL here?). */ -#ifdef SMP - pcb2->pcb_mpnest = 1; -#endif + pcb2->pcb_schednest = 0; + /* * XXX don't copy the i/o pages. this should probably be fixed. 
*/ @@ -256,8 +257,11 @@ cpu_exit(p) reset_dbregs(); pcb->pcb_flags &= ~PCB_DBREGS; } + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH); + mtx_assert(&Giant, MA_NOTOWNED); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -406,17 +410,10 @@ vunmapbuf(bp) static void cpu_reset_proxy() { - u_int saved_mp_lock; cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) - ; /* Wait for other cpu to disable interupts */ - saved_mp_lock = mp_lock; - mp_lock = 1; - printf("cpu_reset_proxy: Grabbed mp lock for BSP\n"); - cpu_reset_proxy_active = 3; - while (cpu_reset_proxy_active == 3) - ; /* Wait for other cpu to enable interrupts */ + ; /* Wait for other cpu to see that we've started */ stop_cpus((1<<cpu_reset_proxyid)); printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); @@ -453,6 +450,7 @@ cpu_reset() cpu_reset_proxyid = cpuid; cpustop_restartfunc = cpu_reset_proxy; + cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); started_cpus = (1<<0); /* Restart CPU #0 */ @@ -461,17 +459,9 @@ cpu_reset() cnt++; /* Wait for BSP to announce restart */ if (cpu_reset_proxy_active == 0) printf("cpu_reset: Failed to restart BSP\n"); - __asm __volatile("cli" : : : "memory"); + enable_intr(); cpu_reset_proxy_active = 2; - cnt = 0; - while (cpu_reset_proxy_active == 2 && cnt < 10000000) - cnt++; /* Do nothing */ - if (cpu_reset_proxy_active == 2) { - printf("cpu_reset: BSP did not grab mp lock\n"); - cpu_reset_real(); /* XXX: Bogus ? 
*/ - } - cpu_reset_proxy_active = 4; - __asm __volatile("sti" : : : "memory"); + while (1); /* NOTREACHED */ } @@ -553,7 +543,7 @@ vm_page_zero_idle() static int free_rover; static int zero_state; vm_page_t m; - int s; + int s, intrsave; /* * Attempt to maintain approximately 1/2 of our free pages in a @@ -569,11 +559,10 @@ vm_page_zero_idle() if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) return(0); -#ifdef SMP - if (try_mplock()) { -#endif + if (mtx_try_enter(&Giant, MTX_DEF)) { s = splvm(); - __asm __volatile("sti" : : : "memory"); + intrsave = save_intr(); + enable_intr(); zero_state = 0; m = vm_page_list_find(PQ_FREE, free_rover, FALSE); if (m != NULL && (m->flags & PG_ZERO) == 0) { @@ -595,14 +584,10 @@ vm_page_zero_idle() } free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); - __asm __volatile("cli" : : : "memory"); -#ifdef SMP - rel_mplock(); -#endif + restore_intr(intrsave); + mtx_exit(&Giant, MTX_DEF); return (1); -#ifdef SMP } -#endif /* * We have to enable interrupts for a moment if the try_mplock fails * in order to potentially take an IPI. XXX this should be in |