diff options
author | kmacy <kmacy@FreeBSD.org> | 2008-09-10 07:11:08 +0000 |
---|---|---|
committer | kmacy <kmacy@FreeBSD.org> | 2008-09-10 07:11:08 +0000 |
commit | e3882f36083324c0d0c64ec0c9c6da6fb061ff78 (patch) | |
tree | 870848c20d88f26564e15790dfdae6a9fbf378cc /sys/i386 | |
parent | 977bb41a581a18041a2937f601ff6c3f0ec32b1f (diff) | |
download | FreeBSD-src-e3882f36083324c0d0c64ec0c9c6da6fb061ff78.zip FreeBSD-src-e3882f36083324c0d0c64ec0c9c6da6fb061ff78.tar.gz |
Get initial bootstrap of APs working under xen.
Note that the APs still blow up in sched_throw().
MFC after: 1 month
Diffstat (limited to 'sys/i386')
-rw-r--r-- | sys/i386/conf/XEN | 1 | ||||
-rw-r--r-- | sys/i386/i386/local_apic.c | 5 | ||||
-rw-r--r-- | sys/i386/i386/machdep.c | 2 | ||||
-rw-r--r-- | sys/i386/include/xen/xenpmap.h | 2 | ||||
-rw-r--r-- | sys/i386/include/xen/xenvar.h | 4 | ||||
-rw-r--r-- | sys/i386/xen/mp_machdep.c | 1095 | ||||
-rw-r--r-- | sys/i386/xen/mptable.c | 130 | ||||
-rw-r--r-- | sys/i386/xen/pmap.c | 2 | ||||
-rw-r--r-- | sys/i386/xen/xen_machdep.c | 39 |
9 files changed, 1251 insertions, 29 deletions
diff --git a/sys/i386/conf/XEN b/sys/i386/conf/XEN index ccde7db..328a214 100644 --- a/sys/i386/conf/XEN +++ b/sys/i386/conf/XEN @@ -63,7 +63,6 @@ options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options STOP_NMI # Stop CPUS using NMI instead of IPI options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing diff --git a/sys/i386/i386/local_apic.c b/sys/i386/i386/local_apic.c index 664b186..5b7c1ea 100644 --- a/sys/i386/i386/local_apic.c +++ b/sys/i386/i386/local_apic.c @@ -1012,7 +1012,6 @@ static struct apic_enumerator *best_enum; void apic_register_enumerator(struct apic_enumerator *enumerator) { -#ifndef XEN #ifdef INVARIANTS struct apic_enumerator *apic_enum; @@ -1023,7 +1022,6 @@ apic_register_enumerator(struct apic_enumerator *enumerator) } #endif SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); -#endif } /* @@ -1108,6 +1106,9 @@ apic_setup_io(void *dummy __unused) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); +#ifdef XEN + return; +#endif /* * Finish setting up the local APIC on the BSP once we know how to * properly program the LINT pins. diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 2a7b9ca..d824b26 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1213,7 +1213,7 @@ void (*cpu_idle_fn)(int) = cpu_idle_acpi; void cpu_idle(int busy) { -#ifdef SMP +#if defined(SMP) && !defined(XEN) if (mp_grab_cpu_hlt()) return; #endif diff --git a/sys/i386/include/xen/xenpmap.h b/sys/i386/include/xen/xenpmap.h index eb1f157..17d1f92 100644 --- a/sys/i386/include/xen/xenpmap.h +++ b/sys/i386/include/xen/xenpmap.h @@ -184,7 +184,7 @@ vptetomachpte(vm_paddr_t *pte) do { \ PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ (_ma), \ - UVMF_INVLPG| UVMF_LOCAL) < 0); \ + UVMF_INVLPG| UVMF_ALL) < 0); \ } while (/*CONSTCOND*/0) #define PT_UPDATES_FLUSH() do { \ diff --git a/sys/i386/include/xen/xenvar.h b/sys/i386/include/xen/xenvar.h index 779a540..402bc8a 100644 --- a/sys/i386/include/xen/xenvar.h +++ b/sys/i386/include/xen/xenvar.h @@ -72,8 +72,8 @@ extern xen_pfn_t *xen_machine_phys; #define PFNTOMFN(i) (xen_phys_machine[(i)]) #define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) -#define VTOP(x) ((uintptr_t)(((uint8_t *)(x)) - KERNBASE)) -#define PTOV(x) ((x) + KERNBASE) +#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) +#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) #define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) #define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c new file mode 100644 index 0000000..01d08af --- /dev/null +++ b/sys/i386/xen/mp_machdep.c @@ -0,0 +1,1095 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * Copyright (c) 2008, by Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_apic.h" +#include "opt_cpu.h" +#include "opt_kstack_pages.h" +#include "opt_mp_watchdog.h" +#include "opt_sched.h" +#include "opt_smp.h" + +#if !defined(lint) +#if !defined(SMP) +#error How did you get here? +#endif + +#ifndef DEV_APIC +#error The apic device is required for SMP, add "device apic" to your config file. +#endif +#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) +#error SMP not supported with CPU_DISABLE_CMPXCHG +#endif +#endif /* not lint */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cons.h> /* cngetc() */ +#ifdef GPROF +#include <sys/gmon.h> +#endif +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#include <machine/apicreg.h> +#include <machine/md_var.h> +#include <machine/mp_watchdog.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/pcpu.h> + + + +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <xen/interface/vcpu.h> + + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#define stop_cpus_with_nmi 0 + + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ + +extern struct pcpu __pcpu[]; + +static int bootAP; +static union descriptor *bootAPgdt; + + +/* Free these after use */ +void *bootstacks[MAXCPU]; + +/* Hotwire a 0->4MB V==P mapping */ +extern pt_entry_t *KPTphys; + +struct pcb stoppcbs[MAXCPU]; + +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; + +static u_int logical_cpus; + +/* used to hold the AP's until we are ready to release them */ +static struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +static volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. + */ +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; +} static cpu_info[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; + +/* Holds pending bitmap based IPIs per CPU */ +static volatile u_int cpu_ipi_pending[MAXCPU]; + +static u_int boot_address; + +static void assign_cpu_ids(void); +static void set_interrupt_apic_ids(void); +int start_all_aps(void); +static int start_ap(int apic_id); +static void release_aps(void *dummy); + +static u_int hyperthreading_cpus; +static cpumask_t hyperthreading_cpus_mask; + +extern void Xhypervisor_callback(void); +extern void failsafe_callback(void); + +struct cpu_group * +cpu_topo(void) +{ + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); + } + /* + * No multi-core or hyper-threaded. + */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); +} + +/* + * Calculate usable address in base memory for AP trampoline code. + */ +u_int +mp_bootaddress(u_int basemem) +{ + + return (basemem); +} + +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id > MAX_APIC_ID) { + panic("SMP: APIC ID %d too high", apic_id); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + if (mp_ncpus < MAXCPU) + mp_ncpus++; + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); +} + +void +cpu_mp_setmaxid(void) +{ + + mp_maxid = MAXCPU - 1; +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + all_cpus = 1; + if (mp_ncpus == 0) { + /* + * No CPUs were found, so this must be a UP system. Setup + * the variables to represent a system with a single CPU + * with an id of 0. + */ + mp_ncpus = 1; + return (0); + } + + /* At least one CPU was found. */ + if (mp_ncpus == 1) { + /* + * One CPU was found, so this must be a UP system with + * an I/O APIC. + */ + return (0); + } + + /* At least two CPUs were found. */ + return (1); +} + +/* + * Initialize the IPI handlers and start up the AP's. + */ +void +cpu_mp_start(void) +{ + int i; + + /* Initialize the logical ID to APIC ID table. */ + for (i = 0; i < MAXCPU; i++) { + cpu_apic_ids[i] = -1; + cpu_ipi_pending[i] = 0; + } + + /* Set boot_cpu_id if needed. */ + if (boot_cpu_id == -1) { + boot_cpu_id = PCPU_GET(apic_id); + cpu_info[boot_cpu_id].cpu_bsp = 1; + } else + KASSERT(boot_cpu_id == PCPU_GET(apic_id), + ("BSP's APIC ID doesn't match boot_cpu_id")); + cpu_apic_ids[0] = boot_cpu_id; + + assign_cpu_ids(); + + /* Start each Application Processor */ + start_all_aps(); + + /* Setup the initial logical CPUs info. */ + logical_cpus = logical_cpus_mask = 0; + if (cpu_feature & CPUID_HTT) + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + + set_interrupt_apic_ids(); +} + + +/* + * Print various information about the SMP system hardware and setup. + */ +void +cpu_mp_announce(void) +{ + int i, x; + + /* List CPUs */ + printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); + for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { + if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) + continue; + if (cpu_info[x].cpu_disabled) + printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); + else { + KASSERT(i < mp_ncpus, + ("mp_ncpus and actual cpus are out of whack")); + printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); + } + } +} + +#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) + +/* + * AP CPU's call this to initialize themselves. + */ +void +init_secondary(void) +{ + vm_offset_t addr; + int gsel_tss; + + + /* bootAP is set in start_ap() to our ID. */ + + + PCPU_SET(currentldt, _default_ldt); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); +#if 0 + gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; +#endif + PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); +#if 0 + PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); + + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); +#endif + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + /* + * Set to a known state: + * Set by mpboot.s: CR0_PG, CR0_PE + * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM + */ + /* + * signal our startup to the BSP. + */ + mp_naps++; + + /* Spin until the BSP releases the AP's. */ + while (!aps_ready) + ia32_pause(); + + /* BSP may have changed PTD while we were waiting */ + invltlb(); + for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) + invlpg(addr); + + /* set up FPU state on the AP */ + npxinit(__INITIAL_NPXCW__); +#if 0 + + /* set up SSE registers */ + enable_sse(); +#endif +#if 0 && defined(PAE) + /* Enable the PTE no-execute bit. */ + if ((amd_feature & AMDID_NX) != 0) { + uint64_t msr; + + msr = rdmsr(MSR_EFER) | EFER_NXE; + wrmsr(MSR_EFER, msr); + } +#endif +#if 0 + /* A quick check from sanity claus */ + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + panic("cpuid mismatch! boom!!"); + } +#endif + + /* Initialize curthread. */ + KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); + PCPU_SET(curthread, PCPU_GET(idlethread)); + + mtx_lock_spin(&ap_boot_mtx); +#if 0 + + /* Init local apic for irq's */ + lapic_setup(1); +#endif + smp_cpus++; + + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); + + /* Determine if we are a logical CPU. */ + if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) + logical_cpus_mask |= PCPU_GET(cpumask); + + /* Determine if we are a hyperthread. */ + if (hyperthreading_cpus > 1 && + PCPU_GET(apic_id) % hyperthreading_cpus != 0) + hyperthreading_cpus_mask |= PCPU_GET(cpumask); + + /* Build our map of 'other' CPUs. */ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); +#if 0 + if (bootverbose) + lapic_dump("AP"); +#endif + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + smp_active = 1; /* historic */ + } + + mtx_unlock_spin(&ap_boot_mtx); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ia32_pause(); + + /* enter the scheduler */ + sched_throw(NULL); + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * We tell the I/O APIC code about all the CPUs we want to receive + * interrupts. If we don't want certain CPUs to receive IRQs we + * can simply not tell the I/O APIC code about them in this function. + * We also do not tell it about the BSP since it tells itself about + * the BSP internally to work with UP kernels and on UP machines. + */ +static void +set_interrupt_apic_ids(void) +{ + u_int i, apic_id; + + for (i = 0; i < MAXCPU; i++) { + apic_id = cpu_apic_ids[i]; + if (apic_id == -1) + continue; + if (cpu_info[apic_id].cpu_bsp) + continue; + if (cpu_info[apic_id].cpu_disabled) + continue; + + /* Don't let hyperthreads service interrupts. */ + if (hyperthreading_cpus > 1 && + apic_id % hyperthreading_cpus != 0) + continue; + + intr_add_cpu(i); + } +} + +/* + * Assign logical CPU IDs to local APICs. + */ +static void +assign_cpu_ids(void) +{ + u_int i; + + /* Check for explicitly disabled CPUs. */ + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) + continue; + + /* Don't use this CPU if it has been disabled by a tunable. */ + if (resource_disabled("lapic", i)) { + cpu_info[i].cpu_disabled = 1; + continue; + } + } + + /* + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 has already been assigned to the BSP, + * so we only have to assign IDs for APs. + */ + mp_ncpus = 1; + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || + cpu_info[i].cpu_disabled) + continue; + + if (mp_ncpus < MAXCPU) { + cpu_apic_ids[mp_ncpus] = i; + mp_ncpus++; + } else + cpu_info[i].cpu_disabled = 1; + } + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * start each AP in our list + */ +/* Lowest 1MB is already mapped: don't touch*/ +#define TMPMAP_START 1 +int +start_all_aps(void) +{ + u_int32_t mpbioswarmvec; + int x,apic_id, cpu; + struct pcpu *pc; + + mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + + /* save the current value of the warm-start vector */ + mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); + + /* set up temporary P==V mapping for AP boot */ + /* XXX this is a hack, we should boot the AP on its own stack/PTD */ + + /* start each AP */ + for (cpu = 1; cpu < mp_ncpus; cpu++) { + apic_id = cpu_apic_ids[cpu]; + + + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); + + bootAP = cpu; + bootAPgdt = gdt + (512*cpu); + + /* Get per-cpu data */ + pc = &__pcpu[bootAP]; + pc->pc_apic_id = cpu_apic_ids[bootAP]; + pc->pc_prvspace = pc; + pc->pc_curthread = 0; + + pcpu_init(pc, bootAP, sizeof(struct pcpu)); + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + + PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW); + bzero(bootAPgdt, PAGE_SIZE); + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); + PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); + + /* attempt to start the Application Processor */ + if (!start_ap(cpu)) { + printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); + /* better panic as the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + + all_cpus |= (1 << cpu); /* record AP in CPU map */ + } + + + /* build our map of 'other' CPUs */ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + + /* restore the warmstart vector */ + *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; + + pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); + + /* number of APs actually started */ + return mp_naps; +} + +extern uint8_t *pcpu_boot_stack; +extern trap_info_t trap_table[]; + +static void +smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + +void +cpu_initialize_context(unsigned int cpu); +extern int nkpt; + +void +cpu_initialize_context(unsigned int cpu) +{ + /* vcpu_guest_context_t is too large to allocate on the stack. + * Hence we allocate statically and protect it with a lock */ + vm_page_t m[4]; + static vcpu_guest_context_t ctxt; + vm_offset_t boot_stack; + vm_paddr_t *va = (vm_paddr_t *)PTOV(IdlePDPT); + vm_paddr_t ma[4]; + static int color; + int i; + + /* + * Page 0: boot stack + * Page 1: PDPT + * Page 2-3: PTD{2-3] + * + */ + for (i = 0; i < 4; i++) { + m[i] = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + + pmap_zero_page(m[i]); + + } + boot_stack = kmem_alloc_nofault(kernel_map, 1); + + /* + * Initialize new IdlePDPT with dedicated page + * for upper 1GB + */ + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[1])); + for (i = 0; i < 4; i++) { + ((vm_paddr_t *)boot_stack)[i] = va[i]; + ma[i] = va[i]; + } + + ma[2] = ((vm_paddr_t *)boot_stack)[2] = + xpmap_ptom(VM_PAGE_TO_PHYS(m[2]))|PG_V; + ma[3] = ((vm_paddr_t *)boot_stack)[3] = + xpmap_ptom(VM_PAGE_TO_PHYS(m[3]))|PG_V; + + /* + * Copy cpu0 IdlePTD to new IdlePTD - copying only + * kernel mappings + */ + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[3])); + memcpy((uint8_t *)boot_stack, (uint8_t *)PTOV(IdlePTD) + 3*PAGE_SIZE, + nkpt*sizeof(vm_paddr_t)); + + /* + * map actual idle stack to boot_stack + */ + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[0])); + + printf("pinning pgdpt=%llx\n", + xpmap_ptom(VM_PAGE_TO_PHYS(m[1]))); + + xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[1]))); + vm_page_lock_queues(); + for (i = 0; i < 4; i++) { + xen_queue_pt_update((vm_paddr_t) + ((ma[2] & ~PG_V) + + (PTDPTDI - 1024 + i)*sizeof(vm_paddr_t)), + ma[i]); + } + PT_UPDATES_FLUSH(); + vm_page_unlock_queues(); + + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); + ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.eip = (unsigned long)init_secondary; + ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ + + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + smp_trap_init(ctxt.trap_ctxt); + + ctxt.ldt_ents = 0; + ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); + ctxt.gdt_ents = 512; + +#ifdef __i386__ + ctxt.user_regs.esp = boot_stack + PAGE_SIZE; + + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = boot_stack + PAGE_SIZE; + + ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; + ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = +#if 1 + xpmap_ptom(VM_PAGE_TO_PHYS(m[1])); +#else + xpmap_ptom((unsigned long)IdlePDPT); +#endif +#else /* __x86_64__ */ + ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = idle->thread.rsp0; + + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); + + ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); +#endif + + printf("gdtpfn=%lx pdptpfn=%lx\n", + ctxt.gdt_frames[0], + ctxt.ctrlreg[3] >> PAGE_SHIFT); + + PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); + DELAY(3000); + PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); +} + +/* + * This function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It isn't pretty, + * but it seems to work. + */ +static int +start_ap(int apic_id) +{ + int ms; + int cpus; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_naps; + + cpu_initialize_context(apic_id); + + /* Wait up to 5 seconds for it to start. */ + for (ms = 0; ms < 5000; ms++) { + if (mp_naps > cpus) + return 1; /* return SUCCESS */ + DELAY(1000); + } + return 0; /* return FAILURE */ +} + +/* + * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_ipi_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_ipi_mtx); +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + mask &= ~PCPU_GET(cpumask); + if (mask == 0) + return; + ncpu = bitcount32(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_ipi_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_ipi_mtx); +} + +void +smp_cache_flush(void) +{ + + if (smp_started) + smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); +} + +void +smp_invltlb(void) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); + } +} + +void +smp_invlpg(vm_offset_t addr) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); + } +} + +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); + } +} + +void +smp_masked_invltlb(u_int mask) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); + } +} + +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); + } +} + +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); + } +} + +void +ipi_bitmap_handler(struct trapframe frame) +{ + int cpu = PCPU_GET(cpuid); + u_int ipi_bitmap; + + ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); + + if (ipi_bitmap & (1 << IPI_PREEMPT)) { + sched_preempt(curthread); + } +} + +/* + * send an IPI to a set of cpus. + */ +void +ipi_selected(u_int32_t cpus, u_int ipi) +{ + int cpu; + u_int bitmap = 0; + u_int old_pending; + u_int new_pending; + + if (IPI_IS_BITMAPED(ipi)) { + bitmap = 1 << ipi; + ipi = IPI_BITMAP_VECTOR; + } + +#ifdef STOP_NMI + if (ipi == IPI_STOP && stop_cpus_with_nmi) { + ipi_nmi_selected(cpus); + return; + } +#endif + CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); + while ((cpu = ffs(cpus)) != 0) { + cpu--; + cpus &= ~(1 << cpu); + + KASSERT(cpu_apic_ids[cpu] != -1, + ("IPI to non-existent CPU %d", cpu)); + + if (bitmap) { + do { + old_pending = cpu_ipi_pending[cpu]; + new_pending = old_pending | bitmap; + } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending)); + + if (old_pending) + continue; + } + + lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); + } + +} + +/* + * send an IPI INTerrupt containing 'vector' to all CPUs, including myself + */ +void +ipi_all(u_int ipi) +{ + + if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { + ipi_selected(all_cpus, ipi); + return; + } + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + + if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { + ipi_selected(PCPU_GET(other_cpus), ipi); + return; + } + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); +} + +/* + * send an IPI to myself + */ +void +ipi_self(u_int ipi) +{ + + if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { + ipi_selected(PCPU_GET(cpumask), ipi); + return; + } + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF); +} + +#ifdef STOP_NMI +/* + * send NMI IPI to selected CPUs + */ + +#define BEFORE_SPIN 1000000 + +void +ipi_nmi_selected(u_int32_t cpus) +{ + int cpu; + register_t icrlo; + + icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT + | APIC_TRIGMOD_EDGE; + + CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus); + + atomic_set_int(&ipi_nmi_pending, cpus); + + while ((cpu = ffs(cpus)) != 0) { + cpu--; + cpus &= ~(1 << cpu); + + KASSERT(cpu_apic_ids[cpu] != -1, + ("IPI NMI to non-existent CPU %d", cpu)); + + /* Wait for an earlier IPI to finish. */ + if (!lapic_ipi_wait(BEFORE_SPIN)) + panic("ipi_nmi_selected: previous IPI has not cleared"); + + lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]); + } +} + +int +ipi_nmi_handler(void) +{ + int cpumask = PCPU_GET(cpumask); + + if (!(ipi_nmi_pending & cpumask)) + return 1; + + atomic_clear_int(&ipi_nmi_pending, cpumask); + cpustop_handler(); + return 0; +} + +#endif /* STOP_NMI */ + +/* + * Handle an IPI_STOP by saving our current context and spinning until we + * are resumed. + */ +void +cpustop_handler(void) +{ + int cpu = PCPU_GET(cpuid); + int cpumask = PCPU_GET(cpumask); + + savectx(&stoppcbs[cpu]); + + /* Indicate that we are stopped */ + atomic_set_int(&stopped_cpus, cpumask); + + /* Wait for restart */ + while (!(started_cpus & cpumask)) + ia32_pause(); + + atomic_clear_int(&started_cpus, cpumask); + atomic_clear_int(&stopped_cpus, cpumask); + + if (cpu == 0 && cpustop_restartfunc != NULL) { + cpustop_restartfunc(); + cpustop_restartfunc = NULL; + } +} + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +static void +release_aps(void *dummy __unused) +{ + + if (mp_ncpus == 1) + return; + atomic_store_rel_int(&aps_ready, 1); + while (smp_started == 0) + ia32_pause(); +} +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); + diff --git a/sys/i386/xen/mptable.c b/sys/i386/xen/mptable.c new file mode 100644 index 0000000..99edc50 --- /dev/null +++ b/sys/i386/xen/mptable.c @@ -0,0 +1,130 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <machine/apicreg.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/md_var.h> +#include <machine/mptable.h> +#include <machine/specialreg.h> + +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/smp.h> +#include <xen/interface/vcpu.h> + + +static int mptable_probe(void); +static int mptable_probe_cpus(void); +static void mptable_register(void *dummy); +static int mptable_setup_local(void); +static int mptable_setup_io(void); + +static struct apic_enumerator mptable_enumerator = { + "MPTable", + mptable_probe, + mptable_probe_cpus, + mptable_setup_local, + mptable_setup_io +}; + +static int +mptable_probe(void) +{ + + return (-100); +} + +static int +mptable_probe_cpus(void) +{ + int i, rc; + + for (i = 0; i < MAXCPU; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_add(i, (i == 0)); + } + + return (0); +} + +/* + * Initialize the local APIC on the BSP. + */ +static int +mptable_setup_local(void) +{ + + return (0); +} + +static int +mptable_setup_io(void) +{ + + return (0); +} + +static void +mptable_register(void *dummy __unused) +{ + + apic_register_enumerator(&mptable_enumerator); +} +SYSINIT(mptable_register, SI_SUB_CPU - 1, SI_ORDER_FIRST, mptable_register, + NULL); + + + +int +mptable_pci_probe_table(int bus) +{ + + return (0); +} + +int +mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) +{ + + return (0); +} + diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c index 9199218..a9b37c3 100644 --- a/sys/i386/xen/pmap.c +++ b/sys/i386/xen/pmap.c @@ -211,7 +211,7 @@ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ -static int nkpt; +int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c index ee589cf..e1d618d 100644 --- a/sys/i386/xen/xen_machdep.c +++ b/sys/i386/xen/xen_machdep.c @@ -42,10 +42,8 @@ __FBSDID("$FreeBSD$"); #include <sys/reboot.h> #include <sys/sysproto.h> - #include <machine/xen/xen-os.h> - #include <vm/vm.h> #include <vm/pmap.h> #include <machine/segments.h> @@ -679,7 +677,7 @@ extern unsigned long *SMPpt; extern struct user *proc0uarea; extern vm_offset_t proc0kstack; extern int vm86paddr, vm86phystk; -char *bootmem_start, *bootmem_current, *bootmem_end; +char *bootmem_start, *bootmem_current, *bootmem_end; pteinfo_t *pteinfo_list; void initvalues(start_info_t *startinfo); @@ -813,9 +811,17 @@ initvalues(start_info_t *startinfo) vm_paddr_t pdir_shadow_ma; #endif unsigned long i; + int ncpus; +#ifdef SMP + ncpus = MAXCPU; +#else + ncpus = 1; +#endif +#if 0 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); +#endif HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); #ifdef notyet /* @@ -864,7 +870,7 @@ initvalues(start_info_t *startinfo) l1_pages = xen_start_info->nr_pt_frames - l2_pages - l3_pages; KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE; - + KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff)); XENPRINTF("IdlePTD %p\n", IdlePTD); XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " @@ -876,7 +882,7 @@ initvalues(start_info_t *startinfo) proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); printk("proc0kstack=%u\n", proc0kstack); - + /* vm86/bios stack */ cur_space += PAGE_SIZE; @@ -954,6 +960,7 @@ initvalues(start_info_t *startinfo) } xen_load_cr3(VTOP(IdlePDPTnew)); xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew))); + for (i = 0; i < 4; i++) { xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[2] + (PTDPTDI - 1024 + i)*sizeof(vm_paddr_t)), IdlePTDnewma[i] | PG_V); @@ -972,10 +979,13 @@ initvalues(start_info_t *startinfo) IdlePDPTma = IdlePDPTnewma; /* allocate page for gdt */ - gdt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; - /* allocate page for ldt */ - ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + gdt = (union descriptor *)cur_space; + cur_space += PAGE_SIZE*ncpus; + /* allocate page for ldt */ + ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + cur_space += PAGE_SIZE; + HYPERVISOR_shared_info = (shared_info_t *)cur_space; cur_space += PAGE_SIZE; @@ -1001,18 +1011,6 @@ initvalues(start_info_t *startinfo) printk("#5\n"); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = (unsigned long)xen_phys_machine; -#if 0 && defined(SMP) - for (i = 0; i < ncpus; i++) { - int j, npages = (sizeof(struct privatespace) + 1)/PAGE_SIZE; - - for (j = 0; j < npages; j++) { - vm_paddr_t ma = xpmap_ptom(cur_space); - cur_space += PAGE_SIZE; - PT_SET_VA_MA(SMPpt + i*npages + j, ma | PG_KERNEL, FALSE); - } - } - xen_flush_queue(); -#endif set_iopl.iopl = 1; PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl)); @@ -1280,7 +1278,6 @@ xen_suspend(void *ignore) vcpu_prepare(i); #endif - /* * Only resume xenbus /after/ we've prepared our VCPUs; otherwise * the VCPU hotplug callback can race with our vcpu_prepare |