| author | kmacy <kmacy@FreeBSD.org> | 2008-10-15 05:44:08 +0000 |
|---|---|---|
| committer | kmacy <kmacy@FreeBSD.org> | 2008-10-15 05:44:08 +0000 |
| commit | 1fcb50990700350ffe27fa30739c07de4ad5f715 (patch) | |
| tree | 2206b456cc7749b7b37c82a5eae330b6e66d52dd /sys | |
| parent | 8234483524c47dd64314d223337b39a0e81606b2 (diff) | |
| download | FreeBSD-src-1fcb50990700350ffe27fa30739c07de4ad5f715.zip FreeBSD-src-1fcb50990700350ffe27fa30739c07de4ad5f715.tar.gz | |
Add i386-specific Xen support
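The heart of the port is visible in the sys/i386/include/segments.h hunk below: a paravirtualized i386 guest is evicted from ring 0, so the patch redefines SEL_KPL as 1 under the XEN option and rewrites the GDT descriptors to match (the repeated `0` → `SEL_KPL` changes in machdep.c). A hedged illustration of what that selector change means, using the GSEL() selector composition from FreeBSD's segments.h:

```c
/*
 * Illustration only.  Under the XEN option the kernel runs in ring 1,
 * so SEL_KPL (the "kernel privilege level") becomes 1 and every kernel
 * selector built with GSEL() carries RPL 1; user selectors keep RPL 3.
 */
#ifdef XEN
#define SEL_KPL 1                       /* kernel demoted to ring 1 */
#else
#define SEL_KPL 0                       /* bare metal: ring 0 */
#endif
#define SEL_UPL 3                       /* user level is unchanged */

/* Compose a GDT selector from a table index and a requested RPL. */
#define GSEL(s, r)      (((s) << 3) | (r))

/* e.g. the kernel code selector the patch registers for Xen callbacks: */
/*      GSEL(GCODE_SEL, SEL_KPL)                                         */
```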
Diffstat (limited to 'sys')
27 files changed, 13135 insertions, 20 deletions
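Most of the new machinery lives in two headers added below: hypercall.h wraps each hypervisor entry point in an inline-assembly `_hypercallN()` macro, and hypervisor.h layers typed helpers on top of them. A minimal sketch of how kernel code then traps into Xen, using only pieces the patch itself provides (the `xc_puts()` helper name is hypothetical; `CONSOLEIO_write` comes from the Xen interface headers):

```c
/*
 * Hedged sketch of hypercall usage.  HYPERVISOR_console_io() is the
 * _hypercall3() wrapper added in sys/i386/include/xen/hypercall.h; it
 * traps into the hypervisor through the hypercall page.  The xc_puts()
 * name is hypothetical.
 */
#include <machine/xen/hypervisor.h>

static void
xc_puts(char *str, int len)
{
        /* Write bytes to the Xen emergency console. */
        (void)HYPERVISOR_console_io(CONSOLEIO_write, len, str);
}
```

The patch's own `HYPERVISOR_console_write()` in hypervisor.h is the same idea, wrapped once more for convenience.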
diff --git a/sys/i386/conf/DEFAULTS b/sys/i386/conf/DEFAULTS index 60cb04a..289a533 100644 --- a/sys/i386/conf/DEFAULTS +++ b/sys/i386/conf/DEFAULTS @@ -15,3 +15,5 @@ device npx # Pseudo devices. device mem # Memory and kernel memory devices device io # I/O device + +options NATIVE diff --git a/sys/i386/conf/XEN b/sys/i386/conf/XEN new file mode 100644 index 0000000..3c60994 --- /dev/null +++ b/sys/i386/conf/XEN @@ -0,0 +1,153 @@ +# +# GENERIC -- Generic kernel configuration file for FreeBSD/i386 +# +# For more information on this file, please read the handbook section on +# Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD$ + +machine i386 +cpu I686_CPU +ident XEN + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols +makeoptions MODULES_OVERRIDE="" + +#options SCHED_ULE # ULE scheduler +#options PREEMPTION # Enable kernel thread preemption +options SCHED_4BSD +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options MD_ROOT # MD is a potential root device +options NFSCLIENT # Network Filesystem Client +options NFSSERVER # Network Filesystem Server +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_LABEL # Provides labelization +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options AUDIT # Security event auditing + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS # Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# To make an SMP kernel, the next two lines are needed +#options SMP # Symmetric MultiProcessor Kernel +#device apic # I/O APIC +options PAE + + +# CPU frequency control +#device cpufreq # native only + +# Bus support. 
+#device pci + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device ch # SCSI media changers +device da # Direct Access (disks) +device sa # Sequential Access (tape etc) +device cd # CD +device pass # Passthrough device (direct SCSI access) +device ses # SCSI Environmental Services (and SAF-TE) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse +device kbdmux # keyboard multiplexer +#device vga # VGA video card driver +device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console + +#device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +#device apm +# Add suspend/resume support for the i8254. +#device pmtimer # native + +# Serial (COM) ports +device uart # Generic UART driver + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to sio, uart and/or ppc drivers): +#device puc + +# PCI Ethernet NICs. +device em # Intel PRO/1000 adapter Gigabit Ethernet Card + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! +device miibus # MII bus support + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device sl # Kernel SLIP +device ppp # Kernel PPP +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. 
+device bpf # Berkeley packet filter + + +options XEN +nooption NATIVE +nodevice atpic +options MCLSHIFT=12 + +nodevice isa +nooption ISAPNP + +options KTR +options KTR_COMPILE=(KTR_PMAP) +options KTR_CPUMASK=0xff +options KTR_ENTRIES=65536 +options KTR_MASK=(KTR_PMAP) diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index bb04704..66ffdef 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -227,3 +227,9 @@ ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); #endif + +#ifdef XEN +#include <machine/xen/hypervisor.h> +ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); +ASSYM(HYPERVISOR_VIRT_START, __HYPERVISOR_VIRT_START); +#endif diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 8e27e35..1c7510b 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -141,6 +141,25 @@ int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif +#ifdef XEN +/* XEN includes */ +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#include <machine/xen/xen_intr.h> + +void Xhypervisor_callback(void); +void failsafe_callback(void); + +int gdt_set; +extern trap_info_t trap_table[]; +struct proc_ldt default_proc_ldt; +extern int init_first; +int running_xen = 1; +extern unsigned long physfree; +#endif /* XEN */ + /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); @@ -282,8 +301,9 @@ cpu_startup(dummy) */ bufinit(); vm_pager_bufferinit(); - +#ifndef XEN cpu_setregs(); +#endif } /* @@ -1108,6 +1128,25 @@ cpu_est_clockrate(int cpu_id, uint64_t *rate) return (0); } +static int cpu_idle_hlt = 1; +SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, + &cpu_idle_hlt, 0, "Idle loop HLT enable"); +#ifdef XEN + +void +cpu_halt(void) +{ + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +} + +static void +cpu_idle_default(void) +{ + idle_block(); +} + +#else + /* * Shutdown the CPU as much as possible */ @@ -1133,9 +1172,6 @@ cpu_halt(void) * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. 
-Peter */ -static int cpu_idle_hlt = 1; -SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, - &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) @@ -1147,6 +1183,7 @@ cpu_idle_default(void) */ __asm __volatile("sti; hlt"); } +#endif /* !XEN */ /* * Note that we have to be careful here to avoid a race between checking @@ -1317,10 +1354,16 @@ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, */ int _default_ldt; + +#ifdef XEN +union descriptor *gdt; +union descriptor *ldt; +#else union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ +union descriptor ldt[NLDT]; /* local descriptor table */ +#endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ -union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ int private_tss; /* flag indicating private tss */ @@ -1355,7 +1398,7 @@ struct soft_segment_descriptor gdt_segs[] = { { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1382,7 +1425,7 @@ struct soft_segment_descriptor gdt_segs[] = { { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1391,7 +1434,7 @@ struct soft_segment_descriptor gdt_segs[] = { { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ @@ -1418,11 +1461,12 @@ struct soft_segment_descriptor gdt_segs[] = { { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ - 0, /* segment descriptor priority level */ + SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, +#ifndef XEN /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ @@ -1514,6 +1558,7 @@ struct soft_segment_descriptor gdt_segs[] = { 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, +#endif /* !XEN */ }; static struct soft_segment_descriptor ldt_segs[] = { @@ -1735,7 +1780,17 @@ getmemsize(int first) goto physmap_done; } #endif - +#ifdef XEN + has_smap = 0; + Maxmem = xen_start_info->nr_pages - init_first; + physmem = Maxmem; + basemem = 0; + physmap[0] = init_first << PAGE_SHIFT; + physmap[1] = ptoa(Maxmem) - round_page(MSGBUF_SIZE); + physmap_idx = 0; + goto physmap_done; +#endif + hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); bzero(&vmf, sizeof(vmf)); @@ -1898,7 +1953,7 @@ int15e820: vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; -#else +#elif !defined(XEN) /* * Prefer the RTC value for extended memory. 
*/ @@ -1988,7 +2043,7 @@ physmap_done: if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; - +#ifndef XEN /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. @@ -2106,7 +2161,10 @@ do_next: } *pte = 0; invltlb(); - +#else + phys_avail[0] = physfree; + phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; +#endif /* * XXX * The last chunk must contain at least one page plus the message @@ -2128,6 +2186,257 @@ do_next: avail_end = phys_avail[pa_indx]; } +#ifdef XEN + +#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) +void +init386(int first) +{ + int error, gsel_tss, metadata_missing, x; + unsigned long off, gdtmachpfn; + struct pcpu *pc; + struct callback_register event = { + .type = CALLBACKTYPE_event, + .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, + }; + struct callback_register failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, + }; + + thread0.td_kstack = proc0kstack; + thread0.td_pcb = (struct pcb *) + (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + + /* + * This may be done better later if it gets more high level + * components in it. If so just link td->td_proc here. + */ + proc_linkup(&proc0, &ksegrp0, &thread0); + + metadata_missing = 0; + if (xen_start_info->mod_start) { + preload_metadata = (caddr_t)xen_start_info->mod_start; + preload_bootstrap_relocate(KERNBASE); + } else { + metadata_missing = 1; + } + if (envmode == 1) + kern_envp = static_env; + else if ((caddr_t)xen_start_info->cmd_line) + kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); + + boothowto |= xen_boothowto(kern_envp); + + /* Init basic tunables, hz etc */ + init_param1(); + + /* + * XEN occupies a portion of the upper virtual address space + * At its base it manages an array mapping machine page frames + * to physical page frames - hence we need to be able to + * access 4GB - (64MB - 4MB + 64k) + */ + gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); + + pc = &__pcpu[0]; + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + + PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); + bzero(gdt, PAGE_SIZE); + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &gdt[x].sd); + + + printk("gdt=%p\n", gdt); + printk("PTmap=%p\n", PTmap); + printk("addr=%p\n", *vtopte((unsigned long)gdt) & ~PG_RW); + + gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; + PT_SET_MA(gdt, *vtopte((unsigned long)gdt) & ~(PG_RW|PG_M|PG_A)); + PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); + lgdt(&r_gdt /* unused */); + gdt_set = 1; + + if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { + panic("set_trap_table failed - error %d\n", error); + } + + error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (error == 0) + error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#if CONFIG_XEN_COMPAT 
<= 0x030002 + if (error == -ENOXENSYS) + HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), + (unsigned long)Xhypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); +#endif + pcpu_init(pc, 0, sizeof(struct pcpu)); + PCPU_SET(prvspace, pc); + PCPU_SET(curthread, &thread0); + PCPU_SET(curpcb, thread0.td_pcb); + PCPU_SET(pdir, (unsigned long)IdlePTD); + + /* + * Initialize mutexes. + * + * icu_lock: in order to allow an interrupt to occur in a critical + * section, to set pcpu->ipending (etc...) properly, we + * must be able to get the icu lock, so it can't be + * under witness. + */ + mutex_init(); + mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); + + /* make ldt memory segments */ + PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); + bzero(ldt, PAGE_SIZE); + ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); + ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); + for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) + ssdtosd(&ldt_segs[x], &ldt[x].sd); + + default_proc_ldt.ldt_base = (caddr_t)ldt; + default_proc_ldt.ldt_len = 6; + _default_ldt = (int)&default_proc_ldt; + PCPU_SET(currentldt, _default_ldt) + PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); + xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); + +#ifdef XBOX + /* + * The following code queries the PCI ID of 0:0:0. For the XBOX, + * This should be 0x10de / 0x02a5. + * + * This is exactly what Linux does. + */ + outl(0xcf8, 0x80000000); + if (inl(0xcfc) == 0x02a510de) { + arch_i386_is_xbox = 1; + pic16l_setled(XBOX_LED_GREEN); + + /* + * We are an XBOX, but we may have either 64MB or 128MB of + * memory. The PCI host bridge should be programmed for this, + * so we just query it. + */ + outl(0xcf8, 0x80000084); + arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; + } +#endif /* XBOX */ +#if defined (XEN_PRIVILEGED) + /* + * Initialize the i8254 before the console so that console + * initialization can use DELAY(). + */ + i8254_init(); +#endif + /* + * Initialize the console before we print anything out. + */ + cninit(); + + if (metadata_missing) + printf("WARNING: loader(8) metadata is missing!\n"); + +#ifdef DEV_ISA + if (xen_start_info->flags & SIF_PRIVILEGED) { + elcr_probe(); +#ifdef DEV_ATPIC + atpic_startup(); +#endif + } +#endif + +#ifdef DDB + ksym_start = bootinfo.bi_symtab; + ksym_end = bootinfo.bi_esymtab; +#endif + + kdb_init(); + +#ifdef KDB + if (boothowto & RB_KDB) + kdb_enter("Boot flags requested debugger"); +#endif + + finishidentcpu(); /* Final stage of CPU initialization */ + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + initializecpu(); /* Initialize CPU registers */ + + /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), + PCPU_GET(common_tss.tss_esp0)); + + + /* pointer to selector slot for %fs/%gs */ + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = + dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; + dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = + dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); +#ifdef PAE + dblfault_tss.tss_cr3 = (int)IdlePDPT; +#else + dblfault_tss.tss_cr3 = (int)IdlePTD; +#endif + dblfault_tss.tss_eip = (int)dblfault_handler; + dblfault_tss.tss_eflags = PSL_KERNEL; + dblfault_tss.tss_ds = dblfault_tss.tss_es = + dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + + vm86_initialize(); + getmemsize(first); + init_param2(physmem); + + + /* Map the message buffer. */ + for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) + pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); + + /* now running on new page tables, configured,and u/iom is accessible */ + + msgbufinit(msgbufp, MSGBUF_SIZE); + + /* transfer to user mode */ + + _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); + _udatasel = GSEL(GUDATA_SEL, SEL_UPL); + + /* setup proc 0's pcb */ + thread0.td_pcb->pcb_flags = 0; +#ifdef PAE + thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; +#else + thread0.td_pcb->pcb_cr3 = (int)IdlePTD; +#endif + thread0.td_pcb->pcb_ext = 0; + thread0.td_frame = &proc0_tf; + thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; + thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; +} + +#else void init386(first) int first; @@ -2389,6 +2698,7 @@ init386(first) thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } +#endif /* !XEN */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) diff --git a/sys/i386/include/asmacros.h b/sys/i386/include/asmacros.h index 9b77bd0..d2e10b8 100644 --- a/sys/i386/include/asmacros.h +++ b/sys/i386/include/asmacros.h @@ -134,6 +134,46 @@ #define MEXITCOUNT #endif /* GPROF */ +/* + * Setup the kernel segment registers. + */ +#define SET_KERNEL_SREGS \ + movl $KDSEL, %eax ; /* reload with kernel's data segment */ \ + movl %eax, %ds ; \ + movl %eax, %es ; \ + movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ + movl %eax, %fs + +#ifdef XEN +#define LOAD_CR3(reg) \ + movl reg,PCPU(CR3); \ + pushl %ecx ; \ + pushl %edx ; \ + pushl %esi ; \ + pushl reg ; \ + call xen_load_cr3 ; \ + addl $4,%esp ; \ + popl %esi ; \ + popl %edx ; \ + popl %ecx ; \ + +#define READ_CR3(reg) movl PCPU(CR3),reg; +#define LLDT(arg) \ + pushl %edx ; \ + pushl %eax ; \ + xorl %eax,%eax ; \ + movl %eax,%gs ; \ + call i386_reset_ldt ; \ + popl %eax ; \ + popl %edx +#define CLI call ni_cli +#else +#define LOAD_CR3(reg) movl reg,%cr3; +#define READ_CR3(reg) movl %cr3,reg; +#define LLDT(arg) lldt arg; +#define CLI cli +#endif /* !XEN */ + #ifdef LOCORE /* * Convenience macros for declaring interrupt entry points and trap @@ -145,4 +185,30 @@ #endif /* LOCORE */ +#ifdef __STDC__ +#define ELFNOTE(name, type, desctype, descdata...) 
\ +.pushsection .note.name ; \ + .align 4 ; \ + .long 2f - 1f /* namesz */ ; \ + .long 4f - 3f /* descsz */ ; \ + .long type ; \ +1:.asciz #name ; \ +2:.align 4 ; \ +3:desctype descdata ; \ +4:.align 4 ; \ +.popsection +#else /* !__STDC__, i.e. -traditional */ +#define ELFNOTE(name, type, desctype, descdata) \ +.pushsection .note.name ; \ + .align 4 ; \ + .long 2f - 1f /* namesz */ ; \ + .long 4f - 3f /* descsz */ ; \ + .long type ; \ +1:.asciz "name" ; \ +2:.align 4 ; \ +3:desctype descdata ; \ +4:.align 4 ; \ +.popsection +#endif /* __STDC__ */ + #endif /* !_MACHINE_ASMACROS_H_ */ diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h index 6312b97..26902ea 100644 --- a/sys/i386/include/pcpu.h +++ b/sys/i386/include/pcpu.h @@ -45,6 +45,28 @@ * to each CPU's data can be set up for things like "check curproc on all * other processors" */ + +#ifdef XEN +#define PCPU_MD_FIELDS \ + struct pcpu *pc_prvspace; /* Self-reference */ \ + struct pmap *pc_curpmap; \ + struct i386tss pc_common_tss; \ + struct segment_descriptor pc_common_tssd; \ + struct segment_descriptor *pc_tss_gdt; \ + struct segment_descriptor *pc_fsgs_gdt; \ + vm_paddr_t *pc_pdir_shadow; \ + int pc_currentldt; \ + u_int pc_acpi_id; /* ACPI CPU id */ \ + u_int pc_apic_id; \ + int pc_private_tss; /* Flag indicating private tss*/\ + u_int pc_cr3; /* track cr3 for R1/R3*/ \ + u_int pc_pdir; \ + u_int pc_lazypmap; \ + u_int pc_rendezvous; \ + u_int pc_cpuast + + +#else #define PCPU_MD_FIELDS \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ @@ -55,7 +77,7 @@ int pc_currentldt; \ u_int pc_acpi_id; \ u_int pc_apic_id - +#endif #if defined(lint) extern struct pcpu *pcpup; diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 922ebc3..3c5a711 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -68,7 +68,14 @@ /* Our various interpretations of the above */ #define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ #define PG_MANAGED PG_AVAIL2 + +#ifdef PAE +#define PG_FRAME (0x000ffffffffff000ull) +#define PG_PS_FRAME (0x000fffffffe00000ull) +#else #define PG_FRAME (~((vm_paddr_t)PAGE_MASK)) +#define PG_PS_FRAME (0xffc00000) +#endif #define PG_PROT (PG_RW|PG_U) /* all protection bits . */ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ @@ -175,6 +182,77 @@ extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ * the corresponding pde that in turn maps it. 
*/ #define vtopte(va) (PTmap + i386_btop(va)) +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) + +#ifdef XEN +#include <machine/xen/xen-os.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenpmap.h> +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#define PG_KERNEL (PG_V | PG_A | PG_RW | PG_M) + +#define MACH_TO_VM_PAGE(ma) PHYS_TO_VM_PAGE(xpmap_mtop((ma))) +#define VM_PAGE_TO_MACH(m) xpmap_ptom(VM_PAGE_TO_PHYS((m))) + +static __inline vm_paddr_t +pmap_kextract_ma(vm_offset_t va) +{ + vm_paddr_t ma; + if ((ma = PTD[va >> PDRSHIFT]) & PG_PS) { + ma = (ma & ~(NBPDR - 1)) | (va & (NBPDR - 1)); + } else { + ma = (*vtopte(va) & PG_FRAME) | (va & PAGE_MASK); + } + return ma; +} + +static __inline vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + return xpmap_mtop(pmap_kextract_ma(va)); +} +#define vtomach(va) pmap_kextract_ma(((vm_offset_t) (va))) + +vm_paddr_t pmap_extract_ma(struct pmap *pmap, vm_offset_t va); + +void pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa); +void pmap_map_readonly(struct pmap *pmap, vm_offset_t va, int len); +void pmap_map_readwrite(struct pmap *pmap, vm_offset_t va, int len); + +static __inline pt_entry_t +pte_load_store(pt_entry_t *ptep, pt_entry_t v) +{ + pt_entry_t r; + + v = xpmap_ptom(v); + r = *ptep; + PT_SET_VA(ptep, v, TRUE); + return (r); +} + +static __inline pt_entry_t +pte_load_store_ma(pt_entry_t *ptep, pt_entry_t v) +{ + pt_entry_t r; + + r = *ptep; + PT_SET_VA_MA(ptep, v, TRUE); + return (r); +} + +#define pte_load_clear(ptep) pte_load_store((ptep), (pt_entry_t)0ULL) + +#define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) +#define pte_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) +#define pde_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) + +#elif !defined(XEN) /* * Routine: pmap_kextract @@ -195,10 +273,9 @@ pmap_kextract(vm_offset_t va) } return pa; } +#endif -#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) - -#ifdef PAE +#if defined(PAE) && !defined(XEN) static __inline pt_entry_t pte_load(pt_entry_t *ptep) @@ -231,7 +308,7 @@ pte_load_store(pt_entry_t *ptep, pt_entry_t v) #define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) -#else /* PAE */ +#elif !defined (PAE) && !defined(XEN) static __inline pt_entry_t pte_load(pt_entry_t *ptep) diff --git a/sys/i386/include/segments.h b/sys/i386/include/segments.h index 351ff5d..7a9f6d6 100644 --- a/sys/i386/include/segments.h +++ b/sys/i386/include/segments.h @@ -47,7 +47,11 @@ */ #define ISPL(s) ((s)&3) /* what is the priority level of a selector */ +#ifndef XEN #define SEL_KPL 0 /* kernel priority level */ +#else +#define SEL_KPL 1 /* kernel priority level */ +#endif #define SEL_UPL 3 /* user priority level */ #define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ #define SEL_LDT 4 /* local descriptor table */ @@ -222,8 +226,11 @@ struct region_descriptor { #define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ #define GNDIS_SEL 18 /* For the NDIS layer */ +#ifndef XEN #define NGDT 19 - +#else +#define NGDT 9 +#endif /* * Entries in the Local Descriptor Table (LDT) */ @@ -240,10 +247,16 @@ struct region_descriptor { #ifdef _KERNEL extern int _default_ldt; +#ifndef XEN +extern union descriptor ldt[NLDT]; extern union descriptor gdt[]; +#else +extern union descriptor *ldt; +extern union descriptor *gdt; +#endif + extern struct soft_segment_descriptor gdt_segs[]; extern struct gate_descriptor *idt; -extern union descriptor ldt[NLDT]; extern struct region_descriptor r_gdt, r_idt; void 
lgdt(struct region_descriptor *rdp); diff --git a/sys/i386/include/xen/evtchn.h b/sys/i386/include/xen/evtchn.h new file mode 100644 index 0000000..3036f5b --- /dev/null +++ b/sys/i386/include/xen/evtchn.h @@ -0,0 +1,81 @@ +/****************************************************************************** + * evtchn.h + * + * Communication via Xen event channels. + * Also definitions for the device that demuxes notifications to userspace. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __ASM_EVTCHN_H__ +#define __ASM_EVTCHN_H__ +#include <machine/pcpu.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/synch_bitops.h> +#include <machine/frame.h> + +/* + * LOW-LEVEL DEFINITIONS + */ + +/* + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently dropped. + */ +void notify_remote_via_irq(int irq); + + +/* Entry point for notifications into Linux subsystems. */ +void evtchn_do_upcall(struct intrframe *frame); + +/* Entry point for notifications into the userland character device. */ +void evtchn_device_upcall(int port); + +void mask_evtchn(int port); + +void unmask_evtchn(int port); + + + +static inline void +clear_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void +notify_remote_via_evtchn(int port) +{ + struct evtchn_send send = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); +} + +/* + * Use these to access the event channel underlying the IRQ handle returned + * by bind_*_to_irqhandler(). + */ +int irq_to_evtchn_port(int irq); + +void ipi_pcpu(unsigned int cpu, int vector); + +/* + * CHARACTER-DEVICE DEFINITIONS + */ + +#define PORT_NORMAL 0x0000 +#define PORT_EXCEPTION 0x8000 +#define PORTIDX_MASK 0x7fff + +/* /dev/xen/evtchn resides at device number major=10, minor=200 */ +#define EVTCHN_MINOR 200 + +/* /dev/xen/evtchn ioctls: */ +/* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */ +#define EVTCHN_RESET _IO('E', 1) +/* EVTCHN_BIND: Bind to the specified event-channel port. */ +#define EVTCHN_BIND _IO('E', 2) +/* EVTCHN_UNBIND: Unbind from the specified event-channel port. */ +#define EVTCHN_UNBIND _IO('E', 3) + +#endif /* __ASM_EVTCHN_H__ */ diff --git a/sys/i386/include/xen/features.h b/sys/i386/include/xen/features.h new file mode 100644 index 0000000..b4cce2f --- /dev/null +++ b/sys/i386/include/xen/features.h @@ -0,0 +1,20 @@ +/****************************************************************************** + * features.h + * + * Query the features reported by Xen. + * + * Copyright (c) 2006, Ian Campbell + */ + +#ifndef __ASM_XEN_FEATURES_H__ +#define __ASM_XEN_FEATURES_H__ + +#include <xen/interface/version.h> + +extern void setup_xen_features(void); + +extern uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; + +#define xen_feature(flag) (xen_features[flag]) + +#endif /* __ASM_XEN_FEATURES_H__ */ diff --git a/sys/i386/include/xen/hypercall.h b/sys/i386/include/xen/hypercall.h new file mode 100644 index 0000000..b33626f --- /dev/null +++ b/sys/i386/include/xen/hypercall.h @@ -0,0 +1,405 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. 
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#include <sys/systm.h> +#include <xen/interface/xen.h> +#include <xen/interface/sched.h> + +#define __STR(x) #x +#define STR(x) __STR(x) +#define ENOXENSYS 38 +#define CONFIG_XEN_COMPAT 0x030002 + + +#if defined(XEN) +#define HYPERCALL_STR(name) \ + "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)" +#else +#define HYPERCALL_STR(name) \ + "mov hypercall_stubs,%%eax; " \ + "add $("STR(__HYPERVISOR_##name)" * 32),%%eax; " \ + "call *%%eax" +#endif + +#define _hypercall0(type, name) \ +({ \ + long __res; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res) \ + : \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + long __res, __ign1; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1) \ + : "1" ((long)(a1)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + long __res, __ign1, __ign2; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ + : "1" ((long)(a1)), "2" ((long)(a2)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)), \ + "5" ((long)(a5)) \ + : "memory" ); \ + (type)__res; \ +}) + +static inline int +HYPERVISOR_set_trap_table( + 
trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int +HYPERVISOR_mmu_update( + mmu_update_t *req, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int +HYPERVISOR_mmuext_op( + mmuext_op_t *op, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int +HYPERVISOR_set_gdt( + unsigned long *frame_list, int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} + +static inline int +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline long +HYPERVISOR_set_timer_op( + uint64_t timeout) +{ + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); +} +#if 0 +static inline int +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} +#endif +static inline int +HYPERVISOR_set_debugreg( + int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long +HYPERVISOR_get_debugreg( + int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int +HYPERVISOR_update_descriptor( + uint64_t ma, uint64_t desc) +{ + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); +} + +static inline int +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int +HYPERVISOR_multicall( + void *call_list, int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int +HYPERVISOR_update_va_mapping( + unsigned long va, uint64_t new_val, unsigned long flags) +{ + uint32_t hi, lo; + + lo = (uint32_t)(new_val & 0xffffffff); + hi = (uint32_t)(new_val >> 32); + + return _hypercall4(int, update_va_mapping, va, + lo, hi, flags); +} + +static inline int +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (__predict_false(rc == -ENOXENSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + return (rc); +} + +static inline int +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int +HYPERVISOR_console_io( + int cmd, int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int 
+HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (__predict_false(rc == -ENOXENSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + return (rc); +} + +static inline int +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid) +{ + uint32_t hi, lo; + + lo = (uint32_t)(new_val & 0xffffffff); + hi = (uint32_t)(new_val >> 32); + + return _hypercall5(int, update_va_mapping_otherdomain, va, + lo, hi, flags, domid); +} + +static inline int +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int +HYPERVISOR_vcpu_op( + int cmd, int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + return (rc); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +static inline int +HYPERVISOR_callback_op( + int cmd, void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +#ifndef CONFIG_XEN +static inline unsigned long +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} +#endif /* __HYPERCALL_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/sys/i386/include/xen/hypervisor.h b/sys/i386/include/xen/hypervisor.h new file mode 100644 index 0000000..ca0c180 --- /dev/null +++ b/sys/i386/include/xen/hypervisor.h @@ -0,0 +1,144 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. 
+ * + * Copyright (c) 2002, K A Fraser + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#define is_running_on_xen() 1 + +#ifdef PAE +#ifndef CONFIG_X86_PAE +#define CONFIG_X86_PAE +#endif +#endif + +#include <sys/cdefs.h> +#include <machine/xen/xen-os.h> +#include <sys/systm.h> +#include <xen/interface/xen.h> +#include <xen/interface/platform.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/callback.h> +#include <machine/xen/hypercall.h> + +#if defined(__amd64__) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) +#else +#define is_initial_xendomain() 0 +#endif + +extern start_info_t *xen_start_info; + +extern uint64_t get_system_time(int ticks); + +static inline int +HYPERVISOR_console_write(char *str, int count) +{ + return HYPERVISOR_console_io(CONSOLEIO_write, count, str); +} + +static inline void HYPERVISOR_crash(void) __dead2; + +static inline int +HYPERVISOR_yield(void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + return (rc); +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + return (rc); +} + + +static inline void +HYPERVISOR_shutdown(unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown); +#if CONFIG_XEN_COMPAT <= 0x030002 + HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason); +#endif +} + +static inline void +HYPERVISOR_crash(void) +{ + HYPERVISOR_shutdown(SHUTDOWN_crash); + /* NEVER REACHED */ + for (;;) ; /* eliminate noreturn error */ +} + +/* Transfer control to hypervisor until an event is detected on one */ +/* of the specified ports or the specified number of ticks elapse */ +static inline int +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, int ticks) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = get_system_time(ticks) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + return (rc); +} + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + uint64_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(__amd64__) + mcl->args[1] = new_val.pte; +#elif defined(PAE) + mcl->args[1] = (uint32_t)(new_val & 0xffffffff) ; + mcl->args[2] = (uint32_t)(new_val >> 32); +#else + mcl->args[1] = new_val; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +#endif /* __HYPERVISOR_H__ */ diff --git a/sys/i386/include/xen/synch_bitops.h b/sys/i386/include/xen/synch_bitops.h new file mode 100644 index 0000000..1b61542 --- /dev/null +++ b/sys/i386/include/xen/synch_bitops.h @@ -0,0 +1,139 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. 
+ * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. + */ + + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((volatile struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int synch_var_test_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); + return oldbit; +} + +#define synch_test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? 
\ + synch_const_test_bit((nr),(addr)) : \ + synch_var_test_bit((nr),(addr))) + +#endif /* __XEN_SYNCH_BITOPS_H__ */ diff --git a/sys/i386/include/xen/xen-os.h b/sys/i386/include/xen/xen-os.h new file mode 100644 index 0000000..ab8e480 --- /dev/null +++ b/sys/i386/include/xen/xen-os.h @@ -0,0 +1,385 @@ +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _XEN_OS_H_ +#define _XEN_OS_H_ +#include <machine/param.h> +#ifdef PAE +#define CONFIG_X86_PAE +#endif + +#if defined(XEN) && !defined(__XEN_INTERFACE_VERSION__) +/* + * Can update to a more recent version when we implement + * the hypercall page + */ +#define __XEN_INTERFACE_VERSION__ 0x00030204 +#endif + +#include <xen/interface/xen.h> + +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +#ifndef vtophys +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#endif + +#ifdef SMP +#include <sys/time.h> /* XXX for pcpu.h */ +#include <sys/pcpu.h> /* XXX for PCPU_GET */ +extern int gdt_set; +static inline int +smp_processor_id(void) +{ + if (likely(gdt_set)) + return PCPU_GET(cpuid); + return 0; +} + +#else +#define smp_processor_id() 0 +#endif + +#ifndef NULL +#define NULL (void *)0 +#endif + +#ifndef PANIC_IF +#define PANIC_IF(exp) if (unlikely(exp)) {printk("panic - %s: %s:%d\n",#exp, __FILE__, __LINE__); panic("%s: %s:%d", #exp, __FILE__, __LINE__);} +#endif + +extern shared_info_t *HYPERVISOR_shared_info; + +/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented + a mechanism by which the user can annotate likely branch directions and + expect the blocks to be reordered appropriately. Define __builtin_expect + to nothing for earlier compilers. */ + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__ ( "rep;nop" : : : "memory" ); +} +#define cpu_relax() rep_nop() + + +#if __GNUC__ == 2 && __GNUC_MINOR__ < 96 +#define __builtin_expect(x, expected_value) (x) +#endif + +#define DEFINE_PER_CPU(type, name) \ + __typeof__(type) per_cpu__##name + +#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) + +/* crude memory allocator for memory allocation early in + * boot + */ +void *bootmem_alloc(unsigned int size); +void bootmem_free(void *ptr, unsigned int size); + + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef __ASSEMBLY__ +#include <sys/types.h> + +void printk(const char *fmt, ...); + +/* some function prototypes */ +void trap_init(void); + +extern int preemptable; +#define preempt_disable() (preemptable = 0) +#define preempt_enable() (preemptable = 1) +#define preempt_enable_no_resched() (preemptable = 1) + + +/* + * STI/CLI equivalents. These basically set and clear the virtual + * event_enable flag in teh shared_info structure. Note that when + * the enable bit is set, there may be pending events to be handled. + * We may therefore call into do_hypervisor_callback() directly. 
+ */ +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + + + +#define __cli() \ +do { \ + vcpu_info_t *_vcpu; \ + preempt_disable(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu->evtchn_upcall_mask = 1; \ + preempt_enable_no_resched(); \ + barrier(); \ +} while (0) + +#define __sti() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + preempt_disable(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + preempt_enable(); \ +} while (0) + + +#define __save_flags(x) \ +do { \ + vcpu_info_t *vcpu; \ + vcpu = HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + (x) = _vcpu->evtchn_upcall_mask; \ +} while (0) + +#define __restore_flags(x) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + preempt_disable(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + preempt_enable(); \ + } else \ + preempt_enable_no_resched(); \ +} while (0) + +/* + * Add critical_{enter, exit}? + * + */ +#define __save_and_cli(x) \ +do { \ + vcpu_info_t *_vcpu; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + (x) = _vcpu->evtchn_upcall_mask; \ + _vcpu->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) __save_and_cli(x) + +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#define local_irq_disable() __cli() +#define local_irq_enable() __sti() + +#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));} +#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); } +#define spin_lock_irqsave mtx_lock_irqsave +#define spin_unlock_irqrestore mtx_unlock_irqrestore + + +#ifndef mb +#define mb() __asm__ __volatile__("lock; addl $0, 0(%%esp)": : :"memory") +#endif +#ifndef rmb +#define rmb() mb() +#endif +#ifndef wmb +#define wmb() barrier() +#endif +#ifdef SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while(0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + + +/* This is a barrier for the compiler only, NOT the processor! */ +#define barrier() __asm__ __volatile__("": : :"memory") + +#define LOCK_PREFIX "" +#define LOCK "" +#define ADDR (*(volatile long *) addr) +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. 
+ */ +typedef struct { volatile int counter; } atomic_t; + + + +#define xen_xchg(ptr,v) \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +struct __xchg_dummy { unsigned long a[100]; }; +#define __xg(x) ((volatile struct __xchg_dummy *)(x)) +static __inline unsigned long __xchg(unsigned long x, volatile void * ptr, + int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static __inline int test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __inline int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline int variable_test_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( + "btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btsl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + LOCK "incl %0" + :"=m" (v->counter) + :"m" (v->counter)); +} + + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + + + +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. 
+ */
+#define IS_ERR_VALUE(x) unlikely((x) > (unsigned long)-1000L)
+
+static inline void *ERR_PTR(long error)
+{
+	return (void *) error;
+}
+
+static inline long PTR_ERR(const void *ptr)
+{
+	return (long) ptr;
+}
+
+static inline long IS_ERR(const void *ptr)
+{
+	return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _OS_H_ */
diff --git a/sys/i386/include/xen/xen_intr.h b/sys/i386/include/xen/xen_intr.h
new file mode 100644
index 0000000..18fd8b8
--- /dev/null
+++ b/sys/i386/include/xen/xen_intr.h
@@ -0,0 +1,67 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*- */
+#ifndef _XEN_INTR_H_
+#define _XEN_INTR_H_
+
+/*
+* The flat IRQ space is divided into two regions:
+*  1. A one-to-one mapping of real physical IRQs. This space is only used
+*     if we have physical device-access privilege. This region is at the
+*     start of the IRQ space so that existing device drivers do not need
+*     to be modified to translate physical IRQ numbers into our IRQ space.
+*  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+*     are bound using the provided bind/unbind functions.
+*/
+
+#define PIRQ_BASE   0
+#define NR_PIRQS  128
+
+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
+#define NR_DYNIRQS  128
+
+#define NR_IRQS   (NR_PIRQS + NR_DYNIRQS)
+
+#define pirq_to_irq(_x)   ((_x) + PIRQ_BASE)
+#define irq_to_pirq(_x)   ((_x) - PIRQ_BASE)
+
+#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
+#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
+
+/* Dynamic binding of event channels and VIRQ sources to FreeBSD IRQ space. */
+extern void unbind_from_irq(int irq);
+
+extern void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu);
+extern int bind_caller_port_to_irqhandler(unsigned int caller_port,
+	const char *devname, driver_intr_t handler, void *arg,
+	unsigned long irqflags, void **cookiep);
+extern int bind_listening_port_to_irqhandler(unsigned int remote_domain,
+	const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags,
+	void **cookiep);
+extern int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, const char *devname,
+	driver_intr_t handler, unsigned long irqflags);
+extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu, const char *devname,
+	driver_intr_t handler, unsigned long irqflags);
+extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+	unsigned int remote_port,
+	const char *devname,
+	driver_intr_t handler,
+	unsigned long irqflags);
+
+
+
+extern void unbind_from_irqhandler(unsigned int evtchn, void *dev_id);
+static __inline__ int irq_cannonicalize(int irq)
+{
+	return (irq == 2) ? 9 : irq;
+}
+
+extern void disable_irq(unsigned int);
+extern void disable_irq_nosync(unsigned int);
+extern void enable_irq(unsigned int);
+
+extern void irq_suspend(void);
+extern void irq_resume(void);
+
+extern void idle_block(void);
+
+
+#endif /* _XEN_INTR_H_ */
diff --git a/sys/i386/include/xen/xenbus.h b/sys/i386/include/xen/xenbus.h
new file mode 100644
index 0000000..10d92ad
--- /dev/null
+++ b/sys/i386/include/xen/xenbus.h
@@ -0,0 +1,285 @@
+/******************************************************************************
+ * xenbus.h
+ *
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _ASM_XEN_XENBUS_H +#define _ASM_XEN_XENBUS_H + +#include <sys/queue.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <xen/interface/io/xenbus.h> +#include <xen/interface/io/xs_wire.h> + +LIST_HEAD(xendev_list_head, xenbus_device); + +/* Register callback to watch this node. */ +struct xenbus_watch +{ + LIST_ENTRY(xenbus_watch) list; + + /* Path being watched. */ + char *node; + + /* Callback (executed in a process context with no locks held). */ + void (*callback)(struct xenbus_watch *, + const char **vec, unsigned int len); +}; + + +/* A xenbus device. */ +struct xenbus_device { + struct xenbus_watch otherend_watch; /* must be first */ + const char *devicetype; + const char *nodename; + const char *otherend; + int otherend_id; + struct xendev_list_head *bus; + struct xenbus_driver *driver; + int has_error; + enum xenbus_state state; + void *dev_driver_data; + LIST_ENTRY(xenbus_device) list; +}; + +static inline struct xenbus_device *to_xenbus_device(device_t dev) +{ + return device_get_softc(dev); +} + +struct xenbus_device_id +{ + /* .../device/<device_type>/<identifier> */ + char devicetype[32]; /* General class of device. */ +}; + +/* A xenbus driver. 
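+ *
+ * A minimal registration sketch (hypothetical "vif" frontend; the
+ * names below are illustrative, not part of this header):
+ *
+ *	static const struct xenbus_device_id vif_ids[] = {
+ *		{ "vif" }, { "" },
+ *	};
+ *	static struct xenbus_driver vif_driver = {
+ *		.name  = "vif",
+ *		.ids   = vif_ids,
+ *		.probe = vif_probe,
+ *	};
+ *	...
+ *	xenbus_register_frontend(&vif_driver);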
 */
+struct xenbus_driver {
+	char *name;
+	struct module *owner;
+	const struct xenbus_device_id *ids;
+	int  (*probe)(struct xenbus_device *dev,
+		      const struct xenbus_device_id *id);
+	void (*otherend_changed)(struct xenbus_device *dev,
+				 XenbusState backend_state);
+	int  (*remove)(struct xenbus_device *dev);
+	int  (*suspend)(struct xenbus_device *dev);
+	int  (*resume)(struct xenbus_device *dev);
+	int  (*hotplug)(struct xenbus_device *, char **, int, char *, int);
+#if 0
+	struct device_driver driver;
+#endif
+	driver_t driver;
+	int (*read_otherend_details)(struct xenbus_device *dev);
+	int (*watch_otherend)(struct xenbus_device *dev);
+	int (*cleanup_device)(struct xenbus_device *dev);
+	int (*is_ready)(struct xenbus_device *dev);
+	LIST_ENTRY(xenbus_driver) list;
+};
+
+static inline struct xenbus_driver *to_xenbus_driver(driver_t *drv)
+{
+#if 0
+	return container_of(drv, struct xenbus_driver, driver);
+#endif
+	return NULL;
+}
+typedef int (*xenstore_event_handler_t)(void *);
+
+int xenbus_register_frontend(struct xenbus_driver *drv);
+int xenbus_register_backend(struct xenbus_driver *drv);
+void xenbus_unregister_driver(struct xenbus_driver *drv);
+
+int xenbus_remove_device(struct xenbus_device *dev);
+
+struct xenbus_transaction
+{
+	uint32_t id;
+};
+
+#define XBT_NIL ((struct xenbus_transaction) { 0 })
+
+char **xenbus_directory(struct xenbus_transaction t,
+			const char *dir, const char *node, unsigned int *num);
+void *xenbus_read(struct xenbus_transaction t,
+		  const char *dir, const char *node, unsigned int *len);
+int xenbus_write(struct xenbus_transaction t,
+		 const char *dir, const char *node, const char *string);
+int xenbus_mkdir(struct xenbus_transaction t,
+		 const char *dir, const char *node);
+int xenbus_exists(struct xenbus_transaction t,
+		  const char *dir, const char *node);
+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
+int xenbus_transaction_start(struct xenbus_transaction *t);
+int xenbus_transaction_end(struct xenbus_transaction t, int abort);
+
+/* Single read and scanf: returns -errno or num scanned if > 0. */
+int xenbus_scanf(struct xenbus_transaction t,
+		 const char *dir, const char *node, const char *fmt, ...)
+	__attribute__((format(scanf, 4, 5)));
+
+/* Single printf and write: returns -errno or 0. */
+int xenbus_printf(struct xenbus_transaction t,
+		  const char *dir, const char *node, const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
+
+/* Generic read function: NULL-terminated triples of name,
+ * sprintf-style type string, and pointer. Returns 0 or errno. */
+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
+
+/* Notifier routines for when the xenstore comes up. */
+int register_xenstore_notifier(xenstore_event_handler_t func, void *arg, int priority);
+#if 0
+void unregister_xenstore_notifier();
+#endif
+int register_xenbus_watch(struct xenbus_watch *watch);
+void unregister_xenbus_watch(struct xenbus_watch *watch);
+void xs_suspend(void);
+void xs_resume(void);
+
+/* Used by xenbus_dev to borrow kernel's store connection. */
+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
+
+/* Called from xen core code.
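+ * Drivers normally do not call these directly; they reach the store
+ * through the transaction API above.  A minimal sketch (hypothetical
+ * frontend writing one key, error handling abbreviated):
+ *
+ *	struct xenbus_transaction xbt;
+ *	if (xenbus_transaction_start(&xbt) == 0) {
+ *		xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ref);
+ *		xenbus_transaction_end(xbt, 0);
+ *	}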
 */
+void xenbus_suspend(void);
+void xenbus_resume(void);
+
+#define XENBUS_IS_ERR_READ(str) ({			\
+	if (!IS_ERR(str) && strlen(str) == 0) {		\
+		free(str, M_DEVBUF);			\
+		str = ERR_PTR(-ERANGE);			\
+	}						\
+	IS_ERR(str);					\
+})
+
+#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
+
+
+/**
+ * Register a watch on the given path, using the given xenbus_watch structure
+ * for storage, and the given callback function as the callback.  Return 0 on
+ * success, or -errno on error.  On success, the given path will be saved as
+ * watch->node, and remains the caller's to free.  On error, watch->node will
+ * be NULL, the device will switch to XenbusStateClosing, and the error will
+ * be saved in the store.
+ */
+int xenbus_watch_path(struct xenbus_device *dev, char *path,
+		      struct xenbus_watch *watch,
+		      void (*callback)(struct xenbus_watch *,
+				       const char **, unsigned int));
+
+
+/**
+ * Register a watch on the given path/path2, using the given xenbus_watch
+ * structure for storage, and the given callback function as the callback.
+ * Return 0 on success, or -errno on error.  On success, the watched path
+ * (path/path2) will be saved as watch->node, and becomes the caller's to
+ * kfree().  On error, watch->node will be NULL, so the caller has nothing to
+ * free, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+		       const char *path2, struct xenbus_watch *watch,
+		       void (*callback)(struct xenbus_watch *,
+					const char **, unsigned int));
+
+
+/**
+ * Advertise in the store a change of the given driver to the given new_state,
+ * performed inside its own transaction.  Return 0 on success, or -errno on
+ * error.  On error, the device will switch to XenbusStateClosing, and the
+ * error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev,
+			XenbusState new_state);
+
+
+/**
+ * Grant access to the given ring_mfn to the peer of the given device.  Return
+ * 0 on success, or -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
+
+
+/**
+ * Allocate an event channel for the given xenbus_device, assigning the newly
+ * created local port to *port.  Return 0 on success, or -errno on error.  On
+ * error, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
+
+
+/**
+ * Free an existing event channel. Returns 0 on success or -errno on error.
+ */
+int xenbus_free_evtchn(struct xenbus_device *dev, int port);
+
+
+/**
+ * Return the state of the driver rooted at the given store path, or
+ * XenbusStateClosed if no state can be read.
+ */
+XenbusState xenbus_read_driver_state(const char *path);
+
+
+/***
+ * Report the given negative errno into the store, along with the given
+ * formatted message.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+		      ...);
+
+
+/***
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
+ * closedown of this driver and its peer.
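+ *
+ * Usage sketch (hypothetical setup failure in a frontend):
+ *
+ *	err = xenbus_grant_ring(dev, ring_mfn);
+ *	if (err)
+ *		xenbus_dev_fatal(dev, err, "granting access to ring page");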
+ */ +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, + ...); + +int xenbus_dev_init(void); + +const char *xenbus_strstate(enum xenbus_state state); +int xenbus_dev_is_online(struct xenbus_device *dev); +int xenbus_frontend_closed(struct xenbus_device *dev); + +#endif /* _ASM_XEN_XENBUS_H */ + +/* + * Local variables: + * c-file-style: "bsd" + * indent-tabs-mode: t + * c-indent-level: 4 + * c-basic-offset: 8 + * tab-width: 4 + * End: + */ diff --git a/sys/i386/include/xen/xenfunc.h b/sys/i386/include/xen/xenfunc.h new file mode 100644 index 0000000..fe40792 --- /dev/null +++ b/sys/i386/include/xen/xenfunc.h @@ -0,0 +1,81 @@ +/* + * + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xenpmap.h> +#include <machine/segments.h> +#include <sys/pcpu.h> +#define BKPT __asm__("int3"); +#define XPQ_CALL_DEPTH 5 +#define XPQ_CALL_COUNT 2 +#define PG_PRIV PG_AVAIL3 +typedef struct { + unsigned long pt_ref; + unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH]; +} pteinfo_t; + +extern pteinfo_t *pteinfo_list; +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +char *xen_setbootenv(char *cmd_line); + +int xen_boothowto(char *envp); + +void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line); + +#ifdef INVARIANTS +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), __FILE__, __LINE__) +#else +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0) +#endif + +void xen_update_descriptor(union descriptor *, union descriptor *); + +void ap_cpu_initclocks(void); + +extern struct mtx balloon_lock; +#if 0 +#define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags) +#define balloon_unlock(__flags) mtx_unlock_irqrestore(&balloon_lock, __flags) +#else +#define balloon_lock(__flags) __flags = 1 +#define balloon_unlock(__flags) __flags = 0 +#endif + + + +#endif /* _XEN_XENFUNC_H_ */ diff --git a/sys/i386/include/xen/xenpmap.h b/sys/i386/include/xen/xenpmap.h new file mode 100644 index 0000000..bab16e7 --- /dev/null +++ b/sys/i386/include/xen/xenpmap.h @@ -0,0 +1,231 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ +void xen_invlpg(vm_offset_t); +void xen_load_cr3(vm_paddr_t); +void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); +void xen_pt_switch(vm_paddr_t); +void xen_set_ldt(vm_paddr_t, unsigned long); +void xen_tlb_flush(void); +void xen_pgdpt_pin(vm_paddr_t); +void xen_pgd_pin(vm_paddr_t); +void xen_pgd_unpin(vm_paddr_t); +void xen_pt_pin(vm_paddr_t); +void xen_pt_unpin(vm_paddr_t); +void xen_flush_queue(void); +void xen_check_queue(void); +#if 0 +void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); +#endif + +#ifdef INVARIANTS +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) +#else +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) +#endif + + +#include <sys/param.h> +#include <sys/pcpu.h> + +#ifdef PMAP_DEBUG +#define PMAP_REF pmap_ref +#define PMAP_DEC_REF_PAGE pmap_dec_ref_page +#define PMAP_MARK_PRIV pmap_mark_privileged +#define PMAP_MARK_UNPRIV pmap_mark_unprivileged +#else +#define PMAP_MARK_PRIV(a) +#define PMAP_MARK_UNPRIV(a) +#define PMAP_REF(a, b) +#define PMAP_DEC_REF_PAGE(a) +#endif + +#define ALWAYS_SYNC 0 + +#ifdef PT_DEBUG +#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) +#else +#define PT_LOG() +#endif + +#define INVALID_P2M_ENTRY (~0UL) + +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ + +#define SH_PD_SET_VA 1 +#define SH_PD_SET_VA_MA 2 +#define SH_PD_SET_VA_CLEAR 3 + +struct pmap; +void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); +#ifdef notyet +static vm_paddr_t +vptetomachpte(vm_paddr_t *pte) +{ + vm_offset_t offset, ppte; + vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; + int pgindex; + + ppte = (vm_offset_t)pte; + pgoffset = (ppte & PAGE_MASK); + offset = ppte - (vm_offset_t)PTmap; + pgindex = ppte >> PDRSHIFT; + + pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); + retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; + return (retval); +} +#endif +#define PT_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : (0)) + +#ifdef WRITABLE_PAGETABLES + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + PT_LOG(); \ + *(_ptp) = xpmap_ptom((_npte)); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + PT_LOG(); \ + *(_ptp) = (_npte); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + PT_LOG(); \ + *(_ptp) = 0; \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#else /* !WRITABLE_PAGETABLES */ + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + xen_queue_pt_update(vtomach(_ptp), \ + xpmap_ptom(_npte)); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + xen_queue_pt_update(vtomach(_ptp), _npte); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + xen_queue_pt_update(vtomach(_ptp), 0); \ + if (sync || ALWAYS_SYNC) \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#endif + +#define PT_SET_MA(_va, _ma) \ +do { \ + PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ + (_ma), \ + UVMF_INVLPG| UVMF_ALL) < 0); \ +} while (/*CONSTCOND*/0) + +#define PT_UPDATES_FLUSH() do { \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +static __inline vm_paddr_t +xpmap_mtop(vm_paddr_t mpa) +{ + vm_paddr_t tmp = (mpa & PG_FRAME); + + return machtophys(tmp) | (mpa & ~PG_FRAME); +} + +static __inline vm_paddr_t +xpmap_ptom(vm_paddr_t ppa) +{ + vm_paddr_t tmp = (ppa & PG_FRAME); + + return phystomach(tmp) | (ppa & ~PG_FRAME); +} + +static __inline void +set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +#ifdef notyet + PANIC_IF(max_mapnr && pfn >= max_mapnr); +#endif + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef notyet + PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); +#endif + return; + } + xen_phys_machine[pfn] = mfn; +} + + + + +#endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/i386/include/xen/xenstored.h b/sys/i386/include/xen/xenstored.h new file mode 100644 index 0000000..e584fa5 --- /dev/null +++ 
b/sys/i386/include/xen/xenstored.h
@@ -0,0 +1,89 @@
+/*
+ * Simple prototype Xen Store Daemon providing a simple tree-like database.
+ * Copyright (C) 2005 Rusty Russell IBM Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XENSTORED_H
+#define _XENSTORED_H
+
+enum xsd_sockmsg_type
+{
+	XS_DEBUG,
+	XS_SHUTDOWN,
+	XS_DIRECTORY,
+	XS_READ,
+	XS_GET_PERMS,
+	XS_WATCH,
+	XS_WATCH_ACK,
+	XS_UNWATCH,
+	XS_TRANSACTION_START,
+	XS_TRANSACTION_END,
+	XS_OP_READ_ONLY = XS_TRANSACTION_END,
+	XS_INTRODUCE,
+	XS_RELEASE,
+	XS_GETDOMAINPATH,
+	XS_WRITE,
+	XS_MKDIR,
+	XS_RM,
+	XS_SET_PERMS,
+	XS_WATCH_EVENT,
+	XS_ERROR,
+};
+
+#define XS_WRITE_NONE "NONE"
+#define XS_WRITE_CREATE "CREATE"
+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
+
+/* We hand errors as strings, for portability. */
+struct xsd_errors
+{
+	int errnum;
+	const char *errstring;
+};
+#define XSD_ERROR(x) { x, #x }
+static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
+	XSD_ERROR(EINVAL),
+	XSD_ERROR(EACCES),
+	XSD_ERROR(EEXIST),
+	XSD_ERROR(EISDIR),
+	XSD_ERROR(ENOENT),
+	XSD_ERROR(ENOMEM),
+	XSD_ERROR(ENOSPC),
+	XSD_ERROR(EIO),
+	XSD_ERROR(ENOTEMPTY),
+	XSD_ERROR(ENOSYS),
+	XSD_ERROR(EROFS),
+	XSD_ERROR(EBUSY),
+	XSD_ERROR(ETIMEDOUT),
+	XSD_ERROR(EISCONN),
+};
+struct xsd_sockmsg
+{
+	uint32_t type;
+	uint32_t len;		/* Length of data following this. */
+
+	/* Generally followed by nul-terminated string(s). */
+};
+
+#endif /* _XENSTORED_H */
diff --git a/sys/i386/include/xen/xenvar.h b/sys/i386/include/xen/xenvar.h
new file mode 100644
index 0000000..402bc8a
--- /dev/null
+++ b/sys/i386/include/xen/xenvar.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * $FreeBSD$ + */ +#ifndef XENVAR_H_ +#define XENVAR_H_ +#define XBOOTUP 0x1 +#define XPMAP 0x2 +extern int xendebug_flags; +#ifndef NOXENDEBUG +#define XENPRINTF printk +#else +#define XENPRINTF printf +#endif +#include <machine/xen/features.h> + +extern xen_pfn_t *xen_phys_machine; + +#if 0 +#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_DEBUG(argflags, _f, _a...) \ +if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a); +#else +#define TRACE_ENTER +#define TRACE_EXIT +#define TRACE_DEBUG(argflags, _f, _a...) +#endif + +extern xen_pfn_t *xen_machine_phys; +/* Xen starts physical pages after the 4MB ISA hole - + * FreeBSD doesn't + */ + + +#undef ADD_ISA_HOLE /* XXX */ + +#ifdef ADD_ISA_HOLE +#define ISA_INDEX_OFFSET 1024 +#define ISA_PDR_OFFSET 1 +#else +#define ISA_INDEX_OFFSET 0 +#define ISA_PDR_OFFSET 0 +#endif + + +#define PFNTOMFN(i) (xen_phys_machine[(i)]) +#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) + +#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) +#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) + +#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) +#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) + +#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) + +#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) +#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) + + +void xpq_init(void); + +#define BITS_PER_LONG 32 +#define NR_CPUS MAX_VIRT_CPUS + +#define BITS_TO_LONGS(bits) \ + (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] +typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } xen_cpumask_t; + +int xen_create_contiguous_region(vm_page_t pages, int npages); + +void xen_destroy_contiguous_region(void * addr, int npages); + +#endif diff --git a/sys/i386/xen/clock.c b/sys/i386/xen/clock.c new file mode 100644 index 0000000..41b2f6a --- /dev/null +++ b/sys/i386/xen/clock.c @@ -0,0 +1,970 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* #define DELAYDEBUG */ +/* + * Routines to handle clock hardware. + */ + +#include "opt_ddb.h" +#include "opt_clock.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timetc.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/sysctl.h> +#include <sys/cons.h> +#include <sys/power.h> + +#include <machine/clock.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#if defined(SMP) +#include <machine/smp.h> +#endif +#include <machine/specialreg.h> +#include <machine/timerreg.h> + +#include <i386/isa/icu.h> +#include <i386/isa/isa.h> +#include <isa/rtc.h> + +#include <machine/xen/xen_intr.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/pmap.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/vcpu.h> +#include <machine/cpu.h> + +/* + * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we + * can use a simple formula for leap years. 
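+ *
+ * Worked example: LEAPYEAR(2000) is true since 2000 % 4 == 0, and the
+ * century exception (1900 and 2100 are not leap years) cannot occur
+ * inside the 1904-2036 window.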
+ */
+#define LEAPYEAR(y) (!((y) % 4))
+#define DAYSPERYEAR   (28+30*4+31*7)
+
+#ifndef TIMER_FREQ
+#define TIMER_FREQ   1193182
+#endif
+
+#ifdef CYC2NS_SCALE_FACTOR
+#undef CYC2NS_SCALE_FACTOR
+#endif
+#define CYC2NS_SCALE_FACTOR 10
+
+/* Values for timerX_state: */
+#define RELEASED 0
+#define RELEASE_PENDING 1
+#define ACQUIRED 2
+#define ACQUIRE_PENDING 3
+
+#define	RTC_LOCK_INIT \
+	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN)
+#define	RTC_LOCK	mtx_lock_spin(&clock_lock)
+#define	RTC_UNLOCK	mtx_unlock_spin(&clock_lock)
+
+int adjkerntz;		/* local offset from GMT in seconds */
+int clkintr_pending;
+int pscnt = 1;
+int psdiv = 1;
+int statclock_disable;
+int disable_rtc_set = 0;
+int wall_cmos_clock;
+u_int timer_freq = TIMER_FREQ;
+static int independent_wallclock;
+static int xen_disable_rtc_set;
+static u_long cached_gtm;	/* cached quotient for TSC -> microseconds */
+static u_long cyc2ns_scale;
+static u_char timer2_state = RELEASED;
+static struct timespec shadow_tv;
+static uint32_t shadow_tv_version;	/* XXX: lazy locking */
+static uint64_t processed_system_time;	/* stime (ns) at last processing. */
+
+
+#ifdef XEN_PRIVILEGED_GUEST
+static struct mtx clock_lock;
+static int rtc_reg;
+#endif
+
+static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31};
+
+SYSCTL_INT(_machdep, OID_AUTO, independent_wallclock,
+    CTLFLAG_RW, &independent_wallclock, 0, "");
+SYSCTL_INT(_machdep, OID_AUTO, xen_disable_rtc_set,
+    CTLFLAG_RW, &xen_disable_rtc_set, 1, "");
+
+
+#define do_div(n,base) ({ \
+	unsigned long __upper, __low, __high, __mod, __base; \
+	__base = (base); \
+	__asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+	__upper = __high; \
+	if (__high) { \
+		__upper = __high % (__base); \
+		__high = __high / (__base); \
+	} \
+	__asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
+	__asm("":"=A" (n):"a" (__low),"d" (__high)); \
+	__mod; \
+})
+
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+	uint64_t tsc_timestamp;     /* TSC at last update of time vals.  */
+	uint64_t system_timestamp;  /* Time, in nanosecs, since boot.    */
+	uint32_t tsc_to_nsec_mul;
+	uint32_t tsc_to_usec_mul;
+	int tsc_shift;
+	uint32_t version;
+};
+static DEFINE_PER_CPU(uint64_t, processed_system_time);
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+
+#define NS_PER_TICK (1000000000ULL/hz)
+
+#define rdtscll(val) \
+	__asm__ __volatile__("rdtsc" : "=A" (val))
+
+
+/* convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_mhz * 10^6))
+ *		ns = cycles * (10^3 / cpu_mhz)
+ *
+ *	Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^3 * SC / cpu_mhz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ *	And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+{
+	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
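+ *
+ * Conceptually this computes (delta << shift) * mul_frac / 2^32.  For
+ * example, mul_frac = 2^31 with shift = 0 halves delta.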
+ */ +static inline uint64_t +scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + uint32_t tmp1, tmp2; + + if ( shift < 0 ) + delta >>= -shift; + else + delta <<= shift; + + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "add %4,%%eax ; " + "xor %5,%5 ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), "2" (mul_frac) ); + + return product; +} + +static uint64_t get_nsec_offset(struct shadow_time_info *shadow) +{ + uint64_t now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_tv_version = s->wc_version; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); + +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &s->vcpu_info[smp_processor_id()].time; + dst = &per_cpu(shadow_time, smp_processor_id()); + + do { + dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } + while ((src->version & 1) | (dst->version ^ src->version)); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; +} + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_info[cpu].time; + dst = &per_cpu(shadow_time, cpu); + + rmb(); + return (dst->version == src->version); +} + +static unsigned xen_get_timecount(struct timecounter *tc); + +static struct timecounter xen_timecounter = { + xen_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + ~0u, /* counter_mask */ + 0, /* frequency */ + "ixen", /* name */ + 0 /* quality */ +}; + +static void +clkintr(struct clockframe *frame) +{ + int64_t delta_cpu, delta; + int cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + + do { + __get_time_values_from_xen(); + + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); + delta -= processed_system_time; + delta_cpu -= per_cpu(processed_system_time, cpu); + + } while (!time_values_up_to_date(cpu)); + + if (unlikely(delta < (int64_t)0) || unlikely(delta_cpu < (int64_t)0)) { + printf("Timer ISR: Time went backwards: %lld\n", delta); + return; + } + + /* Process elapsed ticks since last call. */ + if (delta >= NS_PER_TICK) { + processed_system_time += (delta / NS_PER_TICK) * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += (delta_cpu / NS_PER_TICK) * NS_PER_TICK; + } + hardclock(frame); + + /* + * Take synchronised time from Xen once a minute if we're not + * synchronised ourselves, and we haven't chosen to keep an independent + * time base. 
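+	 * A torn read is avoided by the version protocol: Xen bumps
+	 * wc_version to an odd value while updating, so update_wallclock()
+	 * above rereads until it sees an even, unchanged version.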
+ */ + + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { + update_wallclock(); + tc_setclock(&shadow_tv); + } + + /* XXX TODO */ +} + +static uint32_t +getit(void) +{ + struct shadow_time_info *shadow; + shadow = &per_cpu(shadow_time, smp_processor_id()); + __get_time_values_from_xen(); + return shadow->system_timestamp + get_nsec_offset(shadow); +} + + +/* + * Wait "n" microseconds. + * Relies on timer 1 counting down from (timer_freq / hz) + * Note: timer had better have been programmed before this is first used! + */ +void +DELAY(int n) +{ + int delta, ticks_left; + uint32_t tick, prev_tick; +#ifdef DELAYDEBUG + int getit_calls = 1; + int n1; + static int state = 0; + + if (state == 0) { + state = 1; + for (n1 = 1; n1 <= 10000000; n1 *= 10) + DELAY(n1); + state = 2; + } + if (state == 1) + printf("DELAY(%d)...", n); +#endif + /* + * Read the counter first, so that the rest of the setup overhead is + * counted. Guess the initial overhead is 20 usec (on most systems it + * takes about 1.5 usec for each of the i/o's in getit(). The loop + * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The + * multiplications and divisions to scale the count take a while). + * + * However, if ddb is active then use a fake counter since reading + * the i8254 counter involves acquiring a lock. ddb must not go + * locking for many reasons, but it calls here for at least atkbd + * input. + */ + prev_tick = getit(); + + n -= 0; /* XXX actually guess no initial overhead */ + /* + * Calculate (n * (timer_freq / 1e6)) without using floating point + * and without any avoidable overflows. + */ + if (n <= 0) + ticks_left = 0; + else if (n < 256) + /* + * Use fixed point to avoid a slow division by 1000000. + * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. + * 2^15 is the first power of 2 that gives exact results + * for n between 0 and 256. + */ + ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; + else + /* + * Don't bother using fixed point, although gcc-2.7.2 + * generates particularly poor code for the long long + * division, since even the slow way will complete long + * before the delay is up (unless we're interrupted). + */ + ticks_left = ((u_int)n * (long long)timer_freq + 999999) + / 1000000; + + while (ticks_left > 0) { + tick = getit(); +#ifdef DELAYDEBUG + ++getit_calls; +#endif + delta = tick - prev_tick; + prev_tick = tick; + if (delta < 0) { + /* + * Guard against timer0_max_count being wrong. + * This shouldn't happen in normal operation, + * but it may happen if set_timer_freq() is + * traced. + */ + /* delta += timer0_max_count; ??? */ + if (delta < 0) + delta = 0; + } + ticks_left -= delta; + } +#ifdef DELAYDEBUG + if (state == 1) + printf(" %d calls to getit() at %d usec each\n", + getit_calls, (n + 5) / getit_calls); +#endif +} + + +int +sysbeep(int pitch, int period) +{ + return (0); +} + +/* + * Restore all the timers non-atomically (XXX: should be atomically). + * + * This function is called from pmtimer_resume() to restore all the timers. + * This should not be necessary, but there are broken laptops that do not + * restore all the timers on resume. + */ +void +timer_restore(void) +{ + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. 
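+	 * Both the global tick base and CPU 0's per-CPU copy are
+	 * re-seeded below from the freshly fetched shadow timestamp.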
*/ + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; +} + +void +startrtclock() +{ + unsigned long long alarm; + uint64_t __cpu_khz; + uint32_t cpu_khz; + struct vcpu_time_info *info; + + /* initialize xen values */ + __get_time_values_from_xen(); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; + + __cpu_khz = 1000000ULL << 32; + info = &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(__cpu_khz, info->tsc_to_system_mul); + if ( info->tsc_shift < 0 ) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; + + printf("Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ + { + unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cached_gtm), "=d" (edx) + :"r" (cpu_khz), + "0" (eax), "1" (edx)); + } + + set_cyc2ns_scale(cpu_khz/1000); + tsc_freq = cpu_khz * 1000; + + timer_freq = xen_timecounter.tc_frequency = 1000000000LL; + tc_init(&xen_timecounter); + + + rdtscll(alarm); +} + +#ifdef XEN_PRIVILEGED_GUEST +/* + * RTC support routines + */ + +int +rtcin(reg) + int reg; +{ + u_char val; + + RTC_LOCK; + outb(IO_RTC, reg); + inb(0x84); + val = inb(IO_RTC + 1); + inb(0x84); + RTC_UNLOCK; + return (val); +} + + +static __inline int +readrtc(int port) +{ + return(bcd2bin(rtcin(port))); +} + +void +writertc(int reg, u_char val) +{ + + RTC_LOCK; + if (rtc_reg != reg) { + inb(0x84); + outb(IO_RTC, reg); + rtc_reg = reg; + inb(0x84); + } + outb(IO_RTC + 1, val); + inb(0x84); + RTC_UNLOCK; +} + + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +static void +domu_inittodr(time_t base) +{ + unsigned long sec; + int s, y; + struct timespec ts; + + update_wallclock(); + + RTC_LOCK; + + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + tc_setclock(&shadow_tv); + } + RTC_UNLOCK; +} + +/* + * Write system time back to RTC. + */ +static void +domu_resettodr(void) +{ + unsigned long tm; + int s; + dom0_op_t op; + struct shadow_time_info *shadow; + + shadow = &per_cpu(shadow_time, smp_processor_id()); + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + if ((xen_start_info->flags & SIF_INITDOMAIN) && + !independent_wallclock) + { + op.cmd = DOM0_SETTIME; + op.u.settime.secs = tm; + op.u.settime.nsecs = 0; + op.u.settime.system_time = shadow->system_timestamp; + HYPERVISOR_dom0_op(&op); + update_wallclock(); + } else if (independent_wallclock) { + /* notyet */ + ; + } +} + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. 
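+ * A base of 0 means no hint was available; in that case the initial
+ * tc_setclock() is skipped and the RTC path below supplies the time.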
+ */ +void +inittodr(time_t base) +{ + unsigned long sec, days; + int year, month; + int y, m, s; + struct timespec ts; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_inittodr(base); + return; + } + + if (base) { + s = splclock(); + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + splx(s); + } + + /* Look if we have a RTC present and the time is valid */ + if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) + goto wrong_time; + + /* wait for time update to complete */ + /* If RTCSA_TUP is zero, we have at least 244us before next update */ + s = splhigh(); + while (rtcin(RTC_STATUSA) & RTCSA_TUP) { + splx(s); + s = splhigh(); + } + + days = 0; +#ifdef USE_RTC_CENTURY + year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100; +#else + year = readrtc(RTC_YEAR) + 1900; + if (year < 1970) + year += 100; +#endif + if (year < 1970) { + splx(s); + goto wrong_time; + } + month = readrtc(RTC_MONTH); + for (m = 1; m < month; m++) + days += daysinmonth[m-1]; + if ((month > 2) && LEAPYEAR(year)) + days ++; + days += readrtc(RTC_DAY) - 1; + for (y = 1970; y < year; y++) + days += DAYSPERYEAR + LEAPYEAR(y); + sec = ((( days * 24 + + readrtc(RTC_HRS)) * 60 + + readrtc(RTC_MIN)) * 60 + + readrtc(RTC_SEC)); + /* sec now contains the number of seconds, since Jan 1 1970, + in the local time zone */ + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + y = time_second - sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = sec; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + splx(s); + return; + + wrong_time: + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); +} + + + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + unsigned long tm; + int y, m, s; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_resettodr(); + return; + } + + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + /* Disable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); + + /* Calculate local time to put in RTC */ + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ + writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ + writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ + + /* We have now the days since 01-01-1970 in tm */ + writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ + for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); + tm >= m; + y++, m = DAYSPERYEAR + LEAPYEAR(y)) + tm -= m; + + /* Now we have the years in y and the day-of-the-year in tm */ + writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ +#ifdef USE_RTC_CENTURY + writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... and Century */ +#endif + for (m = 0; ; m++) { + int ml; + + ml = daysinmonth[m]; + if (m == 1 && LEAPYEAR(y)) + ml++; + if (tm < ml) + break; + tm -= ml; + } + + writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ + writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ + + /* Reenable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_24HR); + rtcin(RTC_INTR); +} +#else +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. 
+ */
+void
+inittodr(time_t base)
+{
+	int s, y;
+	struct timespec ts;
+
+	s = splclock();
+	if (base) {
+		ts.tv_sec = base;
+		ts.tv_nsec = 0;
+		tc_setclock(&ts);
+	}
+
+	y = time_second - shadow_tv.tv_sec;
+	if (y <= -2 || y >= 2) {
+		/* badly off, adjust it */
+		ts.tv_sec = shadow_tv.tv_sec;
+		ts.tv_nsec = shadow_tv.tv_nsec;
+		tc_setclock(&ts);
+	}
+	splx(s);
+}
+
+/*
+ * Write system time back to RTC.  Not supported for guest domains.
+ */
+void
+resettodr()
+{
+}
+#endif
+
+
+int
+acquire_timer2(int mode)
+{
+
+	if (timer2_state != RELEASED)
+		return (-1);
+	timer2_state = ACQUIRED;
+
+	/*
+	 * This access to the timer registers is as atomic as possible
+	 * because it is a single instruction.  We could do better if we
+	 * knew the rate.  Use of splclock() limits glitches to 10-100us,
+	 * and this is probably good enough for timer2, so we aren't as
+	 * careful with it as with timer0.
+	 */
+	outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f));
+
+	return (0);
+}
+
+int
+release_timer2()
+{
+
+	if (timer2_state != ACQUIRED)
+		return (-1);
+	timer2_state = RELEASED;
+	outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
+	return (0);
+}
+
+static struct vcpu_set_periodic_timer xen_set_periodic_tick;
+
+/*
+ * Start clocks running.
+ */
+void
+cpu_initclocks(void)
+{
+	int time_irq;
+
+	xen_set_periodic_tick.period_ns = NS_PER_TICK;
+
+	HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
+			   &xen_set_periodic_tick);
+
+	if ((time_irq = bind_virq_to_irqhandler(VIRQ_TIMER, 0, "clk",
+		    (driver_intr_t *)clkintr, INTR_TYPE_CLK | INTR_FAST)) < 0) {
+		panic("failed to register clock interrupt\n");
+	}
+
+	/* should fast clock be enabled ? */
+}
+
+/*
+ *
+ * XXX
+ */
+#if 0 && defined(SMP)
+void
+ap_cpu_initclocks(void)
+{
+	int irq;
+	int cpu = smp_processor_id();
+
+	per_cpu(processed_system_time, cpu) = processed_system_time;
+
+	irq = bind_virq_to_irq(VIRQ_TIMER);
+	PCPU_SET(time_irq, irq);
+	PANIC_IF(intr_add_handler("clk", irq, (driver_intr_t *)clkintr, NULL,
+		    NULL, INTR_TYPE_CLK | INTR_FAST, NULL));
+}
+#endif
+
+void
+cpu_startprofclock(void)
+{
+
+	printf("cpu_startprofclock: profiling clock is not supported\n");
+}
+
+void
+cpu_stopprofclock(void)
+{
+
+	printf("cpu_stopprofclock: profiling clock is not supported\n");
+}
+#define NSEC_PER_USEC 1000
+
+static uint32_t
+xen_get_timecount(struct timecounter *tc)
+{
+	uint64_t clk;
+	struct shadow_time_info *shadow;
+	shadow = &per_cpu(shadow_time, smp_processor_id());
+
+	__get_time_values_from_xen();
+
+	clk = shadow->system_timestamp + get_nsec_offset(shadow);
+
+	return (uint32_t)((clk / NS_PER_TICK) * NS_PER_TICK);
+
+}
+
+/* Return system time offset by ticks */
+uint64_t
+get_system_time(int ticks)
+{
+	return processed_system_time + (ticks * NS_PER_TICK);
+}
+
+/*
+ * Track behavior of cur_timer->get_offset() functionality in timer_tsc.c
+ */
+
+#if 0
+static uint32_t
+xen_get_offset(void)
+{
+	register unsigned long eax, edx;
+
+	/* Read the Time Stamp Counter */
+
+	rdtsc(eax,edx);
+
+	/* .. relative to previous jiffy (32 bits is enough) */
+	eax -= shadow_tsc_stamp;
+
+	/*
+	 * Time offset = (tsc_low delta) * cached_gtm
+	 *             = (tsc_low delta) * (usecs_per_clock)
+	 *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
+	 *
+	 * Using a mull instead of a divl saves up to 31 clock cycles
+	 * in the critical path.
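+	 *
+	 * Worked example (hypothetical 3 GHz CPU): cached_gtm is
+	 * 2^32 / 3000, so a tsc_low delta of 3000 cycles leaves
+	 * roughly 1 usec in %edx after the mull below.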
+	 */
+
+	__asm__("mull %2"
+		:"=a" (eax), "=d" (edx)
+		:"rm" (cached_gtm),
+		"0" (eax));
+
+	/* our adjusted time offset in microseconds */
+	return edx;
+}
+#endif
+void
+idle_block(void)
+{
+
+	__get_time_values_from_xen();
+	PANIC_IF(HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) != 0);
+	HYPERVISOR_sched_op(SCHEDOP_block, 0);
+}
diff --git a/sys/i386/xen/exception.s b/sys/i386/xen/exception.s
new file mode 100644
index 0000000..05282a9
--- /dev/null
+++ b/sys/i386/xen/exception.s
@@ -0,0 +1,484 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_apic.h"
+#include "opt_npx.h"
+
+#include <machine/asmacros.h>
+#include <machine/psl.h>
+#include <machine/trap.h>
+
+
+#include "assym.s"
+
+#define	SEL_RPL_MASK	0x0002
+#define	__HYPERVISOR_iret	23
+
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending	/* 0 */
+#define evtchn_upcall_mask	1
+#define	XEN_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
+#define	XEN_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
+#define	XEN_TEST_PENDING(reg)	testb $0x1,evtchn_upcall_pending(reg)
+
+#define	POPA \
+	popl %edi; \
+	popl %esi; \
+	popl %ebp; \
+	popl %ebx; \
+	popl %ebx; \
+	popl %edx; \
+	popl %ecx; \
+	popl %eax;
+
+	.text
+
+/*****************************************************************************/
+/* Trap handling                                                             */
+/*****************************************************************************/
+/*
+ * Trap and fault vector routines.
+ *
+ * Most traps are 'trap gates', SDT_SYS386TGT.  A trap gate pushes state on
+ * the stack that mostly looks like an interrupt, but does not disable
+ * interrupts.  A few of the traps we use are interrupt gates,
+ * SDT_SYS386IGT, which are nearly the same thing except interrupts are
+ * disabled on entry.
+ *
+ * The cpu will push a certain amount of state onto the kernel stack for
+ * the current process.
The amount of state depends on the type of trap + * and whether the trap crossed rings or not. See i386/include/frame.h. + * At the very least the current EFLAGS (status register, which includes + * the interrupt disable state prior to the trap), the code segment register, + * and the return instruction pointer are pushed by the cpu. The cpu + * will also push an 'error' code for certain traps. We push a dummy + * error code for those traps where the cpu doesn't in order to maintain + * a consistent frame. We also push a contrived 'trap number'. + * + * The cpu does not push the general registers, we must do that, and we + * must restore them prior to calling 'iret'. The cpu adjusts the %cs and + * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we + * must load them with appropriate values for supervisor mode operation. + */ + +MCOUNT_LABEL(user) +MCOUNT_LABEL(btrap) + +#define TRAP(a) pushl $(a) ; jmp alltraps + +IDTVEC(div) + pushl $0; TRAP(T_DIVIDE) +IDTVEC(dbg) + pushl $0; TRAP(T_TRCTRAP) +IDTVEC(nmi) + pushl $0; TRAP(T_NMI) +IDTVEC(bpt) + pushl $0; TRAP(T_BPTFLT) +IDTVEC(ofl) + pushl $0; TRAP(T_OFLOW) +IDTVEC(bnd) + pushl $0; TRAP(T_BOUND) +IDTVEC(ill) + pushl $0; TRAP(T_PRIVINFLT) +IDTVEC(dna) + pushl $0; TRAP(T_DNA) +IDTVEC(fpusegm) + pushl $0; TRAP(T_FPOPFLT) +IDTVEC(tss) + TRAP(T_TSSFLT) +IDTVEC(missing) + TRAP(T_SEGNPFLT) +IDTVEC(stk) + TRAP(T_STKFLT) +IDTVEC(prot) + TRAP(T_PROTFLT) +IDTVEC(page) + TRAP(T_PAGEFLT) +IDTVEC(mchk) + pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd) + pushl $0; TRAP(T_RESERVED) +IDTVEC(fpu) + pushl $0; TRAP(T_ARITHTRAP) +IDTVEC(align) + TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + +IDTVEC(hypervisor_callback) + pushl $0; + pushl $0; + pushal + pushl %ds + pushl %es + pushl %fs +upcall_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) +call_evtchn_upcall: + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 10f + cmpl $ecrit,%eax + jb critical_region_fixup + +10: pushl %esp + call evtchn_do_upcall + addl $4,%esp + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +hypervisor_callback_pending: + movl HYPERVISOR_shared_info,%esi + XEN_BLOCK_EVENTS(%esi) /* cli */ + jmp 10b + + /* + * alltraps entry point. Interrupts are enabled if this was a trap + * gate (TGT), else disabled if this was an interrupt gate (IGT). + * Note that int0x80_syscall is a trap gate. Only page faults + * use an interrupt gate. + */ + + SUPERALIGN_TEXT + .globl alltraps + .type alltraps,@function +alltraps: + pushal + pushl %ds + pushl %es + pushl %fs + +alltraps_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + +calltrap: + call trap + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + +/* + * SYSCALL CALL GATE (old entry point for a.out binaries) + * + * The intersegment call has been set up to specify one dummy parameter. + * + * This leaves a place to put eflags so that the call frame can be + * converted to a trap frame. Note that the eflags is (semi-)bogusly + * pushed into (what will be) tf_err and then copied later into the + * final spot. It has to be done this way because esp can't be just + * temporarily altered for the pushfl - an interrupt might come in + * and clobber the saved cs/eip. 
+ */
+	SUPERALIGN_TEXT
+IDTVEC(lcall_syscall)
+	pushfl				/* save eflags */
+	popl	8(%esp)			/* shuffle into tf_eflags */
+	pushl	$7			/* sizeof "lcall 7,0" */
+	subl	$4,%esp			/* skip over tf_trapno */
+	pushal
+	pushl	%ds
+	pushl	%es
+	pushl	%fs
+	SET_KERNEL_SREGS
+	FAKE_MCOUNT(TF_EIP(%esp))
+	pushl	%esp
+	call	syscall
+	add	$4, %esp
+	MEXITCOUNT
+	jmp	doreti
+
+/*
+ * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80)
+ *
+ * Even though the name says 'int0x80', this is actually a TGT (trap gate)
+ * rather than an IGT (interrupt gate).  Thus interrupts are enabled on
+ * entry just as they are for a normal syscall.
+ */
+	SUPERALIGN_TEXT
+IDTVEC(int0x80_syscall)
+	pushl	$2			/* sizeof "int 0x80" */
+	pushl	$0xBEEF			/* for debug */
+	pushal
+	pushl	%ds
+	pushl	%es
+	pushl	%fs
+	SET_KERNEL_SREGS
+	FAKE_MCOUNT(TF_EIP(%esp))
+	pushl	%esp
+	call	syscall
+	add	$4, %esp
+	MEXITCOUNT
+	jmp	doreti
+
+ENTRY(fork_trampoline)
+	pushl	%esp			/* trapframe pointer */
+	pushl	%ebx			/* arg1 */
+	pushl	%esi			/* function */
+	call	fork_exit
+	addl	$12,%esp
+	/* cut from syscall */
+
+	/*
+	 * Return via doreti to handle ASTs.
+	 */
+	MEXITCOUNT
+	jmp	doreti
+
+
+/*
+ * To efficiently implement classification of trap and interrupt handlers
+ * for profiling, there must be only trap handlers between the labels btrap
+ * and bintr, and only interrupt handlers between the labels bintr and
+ * eintr.  This is implemented (partly) by including files that contain
+ * some of the handlers.  Before including the files, set up a normal asm
+ * environment so that the included files don't need to know that they are
+ * included.
+ */
+
+	.data
+	.p2align 4
+	.text
+	SUPERALIGN_TEXT
+MCOUNT_LABEL(bintr)
+
+#ifdef DEV_ATPIC
+#include <i386/isa/atpic_vector.s>
+#endif
+
+#ifdef DEV_APIC
+	.data
+	.p2align 4
+	.text
+	SUPERALIGN_TEXT
+
+#include <i386/i386/apic_vector.s>
+#endif
+
+	.data
+	.p2align 4
+	.text
+	SUPERALIGN_TEXT
+#include <i386/i386/vm86bios.s>
+
+	.text
+MCOUNT_LABEL(eintr)
+
+/*
+ * void doreti(struct trapframe)
+ *
+ * Handle return from interrupts, traps and syscalls.
+ */
+	.text
+	SUPERALIGN_TEXT
+	.type	doreti,@function
+doreti:
+	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
+doreti_next:
+#ifdef notyet
+	/*
+	 * Check if ASTs can be handled now.  PSL_VM must be checked first
+	 * since segment registers only have an RPL in non-VM86 mode.
+	 */
+	testl	$PSL_VM,TF_EFLAGS(%esp)	/* are we in vm86 mode? */
+	jz	doreti_notvm86
+	movl	PCPU(CURPCB),%ecx
+	testl	$PCB_VM86CALL,PCB_FLAGS(%ecx)	/* are we in a vm86 call? */
+	jz	doreti_ast		/* can handle ASTS now if not */
+	jmp	doreti_exit
+
+doreti_notvm86:
+#endif
+	testb	$SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
+	jz	doreti_exit		/* can't handle ASTs now if not */
+
+doreti_ast:
+	/*
+	 * Check for ASTs atomically with returning.  Disabling CPU
+	 * interrupts provides sufficient locking even in the SMP case,
+	 * since we will be informed of any new ASTs by an IPI.
+	 */
+	movl	HYPERVISOR_shared_info,%esi
+	XEN_BLOCK_EVENTS(%esi)		/* cli */
+	movl	PCPU(CURTHREAD),%eax
+	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax)
+	je	doreti_exit
+	XEN_UNBLOCK_EVENTS(%esi)	/* sti */
+	pushl	%esp			/* pass a pointer to the trapframe */
+	call	ast
+	add	$4,%esp
+	jmp	doreti_ast
+
+	/*
+	 * doreti_exit:	pop registers, iret.
+	 *
+	 *	The segment register pop is a special case, since it may
+	 *	fault if (for example) a sigreturn specifies bad segment
+	 *	registers.  The fault is handled in trap.c.
+	 */
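
Rendered in C, the doreti_ast loop above looks like the sketch below. The point of XEN_BLOCK_EVENTS before the flag test is that a freshly posted AST cannot slip in between the test and the return to user mode; if work is found, events are reopened, ast() runs, and the test repeats. All names here are illustrative stand-ins, not the kernel's real interfaces.

#include <stdint.h>

#define TDF_ASTPENDING	0x01		/* illustrative flag values */
#define TDF_NEEDRESCHED	0x02

extern void block_events(void);		/* stands in for XEN_BLOCK_EVENTS ("cli") */
extern void unblock_events(void);	/* stands in for XEN_UNBLOCK_EVENTS ("sti") */
extern void ast(void);			/* stands in for the kernel's ast() */

/* Sketch of the doreti_ast loop: test the thread flags with events
 * blocked so that an AST posted by an IPI cannot be missed between the
 * test and the final iret. */
static void
doreti_ast_sketch(volatile uint32_t *td_flags)
{
	for (;;) {
		block_events();
		if ((*td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) == 0)
			return;	/* leave with events blocked; doreti_exit unmasks */
		unblock_events();
		ast();
	}
}
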
+doreti_exit:
+	movl HYPERVISOR_shared_info,%esi
+	XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks (sti)
+
+	.globl	scrit
+scrit:
+	XEN_TEST_PENDING(%esi)
+	jnz	hypervisor_callback_pending /* More to go */
+
+	MEXITCOUNT
+
+	.globl	doreti_popl_fs
+doreti_popl_fs:
+	popl	%fs
+	.globl	doreti_popl_es
+doreti_popl_es:
+	popl	%es
+	.globl	doreti_popl_ds
+doreti_popl_ds:
+	popl	%ds
+
+	/*
+	 * This is important: as nothing is atomic over here (we can get
+	 * interrupted any time), we use the critical_region_fixup() in
+	 * order to figure out where our stack is. Therefore, do NOT use
+	 * 'popal' here without fixing up the table!
+	 */
+	POPA
+	addl $8,%esp
+	.globl	doreti_iret
+doreti_iret:
+/* #jmp	hypercall_page + (__HYPERVISOR_iret * 32) */
+	iret
+	.globl	ecrit
+ecrit:
+	/*
+	 * doreti_iret_fault and friends.  Alternative return code for
+	 * the case where we get a fault in the doreti_exit code
+	 * above.  trap() (i386/i386/trap.c) catches this specific
+	 * case, sends the process a signal and continues in the
+	 * corresponding place in the code below.
+	 */
+	ALIGN_TEXT
+	.globl	doreti_iret_fault
+doreti_iret_fault:
+	subl	$8,%esp
+	pushal
+	pushl	%ds
+	.globl	doreti_popl_ds_fault
+doreti_popl_ds_fault:
+	pushl	%es
+	.globl	doreti_popl_es_fault
+doreti_popl_es_fault:
+	pushl	%fs
+	.globl	doreti_popl_fs_fault
+doreti_popl_fs_fault:
+	movl	$0,TF_ERR(%esp)	/* XXX should be the error code */
+	movl	$T_PROTFLT,TF_TRAPNO(%esp)
+	jmp	alltraps_with_regs_pushed
+
+	/*
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame.
+*/
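
A C rendering of the fixup that follows may help: the table maps each byte offset within scrit..ecrit to the number of frame bytes already popped at that instruction, and the copy slides the still-valid low part of the frame up against the 0x40-byte frame boundary so the two frames merge. The names below are invented; the pointer arithmetic mirrors the %esi/%edi use in the assembly.

#include <stddef.h>
#include <string.h>

/* Sketch of critical_region_fixup: 'ip_off' is the interrupted EIP minus
 * scrit; the table entry says how many bytes had already been popped. */
static unsigned char *
merge_frames_sketch(unsigned char *sp, const unsigned char *fixup_table,
    size_t ip_off)
{
	size_t popped = fixup_table[ip_off];
	unsigned char *src = sp + popped;	/* end of source region */
	unsigned char *dst = sp + 0x40;		/* end of destination region */
	size_t n;

	for (n = popped / 4; n > 0; n--) {	/* pre-decrementing word copy */
		src -= 4;
		dst -= 4;
		memcpy(dst, src, 4);
	}
	return (sp + 0x40 - popped);		/* top of the merged stack */
}
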
+
+.globl critical_region_fixup
+critical_region_fixup:
+	addl $critical_fixup_table-scrit,%eax
+	movzbl (%eax),%eax		# %eax contains num bytes popped
+	movl %esp,%esi
+	add %eax,%esi			# %esi points at end of src region
+	movl %esp,%edi
+	add $0x40,%edi			# %edi points at end of dst region
+	movl %eax,%ecx
+	shr $2,%ecx			# convert bytes to words
+	je 16f				# skip loop if nothing to copy
+15:	subl $4,%esi			# pre-decrementing copy loop
+	subl $4,%edi
+	movl (%esi),%eax
+	movl %eax,(%edi)
+	loop 15b
+16:	movl %edi,%esp			# final %edi is top of merged stack
+	jmp hypervisor_callback_pending
+
+
+critical_fixup_table:
+.byte 0x0,0x0,0x0			#testb $0x1,(%esi)
+.byte 0x0,0x0,0x0,0x0,0x0,0x0		#jne ea
+.byte 0x0,0x0				#pop %fs
+.byte 0x04				#pop %es
+.byte 0x08				#pop %ds
+.byte 0x0c				#pop %edi
+.byte 0x10				#pop %esi
+.byte 0x14				#pop %ebp
+.byte 0x18				#pop %ebx
+.byte 0x1c				#pop %ebx
+.byte 0x20				#pop %edx
+.byte 0x24				#pop %ecx
+.byte 0x28				#pop %eax
+.byte 0x2c,0x2c,0x2c			#add $0x8,%esp
+.byte 0x34				#iret
+/* .byte 0x34,0x34,0x34,0x34,0x34	#HYPERVISOR_iret */
+
+
+/* # Hypervisor uses this for application faults while it executes.*/
+ENTRY(failsafe_callback)
+	pushal
+	call xen_failsafe_handler
+/*#	call install_safe_pf_handler */
+	movl 28(%esp),%ebx
+1:	movl %ebx,%ds
+	movl 32(%esp),%ebx
+2:	movl %ebx,%es
+	movl 36(%esp),%ebx
+3:	movl %ebx,%fs
+	movl 40(%esp),%ebx
+4:	movl %ebx,%gs
+/*#	call install_normal_pf_handler */
+	popal
+	addl $12,%esp
+	iret
+
+
diff --git a/sys/i386/xen/locore.s b/sys/i386/xen/locore.s
new file mode 100644
index 0000000..4f54525
--- /dev/null
+++ b/sys/i386/xen/locore.s
@@ -0,0 +1,373 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)locore.s	7.3 (Berkeley) 5/13/91
+ * $FreeBSD$
+ *
+ *		originally from: locore.s, by William F. Jolitz
+ *
+ *		Substantially rewritten by David Greenman, Rod Grimes,
+ *			Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp
+ *			and many others.
+ */ + +#include "opt_bootp.h" +#include "opt_compat.h" +#include "opt_nfsroot.h" +#include "opt_global.h" +#include "opt_pmap.h" + +#include <sys/syscall.h> +#include <sys/reboot.h> + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/psl.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> + +#define __ASSEMBLY__ +#include <xen/interface/elfnote.h> + +/* The defines below have been lifted out of <machine/xen-public/arch-x86_32.h> */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define KERNEL_CS FLAT_RING1_CS +#define KERNEL_DS FLAT_RING1_DS + +#include "assym.s" + +.section __xen_guest + .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" + .byte 0 + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) +#if 0 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") + +#ifdef PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) + + + +/* + * XXX + * + * Note: This version greatly munged to avoid various assembler errors + * that may be fixed in newer versions of gas. Perhaps newer versions + * will have more pleasant appearance. + */ + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). 
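
The ELFNOTE entries above are the contract between this kernel and the Xen domain builder: before any guest instruction runs, the builder parses them to learn the virtual base, the entry point, where to map the hypercall page, and which features the kernel tolerates. Each is a standard ELF note whose owner is "Xen"; a sketch of the record, assuming the usual 4-byte-aligned note layout:

#include <stdint.h>

/* Sketch of one Xen ELF note as the domain builder reads it.  For the
 * XEN_ELFNOTE_VIRT_BASE entry above, the payload is the 32-bit value
 * of KERNBASE. */
struct xen_elfnote_sketch {
	uint32_t	namesz;		/* sizeof("Xen") == 4 */
	uint32_t	descsz;		/* payload size, e.g. 4 for a .long */
	uint32_t	type;		/* e.g. XEN_ELFNOTE_VIRT_BASE */
	char		name[4];	/* "Xen\0" */
	/* followed by descsz payload bytes, padded to a 4-byte boundary */
};
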
+ */ + .globl PTmap,PTD,PTDpde + .set PTmap,(PTDPTDI << PDRSHIFT) + .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) + .set PTDpde,PTD + (PTDPTDI * PDESIZE) + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase,KERNBASE + .globl kernload + .set kernload,KERNLOAD + +/* + * Globals + */ + .data + ALIGN_DATA /* just to be sure */ + + .space 0x2000 /* space for tmpstk - temporary stack */ +tmpstk: + + .globl bootinfo +bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + + .globl KERNend +KERNend: .long 0 /* phys addr end of kernel (just after bss) */ + .globl physfree +physfree: .long 0 /* phys addr of next free page */ + +#ifdef SMP + .globl cpu0prvpage +cpu0pp: .long 0 /* phys addr cpu0 private pg */ +cpu0prvpage: .long 0 /* relocated version */ + + .globl SMPpt +SMPptpa: .long 0 /* phys addr SMP page table */ +SMPpt: .long 0 /* relocated version */ +#endif /* SMP */ + + .globl IdlePTD +IdlePTD: .long 0 /* phys addr of kernel PTD */ + +#ifdef PAE + .globl IdlePDPT +IdlePDPT: .long 0 /* phys addr of kernel PDPT */ +#endif + +#ifdef SMP + .globl KPTphys + .globl gdtset +#endif +KPTphys: .long 0 /* phys addr of kernel page tables */ +gdtset: .long 0 + + .globl proc0kstack +proc0uarea: .long 0 /* address of proc 0 uarea (unused)*/ +proc0kstack: .long 0 /* address of proc 0 kstack space */ +p0upa: .long 0 /* phys addr of proc0 UAREA (unused) */ +p0kpa: .long 0 /* phys addr of proc0's STACK */ + +vm86phystk: .long 0 /* PA of vm86/bios stack */ + + .globl vm86paddr, vm86pa +vm86paddr: .long 0 /* address of vm86 region */ +vm86pa: .long 0 /* phys addr of vm86 region */ + +#ifdef PC98 + .globl pc98_system_parameter +pc98_system_parameter: + .space 0x240 +#endif + + .globl avail_space +avail_space: .long 0 + +/********************************************************************** + * + * Some handy macros + * + */ + +/* + * We're already in protected mode, so no remapping is needed. + */ +#define R(foo) (foo) + +#define ALLOCPAGES(foo) \ + movl R(physfree), %esi ; \ + movl $((foo)*PAGE_SIZE), %eax ; \ + addl %esi, %eax ; \ + movl %eax, R(physfree) ; \ + movl %esi, %edi ; \ + movl $((foo)*PAGE_SIZE),%ecx ; \ + xorl %eax,%eax ; \ + cld ; \ + rep ; \ + stosb + +/* + * fillkpt + * eax = page frame address + * ebx = index into page table + * ecx = how many pages to map + * base = base address of page dir/table + * prot = protection bits + */ +#define fillkpt(base, prot) \ + shll $PTESHIFT,%ebx ; \ + addl base,%ebx ; \ + orl $PG_V,%eax ; \ + orl prot,%eax ; \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $PTESIZE,%ebx ; /* next pte */ \ + loop 1b + +/* + * fillkptphys(prot) + * eax = physical address + * ecx = how many pages to map + * prot = protection bits + */ +#define fillkptphys(prot) \ + movl %eax, %ebx ; \ + shrl $PAGE_SHIFT, %ebx ; \ + fillkpt(R(KPTphys), prot) + +/* Temporary stack */ +.space 8192 +tmpstack: + .long tmpstack, KERNEL_DS + + .text + +.p2align 12, 0x90 + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + .cfi_startproc + .skip 0x1000 + .cfi_endproc + +/********************************************************************** + * + * This is where the bootblocks start us, set the ball rolling... 
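
The fillkpt macro above is the workhorse of early page-table setup: given a starting physical address in %eax, a table index in %ebx, and a page count in %ecx, it stamps out consecutive PTEs, OR-ing in PG_V plus the caller's protection bits. A hedged C equivalent for the non-PAE case (4-byte entries; all names invented):

#include <stdint.h>

#define PG_V_SKETCH		0x001	/* valid bit, as PG_V above */
#define PAGE_SIZE_SKETCH	4096

/* Sketch of fillkpt(base, prot): map 'count' pages starting at physical
 * address 'pa' into consecutive entries beginning at index 'idx'. */
static void
fillkpt_sketch(uint32_t *base, uint32_t idx, uint32_t pa, uint32_t count,
    uint32_t prot)
{
	uint32_t *pte = base + idx;

	while (count-- > 0) {
		*pte++ = pa | PG_V_SKETCH | prot;	/* write next PTE */
		pa += PAGE_SIZE_SKETCH;			/* next physical page */
	}
}
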
+ * + */ +NON_GPROF_ENTRY(btext) + /* At the end of our stack, we shall have free space - so store it */ + movl %esp,%ebx + movl %ebx,R(avail_space) + + lss tmpstack,%esp + + pushl %esi + call initvalues + popl %esi + + /* Store the CPUID information */ + xorl %eax,%eax + cpuid # cpuid 0 + movl %eax,R(cpu_high) # highest capability + movl %ebx,R(cpu_vendor) # store vendor string + movl %edx,R(cpu_vendor+4) + movl %ecx,R(cpu_vendor+8) + movb $0,R(cpu_vendor+12) + + movl $1,%eax + cpuid # cpuid 1 + movl %eax,R(cpu_id) # store cpu_id + movl %ebx,R(cpu_procinfo) # store cpu_procinfo + movl %edx,R(cpu_feature) # store cpu_feature + movl %ecx,R(cpu_feature2) # store cpu_feature2 + rorl $8,%eax # extract family type + andl $15,%eax + cmpl $5,%eax + movl $CPU_686,R(cpu) + + movl proc0kstack,%eax + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + xorl %ebp,%ebp /* mark end of frames */ +#ifdef PAE + movl IdlePDPT,%esi +#else + movl IdlePTD,%esi +#endif + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + pushl physfree + call init386 + addl $4, %esp + call mi_startup + /* NOTREACHED */ + int $3 + +/* + * Signal trampoline, copied to top of user stack + */ +NON_GPROF_ENTRY(sigcode) + calll *SIGF_HANDLER(%esp) + leal SIGF_UC(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC_EFLAGS(%eax) + jne 1f + mov UC_GS(%eax), %gs /* restore %gs */ +1: + movl $SYS_sigreturn,%eax + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b + +#ifdef COMPAT_FREEBSD4 + ALIGN_TEXT +freebsd4_sigcode: + calll *SIGF_HANDLER(%esp) + leal SIGF_UC4(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC4_EFLAGS(%eax) + jne 1f + mov UC4_GS(%eax),%gs /* restore %gs */ +1: + movl $344,%eax /* 4.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b +#endif + +#ifdef COMPAT_43 + ALIGN_TEXT +osigcode: + call *SIGF_HANDLER(%esp) /* call signal handler */ + lea SIGF_SC(%esp),%eax /* get sigcontext */ + pushl %eax + testl $PSL_VM,SC_PS(%eax) + jne 9f + movl SC_GS(%eax),%gs /* restore %gs */ +9: + movl $103,%eax /* 3.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ +0: jmp 0b +#endif /* COMPAT_43 */ + + ALIGN_TEXT +esigcode: + + .data + .globl szsigcode +szsigcode: + .long esigcode-sigcode +#ifdef COMPAT_FREEBSD4 + .globl szfreebsd4_sigcode +szfreebsd4_sigcode: + .long esigcode-freebsd4_sigcode +#endif +#ifdef COMPAT_43 + .globl szosigcode +szosigcode: + .long esigcode-osigcode +#endif diff --git a/sys/i386/xen/machdep.c b/sys/i386/xen/machdep.c new file mode 100644 index 0000000..ba6b7ff --- /dev/null +++ b/sys/i386/xen/machdep.c @@ -0,0 +1,3275 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_apic.h" +#include "opt_atalk.h" +#include "opt_compat.h" +#include "opt_cpu.h" +#include "opt_ddb.h" +#include "opt_global.h" +#include "opt_inet.h" +#include "opt_ipx.h" +#include "opt_isa.h" +#include "opt_kstack_pages.h" +#include "opt_maxmem.h" +#include "opt_msgbuf.h" +#include "opt_npx.h" +#include "opt_perfmon.h" +#include "opt_xbox.h" + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/bus.h> +#include <sys/callout.h> +#include <sys/clock.h> +#include <sys/cons.h> +#include <sys/cpu.h> +#include <sys/eventhandler.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/ptrace.h> +#include <sys/reboot.h> +#include <sys/sched.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/ucontext.h> +#include <sys/vmmeter.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_param.h> + +#ifdef DDB +#ifndef KDB +#error KDB must be enabled in order for DDB to work! 
+#endif +#include <ddb/ddb.h> +#include <ddb/db_sym.h> +#endif + +#include <isa/rtc.h> + +#include <net/netisr.h> + +#include <machine/bootinfo.h> +#include <machine/clock.h> +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/pc/bios.h> +#include <machine/pcb.h> +#include <machine/pcb_ext.h> +#include <machine/proc.h> +#include <machine/reg.h> +#include <machine/sigframe.h> +#include <machine/specialreg.h> +#include <machine/vm86.h> +#ifdef PERFMON +#include <machine/perfmon.h> +#endif +#ifdef SMP +#include <machine/privatespace.h> +#include <machine/smp.h> +#endif + +#ifdef DEV_ISA +#include <i386/isa/icu.h> +#endif + +#ifdef XBOX +#include <machine/xbox.h> + +int arch_i386_is_xbox = 0; +uint32_t arch_i386_xbox_memsize = 0; +#endif + +#ifdef XEN +/* XEN includes */ +#include <machine/xen/hypervisor-ifs.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#include <machine/xen/xen_intr.h> + +void Xhypervisor_callback(void); +void failsafe_callback(void); + +int gdt_set; +extern trap_info_t trap_table[]; +struct proc_ldt default_proc_ldt; +extern int init_first; +int running_xen = 1; +#endif + +/* Sanity check for __curthread() */ +CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); + +extern void init386(int first); +extern void dblfault_handler(void); + +extern void printcpuinfo(void); /* XXX header file */ +extern void finishidentcpu(void); +extern void panicifcpuunsupported(void); +extern void initializecpu(void); + +#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) +#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) + +#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif + +static void cpu_startup(void *); +static void fpstate_drop(struct thread *td); +static void get_fpcontext(struct thread *td, mcontext_t *mcp); +static int set_fpcontext(struct thread *td, const mcontext_t *mcp); +#ifdef CPU_ENABLE_SSE +static void set_fpregs_xmm(struct save87 *, struct savexmm *); +static void fill_fpregs_xmm(struct savexmm *, struct save87 *); +#endif /* CPU_ENABLE_SSE */ +SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) + +#ifdef DDB +extern vm_offset_t ksym_start, ksym_end; +#endif + +int _udatasel, _ucodesel; +u_int basemem; + +int cold = 1; + +#ifdef COMPAT_43 +static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); +#endif +#ifdef COMPAT_FREEBSD4 +static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); +#endif + +long Maxmem = 0; +long realmem = 0; + +#define PHYSMAP_SIZE (2 * 16) + +vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; +vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; + +/* must be 2 less so 0 0 can signal end of chunks */ +#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) +#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) + +struct kva_md_info kmi; + +static struct trapframe proc0_tf; +#ifndef SMP +static struct pcpu __pcpu; +#endif + +struct mtx icu_lock; + +struct mem_range_softc mem_range_softc; + +static void +cpu_startup(dummy) + void *dummy; +{ + /* + * Good {morning,afternoon,evening,night}. 
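
The phys_avail[] and dump_avail[] arrays declared above are lists of (start, end) physical address pairs, sized two entries short precisely so that a trailing 0,0 pair can mark the end of the list. A sketch of how a consumer walks them, modeled on the bootverbose loop in cpu_startup() just below (types simplified):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t vm_paddr_sketch_t;	/* stands in for vm_paddr_t */

/* Walk (start, end) pairs until the 0,0 terminator. */
static void
dump_phys_chunks(const vm_paddr_sketch_t *avail)
{
	int i;

	for (i = 0; avail[i + 1] != 0; i += 2) {
		vm_paddr_sketch_t size = avail[i + 1] - avail[i];

		printf("0x%016jx - 0x%016jx, %ju bytes\n",
		    (uintmax_t)avail[i],
		    (uintmax_t)(avail[i + 1] - 1),
		    (uintmax_t)size);
	}
}
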
+ */ + startrtclock(); + printcpuinfo(); + panicifcpuunsupported(); +#ifdef PERFMON + perfmon_init(); +#endif + printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), + ptoa((uintmax_t)Maxmem) / 1048576); + realmem = Maxmem; + /* + * Display any holes after the first chunk of extended memory. + */ + if (bootverbose) { + int indx; + + printf("Physical memory chunk(s):\n"); + for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { + vm_paddr_t size; + + size = phys_avail[indx + 1] - phys_avail[indx]; + printf( + "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", + (uintmax_t)phys_avail[indx], + (uintmax_t)phys_avail[indx + 1] - 1, + (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); + } + } + + vm_ksubmap_init(&kmi); + + printf("avail memory = %ju (%ju MB)\n", + ptoa((uintmax_t)cnt.v_free_count), + ptoa((uintmax_t)cnt.v_free_count) / 1048576); + + /* + * Set up buffers, so they can be used to read disk labels. + */ + bufinit(); + vm_pager_bufferinit(); +#ifndef XEN + cpu_setregs(); +#endif +} + +/* + * Send an interrupt to process. + * + * Stack is set up to allow sigcode stored + * at top to call routine, followed by kcall + * to sigreturn routine below. After sigreturn + * resets the signal mask, the stack, and the + * frame pointer, it returns to the user + * specified pc, psl. + */ +#ifdef COMPAT_43 +static void +osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) +{ + struct osigframe sf, *fp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + struct trapframe *regs; + int sig; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + sig = ksi->ksi_signo; + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Allocate space for the signal handler context. */ + if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + fp = (struct osigframe *)(td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct osigframe)); +#if defined(COMPAT_43) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + fp = (struct osigframe *)regs->tf_esp - 1; + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_arg2 = (register_t)&fp->sf_siginfo; + sf.sf_siginfo.si_signo = sig; + sf.sf_siginfo.si_code = ksi->ksi_code; + sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_arg2 = ksi->ksi_code; + sf.sf_addr = (register_t)ksi->ksi_addr; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + + /* Save most if not all of trap frame. 
*/ + sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; + sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; + sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; + sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; + sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; + sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; + sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; + sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; + sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; + sf.sf_siginfo.si_sc.sc_es = regs->tf_es; + sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; + sf.sf_siginfo.si_sc.sc_gs = rgs(); + sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; + + /* Build the signal context to be used by osigreturn(). */ + sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; + SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); + sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; + sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; + sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; + sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; + sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; + sf.sf_siginfo.si_sc.sc_err = regs->tf_err; + + /* + * If we're a vm86 process, we want to save the segment registers. + * We also change eflags to be our emulated eflags, not the actual + * eflags. + */ + if (regs->tf_eflags & PSL_VM) { + /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; + + sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; + sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; + sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; + sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; + + if (vm86->vm86_has_vme == 0) + sf.sf_siginfo.si_sc.sc_ps = + (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | + (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); + + /* See sendsig() for comments. */ + tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); + } + + /* + * Copy the sigframe out to the user's stack. + */ + if (copyout(&sf, fp, sizeof(*fp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)fp; + regs->tf_eip = PS_STRINGS - szosigcode; + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + load_gs(_udatasel); + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} +#endif /* COMPAT_43 */ + +#ifdef COMPAT_FREEBSD4 +static void +freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) +{ + struct sigframe4 sf, *sfp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + struct trapframe *regs; + int sig; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + sig = ksi->ksi_signo; + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Save user context. */ + bzero(&sf, sizeof(sf)); + sf.sf_uc.uc_sigmask = *mask; + sf.sf_uc.uc_stack = td->td_sigstk; + sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; + sf.sf_uc.uc_mcontext.mc_gs = rgs(); + bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); + + /* Allocate space for the signal handler context. 
*/ + if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct sigframe4)); +#if defined(COMPAT_43) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + sfp = (struct sigframe4 *)regs->tf_esp - 1; + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_ucontext = (register_t)&sfp->sf_uc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_siginfo = (register_t)&sfp->sf_si; + sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; + + /* Fill in POSIX parts */ + sf.sf_si.si_signo = sig; + sf.sf_si.si_code = ksi->ksi_code; + sf.sf_si.si_addr = ksi->ksi_addr; + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_siginfo = ksi->ksi_code; + sf.sf_addr = (register_t)ksi->ksi_addr; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + + /* + * If we're a vm86 process, we want to save the segment registers. + * We also change eflags to be our emulated eflags, not the actual + * eflags. + */ + if (regs->tf_eflags & PSL_VM) { + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; + + sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; + sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; + sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; + sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; + + if (vm86->vm86_has_vme == 0) + sf.sf_uc.uc_mcontext.mc_eflags = + (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | + (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); + + /* + * Clear PSL_NT to inhibit T_TSSFLT faults on return from + * syscalls made by the signal handler. This just avoids + * wasting time for our lazy fixup of such faults. PSL_NT + * does nothing in vm86 mode, but vm86 programs can set it + * almost legitimately in probes for old cpu types. + */ + tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); + } + + /* + * Copy the sigframe out to the user's stack. + */ + if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)sfp; + regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} +#endif /* COMPAT_FREEBSD4 */ + +void +sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) +{ + struct sigframe sf, *sfp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + char *sp; + struct trapframe *regs; + int sig; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + sig = ksi->ksi_signo; + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); +#ifdef COMPAT_FREEBSD4 + if (SIGISMEMBER(psp->ps_freebsd4, sig)) { + freebsd4_sendsig(catcher, ksi, mask); + return; + } +#endif +#ifdef COMPAT_43 + if (SIGISMEMBER(psp->ps_osigset, sig)) { + osendsig(catcher, ksi, mask); + return; + } +#endif + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Save user context. 
*/ + bzero(&sf, sizeof(sf)); + sf.sf_uc.uc_sigmask = *mask; + sf.sf_uc.uc_stack = td->td_sigstk; + sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; + sf.sf_uc.uc_mcontext.mc_gs = rgs(); + bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); + sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ + get_fpcontext(td, &sf.sf_uc.uc_mcontext); + fpstate_drop(td); + + /* Allocate space for the signal handler context. */ + if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + sp = td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct sigframe); +#if defined(COMPAT_43) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + sp = (char *)regs->tf_esp - sizeof(struct sigframe); + /* Align to 16 bytes. */ + sfp = (struct sigframe *)((unsigned int)sp & ~0xF); + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_ucontext = (register_t)&sfp->sf_uc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_siginfo = (register_t)&sfp->sf_si; + sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; + + /* Fill in POSIX parts */ + sf.sf_si = ksi->ksi_info; + sf.sf_si.si_signo = sig; /* maybe a translated signal */ + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_siginfo = ksi->ksi_code; + sf.sf_addr = (register_t)ksi->ksi_addr; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + + /* + * If we're a vm86 process, we want to save the segment registers. + * We also change eflags to be our emulated eflags, not the actual + * eflags. + */ + if (regs->tf_eflags & PSL_VM) { + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; + + sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; + sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; + sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; + sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; + + if (vm86->vm86_has_vme == 0) + sf.sf_uc.uc_mcontext.mc_eflags = + (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | + (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); + + /* + * Clear PSL_NT to inhibit T_TSSFLT faults on return from + * syscalls made by the signal handler. This just avoids + * wasting time for our lazy fixup of such faults. PSL_NT + * does nothing in vm86 mode, but vm86 programs can set it + * almost legitimately in probes for old cpu types. + */ + tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); + } + + /* + * Copy the sigframe out to the user's stack. + */ + if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)sfp; + regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} + +/* + * System call to cleanup state after a signal + * has been taken. Reset signal mask and + * stack state from context left by sendsig (above). + * Return to previous pc and psl as specified by + * context left by sendsig. 
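
All three sendsig variants above place the frame the same way: on the alternate stack if one is armed and the signal was marked for it, otherwise just below the interrupted user %esp, and (in the current variant) rounded down to a 16-byte boundary. A compact sketch of that decision; use_altstack folds together the TDP_ALTSTACK/oonstack/ps_sigonstack test, and all names are illustrative:

#include <stddef.h>
#include <stdint.h>

/* Sketch of sendsig's frame placement. */
static void *
place_sigframe_sketch(int use_altstack, char *ss_sp, size_t ss_size,
    uintptr_t user_esp, size_t framesz)
{
	char *sp;

	if (use_altstack)
		sp = ss_sp + ss_size - framesz;		/* top of sigaltstack */
	else
		sp = (char *)user_esp - framesz;	/* below interrupted esp */
	return ((void *)((uintptr_t)sp & ~(uintptr_t)0xF));	/* 16-byte align */
}
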
Check carefully to + * make sure that the user has not modified the + * state to gain improper privileges. + * + * MPSAFE + */ +#ifdef COMPAT_43 +int +osigreturn(td, uap) + struct thread *td; + struct osigreturn_args /* { + struct osigcontext *sigcntxp; + } */ *uap; +{ + struct osigcontext sc; + struct trapframe *regs; + struct osigcontext *scp; + struct proc *p = td->td_proc; + int eflags, error; + ksiginfo_t ksi; + + regs = td->td_frame; + error = copyin(uap->sigcntxp, &sc, sizeof(sc)); + if (error != 0) + return (error); + scp = ≻ + eflags = scp->sc_ps; + if (eflags & PSL_VM) { + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86; + + /* + * if pcb_ext == 0 or vm86_inited == 0, the user hasn't + * set up the vm86 area, and we can't enter vm86 mode. + */ + if (td->td_pcb->pcb_ext == 0) + return (EINVAL); + vm86 = &td->td_pcb->pcb_ext->ext_vm86; + if (vm86->vm86_inited == 0) + return (EINVAL); + + /* Go back to user mode if both flags are set. */ + if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGBUS; + ksi.ksi_code = BUS_OBJERR; + ksi.ksi_addr = (void *)regs->tf_eip; + trapsignal(td, &ksi); + } + + if (vm86->vm86_has_vme) { + eflags = (tf->tf_eflags & ~VME_USERCHANGE) | + (eflags & VME_USERCHANGE) | PSL_VM; + } else { + vm86->vm86_eflags = eflags; /* save VIF, VIP */ + eflags = (tf->tf_eflags & ~VM_USERCHANGE) | + (eflags & VM_USERCHANGE) | PSL_VM; + } + tf->tf_vm86_ds = scp->sc_ds; + tf->tf_vm86_es = scp->sc_es; + tf->tf_vm86_fs = scp->sc_fs; + tf->tf_vm86_gs = scp->sc_gs; + tf->tf_ds = _udatasel; + tf->tf_es = _udatasel; + tf->tf_fs = _udatasel; + } else { + /* + * Don't allow users to change privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + return (EINVAL); + } + + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. + */ + if (!CS_SECURE(scp->sc_cs)) { + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGBUS; + ksi.ksi_code = BUS_OBJERR; + ksi.ksi_trapno = T_PROTFLT; + ksi.ksi_addr = (void *)regs->tf_eip; + trapsignal(td, &ksi); + return (EINVAL); + } + regs->tf_ds = scp->sc_ds; + regs->tf_es = scp->sc_es; + regs->tf_fs = scp->sc_fs; + } + + /* Restore remaining registers. 
*/ + regs->tf_eax = scp->sc_eax; + regs->tf_ebx = scp->sc_ebx; + regs->tf_ecx = scp->sc_ecx; + regs->tf_edx = scp->sc_edx; + regs->tf_esi = scp->sc_esi; + regs->tf_edi = scp->sc_edi; + regs->tf_cs = scp->sc_cs; + regs->tf_ss = scp->sc_ss; + regs->tf_isp = scp->sc_isp; + regs->tf_ebp = scp->sc_fp; + regs->tf_esp = scp->sc_sp; + regs->tf_eip = scp->sc_pc; + regs->tf_eflags = eflags; + + PROC_LOCK(p); +#if defined(COMPAT_43) + if (scp->sc_onstack & 1) + td->td_sigstk.ss_flags |= SS_ONSTACK; + else + td->td_sigstk.ss_flags &= ~SS_ONSTACK; +#endif + SIGSETOLD(td->td_sigmask, scp->sc_mask); + SIG_CANTMASK(td->td_sigmask); + signotify(td); + PROC_UNLOCK(p); + return (EJUSTRETURN); +} +#endif /* COMPAT_43 */ + +#ifdef COMPAT_FREEBSD4 +/* + * MPSAFE + */ +int +freebsd4_sigreturn(td, uap) + struct thread *td; + struct freebsd4_sigreturn_args /* { + const ucontext4 *sigcntxp; + } */ *uap; +{ + struct ucontext4 uc; + struct proc *p = td->td_proc; + struct trapframe *regs; + const struct ucontext4 *ucp; + int cs, eflags, error; + ksiginfo_t ksi; + + error = copyin(uap->sigcntxp, &uc, sizeof(uc)); + if (error != 0) + return (error); + ucp = &uc; + regs = td->td_frame; + eflags = ucp->uc_mcontext.mc_eflags; + if (eflags & PSL_VM) { + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86; + + /* + * if pcb_ext == 0 or vm86_inited == 0, the user hasn't + * set up the vm86 area, and we can't enter vm86 mode. + */ + if (td->td_pcb->pcb_ext == 0) + return (EINVAL); + vm86 = &td->td_pcb->pcb_ext->ext_vm86; + if (vm86->vm86_inited == 0) + return (EINVAL); + + /* Go back to user mode if both flags are set. */ + if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGBUS; + ksi.ksi_code = BUS_OBJERR; + ksi.ksi_addr = (void *)regs->tf_eip; + trapsignal(td, &ksi); + } + if (vm86->vm86_has_vme) { + eflags = (tf->tf_eflags & ~VME_USERCHANGE) | + (eflags & VME_USERCHANGE) | PSL_VM; + } else { + vm86->vm86_eflags = eflags; /* save VIF, VIP */ + eflags = (tf->tf_eflags & ~VM_USERCHANGE) | + (eflags & VM_USERCHANGE) | PSL_VM; + } + bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); + tf->tf_eflags = eflags; + tf->tf_vm86_ds = tf->tf_ds; + tf->tf_vm86_es = tf->tf_es; + tf->tf_vm86_fs = tf->tf_fs; + tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; + tf->tf_ds = _udatasel; + tf->tf_es = _udatasel; + tf->tf_fs = _udatasel; + } else { + /* + * Don't allow users to change privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); + return (EINVAL); + } + + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. 
+ */ + cs = ucp->uc_mcontext.mc_cs; + if (!CS_SECURE(cs)) { + printf("freebsd4_sigreturn: cs = 0x%x\n", cs); + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGBUS; + ksi.ksi_code = BUS_OBJERR; + ksi.ksi_trapno = T_PROTFLT; + ksi.ksi_addr = (void *)regs->tf_eip; + trapsignal(td, &ksi); + return (EINVAL); + } + + bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); + } + + PROC_LOCK(p); +#if defined(COMPAT_43) + if (ucp->uc_mcontext.mc_onstack & 1) + td->td_sigstk.ss_flags |= SS_ONSTACK; + else + td->td_sigstk.ss_flags &= ~SS_ONSTACK; +#endif + + td->td_sigmask = ucp->uc_sigmask; + SIG_CANTMASK(td->td_sigmask); + signotify(td); + PROC_UNLOCK(p); + return (EJUSTRETURN); +} +#endif /* COMPAT_FREEBSD4 */ + +/* + * MPSAFE + */ +int +sigreturn(td, uap) + struct thread *td; + struct sigreturn_args /* { + const struct __ucontext *sigcntxp; + } */ *uap; +{ + ucontext_t uc; + struct proc *p = td->td_proc; + struct trapframe *regs; + const ucontext_t *ucp; + int cs, eflags, error, ret; + ksiginfo_t ksi; + + error = copyin(uap->sigcntxp, &uc, sizeof(uc)); + if (error != 0) + return (error); + ucp = &uc; + regs = td->td_frame; + eflags = ucp->uc_mcontext.mc_eflags; + if (eflags & PSL_VM) { + struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; + struct vm86_kernel *vm86; + + /* + * if pcb_ext == 0 or vm86_inited == 0, the user hasn't + * set up the vm86 area, and we can't enter vm86 mode. + */ + if (td->td_pcb->pcb_ext == 0) + return (EINVAL); + vm86 = &td->td_pcb->pcb_ext->ext_vm86; + if (vm86->vm86_inited == 0) + return (EINVAL); + + /* Go back to user mode if both flags are set. */ + if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGBUS; + ksi.ksi_code = BUS_OBJERR; + ksi.ksi_addr = (void *)regs->tf_eip; + trapsignal(td, &ksi); + } + + if (vm86->vm86_has_vme) { + eflags = (tf->tf_eflags & ~VME_USERCHANGE) | + (eflags & VME_USERCHANGE) | PSL_VM; + } else { + vm86->vm86_eflags = eflags; /* save VIF, VIP */ + eflags = (tf->tf_eflags & ~VM_USERCHANGE) | + (eflags & VM_USERCHANGE) | PSL_VM; + } + bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); + tf->tf_eflags = eflags; + tf->tf_vm86_ds = tf->tf_ds; + tf->tf_vm86_es = tf->tf_es; + tf->tf_vm86_fs = tf->tf_fs; + tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; + tf->tf_ds = _udatasel; + tf->tf_es = _udatasel; + tf->tf_fs = _udatasel; + } else { + /* + * Don't allow users to change privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + printf("sigreturn: eflags = 0x%x\n", eflags); + return (EINVAL); + } + + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. 
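
Every *sigreturn variant vets the user-supplied context the same way before installing it: EFL_SECURE insists that only user-changeable EFLAGS bits differ (with PSL_RF deliberately exempted, per the long comment above), and CS_SECURE insists the new %cs carries ring-3 privilege. A sketch of the two predicates; the PSL_USERCHANGE value here is illustrative, not the real mask from psl.h:

#include <stdint.h>

#define SEL_UPL_SKETCH		3		/* user privilege level */
#define ISPL_SKETCH(s)		((s) & 3)	/* RPL bits of a selector */
#define PSL_USERCHANGE_SKETCH	0x00244fd5	/* illustrative mask only */

/* Sketch of the sigreturn sanity checks: the proposed EFLAGS may differ
 * from the current ones only in user-changeable bits, and the proposed
 * %cs must be a ring-3 selector; the hardware catches the rest. */
static int
context_secure_sketch(uint32_t new_efl, uint32_t cur_efl, uint16_t new_cs)
{
	if (((new_efl ^ cur_efl) & ~PSL_USERCHANGE_SKETCH) != 0)
		return (0);		/* privileged flag would change */
	if (ISPL_SKETCH(new_cs) != SEL_UPL_SKETCH)
		return (0);		/* not a user-mode %cs */
	return (1);
}
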
+	 */
+	cs = ucp->uc_mcontext.mc_cs;
+	if (!CS_SECURE(cs)) {
+		printf("sigreturn: cs = 0x%x\n", cs);
+		ksiginfo_init_trap(&ksi);
+		ksi.ksi_signo = SIGBUS;
+		ksi.ksi_code = BUS_OBJERR;
+		ksi.ksi_trapno = T_PROTFLT;
+		ksi.ksi_addr = (void *)regs->tf_eip;
+		trapsignal(td, &ksi);
+		return (EINVAL);
+	}
+
+	ret = set_fpcontext(td, &ucp->uc_mcontext);
+	if (ret != 0)
+		return (ret);
+	bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
+	}
+
+	PROC_LOCK(p);
+#if defined(COMPAT_43)
+	if (ucp->uc_mcontext.mc_onstack & 1)
+		td->td_sigstk.ss_flags |= SS_ONSTACK;
+	else
+		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
+#endif
+
+	td->td_sigmask = ucp->uc_sigmask;
+	SIG_CANTMASK(td->td_sigmask);
+	signotify(td);
+	PROC_UNLOCK(p);
+	return (EJUSTRETURN);
+}
+
+/*
+ * Machine dependent boot() routine
+ *
+ * I haven't seen anything to put here yet
+ * Possibly some stuff might be grafted back here from boot()
+ */
+void
+cpu_boot(int howto)
+{
+}
+
+/* Get current clock frequency for the given cpu id. */
+int
+cpu_est_clockrate(int cpu_id, uint64_t *rate)
+{
+	register_t reg;
+	uint64_t tsc1, tsc2;
+
+	if (pcpu_find(cpu_id) == NULL || rate == NULL)
+		return (EINVAL);
+	if (!tsc_present)
+		return (EOPNOTSUPP);
+
+	/* If we're booting, trust the rate calibrated moments ago. */
+	if (cold) {
+		*rate = tsc_freq;
+		return (0);
+	}
+#ifdef notyet
+
+#ifdef SMP
+	/* Schedule ourselves on the indicated cpu. */
+	mtx_lock_spin(&sched_lock);
+	sched_bind(curthread, cpu_id);
+	mtx_unlock_spin(&sched_lock);
+#endif
+
+	/* Calibrate by measuring a short delay. */
+	reg = intr_disable();
+	tsc1 = rdtsc();
+	DELAY(1000);
+	tsc2 = rdtsc();
+	intr_restore(reg);
+
+#ifdef SMP
+	mtx_lock_spin(&sched_lock);
+	sched_unbind(curthread);
+	mtx_unlock_spin(&sched_lock);
+#endif
+#else
+	printf("punting on clockrate estimation ... FIX\n");
+
+#endif
+	/*
+	 * Calculate the difference in readings, convert to MHz, and
+	 * subtract 0.5% of the total.  Empirical testing has shown that
+	 * overhead in DELAY() works out to approximately this value.
+	 */
+	tsc2 -= tsc1;
+	*rate = tsc2 * 1000 - tsc2 * 5;
+	return (0);
+}
+
+/*
+ * Shutdown the CPU as much as possible
+ */
+void
+cpu_halt(void)
+{
+#ifndef XEN
+	for (;;)
+		__asm__ ("hlt");
+#else
+	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+#endif
+}
+
+/*
+ * Hook to idle the CPU when possible.  In the SMP case we default to
+ * off because a halted cpu will not currently pick up a new thread in the
+ * run queue until the next timer tick.  If turned on this will result in
+ * approximately a 4.2% loss in real time performance in buildworld tests
+ * (but improves user and sys times oddly enough), and saves approximately
+ * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
+ *
+ * XXX we need to have a cpu mask of idle cpus and generate an IPI or
+ * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
+ * Then we can have our cake and eat it too.
+ *
+ * XXX I'm turning it on for SMP as well by default for now.  It seems to
+ * help lock contention somewhat, and this is critical for HTT. -Peter
+ */
+static int	cpu_idle_hlt = 1;
+TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
+SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
+    &cpu_idle_hlt, 0, "Idle loop HLT enable");
+
+static void
+cpu_idle_default(void)
+{
+	/*
+	 * we must absolutely guarantee that hlt is the
+	 * absolute next instruction after sti or we
+	 * introduce a timing window.
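
The calibration path in cpu_est_clockrate() above (under '#ifdef notyet') counts TSC ticks across a 1000-microsecond DELAY, so the raw count is ticks per millisecond; tsc2 * 1000 scales that to Hz, and tsc2 * 5 subtracts the 0.5% the comment attributes to DELAY() overhead. A userland-flavored sketch of the same arithmetic (helper names are invented):

#include <stdint.h>

extern uint64_t rdtsc_sketch(void);	/* read the time-stamp counter */
extern void delay_us_sketch(int usec);	/* spin for 'usec' microseconds */

/* Sketch: count TSC ticks over 1 ms, scale to Hz, drop 0.5% overhead. */
static uint64_t
est_tsc_freq_sketch(void)
{
	uint64_t tsc1, ticks;

	tsc1 = rdtsc_sketch();
	delay_us_sketch(1000);			/* 1 ms */
	ticks = rdtsc_sketch() - tsc1;		/* ticks per millisecond */
	return (ticks * 1000 - ticks * 5);	/* Hz, less 0.5% overhead */
}
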
+	 */
+#ifndef XEN
+	__asm __volatile("sti; hlt");
+#else
+	idle_block();
+#endif
+}
+
+/*
+ * Note that we have to be careful here to avoid a race between checking
+ * sched_runnable() and actually halting.  If we don't do this, we may waste
+ * the time between calling hlt and the next interrupt even though there
+ * is a runnable process.
+ */
+void
+cpu_idle(void)
+{
+
+#ifdef SMP
+	if (mp_grab_cpu_hlt())
+		return;
+#endif
+
+	if (cpu_idle_hlt) {
+		disable_intr();
+		if (sched_runnable())
+			enable_intr();
+		else
+			(*cpu_idle_hook)();
+	}
+}
+
+/* Other subsystems (e.g., ACPI) can hook this later. */
+void (*cpu_idle_hook)(void) = cpu_idle_default;
+
+/*
+ * Clear registers on exec
+ */
+void
+exec_setregs(td, entry, stack, ps_strings)
+	struct thread *td;
+	u_long entry;
+	u_long stack;
+	u_long ps_strings;
+{
+	struct trapframe *regs = td->td_frame;
+	struct pcb *pcb = td->td_pcb;
+
+	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
+	pcb->pcb_gs = _udatasel;
+	load_gs(_udatasel);
+
+	mtx_lock_spin(&dt_lock);
+	if (td->td_proc->p_md.md_ldt)
+		user_ldt_free(td);
+	mtx_unlock_spin(&dt_lock);
+
+	bzero((char *)regs, sizeof(struct trapframe));
+	regs->tf_eip = entry;
+	regs->tf_esp = stack;
+	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
+	regs->tf_ss = _udatasel;
+	regs->tf_ds = _udatasel;
+	regs->tf_es = _udatasel;
+	regs->tf_fs = _udatasel;
+	regs->tf_cs = _ucodesel;
+
+	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
+	regs->tf_ebx = ps_strings;
+
+	/*
+	 * Reset the hardware debug registers if they were in use.
+	 * They won't have any meaning for the newly exec'd process.
+	 */
+	if (pcb->pcb_flags & PCB_DBREGS) {
+		pcb->pcb_dr0 = 0;
+		pcb->pcb_dr1 = 0;
+		pcb->pcb_dr2 = 0;
+		pcb->pcb_dr3 = 0;
+		pcb->pcb_dr6 = 0;
+		pcb->pcb_dr7 = 0;
+		if (pcb == PCPU_GET(curpcb)) {
+			/*
+			 * Clear the debug registers on the running
+			 * CPU, otherwise they will end up affecting
+			 * the next process we switch to.
+			 */
+			reset_dbregs();
+		}
+		pcb->pcb_flags &= ~PCB_DBREGS;
+	}
+
+	/*
+	 * Initialize the math emulator (if any) for the current process.
+	 * Actually, just clear the bit that says that the emulator has
+	 * been initialized.  Initialization is delayed until the process
+	 * traps to the emulator (if it is done at all) mainly because
+	 * emulators don't provide an entry point for initialization.
+	 */
+	td->td_pcb->pcb_flags &= ~FP_SOFTFP;
+
+	/*
+	 * Drop the FP state if we hold it, so that the process gets a
+	 * clean FP state if it uses the FPU again.
+	 */
+	fpstate_drop(td);
+
+	/*
+	 * XXX - Linux emulator
+	 * Make sure edx is 0x0 on entry.  Linux binaries depend
+	 * on it.
+	 */
+	td->td_retval[1] = 0;
+}
+
+void
+cpu_setregs(void)
+{
+	unsigned int cr0;
+
+	cr0 = rcr0();
+	/*
+	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
+	 *
+	 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
+	 * instructions.  We must set the CR0_MP bit and use the CR0_TS
+	 * bit to control the trap, because setting the CR0_EM bit does
+	 * not cause WAIT instructions to trap.  It's important to trap
+	 * WAIT instructions - otherwise the "wait" variants of no-wait
+	 * control instructions would degenerate to the "no-wait" variants
+	 * after FP context switches but work correctly otherwise.  It's
+	 * particularly important to trap WAITs when there is no NPX -
+	 * otherwise the "wait" variants would always degenerate.
+	 *
+	 * Try setting CR0_NE to get correct error reporting on 486DX's.
+	 * Setting it should fail or do nothing on lesser processors.
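
The race cpu_idle() above guards against is the classic lost wakeup: if an interrupt marks a thread runnable after sched_runnable() is checked but before hlt executes, the CPU sleeps on a stale decision. The native path closes the window by relying on sti's one-instruction interrupt shadow, which makes "sti; hlt" effectively atomic; under Xen, idle_block() gets the same effect because SCHEDOP_block rechecks pending events. A sketch (all helpers are illustrative):

extern int  sched_runnable_sketch(void);	/* any thread ready to run? */
extern void disable_intr_sketch(void);		/* cli */
extern void enable_intr_sketch(void);		/* sti */
extern void sti_hlt_sketch(void);		/* "sti; hlt" back to back */

/* Sketch of the race-free idle check: test for runnable work with
 * interrupts off; if none, the sti;hlt pair guarantees any wakeup
 * interrupt can only arrive once hlt is able to be woken from. */
static void
cpu_idle_sketch(void)
{
	disable_intr_sketch();
	if (sched_runnable_sketch())
		enable_intr_sketch();	/* work appeared: don't halt */
	else
		sti_hlt_sketch();	/* atomically enable and halt */
}
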
+ */ + cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; + load_cr0(cr0); + load_gs(_udatasel); +} + +u_long bootdev; /* not a struct cdev *- encoding is different */ +SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, + CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); + +/* + * Initialize 386 and configure to run kernel + */ + +/* + * Initialize segments & interrupt table + */ + +int _default_ldt; +#ifndef XEN +union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ +union descriptor ldt[NLDT]; /* local descriptor table */ +#else +union descriptor *gdt; /* global descriptor table */ +union descriptor *ldt; /* local descriptor table */ +#endif +static struct gate_descriptor idt0[NIDT]; +struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ +struct region_descriptor r_gdt, r_idt; /* table descriptors */ +struct mtx dt_lock; /* lock for GDT and LDT */ + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +extern int has_f00f_bug; +#endif + +static struct i386tss dblfault_tss; +static char dblfault_stack[PAGE_SIZE]; + +extern vm_offset_t proc0kstack; + + +/* + * software prototypes -- in more palatable form. + * + * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret + * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) + */ +struct soft_segment_descriptor gdt_segs[] = { +/* GNULL_SEL 0 Null Descriptor */ +{ 0x0, /* segment base address */ + 0x0, /* length */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_KPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GUFS_SEL 2 %fs Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GUGS_SEL 3 %gs Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GCODE_SEL 4 Code Descriptor for kernel */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMERA, /* segment type */ + SEL_KPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GDATA_SEL 5 Data Descriptor for kernel */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_KPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GUCODE_SEL 6 Code Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMERA, /* segment type */ + SEL_UPL, /* segment 
descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GUDATA_SEL 7 Data Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +#ifndef XEN +/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ +{ 0x400, /* segment base address */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ +{ + 0x0, /* segment base address */ + sizeof(struct i386tss)-1,/* length */ + SDT_SYS386TSS, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GLDT_SEL 10 LDT Descriptor */ +{ (int) ldt, /* segment base address */ + sizeof(ldt)-1, /* length - all address space */ + SDT_SYSLDT, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GUSERLDT_SEL 11 User LDT Descriptor per process */ +{ (int) ldt, /* segment base address */ + (512 * sizeof(union descriptor)-1), /* length */ + SDT_SYSLDT, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GPANIC_SEL 12 Panic Tss Descriptor */ +{ (int) &dblfault_tss, /* segment base address */ + sizeof(struct i386tss)-1,/* length - all address space */ + SDT_SYS386TSS, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMERA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMERA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 
0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GNDIS_SEL 18 NDIS Descriptor */ +{ 0x0, /* segment base address */ + 0x0, /* length */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +#endif /* !XEN */ +}; + +static struct soft_segment_descriptor ldt_segs[] = { + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Code Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMERA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Data Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +}; + +void +setidt(idx, func, typ, dpl, selec) + int idx; + inthand_t *func; + int typ; + int dpl; + int selec; +{ + struct gate_descriptor *ip; + + ip = idt + idx; + ip->gd_looffset = (int)func; + ip->gd_selector = selec; + ip->gd_stkcpy = 0; + ip->gd_xx = 0; + ip->gd_type = typ; + ip->gd_dpl = dpl; + ip->gd_p = 1; + ip->gd_hioffset = ((int)func)>>16 ; +} + +extern inthand_t + IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), + IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), + IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), + IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + +#ifdef DDB +/* + * Display the index and function name of any IDT entries that don't use + * the 
default 'rsvd' entry point.
+ */
+DB_SHOW_COMMAND(idt, db_show_idt)
+{
+    struct gate_descriptor *ip;
+    int idx;
+    uintptr_t func;
+
+    ip = idt;
+    for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
+        func = (ip->gd_hioffset << 16 | ip->gd_looffset);
+        if (func != (uintptr_t)&IDTVEC(rsvd)) {
+            db_printf("%3d\t", idx);
+            db_printsym(func, DB_STGY_PROC);
+            db_printf("\n");
+        }
+        ip++;
+    }
+}
+#endif
+
+void
+sdtossd(sd, ssd)
+    struct segment_descriptor *sd;
+    struct soft_segment_descriptor *ssd;
+{
+    ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
+    ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
+    ssd->ssd_type = sd->sd_type;
+    ssd->ssd_dpl = sd->sd_dpl;
+    ssd->ssd_p = sd->sd_p;
+    ssd->ssd_def32 = sd->sd_def32;
+    ssd->ssd_gran = sd->sd_gran;
+}
+
+/*
+ * Populate the (physmap) array with base/bound pairs describing the
+ * available physical memory in the system, then test this memory and
+ * build the phys_avail array describing the actually-available memory.
+ *
+ * If we cannot accurately determine the physical memory map, then use
+ * the value from the 0xE801 call, and failing that, the RTC.
+ *
+ * Total memory size may be set by the kernel environment variable
+ * hw.physmem or the compile-time define MAXMEM.
+ *
+ * XXX first should be vm_paddr_t.
+ */
+static void
+getmemsize(int first)
+{
+    int i, off, physmap_idx, pa_indx, da_indx;
+    int hasbrokenint12, has_smap;
+    u_long physmem_tunable;
+    u_int extmem;
+    struct vm86frame vmf;
+    struct vm86context vmc;
+    vm_paddr_t pa, physmap[PHYSMAP_SIZE];
+    pt_entry_t *pte;
+    struct bios_smap *smap;
+    quad_t dcons_addr, dcons_size;
+#ifdef XEN_PRIVILEGED_GUEST
+    dom0_op_t op;
+    struct dom0_memory_map_entry *map;
+    unsigned long gapstart, gapsize;
+    unsigned long long last;
+#endif
+
+
+#ifdef XBOX
+    if (arch_i386_is_xbox) {
+        /*
+         * We queried the memory size before, so chop off 4MB for
+         * the framebuffer and inform the OS of this.
+         */
+        physmap[0] = 0;
+        physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
+        physmap_idx = 0;
+        goto physmap_done;
+    }
+#elif defined(XEN_PRIVILEGED_GUEST)
+    /*
+     * XXX need to map "struct resource" and pci_mem_start into BSD terms;
+     * this will panic with the current bootmem allocator - still need to
+     * figure out how little I can get away with there
+     */
+    if (xen_start_info->flags & SIF_INITDOMAIN) {
+        map = bootmem_alloc(PAGE_SIZE);
+        op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
+        op.u.physical_memory_map.memory_map = map;
+        op.u.physical_memory_map.max_map_entries =
+            PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
+        /*
+         * if this fails - check for a header versioning mismatch
+         */
+        PANIC_IF(HYPERVISOR_dom0_op(&op));
+
+        last = 0x100000000ULL;
+        gapstart = 0x10000000;
+        gapsize = 0x400000;
+
+        for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
+#if 0
+            struct resource *res;
+#endif
+            if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
+                gapsize = last - map[i].end;
+                gapstart = map[i].end;
+            }
+            if (map[i].start < last)
+                last = map[i].start;
+
+            if (map[i].end > 0x100000000ULL)
+                continue;
+            /* XXXEN need to handle reserved */
+            phys_avail[2*i] = map[i].start;
+            phys_avail[2*i + 1] = map[i].end;
+            PANIC_IF(i*2 >= PHYSMAP_SIZE);
+
+#if 0
+            res = bootmem_alloc(sizeof(struct resource));
+            res->name = map[i].is_ram ? "System RAM" : "reserved";
+            res->start = map[i].start;
+            res->end = map[i].end - 1;
+            res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+            request_resource(&iomem_resource, res);
+#endif
+        }
+
+        bootmem_free(map, PAGE_SIZE);
+
+        /*
+         * Start allocating dynamic PCI memory a bit into the gap,
+         * aligned up to the nearest megabyte.
+         *
+         * Question: should we try to pad it up a bit (do something
+         * like " + (gapsize >> 3)" in there too?). We now have the
+         * technology.
+         */
+#ifdef notyet
+        pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
+
+        printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+            pci_mem_start, gapstart, gapsize);
+#endif
+    }
+
+#endif
+#if defined(XEN)
+    Maxmem = xen_start_info->nr_pages - init_first;
+    pmap_bootstrap((init_first << PAGE_SHIFT));
+    for (i = 0; i < 10; i++)
+        phys_avail[i] = 0;
+    physmem = Maxmem;
+    avail_end = ptoa(Maxmem) - round_page(MSGBUF_SIZE);
+    basemem = 0;
+    phys_avail[0] = init_first << PAGE_SHIFT;
+    phys_avail[1] = avail_end;
+    return;
+#endif
+    hasbrokenint12 = 0;
+    has_smap = 0;    /* set once an INT 15:E820 call succeeds below */
+    TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
+    bzero(&vmf, sizeof(vmf));
+    bzero(physmap, sizeof(physmap));
+    basemem = 0;
+
+    /*
+     * Some newer BIOSes have a broken INT 12H implementation which
+     * causes a kernel panic immediately. In this case, we need to scan
+     * the SMAP with INT 15:E820 first, then determine the base memory
+     * size.
+     */
+    if (hasbrokenint12) {
+        goto int15e820;
+    }
+
+    /*
+     * Perform "base memory" related probes & setup
+     */
+    vm86_intcall(0x12, &vmf);
+    basemem = vmf.vmf_ax;
+    if (basemem > 640) {
+        printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
+            basemem);
+        basemem = 640;
+    }
+
+    /*
+     * XXX if biosbasemem is now < 640, there is a `hole'
+     * between the end of base memory and the start of
+     * ISA memory. The hole may be empty or it may
+     * contain BIOS code or data. Map it read/write so
+     * that the BIOS can write to it. (Memory from 0 to
+     * the physical end of the kernel is mapped read-only
+     * to begin with and then parts of it are remapped.
+     * The parts that aren't remapped form holes that
+     * remain read-only and are unused by the kernel.
+     * The base memory area is below the physical end of
+     * the kernel and right now forms a read-only hole.
+     * The part of it from PAGE_SIZE to
+     * (trunc_page(biosbasemem * 1024) - 1) will be
+     * remapped and used by the kernel later.)
+     *
+     * This code is similar to the code used in
+     * pmap_mapdev, but since no memory needs to be
+     * allocated we simply change the mapping.
+     */
+    for (pa = trunc_page(basemem * 1024);
+         pa < ISA_HOLE_START; pa += PAGE_SIZE)
+        pmap_kenter(KERNBASE + pa, pa);
+
+    /*
+     * Map pages between basemem and ISA_HOLE_START, if any, r/w into
+     * the vm86 page table so that vm86 can scribble on them using
+     * the vm86 map too. XXX: why 2 ways for this and only 1 way for
+     * page 0, at least as initialized here?
+     */
+    pte = (pt_entry_t *)vm86paddr;
+    for (i = basemem / 4; i < 160; i++)
+        pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
+
+int15e820:
+    /*
+     * map page 1 R/W into the kernel page table so we can use it
+     * as a buffer. The kernel will unmap this page later.
+     */
+    pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
+
+    /*
+     * get memory map with INT 15:E820
+     */
+    vmc.npages = 0;
+    smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
+    vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
+
+    physmap_idx = 0;
+    vmf.vmf_ebx = 0;
+    do {
+        vmf.vmf_eax = 0xE820;
+        vmf.vmf_edx = SMAP_SIG;
+        vmf.vmf_ecx = sizeof(struct bios_smap);
+        i = vm86_datacall(0x15, &vmf, &vmc);
+        if (i || vmf.vmf_eax != SMAP_SIG)
+            break;
+        if (boothowto & RB_VERBOSE)
+            printf("SMAP type=%02x base=%016llx len=%016llx\n",
+                smap->type, smap->base, smap->length);
+        has_smap = 1;
+
+        if (smap->type != 0x01)
+            continue;
+
+        if (smap->length == 0)
+            continue;
+
+#ifndef PAE
+        if (smap->base >= 0xffffffff) {
+            printf("%uK of memory above 4GB ignored\n",
+                (u_int)(smap->length / 1024));
+            continue;
+        }
+#endif
+
+        for (i = 0; i <= physmap_idx; i += 2) {
+            if (smap->base < physmap[i + 1]) {
+                if (boothowto & RB_VERBOSE)
+                    printf(
+    "Overlapping or non-monotonic memory region, ignoring second region\n");
+                continue;
+            }
+        }
+
+        if (smap->base == physmap[physmap_idx + 1]) {
+            physmap[physmap_idx + 1] += smap->length;
+            continue;
+        }
+
+        physmap_idx += 2;
+        if (physmap_idx == PHYSMAP_SIZE) {
+            printf(
+        "Too many segments in the physical address map, giving up\n");
+            break;
+        }
+        physmap[physmap_idx] = smap->base;
+        physmap[physmap_idx + 1] = smap->base + smap->length;
+    } while (vmf.vmf_ebx != 0);
+
+    /*
+     * Perform "base memory" related probes & setup based on SMAP
+     */
+    if (basemem == 0) {
+        for (i = 0; i <= physmap_idx; i += 2) {
+            if (physmap[i] == 0x00000000) {
+                basemem = physmap[i + 1] / 1024;
+                break;
+            }
+        }
+
+        /*
+         * XXX this function is horribly organized and has to do the
+         * same things that it does above here.
+         */
+        if (basemem == 0)
+            basemem = 640;
+        if (basemem > 640) {
+            printf(
+        "Preposterous BIOS basemem of %uK, truncating to 640K\n",
+                basemem);
+            basemem = 640;
+        }
+
+        /*
+         * Let vm86 scribble on pages between basemem and
+         * ISA_HOLE_START, as above.
+         */
+        for (pa = trunc_page(basemem * 1024);
+             pa < ISA_HOLE_START; pa += PAGE_SIZE)
+            pmap_kenter(KERNBASE + pa, pa);
+        pte = (pt_entry_t *)vm86paddr;
+        for (i = basemem / 4; i < 160; i++)
+            pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
+    }
+
+    if (physmap[1] != 0)
+        goto physmap_done;
+
+    /*
+     * If we failed above, try memory map with INT 15:E801
+     */
+    vmf.vmf_ax = 0xE801;
+    if (vm86_intcall(0x15, &vmf) == 0) {
+        extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
+    } else {
+#if 0
+        vmf.vmf_ah = 0x88;
+        vm86_intcall(0x15, &vmf);
+        extmem = vmf.vmf_ax;
+#elif !defined(XEN)
+        /*
+         * Prefer the RTC value for extended memory.
+         */
+        extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
+#endif
+    }
+
+    /*
+     * Special hack for chipsets that still remap the 384k hole when
+     * there's 16MB of memory - this really confuses people that
+     * are trying to use bus mastering ISA controllers with the
+     * "16MB limit"; they only have 16MB, but the remapping puts
+     * them beyond the limit.
+     *
+     * If extended memory is between 15-16MB (16-17MB phys address range),
+     * chop it to 15MB.
+     */
+    if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
+        extmem = 15 * 1024;
+
+    physmap[0] = 0;
+    physmap[1] = basemem * 1024;
+    physmap_idx = 2;
+    physmap[physmap_idx] = 0x100000;
+    physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
+
+physmap_done:
+    /*
+     * Now, physmap contains a map of physical memory.
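+     * physmap[] holds base/end pairs in bytes (end is exclusive) and
+     * physmap_idx indexes the last pair; typically physmap[0]/[1]
+     * cover base memory below 640K and physmap[2]/[3] cover extended
+     * memory starting at 1MB.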
+ */ + +#ifdef SMP + /* make hole for AP bootstrap code */ + physmap[1] = mp_bootaddress(physmap[1]); +#endif + + /* + * Maxmem isn't the "maximum memory", it's one larger than the + * highest page of the physical address space. It should be + * called something like "Maxphyspage". We may adjust this + * based on ``hw.physmem'' and the results of the memory test. + */ + Maxmem = atop(physmap[physmap_idx + 1]); + +#ifdef MAXMEM + Maxmem = MAXMEM / 4; +#endif + + if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) + Maxmem = atop(physmem_tunable); + + /* + * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend + * the amount of memory in the system. + */ + if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) + Maxmem = atop(physmap[physmap_idx + 1]); + + if (atop(physmap[physmap_idx + 1]) != Maxmem && + (boothowto & RB_VERBOSE)) + printf("Physical memory use set to %ldK\n", Maxmem * 4); + + /* + * If Maxmem has been increased beyond what the system has detected, + * extend the last memory segment to the new limit. + */ + if (atop(physmap[physmap_idx + 1]) < Maxmem) + physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); + + /* call pmap initialization to make new kernel address space */ + pmap_bootstrap(first); + + /* + * Size up each available chunk of physical memory. + */ + physmap[0] = PAGE_SIZE; /* mask off page 0 */ + pa_indx = 0; + da_indx = 1; + phys_avail[pa_indx++] = physmap[0]; + phys_avail[pa_indx] = physmap[0]; + dump_avail[da_indx] = physmap[0]; + pte = CMAP1; + + /* + * Get dcons buffer address + */ + if (getenv_quad("dcons.addr", &dcons_addr) == 0 || + getenv_quad("dcons.size", &dcons_size) == 0) + dcons_addr = 0; + + /* + * physmap is in bytes, so when converting to page boundaries, + * round up the start address and round down the end address. + */ + for (i = 0; i <= physmap_idx; i += 2) { + vm_paddr_t end; + + end = ptoa((vm_paddr_t)Maxmem); + if (physmap[i + 1] < end) + end = trunc_page(physmap[i + 1]); + for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { + int tmp, page_bad, full; + int *ptr = (int *)CADDR1; + + full = FALSE; + /* + * block out kernel memory as not available. + */ + if (pa >= KERNLOAD && pa < first) + goto do_dump_avail; + + /* + * block out dcons buffer + */ + if (dcons_addr > 0 + && pa >= trunc_page(dcons_addr) + && pa < dcons_addr + dcons_size) + goto do_dump_avail; + + page_bad = FALSE; + + /* + * map page into kernel: valid, read/write,non-cacheable + */ + *pte = pa | PG_V | PG_RW | PG_N; + invltlb(); + + tmp = *(int *)ptr; + /* + * Test for alternating 1's and 0's + */ + *(volatile int *)ptr = 0xaaaaaaaa; + if (*(volatile int *)ptr != 0xaaaaaaaa) + page_bad = TRUE; + /* + * Test for alternating 0's and 1's + */ + *(volatile int *)ptr = 0x55555555; + if (*(volatile int *)ptr != 0x55555555) + page_bad = TRUE; + /* + * Test for all 1's + */ + *(volatile int *)ptr = 0xffffffff; + if (*(volatile int *)ptr != 0xffffffff) + page_bad = TRUE; + /* + * Test for all 0's + */ + *(volatile int *)ptr = 0x0; + if (*(volatile int *)ptr != 0x0) + page_bad = TRUE; + /* + * Restore original value. + */ + *(int *)ptr = tmp; + + /* + * Adjust array of valid/good pages. + */ + if (page_bad == TRUE) + continue; + /* + * If this good page is a continuation of the + * previous set of good pages, then just increase + * the end pointer. Otherwise start a new chunk. + * Note that "end" points one higher than end, + * making the range >= start and < end. 
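+             * For example, if pa immediately follows the
+             * current chunk (phys_avail[pa_indx] == pa below),
+             * the chunk just grows by PAGE_SIZE; otherwise a
+             * new start/end pair is opened.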
+             * If we're also doing a speculative memory
+             * test and we are at or past the end, bump up Maxmem
+             * so that we keep going. The first bad page
+             * will terminate the loop.
+             */
+            if (phys_avail[pa_indx] == pa) {
+                phys_avail[pa_indx] += PAGE_SIZE;
+            } else {
+                pa_indx++;
+                if (pa_indx == PHYS_AVAIL_ARRAY_END) {
+                    printf(
+    "Too many holes in the physical address space, giving up\n");
+                    pa_indx--;
+                    full = TRUE;
+                    goto do_dump_avail;
+                }
+                phys_avail[pa_indx++] = pa;    /* start */
+                phys_avail[pa_indx] = pa + PAGE_SIZE;    /* end */
+            }
+            physmem++;
+do_dump_avail:
+            if (dump_avail[da_indx] == pa) {
+                dump_avail[da_indx] += PAGE_SIZE;
+            } else {
+                da_indx++;
+                if (da_indx == DUMP_AVAIL_ARRAY_END) {
+                    da_indx--;
+                    goto do_next;
+                }
+                dump_avail[da_indx++] = pa;    /* start */
+                dump_avail[da_indx] = pa + PAGE_SIZE;    /* end */
+            }
+do_next:
+            if (full)
+                break;
+        }
+    }
+    *pte = 0;
+    invltlb();
+
+    /*
+     * XXX
+     * The last chunk must contain at least one page plus the message
+     * buffer to avoid complicating other code (message buffer address
+     * calculation, etc.).
+     */
+    while (phys_avail[pa_indx - 1] + PAGE_SIZE +
+        round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
+        physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
+        phys_avail[pa_indx--] = 0;
+        phys_avail[pa_indx--] = 0;
+    }
+
+    Maxmem = atop(phys_avail[pa_indx]);
+
+    /* Trim off space for the message buffer. */
+    phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
+
+    avail_end = phys_avail[pa_indx];
+
+    /* Map the message buffer. */
+    for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
+        pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
+            off);
+}
+
+void
+init386(first)
+    int first;
+{
+#ifndef XEN
+    struct gate_descriptor *gdp;
+#else
+    int error;
+    unsigned long gdtmachpfn;
+#endif
+    int gsel_tss, metadata_missing, off, x;
+    struct pcpu *pc;
+
+    thread0.td_kstack = proc0kstack;
+    thread0.td_pcb = (struct pcb *)
+       (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
+
+    /*
+     * This may be done better later if it gets more high level
+     * components in it. If so just link td->td_proc here.
+     */
+    proc_linkup(&proc0, &thread0);
+
+    metadata_missing = 0;
+#ifndef XEN
+    if (bootinfo.bi_modulep) {
+        preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
+        preload_bootstrap_relocate(KERNBASE);
+    }
+#else
+    if (xen_start_info->mod_start) {
+        preload_metadata = (caddr_t)xen_start_info->mod_start;
+        preload_bootstrap_relocate(KERNBASE);
+    }
+#endif
+    else {
+        metadata_missing = 1;
+    }
+    if (envmode == 1)
+        kern_envp = static_env;
+#ifndef XEN
+    else if (bootinfo.bi_envp)
+        kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
+#else
+    else if ((caddr_t)xen_start_info->cmd_line)
+        kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
+
+    boothowto |= xen_boothowto(kern_envp);
+#endif
+    /* Init basic tunables, hz etc. */
+    init_param1();
+
+#ifndef XEN
+    /*
+     * Make gdt memory segments. All segments cover the full 4GB
+     * of address space and permissions are enforced at page level.
+ */ + gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); + +#endif + +#ifdef SMP + pc = &SMP_prvspace[0].pcpu; +#else + pc = &__pcpu; +#endif + gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + /* + * XEN occupies the upper 64MB of virtual address space + * At its base it manages an array mapping machine page frames + * to physical page frames - hence we need to be able to + * access 4GB - (64MB - 4MB + 64k) + */ + gdt_segs[GCODE_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); + gdt_segs[GDATA_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); + gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); + gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); + + + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &gdt[x].sd); + + mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); +#ifndef XEN + r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; + r_gdt.rd_base = (int) gdt; + lgdt(&r_gdt); +#else + printk("gdt=%p\n", gdt); + printk("PTmap=%p\n", PTmap); + printk("addr=%p\n", *vtopte((unsigned long)gdt) & ~PG_RW); + + PT_SET_MA(gdt, *vtopte((unsigned long)gdt) & ~PG_RW); + gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); + lgdt(&r_gdt /* unused */); + gdt_set = 1; + + if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { + panic("set_trap_table failed - error %d\n", error); + } + HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); + +#endif + + + pcpu_init(pc, 0, sizeof(struct pcpu)); + PCPU_SET(prvspace, pc); + PCPU_SET(curthread, &thread0); + PCPU_SET(curpcb, thread0.td_pcb); +#ifdef XEN + PCPU_SET(pdir, (unsigned long)IdlePTD); +#endif + + /* + * Initialize mutexes. + * + * icu_lock: in order to allow an interrupt to occur in a critical + * section, to set pcpu->ipending (etc...) properly, we + * must be able to get the icu lock, so it can't be + * under witness. 
+     */
+    mutex_init();
+    mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
+
+    /* make ldt memory segments */
+    ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
+    ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
+    for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
+        ssdtosd(&ldt_segs[x], &ldt[x].sd);
+#ifdef XEN
+    default_proc_ldt.ldt_base = (caddr_t)ldt;
+    default_proc_ldt.ldt_len = 6;
+    _default_ldt = (int)&default_proc_ldt;
+    PCPU_SET(currentldt, _default_ldt);
+    PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
+    xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
+
+#else
+    _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
+    lldt(_default_ldt);
+    PCPU_SET(currentldt, _default_ldt);
+
+    /* exceptions */
+    for (x = 0; x < NIDT; x++)
+        setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
+            GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
+    setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+    setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
+        GSEL(GCODE_SEL, SEL_KPL));
+
+    r_idt.rd_limit = sizeof(idt0) - 1;
+    r_idt.rd_base = (int) idt;
+    lidt(&r_idt);
+#endif
+
+#ifdef XBOX
+    /*
+     * The following code queries the PCI ID of 0:0:0. For the XBOX,
+     * this should be 0x10de / 0x02a5.
+     *
+     * This is exactly what Linux does.
+     */
+    outl(0xcf8, 0x80000000);
+    if (inl(0xcfc) == 0x02a510de) {
+        arch_i386_is_xbox = 1;
+        pic16l_setled(XBOX_LED_GREEN);
+
+        /*
+         * We are an XBOX, but we may have either 64MB or 128MB of
+         * memory. The PCI host bridge should be programmed for this,
+         * so we just query it.
+         */
+        outl(0xcf8, 0x80000084);
+        arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
+    }
+#endif /* XBOX */
+
+    i8254_init();
+
+    /*
+     * Initialize the console before we print anything out.
+ */ + XENPRINTF("cninit\n"); + cninit(); + + if (metadata_missing) + printf("WARNING: loader(8) metadata is missing!\n"); + +#ifdef DEV_ISA +#ifdef XEN + XENPRINTF("ISA probing\n"); + if (xen_start_info->flags & SIF_PRIVILEGED) { +#endif + elcr_probe(); + atpic_startup(); +#ifdef XEN + } +#endif +#endif + +#ifdef DDB + ksym_start = bootinfo.bi_symtab; + ksym_end = bootinfo.bi_esymtab; +#endif + + kdb_init(); + +#ifdef KDB + if (boothowto & RB_KDB) + kdb_enter("Boot flags requested debugger"); +#endif + + finishidentcpu(); /* Final stage of CPU initialization */ + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + initializecpu(); /* Initialize CPU registers */ + + /* make an initial tss so cpu can get interrupt stack on syscall! */ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); +#ifndef XEN + PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + ltr(gsel_tss); +#else + HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), PCPU_GET(common_tss.tss_esp0)); +#endif + + /* pointer to selector slot for %fs/%gs */ + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = + dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; + dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = + dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); +#ifdef PAE + dblfault_tss.tss_cr3 = (int)IdlePDPT; +#else + dblfault_tss.tss_cr3 = (int)IdlePTD; +#endif + dblfault_tss.tss_eip = (int)dblfault_handler; + dblfault_tss.tss_eflags = PSL_KERNEL; + dblfault_tss.tss_ds = dblfault_tss.tss_es = + dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + + vm86_initialize(); + getmemsize(first); + init_param2(physmem); + + /* now running on new page tables, configured,and u/iom is accessible */ + + /* Map the message buffer. */ + for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) + pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); + PT_UPDATES_FLUSH(); + msgbufinit(msgbufp, MSGBUF_SIZE); + +#ifndef XEN + /* make a call gate to reenter kernel with */ + gdp = &ldt[LSYS5CALLS_SEL].gd; + + x = (int) &IDTVEC(lcall_syscall); + gdp->gd_looffset = x; + gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); + gdp->gd_stkcpy = 1; + gdp->gd_type = SDT_SYS386CGT; + gdp->gd_dpl = SEL_UPL; + gdp->gd_p = 1; + gdp->gd_hioffset = x >> 16; + + /* XXX does this work? */ + /* XXX yes! 
*/ + ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; + ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; +#endif + /* transfer to user mode */ + _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); + _udatasel = GSEL(GUDATA_SEL, SEL_UPL); + + + /* setup proc 0's pcb */ + thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ +#ifdef PAE + thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; +#else + thread0.td_pcb->pcb_cr3 = (int)IdlePTD; +#endif + thread0.td_pcb->pcb_ext = 0; + thread0.td_frame = &proc0_tf; +#ifdef XEN + thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; + thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; +#endif +} + +void +cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) +{ + + pcpu->pc_acpi_id = 0xffffffff; +} + +void +spinlock_enter(void) +{ + struct thread *td; + + td = curthread; + if (td->td_md.md_spinlock_count == 0) + td->td_md.md_saved_flags = intr_disable(); + td->td_md.md_spinlock_count++; + critical_enter(); +} + +void +spinlock_exit(void) +{ + struct thread *td; + + td = curthread; + critical_exit(); + td->td_md.md_spinlock_count--; + if (td->td_md.md_spinlock_count == 0) + intr_restore(td->td_md.md_saved_flags); +} + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +static void f00f_hack(void *unused); +SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL) + +static void +f00f_hack(void *unused) +{ + struct gate_descriptor *new_idt; + vm_offset_t tmp; + + if (!has_f00f_bug) + return; + + GIANT_REQUIRED; + + printf("Intel Pentium detected, installing workaround for F00F bug\n"); + + tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); + if (tmp == 0) + panic("kmem_alloc returned 0"); + + /* Put the problematic entry (#6) at the end of the lower page. */ + new_idt = (struct gate_descriptor*) + (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); + bcopy(idt, new_idt, sizeof(idt0)); + r_idt.rd_base = (u_int)new_idt; + lidt(&r_idt); + idt = new_idt; + if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, + VM_PROT_READ, FALSE) != KERN_SUCCESS) + panic("vm_map_protect failed"); +} +#endif /* defined(I586_CPU) && !NO_F00F_HACK */ + +/* + * Construct a PCB from a trapframe. This is called from kdb_trap() where + * we want to start a backtrace from the function that caused us to enter + * the debugger. We have the context in the trapframe, but base the trace + * on the PCB. The PCB doesn't have to be perfect, as long as it contains + * enough for a backtrace. + */ +void +makectx(struct trapframe *tf, struct pcb *pcb) +{ + + pcb->pcb_edi = tf->tf_edi; + pcb->pcb_esi = tf->tf_esi; + pcb->pcb_ebp = tf->tf_ebp; + pcb->pcb_ebx = tf->tf_ebx; + pcb->pcb_eip = tf->tf_eip; + pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; +} + +int +ptrace_set_pc(struct thread *td, u_long addr) +{ + + td->td_frame->tf_eip = addr; + return (0); +} + +int +ptrace_single_step(struct thread *td) +{ + td->td_frame->tf_eflags |= PSL_T; + return (0); +} + +int +ptrace_clear_single_step(struct thread *td) +{ + td->td_frame->tf_eflags &= ~PSL_T; + return (0); +} + +int +fill_regs(struct thread *td, struct reg *regs) +{ + struct pcb *pcb; + struct trapframe *tp; + + tp = td->td_frame; + pcb = td->td_pcb; + regs->r_fs = tp->tf_fs; + regs->r_es = tp->tf_es; + regs->r_ds = tp->tf_ds; + regs->r_edi = tp->tf_edi; + regs->r_esi = tp->tf_esi; + regs->r_ebp = tp->tf_ebp; + regs->r_ebx = tp->tf_ebx; + regs->r_edx = tp->tf_edx; + regs->r_ecx = tp->tf_ecx; + regs->r_eax = tp->tf_eax; + regs->r_eip = tp->tf_eip; + regs->r_cs = tp->tf_cs; + regs->r_eflags = tp->tf_eflags; + regs->r_esp = tp->tf_esp; + regs->r_ss = tp->tf_ss; + regs->r_gs = pcb->pcb_gs; + return (0); +} + +int +set_regs(struct thread *td, struct reg *regs) +{ + struct pcb *pcb; + struct trapframe *tp; + + tp = td->td_frame; + if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || + !CS_SECURE(regs->r_cs)) + return (EINVAL); + pcb = td->td_pcb; + tp->tf_fs = regs->r_fs; + tp->tf_es = regs->r_es; + tp->tf_ds = regs->r_ds; + tp->tf_edi = regs->r_edi; + tp->tf_esi = regs->r_esi; + tp->tf_ebp = regs->r_ebp; + tp->tf_ebx = regs->r_ebx; + tp->tf_edx = regs->r_edx; + tp->tf_ecx = regs->r_ecx; + tp->tf_eax = regs->r_eax; + tp->tf_eip = regs->r_eip; + tp->tf_cs = regs->r_cs; + tp->tf_eflags = regs->r_eflags; + tp->tf_esp = regs->r_esp; + tp->tf_ss = regs->r_ss; + pcb->pcb_gs = regs->r_gs; + return (0); +} + +#ifdef CPU_ENABLE_SSE +static void +fill_fpregs_xmm(sv_xmm, sv_87) + struct savexmm *sv_xmm; + struct save87 *sv_87; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + bzero(sv_87, sizeof(*sv_87)); + + /* FPU control/status */ + penv_87->en_cw = penv_xmm->en_cw; + penv_87->en_sw = penv_xmm->en_sw; + penv_87->en_tw = penv_xmm->en_tw; + penv_87->en_fip = penv_xmm->en_fip; + penv_87->en_fcs = penv_xmm->en_fcs; + penv_87->en_opcode = penv_xmm->en_opcode; + penv_87->en_foo = penv_xmm->en_foo; + penv_87->en_fos = penv_xmm->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; +} + +static void +set_fpregs_xmm(sv_87, sv_xmm) + struct save87 *sv_87; + struct savexmm *sv_xmm; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_xmm->en_cw = penv_87->en_cw; + penv_xmm->en_sw = penv_87->en_sw; + penv_xmm->en_tw = penv_87->en_tw; + penv_xmm->en_fip = penv_87->en_fip; + penv_xmm->en_fcs = penv_87->en_fcs; + penv_xmm->en_opcode = penv_87->en_opcode; + penv_xmm->en_foo = penv_87->en_foo; + penv_xmm->en_fos = penv_87->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; +} +#endif /* CPU_ENABLE_SSE */ + +int +fill_fpregs(struct thread *td, struct fpreg *fpregs) +{ +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, + (struct save87 *)fpregs); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); + return (0); +} + +int +set_fpregs(struct thread *td, struct fpreg *fpregs) +{ +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + set_fpregs_xmm((struct save87 *)fpregs, + &td->td_pcb->pcb_save.sv_xmm); + return (0); + } +#endif /* 
CPU_ENABLE_SSE */ + bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); + return (0); +} + +/* + * Get machine context. + */ +int +get_mcontext(struct thread *td, mcontext_t *mcp, int flags) +{ + struct trapframe *tp; + + tp = td->td_frame; + + PROC_LOCK(curthread->td_proc); + mcp->mc_onstack = sigonstack(tp->tf_esp); + PROC_UNLOCK(curthread->td_proc); + mcp->mc_gs = td->td_pcb->pcb_gs; + mcp->mc_fs = tp->tf_fs; + mcp->mc_es = tp->tf_es; + mcp->mc_ds = tp->tf_ds; + mcp->mc_edi = tp->tf_edi; + mcp->mc_esi = tp->tf_esi; + mcp->mc_ebp = tp->tf_ebp; + mcp->mc_isp = tp->tf_isp; + mcp->mc_eflags = tp->tf_eflags; + if (flags & GET_MC_CLEAR_RET) { + mcp->mc_eax = 0; + mcp->mc_edx = 0; + mcp->mc_eflags &= ~PSL_C; + } else { + mcp->mc_eax = tp->tf_eax; + mcp->mc_edx = tp->tf_edx; + } + mcp->mc_ebx = tp->tf_ebx; + mcp->mc_ecx = tp->tf_ecx; + mcp->mc_eip = tp->tf_eip; + mcp->mc_cs = tp->tf_cs; + mcp->mc_esp = tp->tf_esp; + mcp->mc_ss = tp->tf_ss; + mcp->mc_len = sizeof(*mcp); + get_fpcontext(td, mcp); + return (0); +} + +/* + * Set machine context. + * + * However, we don't set any but the user modifiable flags, and we won't + * touch the cs selector. + */ +int +set_mcontext(struct thread *td, const mcontext_t *mcp) +{ + struct trapframe *tp; + int eflags, ret; + + tp = td->td_frame; + if (mcp->mc_len != sizeof(*mcp)) + return (EINVAL); + eflags = (mcp->mc_eflags & PSL_USERCHANGE) | + (tp->tf_eflags & ~PSL_USERCHANGE); + if ((ret = set_fpcontext(td, mcp)) == 0) { + tp->tf_fs = mcp->mc_fs; + tp->tf_es = mcp->mc_es; + tp->tf_ds = mcp->mc_ds; + tp->tf_edi = mcp->mc_edi; + tp->tf_esi = mcp->mc_esi; + tp->tf_ebp = mcp->mc_ebp; + tp->tf_ebx = mcp->mc_ebx; + tp->tf_edx = mcp->mc_edx; + tp->tf_ecx = mcp->mc_ecx; + tp->tf_eax = mcp->mc_eax; + tp->tf_eip = mcp->mc_eip; + tp->tf_eflags = eflags; + tp->tf_esp = mcp->mc_esp; + tp->tf_ss = mcp->mc_ss; + td->td_pcb->pcb_gs = mcp->mc_gs; + ret = 0; + } + return (ret); +} + +static void +get_fpcontext(struct thread *td, mcontext_t *mcp) +{ +#ifndef DEV_NPX + mcp->mc_fpformat = _MC_FPFMT_NODEV; + mcp->mc_ownedfp = _MC_FPOWNED_NONE; +#else + union savefpu *addr; + + /* + * XXX mc_fpstate might be misaligned, since its declaration is not + * unportabilized using __attribute__((aligned(16))) like the + * declaration of struct savemm, and anyway, alignment doesn't work + * for auto variables since we don't use gcc's pessimal stack + * alignment. Work around this by abusing the spare fields after + * mcp->mc_fpstate. + * + * XXX unpessimize most cases by only aligning when fxsave might be + * called, although this requires knowing too much about + * npxgetregs()'s internals. + */ + addr = (union savefpu *)&mcp->mc_fpstate; + if (td == PCPU_GET(fpcurthread) && +#ifdef CPU_ENABLE_SSE + cpu_fxsr && +#endif + ((uintptr_t)(void *)addr & 0xF)) { + do + addr = (void *)((char *)addr + 4); + while ((uintptr_t)(void *)addr & 0xF); + } + mcp->mc_ownedfp = npxgetregs(td, addr); + if (addr != (union savefpu *)&mcp->mc_fpstate) { + bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); + bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); + } + mcp->mc_fpformat = npxformat(); +#endif +} + +static int +set_fpcontext(struct thread *td, const mcontext_t *mcp) +{ + union savefpu *addr; + + if (mcp->mc_fpformat == _MC_FPFMT_NODEV) + return (0); + else if (mcp->mc_fpformat != _MC_FPFMT_387 && + mcp->mc_fpformat != _MC_FPFMT_XMM) + return (EINVAL); + else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) + /* We don't care what state is left in the FPU or PCB. 
*/ + fpstate_drop(td); + else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || + mcp->mc_ownedfp == _MC_FPOWNED_PCB) { + /* XXX align as above. */ + addr = (union savefpu *)&mcp->mc_fpstate; + if (td == PCPU_GET(fpcurthread) && +#ifdef CPU_ENABLE_SSE + cpu_fxsr && +#endif + ((uintptr_t)(void *)addr & 0xF)) { + do + addr = (void *)((char *)addr + 4); + while ((uintptr_t)(void *)addr & 0xF); + bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); + } +#ifdef DEV_NPX +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) + addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; +#endif + /* + * XXX we violate the dubious requirement that npxsetregs() + * be called with interrupts disabled. + */ + npxsetregs(td, addr); +#endif + /* + * Don't bother putting things back where they were in the + * misaligned case, since we know that the caller won't use + * them again. + */ + } else + return (EINVAL); + return (0); +} + +static void +fpstate_drop(struct thread *td) +{ + register_t s; + + s = intr_disable(); +#ifdef DEV_NPX + if (PCPU_GET(fpcurthread) == td) + npxdrop(); +#endif + /* + * XXX force a full drop of the npx. The above only drops it if we + * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. + * + * XXX I don't much like npxgetregs()'s semantics of doing a full + * drop. Dropping only to the pcb matches fnsave's behaviour. + * We only need to drop to !PCB_INITDONE in sendsig(). But + * sendsig() is the only caller of npxgetregs()... perhaps we just + * have too many layers. + */ + curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; + intr_restore(s); +} + +int +fill_dbregs(struct thread *td, struct dbreg *dbregs) +{ + struct pcb *pcb; + + if (td == NULL) { + dbregs->dr[0] = rdr0(); + dbregs->dr[1] = rdr1(); + dbregs->dr[2] = rdr2(); + dbregs->dr[3] = rdr3(); + dbregs->dr[4] = rdr4(); + dbregs->dr[5] = rdr5(); + dbregs->dr[6] = rdr6(); + dbregs->dr[7] = rdr7(); + } else { + pcb = td->td_pcb; + dbregs->dr[0] = pcb->pcb_dr0; + dbregs->dr[1] = pcb->pcb_dr1; + dbregs->dr[2] = pcb->pcb_dr2; + dbregs->dr[3] = pcb->pcb_dr3; + dbregs->dr[4] = 0; + dbregs->dr[5] = 0; + dbregs->dr[6] = pcb->pcb_dr6; + dbregs->dr[7] = pcb->pcb_dr7; + } + return (0); +} + +int +set_dbregs(struct thread *td, struct dbreg *dbregs) +{ + struct pcb *pcb; + int i; + + if (td == NULL) { + load_dr0(dbregs->dr[0]); + load_dr1(dbregs->dr[1]); + load_dr2(dbregs->dr[2]); + load_dr3(dbregs->dr[3]); + load_dr4(dbregs->dr[4]); + load_dr5(dbregs->dr[5]); + load_dr6(dbregs->dr[6]); + load_dr7(dbregs->dr[7]); + } else { + /* + * Don't let an illegal value for dr7 get set. Specifically, + * check for undefined settings. Setting these bit patterns + * result in undefined behaviour and can lead to an unexpected + * TRCTRAP. + */ + for (i = 0; i < 4; i++) { + if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) + return (EINVAL); + if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) + return (EINVAL); + } + + pcb = td->td_pcb; + + /* + * Don't let a process set a breakpoint that is not within the + * process's address space. If a process could do this, it + * could halt the system by setting a breakpoint in the kernel + * (if ddb was enabled). Thus, we need to check to make sure + * that no breakpoints are being enabled for addresses outside + * process's address space. + * + * XXX - what about when the watched area of the user's + * address space is written into from within the kernel + * ... wouldn't that still cause a breakpoint to be generated + * from within kernel mode? 
+         */
+
+        if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
+            /* dr0 is enabled */
+            if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
+                return (EINVAL);
+        }
+
+        if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
+            /* dr1 is enabled */
+            if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
+                return (EINVAL);
+        }
+
+        if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
+            /* dr2 is enabled */
+            if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
+                return (EINVAL);
+        }
+
+        if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
+            /* dr3 is enabled */
+            if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
+                return (EINVAL);
+        }
+
+        pcb->pcb_dr0 = dbregs->dr[0];
+        pcb->pcb_dr1 = dbregs->dr[1];
+        pcb->pcb_dr2 = dbregs->dr[2];
+        pcb->pcb_dr3 = dbregs->dr[3];
+        pcb->pcb_dr6 = dbregs->dr[6];
+        pcb->pcb_dr7 = dbregs->dr[7];
+
+        pcb->pcb_flags |= PCB_DBREGS;
+    }
+
+    return (0);
+}
+
+/*
+ * Return > 0 if a hardware breakpoint has been hit, and the
+ * breakpoint was in user space. Return 0, otherwise.
+ */
+int
+user_dbreg_trap(void)
+{
+    u_int32_t dr7, dr6;    /* debug registers dr6 and dr7 */
+    u_int32_t bp;          /* breakpoint bits extracted from dr6 */
+    int nbp;               /* number of breakpoints that triggered */
+    caddr_t addr[4];       /* breakpoint addresses */
+    int i;
+
+    dr7 = rdr7();
+    if ((dr7 & 0x000000ff) == 0) {
+        /*
+         * all local and global enable bits in the dr7 register are
+         * zero, thus the trap couldn't have been caused by the
+         * hardware debug registers
+         */
+        return 0;
+    }
+
+    nbp = 0;
+    dr6 = rdr6();
+    bp = dr6 & 0x0000000f;
+
+    if (!bp) {
+        /*
+         * None of the breakpoint bits are set, meaning this
+         * trap was not caused by any of the debug registers
+         */
+        return 0;
+    }
+
+    /*
+     * At least one of the breakpoints was hit; check which ones
+     * and whether any of them are user space addresses
+     */
+
+    if (bp & 0x01) {
+        addr[nbp++] = (caddr_t)rdr0();
+    }
+    if (bp & 0x02) {
+        addr[nbp++] = (caddr_t)rdr1();
+    }
+    if (bp & 0x04) {
+        addr[nbp++] = (caddr_t)rdr2();
+    }
+    if (bp & 0x08) {
+        addr[nbp++] = (caddr_t)rdr3();
+    }
+
+    for (i = 0; i < nbp; i++) {
+        if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
+            /*
+             * addr[i] is in user space
+             */
+            return nbp;
+        }
+    }
+
+    /*
+     * None of the breakpoints are in user space.
+     */
+    return 0;
+}
+
+#ifndef DEV_APIC
+#include <machine/apicvar.h>
+
+/*
+ * Provide stub functions so that the MADT APIC enumerator in the acpi
+ * kernel module will link against a kernel without 'device apic'.
+ *
+ * XXX - This is a gross hack.
+ */ +void +apic_register_enumerator(struct apic_enumerator *enumerator) +{ +} + +void * +ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) +{ + return (NULL); +} + +int +ioapic_disable_pin(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_get_vector(void *cookie, u_int pin) +{ + return (-1); +} + +void +ioapic_register(void *cookie) +{ +} + +int +ioapic_remap_vector(void *cookie, u_int pin, int vector) +{ + return (ENXIO); +} + +int +ioapic_set_extint(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_set_nmi(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) +{ + return (ENXIO); +} + +int +ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) +{ + return (ENXIO); +} + +void +lapic_create(u_int apic_id, int boot_cpu) +{ +} + +void +lapic_init(vm_paddr_t addr) +{ +} + +int +lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) +{ + return (ENXIO); +} + +int +lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) +{ + return (ENXIO); +} + +int +lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) +{ + return (ENXIO); +} +#endif + +#ifdef KDB + +/* + * Provide inb() and outb() as functions. They are normally only + * available as macros calling inlined functions, thus cannot be + * called from the debugger. + * + * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. + */ + +#undef inb +#undef outb + +/* silence compiler warnings */ +u_char inb(u_int); +void outb(u_int, u_char); + +u_char +inb(u_int port) +{ + u_char data; + /* + * We use %%dx and not %1 here because i/o is done at %dx and not at + * %edx, while gcc generates inferior code (movw instead of movl) + * if we tell it to load (u_short) port. + */ + __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); + return (data); +} + +void +outb(u_int port, u_char data) +{ + u_char al; + /* + * Use an unnecessary assignment to help gcc's register allocator. + * This make a large difference for gcc-1.40 and a tiny difference + * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for + * best results. gcc-2.6.0 can't handle this. + */ + al = data; + __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); +} + +#endif /* KDB */ diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c new file mode 100644 index 0000000..920d6cb --- /dev/null +++ b/sys/i386/xen/pmap.c @@ -0,0 +1,3819 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. + * + * In addition to hardware address maps, this + * module is called upon to provide software-use-only + * maps which may or may not be stored in the same + * form as hardware maps. These pseudo-maps are + * used to store intermediate results from copy + * operations to and from address spaces. 
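+ *
+ * In this Xen port, page table updates are additionally funneled
+ * through hypervisor update queues (see pd_set() and
+ * xen_queue_pt_update() below) rather than being written to the
+ * page tables directly.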
+ * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. + */ + +#include "opt_cpu.h" +#include "opt_pmap.h" +#include "opt_msgbuf.h" +#include "opt_xbox.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sx.h> +#include <sys/vmmeter.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#ifdef SMP +#include <sys/smp.h> +#endif + +#ifdef XBOX +#include <machine/xbox.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + +#ifdef XEN +#include <xen/interface/xen.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/hypercall.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#endif + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/specialreg.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#define PMAP_DIAGNOSTIC + +#if defined(DIAGNOSTIC) +#define PMAP_DIAGNOSTIC +#endif + +#if !defined(PMAP_DIAGNOSTIC) +#define PMAP_INLINE __inline +#else +#define PMAP_INLINE +#endif + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) +#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) + +#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) +#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) +#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) +#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) +#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) + +#ifndef XEN +#define pmap_pte_set_w(pte, v) ((v) ? 
atomic_set_int((u_int *)(pte), PG_W) : \ + atomic_clear_int((u_int *)(pte), PG_W)) +#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) +#endif + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_paddr_t avail_end; /* PA of last available physical page */ +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +int pgeflag = 0; /* PG_G or-in */ +int pseflag = 0; /* PG_PS or-in */ + +static int nkpt; +vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; + +#if defined(PAE) && !defined(XEN) +static uma_zone_t pdptzone; +#endif + +/* + * Data for the pv entry allocation mechanism + */ +static uma_zone_t pvzone; +static struct vm_object pvzone_obj; +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +int pmap_pagedaemon_waken; + +/* + * All those kernel PT submaps that BSD is so fond of + */ +struct sysmaps { + struct mtx lock; + pt_entry_t *CMAP1; + pt_entry_t *CMAP2; + caddr_t CADDR1; + caddr_t CADDR2; +}; +static struct sysmaps sysmaps_pcpu[MAXCPU]; +pt_entry_t *CMAP1 = 0; +static pt_entry_t *CMAP3; +caddr_t CADDR1 = 0, ptvmmap = 0; +static caddr_t CADDR3; +struct msgbuf *msgbufp = 0; + +/* + * Crashdump maps. + */ +static caddr_t crashdumpmap; + +#ifdef SMP +extern pt_entry_t *SMPpt; +#endif +static pt_entry_t *PMAP1 = 0, *PMAP2; +static pt_entry_t *PADDR1 = 0, *PADDR2; +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static PMAP_INLINE void free_pv_entry(pv_entry_t pv); +static pv_entry_t get_pv_entry(void); +static void pmap_clear_ptes(vm_page_t m, int bit); + +static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + vm_page_t *free); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); +static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, + vm_offset_t va); +static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); + +static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); +static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); +static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); +static void pmap_pte_release(pt_entry_t *pte); +static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); +static vm_offset_t pmap_kmem_choose(vm_offset_t addr); +#if defined(PAE) && !defined(XEN) +static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +#endif + +CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); +CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); + +/* + * If you get an error here, then you set KVA_PAGES wrong! 
See the
+ * description of KVA_PAGES in sys/i386/include/pmap.h.  It must be a
+ * multiple of 4 for a normal kernel, or a multiple of 8 for PAE.
+ */
+CTASSERT(KERNBASE % (1 << 24) == 0);
+
+static __inline void
+pagezero(void *page)
+{
+#if defined(I686_CPU)
+	if (cpu_class == CPUCLASS_686) {
+#if defined(CPU_ENABLE_SSE)
+		if (cpu_feature & CPUID_SSE2)
+			sse2_pagezero(page);
+		else
+#endif
+			i686_pagezero(page);
+	} else
+#endif
+		bzero(page, PAGE_SIZE);
+}
+
+void
+pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type)
+{
+	vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]);
+
+	switch (type) {
+	case SH_PD_SET_VA:
+#if 0
+		xen_queue_pt_update(shadow_pdir_ma,
+		    xpmap_ptom(val & ~(PG_RW)));
+#endif
+		xen_queue_pt_update(pdir_ma,
+		    xpmap_ptom(val));
+		break;
+	case SH_PD_SET_VA_MA:
+#if 0
+		xen_queue_pt_update(shadow_pdir_ma,
+		    val & ~(PG_RW));
+#endif
+		xen_queue_pt_update(pdir_ma, val);
+		break;
+	case SH_PD_SET_VA_CLEAR:
+#if 0
+		xen_queue_pt_update(shadow_pdir_ma, 0);
+#endif
+		xen_queue_pt_update(pdir_ma, 0);
+		break;
+	}
+}
+
+/*
+ * Move the kernel virtual free pointer to the next
+ * 4MB.  This is used to help improve performance
+ * by using a large (4MB) page for much of the kernel
+ * (.text, .data, .bss).
+ */
+static vm_offset_t
+pmap_kmem_choose(vm_offset_t addr)
+{
+	vm_offset_t newaddr = addr;
+
+#ifndef DISABLE_PSE
+	if (cpu_feature & CPUID_PSE)
+		newaddr = (addr + PDRMASK) & ~PDRMASK;
+#endif
+	return newaddr;
+}
+
+/*
+ * Bootstrap the system enough to run with virtual memory.
+ *
+ * On the i386 this is called after mapping has already been enabled
+ * and just syncs the pmap module with what has already been done.
+ * [We can't call it easily with mapping off since the kernel is not
+ * mapped with PA == VA, hence we would have to relocate every address
+ * from the linked base (virtual) address "KERNBASE" to the actual
+ * (physical) address starting relative to 0]
+ */
+void
+pmap_bootstrap(firstaddr, loadaddr)
+	vm_paddr_t firstaddr;
+	vm_paddr_t loadaddr;
+{
+	vm_offset_t va;
+	pt_entry_t *pte, *unused;
+	struct sysmaps *sysmaps;
+	int i;
+
+	/*
+	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
+	 * large. It should instead be correctly calculated in locore.s and
+	 * not based on 'first' (which is a physical address, not a virtual
+	 * address, for the start of unused physical memory). The kernel
+	 * page tables are NOT double mapped and thus should not be included
+	 * in this calculation.
+	 */
+	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
+	virtual_avail = pmap_kmem_choose(virtual_avail);
+
+	virtual_end = VM_MAX_KERNEL_ADDRESS;
+
+	/*
+	 * Initialize the kernel pmap (which is statically allocated).
+	 */
+	PMAP_LOCK_INIT(kernel_pmap);
+	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
+#ifdef PAE
+	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
+#endif
+	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
+	TAILQ_INIT(&kernel_pmap->pm_pvlist);
+	LIST_INIT(&allpmaps);
+	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
+	mtx_lock_spin(&allpmaps_lock);
+	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
+	mtx_unlock_spin(&allpmaps_lock);
+	nkpt = NKPT;
+
+	/*
+	 * Reserve some special page table entries/VA space for temporary
+	 * mapping of pages.
+	 */
+#define	SYSMAP(c, p, v, n)	\
+	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
+
+	va = virtual_avail;
+	pte = vtopte(va);
+
+	/*
+	 * CMAP1/CMAP2 are used for zeroing and copying pages.
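+	 * (Each sysmaps_pcpu[] slot pairs a PTE pointer, CMAPx, with the
+	 * KVA it maps, CADDRx; the SYSMAP() macro above simply hands out
+	 * the next PTE/VA pair from the reserved range.)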
+	 * CMAP3 is used for the idle process page zeroing.
+	 */
+	for (i = 0; i < MAXCPU; i++) {
+		sysmaps = &sysmaps_pcpu[i];
+		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
+		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
+		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
+	}
+	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
+	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
+#ifdef XEN
+	PT_SET_MA(CADDR3, 0);
+#else
+	*CMAP3 = 0;
+#endif
+	/*
+	 * Crashdump maps.
+	 */
+	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
+
+	/*
+	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
+	 */
+	SYSMAP(caddr_t, unused, ptvmmap, 1)
+
+	/*
+	 * msgbufp is used to map the system message buffer.
+	 */
+	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
+
+	/*
+	 * ptemap is used for pmap_pte_quick
+	 */
+	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
+	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
+
+	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
+
+	virtual_avail = va;
+#ifdef XEN
+	PT_SET_MA(CADDR1, 0);
+#else
+	*CMAP1 = 0;
+#endif
+
+#if !defined(XEN)
+#ifdef XBOX
+	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
+	 * an early stage, we cannot yet neatly map video memory ... :-(
+	 * Better fixes are very welcome!
+	 */
+	if (!arch_i386_is_xbox)
+#endif
+	for (i = 0; i < NKPT; i++)
+		PTD[i] = 0;
+
+	/* Initialize the PAT MSR if present. */
+	pmap_init_pat();
+
+	/* Turn on PG_G on kernel page(s) */
+	pmap_set_pg();
+#endif /* !XEN */
+}
+
+/*
+ * Set up the PAT MSR.
+ */
+void
+pmap_init_pat(void)
+{
+	uint64_t pat_msr;
+
+	/* Bail if this CPU doesn't implement PAT. */
+	if (!(cpu_feature & CPUID_PAT))
+		return;
+
+#ifdef PAT_WORKS
+	/*
+	 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
+	 * Program 4 and 5 as WP and WC.
+	 * Leave 6 and 7 as UC and UC-.
+	 */
+	pat_msr = rdmsr(MSR_PAT);
+	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
+	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
+	    PAT_VALUE(5, PAT_WRITE_COMBINING);
+#else
+	/*
+	 * Due to some Intel errata, we can only safely use the lower 4
+	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
+	 * of UC-.
+	 *
+	 *   Intel Pentium III Processor Specification Update
+	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
+	 * or Mode C Paging)
+	 *
+	 *   Intel Pentium IV Processor Specification Update
+	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
+	 */
+	pat_msr = rdmsr(MSR_PAT);
+	pat_msr &= ~PAT_MASK(2);
+	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
+#endif
+	wrmsr(MSR_PAT, pat_msr);
+}
+
+/*
+ * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
+ */
+void
+pmap_set_pg(void)
+{
+	pd_entry_t pdir;
+	pt_entry_t *pte;
+	vm_offset_t va, endva;
+	int i;
+
+	if (pgeflag == 0)
+		return;
+
+	i = KERNLOAD/NBPDR;
+	endva = KERNBASE + KERNend;
+
+	if (pseflag) {
+		va = KERNBASE + KERNLOAD;
+		while (va < endva) {
+			pdir = kernel_pmap->pm_pdir[KPTDI+i];
+			pdir |= pgeflag;
+			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
+			invltlb();	/* Play it safe, invltlb() every time */
+			i++;
+			va += NBPDR;
+		}
+	} else {
+		va = (vm_offset_t)btext;
+		while (va < endva) {
+			pte = vtopte(va);
+			if (*pte) {
+#ifdef XEN
+				PT_SET_MA(va, *pte | pgeflag);
+#else
+				*pte |= pgeflag;
+#endif
+			}
+			invltlb();	/* Play it safe, invltlb() every time */
+			va += PAGE_SIZE;
+		}
+	}
+}
+
+/*
+ * Initialize a vm_page's machine-dependent fields.
+ */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + m->md.pv_list_count = 0; +} + +#if defined(PAE) && !defined(XEN) + +static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); + +static void * +pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +{ + *flags = UMA_SLAB_PRIV; + return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, + 1, 0)); +} +#endif + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + int shpgperproc = PMAP_SHPGPERPROC; + + /* + * Initialize the address space (zone) for the pv entries. Set a + * high water mark so that the system can recover from excessive + * numbers of pv entries. + */ + pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); + TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); + pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); + pv_entry_high_water = 9 * (pv_entry_max / 10); + uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); + +#if defined(PAE) && !defined(XEN) + pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, + NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); +#endif +} + +void +pmap_init2() +{ +} + + +/*************************************************** + * Low level helper routines..... + ***************************************************/ + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(int mode, boolean_t is_pde) +{ + int pat_flag, pat_index, cache_bits; + + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; + + /* If we don't support PAT, map extended modes to older ones. */ + if (!(cpu_feature & CPUID_PAT)) { + switch (mode) { + case PAT_UNCACHEABLE: + case PAT_WRITE_THROUGH: + case PAT_WRITE_BACK: + break; + case PAT_UNCACHED: + case PAT_WRITE_COMBINING: + case PAT_WRITE_PROTECTED: + mode = PAT_UNCACHEABLE; + break; + } + } + + /* Map the caching mode to a PAT index. */ + switch (mode) { +#ifdef PAT_WORKS + case PAT_UNCACHEABLE: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_UNCACHED: + pat_index = 2; + break; + case PAT_WRITE_COMBINING: + pat_index = 5; + break; + case PAT_WRITE_PROTECTED: + pat_index = 4; + break; +#else + case PAT_UNCACHED: + case PAT_UNCACHEABLE: + case PAT_WRITE_PROTECTED: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_WRITE_COMBINING: + pat_index = 2; + break; +#endif + default: + panic("Unknown caching mode %d\n", mode); + } + + /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ + cache_bits = 0; + if (pat_index & 0x4) + cache_bits |= pat_flag; + if (pat_index & 0x2) + cache_bits |= PG_NC_PCD; + if (pat_index & 0x1) + cache_bits |= PG_NC_PWT; + return (cache_bits); +} +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. + * + * N.B.: Before calling any of the following TLB invalidation functions, + * the calling processor must ensure that all stores updating a non- + * kernel page table are globally performed. 
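+ * That is, the new page table contents must be visible to every
+ * processor before the corresponding invalidation is requested.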
+ * Otherwise, another processor could cache an old, pre-update entry
+ * without being invalidated.  This can happen one of two ways: (1) The
+ * pmap becomes active on another processor after its pm_active field
+ * is checked by one of the following functions but before a store
+ * updating the page table is globally performed. (2) The pmap becomes
+ * active on another processor before its pm_active field is checked
+ * but due to speculative loads one of the following functions still
+ * reads the pmap as inactive on the other processor.
+ *
+ * The kernel page table is exempt because its pm_active field is
+ * immutable.  The kernel page table is always active on every
+ * processor.
+ */
+void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+	u_int cpumask;
+	u_int other_cpus;
+
+	CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x",
+	    pmap, va);
+
+	sched_pin();
+	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
+		invlpg(va);
+		smp_invlpg(va);
+	} else {
+		cpumask = PCPU_GET(cpumask);
+		other_cpus = PCPU_GET(other_cpus);
+		if (pmap->pm_active & cpumask)
+			invlpg(va);
+		if (pmap->pm_active & other_cpus)
+			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
+	}
+	PT_UPDATES_FLUSH();
+	sched_unpin();
+}
+
+void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	u_int cpumask;
+	u_int other_cpus;
+	vm_offset_t addr;
+
+	CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x",
+	    pmap, sva, eva);
+
+	sched_pin();
+	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
+		for (addr = sva; addr < eva; addr += PAGE_SIZE)
+			invlpg(addr);
+		smp_invlpg_range(sva, eva);
+	} else {
+		cpumask = PCPU_GET(cpumask);
+		other_cpus = PCPU_GET(other_cpus);
+		if (pmap->pm_active & cpumask)
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		if (pmap->pm_active & other_cpus)
+			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
+			    sva, eva);
+	}
+	PT_UPDATES_FLUSH();
+	sched_unpin();
+}
+
+void
+pmap_invalidate_all(pmap_t pmap)
+{
+	u_int cpumask;
+	u_int other_cpus;
+
+	CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap);
+	sched_pin();
+	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
+		invltlb();
+		smp_invltlb();
+	} else {
+		cpumask = PCPU_GET(cpumask);
+		other_cpus = PCPU_GET(other_cpus);
+		if (pmap->pm_active & cpumask)
+			invltlb();
+		if (pmap->pm_active & other_cpus)
+			smp_masked_invltlb(pmap->pm_active & other_cpus);
+	}
+	sched_unpin();
+}
+
+void
+pmap_invalidate_cache(void)
+{
+
+	sched_pin();
+	wbinvd();
+	smp_cache_flush();
+	sched_unpin();
+}
+#else /* !SMP */
+/*
+ * Normal, non-SMP, 486+ invalidation functions.
+ * We inline these within pmap.c for speed.
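+ *
+ * As in the SMP variants above, PT_UPDATES_FLUSH() is used here so
+ * that any page table writes still queued for the Xen hypervisor are
+ * issued before stale TLB entries are assumed to be gone.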
+ */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + if (pmap == kernel_pmap || pmap->pm_active) { + CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", + pmap, va); + invlpg(va); + PT_UPDATES_FLUSH(); + } +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (pmap == kernel_pmap || pmap->pm_active) { + if (eva - sva > PAGE_SIZE) + CTR3(KTR_PMAP, + "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + PT_UPDATES_FLUSH(); + } +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + + if (pmap == kernel_pmap || pmap->pm_active) { + CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); + invltlb(); + } +} + +PMAP_INLINE void +pmap_invalidate_cache(void) +{ + + wbinvd(); +} +#endif /* !SMP */ + +/* + * Are we current address space or kernel? N.B. We return FALSE when + * a pmap's page table is in use because a kernel thread is borrowing + * it. The borrowed page table can change spontaneously, making any + * dependence on its continued use subject to a race condition. + */ +static __inline int +pmap_is_current(pmap_t pmap) +{ + + return (pmap == kernel_pmap || + (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && + (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); +} + +/* + * If the given pmap is not the current or kernel pmap, the returned pte must + * be released by passing it to pmap_pte_release(). + */ +pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_lock(&PMAP2mutex); + newpf = *pde & PG_FRAME; + if ((*PMAP2 & PG_FRAME) != newpf) { +#ifdef XEN + PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); + CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", + pmap, va, (*PMAP2 & 0xffffffff)); +#else + *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); +#endif + } + return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte + * being NULL. + */ +static __inline void +pmap_pte_release(pt_entry_t *pte) +{ + + if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { + CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", + *PMAP2); + PT_SET_VA_MA(PMAP2, 0, TRUE); + mtx_unlock(&PMAP2mutex); + } +} + +static __inline void +invlcaddr(void *caddr) +{ + + invlpg((u_int)caddr); + PT_UPDATES_FLUSH(); +} + +/* + * Super fast pmap_pte routine best used when scanning + * the pv lists. This eliminates many coarse-grained + * invltlb calls. Note that many of the pv list + * scans are across different pmaps. It is very wasteful + * to do an entire invltlb for checking a single mapping. + * + * If the given pmap is not the current pmap, vm_page_queue_mtx + * must be held and curthread pinned to a CPU. + */ +static pt_entry_t * +pmap_pte_quick(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + + /* + * + * XXX hitting this indicates that things are AFU + */ + if (*pde != 0) { + /* are we current address space or kernel? 
*/ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + newpf = *pde & PG_FRAME; + if ((*PMAP1 & PG_FRAME) != newpf) { +#ifdef XEN + PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M); + CTR3(KTR_PMAP, + "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x", + pmap, va, (u_long)*PMAP1); +#else + *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; +#endif +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + invlcaddr(PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + invlcaddr(PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. + */ + +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t rtval; + pt_entry_t *pte; + pd_entry_t pde; + + rtval = 0; + PMAP_LOCK(pmap); + pde = pmap->pm_pdir[va >> PDRSHIFT]; + if (pde != 0) { + if ((pde & PG_PS) != 0) { +#ifdef XEN + rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK); +#else + rtval = (pde & ~PDRMASK) | (va & PDRMASK); +#endif + PMAP_UNLOCK(pmap); + return (rtval); + } + pte = pmap_pte(pmap, va); +#ifdef XEN + rtval = ((*pte ? xpmap_mtop(*pte) : 0) & PG_FRAME) | (va & PAGE_MASK); + +#else + rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); +#endif + pmap_pte_release(pte); + } + PMAP_UNLOCK(pmap); + return (rtval); +} + +vm_paddr_t +pmap_extract_ma(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t rtval; + pt_entry_t *pte; + pd_entry_t pde; + + rtval = 0; + PMAP_LOCK(pmap); + pde = pmap->pm_pdir[va >> PDRSHIFT]; + if (pde != 0) { + if ((pde & PG_PS) != 0) { + rtval = (pde & ~PDRMASK) | (va & PDRMASK); + + PMAP_UNLOCK(pmap); + return (rtval); + } + pte = pmap_pte(pmap, va); + + rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); + pmap_pte_release(pte); + } + PMAP_UNLOCK(pmap); + return (rtval); +} + + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. + */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pd_entry_t pde; + pt_entry_t pte; + vm_page_t m; + + m = NULL; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + pde = PT_GET(pmap_pde(pmap, va)); + if (pde != 0) { + if (pde & PG_PS) { + if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { + m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) | + (va & PDRMASK)); + vm_page_hold(m); + } + } else { + sched_pin(); + pte = PT_GET(pmap_pte_quick(pmap, va)); + if (pte != 0 && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + vm_page_hold(m); + } + sched_unpin(); + } + } + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + return (m); +} + +/*************************************************** + * Low level mapping routines..... + ***************************************************/ + +/* + * Add a wired page to the kva. + * Note: not SMP coherent. 
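+ *
+ * An illustrative use (not taken from this file) would be to map a
+ * frame at a reserved KVA slot and later tear the mapping down:
+ *
+ *	pmap_kenter(va, pa);
+ *	bcopy(src, (void *)va, PAGE_SIZE);
+ *	pmap_kremove(va);
+ *	pmap_invalidate_page(kernel_pmap, va);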
+ */
+PMAP_INLINE void
+pmap_kenter(vm_offset_t va, vm_paddr_t pa)
+{
+	PT_SET_MA(va, xpmap_ptom(pa) | PG_RW | PG_V | pgeflag);
+}
+
+PMAP_INLINE void
+pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa)
+{
+
+	PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag);
+}
+
+PMAP_INLINE void
+pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
+{
+	PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
+}
+
+/*
+ * Remove a page from the kernel pagetables.
+ * Note: not SMP coherent.
+ */
+PMAP_INLINE void
+pmap_kremove(vm_offset_t va)
+{
+	pt_entry_t *pte;
+
+	pte = vtopte(va);
+	PT_SET_VA_MA(pte, 0, FALSE);
+}
+
+/*
+ * Used to map a range of physical addresses into kernel
+ * virtual address space.
+ *
+ * The value passed in '*virt' is a suggested virtual address for
+ * the mapping. Architectures which can support a direct-mapped
+ * physical to virtual region can return the appropriate address
+ * within that region, leaving '*virt' unchanged. Other
+ * architectures should map the pages starting at '*virt' and
+ * update '*virt' with the first usable address after the mapped
+ * region.
+ */
+vm_offset_t
+pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
+{
+	vm_offset_t va, sva;
+
+	va = sva = *virt;
+	while (start < end) {
+		pmap_kenter(va, start);
+		va += PAGE_SIZE;
+		start += PAGE_SIZE;
+	}
+	pmap_invalidate_range(kernel_pmap, sva, va);
+	*virt = va;
+	return (sva);
+}
+
+
+/*
+ * Add a list of wired pages to the kva.  This routine is only used
+ * for temporary kernel mappings that do not need to have page
+ * modification or references recorded.  Note that old mappings are
+ * simply written over.  The page *must* be wired.
+ * Note: SMP coherent.  Uses a ranged shootdown IPI.
+ */
+void
+pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
+{
+	pt_entry_t *endpte, oldpte, *pte;
+
+	oldpte = 0;
+	pte = vtopte(sva);
+	endpte = pte + count;
+	vm_page_lock_queues();
+	critical_enter();
+	while (pte < endpte) {
+		oldpte |= *pte;
+#ifdef XEN
+		PT_SET_VA(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V, FALSE);
+#else
+		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
+#endif
+		pte++;
+		ma++;
+	}
+	PT_UPDATES_FLUSH();
+	if ((oldpte & PG_V) != 0)
+		pmap_invalidate_range(kernel_pmap, sva, sva + count *
+		    PAGE_SIZE);
+	vm_page_unlock_queues();
+	critical_exit();
+}
+
+/*
+ * This routine tears out page mappings from the
+ * kernel -- it is meant only for temporary mappings.
+ * Note: SMP coherent.  Uses a ranged shootdown IPI.
+ */
+void
+pmap_qremove(vm_offset_t sva, int count)
+{
+	vm_offset_t va;
+
+	va = sva;
+	vm_page_lock_queues();
+	critical_enter();
+	while (count-- > 0) {
+		pmap_kremove(va);
+		va += PAGE_SIZE;
+	}
+	pmap_invalidate_range(kernel_pmap, sva, va);
+	critical_exit();
+	vm_page_unlock_queues();
+}
+
+/***************************************************
+ * Page table page management routines.....
+ ***************************************************/
+static PMAP_INLINE void
+pmap_free_zero_pages(vm_page_t free)
+{
+	vm_page_t m;
+
+	while (free != NULL) {
+		m = free;
+		free = m->right;
+		vm_page_free_zero(m);
+	}
+}
+
+/*
+ * This routine unholds page table pages, and if the hold count
+ * drops to zero, then it decrements the wire count.
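+ * Freed page table pages are queued on the caller's "free" list and
+ * are only released, via pmap_free_zero_pages(), once all pending TLB
+ * shootdowns have completed.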
+ */ +static PMAP_INLINE int +pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) +{ + + --m->wire_count; + if (m->wire_count == 0) + return _pmap_unwire_pte_hold(pmap, m, free); + else + return 0; +} + +static int +_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) +{ + vm_offset_t pteva; + + PT_UPDATES_FLUSH(); + /* + * unmap the page table page + */ +#ifdef XEN + xen_pt_unpin(pmap->pm_pdir[m->pindex]); + PT_SET_VA_MA(&pmap->pm_pdir[m->pindex], 0, TRUE); + pmap_zero_page(m); +#else + pmap->pm_pdir[m->pindex] = 0; +#endif + --pmap->pm_stats.resident_count; + + /* + * This is a release store so that the ordinary store unmapping + * the page table page is globally performed before TLB shoot- + * down is begun. + */ + atomic_subtract_rel_int(&cnt.v_wire_count, 1); + + /* + * Do an invltlb to make the invalidated mapping + * take effect immediately. + */ + pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); + pmap_invalidate_page(pmap, pteva); + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done + */ + m->right = *free; + *free = m; + + return 1; +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. + */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) +{ + pd_entry_t ptepde; + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return 0; + ptepde = PT_GET(pmap_pde(pmap, va)); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return pmap_unwire_pte_hold(pmap, mpte, free); +} + +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + + PMAP_LOCK_INIT(pmap); + pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); +#ifdef PAE + pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); +#endif + pmap->pm_active = 0; + PCPU_SET(curpmap, pmap); + TAILQ_INIT(&pmap->pm_pvlist); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); +} + +/* + * Initialize a preallocated and zeroed pmap structure, + * such as one in a vmspace structure. + */ +void +pmap_pinit(struct pmap *pmap) +{ +#ifdef XEN + vm_page_t m, ptdpg[NPGPTD + 1]; + int npgptd = NPGPTD + 1; +#else + vm_page_t m, ptdpg[NPGPTD]; + vm_paddr_t pa; + int npgptd = NPGPTD; +#endif + static int color; + int i; + + PMAP_LOCK_INIT(pmap); + + /* + * No need to allocate page table space yet but we do need a valid + * page directory table. 
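+	 * Under Xen one extra page, ptdpg[NPGPTD], is allocated here;
+	 * with PAE it backs the PDPT, which is mapped read-only and
+	 * pinned through the hypervisor along with the page directory
+	 * pages below.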
+ */ + if (pmap->pm_pdir == NULL) { + pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, + NBPTD); + +#if defined(PAE) +#ifdef XEN + pmap->pm_pdpt = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1); +#else + pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); + KASSERT(((vm_offset_t)pmap->pm_pdpt & + ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, + ("pmap_pinit: pdpt misaligned")); + KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), + ("pmap_pinit: pdpt above 4g")); +#endif /* !XEN */ +#endif /* PAE */ + } + + /* + * allocate the page directory page(s) + */ + for (i = 0; i < npgptd;) { + m = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (m == NULL) + VM_WAIT; + else { + ptdpg[i++] = m; + } + } + + pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); + + for (i = 0; i < NPGPTD; i++) { + if ((ptdpg[i]->flags & PG_ZERO) == 0) + pagezero(&pmap->pm_pdir[i*NPDEPG]); + } + + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + /* Wire in kernel global address entries. */ + /* XXX copies current process, does not fill in MPPTDI */ + bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); + +#ifdef PAE +#ifdef XEN + pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1); + if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0) + bzero(pmap->pm_pdpt, PAGE_SIZE); +#endif + for (i = 0; i < NPGPTD; i++) { + vm_paddr_t ma; + + ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + pmap->pm_pdpt[i] = ma | PG_V; + + } +#endif + +#ifdef SMP + pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; +#endif + + +#ifdef XEN + for (i = 0; i < NPGPTD; i++) { + pt_entry_t *pd; + vm_paddr_t ma; + + ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + pd = pmap->pm_pdir + (i * NPDEPG); + PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW)); + + } + +#ifdef PAE + PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW); +#endif + vm_page_lock_queues(); + xen_flush_queue(); + xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[NPGPTD]))); + for (i = 0; i < NPGPTD; i++) { + vm_paddr_t ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE); + } + xen_flush_queue(); + vm_page_unlock_queues(); +#else + /* install self-referential address mapping entry(s) */ + for (i = 0; i < NPGPTD; i++) { + pa = VM_PAGE_TO_PHYS(ptdpg[i]); + pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; +#ifdef PAE + pmap->pm_pdpt[i] = pa | PG_V; +#endif + } +#endif + pmap->pm_active = 0; + TAILQ_INIT(&pmap->pm_pvlist); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); +} + +/* + * this routine is called if the page table page is not + * mapped correctly. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, unsigned int ptepindex, int flags) +{ + vm_paddr_t ptepa; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (flags & M_WAITOK) { + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); + VM_WAIT; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + } + + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. 
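+		 * (pmap_allocpte() rechecks the page directory entry and
+		 * retries the allocation when M_WAITOK is specified.)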
+ */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + + pmap->pm_stats.resident_count++; +#ifdef XEN + ptepa = xpmap_ptom(VM_PAGE_TO_PHYS(m)); + xen_pt_pin(ptepa); + PT_SET_VA_MA(&pmap->pm_pdir[ptepindex], + (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE); + + KASSERT(pmap->pm_pdir[ptepindex], + ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex)); +#else + ptepa = VM_PAGE_TO_PHYS(m); + pmap->pm_pdir[ptepindex] = + (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); +#endif + return (m); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) +{ + unsigned ptepindex; + pd_entry_t ptepa; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; +retry: + /* + * Get the page directory entry + */ + ptepa = pmap->pm_pdir[ptepindex]; + + /* + * XXX track me down and fix me! + */ + if ((ptepa & PG_V) == 0) { + if (ptepa && ((ptepa & PG_V) == 0)) + panic("phys addr set but not valid"); + } + + /* + * This supports switching from a 4MB page to a + * normal 4K page. + */ + if (ptepa & PG_PS) { + pmap->pm_pdir[ptepindex] = 0; + ptepa = 0; + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + pmap_invalidate_all(kernel_pmap); + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. + */ + if (ptepa) { +#ifdef XEN + m = PHYS_TO_VM_PAGE(xpmap_mtop(ptepa)); +#else + m = PHYS_TO_VM_PAGE(ptepa); +#endif + m->wire_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has + * been deallocated. + */ + CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x", + pmap, va, flags); + + m = _pmap_allocpte(pmap, ptepindex, flags); + if (m == NULL && (flags & M_WAITOK)) + goto retry; + KASSERT(pmap->pm_pdir[ptepindex], + ("ptepindex=%d did not get mapped", ptepindex)); + } + return (m); +} + + +/*************************************************** +* Pmap allocation/deallocation routines. + ***************************************************/ + +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
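+ *
+ * The approach below is to send IPI_LAZYPMAP to any processor whose
+ * bit is still set in pm_active; the handler reloads %cr3 from the
+ * current pcb if that CPU is still running on the dying pmap's page
+ * tables.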
+ */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask; + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&smp_ipi_mtx); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + mymask = PCPU_GET(cpumask); + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&smp_ipi_mtx); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); + } +} +#endif /* SMP */ + +/* + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. 
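+ * Under Xen the page directory pages (and, with PAE, the PDPT page)
+ * must also be unpinned via the hypervisor before they can be freed;
+ * see the xen_pgd_unpin() calls below.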
+ */ +void +pmap_release(pmap_t pmap) +{ +#ifdef XEN + vm_page_t m, ptdpg[NPGPTD+1]; + int npgptd = NPGPTD + 1; +#else + vm_page_t m, ptdpg[NPGPTD]; + int npgptd = NPGPTD; +#endif + int i; + + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + + pmap_lazyfix(pmap); + mtx_lock_spin(&allpmaps_lock); + LIST_REMOVE(pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + +#ifdef XEN + for (i = 0; i < NPGPTD; i++) + ptdpg[i] = PHYS_TO_VM_PAGE(xpmap_mtop(pmap->pm_pdir[PTDPTDI + i])); +#else + for (i = 0; i < NPGPTD; i++) + ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]); +#endif + + bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * + sizeof(*pmap->pm_pdir)); +#ifdef SMP + pmap->pm_pdir[MPPTDI] = 0; +#endif + + pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); +#if defined(PAE) && defined(XEN) + ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt)); +#endif + + vm_page_lock_queues(); + for (i = 0; i < npgptd; i++) { + vm_paddr_t ma; + + m = ptdpg[i]; + ma = xpmap_ptom(VM_PAGE_TO_PHYS(m)); + /* unpinning L1 and L2 treated the same */ + xen_pgd_unpin(ma); +#ifdef PAE + KASSERT( +#ifdef XEN + xpmap_ptom(VM_PAGE_TO_PHYS(m)) +#else + VM_PAGE_TO_PHYS(m) +#endif + == (pmap->pm_pdpt[i] & PG_FRAME), + ("pmap_release: got wrong ptd page")); +#endif + m->wire_count--; + atomic_subtract_int(&cnt.v_wire_count, 1); + vm_page_free(m); + } + vm_page_unlock_queues(); + PMAP_LOCK_DESTROY(pmap); +} + +static int +kvm_size(SYSCTL_HANDLER_ARGS) +{ + unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; + + return sysctl_handle_long(oidp, &ksize, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_size, "IU", "Size of KVM"); + +static int +kvm_free(SYSCTL_HANDLER_ARGS) +{ + unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; + + return sysctl_handle_long(oidp, &kfree, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_free, "IU", "Amount of KVM free"); + +/* + * grow the number of kernel page table entries, if needed + */ +void +pmap_growkernel(vm_offset_t addr) +{ + struct pmap *pmap; + vm_paddr_t ptppaddr; + vm_page_t nkpg; + pd_entry_t newpdir; + + mtx_assert(&kernel_map->system_mtx, MA_OWNED); + if (kernel_vm_end == 0) { + kernel_vm_end = KERNBASE; + nkpt = 0; + while (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + nkpt++; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + } + } + addr = roundup2(addr, PAGE_SIZE * NPTEPG); + if (addr - 1 >= kernel_map->max_offset) + addr = kernel_map->max_offset; + while (kernel_vm_end < addr) { + if (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + continue; + } + + nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, + VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); + if (nkpg == NULL) + panic("pmap_growkernel: no memory to grow kernel"); + + nkpt++; + + pmap_zero_page(nkpg); + ptppaddr = VM_PAGE_TO_PHYS(nkpg); + newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); + vm_page_lock_queues(); + PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(pmap, &allpmaps, pm_list) { + PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); + } + 
mtx_unlock_spin(&allpmaps_lock);
+	vm_page_unlock_queues();
+
+	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
+	if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+		kernel_vm_end = kernel_map->max_offset;
+		break;
+	}
+	}
+}
+
+
+/***************************************************
+ * page management routines.
+ ***************************************************/
+
+/*
+ * free the pv_entry back to the free list
+ */
+static PMAP_INLINE void
+free_pv_entry(pv_entry_t pv)
+{
+	pv_entry_count--;
+	uma_zfree(pvzone, pv);
+}
+
+/*
+ * get a new pv_entry, allocating a block from the system
+ * when needed.
+ * the memory allocation is performed bypassing the malloc code
+ * because of the possibility of allocations at interrupt time.
+ */
+static pv_entry_t
+get_pv_entry(void)
+{
+	pv_entry_count++;
+	if ((pv_entry_count > pv_entry_high_water) &&
+	    (pmap_pagedaemon_waken == 0)) {
+		pmap_pagedaemon_waken = 1;
+		wakeup (&vm_pages_needed);
+	}
+	return uma_zalloc(pvzone, M_NOWAIT);
+}
+
+
+static void
+pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
+{
+	pv_entry_t pv;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
+		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+			if (pmap == pv->pv_pmap && va == pv->pv_va)
+				break;
+		}
+	} else {
+		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
+			if (va == pv->pv_va)
+				break;
+		}
+	}
+	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
+	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+	m->md.pv_list_count--;
+	if (TAILQ_EMPTY(&m->md.pv_list))
+		vm_page_flag_clear(m, PG_WRITEABLE);
+	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
+	free_pv_entry(pv);
+}
+
+/*
+ * Create a pv entry for page at pa for
+ * (pmap, va).
+ */
+static void
+pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+	pv_entry_t pv;
+
+	pv = get_pv_entry();
+	if (pv == NULL)
+		panic("no pv entries: increase vm.pmap.shpgperproc");
+	pv->pv_va = va;
+	pv->pv_pmap = pmap;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
+	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+	m->md.pv_list_count++;
+}
+
+/*
+ * Conditionally create a pv entry.
+ */
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+	pv_entry_t pv;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if (pv_entry_count < pv_entry_high_water &&
+	    (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) {
+		pv_entry_count++;
+		pv->pv_va = va;
+		pv->pv_pmap = pmap;
+		TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+		m->md.pv_list_count++;
+		return (TRUE);
+	} else
+		return (FALSE);
+}
+
+/*
+ * pmap_remove_pte: do the things to unmap a page in a process
+ */
+static int
+pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
+{
+	pt_entry_t oldpte;
+	vm_page_t m;
+
+	CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x",
+	    pmap, (u_long)*ptq, va);
+
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+#ifdef XEN
+	oldpte = *ptq;
+	PT_SET_VA_MA(ptq, 0, TRUE);
+#else
+	oldpte = pte_load_clear(ptq);
+#endif
+	if (oldpte & PG_W)
+		pmap->pm_stats.wired_count -= 1;
+	/*
+	 * Machines that don't support invlpg also don't support
+	 * PG_G.
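+	 * A PG_G mapping also survives a %cr3 reload, which is why it
+	 * is flushed here with an explicit pmap_invalidate_page() on
+	 * the kernel pmap.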
+ */ + if (oldpte & PG_G) + pmap_invalidate_page(kernel_pmap, va); + pmap->pm_stats.resident_count -= 1; + if (oldpte & PG_MANAGED) { + m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte)); + if (oldpte & PG_M) { + KASSERT((oldpte & PG_RW), + ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", + va, (uintmax_t)oldpte)); + vm_page_dirty(m); + } + if (oldpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + pmap_remove_entry(pmap, m, va); + } + return (pmap_unuse_pt(pmap, va, free)); +} + +/* + * Remove a single page from a process address space + */ +static void +pmap_remove_page(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *pte; + vm_page_t free = NULL; + + CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x", + pmap, va); + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) + return; + pmap_remove_pte(pmap, pte, va, &free); + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); +} + +/* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. + */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + pt_entry_t *pte; + vm_page_t free = NULL; + int anyvalid; + + CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if ((sva + PAGE_SIZE == eva) && + ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { + pmap_remove_page(pmap, sva); + goto out; + } + + for (; sva < eva; sva = pdnxt) { + unsigned pdirindex; + + /* + * Calculate index for next page table. + */ + pdnxt = (sva + NBPDR) & ~PDRMASK; + if (pmap->pm_stats.resident_count == 0) + break; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = pmap->pm_pdir[pdirindex]; + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & PG_PS) != 0) { +#ifdef XEN + PT_SET_VA_MA(&pmap->pm_pdir[pdirindex], 0, TRUE); +#else + pmap->pm_pdir[pdirindex] = 0; +#endif + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + anyvalid = 1; + continue; + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. + */ + if (pdnxt > eva) + pdnxt = eva; + + for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, + sva += PAGE_SIZE) { + if (*pte == 0) + continue; + + /* + * The TLB entry for a PG_G mapping is invalidated + * by pmap_remove_pte(). + */ + if ((*pte & PG_G) == 0) + anyvalid = 1; + if (pmap_remove_pte(pmap, pte, sva, &free)) + break; + } + } + PT_UPDATES_FLUSH(); + +out: + if (anyvalid) { + pmap_invalidate_all(pmap); + pmap_free_zero_pages(free); + } + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * Routine: pmap_remove_all + * Function: + * Removes this physical page from + * all physical maps in which it resides. + * Reflects back modify bits to the pager. 
+ * + * Notes: + * Original versions of this routine were very + * inefficient because they iteratively called + * pmap_remove (slow...) + */ + +void +pmap_remove_all(vm_page_t m) +{ + register pv_entry_t pv; + pt_entry_t *pte, tpte; + vm_page_t free; + +#if defined(PMAP_DIAGNOSTIC) + /* + * XXX This makes pmap_remove_all() illegal for non-managed pages! + */ + if (m->flags & PG_FICTITIOUS) { + panic("pmap_remove_all: illegal for unmanaged page, va: 0x%jx", + VM_PAGE_TO_PHYS(m)); + } +#endif + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + sched_pin(); + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + PMAP_LOCK(pv->pv_pmap); + pv->pv_pmap->pm_stats.resident_count--; + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); +#if defined(XEN) + tpte = *pte; + PT_SET_VA_MA(pte, 0, TRUE); +#else + tpte = pte_load_clear(pte); +#endif + if (tpte & PG_W) + pv->pv_pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if (tpte & PG_M) { + KASSERT((tpte & PG_RW), + ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", + pv->pv_va, (uintmax_t)tpte)); + vm_page_dirty(m); + } + free = NULL; + pmap_unuse_pt(pv->pv_pmap, pv->pv_va, &free); + pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + pmap_free_zero_pages(free); + TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + m->md.pv_list_count--; + PMAP_UNLOCK(pv->pv_pmap); + free_pv_entry(pv); + } + vm_page_flag_clear(m, PG_WRITEABLE); + PT_UPDATES_FLUSH(); + sched_unpin(); +} + +/* + * Set the physical protection on the + * specified range of this map as requested. + */ +void +pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + pt_entry_t *pte; + int anychanged; + + CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x", + pmap, sva, eva, prot); + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + pmap_remove(pmap, sva, eva); + return; + } + + if (prot & VM_PROT_WRITE) + return; + + anychanged = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + for (; sva < eva; sva = pdnxt) { + unsigned pdirindex; + vm_paddr_t obits, pbits; + + pdnxt = (sva + NBPDR) & ~PDRMASK; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = pmap->pm_pdir[pdirindex]; + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & PG_PS) != 0) { + pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); + anychanged = 1; + continue; + } + + if (pdnxt > eva) + pdnxt = eva; + + for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, + sva += PAGE_SIZE) { + vm_page_t m; + +retry: + /* + * Regardless of whether a pte is 32 or 64 bits in + * size, PG_RW, PG_A, and PG_M are among the least + * significant 32 bits. 
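+			 * The native path below can therefore clear PG_RW
+			 * and PG_M with a 32-bit atomic_cmpset_int(); the
+			 * Xen path instead queues the update and re-reads
+			 * the PTE to detect a racing change.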
+ */ + obits = pbits = *pte; + if (pbits & PG_MANAGED) { +#ifdef XEN + pt_entry_t pteval = xpmap_mtop(*pte); +#else + pt_entry_t pteval = *pte; +#endif + + m = NULL; + if (pbits & PG_A) { + m = PHYS_TO_VM_PAGE(pteval); + + vm_page_flag_set(m, PG_REFERENCED); + pbits &= ~PG_A; + } + if ((pbits & PG_M) != 0) { + if (m == NULL) + m = PHYS_TO_VM_PAGE(pteval); + vm_page_dirty(m); + } + } + + pbits &= ~(PG_RW | PG_M); + + if (pbits != obits) { +#ifdef XEN + obits = *pte; + PT_SET_VA_MA(pte, pbits, TRUE); + if (*pte != pbits) + goto retry; +#else + if (!atomic_cmpset_int((u_int *)pte, obits, + pbits)) + goto retry; +#endif + if (obits & PG_G) + pmap_invalidate_page(pmap, sva); + else + anychanged = 1; + } + } + } + PT_UPDATES_FLUSH(); + if (anychanged) + pmap_invalidate_all(pmap); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +void +pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + boolean_t wired) +{ + vm_paddr_t pa; + pd_entry_t *pde; + register pt_entry_t *pte; + vm_paddr_t opa; + pt_entry_t origpte, newpte; + vm_page_t mpte, om; + boolean_t invlva; + + + CTR5(KTR_PMAP, + "pmap_enter: pmap=%08p va=0x%08x ma=0x%08x prot=0x%x wired=%d", + pmap, va, xpmap_ptom(VM_PAGE_TO_PHYS(m)), prot, wired); + va &= PG_FRAME; +#ifdef PMAP_DIAGNOSTIC + if (va > VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: toobig"); + if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) + panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); +#endif + + mpte = NULL; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + sched_pin(); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + mpte = pmap_allocpte(pmap, va, M_WAITOK); + } +#if 0 && defined(PMAP_DIAGNOSTIC) + else { + pd_entry_t *pdeaddr = pmap_pde(pmap, va); + origpte = *pdeaddr; + if ((origpte & PG_V) == 0) { + panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", + pmap->pm_pdir[PTDPTDI], origpte, va); + } + } +#endif + + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) + panic("pmap_enter: attempted pmap_enter on 4MB page"); + pte = pmap_pte_quick(pmap, va); + + /* + * Page Directory table entry not valid, we need a new PT page + */ + if (pte == NULL) { + panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", + (uintmax_t)pmap->pm_pdir[PTDPTDI], va); + } + + pa = VM_PAGE_TO_PHYS(m); + om = NULL; + + origpte = *pte; + if (origpte) + origpte = xpmap_mtop(origpte); + opa = origpte & PG_FRAME; + + /* + * Mapping has not changed, must be protection or wiring change. + */ + if (origpte && (opa == pa)) { + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. 
+ */ + if (wired && ((origpte & PG_W) == 0)) + pmap->pm_stats.wired_count++; + else if (!wired && (origpte & PG_W)) + pmap->pm_stats.wired_count--; + + /* + * Remove extra pte reference + */ + if (mpte) + mpte->wire_count--; + + /* + * We might be turning off write access to the page, + * so we go ahead and sense modify status. + */ + if (origpte & PG_MANAGED) { + om = m; + pa |= PG_MANAGED; + } + goto validate; + } + /* + * Mapping has changed, invalidate old range and fall through to + * handle validating new mapping. + */ + if (opa) { + if (origpte & PG_W) + pmap->pm_stats.wired_count--; + if (origpte & PG_MANAGED) { + om = PHYS_TO_VM_PAGE(opa); + pmap_remove_entry(pmap, om, va); + } + if (mpte != NULL) { + mpte->wire_count--; + KASSERT(mpte->wire_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%x", va)); + } + } else + pmap->pm_stats.resident_count++; + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, + ("pmap_enter: managed mapping within the clean submap")); + pmap_insert_entry(pmap, va, m); + pa |= PG_MANAGED; + } + + /* + * Increment counters + */ + if (wired) + pmap->pm_stats.wired_count++; + +validate: + /* + * Now validate mapping with desired protection/wiring. + */ + newpte = (pt_entry_t)(pa | PG_V); + if ((prot & VM_PROT_WRITE) != 0) + newpte |= PG_RW; + if (wired) + newpte |= PG_W; + if (va < VM_MAXUSER_ADDRESS) + newpte |= PG_U; + if (pmap == kernel_pmap) + newpte |= pgeflag; + + critical_enter(); + /* + * if the mapping or permission bits are different, we need + * to update the pte. + */ + if ((origpte & ~(PG_M|PG_A)) != newpte) { + if (origpte & PG_V) { + invlva = FALSE; +#ifdef XEN + origpte = *pte; + PT_SET_VA(pte, newpte | PG_A, FALSE); +#else + origpte = pte_load_store(pte, newpte | PG_A); +#endif + if (origpte & PG_A) { + if (origpte & PG_MANAGED) + vm_page_flag_set(om, PG_REFERENCED); + if (opa != VM_PAGE_TO_PHYS(m)) + invlva = TRUE; + } + if (origpte & PG_M) { + KASSERT((origpte & PG_RW), + ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", + va, (uintmax_t)origpte)); + if ((origpte & PG_MANAGED) != 0) + vm_page_dirty(om); + if ((prot & VM_PROT_WRITE) == 0) + invlva = TRUE; + } + if (invlva) + pmap_invalidate_page(pmap, va); + } else { +#ifdef XEN + PT_SET_VA(pte, newpte | PG_A, FALSE); +#else + pte_store(pte, newpte | PG_A); +#endif + } + } + PT_UPDATES_FLUSH(); + critical_exit(); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * Maps a sequence of resident pages belonging to the same object. + * The sequence begins with the given page m_start. This page is + * mapped at the given virtual address start. Each subsequent page is + * mapped at a virtual address that is offset from start by the same + * amount as the page is offset from m_start within the object. The + * last page in the sequence is the page with the largest offset from + * m_start that can be mapped at a virtual address less than the given + * virtual address end. Not every virtual page between start and end + * is mapped; only those for which a resident page exists with the + * corresponding offset from m_start are mapped. 
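+ *
+ * Page table pages are allocated with M_NOWAIT here (through
+ * pmap_enter_quick_locked()), so under memory pressure a mapping may
+ * simply be skipped rather than waited for.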
+ */ +void +pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, + vm_page_t m_start, vm_prot_t prot) +{ + vm_page_t m, mpte; + vm_pindex_t diff, psize; + + VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); + psize = atop(end - start); + mpte = NULL; + m = m_start; + PMAP_LOCK(pmap); + while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { + mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, + prot, mpte); + m = TAILQ_NEXT(m, listq); + } + PMAP_UNLOCK(pmap); +} + +/* + * this code makes some *MAJOR* assumptions: + * 1. Current pmap & pmap exists. + * 2. Not wired. + * 3. Read access. + * 4. No page table pages. + * but is *MUCH* faster than pmap_enter... + */ + +void +pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + + PMAP_LOCK(pmap); + (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); + PMAP_UNLOCK(pmap); +} + +static vm_page_t +pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte) +{ + pt_entry_t *pte; + vm_paddr_t pa; + vm_page_t free; + + pa = VM_PAGE_TO_PHYS(m); + pa = pa ? xpmap_ptom(pa) >> PAGE_SHIFT : 0; + + CTR4(KTR_PMAP, + "pmap_enter_quick_locked: pmap=%p va=0x%08x mfn=%d prot=0x%x", + pmap, va, pa, prot); + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, + ("pmap_enter_quick_locked: managed mapping within the clean submap")); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + unsigned ptepindex; + pd_entry_t ptepa; + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { + /* + * Get the page directory entry + */ + ptepa = pmap->pm_pdir[ptepindex]; + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. + */ + if (ptepa) { + ptepa = xpmap_mtop(ptepa); + if (ptepa & PG_PS) + panic("pmap_enter_quick: unexpected mapping into 4MB page"); + mpte = PHYS_TO_VM_PAGE(ptepa); + mpte->wire_count++; + } else { + mpte = _pmap_allocpte(pmap, ptepindex, + M_NOWAIT); + if (mpte == NULL) + return (mpte); + } + } + } else { + mpte = NULL; + } + + /* + * This call to vtopte makes the assumption that we are + * entering the page into the current pmap. In order to support + * quick entry into any pmap, one would likely use pmap_pte_quick. + * But that isn't as quick as vtopte. + */ + pte = vtopte(va); + if (*pte) { + if (mpte != NULL) { + mpte->wire_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. 
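pmap_enter_quick_locked() above caches the page-table page keyed on ptepindex, the index of the page-directory slot covering va. Worked through with concrete numbers for the non-PAE layout (PDRSHIFT is 22 there; it is 21 under PAE, which the XEN config enables):

/* Index arithmetic, non-PAE example values. */
vm_offset_t va = 0x0804f000;
unsigned ptepindex = va >> PDRSHIFT;			/* 32: which PT page */
unsigned pteindex = (va & PDRMASK) >> PAGE_SHIFT;	/* 0x4f: slot in it */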
+ */ + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m)) { + if (mpte != NULL) { + free = NULL; + if (pmap_unwire_pte_hold(pmap, mpte, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); + } + + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap->pm_stats.resident_count++; + + pa = VM_PAGE_TO_PHYS(m); + + /* + * Now validate mapping with RO protection + */ +#ifdef XEN + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + PT_SET_VA(pte, pa | PG_V | PG_U, TRUE); + else + PT_SET_VA(pte, pa | PG_V | PG_U | PG_MANAGED, TRUE); +#else + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + pte_store(pte, pa | PG_V | PG_U); + else + pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); +#endif + return (mpte); +} + +/* + * Make a temporary mapping for a physical address. This is only intended + * to be used for panic dumps. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + vm_offset_t va; + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); + invlpg(va); + return ((void *)crashdumpmap); +} + +/* + * This code maps large physical mmap regions into the + * processor address space. Note that some shortcuts + * are taken, but the code works. + */ +void +pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, + vm_size_t size) +{ + vm_page_t p; + + CTR5(KTR_PMAP, + "pmap_object_init_pt: pmap=%p addr=0x%08x object=%p pindex=%d size=%d", + pmap, addr, object, pindex, size); + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + KASSERT(object->type == OBJT_DEVICE, + ("pmap_object_init_pt: non-device object")); + if (pseflag && + ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { + int i; + vm_page_t m[1]; + unsigned int ptepindex; + int npdes; + pd_entry_t ptepa; + + PMAP_LOCK(pmap); + if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) + goto out; + PMAP_UNLOCK(pmap); +retry: + p = vm_page_lookup(object, pindex); + if (p != NULL) { + vm_page_lock_queues(); + if (vm_page_sleep_if_busy(p, FALSE, "init4p")) + goto retry; + } else { + p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); + if (p == NULL) + return; + m[0] = p; + + if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + return; + } + + p = vm_page_lookup(object, pindex); + vm_page_lock_queues(); + vm_page_wakeup(p); + } + vm_page_unlock_queues(); + pmap_zero_page(p); + + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & (NBPDR - 1)) + return; + + p->valid = VM_PAGE_BITS_ALL; + + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count += size >> PAGE_SHIFT; + npdes = size >> PDRSHIFT; + critical_enter(); + for (i = 0; i < npdes; i++) { +#ifdef XEN + int flags = PG_U | PG_RW | PG_V | PG_PS; +#else + int flags = PG_U | PG_V | PG_PS; +#endif + pde_store(&pmap->pm_pdir[ptepindex], + ptepa | flags); + ptepa += NBPDR; + ptepindex += 1; + } + pmap_invalidate_all(pmap); + critical_exit(); + out: + PMAP_UNLOCK(pmap); + } +} + +/* + * Routine: pmap_change_wiring + * Function: Change the wiring attribute for a map/virtual-address + * pair. + * In/out conditions: + * The mapping must already exist in the pmap. 
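pmap_kenter_temporary() above recycles the single crashdumpmap window, so only the most recently entered frame is mapped at any time. A hypothetical dump-time loop, assuming the dump_write() helper used by the machine-dependent dump code and the locals di, dumplo, pa and npages:

/* Copy a physical range out through the one-page crashdump window. */
for (i = 0; i < npages; i++) {
	void *kva = pmap_kenter_temporary(pa + ptoa(i), 0);

	error = dump_write(di, kva, 0, dumplo, PAGE_SIZE);
	if (error)
		break;
	dumplo += PAGE_SIZE;
}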
+ */ +void +pmap_change_wiring(pmap, va, wired) + register pmap_t pmap; + vm_offset_t va; + boolean_t wired; +{ + register pt_entry_t *pte; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, va); + + if (wired && !pmap_pte_w(pte)) { + PT_SET_VA_MA((pte), *(pte) | PG_W, TRUE); + pmap->pm_stats.wired_count++; + } else if (!wired && pmap_pte_w(pte)) { + PT_SET_VA_MA((pte), *(pte) & ~PG_W, TRUE); + pmap->pm_stats.wired_count--; + } + + /* + * Wiring is not a hardware characteristic so there is no need to + * invalidate TLB. + */ + pmap_pte_release(pte); + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); +} + + + +/* + * Copy the range specified by src_addr/len + * from the source map to the range dst_addr/len + * in the destination map. + * + * This routine is only advisory and need not do anything. + */ + +void +pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, + vm_offset_t src_addr) +{ + vm_page_t free; + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t pdnxt; + + if (dst_addr != src_addr) + return; + + if (!pmap_is_current(src_pmap)) + return; + + CTR5(KTR_PMAP, + "pmap_copy: dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x", + dst_pmap, src_pmap, dst_addr, len, src_addr); + + vm_page_lock_queues(); + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + sched_pin(); + for (addr = src_addr; addr < end_addr; addr = pdnxt) { + pt_entry_t *src_pte, *dst_pte; + vm_page_t dstmpte, srcmpte; + pd_entry_t srcptepaddr; + unsigned ptepindex; + + if (addr >= UPT_MIN_ADDRESS) + panic("pmap_copy: invalid to pmap_copy page tables"); + + pdnxt = (addr + NBPDR) & ~PDRMASK; + ptepindex = addr >> PDRSHIFT; + + srcptepaddr = src_pmap->pm_pdir[ptepindex]; + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & PG_PS) { + if (dst_pmap->pm_pdir[ptepindex] == 0) { + dst_pmap->pm_pdir[ptepindex] = srcptepaddr & + ~PG_W; + dst_pmap->pm_stats.resident_count += + NBPDR / PAGE_SIZE; + } + continue; + } + + srcmpte = MACH_TO_VM_PAGE(srcptepaddr); + if (srcmpte->wire_count == 0) + panic("pmap_copy: source page table page is unused"); + + if (pdnxt > end_addr) + pdnxt = end_addr; + + src_pte = vtopte(addr); + while (addr < pdnxt) { + pt_entry_t ptetemp; + ptetemp = *src_pte; + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + dstmpte = pmap_allocpte(dst_pmap, addr, + M_NOWAIT); + if (dstmpte == NULL) + break; + dst_pte = pmap_pte_quick(dst_pmap, addr); + if (*dst_pte == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, + MACH_TO_VM_PAGE(ptetemp & PG_FRAME))) { + /* + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. + */ + PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | + PG_A), FALSE); + dst_pmap->pm_stats.resident_count++; + } else { + free = NULL; + if (pmap_unwire_pte_hold( dst_pmap, + dstmpte, &free)) { + pmap_invalidate_page(dst_pmap, + addr); + pmap_free_zero_pages(free); + } + } + if (dstmpte->wire_count >= srcmpte->wire_count) + break; + } + addr += PAGE_SIZE; + src_pte++; + } + } + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +/* + * pmap_zero_page zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. 
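pmap_copy() above takes both pmap locks before walking the source page tables; acquiring them in pointer order is what keeps two simultaneous copies between the same pair of address spaces from deadlocking. The idiom in isolation:

/* Sketch: acquire two pmap locks in a global (address) order. */
static void
sketch_lock_pair(pmap_t a, pmap_t b)
{
	if (a < b) {
		PMAP_LOCK(a);
		PMAP_LOCK(b);
	} else {
		PMAP_LOCK(b);
		PMAP_LOCK(a);
	}
}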
+ */ +void +pmap_zero_page(vm_page_t m) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); +#else + *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; +#endif + invlcaddr(sysmaps->CADDR2); + pagezero(sysmaps->CADDR2); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR2, 0); +#else + *sysmaps->CMAP2 = 0; +#endif + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * pmap_zero_page_area zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + * + * off and size may not cover an area beyond a single hardware page. + */ +void +pmap_zero_page_area(vm_page_t m, int off, int size) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); +#else + *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; +#endif + invlcaddr(sysmaps->CADDR2); + if (off == 0 && size == PAGE_SIZE) + pagezero(sysmaps->CADDR2); + else + bzero((char *)sysmaps->CADDR2 + off, size); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR2, 0); +#else + *sysmaps->CMAP2 = 0; +#endif + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * pmap_zero_page_idle zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. This + * is intended to be called from the vm_pagezero process only and + * outside of Giant. + */ +void +pmap_zero_page_idle(vm_page_t m) +{ + + if (*CMAP3) + panic("pmap_zero_page: CMAP3 busy"); + sched_pin(); +#ifdef XEN + PT_SET_MA(CADDR3, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); +#else + *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; +#endif + invlcaddr(CADDR3); + pagezero(CADDR3); +#ifdef XEN + PT_SET_MA(CADDR3, 0); +#else + *CMAP3 = 0; +#endif + sched_unpin(); +} + +/* + * pmap_copy_page copies the specified (machine independent) + * page by mapping the page into virtual memory and using + * bcopy to copy the page, one machine dependent page at a + * time. + */ +void +pmap_copy_page(vm_page_t src, vm_page_t dst) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1) + panic("pmap_copy_page: CMAP1 busy"); + if (*sysmaps->CMAP2) + panic("pmap_copy_page: CMAP2 busy"); + sched_pin(); + invlpg((u_int)sysmaps->CADDR1); + invlpg((u_int)sysmaps->CADDR2); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR1, PG_V | xpmap_ptom(VM_PAGE_TO_PHYS(src)) | PG_A); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(dst)) | PG_A | PG_M); + +#else + *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; + *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; +#endif + bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); +#ifdef XEN + PT_SET_MA(sysmaps->CADDR1, 0); + PT_SET_MA(sysmaps->CADDR2, 0); +#else + *sysmaps->CMAP1 = 0; + *sysmaps->CMAP2 = 0; +#endif + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. 
This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +boolean_t +pmap_page_exists_quick(pmap, m) + pmap_t pmap; + vm_page_t m; +{ + pv_entry_t pv; + int loops = 0; + + if (m->flags & PG_FICTITIOUS) + return (FALSE); + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pv->pv_pmap == pmap) { + return TRUE; + } + loops++; + if (loops >= 16) + break; + } + return (FALSE); +} + +#define PMAP_REMOVE_PAGES_CURPROC_ONLY +/* + * Remove all pages from specified address space + * this aids process exit speeds. Also, this code + * is special cased for current process only, but + * can have the more generic (and slightly slower) + * mode enabled. This is much faster than pmap_remove + * in the case of running down an entire address space. + */ +void +pmap_remove_pages(pmap, sva, eva) + pmap_t pmap; + vm_offset_t sva, eva; +{ + pt_entry_t *pte, tpte; + vm_page_t m, free = NULL; + pv_entry_t pv, npv; + + CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); +#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY + if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { + printf("warning: pmap_remove_pages called with non-current pmap\n"); + return; + } +#endif + vm_page_lock_queues(); + KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); + PMAP_LOCK(pmap); + sched_pin(); + for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { + + if (pv->pv_va >= eva || pv->pv_va < sva) { + npv = TAILQ_NEXT(pv, pv_plist); + continue; + } + +#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY + pte = vtopte(pv->pv_va); +#else + pte = pmap_pte_quick(pmap, pv->pv_va); +#endif + tpte = *pte ? xpmap_mtop(*pte) : 0; + + if (tpte == 0) { + printf("TPTE at %p IS ZERO @ VA %08x\n", + pte, pv->pv_va); + panic("bad pte"); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + npv = TAILQ_NEXT(pv, pv_plist); + continue; + } + + m = PHYS_TO_VM_PAGE(tpte); + KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); + + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); + + pmap->pm_stats.resident_count--; + +#ifdef XEN + PT_SET_VA_MA(pte, 0, FALSE); +#else + pte_clear(pte); +#endif + /* + * Update the vm_page_t clean and reference bits. + */ + if (tpte & PG_M) { + vm_page_dirty(m); + } + + npv = TAILQ_NEXT(pv, pv_plist); + TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); + + m->md.pv_list_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + + pmap_unuse_pt(pmap, pv->pv_va, &free); + free_pv_entry(pv); + } + PT_UPDATES_FLUSH(); + sched_unpin(); + pmap_invalidate_all(pmap); + pmap_free_zero_pages(free); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. 
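pmap_remove_pages() above is only safe on the current pmap because it uses vtopte(), which indexes the recursive page-table window and therefore always reflects whatever page directory is live in %cr3. Its conventional definition in the i386 pmap headers is, in sketch form:

/* PTmap is the recursive self-mapping of the current page tables. */
#define	sketch_vtopte(va)	(PTmap + i386_btop(va))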
+ */
+boolean_t
+pmap_is_modified(vm_page_t m)
+{
+	pv_entry_t pv;
+	pt_entry_t *pte;
+	boolean_t rv;
+
+	rv = FALSE;
+	if (m->flags & PG_FICTITIOUS)
+		return (rv);
+
+	sched_pin();
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		PMAP_LOCK(pv->pv_pmap);
+		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+		rv = (*pte & PG_M) != 0;
+		PMAP_UNLOCK(pv->pv_pmap);
+		if (rv)
+			break;
+	}
+	sched_unpin();
+	return (rv);
+}
+
+/*
+ * pmap_is_prefaultable:
+ *
+ *	Return whether or not the specified virtual address is eligible
+ *	for prefault.
+ */
+boolean_t
+pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
+{
+	pt_entry_t *pte;
+	boolean_t rv;
+
+	rv = FALSE;
+#ifdef XEN
+	/*
+	 * disable prefaulting to start off
+	 */
+	return (rv);
+#endif
+	PMAP_LOCK(pmap);
+	if (*pmap_pde(pmap, addr)) {
+		pte = vtopte(addr);
+		rv = *pte == 0;
+	}
+	PMAP_UNLOCK(pmap);
+	return (rv);
+}
+
+
+void
+pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len)
+{
+	int i, npages = round_page(len) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		pt_entry_t *pte;
+		pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE));
+		pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M)));
+		PMAP_MARK_PRIV(xpmap_mtop(*pte));
+		pmap_pte_release(pte);
+	}
+}
+
+void
+pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len)
+{
+	int i, npages = round_page(len) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		pt_entry_t *pte;
+		pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE));
+		PMAP_MARK_UNPRIV(xpmap_mtop(*pte));
+		pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M));
+		pmap_pte_release(pte);
+	}
+}
+
+/*
+ * Clear the given bit in each of the given page's ptes.  The bit is
+ * expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
+ * size, only a bit within the least significant 32 can be cleared.
+ */
+static __inline void
+pmap_clear_ptes(vm_page_t m, int bit)
+{
+	register pv_entry_t pv;
+	pt_entry_t pbits, *pte;
+
+	if ((m->flags & PG_FICTITIOUS) ||
+	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
+		return;
+
+	sched_pin();
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	/*
+	 * Loop over all current mappings, setting/clearing as appropriate.
+	 * If setting RO do we need to clear the VAC?
+	 */
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		PMAP_LOCK(pv->pv_pmap);
+		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+retry:
+		pbits = *pte;
+		if (pbits & bit) {
+			if (bit == PG_RW) {
+				/*
+				 * Regardless of whether a pte is 32 or 64 bits
+				 * in size, PG_RW and PG_M are among the least
+				 * significant 32 bits.
+				 */
+#ifdef XEN
+				PT_SET_VA_MA(pte, (pbits & ~(PG_RW|PG_M)), TRUE);
+				if (*pte != (pbits & ~(PG_RW|PG_M)))
+					goto retry;
+#else
+				if (!atomic_cmpset_int((u_int *)pte, pbits,
+				    pbits & ~(PG_RW | PG_M)))
+					goto retry;
+#endif
+				if (pbits & PG_M) {
+					vm_page_dirty(m);
+				}
+			} else {
+#ifdef XEN
+				PT_SET_VA_MA(pte, pbits & ~bit, TRUE);
+#else
+				atomic_clear_int((u_int *)pte, bit);
+#endif
+			}
+			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pv->pv_pmap);
+	}
+	if (bit == PG_RW)
+		vm_page_flag_clear(m, PG_WRITEABLE);
+	sched_unpin();
+}
+
+/*
+ * pmap_page_protect:
+ *
+ *	Lower the permission for all mappings to a given page.
+ */
+void
+pmap_page_protect(vm_page_t m, vm_prot_t prot)
+{
+	if ((prot & VM_PROT_WRITE) == 0) {
+		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
+			pmap_clear_ptes(m, PG_RW);
+		} else {
+			pmap_remove_all(m);
+		}
+	}
+}
+
+/*
+ * pmap_ts_referenced:
+ *
+ *	Return a count of reference bits for a page, clearing those bits.
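pmap_clear_ptes() above cannot compare-and-swap a live PTE under Xen, so the XEN arm queues the cleared value and then re-reads the entry, retrying when the readback shows a racing update. The same loop in isolation (pbits and pte as in the surrounding function):

/* Queue-and-verify in place of a cmpxchg loop on a PV page table. */
do {
	pbits = *pte;
	PT_SET_VA_MA(pte, pbits & ~(PG_RW | PG_M), TRUE);
} while (*pte != (pbits & ~(PG_RW | PG_M)));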
+ * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. + */ +int +pmap_ts_referenced(vm_page_t m) +{ + register pv_entry_t pv, pvf, pvn; + pt_entry_t *pte; + pt_entry_t v; + int rtval = 0; + + if (m->flags & PG_FICTITIOUS) + return (rtval); + + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + + pvf = pv; + + do { + pvn = TAILQ_NEXT(pv, pv_list); + + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + + PMAP_LOCK(pv->pv_pmap); + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + + if (pte && ((v = *pte) & PG_A) != 0) { +#ifdef XEN + PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); +#else + atomic_clear_int((u_int *)pte, PG_A); +#endif + pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + + rtval++; + if (rtval > 4) { + PMAP_UNLOCK(pv->pv_pmap); + break; + } + } + PMAP_UNLOCK(pv->pv_pmap); + } while ((pv = pvn) != NULL && pv != pvf); + } + PT_UPDATES_FLUSH(); + sched_unpin(); + + return (rtval); +} + +/* + * Clear the modify bits on the specified physical page. + */ +void +pmap_clear_modify(vm_page_t m) +{ + pmap_clear_ptes(m, PG_M); +} + +/* + * pmap_clear_reference: + * + * Clear the reference bit on the specified physical page. + */ +void +pmap_clear_reference(vm_page_t m) +{ + pmap_clear_ptes(m, PG_A); +} + +/* + * Miscellaneous support routines follow + */ + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. 
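A hypothetical use of the device-mapping routines defined just below; the physical base address is illustrative only:

/* Map a 4 KB MMIO register window, read one register, tear it down. */
volatile uint32_t *regs;
uint32_t val;

regs = pmap_mapdev(0xfee00000, PAGE_SIZE);	/* example MMIO base */
val = regs[2];					/* 32-bit register at offset 8 */
pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);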
+ */
+void *
+pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
+{
+	vm_offset_t va, tmpva, offset;
+
+	offset = pa & PAGE_MASK;
+	size = roundup(offset + size, PAGE_SIZE);
+	pa = pa & PG_FRAME;
+
+	if (pa < KERNLOAD && pa + size <= KERNLOAD)
+		va = KERNBASE + pa;
+	else
+		va = kmem_alloc_nofault(kernel_map, size);
+	if (!va)
+		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
+
+	for (tmpva = va; size > 0; ) {
+		pmap_kenter_attr(tmpva, pa, mode);
+		size -= PAGE_SIZE;
+		tmpva += PAGE_SIZE;
+		pa += PAGE_SIZE;
+	}
+	pmap_invalidate_range(kernel_pmap, va, tmpva);
+	pmap_invalidate_cache();
+	return ((void *)(va + offset));
+}
+
+void *
+pmap_mapdev(vm_paddr_t pa, vm_size_t size)
+{
+
+	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
+}
+
+void *
+pmap_mapbios(vm_paddr_t pa, vm_size_t size)
+{
+
+	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
+}
+
+void
+pmap_unmapdev(va, size)
+	vm_offset_t va;
+	vm_size_t size;
+{
+	vm_offset_t base, offset, tmpva;
+
+	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
+		return;
+	base = va & PG_FRAME;
+	offset = va & PAGE_MASK;
+	size = roundup(offset + size, PAGE_SIZE);
+	critical_enter();
+	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
+		pmap_kremove(tmpva);
+	pmap_invalidate_range(kernel_pmap, va, tmpva);
+	critical_exit();
+	kmem_free(kernel_map, base, size);
+}
+
+int
+pmap_change_attr(va, size, mode)
+	vm_offset_t va;
+	vm_size_t size;
+	int mode;
+{
+	vm_offset_t base, offset, tmpva;
+	pt_entry_t *pte;
+	vm_paddr_t opte, npte;
+	pd_entry_t *pde;
+
+	base = va & PG_FRAME;
+	offset = va & PAGE_MASK;
+	size = roundup(offset + size, PAGE_SIZE);
+
+	/* Only supported on kernel virtual addresses. */
+	if (base <= VM_MAXUSER_ADDRESS)
+		return (EINVAL);
+
+	/* 4MB pages and pages that aren't mapped aren't supported. */
+	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
+		pde = pmap_pde(kernel_pmap, tmpva);
+		if (*pde & PG_PS)
+			return (EINVAL);
+		if (*pde == 0)
+			return (EINVAL);
+		pte = vtopte(tmpva);
+		if (*pte == 0)
+			return (EINVAL);
+	}
+
+	/*
+	 * Ok, all the pages exist and are 4k, so run through them updating
+	 * their cache mode.
+	 */
+	for (tmpva = base; size > 0; ) {
+		pte = vtopte(tmpva);
+
+		/*
+		 * The cache mode bits are all in the low 32-bits of the
+		 * PTE, so we can just spin on updating the low 32-bits.
+		 */
+		do {
+			opte = *pte;
+			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
+			npte |= pmap_cache_bits(mode, 0);
+#ifdef XEN
+			PT_SET_VA_MA(pte, npte, TRUE);
+#endif
+		}
+#ifdef XEN
+		while (npte != opte && (*pte != npte));
+#else
+		while (npte != opte &&
+		    !atomic_cmpset_int((u_int *)pte, opte, npte));
+#endif
+		tmpva += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	/*
+	 * Flush CPU caches to make sure any data isn't cached that shouldn't
+	 * be, etc.
+	 */
+	pmap_invalidate_range(kernel_pmap, base, tmpva);
+	pmap_invalidate_cache();
+	return (0);
+}
+
+/*
+ * perform the pmap work for mincore
+ */
+int
+pmap_mincore(pmap, addr)
+	pmap_t pmap;
+	vm_offset_t addr;
+{
+	pt_entry_t *ptep, pte;
+	vm_page_t m;
+	int val = 0;
+
+	PMAP_LOCK(pmap);
+	ptep = pmap_pte(pmap, addr);
+	pte = (ptep != NULL) ?
PT_GET(ptep) : 0; + pmap_pte_release(ptep); + PMAP_UNLOCK(pmap); + + if (pte != 0) { + vm_paddr_t pa; + + val = MINCORE_INCORE; + if ((pte & PG_MANAGED) == 0) + return val; + + pa = pte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + + /* + * Modified by us + */ + if (pte & PG_M) + val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; + else { + /* + * Modified by someone else + */ + vm_page_lock_queues(); + if (m->dirty || pmap_is_modified(m)) + val |= MINCORE_MODIFIED_OTHER; + vm_page_unlock_queues(); + } + /* + * Referenced by us + */ + if (pte & PG_A) + val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; + else { + /* + * Referenced by someone else + */ + vm_page_lock_queues(); + if ((m->flags & PG_REFERENCED) || + pmap_ts_referenced(m)) { + val |= MINCORE_REFERENCED_OTHER; + vm_page_flag_set(m, PG_REFERENCED); + } + vm_page_unlock_queues(); + } + } + return val; +} + +void +pmap_activate(struct thread *td) +{ + pmap_t pmap, oldpmap; + u_int32_t cr3; + + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + oldpmap = PCPU_GET(curpmap); +#if defined(SMP) + atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); +#else + oldpmap->pm_active &= ~1; + pmap->pm_active |= 1; +#endif +#ifdef PAE + cr3 = vtophys(pmap->pm_pdpt); +#else + cr3 = vtophys(pmap->pm_pdir); +#endif + /* + * pmap_activate is for the current thread on the current cpu + */ + td->td_pcb->pcb_cr3 = cr3; + PT_UPDATES_FLUSH(); + load_cr3(cr3); + PCPU_SET(curpmap, pmap); + critical_exit(); +} + +vm_offset_t +pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) +{ + + if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { + return addr; + } + + addr = (addr + PDRMASK) & ~PDRMASK; + return addr; +} + + +#if defined(PMAP_DEBUG) +pmap_pid_dump(int pid) +{ + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = vmspace_pmap(p->p_vmspace); + for (i = 0; i < NPDEPTD; i++) { + pd_entry_t *pde; + pt_entry_t *pte; + vm_offset_t base = i << PDRSHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for (j = 0; j < NPTEPG; j++) { + vm_offset_t va = base + (j << PAGE_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + sx_sunlock(&allproc_lock); + return npte; + } + pte = pmap_pte(pmap, va); + if (pte && pmap_pte_v(pte)) { + pt_entry_t pa; + vm_page_t m; + pa = PT_GET(pte); + m = PHYS_TO_VM_PAGE(pa); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + sx_sunlock(&allproc_lock); + return npte; +} +#endif + +#if defined(DEBUG) + +static void pads(pmap_t pm); +void pmap_pvdump(vm_paddr_t pa); + +/* print address space of pmap*/ +static void +pads(pmap_t pm) +{ + int i, j; + vm_paddr_t va; + pt_entry_t *ptep; + + if (pm == kernel_pmap) + return; + for (i = 0; i < NPDEPTD; i++) + if (pm->pm_pdir[i]) + for (j = 0; j < NPTEPG; j++) { + va = (i << PDRSHIFT) + (j << PAGE_SHIFT); + if (pm == kernel_pmap && va < KERNBASE) + continue; + if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) + continue; + ptep = pmap_pte(pm, va); + if (pmap_pte_v(ptep)) + printf("%x:%x ", va, *ptep); + }; + +} + +void +pmap_pvdump(vm_paddr_t pa) +{ + pv_entry_t pv; + vm_page_t m; + + printf("pa 
%x", pa); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); + pads(pv->pv_pmap); + } + printf(" "); +} +#endif diff --git a/sys/i386/xen/xen_bus.c b/sys/i386/xen/xen_bus.c new file mode 100644 index 0000000..701a1a8 --- /dev/null +++ b/sys/i386/xen/xen_bus.c @@ -0,0 +1,238 @@ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/resource.h> + +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen_intr.h> + +static MALLOC_DEFINE(M_XENDEV, "xenintrdrv", "xen system device"); + +struct xenbus_device { + struct resource_list xen_resources; +}; + +#define DEVTOXEN(dev) ((struct xenbus_device *)device_get_ivars(dev)) + +static void xenbus_identify(driver_t *, device_t); +static int xenbus_probe(device_t); +static int xenbus_attach(device_t); +static int xenbus_print_child(device_t, device_t); +static device_t xenbus_add_child(device_t bus, int order, const char *name, + int unit); +static struct resource *xenbus_alloc_resource(device_t, device_t, int, int *, + u_long, u_long, u_long, u_int); +static int xenbus_release_resource(device_t, device_t, int, int, + struct resource *); +static int xenbus_set_resource(device_t, device_t, int, int, u_long, u_long); +static int xenbus_get_resource(device_t, device_t, int, int, u_long *, u_long *); +static void xenbus_delete_resource(device_t, device_t, int, int); + + +static device_method_t xenbus_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xenbus_identify), + DEVMETHOD(device_probe, xenbus_probe), + DEVMETHOD(device_attach, xenbus_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_print_child, xenbus_print_child), + DEVMETHOD(bus_add_child, xenbus_add_child), + DEVMETHOD(bus_read_ivar, bus_generic_read_ivar), + DEVMETHOD(bus_write_ivar, bus_generic_write_ivar), + DEVMETHOD(bus_set_resource, xenbus_set_resource), + DEVMETHOD(bus_get_resource, xenbus_get_resource), + DEVMETHOD(bus_alloc_resource, xenbus_alloc_resource), + DEVMETHOD(bus_release_resource, xenbus_release_resource), + DEVMETHOD(bus_delete_resource, xenbus_delete_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + + { 0, 0 } +}; + + +static driver_t xenbus_driver = { + "xenbus", + xenbus_methods, + 1, /* no softc */ +}; +static devclass_t xenbus_devclass; +static device_t xenbus_dev; +static boolean_t xenbus_probe_delay = TRUE; /* delay child probes */ + +DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0); + +static void +xenbus_identify(driver_t *driver, device_t parent) +{ + + /* + * Add child device with order of 0 so it gets probed + * first + */ + xenbus_dev = BUS_ADD_CHILD(parent, 0, "xenbus", 0); + if (xenbus_dev == NULL) + panic("xenbus: could not attach"); +} + +static int +xenbus_probe(device_t dev) +{ + device_set_desc(dev, "xen system"); + 
device_quiet(dev); + return (0); +} + +static int +xenbus_attach(device_t dev) +{ + /* + * First, let our child driver's identify any child devices that + * they can find. Once that is done attach any devices that we + * found. + */ + if (!xenbus_probe_delay) { + bus_generic_probe(dev); + bus_generic_attach(dev); + } + + return 0; +} + + +static int +xenbus_print_all_resources(device_t dev) +{ + struct xenbus_device *xdev = device_get_ivars(dev); + struct resource_list *rl = &xdev->xen_resources; + int retval = 0; + + if (STAILQ_FIRST(rl)) + retval += printf(" at"); + + retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx"); + retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx"); + retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld"); + + return retval; +} + + +static int +xenbus_print_child(device_t bus, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(bus, child); + retval += xenbus_print_all_resources(child); + retval += printf(" on motherboard\n"); /* XXX "motherboard", ick */ + + return (retval); +} + +static device_t +xenbus_add_child(device_t bus, int order, const char *name, int unit) +{ + device_t child; + struct xenbus_device *xendev; + + xendev = malloc(sizeof(struct xenbus_device), M_XENDEV, + M_NOWAIT | M_ZERO); + if (!xendev) + return(0); + resource_list_init(&xendev->xen_resources); + + child = device_add_child_ordered(bus, order, name, unit); + + /* should we free this in xenbus_child_detached? */ + device_set_ivars(child, xendev); + + return(child); +} + +static struct resource * +xenbus_alloc_resource(device_t bus, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + return (resource_list_alloc(rl, bus, child, type, rid, start, end, + count, flags)); +} + + +static int +xenbus_release_resource(device_t bus, device_t child, int type, int rid, + struct resource *r) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + return (resource_list_release(rl, bus, child, type, rid, r)); +} + +static int +xenbus_set_resource(device_t dev, device_t child, int type, int rid, + u_long start, u_long count) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + resource_list_add(rl, type, rid, start, start + count - 1, count); + return(0); +} + +static int +xenbus_get_resource(device_t dev, device_t child, int type, int rid, + u_long *startp, u_long *countp) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + struct resource_list_entry *rle; + + rle = resource_list_find(rl, type, rid); + if (!rle) + return(ENOENT); + if (startp) + *startp = rle->start; + if (countp) + *countp = rle->count; + return(0); +} + +static void +xenbus_delete_resource(device_t dev, device_t child, int type, int rid) +{ + struct xenbus_device *xendev = DEVTOXEN(child); + struct resource_list *rl = &xendev->xen_resources; + + resource_list_delete(rl, type, rid); +} + +static void +xenbus_init(void *unused) +{ + xenbus_probe_delay = FALSE; + xenbus_attach(xenbus_dev); +} +SYSINIT(xenbusdev, SI_SUB_PSEUDO, SI_ORDER_FIRST, xenbus_init, NULL); diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c new file mode 100644 index 0000000..2e28304 --- /dev/null +++ b/sys/i386/xen/xen_machdep.c @@ -0,0 +1,1277 @@ +/* + * + * 
Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysproto.h> + + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> + +#include <machine/segments.h> +#include <machine/pcb.h> +#include <machine/stdarg.h> +#include <machine/vmparam.h> +#include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/asmacros.h> + + + + +#include <machine/xen/hypervisor.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#include <machine/xen/xenpmap.h> +#include <machine/xen/xenbus.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/memory.h> +#include <machine/xen/features.h> +#ifdef SMP +#include <machine/privatespace.h> +#endif + +#define IDTVEC(name) __CONCAT(X,name) + +extern inthand_t +IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), + IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), + IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), + IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + +int xendebug_flags; +start_info_t *xen_start_info; +shared_info_t *HYPERVISOR_shared_info; +xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; +xen_pfn_t *xen_phys_machine; +int preemptable, init_first; +extern unsigned int avail_space; +extern int gdt_set; + +void ni_cli(void); +void ni_sti(void); + + +void +ni_cli(void) +{ + __asm__("pushl %edx;" + "pushl %eax;" + ); + __cli(); + __asm__("popl %eax;" + "popl %edx;" + ); +} + + +void +ni_sti(void) +{ + __asm__("pushl %edx;" + "pushl %esi;" + "pushl %eax;" + ); + __sti(); + __asm__("popl %eax;" + "popl %esi;" + "popl %edx;" + ); 
+} + +/* + * Modify the cmd_line by converting ',' to NULLs so that it is in a format + * suitable for the static env vars. + */ +char * +xen_setbootenv(char *cmd_line) +{ + char *cmd_line_next; + + /* Skip leading spaces */ + for (; *cmd_line == ' '; cmd_line++); + + printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line); + + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); + return cmd_line; +} + +static struct +{ + const char *ev; + int mask; +} howto_names[] = { + {"boot_askname", RB_ASKNAME}, + {"boot_single", RB_SINGLE}, + {"boot_nosync", RB_NOSYNC}, + {"boot_halt", RB_ASKNAME}, + {"boot_serial", RB_SERIAL}, + {"boot_cdrom", RB_CDROM}, + {"boot_gdb", RB_GDB}, + {"boot_gdb_pause", RB_RESERVED1}, + {"boot_verbose", RB_VERBOSE}, + {"boot_multicons", RB_MULTIPLE}, + {NULL, 0} +}; + +int +xen_boothowto(char *envp) +{ + int i, howto = 0; + + /* get equivalents from the environment */ + for (i = 0; howto_names[i].ev != NULL; i++) + if (getenv(howto_names[i].ev) != NULL) + howto |= howto_names[i].mask; + return howto; +} + +#define PRINTK_BUFSIZE 1024 +void +printk(const char *fmt, ...) +{ + __va_list ap; + int retval; + static char buf[PRINTK_BUFSIZE]; + + va_start(ap, fmt); + retval = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap); + va_end(ap); + buf[retval] = 0; + (void)HYPERVISOR_console_write(buf, retval); +} + + +#define XPQUEUE_SIZE 128 +#ifdef SMP +/* per-cpu queues and indices */ +static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE]; +static int xpq_idx[MAX_VIRT_CPUS]; + +#define XPQ_QUEUE xpq_queue[vcpu] +#define XPQ_IDX xpq_idx[vcpu] +#define SET_VCPU() int vcpu = smp_processor_id() +#else +struct mmu_log { + char *file; + int line; +}; + +static mmu_update_t xpq_queue[XPQUEUE_SIZE]; +static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; +static int xpq_idx = 0; + +#define XPQ_QUEUE xpq_queue +#define XPQ_IDX xpq_idx +#define SET_VCPU() +#endif +#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); + +#if 0 +static void +xen_dump_queue(void) +{ + int _xpq_idx = XPQ_IDX; + int i; + + if (_xpq_idx <= 1) + return; + + printk("xen_dump_queue(): %u entries\n", _xpq_idx); + for (i = 0; i < _xpq_idx; i++) { + printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + } +} +#endif + + +static __inline void +_xen_flush_queue(void) +{ + SET_VCPU(); + int _xpq_idx = XPQ_IDX; + int error, i; + /* window of vulnerability here? */ + + if (__predict_true(gdt_set)) + critical_enter(); + XPQ_IDX = 0; + /* Make sure index is cleared first to avoid double updates. 
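Each queue element is an (mmu_update_t){ptr, val} pair: ptr holds the machine address of the PTE with the command encoded in its low bits, val the new entry; _xen_flush_queue() then pays the hypercall cost once for the whole batch. A single unbatched update, for comparison (ptep and new_ma are assumed inputs):

/* One raw update, equivalent to a queue entry plus immediate flush. */
mmu_update_t u;

u.ptr = vtomach(ptep) | MMU_NORMAL_PT_UPDATE;	/* machine address of PTE */
u.val = new_ma;					/* new machine-framed entry */
if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
	panic("mmu_update failed");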
*/ + error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE, + _xpq_idx, NULL, DOMID_SELF); + +#if 0 + if (__predict_true(gdt_set)) + for (i = _xpq_idx; i > 0;) { + if (i >= 3) { + CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), (XPQ_QUEUE[i-2].ptr & 0xffffffff), + (XPQ_QUEUE[i-3].val & 0xffffffff), (XPQ_QUEUE[i-3].ptr & 0xffffffff)); + i -= 3; + } else if (i == 2) { + CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), (XPQ_QUEUE[i-2].ptr & 0xffffffff)); + i = 0; + } else { + CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff)); + i = 0; + } + } +#endif + if (__predict_true(gdt_set)) + critical_exit(); + if (__predict_false(error < 0)) { + for (i = 0; i < _xpq_idx; i++) + printf("val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + panic("Failed to execute MMU updates: %d", error); + } + +} + +void +xen_flush_queue(void) +{ + SET_VCPU(); + if (XPQ_IDX != 0) _xen_flush_queue(); +} + +static __inline void +xen_increment_idx(void) +{ + SET_VCPU(); + + XPQ_IDX++; + if (__predict_false(XPQ_IDX == XPQUEUE_SIZE)) + xen_flush_queue(); +} + +void +xen_check_queue(void) +{ + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); +} + +void +xen_invlpg(vm_offset_t va) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.arg1.linear_addr = va & ~PAGE_MASK; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_load_cr3(vm_paddr_t val) +{ + struct mmuext_op op; + + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); + op.cmd = MMUEXT_NEW_BASEPTR; + op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line) +{ + + if (__predict_true(gdt_set)) + critical_enter(); + SET_VCPU(); + XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = pfn; +#ifdef INVARIANTS + xpq_queue_log[XPQ_IDX].file = file; + xpq_queue_log[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdt_set)) + critical_exit(); +} + +void +_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) +{ + + + if (__predict_true(gdt_set)) + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + + if (__predict_true(gdt_set)) + critical_enter(); + SET_VCPU(); + XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; + if (val) + KASSERT(val & PG_V, + ("setting invalid address ptr=0x%jx 0x%jx", ptr, val)); +#ifdef INVARIANTS + xpq_queue_log[XPQ_IDX].file = file; + xpq_queue_log[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdt_set)) + critical_exit(); +} + +void +xen_pgdpt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma 
>> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + printk("xen_pt_pin(): mfn=%x\n", op.arg1.mfn); + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_set_ldt(vm_paddr_t ptr, unsigned long len) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = ptr; + op.arg2.nr_ents = len; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + vm_paddr_t pa; + pt_entry_t *ptp; + + ptp = vtopte((vm_offset_t)table); + pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); + if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) + panic("HYPERVISOR_update_descriptor failed\n"); +} + + +#if 0 +/* + * Bitmap is indexed by page number. If bit is set, the page is part of a + * xen_create_contiguous_region() area of memory. + */ +unsigned long *contiguous_bitmap; + +static void +contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] |= + ((1UL<<end_off)-1) & -(1UL<<start_off); + } else { + contiguous_bitmap[curr_idx] |= -(1UL<<start_off); + while ( ++curr_idx < end_idx ) + contiguous_bitmap[curr_idx] = ~0UL; + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; + } +} + +static void +contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] &= + -(1UL<<end_off) | ((1UL<<start_off)-1); + } else { + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; + while ( ++curr_idx != end_idx ) + contiguous_bitmap[curr_idx] = 0; + contiguous_bitmap[curr_idx] &= -(1UL<<end_off); + } +} +#endif + +/* Ensure multi-page extents are contiguous in machine memory. */ +int +xen_create_contiguous_region(vm_page_t pages, int npages) +{ + unsigned long mfn, i, flags; + int order; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + + balloon_lock(flags); + + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + /* 1. give away machine pages. 
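The pin/unpin/TLB helpers above all share one shape: fill a single mmuext_op, drain the update queue so ordering against pending PTE writes is preserved, then issue the hypercall. Factored out as a sketch (kept separate in the real code because some commands take a linear address rather than an mfn):

static void
sketch_mmuext_mfn(unsigned int cmd, vm_paddr_t ma)
{
	struct mmuext_op op;

	op.cmd = cmd;			/* e.g. MMUEXT_PIN_L1_TABLE */
	op.arg1.mfn = ma >> PAGE_SHIFT;
	xen_flush_queue();		/* queued PTE writes go first */
	PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}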
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + mfn = PFNTOMFN(pfn); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1); + } + + + /* 2. Get a new contiguous memory extent. */ + reservation.extent_order = order; + /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not + * running with a broxen driver XXXEN + */ + reservation.address_bits = 31; + if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) + goto fail; + + /* 3. Map the new extent in place of old pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + xen_machphys_update(mfn+i, pfn); + PFNTOMFN(pfn) = mfn+i; + } + + xen_tlb_flush(); + +#if 0 + contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); +#endif + + balloon_unlock(flags); + + return 0; + + fail: + reservation.extent_order = 0; + reservation.address_bits = 0; + + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation) != 1); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); + + return ENOMEM; +} + +void +xen_destroy_contiguous_region(void *addr, int npages) +{ + unsigned long mfn, i, flags, order, pfn0; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + pfn0 = vtophys(addr) >> PAGE_SHIFT; +#if 0 + scrub_pages(vstart, 1 << order); +#endif + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + balloon_lock(flags); + +#if 0 + contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); +#endif + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val = 0; + pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; + + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0)); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_decrease_reservation, &reservation) != 1); + } + + /* 2. Map new pages in place of old pages. 
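Both contiguous-region routines insist on power-of-two page counts, so once PANIC_IF(ffs(npages) != fls(npages)) has fired or passed, the order computation collapses to a log2 and the ternary's else branch is never taken. Worked example:

int npages = 4;
int order;

PANIC_IF(ffs(npages) != fls(npages));	/* power-of-two sizes only */
order = fls(npages) - 1;		/* 1 << order == npages: 4 -> 2 */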
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val; + pfn = pfn0 + i; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1); + + new_val = mfn << PAGE_SHIFT; + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE), + new_val, PG_KERNEL)); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); +} + +extern unsigned long cpu0prvpage; +extern unsigned long *SMPpt; +extern struct user *proc0uarea; +extern vm_offset_t proc0kstack; +extern int vm86paddr, vm86phystk; +char *bootmem_start, *bootmem_current, *bootmem_end; + +pteinfo_t *pteinfo_list; +void initvalues(start_info_t *startinfo); + +struct ringbuf_head *xen_store; /* XXX move me */ +char *console_page; + +void * +bootmem_alloc(unsigned int size) +{ + char *retptr; + + retptr = bootmem_current; + PANIC_IF(retptr + size > bootmem_end); + bootmem_current += size; + + return retptr; +} + +void +bootmem_free(void *ptr, unsigned int size) +{ + char *tptr; + + tptr = ptr; + PANIC_IF(tptr != bootmem_current - size || + bootmem_current - size < bootmem_start); + + bootmem_current -= size; +} + +#if 0 +static vm_paddr_t +xpmap_mtop2(vm_paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + ) | (mpa & ~PG_FRAME); +} + +static pd_entry_t +xpmap_get_bootpde(vm_paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; +} + +static pd_entry_t +xpmap_get_vbootpde(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return (pde & ~PG_FRAME) | + (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t 8* +xpmap_get_bootptep(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) == 0) + return (void *)-1; +#define PT_MASK 0x003ff000 /* page table address bits */ + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(vm_paddr_t va) +{ + + return xpmap_get_bootptep(va)[0]; +} +#endif + + +#ifdef ADD_ISA_HOLE +static void +shift_phys_machine(unsigned long *phys_machine, int nr_pages) +{ + + unsigned long *tmp_page, *current_page, *next_page; + int i; + + tmp_page = bootmem_alloc(PAGE_SIZE); + current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); + next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); + bcopy(phys_machine, tmp_page, PAGE_SIZE); + + while (current_page > phys_machine) { + /* save next page */ + bcopy(next_page, tmp_page, PAGE_SIZE); + /* shift down page */ + bcopy(current_page, next_page, PAGE_SIZE); + /* finish swap */ + bcopy(tmp_page, current_page, PAGE_SIZE); + + current_page -= (PAGE_SIZE/sizeof(unsigned long)); + next_page -= (PAGE_SIZE/sizeof(unsigned long)); + } + bootmem_free(tmp_page, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + xen_machphys_update(phys_machine[i], i); + } + memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); + +} +#endif + +extern unsigned long physfree; +void +initvalues(start_info_t *startinfo) +{ + int l3_pages, l2_pages, l1_pages, offset; + vm_offset_t cur_space; + struct physdev_set_iopl set_iopl; + + vm_paddr_t KPTphys, IdlePTDma; + vm_paddr_t console_page_ma, xen_store_ma; + vm_offset_t KPTphysoff, tmpva; + vm_paddr_t shinfo; +#ifdef PAE + vm_paddr_t IdlePDPTma, IdlePDPTnewma; + vm_paddr_t IdlePTDnewma[4]; + pd_entry_t *IdlePDPTnew, *IdlePTDnew; +#endif + unsigned long i; + + + HYPERVISOR_vm_assist(VMASST_CMD_enable, 
VMASST_TYPE_writable_pagetables); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); +#ifdef notyet + /* + * need to install handler + */ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); +#endif + xen_start_info = startinfo; + xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; + + /* number of pages allocated after the pts + 1*/; + cur_space = xen_start_info->pt_base + + ((xen_start_info->nr_pt_frames) + 3 )*PAGE_SIZE; + printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space); + + printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n", KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), xen_start_info->nr_pt_frames); + xendebug_flags = 0; /* 0xffffffff; */ + + /* allocate 4 pages for bootmem allocator */ + bootmem_start = bootmem_current = (char *)cur_space; + cur_space += (4 * PAGE_SIZE); + bootmem_end = (char *)cur_space; + +#ifdef ADD_ISA_HOLE + shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); +#endif + /* + * pre-zero unused mapped pages - mapped on 4MB boundary + */ +/* + bzero((char *)cur_space, (cur_space + 0x3fffff) % 0x400000); + */ + +#ifdef PAE + IdlePDPT = (pd_entry_t *)startinfo->pt_base; + IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base)); + IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); + IdlePTDma = xpmap_ptom(VTOP(IdlePTD)); + l3_pages = 1; +#else + IdlePTD = (pd_entry_t *)startinfo->pt_base; + IdlePTDma = xpmap_ptom(VTOP(startinfo->pt_base)); + l3_pages = 0; +#endif + l2_pages = 1; + l1_pages = 4; /* XXX not certain if this varies */ + KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE; + + KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff)); + XENPRINTF("IdlePTD %p\n", IdlePTD); + XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " + "mod_start: 0x%lx mod_len: 0x%lx\n", + xen_start_info->nr_pages, xen_start_info->shared_info, + xen_start_info->flags, xen_start_info->pt_base, + xen_start_info->mod_start, xen_start_info->mod_len); + /* Map proc0's KSTACK */ + + proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); + printk("proc0kstack=%u\n", proc0kstack); + + /* vm86/bios stack */ + cur_space += PAGE_SIZE; + + /* Map space for the vm86 region */ + vm86paddr = (vm_offset_t)cur_space; + cur_space += (PAGE_SIZE * 3); + +#ifdef PAE + IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; + bzero(IdlePDPTnew, PAGE_SIZE); + IdlePDPTnewma = xpmap_ptom(VTOP(IdlePDPTnew)); + + IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; + bzero(IdlePTDnew, 4*PAGE_SIZE); + for (i = 0; i < 4; i++) + IdlePTDnewma[i] = xpmap_ptom( + VTOP((uint8_t *)IdlePTDnew + i*PAGE_SIZE)); + /* + * L3 + */ + for (i = 0; i < 4; i++) + IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; +#if 0 + /* + * L2 - xen isn't smart enough to handle recursive mappings + * on initial load + */ + for (i = 0; i < 4; i++) + IdlePTDnew[PTDPTDI + i] = IdlePTDnewma[i] | PG_V; + +#endif + PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); + xen_pt_unpin(IdlePDPTma); +#endif + /* unmap remaining pages from initial 4MB chunk */ + for (tmpva = cur_space; (tmpva & ((1<<22)-1)) != 0; tmpva += PAGE_SIZE) { + bzero((char *)tmpva, PAGE_SIZE); + PT_SET_MA(tmpva, (vm_paddr_t)0); + } + +#ifdef PAE + offset = 0; +#else + offset = KPTDI; +#endif + + /* allocate remainder of NKPT pages */ + for (i = l1_pages; i < NKPT; i++, cur_space += PAGE_SIZE) { + xen_pt_pin(xpmap_ptom(VTOP(cur_space))); + xen_queue_pt_update((vm_paddr_t)(IdlePTDma + (offset + i)*sizeof(vm_paddr_t)), + 
+	/* allocate remainder of NKPT pages */
+	for (i = l1_pages; i < NKPT; i++, cur_space += PAGE_SIZE) {
+		xen_pt_pin(xpmap_ptom(VTOP(cur_space)));
+		xen_queue_pt_update((vm_paddr_t)(IdlePTDma + (offset + i)*sizeof(vm_paddr_t)),
+		    xpmap_ptom(VTOP(cur_space)) | PG_KERNEL);
+	}
+
+	PT_UPDATES_FLUSH();
+	memcpy((uint8_t *)IdlePTDnew + 3*PAGE_SIZE, IdlePTD, PAGE_SIZE/2);
+	printk("do remapping\n");
+	for (i = 0; i < 4; i++) {
+		PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE,
+		    IdlePTDnewma[i] | PG_V);
+	}
+	xen_load_cr3(VTOP(IdlePDPTnew));
+	xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew)));
+	for (i = 0; i < 4; i++) {
+		xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[2] +
+		    (PTDPTDI - 1024 + i)*sizeof(vm_paddr_t)),
+		    IdlePTDnewma[i] | PG_V);
+	}
+
+	/* copy NKPT pages */
+	for (i = 0; i < NKPT; i++) {
+		xen_queue_pt_update(
+		    (vm_paddr_t)(IdlePTDnewma[3] + i*sizeof(vm_paddr_t)),
+		    IdlePTD[i]);
+	}
+
+	PT_UPDATES_FLUSH();
+	IdlePTD = IdlePTDnew;
+	IdlePDPT = IdlePDPTnew;
+	IdlePDPTma = IdlePDPTnewma;
+
+	/* allocate page for gdt */
+	gdt = (union descriptor *)cur_space; cur_space += PAGE_SIZE;
+	/* allocate page for ldt */
+	ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE;
+
+	HYPERVISOR_shared_info = (shared_info_t *)cur_space;
+	cur_space += PAGE_SIZE;
+
+	/*
+	 * shared_info is an unsigned long so this will randomly break if
+	 * it is allocated above 4GB - I guess people are used to that
+	 * sort of thing with Xen ... sigh
+	 */
+	shinfo = xen_start_info->shared_info;
+	PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL);
+
+	printk("#4\n");
+
+	xen_store = (struct ringbuf_head *)cur_space;
+	cur_space += PAGE_SIZE;
+	xen_store_ma = ((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT;
+	PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL);
+
+	console_page = (char *)cur_space;
+	cur_space += PAGE_SIZE;
+	console_page_ma = ((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT;
+	PT_SET_MA(console_page, console_page_ma | PG_KERNEL);
+
+	printk("#5\n");
+	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+	    (unsigned long)xen_phys_machine;
+
+#if 0 && defined(SMP)
+	for (i = 0; i < ncpus; i++) {
+		int j, npages = (sizeof(struct privatespace) + PAGE_SIZE - 1)/PAGE_SIZE;
+
+		for (j = 0; j < npages; j++) {
+			vm_paddr_t ma = xpmap_ptom(cur_space);
+
+			cur_space += PAGE_SIZE;
+			PT_SET_VA_MA(SMPpt + i*npages + j, ma | PG_KERNEL, FALSE);
+		}
+	}
+	xen_flush_queue();
+#endif
+
+	set_iopl.iopl = 1;
+	PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl));
+	printk("#6\n");
+#if 0
+	/* add page table for KERNBASE */
+	xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t),
+	    xpmap_ptom(VTOP(cur_space)) | PG_KERNEL);
+	xen_flush_queue();
+	cur_space += PAGE_SIZE;
+	printk("#6\n");
+#endif
+
+#ifdef notyet
+	if (xen_start_info->flags & SIF_INITDOMAIN) {
+		/* Map first megabyte */
+		for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE)
+			PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD);
+		xen_flush_queue();
+	}
+#endif
+
+	/*
+	 * Re-map kernel text read-only.
+	 */
+	for (i = (((vm_offset_t)&btext) & ~PAGE_MASK);
+	    i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE)
+		PT_SET_MA(i, xpmap_ptom(VTOP(i)) | PG_V | PG_A);
+
+	printk("#7\n");
+	physfree = VTOP(cur_space);
+	init_first = physfree >> PAGE_SHIFT;
+	IdlePTD = (pd_entry_t *)VTOP(IdlePTD);
+	IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT);
+	setup_xen_features();
+	printk("#8, proc0kstack=%u\n", proc0kstack);
+}
+
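+/*
+ * Under Xen the table below stands in for the native IDT: rather than
+ * being loaded with lidt it is registered with the hypervisor, presumably
+ * elsewhere in this change, along the lines of:
+ *
+ *	HYPERVISOR_set_trap_table(trap_table);
+ *
+ * The second field is the most-privileged ring allowed to raise the trap,
+ * which is why breakpoint, overflow, bounds and int 0x80 use 3.
+ */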
+trap_info_t trap_table[] = {
+	{ 0,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)},
+	{ 1,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)},
+	{ 3,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)},
+	{ 4,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)},
+	/* This is UPL on Linux and KPL on BSD */
+	{ 5,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)},
+	{ 6,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)},
+	{ 7,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)},
+	/*
+	 * { 8,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)},
+	 *   no handler for double fault
+	 */
+	{ 9,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)},
+	{10,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)},
+	{11,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)},
+	{12,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)},
+	{13,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)},
+	{14,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)},
+	{15,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)},
+	{16,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)},
+	{17,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)},
+	{18,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)},
+	{19,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)},
+	{0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)},
+	{ 0,   0, 0, 0 }
+};
+
+static void
+shutdown_handler(struct xenbus_watch *watch,
+    const char **vec, unsigned int len)
+{
+	char *str;
+	struct xenbus_transaction xbt;
+	int err, howto;
+	struct reboot_args uap;
+
+	howto = 0;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+	/* Ignore read errors and empty reads. */
+	if (XENBUS_IS_ERR_READ(str)) {
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	xenbus_write(xbt, "control", "shutdown", "");
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == EAGAIN) {
+		free(str, M_DEVBUF);
+		goto again;
+	}
+
+	if (strcmp(str, "reboot") == 0)
+		howto = 0;
+	else if (strcmp(str, "poweroff") == 0)
+		howto |= (RB_POWEROFF | RB_HALT);
+	else if (strcmp(str, "halt") == 0)
+		howto |= RB_HALT;
+	else if (strcmp(str, "suspend") == 0)
+		howto = -1;
+	else {
+		printf("Ignoring shutdown request: %s\n", str);
+		goto done;
+	}
+#ifdef notyet
+	if (howto == -1) {
+		do_suspend(NULL);
+		goto done;
+	}
+#else
+	if (howto == -1) {
+		printf("suspend not currently supported\n");
+		goto done;
+	}
+#endif
+	uap.opt = howto;
+	reboot(curthread, &uap);
+ done:
+	free(str, M_DEVBUF);
+}
+
+static struct xenbus_watch shutdown_watch = {
+	.node = "control/shutdown",
+	.callback = shutdown_handler
+};
+
+static void
+setup_shutdown_watcher(void *unused)
+{
+
+	if (register_xenbus_watch(&shutdown_watch))
+		printf("Failed to set shutdown watcher\n");
+}
+
+SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL)
+
+#ifdef notyet
+static int
+xen_suspend(void *ignore)
+{
+	int i, j, k, fpp;
+
+	extern void time_resume(void);
+	extern unsigned long max_pfn;
+	extern unsigned long *pfn_to_mfn_frame_list_list;
+	extern unsigned long *pfn_to_mfn_frame_list[];
+
+#ifdef CONFIG_SMP
+#error "do_suspend must be run cpu 0 - need to create separate thread"
+	cpumask_t prev_online_cpus;
+	int vcpu_prepare(int vcpu);
+#endif
+
+	int err = 0;
+
+	PANIC_IF(smp_processor_id() != 0);
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
+	if (num_online_cpus() > 1) {
+		printk(KERN_WARNING "Can't suspend SMP guests "
+		    "without CONFIG_HOTPLUG_CPU\n");
+		return -EOPNOTSUPP;
+	}
+#endif
+
+	xenbus_suspend();
+
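+	/*
+	 * Rough ordering for the rest of the suspend path, as implemented
+	 * below: take secondary CPUs down, disable interrupts, suspend
+	 * grant tables, and rewrite the MFNs cached in start_info as PFNs
+	 * so the tools can re-resolve them against the new machine frames
+	 * after restore; HYPERVISOR_suspend() then parks the domain, and
+	 * on return the same steps are undone in roughly reverse order.
+	 */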
+#ifdef CONFIG_SMP
+	lock_cpu_hotplug();
+	/*
+	 * Take all other CPUs offline.  We hold the hotplug semaphore to
+	 * avoid other processes bringing up CPUs under our feet.
+	 */
+	cpus_clear(prev_online_cpus);
+	while (num_online_cpus() > 1) {
+		for_each_online_cpu(i) {
+			if (i == 0)
+				continue;
+			unlock_cpu_hotplug();
+			err = cpu_down(i);
+			lock_cpu_hotplug();
+			if (err != 0) {
+				printk(KERN_CRIT "Failed to take all CPUs "
+				    "down: %d.\n", err);
+				goto out_reenable_cpus;
+			}
+			cpu_set(i, prev_online_cpus);
+		}
+	}
+#endif
+
+	preempt_disable();
+	__cli();
+	preempt_enable();
+#ifdef CONFIG_SMP
+	unlock_cpu_hotplug();
+#endif
+	gnttab_suspend();
+
+	pmap_kremove(HYPERVISOR_shared_info);
+
+	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+	xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn);
+
+	/*
+	 * We'll stop somewhere inside this hypercall. When it returns,
+	 * we'll start resuming after the restore.
+	 */
+	HYPERVISOR_suspend(VTOMFN(xen_start_info));
+
+	pmap_kenter_ma(HYPERVISOR_shared_info, xen_start_info->shared_info);
+	set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+
+#if 0
+	memset(empty_zero_page, 0, PAGE_SIZE);
+#endif
+	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+	    VTOMFN(pfn_to_mfn_frame_list_list);
+
+	fpp = PAGE_SIZE/sizeof(unsigned long);
+	for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+		if ((j % fpp) == 0) {
+			k++;
+			pfn_to_mfn_frame_list_list[k] =
+			    VTOMFN(pfn_to_mfn_frame_list[k]);
+			j = 0;
+		}
+		pfn_to_mfn_frame_list[k][j] =
+		    VTOMFN(&phys_to_machine_mapping[i]);
+	}
+	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+
+	gnttab_resume();
+	irq_resume();
+	time_resume();
+	__sti();
+	xencons_resume();
+
+#ifdef CONFIG_SMP
+	for_each_cpu(i)
+		vcpu_prepare(i);
+#endif
+
+	/*
+	 * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
+	 * the VCPU hotplug callback can race with our vcpu_prepare.
+	 */
+	xenbus_resume();
+
+#ifdef CONFIG_SMP
+ out_reenable_cpus:
+	for_each_cpu_mask(i, prev_online_cpus) {
+		j = cpu_up(i);
+		if ((j != 0) && !cpu_online(i)) {
+			printk(KERN_CRIT "Failed to bring cpu "
+			    "%d back up (%d).\n", i, j);
+			err = j;
+		}
+	}
+#endif
+	return err;
+}
+#endif
+
+/********** CODE WORTH KEEPING ABOVE HERE *****************/
+
+void xen_failsafe_handler(void);
+
+void
+xen_failsafe_handler(void)
+{
+
+	panic("xen_failsafe_handler called!\n");
+}
+
+void xen_handle_thread_switch(struct pcb *pcb);
+
+/*
+ * This is called by cpu_switch() when switching threads.
+ * The pcb arg refers to the process control block of the
+ * next thread which is to run.
+ */
+void
+xen_handle_thread_switch(struct pcb *pcb)
+{
+	uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0];
+	uint32_t *b = (uint32_t *)&pcb->pcb_fsd;
+	multicall_entry_t mcl[3];
+	int i = 0;
+
+	/* Notify Xen of task switch */
+	mcl[i].op = __HYPERVISOR_stack_switch;
+	mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL);
+	mcl[i++].args[1] = (unsigned long)pcb;
+
+	/* Check for update of fsd */
+	if (*a != *b || *(a + 1) != *(b + 1)) {
+		mcl[i].op = __HYPERVISOR_update_descriptor;
+		*(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
+		*(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
+	}
+
+	a += 2;
+	b += 2;
+
+	/* Check for update of gsd */
+	if (*a != *b || *(a + 1) != *(b + 1)) {
+		mcl[i].op = __HYPERVISOR_update_descriptor;
+		*(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
+		*(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
+	}
+
+	(void)HYPERVISOR_multicall(mcl, i);
+}
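+/*
+ * The multicall above batches up to three hypercalls -- the stack switch
+ * plus any needed %fs/%gs descriptor updates -- into one guest/hypervisor
+ * transition.  A minimal sketch of the same pattern (illustrative only),
+ * assuming just a stack switch is required:
+ *
+ *	multicall_entry_t mcl[1];
+ *
+ *	mcl[0].op = __HYPERVISOR_stack_switch;
+ *	mcl[0].args[0] = GSEL(GDATA_SEL, SEL_KPL);	// new %ss
+ *	mcl[0].args[1] = (unsigned long)pcb;		// new %esp
+ *	(void)HYPERVISOR_multicall(mcl, 1);
+ */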