| author    | kmacy <kmacy@FreeBSD.org>                        | 2008-08-12 19:48:18 +0000 |
|-----------|--------------------------------------------------|---------------------------|
| committer | kmacy <kmacy@FreeBSD.org>                        | 2008-08-12 19:48:18 +0000 |
| commit    | d37c684bc1489bfdabef5cc63ef3d8a14e86a6bf (patch)  |                           |
| tree      | 7fb71033a023845dbb2963940755dc1512fbf8eb /sys     |                           |
| parent    | 2f734060b92f529ade432d31098548f04dc963a6 (diff)   |                           |
Import i386 xen sub-arch files.
MFC after: 2 weeks
Diffstat (limited to 'sys')
| -rw-r--r-- | sys/i386/xen/clock.c       |  976 |
| -rw-r--r-- | sys/i386/xen/exception.s   |  466 |
| -rw-r--r-- | sys/i386/xen/locore.s      |  372 |
| -rw-r--r-- | sys/i386/xen/pmap.c        | 4128 |
| -rw-r--r-- | sys/i386/xen/xen_machdep.c | 1315 |
5 files changed, 7257 insertions, 0 deletions
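Before the diff itself, a note on the timekeeping math it introduces: clock.c converts raw TSC cycle counts to nanoseconds with a pre-scaled fixed-point multiply (scaling by 2^CYC2NS_SCALE_FACTOR) instead of a per-call division. The standalone C sketch below mirrors the set_cyc2ns_scale()/cycles_2_ns() pair from the patch; the main() driver and the 2400 MHz clock rate are illustrative assumptions, not part of the commit.

```c
/*
 * Standalone sketch of the cycles -> nanoseconds conversion used in
 * sys/i386/xen/clock.c: ns = cycles * (10^3 / cpu_mhz), with the
 * quotient pre-scaled by 2^CYC2NS_SCALE_FACTOR so the divide at
 * conversion time becomes a cheap shift.
 */
#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10

static uint64_t cyc2ns_scale;

static void
set_cyc2ns_scale(unsigned long cpu_mhz)
{
	/* (10^3 * 2^10) / cpu_mhz, computed once at clock setup. */
	cyc2ns_scale = (1000UL << CYC2NS_SCALE_FACTOR) / cpu_mhz;
}

static uint64_t
cycles_2_ns(uint64_t cyc)
{
	/* Multiply by the pre-scaled factor, then shift the scale back out. */
	return ((cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR);
}

int
main(void)
{
	set_cyc2ns_scale(2400);		/* hypothetical 2.4 GHz CPU */

	/* 2400 cycles at 2.4 GHz is ~1000 ns; prints 998 due to rounding. */
	printf("%llu ns\n", (unsigned long long)cycles_2_ns(2400));
	return (0);
}
```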
diff --git a/sys/i386/xen/clock.c b/sys/i386/xen/clock.c new file mode 100644 index 0000000..a01770f --- /dev/null +++ b/sys/i386/xen/clock.c @@ -0,0 +1,976 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* #define DELAYDEBUG */ +/* + * Routines to handle clock hardware. + */ + +#include "opt_ddb.h" +#include "opt_clock.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timetc.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/sysctl.h> +#include <sys/cons.h> +#include <sys/power.h> + +#include <machine/clock.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#if defined(SMP) +#include <machine/smp.h> +#endif +#include <machine/specialreg.h> +#include <machine/timerreg.h> + +#include <i386/isa/icu.h> +#include <i386/isa/isa.h> +#include <isa/rtc.h> + +#include <machine/xen/xen_intr.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/pmap.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/vcpu.h> +#include <machine/cpu.h> + +/* + * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we + * can use a simple formula for leap years. 
+ */ +#define LEAPYEAR(y) (!((y) % 4)) +#define DAYSPERYEAR (28+30*4+31*7) + +#ifndef TIMER_FREQ +#define TIMER_FREQ 1193182 +#endif + +#ifdef CYC2NS_SCALE_FACTOR +#undef CYC2NS_SCALE_FACTOR +#endif +#define CYC2NS_SCALE_FACTOR 10 + +/* Values for timerX_state: */ +#define RELEASED 0 +#define RELEASE_PENDING 1 +#define ACQUIRED 2 +#define ACQUIRE_PENDING 3 + +#define RTC_LOCK_INIT \ + mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE) +#define RTC_LOCK mtx_lock_spin(&clock_lock) +#define RTC_UNLOCK mtx_unlock_spin(&clock_lock) + +int adjkerntz; /* local offset from GMT in seconds */ +int clkintr_pending; +int pscnt = 1; +int psdiv = 1; +int statclock_disable; +int wall_cmos_clock; +u_int timer_freq = TIMER_FREQ; +static int independent_wallclock; +static int xen_disable_rtc_set; +static u_long cached_gtm; /* cached quotient for TSC -> microseconds */ +static u_long cyc2ns_scale; +static u_char timer2_state = RELEASED; +static struct timespec shadow_tv; +static uint32_t shadow_tv_version; /* XXX: lazy locking */ +static uint64_t processed_system_time; /* stime (ns) at last processing. */ +static struct mtx clock_lock; +static int rtc_reg; + +static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; + +SYSCTL_INT(_machdep, OID_AUTO, independent_wallclock, + CTLFLAG_RW, &independent_wallclock, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, xen_disable_rtc_set, + CTLFLAG_RW, &xen_disable_rtc_set, 1, ""); + + +#define do_div(n,base) ({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + __asm("":"=a" (__low), "=d" (__high):"A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + __asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ + __asm("":"=A" (n):"a" (__low),"d" (__high)); \ + __mod; \ +}) + + +/* These are peridically updated in shared_info, and then copied here. */ +struct shadow_time_info { + uint64_t tsc_timestamp; /* TSC at last update of time vals. */ + uint64_t system_timestamp; /* Time, in nanosecs, since boot. */ + uint32_t tsc_to_nsec_mul; + uint32_t tsc_to_usec_mul; + int tsc_shift; + uint32_t version; +}; +static DEFINE_PER_CPU(uint64_t, processed_system_time); +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + + +#define NS_PER_TICK (1000000000ULL/hz) + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. 
+ */ +static inline uint64_t +scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + uint32_t tmp1, tmp2; + + if ( shift < 0 ) + delta >>= -shift; + else + delta <<= shift; + + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "add %4,%%eax ; " + "xor %5,%5 ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), "2" (mul_frac) ); + + return product; +} + +static uint64_t get_nsec_offset(struct shadow_time_info *shadow) +{ + uint64_t now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_tv_version = s->wc_version; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); + +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &s->vcpu_info[smp_processor_id()].time; + dst = &per_cpu(shadow_time, smp_processor_id()); + + do { + dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } + while ((src->version & 1) | (dst->version ^ src->version)); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; +} + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_info[cpu].time; + dst = &per_cpu(shadow_time, cpu); + + rmb(); + return (dst->version == src->version); +} + +static unsigned xen_get_timecount(struct timecounter *tc); + +static struct timecounter xen_timecounter = { + xen_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + ~0u, /* counter_mask */ + 0, /* frequency */ + "ixen", /* name */ + 0 /* quality */ +}; + +static void +clkintr(struct trapframe *frame) +{ + int64_t delta_cpu, delta; + int cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + + do { + __get_time_values_from_xen(); + + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); + delta -= processed_system_time; + delta_cpu -= per_cpu(processed_system_time, cpu); + + } while (!time_values_up_to_date(cpu)); + + if (unlikely(delta < (int64_t)0) || unlikely(delta_cpu < (int64_t)0)) { + printf("Timer ISR: Time went backwards: %lld\n", delta); + return; + } + + /* Process elapsed ticks since last call. */ + if (delta >= NS_PER_TICK) { + processed_system_time += (delta / NS_PER_TICK) * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += (delta_cpu / NS_PER_TICK) * NS_PER_TICK; + } + hardclock(TRAPF_USERMODE(frame), TRAPF_PC(frame)); + + /* + * Take synchronised time from Xen once a minute if we're not + * synchronised ourselves, and we haven't chosen to keep an independent + * time base. 
+ */ + + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { + update_wallclock(); + tc_setclock(&shadow_tv); + } + + /* XXX TODO */ +} + +static uint32_t +getit(void) +{ + struct shadow_time_info *shadow; + shadow = &per_cpu(shadow_time, smp_processor_id()); + __get_time_values_from_xen(); + return shadow->system_timestamp + get_nsec_offset(shadow); +} + + +/* + * XXX: timer needs more SMP work. + */ +void +i8254_init(void) +{ + + RTC_LOCK_INIT; +} + +/* + * Wait "n" microseconds. + * Relies on timer 1 counting down from (timer_freq / hz) + * Note: timer had better have been programmed before this is first used! + */ +void +DELAY(int n) +{ + int delta, ticks_left; + uint32_t tick, prev_tick; +#ifdef DELAYDEBUG + int getit_calls = 1; + int n1; + static int state = 0; + + if (state == 0) { + state = 1; + for (n1 = 1; n1 <= 10000000; n1 *= 10) + DELAY(n1); + state = 2; + } + if (state == 1) + printf("DELAY(%d)...", n); +#endif + /* + * Read the counter first, so that the rest of the setup overhead is + * counted. Guess the initial overhead is 20 usec (on most systems it + * takes about 1.5 usec for each of the i/o's in getit(). The loop + * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The + * multiplications and divisions to scale the count take a while). + * + * However, if ddb is active then use a fake counter since reading + * the i8254 counter involves acquiring a lock. ddb must not go + * locking for many reasons, but it calls here for at least atkbd + * input. + */ + prev_tick = getit(); + + n -= 0; /* XXX actually guess no initial overhead */ + /* + * Calculate (n * (timer_freq / 1e6)) without using floating point + * and without any avoidable overflows. + */ + if (n <= 0) + ticks_left = 0; + else if (n < 256) + /* + * Use fixed point to avoid a slow division by 1000000. + * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. + * 2^15 is the first power of 2 that gives exact results + * for n between 0 and 256. + */ + ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; + else + /* + * Don't bother using fixed point, although gcc-2.7.2 + * generates particularly poor code for the long long + * division, since even the slow way will complete long + * before the delay is up (unless we're interrupted). + */ + ticks_left = ((u_int)n * (long long)timer_freq + 999999) + / 1000000; + + while (ticks_left > 0) { + tick = getit(); +#ifdef DELAYDEBUG + ++getit_calls; +#endif + delta = tick - prev_tick; + prev_tick = tick; + if (delta < 0) { + /* + * Guard against timer0_max_count being wrong. + * This shouldn't happen in normal operation, + * but it may happen if set_timer_freq() is + * traced. + */ + /* delta += timer0_max_count; ??? */ + if (delta < 0) + delta = 0; + } + ticks_left -= delta; + } +#ifdef DELAYDEBUG + if (state == 1) + printf(" %d calls to getit() at %d usec each\n", + getit_calls, (n + 5) / getit_calls); +#endif +} + + +int +sysbeep(int pitch, int period) +{ + return (0); +} + +/* + * Restore all the timers non-atomically (XXX: should be atomically). + * + * This function is called from pmtimer_resume() to restore all the timers. + * This should not be necessary, but there are broken laptops that do not + * restore all the timers on resume. + */ +void +timer_restore(void) +{ + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. 
*/ + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; +} + +void +startrtclock() +{ + unsigned long long alarm; + uint64_t __cpu_khz; + uint32_t cpu_khz; + struct vcpu_time_info *info; + + /* initialize xen values */ + __get_time_values_from_xen(); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; + + __cpu_khz = 1000000ULL << 32; + info = &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(__cpu_khz, info->tsc_to_system_mul); + if ( info->tsc_shift < 0 ) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; + + printf("Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ + { + unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cached_gtm), "=d" (edx) + :"r" (cpu_khz), + "0" (eax), "1" (edx)); + } + + set_cyc2ns_scale(cpu_khz/1000); + tsc_freq = cpu_khz * 1000; + + timer_freq = xen_timecounter.tc_frequency = 1000000000LL; + tc_init(&xen_timecounter); + + + rdtscll(alarm); +} + +/* + * RTC support routines + */ + +int +rtcin(reg) + int reg; +{ + u_char val; + + RTC_LOCK; + outb(IO_RTC, reg); + inb(0x84); + val = inb(IO_RTC + 1); + inb(0x84); + RTC_UNLOCK; + return (val); +} + + +static __inline int +readrtc(int port) +{ + return(bcd2bin(rtcin(port))); +} + +void +writertc(int reg, u_char val) +{ + + RTC_LOCK; + if (rtc_reg != reg) { + inb(0x84); + outb(IO_RTC, reg); + rtc_reg = reg; + inb(0x84); + } + outb(IO_RTC + 1, val); + inb(0x84); + RTC_UNLOCK; +} + +#ifdef XEN_PRIVILEGED_GUEST + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +static void +domu_inittodr(time_t base) +{ + unsigned long sec; + int s, y; + struct timespec ts; + + update_wallclock(); + + RTC_LOCK; + + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + tc_setclock(&shadow_tv); + } + RTC_UNLOCK; +} + +/* + * Write system time back to RTC. + */ +static void +domu_resettodr(void) +{ + unsigned long tm; + int s; + dom0_op_t op; + struct shadow_time_info *shadow; + + shadow = &per_cpu(shadow_time, smp_processor_id()); + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + if ((xen_start_info->flags & SIF_INITDOMAIN) && + !independent_wallclock) + { + op.cmd = DOM0_SETTIME; + op.u.settime.secs = tm; + op.u.settime.nsecs = 0; + op.u.settime.system_time = shadow->system_timestamp; + HYPERVISOR_dom0_op(&op); + update_wallclock(); + } else if (independent_wallclock) { + /* notyet */ + ; + } +} + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. 
+ */ +void +inittodr(time_t base) +{ + unsigned long sec, days; + int year, month; + int y, m, s; + struct timespec ts; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_inittodr(base); + return; + } + + if (base) { + s = splclock(); + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + splx(s); + } + + /* Look if we have a RTC present and the time is valid */ + if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) + goto wrong_time; + + /* wait for time update to complete */ + /* If RTCSA_TUP is zero, we have at least 244us before next update */ + s = splhigh(); + while (rtcin(RTC_STATUSA) & RTCSA_TUP) { + splx(s); + s = splhigh(); + } + + days = 0; +#ifdef USE_RTC_CENTURY + year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100; +#else + year = readrtc(RTC_YEAR) + 1900; + if (year < 1970) + year += 100; +#endif + if (year < 1970) { + splx(s); + goto wrong_time; + } + month = readrtc(RTC_MONTH); + for (m = 1; m < month; m++) + days += daysinmonth[m-1]; + if ((month > 2) && LEAPYEAR(year)) + days ++; + days += readrtc(RTC_DAY) - 1; + for (y = 1970; y < year; y++) + days += DAYSPERYEAR + LEAPYEAR(y); + sec = ((( days * 24 + + readrtc(RTC_HRS)) * 60 + + readrtc(RTC_MIN)) * 60 + + readrtc(RTC_SEC)); + /* sec now contains the number of seconds, since Jan 1 1970, + in the local time zone */ + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + y = time_second - sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = sec; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + splx(s); + return; + + wrong_time: + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); +} + + + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + unsigned long tm; + int y, m, s; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_resettodr(); + return; + } + + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + /* Disable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); + + /* Calculate local time to put in RTC */ + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ + writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ + writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ + + /* We have now the days since 01-01-1970 in tm */ + writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ + for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); + tm >= m; + y++, m = DAYSPERYEAR + LEAPYEAR(y)) + tm -= m; + + /* Now we have the years in y and the day-of-the-year in tm */ + writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ +#ifdef USE_RTC_CENTURY + writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... and Century */ +#endif + for (m = 0; ; m++) { + int ml; + + ml = daysinmonth[m]; + if (m == 1 && LEAPYEAR(y)) + ml++; + if (tm < ml) + break; + tm -= ml; + } + + writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ + writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ + + /* Reenable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_24HR); + rtcin(RTC_INTR); +} +#else +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. 
+ */ +void +inittodr(time_t base) +{ + int s, y; + struct timespec ts; + + s = splclock(); + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = shadow_tv.tv_sec; + ts.tv_nsec = shadow_tv.tv_nsec * 1000000000; /* :-/ */ + tc_setclock(&ts); + } + splx(s); +} + +/* + * Write system time back to RTC. Not supported for guest domains. + */ +void +resettodr() +{ +} +#endif + + +int +acquire_timer2(int mode) +{ + + if (timer2_state != RELEASED) + return (-1); + timer2_state = ACQUIRED; + + /* + * This access to the timer registers is as atomic as possible + * because it is a single instruction. We could do better if we + * knew the rate. Use of splclock() limits glitches to 10-100us, + * and this is probably good enough for timer2, so we aren't as + * careful with it as with timer0. + */ + outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f)); + + return (0); +} + +int +release_timer2() +{ + + if (timer2_state != ACQUIRED) + return (-1); + timer2_state = RELEASED; + outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT); + return (0); +} + +static struct vcpu_set_periodic_timer xen_set_periodic_tick; + +/* + * Start clocks running. + */ +void +cpu_initclocks(void) +{ + int time_irq; + + xen_set_periodic_tick.period_ns = NS_PER_TICK; + + HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0, + &xen_set_periodic_tick); + + if ((time_irq = bind_virq_to_irqhandler(VIRQ_TIMER, 0, "clk", + (driver_filter_t *)clkintr, NULL, + INTR_TYPE_CLK | INTR_FAST)) < 0) { + panic("failed to register clock interrupt\n"); + } + + /* should fast clock be enabled ? */ +} + +/* + * + * XXX + */ +#if 0 && defined(SMP) +void +ap_cpu_initclocks(void) +{ + int irq; + int cpu = smp_processor_id(); + + per_cpu(processed_system_time, cpu) = processed_system_time; + + irq = bind_virq_to_irq(VIRQ_TIMER); + PCPU_SET(time_irq, irq); + PANIC_IF(intr_add_handler("clk", irq, (driver_filter_t *)clkintr, NULL, + NULL, INTR_TYPE_CLK | INTR_FAST, NULL)); +} +#endif + +void +cpu_startprofclock(void) +{ + + printf("cpu_startprofclock: profiling clock is not supported\n"); +} + +void +cpu_stopprofclock(void) +{ + + printf("cpu_stopprofclock: profiling clock is not supported\n"); +} +#define NSEC_PER_USEC 1000 + +static uint32_t +xen_get_timecount(struct timecounter *tc) +{ + uint64_t clk; + struct shadow_time_info *shadow; + shadow = &per_cpu(shadow_time, smp_processor_id()); + + __get_time_values_from_xen(); + + clk = shadow->system_timestamp + get_nsec_offset(shadow); + + return (uint32_t)((clk / NS_PER_TICK) * NS_PER_TICK); + +} + +/* Return system time offset by ticks */ +uint64_t +get_system_time(int ticks) +{ + return processed_system_time + (ticks * NS_PER_TICK); +} + +/* + * Track behavior of cur_timer->get_offset() functionality in timer_tsc.c + */ + +#if 0 +static uint32_t +xen_get_offset(void) +{ + register unsigned long eax, edx; + + /* Read the Time Stamp Counter */ + + rdtsc(eax,edx); + + /* .. relative to previous jiffy (32 bits is enough) */ + eax -= shadow_tsc_stamp; + + /* + * Time offset = (tsc_low delta) * cached_gtm + * = (tsc_low delta) * (usecs_per_clock) + * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) + * + * Using a mull instead of a divl saves up to 31 clock cycles + * in the critical path. 
+ */ + + __asm__("mull %2" + :"=a" (eax), "=d" (edx) + :"rm" (cached_gtm), + "0" (eax)); + + /* our adjusted time offset in microseconds */ + return edx; +} +#endif +void +idle_block(void) +{ + + __get_time_values_from_xen(); + PANIC_IF(HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) != 0); + HYPERVISOR_sched_op(SCHEDOP_block, 0); +} diff --git a/sys/i386/xen/exception.s b/sys/i386/xen/exception.s new file mode 100644 index 0000000..b63ac7d --- /dev/null +++ b/sys/i386/xen/exception.s @@ -0,0 +1,466 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_apic.h" +#include "opt_npx.h" + +#include <machine/asmacros.h> +#include <machine/psl.h> +#include <machine/trap.h> + + +#include "assym.s" + +#define SEL_RPL_MASK 0x0002 +#define __HYPERVISOR_iret 23 + +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 +#define XEN_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_TEST_PENDING(reg) testb $0x1,evtchn_upcall_pending(reg) + +#define POPA \ + popl %edi; \ + popl %esi; \ + popl %ebp; \ + popl %ebx; \ + popl %ebx; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + + .text + +/*****************************************************************************/ +/* Trap handling */ +/*****************************************************************************/ +/* + * Trap and fault vector routines. + * + * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on + * the stack that mostly looks like an interrupt, but does not disable + * interrupts. A few of the traps we are use are interrupt gates, + * SDT_SYS386IGT, which are nearly the same thing except interrupts are + * disabled on entry. + * + * The cpu will push a certain amount of state onto the kernel stack for + * the current process. 
The amount of state depends on the type of trap + * and whether the trap crossed rings or not. See i386/include/frame.h. + * At the very least the current EFLAGS (status register, which includes + * the interrupt disable state prior to the trap), the code segment register, + * and the return instruction pointer are pushed by the cpu. The cpu + * will also push an 'error' code for certain traps. We push a dummy + * error code for those traps where the cpu doesn't in order to maintain + * a consistent frame. We also push a contrived 'trap number'. + * + * The cpu does not push the general registers, we must do that, and we + * must restore them prior to calling 'iret'. The cpu adjusts the %cs and + * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we + * must load them with appropriate values for supervisor mode operation. + */ + +MCOUNT_LABEL(user) +MCOUNT_LABEL(btrap) + +#define TRAP(a) pushl $(a) ; jmp alltraps + +IDTVEC(div) + pushl $0; TRAP(T_DIVIDE) +IDTVEC(dbg) + pushl $0; TRAP(T_TRCTRAP) +IDTVEC(nmi) + pushl $0; TRAP(T_NMI) +IDTVEC(bpt) + pushl $0; TRAP(T_BPTFLT) +IDTVEC(ofl) + pushl $0; TRAP(T_OFLOW) +IDTVEC(bnd) + pushl $0; TRAP(T_BOUND) +IDTVEC(ill) + pushl $0; TRAP(T_PRIVINFLT) +IDTVEC(dna) + pushl $0; TRAP(T_DNA) +IDTVEC(fpusegm) + pushl $0; TRAP(T_FPOPFLT) +IDTVEC(tss) + TRAP(T_TSSFLT) +IDTVEC(missing) + TRAP(T_SEGNPFLT) +IDTVEC(stk) + TRAP(T_STKFLT) +IDTVEC(prot) + TRAP(T_PROTFLT) +IDTVEC(page) + TRAP(T_PAGEFLT) +IDTVEC(mchk) + pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd) + pushl $0; TRAP(T_RESERVED) +IDTVEC(fpu) + pushl $0; TRAP(T_ARITHTRAP) +IDTVEC(align) + TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + +IDTVEC(hypervisor_callback) + pushl %eax; TRAP(T_HYPCALLBACK) + +hypervisor_callback_pending: + movl HYPERVISOR_shared_info,%esi + XEN_BLOCK_EVENTS(%esi) /* cli */ + movl $T_HYPCALLBACK,TF_TRAPNO(%esp) + movl $T_HYPCALLBACK,TF_ERR(%esp) + jmp 11f + + /* + * alltraps entry point. Interrupts are enabled if this was a trap + * gate (TGT), else disabled if this was an interrupt gate (IGT). + * Note that int0x80_syscall is a trap gate. Only page faults + * use an interrupt gate. + */ + + SUPERALIGN_TEXT + .globl alltraps + .type alltraps,@function +alltraps: + pushal + pushl %ds + pushl %es + pushl %fs + +alltraps_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) +calltrap: + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup + +11: push %esp + call trap + add $4, %esp + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + +/* + * SYSCALL CALL GATE (old entry point for a.out binaries) + * + * The intersegment call has been set up to specify one dummy parameter. + * + * This leaves a place to put eflags so that the call frame can be + * converted to a trap frame. Note that the eflags is (semi-)bogusly + * pushed into (what will be) tf_err and then copied later into the + * final spot. It has to be done this way because esp can't be just + * temporarily altered for the pushfl - an interrupt might come in + * and clobber the saved cs/eip. 
+ */ + SUPERALIGN_TEXT +IDTVEC(lcall_syscall) + pushfl /* save eflags */ + popl 8(%esp) /* shuffle into tf_eflags */ + pushl $7 /* sizeof "lcall 7,0" */ + subl $4,%esp /* skip over tf_trapno */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call syscall + add $4, %esp + MEXITCOUNT + jmp doreti + +/* + * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) + * + * Even though the name says 'int0x80', this is actually a TGT (trap gate) + * rather then an IGT (interrupt gate). Thus interrupts are enabled on + * entry just as they are for a normal syscall. + */ + SUPERALIGN_TEXT +IDTVEC(int0x80_syscall) + pushl $2 /* sizeof "int 0x80" */ + pushl $0xBEEF /* for debug */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call syscall + add $4, %esp + MEXITCOUNT + jmp doreti + +ENTRY(fork_trampoline) + pushl %esp /* trapframe pointer */ + pushl %ebx /* arg1 */ + pushl %esi /* function */ + call fork_exit + addl $12,%esp + /* cut from syscall */ + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +/* + * To efficiently implement classification of trap and interrupt handlers + * for profiling, there must be only trap handlers between the labels btrap + * and bintr, and only interrupt handlers between the labels bintr and + * eintr. This is implemented (partly) by including files that contain + * some of the handlers. Before including the files, set up a normal asm + * environment so that the included files doen't need to know that they are + * included. + */ + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +MCOUNT_LABEL(bintr) + +#ifdef DEV_ATPIC +#include <i386/isa/atpic_vector.s> +#endif + +#ifdef DEV_APIC + .data + .p2align 4 + .text + SUPERALIGN_TEXT + +#include <i386/i386/apic_vector.s> +#endif + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +#include <i386/i386/vm86bios.s> + + .text +MCOUNT_LABEL(eintr) + +/* + * void doreti(struct trapframe) + * + * Handle return from interrupts, traps and syscalls. + */ + .text + SUPERALIGN_TEXT + .type doreti,@function +doreti: + FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ +doreti_next: +#ifdef notyet + /* + * Check if ASTs can be handled now. PSL_VM must be checked first + * since segment registers only have an RPL in non-VM86 mode. + */ + testl $PSL_VM,TF_EFLAGS(%esp) /* are we in vm86 mode? */ + jz doreti_notvm86 + movl PCPU(CURPCB),%ecx + testl $PCB_VM86CALL,PCB_FLAGS(%ecx) /* are we in a vm86 call? */ + jz doreti_ast /* can handle ASTS now if not */ + jmp doreti_exit + +doreti_notvm86: +#endif + testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ + jz doreti_exit /* can't handle ASTs now if not */ + +doreti_ast: + /* + * Check for ASTs atomically with returning. Disabling CPU + * interrupts provides sufficient locking even in the SMP case, + * since we will be informed of any new ASTs by an IPI. + */ + movl HYPERVISOR_shared_info,%esi + XEN_BLOCK_EVENTS(%esi) /* cli */ + movl PCPU(CURTHREAD),%eax + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) + je doreti_exit + XEN_UNBLOCK_EVENTS(%esi) /* sti */ + pushl %esp /* pass a pointer to the trapframe */ + call ast + add $4,%esp + jmp doreti_ast + + /* + * doreti_exit: pop registers, iret. + * + * The segment register pop is a special case, since it may + * fault if (for example) a sigreturn specifies bad segment + * registers. The fault is handled in trap.c. 
+ */ +doreti_exit: + movl HYPERVISOR_shared_info,%esi + XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks (sti) + + .globl scrit +scrit: + XEN_TEST_PENDING(%esi) + jnz hypervisor_callback_pending /* More to go */ + + MEXITCOUNT + + .globl doreti_popl_fs +doreti_popl_fs: + popl %fs + .globl doreti_popl_es +doreti_popl_es: + popl %es + .globl doreti_popl_ds +doreti_popl_ds: + popl %ds + + /* + * This is important: as nothing is atomic over here (we can get + * interrupted any time), we use the critical_region_fixup() in + * order to figure out where out stack is. Therefore, do NOT use + * 'popal' here without fixing up the table! + */ + POPA + addl $8,%esp + .globl doreti_iret +doreti_iret: + jmp hypercall_page + (__HYPERVISOR_iret * 32) + .globl ecrit +ecrit: + /* + * doreti_iret_fault and friends. Alternative return code for + * the case where we get a fault in the doreti_exit code + * above. trap() (i386/i386/trap.c) catches this specific + * case, sends the process a signal and continues in the + * corresponding place in the code below. + */ + ALIGN_TEXT + .globl doreti_iret_fault +doreti_iret_fault: + subl $8,%esp + pushal + pushl %ds + .globl doreti_popl_ds_fault +doreti_popl_ds_fault: + pushl %es + .globl doreti_popl_es_fault +doreti_popl_es_fault: + pushl %fs + .globl doreti_popl_fs_fault +doreti_popl_fs_fault: + movl $0,TF_ERR(%esp) /* XXX should be the error code */ + movl $T_PROTFLT,TF_TRAPNO(%esp) + jmp alltraps_with_regs_pushed + + /* +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. 
+*/ + +.globl critical_region_fixup +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + movl %esp,%esi + add %eax,%esi # %esi points at end of src region + movl %esp,%edi + add $0x40,%edi # %edi points at end of dst region + movl %eax,%ecx + shr $2,%ecx # convert bytes to words + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp hypervisor_callback_pending + + +critical_fixup_table: +.byte 0x0,0x0,0x0 #testb $0x1,(%esi) +.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea +.byte 0x0,0x0 #pop %fs +.byte 0x04 #pop %es +.byte 0x08 #pop %ds +.byte 0x0c #pop %edi +.byte 0x10 #pop %esi +.byte 0x14 #pop %ebp +.byte 0x18 #pop %ebx +.byte 0x1c #pop %ebx +.byte 0x20 #pop %edx +.byte 0x24 #pop %ecx +.byte 0x28 #pop %eax +.byte 0x2c,0x2c,0x2c #add $0x8,%esp +.byte 0x34 #iret + + +/* # Hypervisor uses this for application faults while it executes.*/ +ENTRY(failsafe_callback) + pushal + call xen_failsafe_handler +/*# call install_safe_pf_handler */ + movl 28(%esp),%ebx +1: movl %ebx,%ds + movl 32(%esp),%ebx +2: movl %ebx,%es + movl 36(%esp),%ebx +3: movl %ebx,%fs + movl 40(%esp),%ebx +4: movl %ebx,%gs +/*# call install_normal_pf_handler */ + popal + addl $12,%esp + iret + + diff --git a/sys/i386/xen/locore.s b/sys/i386/xen/locore.s new file mode 100644 index 0000000..634405d --- /dev/null +++ b/sys/i386/xen/locore.s @@ -0,0 +1,372 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 + * $FreeBSD$ + * + * originally from: locore.s, by William F. Jolitz + * + * Substantially rewritten by David Greenman, Rod Grimes, + * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp + * and many others. 
+ */ + +#include "opt_bootp.h" +#include "opt_compat.h" +#include "opt_nfsroot.h" +#include "opt_global.h" +#include "opt_pmap.h" + +#include <sys/syscall.h> +#include <sys/reboot.h> + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/psl.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> + +#define __ASSEMBLY__ +#include <xen/interface/elfnote.h> + +/* The defines below have been lifted out of <machine/xen-public/arch-x86_32.h> */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define KERNEL_CS FLAT_RING1_CS +#define KERNEL_DS FLAT_RING1_DS + +#include "assym.s" + +.section __xen_guest +#if 0 + .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" + .byte 0 +#endif + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) +#if 0 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") + +#ifdef PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) + + + +/* + * XXX + * + * Note: This version greatly munged to avoid various assembler errors + * that may be fixed in newer versions of gas. Perhaps newer versions + * will have more pleasant appearance. + */ + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). 
+ */ + .globl PTmap,PTD,PTDpde + .set PTmap,(PTDPTDI << PDRSHIFT) + .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) + .set PTDpde,PTD + (PTDPTDI * PDESIZE) + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase,KERNBASE + .globl kernload + .set kernload,KERNLOAD + +/* + * Globals + */ + .data + ALIGN_DATA /* just to be sure */ + + .space 0x2000 /* space for tmpstk - temporary stack */ +tmpstk: + + .globl bootinfo +bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + + .globl KERNend +KERNend: .long 0 /* phys addr end of kernel (just after bss) */ + .globl physfree +physfree: .long 0 /* phys addr of next free page */ + +#ifdef SMP + .globl cpu0prvpage +cpu0pp: .long 0 /* phys addr cpu0 private pg */ +cpu0prvpage: .long 0 /* relocated version */ + + .globl SMPpt +SMPptpa: .long 0 /* phys addr SMP page table */ +SMPpt: .long 0 /* relocated version */ +#endif /* SMP */ + + .globl IdlePTD +IdlePTD: .long 0 /* phys addr of kernel PTD */ + +#ifdef PAE + .globl IdlePDPT +IdlePDPT: .long 0 /* phys addr of kernel PDPT */ +#endif + +#ifdef SMP + .globl KPTphys +#endif +KPTphys: .long 0 /* phys addr of kernel page tables */ + + .globl proc0kstack +proc0uarea: .long 0 /* address of proc 0 uarea (unused)*/ +proc0kstack: .long 0 /* address of proc 0 kstack space */ +p0upa: .long 0 /* phys addr of proc0 UAREA (unused) */ +p0kpa: .long 0 /* phys addr of proc0's STACK */ + +vm86phystk: .long 0 /* PA of vm86/bios stack */ + + .globl vm86paddr, vm86pa +vm86paddr: .long 0 /* address of vm86 region */ +vm86pa: .long 0 /* phys addr of vm86 region */ + +#ifdef PC98 + .globl pc98_system_parameter +pc98_system_parameter: + .space 0x240 +#endif + + .globl avail_space +avail_space: .long 0 + +/********************************************************************** + * + * Some handy macros + * + */ + +/* + * We're already in protected mode, so no remapping is needed. + */ +#define R(foo) (foo) + +#define ALLOCPAGES(foo) \ + movl R(physfree), %esi ; \ + movl $((foo)*PAGE_SIZE), %eax ; \ + addl %esi, %eax ; \ + movl %eax, R(physfree) ; \ + movl %esi, %edi ; \ + movl $((foo)*PAGE_SIZE),%ecx ; \ + xorl %eax,%eax ; \ + cld ; \ + rep ; \ + stosb + +/* + * fillkpt + * eax = page frame address + * ebx = index into page table + * ecx = how many pages to map + * base = base address of page dir/table + * prot = protection bits + */ +#define fillkpt(base, prot) \ + shll $PTESHIFT,%ebx ; \ + addl base,%ebx ; \ + orl $PG_V,%eax ; \ + orl prot,%eax ; \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $PTESIZE,%ebx ; /* next pte */ \ + loop 1b + +/* + * fillkptphys(prot) + * eax = physical address + * ecx = how many pages to map + * prot = protection bits + */ +#define fillkptphys(prot) \ + movl %eax, %ebx ; \ + shrl $PAGE_SHIFT, %ebx ; \ + fillkpt(R(KPTphys), prot) + +/* Temporary stack */ +.space 8192 +tmpstack: + .long tmpstack, KERNEL_DS + + .text + +.p2align 12, 0x90 + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + .cfi_startproc + .skip 0x1000 + .cfi_endproc + +/********************************************************************** + * + * This is where the bootblocks start us, set the ball rolling... 
+ * + */ +NON_GPROF_ENTRY(btext) + /* At the end of our stack, we shall have free space - so store it */ + movl %esp,%ebx + movl %ebx,R(avail_space) + + lss tmpstack,%esp + + pushl %esi + call initvalues + popl %esi + + /* Store the CPUID information */ + xorl %eax,%eax + cpuid # cpuid 0 + movl %eax,R(cpu_high) # highest capability + movl %ebx,R(cpu_vendor) # store vendor string + movl %edx,R(cpu_vendor+4) + movl %ecx,R(cpu_vendor+8) + movb $0,R(cpu_vendor+12) + + movl $1,%eax + cpuid # cpuid 1 + movl %eax,R(cpu_id) # store cpu_id + movl %ebx,R(cpu_procinfo) # store cpu_procinfo + movl %edx,R(cpu_feature) # store cpu_feature + movl %ecx,R(cpu_feature2) # store cpu_feature2 + rorl $8,%eax # extract family type + andl $15,%eax + cmpl $5,%eax + movl $CPU_686,R(cpu) + + movl proc0kstack,%eax + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + xorl %ebp,%ebp /* mark end of frames */ +#ifdef PAE + movl IdlePDPT,%esi +#else + movl IdlePTD,%esi +#endif + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + pushl physfree + call init386 + addl $4, %esp + call mi_startup + /* NOTREACHED */ + int $3 + +/* + * Signal trampoline, copied to top of user stack + */ +NON_GPROF_ENTRY(sigcode) + calll *SIGF_HANDLER(%esp) + leal SIGF_UC(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC_EFLAGS(%eax) + jne 1f + mov UC_GS(%eax), %gs /* restore %gs */ +1: + movl $SYS_sigreturn,%eax + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b + +#ifdef COMPAT_FREEBSD4 + ALIGN_TEXT +freebsd4_sigcode: + calll *SIGF_HANDLER(%esp) + leal SIGF_UC4(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC4_EFLAGS(%eax) + jne 1f + mov UC4_GS(%eax),%gs /* restore %gs */ +1: + movl $344,%eax /* 4.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b +#endif + +#ifdef COMPAT_43 + ALIGN_TEXT +osigcode: + call *SIGF_HANDLER(%esp) /* call signal handler */ + lea SIGF_SC(%esp),%eax /* get sigcontext */ + pushl %eax + testl $PSL_VM,SC_PS(%eax) + jne 9f + movl SC_GS(%eax),%gs /* restore %gs */ +9: + movl $103,%eax /* 3.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ +0: jmp 0b +#endif /* COMPAT_43 */ + + ALIGN_TEXT +esigcode: + + .data + .globl szsigcode +szsigcode: + .long esigcode-sigcode +#ifdef COMPAT_FREEBSD4 + .globl szfreebsd4_sigcode +szfreebsd4_sigcode: + .long esigcode-freebsd4_sigcode +#endif +#ifdef COMPAT_43 + .globl szosigcode +szosigcode: + .long esigcode-osigcode +#endif diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c new file mode 100644 index 0000000..d57a02f --- /dev/null +++ b/sys/i386/xen/pmap.c @@ -0,0 +1,4128 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. 
+ * + * In addition to hardware address maps, this + * module is called upon to provide software-use-only + * maps which may or may not be stored in the same + * form as hardware maps. These pseudo-maps are + * used to store intermediate results from copy + * operations to and from address spaces. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. + */ + +#define PMAP_DIAGNOSTIC + +#include "opt_cpu.h" +#include "opt_pmap.h" +#include "opt_msgbuf.h" +#include "opt_smp.h" +#include "opt_xbox.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sx.h> +#include <sys/vmmeter.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#ifdef SMP +#include <sys/smp.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/specialreg.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +#ifdef XBOX +#include <machine/xbox.h> +#endif + +#include <xen/interface/xen.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/hypercall.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> + +#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#if defined(DIAGNOSTIC) +#define PMAP_DIAGNOSTIC +#endif + +#if !defined(PMAP_DIAGNOSTIC) +#define PMAP_INLINE __gnu89_inline +#else +#define PMAP_INLINE +#endif + +#define PV_STATS +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) +#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) + +#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) +#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) +#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) +#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) +#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) + +#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +int pgeflag = 0; /* PG_G or-in */ +int pseflag = 0; /* PG_PS or-in */ + +static int nkpt; +vm_offset_t 
kernel_vm_end; +extern u_int32_t KERNend; + +#ifdef PAE +pt_entry_t pg_nx; +#if !defined(XEN) +static uma_zone_t pdptzone; +#endif +#endif + +/* + * Data for the pv entry allocation mechanism + */ +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static int shpgperproc = PMAP_SHPGPERPROC; + +struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ +int pv_maxchunks; /* How many chunks we have KVA for */ +vm_offset_t pv_vafree; /* freelist stored in the PTE */ + +/* + * All those kernel PT submaps that BSD is so fond of + */ +struct sysmaps { + struct mtx lock; + pt_entry_t *CMAP1; + pt_entry_t *CMAP2; + caddr_t CADDR1; + caddr_t CADDR2; +}; +static struct sysmaps sysmaps_pcpu[MAXCPU]; +pt_entry_t *CMAP1 = 0; +static pt_entry_t *CMAP3; +caddr_t CADDR1 = 0, ptvmmap = 0; +static caddr_t CADDR3; +struct msgbuf *msgbufp = 0; + +/* + * Crashdump maps. + */ +static caddr_t crashdumpmap; + +static pt_entry_t *PMAP1 = 0, *PMAP2; +static pt_entry_t *PADDR1 = 0, *PADDR2; +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); + +static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + vm_page_t *free); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, + vm_page_t *free); +static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, + vm_offset_t va); +static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); + +static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); +static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); +static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); +static void pmap_pte_release(pt_entry_t *pte); +static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); +static vm_offset_t pmap_kmem_choose(vm_offset_t addr); +static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr); + +#if defined(PAE) && !defined(XEN) +static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +#endif + +CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); +CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); + +/* + * If you get an error here, then you set KVA_PAGES wrong! See the + * description of KVA_PAGES in sys/i386/include/pmap.h. It must be + * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 
+ */ +CTASSERT(KERNBASE % (1 << 24) == 0); + + + +static __inline void +pagezero(void *page) +{ +#if defined(I686_CPU) + if (cpu_class == CPUCLASS_686) { +#if defined(CPU_ENABLE_SSE) + if (cpu_feature & CPUID_SSE2) + sse2_pagezero(page); + else +#endif + i686_pagezero(page); + } else +#endif + bzero(page, PAGE_SIZE); +} + +void +pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) +{ + vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); + + switch (type) { + case SH_PD_SET_VA: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + xpmap_ptom(val & ~(PG_RW))); +#endif + xen_queue_pt_update(pdir_ma, + xpmap_ptom(val)); + break; + case SH_PD_SET_VA_MA: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + val & ~(PG_RW)); +#endif + xen_queue_pt_update(pdir_ma, val); + break; + case SH_PD_SET_VA_CLEAR: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, 0); +#endif + xen_queue_pt_update(pdir_ma, 0); + break; + } +} + +/* + * Move the kernel virtual free pointer to the next + * 4MB. This is used to help improve performance + * by using a large (4MB) page for much of the kernel + * (.text, .data, .bss) + */ +static vm_offset_t +pmap_kmem_choose(vm_offset_t addr) +{ + vm_offset_t newaddr = addr; + +#ifndef DISABLE_PSE + if (cpu_feature & CPUID_PSE) + newaddr = (addr + PDRMASK) & ~PDRMASK; +#endif + return newaddr; +} + +/* + * Bootstrap the system enough to run with virtual memory. + * + * On the i386 this is called after mapping has already been enabled + * and just syncs the pmap module with what has already been done. + * [We can't call it easily with mapping off since the kernel is not + * mapped with PA == VA, hence we would have to relocate every address + * from the linked base (virtual) address "KERNBASE" to the actual + * (physical) address starting relative to 0] + */ +void +pmap_bootstrap(vm_paddr_t firstaddr) +{ + vm_offset_t va; + pt_entry_t *pte, *unused; + struct sysmaps *sysmaps; + int i; + + /* + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too + * large. It should instead be correctly calculated in locore.s and + * not based on 'first' (which is a physical address, not a virtual + * address, for the start of unused physical memory). The kernel + * page tables are NOT double mapped and thus should not be included + * in this calculation. + */ + virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + virtual_avail = pmap_kmem_choose(virtual_avail); + + virtual_end = VM_MAX_KERNEL_ADDRESS; + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + PMAP_LOCK_INIT(kernel_pmap); + kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); +#ifdef PAE + kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); +#endif + kernel_pmap->pm_active = -1; /* don't allow deactivation */ + TAILQ_INIT(&kernel_pmap->pm_pvchunk); + LIST_INIT(&allpmaps); + mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + nkpt = NKPT; + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +#define SYSMAP(c, p, v, n) \ + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); + + va = virtual_avail; + pte = vtopte(va); + + /* + * CMAP1/CMAP2 are used for zeroing and copying pages. + * CMAP3 is used for the idle process page zeroing. 
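+	 * Each CPU gets a private CMAP1/CMAP2 pair in sysmaps_pcpu[],
+	 * protected by a per-CPU mutex.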
+ */ + for (i = 0; i < MAXCPU; i++) { + sysmaps = &sysmaps_pcpu[i]; + mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); + SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) + SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) + } + SYSMAP(caddr_t, CMAP1, CADDR1, 1) + SYSMAP(caddr_t, CMAP3, CADDR3, 1) + PT_SET_MA(CADDR3, 0); + + /* + * Crashdump maps. + */ + SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) + + /* + * ptvmmap is used for reading arbitrary physical pages via /dev/mem. + */ + SYSMAP(caddr_t, unused, ptvmmap, 1) + + /* + * msgbufp is used to map the system message buffer. + */ + SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) + + /* + * ptemap is used for pmap_pte_quick + */ + SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); + SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); + + mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); + + virtual_avail = va; + PT_SET_MA(CADDR1, 0); + + /* + * Leave in place an identity mapping (virt == phys) for the low 1 MB + * physical memory region that is used by the ACPI wakeup code. This + * mapping must not have PG_G set. + */ +#ifndef XEN + /* + * leave here deliberately to show that this is not supported + */ +#ifdef XBOX + /* FIXME: This is gross, but needed for the XBOX. Since we are in such + * an early stadium, we cannot yet neatly map video memory ... :-( + * Better fixes are very welcome! */ + if (!arch_i386_is_xbox) +#endif + for (i = 1; i < NKPT; i++) + PTD[i] = 0; + + /* Initialize the PAT MSR if present. */ + pmap_init_pat(); + + /* Turn on PG_G on kernel page(s) */ + pmap_set_pg(); +#endif +} + +/* + * Setup the PAT MSR. + */ +void +pmap_init_pat(void) +{ + uint64_t pat_msr; + + /* Bail if this CPU doesn't implement PAT. */ + if (!(cpu_feature & CPUID_PAT)) + return; + +#ifdef PAT_WORKS + /* + * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. + * Program 4 and 5 as WP and WC. + * Leave 6 and 7 as UC and UC-. + */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); + pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | + PAT_VALUE(5, PAT_WRITE_COMBINING); +#else + /* + * Due to some Intel errata, we can only safely use the lower 4 + * PAT entries. Thus, just replace PAT Index 2 with WC instead + * of UC-. + * + * Intel Pentium III Processor Specification Update + * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B + * or Mode C Paging) + * + * Intel Pentium IV Processor Specification Update + * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) + */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~PAT_MASK(2); + pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); +#endif + wrmsr(MSR_PAT, pat_msr); +} + +/* + * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. + */ +void +pmap_set_pg(void) +{ + pd_entry_t pdir; + pt_entry_t *pte; + vm_offset_t va, endva; + int i; + + if (pgeflag == 0) + return; + + i = KERNLOAD/NBPDR; + endva = KERNBASE + KERNend; + + if (pseflag) { + va = KERNBASE + KERNLOAD; + while (va < endva) { + pdir = kernel_pmap->pm_pdir[KPTDI+i]; + pdir |= pgeflag; + kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; + invltlb(); /* Play it safe, invltlb() every time */ + i++; + va += NBPDR; + } + } else { + va = (vm_offset_t)btext; + while (va < endva) { + pte = vtopte(va); + if (*pte & PG_V) + *pte |= pgeflag; + invltlb(); /* Play it safe, invltlb() every time */ + va += PAGE_SIZE; + } + } +} + +/* + * Initialize a vm_page's machine-dependent fields. 
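+ * On i386 this simply initializes the page's list of pv entries.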
+ */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); +} + +#if defined(PAE) && !defined(XEN) + +static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); + +static void * +pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +{ + *flags = UMA_SLAB_PRIV; + return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, + 1, 0)); +} +#endif + +/* + * ABuse the pte nodes for unmapped kva to thread a kva freelist through. + * Requirements: + * - Must deal with pages in order to ensure that none of the PG_* bits + * are ever set, PG_V in particular. + * - Assumes we can write to ptes without pte_store() atomic ops, even + * on PAE systems. This should be ok. + * - Assumes nothing will ever test these addresses for 0 to indicate + * no mapping instead of correctly checking PG_V. + * - Assumes a vm_offset_t will fit in a pte (true for i386). + * Because PG_V is never set, there can be no mappings to invalidate. + */ +static int ptelist_count = 0; +static vm_offset_t +pmap_ptelist_alloc(vm_offset_t *head) +{ + vm_offset_t va; + vm_offset_t *phead = (vm_offset_t *)*head; + + if (ptelist_count == 0) { + printf("out of memory!!!!!!\n"); + return (0); /* Out of memory */ + } + ptelist_count--; + va = phead[ptelist_count]; + return (va); +} + +static void +pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) +{ + vm_offset_t *phead = (vm_offset_t *)*head; + + phead[ptelist_count++] = va; +} + +static void +pmap_ptelist_init(vm_offset_t *head, void *base, int npages) +{ + int i, nstackpages; + vm_offset_t va; + vm_page_t m; + + nstackpages = (npages + PAGE_SIZE/sizeof(vm_offset_t) - 1)/ (PAGE_SIZE/sizeof(vm_offset_t)); + for (i = 0; i < nstackpages; i++) { + va = (vm_offset_t)base + i * PAGE_SIZE; + m = vm_page_alloc(NULL, i, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + pmap_qenter(va, &m, 1); + } + + *head = (vm_offset_t)base; + for (i = npages - 1; i >= nstackpages; i--) { + va = (vm_offset_t)base + i * PAGE_SIZE; + pmap_ptelist_free(head, va); + } +} + + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + + /* + * Initialize the address space (zone) for the pv entries. Set a + * high water mark so that the system can recover from excessive + * numbers of pv entries. 
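+	 * The limits may be overridden with the vm.pmap.shpgperproc and
+	 * vm.pmap.pv_entries tunables.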
+ */ + TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); + pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); + pv_entry_max = roundup(pv_entry_max, _NPCPV); + pv_entry_high_water = 9 * (pv_entry_max / 10); + + pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); + pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, + PAGE_SIZE * pv_maxchunks); + if (pv_chunkbase == NULL) + panic("pmap_init: not enough kvm for pv chunks"); + pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); +#if defined(PAE) && !defined(XEN) + pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, + NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); +#endif +} + + +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, + "Max number of PV entries"); +SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, + "Page share factor per proc"); + +/*************************************************** + * Low level helper routines..... + ***************************************************/ + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(int mode, boolean_t is_pde) +{ + int pat_flag, pat_index, cache_bits; + + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; + + /* If we don't support PAT, map extended modes to older ones. */ + if (!(cpu_feature & CPUID_PAT)) { + switch (mode) { + case PAT_UNCACHEABLE: + case PAT_WRITE_THROUGH: + case PAT_WRITE_BACK: + break; + case PAT_UNCACHED: + case PAT_WRITE_COMBINING: + case PAT_WRITE_PROTECTED: + mode = PAT_UNCACHEABLE; + break; + } + } + + /* Map the caching mode to a PAT index. */ + switch (mode) { +#ifdef PAT_WORKS + case PAT_UNCACHEABLE: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_UNCACHED: + pat_index = 2; + break; + case PAT_WRITE_COMBINING: + pat_index = 5; + break; + case PAT_WRITE_PROTECTED: + pat_index = 4; + break; +#else + case PAT_UNCACHED: + case PAT_UNCACHEABLE: + case PAT_WRITE_PROTECTED: + pat_index = 3; + break; + case PAT_WRITE_THROUGH: + pat_index = 1; + break; + case PAT_WRITE_BACK: + pat_index = 0; + break; + case PAT_WRITE_COMBINING: + pat_index = 2; + break; +#endif + default: + panic("Unknown caching mode %d\n", mode); + } + + /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ + cache_bits = 0; + if (pat_index & 0x4) + cache_bits |= pat_flag; + if (pat_index & 0x2) + cache_bits |= PG_NC_PCD; + if (pat_index & 0x1) + cache_bits |= PG_NC_PWT; + return (cache_bits); +} +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. + * + * N.B.: Before calling any of the following TLB invalidation functions, + * the calling processor must ensure that all stores updating a non- + * kernel page table are globally performed. Otherwise, another + * processor could cache an old, pre-update entry without being + * invalidated. This can happen one of two ways: (1) The pmap becomes + * active on another processor after its pm_active field is checked by + * one of the following functions but before a store updating the page + * table is globally performed. 
(2) The pmap becomes active on another + * processor before its pm_active field is checked but due to + * speculative loads one of the following functions stills reads the + * pmap as inactive on the other processor. + * + * The kernel page table is exempt because its pm_active field is + * immutable. The kernel page table is always active on every + * processor. + */ +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + u_int cpumask; + u_int other_cpus; + + CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", + pmap, va); + + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invlpg(va); + smp_invlpg(va); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + sched_unpin(); + PT_UPDATES_FLUSH(); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + CTR3(KTR_PMAP, "pmap_invalidate_page: pmap=%p eva=0x%x sva=0x%x", + pmap, sva, eva); + + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + sched_unpin(); + PT_UPDATES_FLUSH(); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + u_int cpumask; + u_int other_cpus; + + CTR1(KTR_PMAP, "pmap_invalidate_page: pmap=%p", pmap); + + sched_pin(); + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invltlb(); + smp_invltlb(); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + sched_unpin(); +} + +void +pmap_invalidate_cache(void) +{ + + sched_pin(); + wbinvd(); + smp_cache_flush(); + sched_unpin(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, 486+ invalidation functions. + * We inline these within pmap.c for speed. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", + pmap, va); + + if (pmap == kernel_pmap || pmap->pm_active) + invlpg(va); + PT_UPDATES_FLUSH(); +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (eva - sva > PAGE_SIZE) + CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + + if (pmap == kernel_pmap || pmap->pm_active) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + PT_UPDATES_FLUSH(); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} + +PMAP_INLINE void +pmap_invalidate_cache(void) +{ + + wbinvd(); +} +#endif /* !SMP */ + +/* + * Are we current address space or kernel? N.B. We return FALSE when + * a pmap's page table is in use because a kernel thread is borrowing + * it. The borrowed page table can change spontaneously, making any + * dependence on its continued use subject to a race condition. 
+ */ +static __inline int +pmap_is_current(pmap_t pmap) +{ + + return (pmap == kernel_pmap || + (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && + (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); +} + +/* + * If the given pmap is not the current or kernel pmap, the returned pte must + * be released by passing it to pmap_pte_release(). + */ +pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_lock(&PMAP2mutex); + newpf = *pde & PG_FRAME; + if ((*PMAP2 & PG_FRAME) != newpf) { + PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); + CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", + pmap, va, (*PMAP2 & 0xffffffff)); + } + + return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte + * being NULL. + */ +static __inline void +pmap_pte_release(pt_entry_t *pte) +{ + + if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { + CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", + *PMAP2); + PT_SET_VA(PMAP2, 0, TRUE); + mtx_unlock(&PMAP2mutex); + } +} + +static __inline void +invlcaddr(void *caddr) +{ + + invlpg((u_int)caddr); + PT_UPDATES_FLUSH(); +} + +/* + * Super fast pmap_pte routine best used when scanning + * the pv lists. This eliminates many coarse-grained + * invltlb calls. Note that many of the pv list + * scans are across different pmaps. It is very wasteful + * to do an entire invltlb for checking a single mapping. + * + * If the given pmap is not the current pmap, vm_page_queue_mtx + * must be held and curthread pinned to a CPU. + */ +static pt_entry_t * +pmap_pte_quick(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + newpf = *pde & PG_FRAME; + if ((*PMAP1 & PG_FRAME) != newpf) { + PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M); + CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x", + pmap, va, (u_long)*PMAP1); + +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + invlcaddr(PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. + */ +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t rtval; + pt_entry_t *pte; + pd_entry_t pde; + pt_entry_t pteval; + + rtval = 0; + PMAP_LOCK(pmap); + pde = pmap->pm_pdir[va >> PDRSHIFT]; + if (pde != 0) { + if ((pde & PG_PS) != 0) { + rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK); + PMAP_UNLOCK(pmap); + return rtval; + } + pte = pmap_pte(pmap, va); + pteval = *pte ? 
xpmap_mtop(*pte) : 0; + rtval = (pteval & PG_FRAME) | (va & PAGE_MASK); + pmap_pte_release(pte); + } + PMAP_UNLOCK(pmap); + return (rtval); +} + +/* + * Routine: pmap_extract_ma + * Function: + * Like pmap_extract, but returns machine address + */ +vm_paddr_t +pmap_extract_ma(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t rtval; + pt_entry_t *pte; + pd_entry_t pde; + + rtval = 0; + PMAP_LOCK(pmap); + pde = pmap->pm_pdir[va >> PDRSHIFT]; + if (pde != 0) { + if ((pde & PG_PS) != 0) { + rtval = (pde & ~PDRMASK) | (va & PDRMASK); + PMAP_UNLOCK(pmap); + return rtval; + } + pte = pmap_pte(pmap, va); + rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); + pmap_pte_release(pte); + } + PMAP_UNLOCK(pmap); + return (rtval); +} + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. + */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pd_entry_t pde; + pt_entry_t pte; + vm_page_t m; + + m = NULL; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + pde = PT_GET(pmap_pde(pmap, va)); + if (pde != 0) { + if (pde & PG_PS) { + if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { + m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | + (va & PDRMASK)); + vm_page_hold(m); + } + } else { + sched_pin(); + pte = PT_GET(pmap_pte_quick(pmap, va)); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + if ((pte & PG_V) && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + vm_page_hold(m); + } + sched_unpin(); + } + } + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + return (m); +} + +/*************************************************** + * Low level mapping routines..... + ***************************************************/ + +/* + * Add a wired page to the kva. + * Note: not SMP coherent. + */ +PMAP_INLINE void +pmap_kenter(vm_offset_t va, vm_paddr_t pa) +{ + PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag); +} + +PMAP_INLINE void +pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma) +{ + pt_entry_t *pte; + + pte = vtopte(va); + pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag); +} + + +PMAP_INLINE void +pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) +{ + PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); +} + +/* + * Remove a page from the kernel pagetables. + * Note: not SMP coherent. + */ +PMAP_INLINE void +pmap_kremove(vm_offset_t va) +{ + pt_entry_t *pte; + + pte = vtopte(va); + PT_CLEAR_VA(pte, FALSE); +} + +/* + * Used to map a range of physical addresses into kernel + * virtual address space. + * + * The value passed in '*virt' is a suggested virtual address for + * the mapping. Architectures which can support a direct-mapped + * physical to virtual region can return the appropriate address + * within that region, leaving '*virt' unchanged. Other + * architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped + * region. 
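+ *
+ * This implementation maps the pages at '*virt' with pmap_kenter(),
+ * advances '*virt' past the mapped range, and returns the original
+ * starting address.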
+ */ +vm_offset_t +pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) +{ + vm_offset_t va, sva; + + va = sva = *virt; + CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x", + va, start, end, prot); + while (start < end) { + pmap_kenter(va, start); + va += PAGE_SIZE; + start += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); + *virt = va; + return (sva); +} + + +/* + * Add a list of wired pages to the kva + * this routine is only used for temporary + * kernel mappings that do not need to have + * page modification or references recorded. + * Note that old mappings are simply written + * over. The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. + */ +void +pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) +{ + pt_entry_t *endpte, *pte; + vm_paddr_t pa; + vm_offset_t va = sva; + int mclcount = 0; + multicall_entry_t mcl[16]; + multicall_entry_t *mclp = mcl; + int error; + + CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count); + pte = vtopte(sva); + endpte = pte + count; + while (pte < endpte) { + pa = xpmap_ptom(VM_PAGE_TO_PHYS(*ma)) | pgeflag | PG_RW | PG_V | PG_M | PG_A; + + mclp->op = __HYPERVISOR_update_va_mapping; + mclp->args[0] = va; + mclp->args[1] = (uint32_t)(pa & 0xffffffff); + mclp->args[2] = (uint32_t)(pa >> 32); + mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0; + + va += PAGE_SIZE; + pte++; + ma++; + mclp++; + mclcount++; + if (mclcount == 16) { + error = HYPERVISOR_multicall(mcl, mclcount); + mclp = mcl; + mclcount = 0; + KASSERT(error == 0, ("bad multicall %d", error)); + } + } + if (mclcount) { + error = HYPERVISOR_multicall(mcl, mclcount); + KASSERT(error == 0, ("bad multicall %d", error)); + } + +#ifdef INVARIANTS + for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++) + KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE)); +#endif +} + + +/* + * This routine tears out page mappings from the + * kernel -- it is meant only for temporary mappings. + * Note: SMP coherent. Uses a ranged shootdown IPI. + */ +void +pmap_qremove(vm_offset_t sva, int count) +{ + vm_offset_t va; + + CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count); + va = sva; + vm_page_lock_queues(); + critical_enter(); + while (count-- > 0) { + pmap_kremove(va); + va += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); + critical_exit(); + vm_page_unlock_queues(); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ +static __inline void +pmap_free_zero_pages(vm_page_t free) +{ + vm_page_t m; + + while (free != NULL) { + m = free; + free = m->right; + vm_page_free_zero(m); + } +} + +/* + * This routine unholds page table pages, and if the hold count + * drops to zero, then it decrements the wire count. 
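+ * Returns 1 if the last wiring was dropped and the page was queued for
+ * freeing, and 0 otherwise.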
+ */ +static __inline int +pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) +{ + + --m->wire_count; + if (m->wire_count == 0) + return _pmap_unwire_pte_hold(pmap, m, free); + else + return 0; +} + +static int +_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) +{ + vm_offset_t pteva; + + PT_UPDATES_FLUSH(); + /* + * unmap the page table page + */ + xen_pt_unpin(pmap->pm_pdir[m->pindex]); + /* + * page *might* contain residual mapping :-/ + */ + PD_CLEAR_VA(pmap, m->pindex, TRUE); + pmap_zero_page(m); + --pmap->pm_stats.resident_count; + + /* + * This is a release store so that the ordinary store unmapping + * the page table page is globally performed before TLB shoot- + * down is begun. + */ + atomic_subtract_rel_int(&cnt.v_wire_count, 1); + + /* + * Do an invltlb to make the invalidated mapping + * take effect immediately. + */ + pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); + pmap_invalidate_page(pmap, pteva); + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done + */ + m->right = *free; + *free = m; + + return 1; +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. + */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) +{ + pd_entry_t ptepde; + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return 0; + ptepde = PT_GET(pmap_pde(pmap, va)); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return pmap_unwire_pte_hold(pmap, mpte, free); +} + +void +pmap_pinit0(pmap_t pmap) +{ + + PMAP_LOCK_INIT(pmap); + pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); +#ifdef PAE + pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); +#endif + pmap->pm_active = 0; + PCPU_SET(curpmap, pmap); + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); +} + +/* + * Initialize a preallocated and zeroed pmap structure, + * such as one in a vmspace structure. + */ +int +pmap_pinit(pmap_t pmap) +{ + vm_page_t m, ptdpg[NPGPTD + 1]; + int npgptd = NPGPTD + 1; + static int color; + int i; + + PMAP_LOCK_INIT(pmap); + + /* + * No need to allocate page table space yet but we do need a valid + * page directory table. 
+ */ + if (pmap->pm_pdir == NULL) { + pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, + NBPTD); + if (pmap->pm_pdir == NULL) { + PMAP_LOCK_DESTROY(pmap); + return (0); + } +#if defined(XEN) && defined(PAE) + pmap->pm_pdpt = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1); +#endif + +#if defined(PAE) && !defined(XEN) + pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); + KASSERT(((vm_offset_t)pmap->pm_pdpt & + ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, + ("pmap_pinit: pdpt misaligned")); + KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), + ("pmap_pinit: pdpt above 4g")); +#endif + } + + /* + * allocate the page directory page(s) + */ + for (i = 0; i < npgptd;) { + m = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (m == NULL) + VM_WAIT; + else { + ptdpg[i++] = m; + } + } + pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); + for (i = 0; i < NPGPTD; i++) { + if ((ptdpg[i]->flags & PG_ZERO) == 0) + pagezero(&pmap->pm_pdir[i*NPTEPG]); + } + + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + /* Wire in kernel global address entries. */ + + bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); +#ifdef PAE +#ifdef XEN + pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1); + if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0) + bzero(pmap->pm_pdpt, PAGE_SIZE); +#endif + for (i = 0; i < NPGPTD; i++) { + vm_paddr_t ma; + + ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + pmap->pm_pdpt[i] = ma | PG_V; + + } +#endif +#ifdef XEN + for (i = 0; i < NPGPTD; i++) { + pt_entry_t *pd; + vm_paddr_t ma; + + ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + pd = pmap->pm_pdir + (i * NPDEPG); + PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW)); +#if 0 + xen_pgd_pin(ma); +#endif + } + +#ifdef PAE + PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW); +#endif + vm_page_lock_queues(); + xen_flush_queue(); + xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[NPGPTD]))); + for (i = 0; i < NPGPTD; i++) { + vm_paddr_t ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE); + } + xen_flush_queue(); + vm_page_unlock_queues(); +#endif + pmap->pm_active = 0; + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + + return (1); +} + +/* + * this routine is called if the page table page is not + * mapped correctly. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, unsigned int ptepindex, int flags) +{ + vm_paddr_t ptema; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (flags & M_WAITOK) { + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); + VM_WAIT; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + } + + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. 
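+	 * Under Xen the page is pinned as a page table (xen_pt_pin()) before
+	 * its machine address is installed in the page directory.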
+ */ + pmap->pm_stats.resident_count++; + + ptema = xpmap_ptom(VM_PAGE_TO_PHYS(m)); + xen_pt_pin(ptema); + PT_SET_VA_MA(&pmap->pm_pdir[ptepindex], + (ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE); + + KASSERT(pmap->pm_pdir[ptepindex], + ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex)); + return (m); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) +{ + unsigned ptepindex; + pd_entry_t ptema; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; +retry: + /* + * Get the page directory entry + */ + ptema = pmap->pm_pdir[ptepindex]; + + /* + * This supports switching from a 4MB page to a + * normal 4K page. + */ + if (ptema & PG_PS) { + /* + * XXX + */ + pmap->pm_pdir[ptepindex] = 0; + ptema = 0; + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + pmap_invalidate_all(kernel_pmap); + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. + */ + if (ptema & PG_V) { + m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); + m->wire_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has + * been deallocated. + */ + CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x", + pmap, va, flags); + m = _pmap_allocpte(pmap, ptepindex, flags); + if (m == NULL && (flags & M_WAITOK)) + goto retry; + + KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex)); + } + return (m); +} + + +/*************************************************** +* Pmap allocation/deallocation routines. + ***************************************************/ + +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. + */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + +#ifdef COUNT_IPIS + (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; +#endif + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask; + u_int mask; + u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&smp_ipi_mtx); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + mymask = PCPU_GET(cpumask); + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&smp_ipi_mtx); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. 
For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); + } +} +#endif /* SMP */ + +/* + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. + */ +void +pmap_release(pmap_t pmap) +{ + vm_page_t m, ptdpg[2*NPGPTD+1]; + vm_paddr_t ma; + int i; +#ifdef XEN +#ifdef PAE + int npgptd = NPGPTD + 1; +#else + int npgptd = NPGPTD; +#endif +#else + int npgptd = NPGPTD; +#endif + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + PT_UPDATES_FLUSH(); + + pmap_lazyfix(pmap); + mtx_lock_spin(&allpmaps_lock); + LIST_REMOVE(pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + + for (i = 0; i < NPGPTD; i++) + ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME); + pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); +#if defined(PAE) && defined(XEN) + ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt)); +#endif + + for (i = 0; i < npgptd; i++) { + m = ptdpg[i]; + ma = xpmap_ptom(VM_PAGE_TO_PHYS(m)); + /* unpinning L1 and L2 treated the same */ + xen_pgd_unpin(ma); +#ifdef PAE + KASSERT(xpmap_ptom(VM_PAGE_TO_PHYS(m)) == (pmap->pm_pdpt[i] & PG_FRAME), + ("pmap_release: got wrong ptd page")); +#endif + m->wire_count--; + atomic_subtract_int(&cnt.v_wire_count, 1); + vm_page_free(m); + } + PMAP_LOCK_DESTROY(pmap); +} + +static int +kvm_size(SYSCTL_HANDLER_ARGS) +{ + unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; + + return sysctl_handle_long(oidp, &ksize, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_size, "IU", "Size of KVM"); + +static int +kvm_free(SYSCTL_HANDLER_ARGS) +{ + unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; + + return sysctl_handle_long(oidp, &kfree, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_free, "IU", "Amount of KVM free"); + +/* + * grow the number of kernel page table entries, if needed + */ +void +pmap_growkernel(vm_offset_t addr) +{ + struct pmap *pmap; + vm_paddr_t ptppaddr; + vm_page_t nkpg; + pd_entry_t newpdir; + + mtx_assert(&kernel_map->system_mtx, MA_OWNED); + if (kernel_vm_end == 0) { + kernel_vm_end = KERNBASE; + nkpt = 0; + while (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + nkpt++; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + } + } + addr = roundup2(addr, PAGE_SIZE * NPTEPG); + if (addr - 1 >= kernel_map->max_offset) + addr = kernel_map->max_offset; + while (kernel_vm_end < addr) { + if (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + continue; + } + + /* + * This index is bogus, but out of the way + */ + nkpg = vm_page_alloc(NULL, nkpt, + VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); + if (!nkpg) + panic("pmap_growkernel: no memory to grow kernel"); + + nkpt++; + + 
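+		/*
+		 * Zero the new page table page and enter it into the kernel
+		 * pmap and into the kernel portion of every pmap on the
+		 * allpmaps list.
+		 */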
pmap_zero_page(nkpg); + ptppaddr = VM_PAGE_TO_PHYS(nkpg); + newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); + PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); + + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(pmap, &allpmaps, pm_list) + PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); + + mtx_unlock_spin(&allpmaps_lock); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + } +} + + +/*************************************************** + * page management routines. + ***************************************************/ + +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); +CTASSERT(_NPCM == 11); + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) +{ + + return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); +} + +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ +#define PC_FREE10 0x0000fffful /* Free values for index 10 */ + +static uint32_t pc_freemask[11] = { + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, + PC_FREE0_9, PC_FREE10 +}; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, + "Current number of pv entries"); + +#ifdef PV_STATS +static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, + "Current number of pv entry chunks"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, + "Current number of pv entry chunks allocated"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, + "Current number of pv entry chunks frees"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, + "Number of times tried to get a chunk page but failed."); + +static long pv_entry_frees, pv_entry_allocs; +static int pv_entry_spare; + +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, + "Current number of pv entry frees"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, + "Current number of pv entry allocs"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, + "Current number of spare pv entries"); + +static int pmap_collect_inactive, pmap_collect_active; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, + "Current number times pmap_collect called on inactive queue"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, + "Current number times pmap_collect called on active queue"); +#endif + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. This is normally called to + * unmap inactive pages, and if necessary, active pages. + */ +static void +pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) +{ + pmap_t pmap; + pt_entry_t *pte, tpte; + pv_entry_t next_pv, pv; + vm_offset_t va; + vm_page_t m, free; + + sched_pin(); + TAILQ_FOREACH(m, &vpq->pl, pageq) { + if (m->hold_count || m->busy) + continue; + TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + /* Avoid deadlock and lock recursion. 
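+			 * Pmaps above the locked pmap in the lock order are
+			 * locked outright; other pmaps are only try-locked
+			 * and skipped if the lock cannot be acquired.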
*/ + if (pmap > locked_pmap) + PMAP_LOCK(pmap); + else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) + continue; + pmap->pm_stats.resident_count--; + pte = pmap_pte_quick(pmap, va); + tpte = pte_load_clear(pte); + KASSERT((tpte & PG_W) == 0, + ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); + if (tpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + if (tpte & PG_M) { + KASSERT((tpte & PG_RW), + ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", + va, (uintmax_t)tpte)); + vm_page_dirty(m); + } + free = NULL; + pmap_unuse_pt(pmap, va, &free); + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + free_pv_entry(pmap, pv); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + } + sched_unpin(); +} + + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + vm_page_t m; + struct pv_chunk *pc; + int idx, field, bit; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 32; + bit = idx % 32; + pc->pc_map[field] |= 1ul << bit; + /* move to head of list */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + for (idx = 0; idx < _NPCM; idx++) + if (pc->pc_map[idx] != pc_freemask[idx]) + return; + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* entire chunk is free, return it */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_unwire(m, 0); + vm_page_free(m); + pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); +} + +/* + * get a new pv_entry, allocating a block from the system + * when needed. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, int try) +{ + static const struct timeval printinterval = { 60, 0 }; + static struct timeval lastprint; + static vm_pindex_t colour; + struct vpgqueues *pq; + int bit, field; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PV_STAT(pv_entry_allocs++); + pv_entry_count++; + if (pv_entry_count > pv_entry_high_water) + if (ratecheck(&lastprint, &printinterval)) + printf("Approaching the limit on PV entries, consider " + "increasing either the vm.pmap.shpgperproc or the " + "vm.pmap.pv_entry_max tunable.\n"); + pq = NULL; +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = bsfl(pc->pc_map[field]); + break; + } + } + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 32 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + for (field = 0; field < _NPCM; field++) + if (pc->pc_map[field] != 0) { + PV_STAT(pv_entry_spare--); + return (pv); /* not full, return */ + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare--); + return (pv); + } + } + /* + * Access to the ptelist "pv_vafree" is synchronized by the page + * queues lock. If "pv_vafree" is currently non-empty, it will + * remain non-empty until pmap_ptelist_alloc() completes. 
+ */ + if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq == + &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if (try) { + pv_entry_count--; + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + /* + * Reclaim pv entries: At first, destroy mappings to + * inactive pages. After that, if a pv chunk entry + * is still needed, destroy mappings to active pages. + */ + if (pq == NULL) { + PV_STAT(pmap_collect_inactive++); + pq = &vm_page_queues[PQ_INACTIVE]; + } else if (pq == &vm_page_queues[PQ_INACTIVE]) { + PV_STAT(pmap_collect_active++); + pq = &vm_page_queues[PQ_ACTIVE]; + } else + panic("get_pv_entry: increase vm.pmap.shpgperproc"); + pmap_collect(pmap, pq); + goto retry; + } + PV_STAT(pc_chunk_count++); + PV_STAT(pc_chunk_allocs++); + colour++; + pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); + pmap_qenter((vm_offset_t)pc, &m, 1); + if ((m->flags & PG_ZERO) == 0) + pagezero(pc); + pc->pc_pmap = pmap; + pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ + for (field = 1; field < _NPCM; field++) + pc->pc_map[field] = pc_freemask[field]; + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare += _NPCPV - 1); + return (pv); +} + +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) + break; + } + KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + free_pv_entry(pmap, pv); +} + +/* + * Create a pv entry for page at pa for + * (pmap, va). + */ +static void +pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pv = get_pv_entry(pmap, FALSE); + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); +} + +/* + * Conditionally create a pv entry. + */ +static boolean_t +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + return (TRUE); + } else + return (FALSE); +} + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) +{ + pt_entry_t oldpte; + vm_page_t m; + + CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x", + pmap, (u_long)*ptq, va); + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpte = *ptq; + PT_SET_VA_MA(ptq, 0, TRUE); + if (oldpte & PG_W) + pmap->pm_stats.wired_count -= 1; + /* + * Machines that don't support invlpg, also don't support + * PG_G. + */ + if (oldpte & PG_G) + pmap_invalidate_page(kernel_pmap, va); + pmap->pm_stats.resident_count -= 1; + /* + * XXX This is not strictly correctly, but somewhere along the line + * we are losing the managed bit on some pages. It is unclear to me + * why, but I think the most likely explanation is that xen's writable + * page table implementation doesn't respect the unused bits. 
+ */ + if ((oldpte & PG_MANAGED) || ((oldpte & PG_V) && (va < VM_MAXUSER_ADDRESS)) + ) { + m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME); + + if (!(oldpte & PG_MANAGED)) + printf("va=0x%x is unmanaged :-( pte=0x%llx\n", va, oldpte); + + if (oldpte & PG_M) { + KASSERT((oldpte & PG_RW), + ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", + va, (uintmax_t)oldpte)); + vm_page_dirty(m); + } + if (oldpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + pmap_remove_entry(pmap, m, va); + } else if ((va < VM_MAXUSER_ADDRESS) && (oldpte & PG_V)) + printf("va=0x%x is unmanaged :-( pte=0x%llx\n", va, oldpte); + + return (pmap_unuse_pt(pmap, va, free)); +} + +/* + * Remove a single page from a process address space + */ +static void +pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) +{ + pt_entry_t *pte; + + CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x", + pmap, va); + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0) + return; + pmap_remove_pte(pmap, pte, va, free); + pmap_invalidate_page(pmap, va); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + +} + +/* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. + */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + pt_entry_t *pte; + vm_page_t free = NULL; + int anyvalid; + + CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x", + pmap, sva, eva); + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if ((sva + PAGE_SIZE == eva) && + ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { + pmap_remove_page(pmap, sva, &free); + goto out; + } + + for (; sva < eva; sva = pdnxt) { + unsigned pdirindex; + + /* + * Calculate index for next page table. + */ + pdnxt = (sva + NBPDR) & ~PDRMASK; + if (pmap->pm_stats.resident_count == 0) + break; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = pmap->pm_pdir[pdirindex]; + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & PG_PS) != 0) { + PD_CLEAR_VA(pmap, pdirindex, TRUE); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + anyvalid = 1; + continue; + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. + */ + if (pdnxt > eva) + pdnxt = eva; + + for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, + sva += PAGE_SIZE) { + if ((*pte & PG_V) == 0) + continue; + + /* + * The TLB entry for a PG_G mapping is invalidated + * by pmap_remove_pte(). 
+ */ + if ((*pte & PG_G) == 0) + anyvalid = 1; + if (pmap_remove_pte(pmap, pte, sva, &free)) + break; + } + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_VA_MA(PMAP1, 0, TRUE); +out: + if (anyvalid) + pmap_invalidate_all(pmap); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(free); +} + +/* + * Routine: pmap_remove_all + * Function: + * Removes this physical page from + * all physical maps in which it resides. + * Reflects back modify bits to the pager. + * + * Notes: + * Original versions of this routine were very + * inefficient because they iteratively called + * pmap_remove (slow...) + */ + +void +pmap_remove_all(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte, tpte; + vm_page_t free; + +#if defined(PMAP_DIAGNOSTIC) + /* + * XXX This makes pmap_remove_all() illegal for non-managed pages! + */ + if (m->flags & PG_FICTITIOUS) { + panic("pmap_remove_all: illegal for unmanaged page, va: 0x%jx", + VM_PAGE_TO_PHYS(m) & 0xffffffff); + } +#endif + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + sched_pin(); + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count--; + pte = pmap_pte_quick(pmap, pv->pv_va); + + tpte = *pte; + PT_SET_VA_MA(pte, 0, TRUE); + if (tpte & PG_W) + pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if (tpte & PG_M) { + KASSERT((tpte & PG_RW), + ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", + pv->pv_va, (uintmax_t)tpte)); + vm_page_dirty(m); + } + free = NULL; + pmap_unuse_pt(pmap, pv->pv_va, &free); + pmap_invalidate_page(pmap, pv->pv_va); + pmap_free_zero_pages(free); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + vm_page_flag_clear(m, PG_WRITEABLE); + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); +} + +/* + * Set the physical protection on the + * specified range of this map as requested. + */ +void +pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + pt_entry_t *pte; + int anychanged; + + CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x", + pmap, sva, eva, prot); + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + pmap_remove(pmap, sva, eva); + return; + } + +#ifdef PAE + if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == + (VM_PROT_WRITE|VM_PROT_EXECUTE)) + return; +#else + if (prot & VM_PROT_WRITE) + return; +#endif + + anychanged = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + for (; sva < eva; sva = pdnxt) { + pt_entry_t obits, pbits; + unsigned pdirindex; + + pdnxt = (sva + NBPDR) & ~PDRMASK; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = pmap->pm_pdir[pdirindex]; + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. 
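+		 * A 4MB mapping is handled by adjusting the protection bits
+		 * directly in the page directory entry.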
+ */ + if ((ptpaddr & PG_PS) != 0) { + if ((prot & VM_PROT_WRITE) == 0) + pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + pmap->pm_pdir[pdirindex] |= pg_nx; +#endif + anychanged = 1; + continue; + } + + if (pdnxt > eva) + pdnxt = eva; + + for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, + sva += PAGE_SIZE) { + vm_page_t m; + +retry: + /* + * Regardless of whether a pte is 32 or 64 bits in + * size, PG_RW, PG_A, and PG_M are among the least + * significant 32 bits. + */ + obits = pbits = *pte; + if ((pbits & PG_V) == 0) + continue; + if (pbits & PG_MANAGED) { + m = NULL; + if (pbits & PG_A) { + m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & PG_FRAME); + vm_page_flag_set(m, PG_REFERENCED); + pbits &= ~PG_A; + } + if ((pbits & PG_M) != 0) { + if (m == NULL) + m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & PG_FRAME); + vm_page_dirty(m); + } + } + + if ((prot & VM_PROT_WRITE) == 0) + pbits &= ~(PG_RW | PG_M); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + pbits |= pg_nx; +#endif + + if (pbits != obits) { +#ifdef XEN + obits = *pte; + PT_SET_VA_MA(pte, pbits, TRUE); + if (*pte != pbits) + goto retry; +#else +#ifdef PAE + if (!atomic_cmpset_64(pte, obits, pbits)) + goto retry; +#else + if (!atomic_cmpset_int((u_int *)pte, obits, + pbits)) + goto retry; +#endif +#endif + if (obits & PG_G) + pmap_invalidate_page(pmap, sva); + else + anychanged = 1; + } + } + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_VA_MA(PMAP1, 0, TRUE); + if (anychanged) + pmap_invalidate_all(pmap); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +void +pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, + vm_prot_t prot, boolean_t wired) +{ + vm_paddr_t pa; + pd_entry_t *pde; + pt_entry_t *pte; + vm_paddr_t opa; + pt_entry_t origpte, newpte; + vm_page_t mpte, om; + boolean_t invlva; + + CTR6(KTR_PMAP, "pmap_enter: pmap=%08p va=0x%08x access=0x%x ma=0x%08x prot=0x%x wired=%d", + pmap, va, access, xpmap_ptom(VM_PAGE_TO_PHYS(m)), prot, wired); + va = trunc_page(va); +#ifdef PMAP_DIAGNOSTIC + if (va > VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: toobig"); + if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) + panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); +#endif + + mpte = NULL; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + sched_pin(); + + /* + * In the case that a page table page is not + * resident, we are creating it here. 
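+	 * Only user addresses get a page table page here; kernel page
+	 * table pages are allocated by pmap_growkernel().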
+ */ + if (va < VM_MAXUSER_ADDRESS) { + mpte = pmap_allocpte(pmap, va, M_WAITOK); + } +#if 0 && defined(PMAP_DIAGNOSTIC) + else { + pd_entry_t *pdeaddr = pmap_pde(pmap, va); + origpte = *pdeaddr; + if ((origpte & PG_V) == 0) { + panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", + pmap->pm_pdir[PTDPTDI], origpte, va); + } + } +#endif + + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) + panic("pmap_enter: attempted pmap_enter on 4MB page"); + pte = pmap_pte_quick(pmap, va); + + /* + * Page Directory table entry not valid, we need a new PT page + */ + if (pte == NULL) { + panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", + (uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va); + } + + pa = VM_PAGE_TO_PHYS(m); + om = NULL; + opa = origpte = 0; + +#if 0 + KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx", + pte, *pte)); +#endif + origpte = *pte; + if (origpte) + origpte = xpmap_mtop(origpte); + opa = origpte & PG_FRAME; + + /* + * Mapping has not changed, must be protection or wiring change. + */ + if (origpte && (opa == pa)) { + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. + */ + if (wired && ((origpte & PG_W) == 0)) + pmap->pm_stats.wired_count++; + else if (!wired && (origpte & PG_W)) + pmap->pm_stats.wired_count--; + + /* + * Remove extra pte reference + */ + if (mpte) + mpte->wire_count--; + + /* + * We might be turning off write access to the page, + * so we go ahead and sense modify status. + */ + if (origpte & PG_MANAGED) { + om = m; + pa |= PG_MANAGED; + } + goto validate; + } + /* + * Mapping has changed, invalidate old range and fall through to + * handle validating new mapping. + */ + if (opa) { + if (origpte & PG_W) + pmap->pm_stats.wired_count--; + if (origpte & PG_MANAGED) { + om = PHYS_TO_VM_PAGE(opa); + pmap_remove_entry(pmap, om, va); + } else if (va < VM_MAXUSER_ADDRESS) + printf("va=0x%x is unmanaged :-( \n", va); + + if (mpte != NULL) { + mpte->wire_count--; + KASSERT(mpte->wire_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%x", va)); + } + } else + pmap->pm_stats.resident_count++; + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, + ("pmap_enter: managed mapping within the clean submap")); + pmap_insert_entry(pmap, va, m); + pa |= PG_MANAGED; + } + + /* + * Increment counters + */ + if (wired) + pmap->pm_stats.wired_count++; + +validate: + /* + * Now validate mapping with desired protection/wiring. + */ + newpte = (pt_entry_t)(pa | PG_V); + if ((prot & VM_PROT_WRITE) != 0) { + newpte |= PG_RW; + vm_page_flag_set(m, PG_WRITEABLE); + } +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + newpte |= pg_nx; +#endif + if (wired) + newpte |= PG_W; + if (va < VM_MAXUSER_ADDRESS) + newpte |= PG_U; + if (pmap == kernel_pmap) + newpte |= pgeflag; + + critical_enter(); + /* + * if the mapping or permission bits are different, we need + * to update the pte. 
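+ * The store goes through PT_SET_VA() rather than a direct write, and
+ * PT_UPDATES_FLUSH() below pushes any updates still queued for the
+ * hypervisor before the critical section is left.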
+ */ + if ((origpte & ~(PG_M|PG_A)) != newpte) { + if (origpte) { + invlva = FALSE; + origpte = *pte; + PT_SET_VA(pte, newpte | PG_A, FALSE); + if (origpte & PG_A) { + if (origpte & PG_MANAGED) + vm_page_flag_set(om, PG_REFERENCED); + if (opa != VM_PAGE_TO_PHYS(m)) + invlva = TRUE; +#ifdef PAE + if ((origpte & PG_NX) == 0 && + (newpte & PG_NX) != 0) + invlva = TRUE; +#endif + } + if (origpte & PG_M) { + KASSERT((origpte & PG_RW), + ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", + va, (uintmax_t)origpte)); + if ((origpte & PG_MANAGED) != 0) + vm_page_dirty(om); + if ((prot & VM_PROT_WRITE) == 0) + invlva = TRUE; + } + if (invlva) + pmap_invalidate_page(pmap, va); + } else{ + PT_SET_VA(pte, newpte | PG_A, FALSE); + } + + } + PT_UPDATES_FLUSH(); + critical_exit(); + if (*PMAP1) + PT_SET_VA_MA(PMAP1, 0, TRUE); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * Maps a sequence of resident pages belonging to the same object. + * The sequence begins with the given page m_start. This page is + * mapped at the given virtual address start. Each subsequent page is + * mapped at a virtual address that is offset from start by the same + * amount as the page is offset from m_start within the object. The + * last page in the sequence is the page with the largest offset from + * m_start that can be mapped at a virtual address less than the given + * virtual address end. Not every virtual page between start and end + * is mapped; only those for which a resident page exists with the + * corresponding offset from m_start are mapped. + */ +void +pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, + vm_page_t m_start, vm_prot_t prot) +{ + vm_page_t m, mpte; + vm_pindex_t diff, psize; + multicall_entry_t mcl[16]; + multicall_entry_t *mclp = mcl; + int error, count = 0; + + VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); + psize = atop(end - start); + + mpte = NULL; + m = m_start; + PMAP_LOCK(pmap); + while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { + mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m, + prot, mpte); + m = TAILQ_NEXT(m, listq); + if (count == 16) { + error = HYPERVISOR_multicall(mcl, count); + KASSERT(error == 0, ("bad multicall %d", error)); + mclp = mcl; + count = 0; + } + } + if (count) { + error = HYPERVISOR_multicall(mcl, count); + KASSERT(error == 0, ("bad multicall %d", error)); + } + + PMAP_UNLOCK(pmap); +} + +/* + * this code makes some *MAJOR* assumptions: + * 1. Current pmap & pmap exists. + * 2. Not wired. + * 3. Read access. + * 4. No page table pages. + * but is *MUCH* faster than pmap_enter... 
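+ * The Xen version also batches the resulting va-mapping updates into
+ * multicall entries (flushed 16 at a time), so a run of prefaulted
+ * pages needs only a few hypercalls.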
+ */ + +void +pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + multicall_entry_t mcl, *mclp; + int count = 0; + mclp = &mcl; + + CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x", + pmap, va, m, prot); + + PMAP_LOCK(pmap); + (void) pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL); + if (count) + HYPERVISOR_multicall(&mcl, count); + PMAP_UNLOCK(pmap); +} + +void +pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count) +{ + int i, error, index = 0; + multicall_entry_t mcl[16]; + multicall_entry_t *mclp = mcl; + + PMAP_LOCK(pmap); + for (i = 0; i < count; i++, addrs++, pages++, prots++) { + if (!pmap_is_prefaultable_locked(pmap, *addrs)) + continue; + + (void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL); + if (index == 16) { + error = HYPERVISOR_multicall(mcl, index); + mclp = mcl; + index = 0; + KASSERT(error == 0, ("bad multicall %d", error)); + } + } + if (index) { + error = HYPERVISOR_multicall(mcl, index); + KASSERT(error == 0, ("bad multicall %d", error)); + } + + PMAP_UNLOCK(pmap); +} + +static vm_page_t +pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte) +{ + pt_entry_t *pte; + vm_paddr_t pa; + vm_page_t free; + multicall_entry_t *mcl = *mclpp; + + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, + ("pmap_enter_quick_locked: managed mapping within the clean submap")); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + unsigned ptepindex; + pd_entry_t ptema; + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { + /* + * Get the page directory entry + */ + ptema = pmap->pm_pdir[ptepindex]; + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. + */ + if (ptema & PG_V) { + if (ptema & PG_PS) + panic("pmap_enter_quick: unexpected mapping into 4MB page"); + mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); + mpte->wire_count++; + } else { + mpte = _pmap_allocpte(pmap, ptepindex, + M_NOWAIT); + if (mpte == NULL) + return (mpte); + } + } + } else { + mpte = NULL; + } + + /* + * This call to vtopte makes the assumption that we are + * entering the page into the current pmap. In order to support + * quick entry into any pmap, one would likely use pmap_pte_quick. + * But that isn't as quick as vtopte. + */ + KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap")); + pte = vtopte(va); + if (*pte & PG_V) { + if (mpte != NULL) { + mpte->wire_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. 
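+ * If no pv entry can be allocated, the page table page reference
+ * taken above is dropped again and the mapping is simply skipped.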
+ */ + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m)) { + if (mpte != NULL) { + free = NULL; + if (pmap_unwire_pte_hold(pmap, mpte, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); + } + + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap->pm_stats.resident_count++; + + pa = VM_PAGE_TO_PHYS(m); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + pa |= pg_nx; +#endif + +#if 0 + /* + * Now validate mapping with RO protection + */ + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + pte_store(pte, pa | PG_V | PG_U); + else + pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); +#else + /* + * Now validate mapping with RO protection + */ + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + pa = xpmap_ptom(pa | PG_V | PG_U); + else + pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED); + + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; + mcl->args[1] = (uint32_t)(pa & 0xffffffff); + mcl->args[2] = (uint32_t)(pa >> 32); + mcl->args[3] = 0; + *mclpp = mcl + 1; + *count = *count + 1; +#endif + return mpte; +} + +/* + * Make a temporary mapping for a physical address. This is only intended + * to be used for panic dumps. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + vm_offset_t va; + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); + invlpg(va); + return ((void *)crashdumpmap); +} + +/* + * This code maps large physical mmap regions into the + * processor address space. Note that some shortcuts + * are taken, but the code works. + */ +void +pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, + vm_size_t size) +{ + vm_page_t p; + + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + KASSERT(object->type == OBJT_DEVICE, + ("pmap_object_init_pt: non-device object")); + if (pseflag && + ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { + int i; + vm_page_t m[1]; + unsigned int ptepindex; + int npdes; + pd_entry_t ptepa; + + PMAP_LOCK(pmap); + if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) + goto out; + PMAP_UNLOCK(pmap); +retry: + p = vm_page_lookup(object, pindex); + if (p != NULL) { + if (vm_page_sleep_if_busy(p, FALSE, "init4p")) + goto retry; + } else { + p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); + if (p == NULL) + return; + m[0] = p; + + if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + return; + } + + p = vm_page_lookup(object, pindex); + vm_page_lock_queues(); + vm_page_wakeup(p); + vm_page_unlock_queues(); + } + + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & (NBPDR - 1)) + return; + + p->valid = VM_PAGE_BITS_ALL; + + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count += size >> PAGE_SHIFT; + npdes = size >> PDRSHIFT; + critical_enter(); + for(i = 0; i < npdes; i++) { + PD_SET_VA(pmap, ptepindex, + ptepa | PG_U | PG_M | PG_RW | PG_V | PG_PS, FALSE); + ptepa += NBPDR; + ptepindex += 1; + } + pmap_invalidate_all(pmap); + critical_exit(); +out: + PMAP_UNLOCK(pmap); + } +} + +/* + * Routine: pmap_change_wiring + * Function: Change the wiring attribute for a map/virtual-address + * pair. + * In/out conditions: + * The mapping must already exist in the pmap. 
+ */ +void +pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) +{ + pt_entry_t *pte; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, va); + + if (wired && !pmap_pte_w(pte)) { + PT_SET_VA_MA((pte), *(pte) | PG_W, TRUE); + pmap->pm_stats.wired_count++; + } else if (!wired && pmap_pte_w(pte)) { + PT_SET_VA_MA((pte), *(pte) & ~PG_W, TRUE); + pmap->pm_stats.wired_count--; + } + + /* + * Wiring is not a hardware characteristic so there is no need to + * invalidate TLB. + */ + pmap_pte_release(pte); + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); +} + + + +/* + * Copy the range specified by src_addr/len + * from the source map to the range dst_addr/len + * in the destination map. + * + * This routine is only advisory and need not do anything. + */ + +void +pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, + vm_offset_t src_addr) +{ + vm_page_t free; + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t pdnxt; + + if (dst_addr != src_addr) + return; + + if (!pmap_is_current(src_pmap)) { + CTR2(KTR_PMAP, + "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx", + (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME)); + + return; + } + CTR5(KTR_PMAP, "pmap_copy: dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x", + dst_pmap, src_pmap, dst_addr, len, src_addr); + + vm_page_lock_queues(); + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + sched_pin(); + for (addr = src_addr; addr < end_addr; addr = pdnxt) { + pt_entry_t *src_pte, *dst_pte; + vm_page_t dstmpte, srcmpte; + pd_entry_t srcptepaddr; + unsigned ptepindex; + + if (addr >= UPT_MIN_ADDRESS) + panic("pmap_copy: invalid to pmap_copy page tables"); + + pdnxt = (addr + NBPDR) & ~PDRMASK; + ptepindex = addr >> PDRSHIFT; + + srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]); + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & PG_PS) { + if (dst_pmap->pm_pdir[ptepindex] == 0) { + PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE); + dst_pmap->pm_stats.resident_count += + NBPDR / PAGE_SIZE; + } + continue; + } + + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); + if (srcmpte->wire_count == 0) + panic("pmap_copy: source page table page is unused"); + + if (pdnxt > end_addr) + pdnxt = end_addr; + + src_pte = vtopte(addr); + while (addr < pdnxt) { + pt_entry_t ptetemp; + ptetemp = *src_pte; + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + dstmpte = pmap_allocpte(dst_pmap, addr, + M_NOWAIT); + if (dstmpte == NULL) + break; + dst_pte = pmap_pte_quick(dst_pmap, addr); + if (*dst_pte == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, + PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) { + /* + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. 
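+ * The source PTE already holds a machine address, so it is copied
+ * with PT_SET_VA_MA(); the KASSERT below verifies that the value
+ * landed in the destination unchanged.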
+ */ + KASSERT(ptetemp != 0, ("src_pte not set")); + PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */); + KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)), + ("no pmap copy expected: 0x%jx saw: 0x%jx", + ptetemp & ~(PG_W | PG_M | PG_A), *dst_pte)); + dst_pmap->pm_stats.resident_count++; + } else { + free = NULL; + if (pmap_unwire_pte_hold(dst_pmap, + dstmpte, &free)) { + pmap_invalidate_page(dst_pmap, + addr); + pmap_free_zero_pages(free); + } + } + if (dstmpte->wire_count >= srcmpte->wire_count) + break; + } + addr += PAGE_SIZE; + src_pte++; + } + } + PT_UPDATES_FLUSH(); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +/* + * pmap_zero_page zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + */ +void +pmap_zero_page(vm_page_t m) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); + pagezero(sysmaps->CADDR2); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * pmap_zero_page_area zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + * + * off and size may not cover an area beyond a single hardware page. + */ +void +pmap_zero_page_area(vm_page_t m, int off, int size) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); + + if (off == 0 && size == PAGE_SIZE) + pagezero(sysmaps->CADDR2); + else + bzero((char *)sysmaps->CADDR2 + off, size); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * pmap_zero_page_idle zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. This + * is intended to be called from the vm_pagezero process only and + * outside of Giant. + */ +void +pmap_zero_page_idle(vm_page_t m) +{ + + if (*CMAP3) + panic("pmap_zero_page: CMAP3 busy"); + sched_pin(); + PT_SET_MA(CADDR3, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); + pagezero(CADDR3); + PT_SET_MA(CADDR3, 0); + sched_unpin(); +} + +/* + * pmap_copy_page copies the specified (machine independent) + * page by mapping the page into virtual memory and using + * bcopy to copy the page, one machine dependent page at a + * time. + */ +void +pmap_copy_page(vm_page_t src, vm_page_t dst) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1) + panic("pmap_copy_page: CMAP1 busy"); + if (*sysmaps->CMAP2) + panic("pmap_copy_page: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR1, PG_V | xpmap_ptom(VM_PAGE_TO_PHYS(src)) | PG_A); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(dst)) | PG_A | PG_M); + bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); + PT_SET_MA(sysmaps->CADDR1, 0); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. 
This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +boolean_t +pmap_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + pv_entry_t pv; + int loops = 0; + + if (m->flags & PG_FICTITIOUS) + return (FALSE); + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (PV_PMAP(pv) == pmap) { + return TRUE; + } + loops++; + if (loops >= 16) + break; + } + return (FALSE); +} + +/* + * pmap_page_wired_mappings: + * + * Return the number of managed mappings to the given physical page + * that are wired. + */ +int +pmap_page_wired_mappings(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + pmap_t pmap; + int count; + + count = 0; + if ((m->flags & PG_FICTITIOUS) != 0) + return (count); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + sched_unpin(); + return (count); +} + +/* + * Remove all pages from specified address space + * this aids process exit speeds. Also, this code + * is special cased for current process only, but + * can have the more generic (and slightly slower) + * mode enabled. This is much faster than pmap_remove + * in the case of running down an entire address space. + */ +void +pmap_remove_pages(pmap_t pmap) +{ + pt_entry_t *pte, tpte; + vm_page_t m, free = NULL; + pv_entry_t pv; + struct pv_chunk *pc, *npc; + int field, idx; + int32_t bit; + uint32_t inuse, bitmask; + int allfree; + + CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); + + if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { + printf("warning: pmap_remove_pages called with non-current pmap\n"); + return; + } + vm_page_lock_queues(); + KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); + PMAP_LOCK(pmap); + sched_pin(); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + for (field = 0; field < _NPCM; field++) { + inuse = (~(pc->pc_map[field])) & pc_freemask[field]; + while (inuse != 0) { + bit = bsfl(inuse); + bitmask = 1UL << bit; + idx = field * 32 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = vtopte(pv->pv_va); + tpte = *pte ? xpmap_mtop(*pte) : 0; + + if (tpte == 0) { + printf( + "TPTE at %p IS ZERO @ VA %08x\n", + pte, pv->pv_va); + panic("bad pte"); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + allfree = 0; + continue; + } + + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); + + + PT_CLEAR_VA(pte, FALSE); + + /* + * Update the vm_page_t clean/reference bits. 
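+ * The mapping is being destroyed, so only the dirty state needs to
+ * be carried over to the vm_page; the referenced bit is of no
+ * further interest here.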
+ */ + if (tpte & PG_M) + vm_page_dirty(m); + + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + + pmap_unuse_pt(pmap, pv->pv_va, &free); + + /* Mark free */ + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc->pc_map[field] |= bitmask; + pmap->pm_stats.resident_count--; + } + } + PT_UPDATES_FLUSH(); + if (allfree) { + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_unwire(m, 0); + vm_page_free(m); + pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); + } + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + + sched_unpin(); + pmap_invalidate_all(pmap); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(free); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +pmap_is_modified(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + pmap_t pmap; + boolean_t rv; + + rv = FALSE; + if (m->flags & PG_FICTITIOUS) + return (rv); + + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + rv = (*pte & PG_M) != 0; + PMAP_UNLOCK(pmap); + if (rv) + break; + } + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); + return (rv); +} + +/* + * pmap_is_prefaultable: + * + * Return whether or not the specified virtual address is elgible + * for prefault. + */ +static boolean_t +pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr) +{ + pt_entry_t *pte; + boolean_t rv = FALSE; + + return (rv); + + if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) { + pte = vtopte(addr); + rv = (*pte == 0); + } + return (rv); +} + +boolean_t +pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) +{ + boolean_t rv; + + PMAP_LOCK(pmap); + rv = pmap_is_prefaultable_locked(pmap, addr); + PMAP_UNLOCK(pmap); + return (rv); +} + +void +pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M))); + PMAP_MARK_PRIV(xpmap_mtop(*pte)); + pmap_pte_release(pte); + } +} + +void +pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); + pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M)); + pmap_pte_release(pte); + } +} + +/* + * Clear the write and modified bits in each of the given page's mappings. + */ +void +pmap_remove_write(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t oldpte, *pte; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((m->flags & PG_FICTITIOUS) != 0 || + (m->flags & PG_WRITEABLE) == 0) + return; + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); +retry: + oldpte = *pte; + if ((oldpte & PG_RW) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. 
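+ * Clearing them with a 32-bit compare-and-swap on the low word is
+ * therefore safe, even when PAE PTEs are in use.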
+ */ + if (!atomic_cmpset_int((u_int *)pte, oldpte, + oldpte & ~(PG_RW | PG_M))) + goto retry; + if ((oldpte & PG_M) != 0) + vm_page_dirty(m); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + vm_page_flag_clear(m, PG_WRITEABLE); + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. + */ +int +pmap_ts_referenced(vm_page_t m) +{ + pv_entry_t pv, pvf, pvn; + pmap_t pmap; + pt_entry_t *pte; + int rtval = 0; + + if (m->flags & PG_FICTITIOUS) + return (rtval); + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pvf = pv; + do { + pvn = TAILQ_NEXT(pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_A) != 0) { + PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + rtval++; + if (rtval > 4) + pvn = NULL; + } + PMAP_UNLOCK(pmap); + } while ((pv = pvn) != NULL && pv != pvf); + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + + sched_unpin(); + return (rtval); +} + +/* + * Clear the modify bits on the specified physical page. + */ +void +pmap_clear_modify(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((m->flags & PG_FICTITIOUS) != 0) + return; + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_M) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_M is among the least significant + * 32 bits. + */ + PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + sched_unpin(); +} + +/* + * pmap_clear_reference: + * + * Clear the reference bit on the specified physical page. + */ +void +pmap_clear_reference(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((m->flags & PG_FICTITIOUS) != 0) + return; + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_A) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_A is among the least significant + * 32 bits. + */ + PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + sched_unpin(); +} + +/* + * Miscellaneous support routines follow + */ + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. 
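+ * Physical addresses covered by the kernel's static KERNBASE mapping
+ * (below KERNLOAD) are returned directly; anything else is given
+ * fresh KVA from kmem_alloc_nofault().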
+ */ +void * +pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) +{ + vm_offset_t va, tmpva, offset; + + offset = pa & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + pa = pa & PG_FRAME; + + if (pa < KERNLOAD && pa + size <= KERNLOAD) + va = KERNBASE + pa; + else + va = kmem_alloc_nofault(kernel_map, size); + if (!va) + panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); + + for (tmpva = va; size > 0; ) { + pmap_kenter_attr(tmpva, pa, mode); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + pa += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, va, tmpva); + pmap_invalidate_cache(); + return ((void *)(va + offset)); +} + +void * +pmap_mapdev(vm_paddr_t pa, vm_size_t size) +{ + + return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); +} + +void * +pmap_mapbios(vm_paddr_t pa, vm_size_t size) +{ + + return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); +} + +void +pmap_unmapdev(vm_offset_t va, vm_size_t size) +{ + vm_offset_t base, offset, tmpva; + + if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) + return; + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + critical_enter(); + for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) + pmap_kremove(tmpva); + pmap_invalidate_range(kernel_pmap, va, tmpva); + critical_exit(); + kmem_free(kernel_map, base, size); +} + +int +pmap_change_attr(va, size, mode) + vm_offset_t va; + vm_size_t size; + int mode; +{ + vm_offset_t base, offset, tmpva; + pt_entry_t *pte; + u_int opte, npte; + pd_entry_t *pde; + + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + + /* Only supported on kernel virtual addresses. */ + if (base <= VM_MAXUSER_ADDRESS) + return (EINVAL); + + /* 4MB pages and pages that aren't mapped aren't supported. */ + for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { + pde = pmap_pde(kernel_pmap, tmpva); + if (*pde & PG_PS) + return (EINVAL); + if ((*pde & PG_V) == 0) + return (EINVAL); + pte = vtopte(va); + if ((*pte & PG_V) == 0) + return (EINVAL); + } + + /* + * Ok, all the pages exist and are 4k, so run through them updating + * their cache mode. + */ + for (tmpva = base; size > 0; ) { + pte = vtopte(tmpva); + + /* + * The cache mode bits are all in the low 32-bits of the + * PTE, so we can just spin on updating the low 32-bits. + */ + do { + opte = *(u_int *)pte; + npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); + npte |= pmap_cache_bits(mode, 0); + PT_SET_VA_MA(pte, npte, TRUE); + } while (npte != opte && (*pte != npte)); + tmpva += PAGE_SIZE; + size -= PAGE_SIZE; + } + + /* + * Flush CPU caches to make sure any data isn't cached that shouldn't + * be, etc. + */ + pmap_invalidate_range(kernel_pmap, base, tmpva); + pmap_invalidate_cache(); + return (0); +} + +/* + * perform the pmap work for mincore + */ +int +pmap_mincore(pmap_t pmap, vm_offset_t addr) +{ + pt_entry_t *ptep, pte; + vm_page_t m; + int val = 0; + + PMAP_LOCK(pmap); + ptep = pmap_pte(pmap, addr); + pte = (ptep != NULL) ? 
PT_GET(ptep) : 0; + pmap_pte_release(ptep); + PMAP_UNLOCK(pmap); + + if (pte != 0) { + vm_paddr_t pa; + + val = MINCORE_INCORE; + if ((pte & PG_MANAGED) == 0) + return val; + + pa = pte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + + /* + * Modified by us + */ + if (pte & PG_M) + val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; + else { + /* + * Modified by someone else + */ + vm_page_lock_queues(); + if (m->dirty || pmap_is_modified(m)) + val |= MINCORE_MODIFIED_OTHER; + vm_page_unlock_queues(); + } + /* + * Referenced by us + */ + if (pte & PG_A) + val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; + else { + /* + * Referenced by someone else + */ + vm_page_lock_queues(); + if ((m->flags & PG_REFERENCED) || + pmap_ts_referenced(m)) { + val |= MINCORE_REFERENCED_OTHER; + vm_page_flag_set(m, PG_REFERENCED); + } + vm_page_unlock_queues(); + } + } + return val; +} + +void +pmap_activate(struct thread *td) +{ + pmap_t pmap, oldpmap; + u_int32_t cr3; + + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + oldpmap = PCPU_GET(curpmap); +#if defined(SMP) + atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); +#else + oldpmap->pm_active &= ~1; + pmap->pm_active |= 1; +#endif +#ifdef PAE + cr3 = vtophys(pmap->pm_pdpt); +#else + cr3 = vtophys(pmap->pm_pdir); +#endif + /* + * pmap_activate is for the current thread on the current cpu + */ + td->td_pcb->pcb_cr3 = cr3; + PT_UPDATES_FLUSH(); + load_cr3(cr3); + + PCPU_SET(curpmap, pmap); + critical_exit(); +} + +vm_offset_t +pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) +{ + + if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { + return addr; + } + + addr = (addr + PDRMASK) & ~PDRMASK; + return addr; +} + + +#if defined(PMAP_DEBUG) +pmap_pid_dump(int pid) +{ + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = vmspace_pmap(p->p_vmspace); + for (i = 0; i < NPDEPTD; i++) { + pd_entry_t *pde; + pt_entry_t *pte; + vm_offset_t base = i << PDRSHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for (j = 0; j < NPTEPG; j++) { + vm_offset_t va = base + (j << PAGE_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + sx_sunlock(&allproc_lock); + return npte; + } + pte = pmap_pte(pmap, va); + if (pte && pmap_pte_v(pte)) { + pt_entry_t pa; + vm_page_t m; + pa = PT_GET(pte); + m = PHYS_TO_VM_PAGE(pa & PG_FRAME); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + sx_sunlock(&allproc_lock); + return npte; +} +#endif + +#if defined(DEBUG) + +static void pads(pmap_t pm); +void pmap_pvdump(vm_paddr_t pa); + +/* print address space of pmap*/ +static void +pads(pmap_t pm) +{ + int i, j; + vm_paddr_t va; + pt_entry_t *ptep; + + if (pm == kernel_pmap) + return; + for (i = 0; i < NPDEPTD; i++) + if (pm->pm_pdir[i]) + for (j = 0; j < NPTEPG; j++) { + va = (i << PDRSHIFT) + (j << PAGE_SHIFT); + if (pm == kernel_pmap && va < KERNBASE) + continue; + if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) + continue; + ptep = pmap_pte(pm, va); + if (pmap_pte_v(ptep)) + printf("%x:%x ", va, *ptep); + }; + +} + +void +pmap_pvdump(vm_paddr_t pa) +{ + pv_entry_t pv; + pmap_t pmap; + 
vm_page_t m; + + printf("pa %x", pa); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); + pads(pmap); + } + printf(" "); +} +#endif diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c new file mode 100644 index 0000000..6dd6bf6 --- /dev/null +++ b/sys/i386/xen/xen_machdep.c @@ -0,0 +1,1315 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysproto.h> + + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/segments.h> +#include <machine/pcb.h> +#include <machine/stdarg.h> +#include <machine/vmparam.h> +#include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/asmacros.h> + + + + +#include <machine/xen/hypervisor.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#include <machine/xen/xenpmap.h> +#include <machine/xen/xenbus.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/memory.h> +#include <machine/xen/features.h> +#ifdef SMP +#include <machine/privatespace.h> +#endif + +#define IDTVEC(name) __CONCAT(X,name) + +extern inthand_t +IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), + IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), + IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), + IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + +int xendebug_flags; +start_info_t *xen_start_info; +shared_info_t *HYPERVISOR_shared_info; +xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; +xen_pfn_t *xen_phys_machine; +int preemptable, init_first; +extern unsigned int avail_space; +extern int gdt_set; + +void ni_cli(void); +void ni_sti(void); + + +void +ni_cli(void) +{ + __asm__("pushl %edx;" + "pushl %eax;" + ); + __cli(); + __asm__("popl %eax;" + "popl %edx;" + ); +} + + +void +ni_sti(void) +{ + __asm__("pushl %edx;" + "pushl %esi;" + "pushl %eax;" + ); + __sti(); + __asm__("popl %eax;" + "popl %esi;" + "popl %edx;" + ); +} + +/* + * Modify the cmd_line by converting ',' to NULLs so that it is in a format + * suitable for the static env vars. + */ +char * +xen_setbootenv(char *cmd_line) +{ + char *cmd_line_next; + + /* Skip leading spaces */ + for (; *cmd_line == ' '; cmd_line++); + + printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line); + + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); + return cmd_line; +} + +static struct +{ + const char *ev; + int mask; +} howto_names[] = { + {"boot_askname", RB_ASKNAME}, + {"boot_single", RB_SINGLE}, + {"boot_nosync", RB_NOSYNC}, + {"boot_halt", RB_ASKNAME}, + {"boot_serial", RB_SERIAL}, + {"boot_cdrom", RB_CDROM}, + {"boot_gdb", RB_GDB}, + {"boot_gdb_pause", RB_RESERVED1}, + {"boot_verbose", RB_VERBOSE}, + {"boot_multicons", RB_MULTIPLE}, + {NULL, 0} +}; + +int +xen_boothowto(char *envp) +{ + int i, howto = 0; + + /* get equivalents from the environment */ + for (i = 0; howto_names[i].ev != NULL; i++) + if (getenv(howto_names[i].ev) != NULL) + howto |= howto_names[i].mask; + return howto; +} + +#define PRINTK_BUFSIZE 1024 +void +printk(const char *fmt, ...) 
+{ + __va_list ap; + int retval; + static char buf[PRINTK_BUFSIZE]; + + va_start(ap, fmt); + retval = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap); + va_end(ap); + buf[retval] = 0; + (void)HYPERVISOR_console_write(buf, retval); +} + + +#define XPQUEUE_SIZE 128 +#ifdef SMP +/* per-cpu queues and indices */ +static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE]; +static int xpq_idx[MAX_VIRT_CPUS]; + +#define XPQ_QUEUE xpq_queue[vcpu] +#define XPQ_IDX xpq_idx[vcpu] +#define SET_VCPU() int vcpu = smp_processor_id() +#else +struct mmu_log { + char *file; + int line; +}; + +static mmu_update_t xpq_queue[XPQUEUE_SIZE]; +static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; +static int xpq_idx = 0; + +#define XPQ_QUEUE xpq_queue +#define XPQ_IDX xpq_idx +#define SET_VCPU() +#endif +#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); + +#if 0 +static void +xen_dump_queue(void) +{ + int _xpq_idx = XPQ_IDX; + int i; + + if (_xpq_idx <= 1) + return; + + printk("xen_dump_queue(): %u entries\n", _xpq_idx); + for (i = 0; i < _xpq_idx; i++) { + printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + } +} +#endif + + +static __inline void +_xen_flush_queue(void) +{ + SET_VCPU(); + int _xpq_idx = XPQ_IDX; + int error, i; + /* window of vulnerability here? */ + + if (__predict_true(gdt_set)) + critical_enter(); + XPQ_IDX = 0; + /* Make sure index is cleared first to avoid double updates. */ + error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE, + _xpq_idx, NULL, DOMID_SELF); + +#if 0 + if (__predict_true(gdt_set)) + for (i = _xpq_idx; i > 0;) { + if (i >= 3) { + CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), (XPQ_QUEUE[i-2].ptr & 0xffffffff), + (XPQ_QUEUE[i-3].val & 0xffffffff), (XPQ_QUEUE[i-3].ptr & 0xffffffff)); + i -= 3; + } else if (i == 2) { + CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), (XPQ_QUEUE[i-2].ptr & 0xffffffff)); + i = 0; + } else { + CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), (XPQ_QUEUE[i-1].ptr & 0xffffffff)); + i = 0; + } + } +#endif + if (__predict_true(gdt_set)) + critical_exit(); + if (__predict_false(error < 0)) { + for (i = 0; i < _xpq_idx; i++) + printf("val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + panic("Failed to execute MMU updates: %d", error); + } + +} + +void +xen_flush_queue(void) +{ + SET_VCPU(); + if (XPQ_IDX != 0) _xen_flush_queue(); +} + +static __inline void +xen_increment_idx(void) +{ + SET_VCPU(); + + XPQ_IDX++; + if (__predict_false(XPQ_IDX == XPQUEUE_SIZE)) + xen_flush_queue(); +} + +void +xen_check_queue(void) +{ + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); +} + +void +xen_invlpg(vm_offset_t va) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.arg1.linear_addr = va & ~PAGE_MASK; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_load_cr3(vm_paddr_t val) +{ + struct mmuext_op op; + + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); + op.cmd = MMUEXT_NEW_BASEPTR; + op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +_xen_machphys_update(unsigned long mfn, unsigned long pfn, char *file, int line) +{ + + if (__predict_true(gdt_set)) + critical_enter(); + SET_VCPU(); + XPQ_QUEUE[XPQ_IDX].ptr = 
(mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = pfn; +#ifdef INVARIANTS + xpq_queue_log[XPQ_IDX].file = file; + xpq_queue_log[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdt_set)) + critical_exit(); +} + +void +_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) +{ + + if (__predict_true(gdt_set)) + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + + if (__predict_true(gdt_set)) + critical_enter(); + SET_VCPU(); + XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; +#ifdef INVARIANTS + xpq_queue_log[XPQ_IDX].file = file; + xpq_queue_log[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdt_set)) + critical_exit(); +} + +void +xen_pgdpt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + printk("xen_pt_pin(): mfn=%x\n", op.arg1.mfn); + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_set_ldt(vm_paddr_t ptr, unsigned long len) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = ptr; + op.arg2.nr_ents = len; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + vm_paddr_t pa; + pt_entry_t *ptp; + + ptp = vtopte((vm_offset_t)table); + pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); + if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) + panic("HYPERVISOR_update_descriptor failed\n"); +} + + +#if 0 +/* + * Bitmap is indexed by page number. If bit is set, the page is part of a + * xen_create_contiguous_region() area of memory. 
+ */ +unsigned long *contiguous_bitmap; + +static void +contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] |= + ((1UL<<end_off)-1) & -(1UL<<start_off); + } else { + contiguous_bitmap[curr_idx] |= -(1UL<<start_off); + while ( ++curr_idx < end_idx ) + contiguous_bitmap[curr_idx] = ~0UL; + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; + } +} + +static void +contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] &= + -(1UL<<end_off) | ((1UL<<start_off)-1); + } else { + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; + while ( ++curr_idx != end_idx ) + contiguous_bitmap[curr_idx] = 0; + contiguous_bitmap[curr_idx] &= -(1UL<<end_off); + } +} +#endif + +/* Ensure multi-page extents are contiguous in machine memory. */ +int +xen_create_contiguous_region(vm_page_t pages, int npages) +{ + unsigned long mfn, i, flags; + int order; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + + balloon_lock(flags); + + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + /* 1. give away machine pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + mfn = PFNTOMFN(pfn); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1); + } + + + /* 2. Get a new contiguous memory extent. */ + reservation.extent_order = order; + /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not + * running with a broxen driver XXXEN + */ + reservation.address_bits = 31; + if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) + goto fail; + + /* 3. Map the new extent in place of old pages. 
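+ * Each replacement frame is entered in the phys-to-machine table
+ * (PFNTOMFN) and the machine-to-phys map is updated through
+ * xen_machphys_update().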
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + xen_machphys_update(mfn+i, pfn); + PFNTOMFN(pfn) = mfn+i; + } + + xen_tlb_flush(); + +#if 0 + contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); +#endif + + balloon_unlock(flags); + + return 0; + + fail: + reservation.extent_order = 0; + reservation.address_bits = 0; + + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation) != 1); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); + + return ENOMEM; +} + +void +xen_destroy_contiguous_region(void *addr, int npages) +{ + unsigned long mfn, i, flags, order, pfn0; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + pfn0 = vtophys(addr) >> PAGE_SHIFT; +#if 0 + scrub_pages(vstart, 1 << order); +#endif + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + balloon_lock(flags); + +#if 0 + contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); +#endif + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val = 0; + pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; + + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0)); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_decrease_reservation, &reservation) != 1); + } + + /* 2. Map new pages in place of old pages. 
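+ * Frames are reclaimed one at a time (extent_order is 0 here),
+ * remapped at their old kernel addresses with update_va_mapping, and
+ * the P2M/M2P entries refreshed.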
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val; + pfn = pfn0 + i; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1); + + new_val = mfn << PAGE_SHIFT; + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE), + new_val, PG_KERNEL)); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); +} + +extern unsigned long cpu0prvpage; +extern unsigned long *SMPpt; +extern struct user *proc0uarea; +extern vm_offset_t proc0kstack; +extern int vm86paddr, vm86phystk; +char *bootmem_start, *bootmem_current, *bootmem_end; + +pteinfo_t *pteinfo_list; +void initvalues(start_info_t *startinfo); + +struct ringbuf_head *xen_store; /* XXX move me */ +char *console_page; + +void * +bootmem_alloc(unsigned int size) +{ + char *retptr; + + retptr = bootmem_current; + PANIC_IF(retptr + size > bootmem_end); + bootmem_current += size; + + return retptr; +} + +void +bootmem_free(void *ptr, unsigned int size) +{ + char *tptr; + + tptr = ptr; + PANIC_IF(tptr != bootmem_current - size || + bootmem_current - size < bootmem_start); + + bootmem_current -= size; +} + +#if 0 +static vm_paddr_t +xpmap_mtop2(vm_paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + ) | (mpa & ~PG_FRAME); +} + +static pd_entry_t +xpmap_get_bootpde(vm_paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; +} + +static pd_entry_t +xpmap_get_vbootpde(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return (pde & ~PG_FRAME) | + (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t 8* +xpmap_get_bootptep(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) == 0) + return (void *)-1; +#define PT_MASK 0x003ff000 /* page table address bits */ + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(vm_paddr_t va) +{ + + return xpmap_get_bootptep(va)[0]; +} +#endif + + +vm_paddr_t *pdir_shadow; + +#ifdef ADD_ISA_HOLE +static void +shift_phys_machine(unsigned long *phys_machine, int nr_pages) +{ + + unsigned long *tmp_page, *current_page, *next_page; + int i; + + tmp_page = bootmem_alloc(PAGE_SIZE); + current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); + next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); + bcopy(phys_machine, tmp_page, PAGE_SIZE); + + while (current_page > phys_machine) { + /* save next page */ + bcopy(next_page, tmp_page, PAGE_SIZE); + /* shift down page */ + bcopy(current_page, next_page, PAGE_SIZE); + /* finish swap */ + bcopy(tmp_page, current_page, PAGE_SIZE); + + current_page -= (PAGE_SIZE/sizeof(unsigned long)); + next_page -= (PAGE_SIZE/sizeof(unsigned long)); + } + bootmem_free(tmp_page, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + xen_machphys_update(phys_machine[i], i); + } + memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); + +} +#endif + +extern unsigned long physfree; +void +initvalues(start_info_t *startinfo) +{ + int l3_pages, l2_pages, l1_pages, offset; + vm_offset_t cur_space; + struct physdev_set_iopl set_iopl; + + vm_paddr_t KPTphys, IdlePTDma; + vm_paddr_t console_page_ma, xen_store_ma; + vm_offset_t KPTphysoff, tmpva; + vm_paddr_t shinfo; +#ifdef PAE + vm_paddr_t IdlePDPTma, IdlePDPTnewma; + vm_paddr_t pdir_shadow_ma[4], IdlePTDnewma[4]; + pd_entry_t *IdlePDPTnew, *IdlePTDnew; +#else + vm_paddr_t pdir_shadow_ma; 
+#endif + unsigned long i; + + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); +#ifdef notyet + /* + * need to install handler + */ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); +#endif + xen_start_info = startinfo; + xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; + + /* number of pages allocated after the pts + 1*/; + cur_space = xen_start_info->pt_base + + ((xen_start_info->nr_pt_frames) + 3 )*PAGE_SIZE; + printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space); + + printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n", KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), xen_start_info->nr_pt_frames); + xendebug_flags = 0; /* 0xffffffff; */ + + /* allocate 4 pages for bootmem allocator */ + bootmem_start = bootmem_current = (char *)cur_space; + cur_space += (4 * PAGE_SIZE); + bootmem_end = (char *)cur_space; + +#ifdef ADD_ISA_HOLE + shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); +#endif + /* + * pre-zero unused mapped pages - mapped on 4MB boundary + */ +/* + bzero((char *)cur_space, (cur_space + 0x3fffff) % 0x400000); + */ + +#ifdef PAE + IdlePDPT = (pd_entry_t *)startinfo->pt_base; + IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base)); + IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); + IdlePTDma = xpmap_ptom(VTOP(IdlePTD)); + l3_pages = 1; +#else + IdlePTD = (pd_entry_t *)startinfo->pt_base; + IdlePTDma = xpmap_ptom(VTOP(startinfo->pt_base)); + l3_pages = 0; +#endif + l2_pages = 1; + l1_pages = 4; /* XXX not certain if this varies */ + KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE; + + KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff)); + XENPRINTF("IdlePTD %p\n", IdlePTD); + XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " + "mod_start: 0x%lx mod_len: 0x%lx\n", + xen_start_info->nr_pages, xen_start_info->shared_info, + xen_start_info->flags, xen_start_info->pt_base, + xen_start_info->mod_start, xen_start_info->mod_len); + /* Map proc0's KSTACK */ + + proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); + printk("proc0kstack=%u\n", proc0kstack); + + /* vm86/bios stack */ + cur_space += PAGE_SIZE; + + /* Map space for the vm86 region */ + vm86paddr = (vm_offset_t)cur_space; + cur_space += (PAGE_SIZE * 3); + +#ifdef PAE + pdir_shadow = (vm_paddr_t *)cur_space; cur_space += 4*PAGE_SIZE; + bzero(pdir_shadow, 4*PAGE_SIZE); + /* initialize page directory shadow page */ + for (i = 0; i < 4; i++) + pdir_shadow_ma[i] = xpmap_ptom((vm_paddr_t) + VTOP((uint8_t *)pdir_shadow + i*PAGE_SIZE)); +#else + /* initialize page directory shadow page */ + pdir_shadow = (vm_paddr_t *)cur_space; cur_space += PAGE_SIZE; + bzero(pdir_shadow, PAGE_SIZE); + pdir_shadow_ma = xpmap_ptom((vm_paddr_t)VTOP(pdir_shadow)); +#endif + +#ifdef PAE + IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; + bzero(IdlePDPTnew, PAGE_SIZE); + IdlePDPTnewma = xpmap_ptom(VTOP(IdlePDPTnew)); + + IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; + bzero(IdlePTDnew, 4*PAGE_SIZE); + for (i = 0; i < 4; i++) + IdlePTDnewma[i] = xpmap_ptom( + VTOP((uint8_t *)IdlePTDnew + i*PAGE_SIZE)); + + + /* + * L3 + */ + for (i = 0; i < 4; i++) + IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; + /* + * L2 + */ + for (i = 0; i < 4; i++) + IdlePTDnew[PTDPTDI + i] = pdir_shadow_ma[i] | PG_V; + + /* + * Map IdlePTD at PTD + */ + for (i = 0; i < 4; i++) + pdir_shadow[PTDPTDI + i] = 
IdlePTDnewma[i] | PG_V; + + + PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); + xen_pt_unpin(IdlePDPTma); +#endif +#ifndef PAE + xen_queue_pt_update(IdlePTDma + PTDPTDI*sizeof(vm_paddr_t), + pdir_shadow_ma | PG_KERNEL); + xen_flush_queue(); +#endif +#if 0 + xen_queue_pt_update(pdir_shadow_ma + (KPTDI + ISA_PDR_OFFSET)*sizeof(vm_paddr_t), + KPTphys | PG_V | PG_A); + xen_flush_queue(); +#endif + /* unmap remaining pages from initial 4MB chunk */ + for (tmpva = cur_space; (tmpva & ((1<<22)-1)) != 0; tmpva += PAGE_SIZE) { + bzero((char *)tmpva, PAGE_SIZE); + PT_SET_MA(tmpva, (vm_paddr_t)0); + } + +#ifdef PAE + offset = 0; +#else + offset = KPTDI; +#endif + /* allocate remainder of NKPT pages */ + for (i = l1_pages; i < NKPT; i++, cur_space += PAGE_SIZE) { + xen_pt_pin(xpmap_ptom(VTOP(cur_space))); + xen_queue_pt_update((vm_paddr_t)(IdlePTDma + (offset + i)*sizeof(vm_paddr_t)), + xpmap_ptom(VTOP(cur_space)) | PG_KERNEL); + } + + PT_UPDATES_FLUSH(); + /* + * L1 - can't copy Xen's mappings + */ + for (i = 0; i < 256; i++) + pdir_shadow[1536 + i] = IdlePTD[i] & ~(PG_RW|PG_A|PG_M); + memcpy((uint8_t *)IdlePTDnew + 3*PAGE_SIZE, IdlePTD, PAGE_SIZE/2); + printk("do remapping\n"); + for (i = 0; i < 4; i++) { + PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE, + IdlePTDnewma[i] | PG_V); + PT_SET_MA((uint8_t *)pdir_shadow + i*PAGE_SIZE, + pdir_shadow_ma[i] | PG_V); + } + + xen_load_cr3(VTOP(IdlePDPTnew)); + xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew))); + for (i = 0; i < 4; i++) { + xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[2] + (PTDPTDI - 1024 + i)*sizeof(vm_paddr_t)), + IdlePTDnewma[i] | PG_V); + } + PT_UPDATES_FLUSH(); + + IdlePTD = IdlePTDnew; + IdlePDPT = IdlePDPTnew; + IdlePDPTma = IdlePDPTnewma; + + /* allocate page for gdt */ + gdt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + /* allocate page for ldt */ + ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + + HYPERVISOR_shared_info = (shared_info_t *)cur_space; + cur_space += PAGE_SIZE; + + /* + * shared_info is an unsigned long so this will randomly break if + * it is allocated above 4GB - I guess people are used to that + * sort of thing with Xen ... 
sigh + */ + shinfo = xen_start_info->shared_info; + PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL); + + printk("#4\n"); + + xen_store = (struct ringbuf_head *)cur_space; + cur_space += PAGE_SIZE; + + xen_store_ma = (xen_start_info->store_mfn << PAGE_SHIFT); + PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL); + console_page = (char *)cur_space; + cur_space += PAGE_SIZE; + console_page_ma = (xen_start_info->console.domU.mfn << PAGE_SHIFT); + PT_SET_MA(console_page, console_page_ma | PG_KERNEL); + + printk("#5\n"); + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = (unsigned long)xen_phys_machine; +#if 0 && defined(SMP) + for (i = 0; i < ncpus; i++) { + int j, npages = (sizeof(struct privatespace) + 1)/PAGE_SIZE; + + for (j = 0; j < npages; j++) { + vm_paddr_t ma = xpmap_ptom(cur_space); + cur_space += PAGE_SIZE; + PT_SET_VA_MA(SMPpt + i*npages + j, ma | PG_KERNEL, FALSE); + } + } + xen_flush_queue(); +#endif + + set_iopl.iopl = 1; + PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl)); + printk("#6\n"); +#if 0 + /* add page table for KERNBASE */ + xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_KERNEL)); + xen_flush_queue(); +#ifdef PAE + xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_V | PG_A)); +#else + xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t), + xpmap_ptom(VTOP(cur_space) | PG_V | PG_A)); +#endif + xen_flush_queue(); +#endif + cur_space += PAGE_SIZE; + printk("#6\n"); + +#ifdef notyet + if (xen_start_info->flags & SIF_INITDOMAIN) { + /* Map first megabyte */ + for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE) + PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD); + xen_flush_queue(); + } +#endif + /* + * re-map kernel text read-only + * + */ + for (i = (((vm_offset_t)&btext) & ~PAGE_MASK); + i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE) + PT_SET_MA(i, xpmap_ptom(VTOP(i)) | PG_V | PG_A); + + printk("#7\n"); + physfree = VTOP(cur_space); + init_first = physfree >> PAGE_SHIFT; + IdlePTD = (pd_entry_t *)VTOP(IdlePTD); + IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT); + setup_xen_features(); + printk("#8, proc0kstack=%u\n", proc0kstack); +} + + +trap_info_t trap_table[] = { + { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, + { 1, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, + { 3, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, + { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, + /* This is UPL on Linux and KPL on BSD */ + { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, + { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, + { 7, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, + /* + * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)}, + * no handler for double fault + */ + { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, + {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, + {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, + {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, + {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, + {14, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, + {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, + {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, + {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, + {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) 
&IDTVEC(mchk)}, + {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, + {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, + { 0, 0, 0, 0 } +}; + + +static void +shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err, howto; + struct reboot_args uap; + + howto = 0; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == EAGAIN) { + free(str, M_DEVBUF); + goto again; + } + + if (strcmp(str, "reboot") == 0) + howto = 0; + else if (strcmp(str, "poweroff") == 0) + howto |= (RB_POWEROFF | RB_HALT); + else if (strcmp(str, "halt") == 0) + howto |= RB_HALT; + else if (strcmp(str, "suspend") == 0) + howto = -1; + else { + printf("Ignoring shutdown request: %s\n", str); + goto done; + } +#ifdef notyet + if (howto == -1) { + do_suspend(NULL); + goto done; + } +#else + if (howto == -1) { + printf("suspend not currently supported\n"); + goto done; + } +#endif + uap.opt = howto; + reboot(curthread, &uap); + done: + free(str, M_DEVBUF); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + + +static void +setup_shutdown_watcher(void *unused) +{ + if (register_xenbus_watch(&shutdown_watch)) + printf("Failed to set shutdown watcher\n"); +} + + +SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL) +#ifdef notyet + +static void +xen_suspend(void *ignore) +{ + int i, j, k, fpp; + + extern void time_resume(void); + extern unsigned long max_pfn; + extern unsigned long *pfn_to_mfn_frame_list_list; + extern unsigned long *pfn_to_mfn_frame_list[]; + +#ifdef CONFIG_SMP +#error "do_suspend must be run cpu 0 - need to create separate thread" + cpumask_t prev_online_cpus; + int vcpu_prepare(int vcpu); +#endif + + int err = 0; + + PANIC_IF(smp_processor_id() != 0); + +#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) + if (num_online_cpus() > 1) { + printk(KERN_WARNING "Can't suspend SMP guests " + "without CONFIG_HOTPLUG_CPU\n"); + return -EOPNOTSUPP; + } +#endif + + xenbus_suspend(); + +#ifdef CONFIG_SMP + lock_cpu_hotplug(); + /* + * Take all other CPUs offline. We hold the hotplug semaphore to + * avoid other processes bringing up CPUs under our feet. + */ + cpus_clear(prev_online_cpus); + while (num_online_cpus() > 1) { + for_each_online_cpu(i) { + if (i == 0) + continue; + unlock_cpu_hotplug(); + err = cpu_down(i); + lock_cpu_hotplug(); + if (err != 0) { + printk(KERN_CRIT "Failed to take all CPUs " + "down: %d.\n", err); + goto out_reenable_cpus; + } + cpu_set(i, prev_online_cpus); + } + } +#endif + + preempt_disable(); + + + __cli(); + preempt_enable(); +#ifdef SMP + unlock_cpu_hotplug(); +#endif + gnttab_suspend(); + + pmap_kremove(HYPERVISOR_shared_info); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn); + + /* + * We'll stop somewhere inside this hypercall. When it returns, + * we'll start resuming after the restore. 
+ */ + HYPERVISOR_suspend(VTOMFN(xen_start_info)); + + pmap_kenter_ma(HYPERVISOR_shared_info, xen_start_info->shared_info); + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + +#if 0 + memset(empty_zero_page, 0, PAGE_SIZE); +#endif + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + VTOMFN(pfn_to_mfn_frame_list_list); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + pfn_to_mfn_frame_list_list[k] = + VTOMFN(pfn_to_mfn_frame_list[k]); + j = 0; + } + pfn_to_mfn_frame_list[k][j] = + VTOMFN(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; + + gnttab_resume(); + + irq_resume(); + + time_resume(); + + __sti(); + + xencons_resume(); + +#ifdef CONFIG_SMP + for_each_cpu(i) + vcpu_prepare(i); + +#endif + + /* + * Only resume xenbus /after/ we've prepared our VCPUs; otherwise + * the VCPU hotplug callback can race with our vcpu_prepare + */ + xenbus_resume(); + +#ifdef CONFIG_SMP + out_reenable_cpus: + for_each_cpu_mask(i, prev_online_cpus) { + j = cpu_up(i); + if ((j != 0) && !cpu_online(i)) { + printk(KERN_CRIT "Failed to bring cpu " + "%d back up (%d).\n", + i, j); + err = j; + } + } +#endif + return err; +} + +#endif +/********** CODE WORTH KEEPING ABOVE HERE *****************/ + +void xen_failsafe_handler(void); + +void +xen_failsafe_handler(void) +{ + + panic("xen_failsafe_handler called!\n"); +} + +void xen_handle_thread_switch(struct pcb *pcb); + +/* This is called by cpu_switch() when switching threads. */ +/* The pcb arg refers to the process control block of the */ +/* next thread which is to run */ +void +xen_handle_thread_switch(struct pcb *pcb) +{ + uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0]; + uint32_t *b = (uint32_t *)&pcb->pcb_fsd; + multicall_entry_t mcl[3]; + int i = 0; + + /* Notify Xen of task switch */ + mcl[i].op = __HYPERVISOR_stack_switch; + mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL); + mcl[i++].args[1] = (unsigned long)pcb; + + /* Check for update of fsd */ + if (*a != *b || *(a+1) != *(b+1)) { + mcl[i].op = __HYPERVISOR_update_descriptor; + *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); + *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; + } + + a += 2; + b += 2; + + /* Check for update of gsd */ + if (*a != *b || *(a+1) != *(b+1)) { + mcl[i].op = __HYPERVISOR_update_descriptor; + *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); + *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; + } + + (void)HYPERVISOR_multicall(mcl, i); +} |
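For comparison, the multicall batch built by xen_handle_thread_switch() above is equivalent to issuing the same hypercalls one at a time; batching them means the guest traps into the hypervisor once per context switch instead of up to three times. The sketch below is illustrative only: the function name is hypothetical, and it assumes the standard Xen single-shot wrappers HYPERVISOR_stack_switch() and HYPERVISOR_update_descriptor() as provided by the hypervisor.h header this file already includes (their prototypes are not shown in this diff).

/*
 * Unbatched sketch of the context-switch hypercalls; the in-tree code
 * above folds these into one HYPERVISOR_multicall() instead.
 */
static void
xen_thread_switch_unbatched(struct pcb *pcb)
{
	uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0];
	uint32_t *b = (uint32_t *)&pcb->pcb_fsd;

	/* Tell Xen which kernel stack to use on the next kernel entry. */
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), (unsigned long)pcb);

	/* Rewrite the per-CPU %fs GDT slot only if the new thread's differs. */
	if (a[0] != b[0] || a[1] != b[1])
		HYPERVISOR_update_descriptor(vtomach((vm_offset_t)a),
		    *(uint64_t *)b);

	/* Likewise for the %gs slot, which follows it in the GDT. */
	if (a[2] != b[2] || a[3] != b[3])
		HYPERVISOR_update_descriptor(vtomach((vm_offset_t)&a[2]),
		    *(uint64_t *)&b[2]);
}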