diff options
Diffstat (limited to 'sys/i386/xen')
-rw-r--r-- | sys/i386/xen/clock.c | 913 | ||||
-rw-r--r-- | sys/i386/xen/exception.s | 494 | ||||
-rw-r--r-- | sys/i386/xen/locore.s | 361 | ||||
-rw-r--r-- | sys/i386/xen/mp_machdep.c | 1249 | ||||
-rw-r--r-- | sys/i386/xen/mptable.c | 107 | ||||
-rw-r--r-- | sys/i386/xen/pmap.c | 4429 | ||||
-rw-r--r-- | sys/i386/xen/xen_clock_util.c | 101 | ||||
-rw-r--r-- | sys/i386/xen/xen_machdep.c | 1260 | ||||
-rw-r--r-- | sys/i386/xen/xen_rtc.c | 144 |
9 files changed, 9058 insertions, 0 deletions
diff --git a/sys/i386/xen/clock.c b/sys/i386/xen/clock.c new file mode 100644 index 0000000..4e43a12 --- /dev/null +++ b/sys/i386/xen/clock.c @@ -0,0 +1,913 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* #define DELAYDEBUG */ +/* + * Routines to handle clock hardware. + */ + +#include "opt_ddb.h" +#include "opt_clock.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timeet.h> +#include <sys/timetc.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/sysctl.h> +#include <sys/cons.h> +#include <sys/power.h> + +#include <machine/clock.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#if defined(SMP) +#include <machine/smp.h> +#endif +#include <machine/specialreg.h> +#include <machine/timerreg.h> + +#include <x86/isa/icu.h> +#include <x86/isa/isa.h> +#include <isa/rtc.h> + +#include <xen/xen_intr.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/pmap.h> +#include <xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/vcpu.h> +#include <machine/cpu.h> +#include <machine/xen/xen_clock_util.h> + +/* + * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we + * can use a simple formula for leap years. 
+ */ +#define LEAPYEAR(y) (!((y) % 4)) +#define DAYSPERYEAR (28+30*4+31*7) + +#ifndef TIMER_FREQ +#define TIMER_FREQ 1193182 +#endif + +#ifdef CYC2NS_SCALE_FACTOR +#undef CYC2NS_SCALE_FACTOR +#endif +#define CYC2NS_SCALE_FACTOR 10 + +/* Values for timerX_state: */ +#define RELEASED 0 +#define RELEASE_PENDING 1 +#define ACQUIRED 2 +#define ACQUIRE_PENDING 3 + +struct mtx clock_lock; +#define RTC_LOCK_INIT \ + mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE) +#define RTC_LOCK mtx_lock_spin(&clock_lock) +#define RTC_UNLOCK mtx_unlock_spin(&clock_lock) + +int adjkerntz; /* local offset from GMT in seconds */ +int clkintr_pending; +int pscnt = 1; +int psdiv = 1; +int wall_cmos_clock; +u_int timer_freq = TIMER_FREQ; +static int independent_wallclock; +static int xen_disable_rtc_set; +static u_long cyc2ns_scale; +static struct timespec shadow_tv; +static uint32_t shadow_tv_version; /* XXX: lazy locking */ +static uint64_t processed_system_time; /* stime (ns) at last processing. */ + +static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; + +SYSCTL_INT(_machdep, OID_AUTO, independent_wallclock, + CTLFLAG_RW, &independent_wallclock, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, xen_disable_rtc_set, + CTLFLAG_RW, &xen_disable_rtc_set, 1, ""); + + +#define do_div(n,base) ({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + __asm("":"=a" (__low), "=d" (__high):"A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + __asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ + __asm("":"=A" (n):"a" (__low),"d" (__high)); \ + __mod; \ +}) + + +#define NS_PER_TICK (1000000000ULL/hz) + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * 
(10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline uint64_t +scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + uint32_t tmp1, tmp2; + + if ( shift < 0 ) + delta >>= -shift; + else + delta <<= shift; + + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), "2" (mul_frac) ); + + return product; +} + +static uint64_t +get_nsec_offset(struct shadow_time_info *shadow) +{ + uint64_t now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_tv_version = s->wc_version; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); + +} + +static void +add_uptime_to_wallclock(void) +{ + struct timespec ut; + + xen_fetch_uptime(&ut); + timespecadd(&shadow_tv, &ut); +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. 
Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + uint32_t pre_version, post_version; + + src = &s->vcpu_info[smp_processor_id()].time; + dst = &per_cpu(shadow_time, smp_processor_id()); + + spinlock_enter(); + do { + pre_version = dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + post_version = src->version; + } + while ((pre_version & 1) | (pre_version ^ post_version)); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; + spinlock_exit(); +} + + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_info[cpu].time; + dst = &per_cpu(shadow_time, cpu); + + rmb(); + return (dst->version == src->version); +} + +static unsigned xen_get_timecount(struct timecounter *tc); + +static struct timecounter xen_timecounter = { + xen_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + ~0u, /* counter_mask */ + 0, /* frequency */ + "ixen", /* name */ + 0 /* quality */ +}; + +static struct eventtimer xen_et; + +struct xen_et_state { + int mode; +#define MODE_STOP 0 +#define MODE_PERIODIC 1 +#define MODE_ONESHOT 2 + int64_t period; + int64_t next; +}; + +static DPCPU_DEFINE(struct xen_et_state, et_state); + +static int +clkintr(void *arg) +{ + int64_t now; + int cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + struct xen_et_state *state = DPCPU_PTR(et_state); + + do { + __get_time_values_from_xen(); + now = shadow->system_timestamp + get_nsec_offset(shadow); + } while (!time_values_up_to_date(cpu)); + + /* Process elapsed ticks since last call. 
*/ + processed_system_time = now; + if (state->mode == MODE_PERIODIC) { + while (now >= state->next) { + state->next += state->period; + if (xen_et.et_active) + xen_et.et_event_cb(&xen_et, xen_et.et_arg); + } + HYPERVISOR_set_timer_op(state->next + 50000); + } else if (state->mode == MODE_ONESHOT) { + if (xen_et.et_active) + xen_et.et_event_cb(&xen_et, xen_et.et_arg); + } + /* + * Take synchronised time from Xen once a minute if we're not + * synchronised ourselves, and we haven't chosen to keep an independent + * time base. + */ + + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version && + !independent_wallclock) { + printf("[XEN] hypervisor wallclock nudged; nudging TOD.\n"); + update_wallclock(); + add_uptime_to_wallclock(); + tc_setclock(&shadow_tv); + } + + /* XXX TODO */ + return (FILTER_HANDLED); +} +static uint32_t +getit(void) +{ + struct shadow_time_info *shadow; + uint64_t time; + uint32_t local_time_version; + + shadow = &per_cpu(shadow_time, smp_processor_id()); + + do { + local_time_version = shadow->version; + barrier(); + time = shadow->system_timestamp + get_nsec_offset(shadow); + if (!time_values_up_to_date(smp_processor_id())) + __get_time_values_from_xen(/*cpu */); + barrier(); + } while (local_time_version != shadow->version); + + return (time); +} + + +/* + * XXX: timer needs more SMP work. + */ +void +i8254_init(void) +{ + + RTC_LOCK_INIT; +} + +/* + * Wait "n" microseconds. + * + * Busy-waits by polling getit(), which returns the low 32 bits of the Xen + * system time, until n * timer_freq / 1000000 counts have elapsed. + * A non-positive n returns after a single getit() call. + */ +void +DELAY(int n) +{ + int delta, ticks_left; + uint32_t tick, prev_tick; +#ifdef DELAYDEBUG + int getit_calls = 1; + int n1; + static int state = 0; + + if (state == 0) { + state = 1; + for (n1 = 1; n1 <= 10000000; n1 *= 10) + DELAY(n1); + state = 2; + } + if (state == 1) + printf("DELAY(%d)...", n); +#endif + /* + * Read the counter first, so that the rest of the setup overhead is + * counted. 
Guess the initial overhead is 20 usec (on most systems it + * takes about 1.5 usec for each of the i/o's in getit(). The loop + * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The + * multiplications and divisions to scale the count take a while). + * + * However, if ddb is active then use a fake counter since reading + * the i8254 counter involves acquiring a lock. ddb must not go + * locking for many reasons, but it calls here for at least atkbd + * input. + */ + prev_tick = getit(); + + n -= 0; /* XXX actually guess no initial overhead */ + /* + * Calculate (n * (timer_freq / 1e6)) without using floating point + * and without any avoidable overflows. + * + * The i8254 fixed-point shortcut (39099 = 1193182 * 2^15 / 10^6) is + * deliberately not used: it is only valid for a 1193182 Hz counter, + * whereas startrtclock() sets timer_freq to 1000000000, so it made + * every delay below 256 usec end after about 1.19 * n counts. + */ + if (n <= 0) + ticks_left = 0; + else + /* + * Don't bother using fixed point, although gcc-2.7.2 + * generates particularly poor code for the long long + * division, since even the slow way will complete long + * before the delay is up (unless we're interrupted). + */ + ticks_left = ((u_int)n * (long long)timer_freq + 999999) + / 1000000; + + while (ticks_left > 0) { + tick = getit(); +#ifdef DELAYDEBUG + ++getit_calls; +#endif + delta = tick - prev_tick; + prev_tick = tick; + if (delta < 0) { + /* + * Guard against timer0_max_count being wrong. + * This shouldn't happen in normal operation, + * but it may happen if set_timer_freq() is + * traced. + */ + /* delta += timer0_max_count; ??? */ + if (delta < 0) + delta = 0; + } + ticks_left -= delta; + } +#ifdef DELAYDEBUG + if (state == 1) + printf(" %d calls to getit() at %d usec each\n", + getit_calls, (n + 5) / getit_calls); +#endif +} + + +/* + * Restore all the timers non-atomically (XXX: should be atomically). 
+ * + * This function is called from pmtimer_resume() to restore all the timers. + * This should not be necessary, but there are broken laptops that do not + * restore all the timers on resume. + */ +void +timer_restore(void) +{ + struct xen_et_state *state = DPCPU_PTR(et_state); + + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. */ + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + state->next = processed_system_time; +} + +void +startrtclock() +{ + unsigned long long alarm; + uint64_t __cpu_khz; + uint32_t cpu_khz; + struct vcpu_time_info *info; + + /* initialize xen values */ + __get_time_values_from_xen(); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + + __cpu_khz = 1000000ULL << 32; + info = &HYPERVISOR_shared_info->vcpu_info[0].time; + + (void)do_div(__cpu_khz, info->tsc_to_system_mul); + if ( info->tsc_shift < 0 ) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; + + printf("Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ + + set_cyc2ns_scale(cpu_khz/1000); + tsc_freq = cpu_khz * 1000; + + timer_freq = 1000000000LL; + xen_timecounter.tc_frequency = timer_freq >> 9; + tc_init(&xen_timecounter); + + rdtscll(alarm); +} + +/* + * RTC support routines + */ + + +static __inline int +readrtc(int port) +{ + return(bcd2bin(rtcin(port))); +} + + +#ifdef XEN_PRIVILEGED_GUEST + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +static void +domu_inittodr(time_t base) +{ + unsigned long sec; + int s, y; + struct timespec ts; + + update_wallclock(); + add_uptime_to_wallclock(); + + RTC_LOCK; + + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? 
adjkerntz : 0); + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + tc_setclock(&shadow_tv); + } + RTC_UNLOCK; +} + +/* + * Write system time back to RTC. + */ +static void +domu_resettodr(void) +{ + unsigned long tm; + int s; + dom0_op_t op; + struct shadow_time_info *shadow; + + shadow = &per_cpu(shadow_time, smp_processor_id()); + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + if ((xen_start_info->flags & SIF_INITDOMAIN) && + !independent_wallclock) + { + op.cmd = DOM0_SETTIME; + op.u.settime.secs = tm; + op.u.settime.nsecs = 0; + op.u.settime.system_time = shadow->system_timestamp; + HYPERVISOR_dom0_op(&op); + update_wallclock(); + add_uptime_to_wallclock(); + } else if (independent_wallclock) { + /* notyet */ + ; + } +} + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + unsigned long sec, days; + int year, month; + int y, m, s; + struct timespec ts; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_inittodr(base); + return; + } + + if (base) { + s = splclock(); + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + splx(s); + } + + /* Look if we have a RTC present and the time is valid */ + if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) + goto wrong_time; + + /* wait for time update to complete */ + /* If RTCSA_TUP is zero, we have at least 244us before next update */ + s = splhigh(); + while (rtcin(RTC_STATUSA) & RTCSA_TUP) { + splx(s); + s = splhigh(); + } + + days = 0; +#ifdef USE_RTC_CENTURY + year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100; +#else + year = readrtc(RTC_YEAR) + 1900; + if (year < 1970) + year += 100; +#endif + if (year < 1970) { + splx(s); + goto wrong_time; + } + month = readrtc(RTC_MONTH); + for (m = 1; m < month; m++) + days += daysinmonth[m-1]; + if ((month > 2) && 
LEAPYEAR(year)) + days ++; + days += readrtc(RTC_DAY) - 1; + for (y = 1970; y < year; y++) + days += DAYSPERYEAR + LEAPYEAR(y); + sec = ((( days * 24 + + readrtc(RTC_HRS)) * 60 + + readrtc(RTC_MIN)) * 60 + + readrtc(RTC_SEC)); + /* sec now contains the number of seconds, since Jan 1 1970, + in the local time zone */ + + sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + y = time_second - sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = sec; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + splx(s); + return; + + wrong_time: + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); +} + + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + unsigned long tm; + int y, m, s; + + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + domu_resettodr(); + return; + } + + if (xen_disable_rtc_set) + return; + + s = splclock(); + tm = time_second; + splx(s); + + /* Disable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); + + /* Calculate local time to put in RTC */ + + tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + + writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ + writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ + writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ + + /* We have now the days since 01-01-1970 in tm */ + writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ + for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); + tm >= m; + y++, m = DAYSPERYEAR + LEAPYEAR(y)) + tm -= m; + + /* Now we have the years in y and the day-of-the-year in tm */ + writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ +#ifdef USE_RTC_CENTURY + writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... 
and Century */ +#endif + for (m = 0; ; m++) { + int ml; + + ml = daysinmonth[m]; + if (m == 1 && LEAPYEAR(y)) + ml++; + if (tm < ml) + break; + tm -= ml; + } + + writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ + writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ + + /* Reenable RTC updates and interrupts. */ + writertc(RTC_STATUSB, RTCSB_24HR); + rtcin(RTC_INTR); +} +#endif + +static int +xen_et_start(struct eventtimer *et, + struct bintime *first, struct bintime *period) +{ + struct xen_et_state *state = DPCPU_PTR(et_state); + struct shadow_time_info *shadow; + int64_t fperiod; + + __get_time_values_from_xen(); + + if (period != NULL) { + state->mode = MODE_PERIODIC; + state->period = (1000000000LL * + (uint32_t)(period->frac >> 32)) >> 32; + if (period->sec != 0) + state->period += 1000000000LL * period->sec; + } else { + state->mode = MODE_ONESHOT; + state->period = 0; + } + if (first != NULL) { + fperiod = (1000000000LL * (uint32_t)(first->frac >> 32)) >> 32; + if (first->sec != 0) + fperiod += 1000000000LL * first->sec; + } else + fperiod = state->period; + + shadow = &per_cpu(shadow_time, smp_processor_id()); + state->next = shadow->system_timestamp + get_nsec_offset(shadow); + state->next += fperiod; + HYPERVISOR_set_timer_op(state->next + 50000); + return (0); +} + +static int +xen_et_stop(struct eventtimer *et) +{ + struct xen_et_state *state = DPCPU_PTR(et_state); + + state->mode = MODE_STOP; + HYPERVISOR_set_timer_op(0); + return (0); +} + +/* + * Start clocks running. + */ +void +cpu_initclocks(void) +{ + unsigned int time_irq; + int error; + + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 0, NULL); + error = bind_virq_to_irqhandler(VIRQ_TIMER, 0, "cpu0:timer", + clkintr, NULL, NULL, INTR_TYPE_CLK, &time_irq); + if (error) + panic("failed to register clock interrupt\n"); + /* should fast clock be enabled ? 
*/ + + bzero(&xen_et, sizeof(xen_et)); + xen_et.et_name = "ixen"; + xen_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | + ET_FLAGS_PERCPU; + xen_et.et_quality = 600; + xen_et.et_frequency = 0; + xen_et.et_min_period.sec = 0; + xen_et.et_min_period.frac = 0x00400000LL << 32; + xen_et.et_max_period.sec = 2; + xen_et.et_max_period.frac = 0; + xen_et.et_start = xen_et_start; + xen_et.et_stop = xen_et_stop; + xen_et.et_priv = NULL; + et_register(&xen_et); + + cpu_initclocks_bsp(); +} + +int +ap_cpu_initclocks(int cpu) +{ + char buf[MAXCOMLEN + 1]; + unsigned int time_irq; + int error; + + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL); + snprintf(buf, sizeof(buf), "cpu%d:timer", cpu); + error = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, buf, + clkintr, NULL, NULL, INTR_TYPE_CLK, &time_irq); + if (error) + panic("failed to register clock interrupt\n"); + + return (0); +} + +static uint32_t +xen_get_timecount(struct timecounter *tc) +{ + uint64_t clk; + struct shadow_time_info *shadow; + shadow = &per_cpu(shadow_time, smp_processor_id()); + + __get_time_values_from_xen(); + + clk = shadow->system_timestamp + get_nsec_offset(shadow); + + return (uint32_t)(clk >> 9); + +} + +/* Return system time offset by ticks */ +uint64_t +get_system_time(int ticks) +{ + return processed_system_time + (ticks * NS_PER_TICK); +} + +void +idle_block(void) +{ + + HYPERVISOR_sched_op(SCHEDOP_block, 0); +} + +int +timer_spkr_acquire(void) +{ + + return (0); +} + +int +timer_spkr_release(void) +{ + + return (0); +} + +void +timer_spkr_setfreq(int freq) +{ + +} + diff --git a/sys/i386/xen/exception.s b/sys/i386/xen/exception.s new file mode 100644 index 0000000..e965ffd --- /dev/null +++ b/sys/i386/xen/exception.s @@ -0,0 +1,494 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_apic.h" +#include "opt_npx.h" + +#include <machine/asmacros.h> +#include <machine/psl.h> +#include <machine/trap.h> + +#include "assym.s" + +#define SEL_RPL_MASK 0x0002 +#define __HYPERVISOR_iret 23 + +/* Offsets into shared_info_t. 
*/ + +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + + +#ifdef SMP +#define GET_VCPU_INFO(reg) movl PCPU(CPUID),reg ; \ + shl $sizeof_vcpu_shift,reg ; \ + addl HYPERVISOR_shared_info,reg +#else +#define GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg +#endif + +#define __DISABLE_INTERRUPTS(reg) movb $1,evtchn_upcall_mask(reg) +#define __ENABLE_INTERRUPTS(reg) movb $0,evtchn_upcall_mask(reg) +#define DISABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ + __DISABLE_INTERRUPTS(reg) +#define ENABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ + __ENABLE_INTERRUPTS(reg) +#define __TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) + +#define POPA \ + popl %edi; \ + popl %esi; \ + popl %ebp; \ + popl %ebx; \ + popl %ebx; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + + .text + +/*****************************************************************************/ +/* Trap handling */ +/*****************************************************************************/ +/* + * Trap and fault vector routines. + * + * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on + * the stack that mostly looks like an interrupt, but does not disable + * interrupts. A few of the traps we use are interrupt gates, + * SDT_SYS386IGT, which are nearly the same thing except interrupts are + * disabled on entry. + * + * The cpu will push a certain amount of state onto the kernel stack for + * the current process. The amount of state depends on the type of trap + * and whether the trap crossed rings or not. See i386/include/frame.h. + * At the very least the current EFLAGS (status register, which includes + * the interrupt disable state prior to the trap), the code segment register, + * and the return instruction pointer are pushed by the cpu. The cpu + * will also push an 'error' code for certain traps. We push a dummy + * error code for those traps where the cpu doesn't in order to maintain + * a consistent frame. 
We also push a contrived 'trap number'. + * + * The cpu does not push the general registers, we must do that, and we + * must restore them prior to calling 'iret'. The cpu adjusts the %cs and + * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we + * must load them with appropriate values for supervisor mode operation. + */ + +MCOUNT_LABEL(user) +MCOUNT_LABEL(btrap) + +#define TRAP(a) pushl $(a) ; jmp alltraps + +IDTVEC(div) + pushl $0; TRAP(T_DIVIDE) +IDTVEC(dbg) + pushl $0; TRAP(T_TRCTRAP) +IDTVEC(nmi) + pushl $0; TRAP(T_NMI) +IDTVEC(bpt) + pushl $0; TRAP(T_BPTFLT) +IDTVEC(ofl) + pushl $0; TRAP(T_OFLOW) +IDTVEC(bnd) + pushl $0; TRAP(T_BOUND) +IDTVEC(ill) + pushl $0; TRAP(T_PRIVINFLT) +IDTVEC(dna) + pushl $0; TRAP(T_DNA) +IDTVEC(fpusegm) + pushl $0; TRAP(T_FPOPFLT) +IDTVEC(tss) + TRAP(T_TSSFLT) +IDTVEC(missing) + TRAP(T_SEGNPFLT) +IDTVEC(stk) + TRAP(T_STKFLT) +IDTVEC(prot) + TRAP(T_PROTFLT) +IDTVEC(page) + TRAP(T_PAGEFLT) +IDTVEC(mchk) + pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd) + pushl $0; TRAP(T_RESERVED) +IDTVEC(fpu) + pushl $0; TRAP(T_ARITHTRAP) +IDTVEC(align) + TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + +IDTVEC(hypervisor_callback) + pushl $0; + pushl $0; + pushal + pushl %ds + pushl %es + pushl %fs +upcall_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) +call_evtchn_upcall: + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 10f + cmpl $ecrit,%eax + jb critical_region_fixup + +10: pushl %esp + call evtchn_do_upcall + addl $4,%esp + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +hypervisor_callback_pending: + DISABLE_INTERRUPTS(%esi) /* cli */ + jmp 10b + /* + * alltraps entry point. Interrupts are enabled if this was a trap + * gate (TGT), else disabled if this was an interrupt gate (IGT). + * Note that int0x80_syscall is a trap gate. Only page faults + * use an interrupt gate. 
+ */ + SUPERALIGN_TEXT + .globl alltraps + .type alltraps,@function +alltraps: + pushal + pushl %ds + pushl %es + pushl %fs + +alltraps_with_regs_pushed: + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + +calltrap: + push %esp + call trap + add $4, %esp + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + +/* + * SYSCALL CALL GATE (old entry point for a.out binaries) + * + * The intersegment call has been set up to specify one dummy parameter. + * + * This leaves a place to put eflags so that the call frame can be + * converted to a trap frame. Note that the eflags is (semi-)bogusly + * pushed into (what will be) tf_err and then copied later into the + * final spot. It has to be done this way because esp can't be just + * temporarily altered for the pushfl - an interrupt might come in + * and clobber the saved cs/eip. + */ + SUPERALIGN_TEXT +IDTVEC(lcall_syscall) + pushfl /* save eflags */ + popl 8(%esp) /* shuffle into tf_eflags */ + pushl $7 /* sizeof "lcall 7,0" */ + subl $4,%esp /* skip over tf_trapno */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call syscall + add $4, %esp + MEXITCOUNT + jmp doreti + +/* + * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) + * + * Even though the name says 'int0x80', this is actually a TGT (trap gate) + * rather than an IGT (interrupt gate). Thus interrupts are enabled on + * entry just as they are for a normal syscall. + */ + SUPERALIGN_TEXT +IDTVEC(int0x80_syscall) + pushl $2 /* sizeof "int 0x80" */ + pushl $0xBEEF /* for debug */ + pushal + pushl %ds + pushl %es + pushl %fs + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call syscall + add $4, %esp + MEXITCOUNT + jmp doreti + +ENTRY(fork_trampoline) + pushl %esp /* trapframe pointer */ + pushl %ebx /* arg1 */ + pushl %esi /* function */ + call fork_exit + addl $12,%esp + /* cut from syscall */ + + /* + * Return via doreti to handle ASTs. 
+ */ + MEXITCOUNT + jmp doreti + + +/* + * To efficiently implement classification of trap and interrupt handlers + * for profiling, there must be only trap handlers between the labels btrap + * and bintr, and only interrupt handlers between the labels bintr and + * eintr. This is implemented (partly) by including files that contain + * some of the handlers. Before including the files, set up a normal asm + * environment so that the included files don't need to know that they are + * included. + */ + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +MCOUNT_LABEL(bintr) + +#ifdef DEV_APIC + .data + .p2align 4 + .text + SUPERALIGN_TEXT + +#include <i386/i386/apic_vector.s> +#endif + + .data + .p2align 4 + .text + SUPERALIGN_TEXT +#include <i386/i386/vm86bios.s> + + .text +MCOUNT_LABEL(eintr) + +/* + * void doreti(struct trapframe) + * + * Handle return from interrupts, traps and syscalls. + */ + .text + SUPERALIGN_TEXT + .type doreti,@function +doreti: + FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ +doreti_next: +#ifdef notyet + /* + * Check if ASTs can be handled now. PSL_VM must be checked first + * since segment registers only have an RPL in non-VM86 mode. + */ + testl $PSL_VM,TF_EFLAGS(%esp) /* are we in vm86 mode? */ + jz doreti_notvm86 + movl PCPU(CURPCB),%ecx + testl $PCB_VM86CALL,PCB_FLAGS(%ecx) /* are we in a vm86 call? */ + jz doreti_ast /* can handle ASTS now if not */ + jmp doreti_exit + +doreti_notvm86: +#endif + testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ + jz doreti_exit /* can't handle ASTs now if not */ + +doreti_ast: + /* + * Check for ASTs atomically with returning. Disabling CPU + * interrupts provides sufficient locking even in the SMP case, + * since we will be informed of any new ASTs by an IPI. 
+	 */
+	DISABLE_INTERRUPTS(%esi)	/* cli */
+	movl	PCPU(CURTHREAD),%eax
+	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax)
+	je	doreti_exit
+	ENABLE_INTERRUPTS(%esi)	/* sti */
+	pushl	%esp			/* pass a pointer to the trapframe */
+	call	ast
+	add	$4,%esp
+	jmp	doreti_ast
+
+	/*
+	 * doreti_exit:	pop registers, iret.
+	 *
+	 *	The segment register pop is a special case, since it may
+	 *	fault if (for example) a sigreturn specifies bad segment
+	 *	registers.  The fault is handled in trap.c.
+	 */
+doreti_exit:
+	ENABLE_INTERRUPTS(%esi) # reenable event callbacks (sti)
+
+	.globl	scrit
+scrit:
+	__TEST_PENDING(%esi)
+	jnz	hypervisor_callback_pending	/* More to go */
+
+	MEXITCOUNT
+
+	.globl	doreti_popl_fs
+doreti_popl_fs:
+	popl	%fs
+	.globl	doreti_popl_es
+doreti_popl_es:
+	popl	%es
+	.globl	doreti_popl_ds
+doreti_popl_ds:
+	popl	%ds
+
+	/*
+	 * This is important: as nothing is atomic over here (we can get
+	 * interrupted any time), we use the critical_region_fixup() in
+	 * order to figure out where our stack is. Therefore, do NOT use
+	 * 'popal' here without fixing up the table!
+	 */
+	POPA
+	addl	$8,%esp
+	.globl	doreti_iret
+doreti_iret:
+	jmp	hypercall_page + (__HYPERVISOR_iret * 32)
+	.globl	ecrit
+ecrit:
+	/*
+	 * doreti_iret_fault and friends.  Alternative return code for
+	 * the case where we get a fault in the doreti_exit code
+	 * above.  trap() (i386/i386/trap.c) catches this specific
+	 * case, sends the process a signal and continues in the
+	 * corresponding place in the code below.
+	 */
+	ALIGN_TEXT
+	.globl	doreti_iret_fault
+doreti_iret_fault:
+	subl	$8,%esp
+	pushal
+	pushl	%ds
+	.globl	doreti_popl_ds_fault
+doreti_popl_ds_fault:
+	pushl	%es
+	.globl	doreti_popl_es_fault
+doreti_popl_es_fault:
+	pushl	%fs
+	.globl	doreti_popl_fs_fault
+doreti_popl_fs_fault:
+	movl	$0,TF_ERR(%esp)	/* XXX should be the error code */
+	movl	$T_PROTFLT,TF_TRAPNO(%esp)
+	jmp	alltraps_with_regs_pushed
+
+	/*
+# [How we do the fixup].
We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +*/ + +.globl critical_region_fixup +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + movl %esp,%esi + add %eax,%esi # %esi points at end of src region + movl %esp,%edi + add $0x40,%edi # %edi points at end of dst region + movl %eax,%ecx + shr $2,%ecx # convert bytes to words + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp hypervisor_callback_pending + + +critical_fixup_table: +.byte 0x0,0x0,0x0 #testb $0x1,(%esi) +.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea +.byte 0x0,0x0 #pop %fs +.byte 0x04 #pop %es +.byte 0x08 #pop %ds +.byte 0x0c #pop %edi +.byte 0x10 #pop %esi +.byte 0x14 #pop %ebp +.byte 0x18 #pop %ebx +.byte 0x1c #pop %ebx +.byte 0x20 #pop %edx +.byte 0x24 #pop %ecx +.byte 0x28 #pop %eax +.byte 0x2c,0x2c,0x2c #add $0x8,%esp +#if 0 + .byte 0x34 #iret +#endif +.byte 0x34,0x34,0x34,0x34,0x34 #HYPERVISOR_iret + + +/* # Hypervisor uses this for application faults while it executes.*/ +ENTRY(failsafe_callback) + pushal + call xen_failsafe_handler +/*# call install_safe_pf_handler */ + movl 28(%esp),%ebx +1: movl %ebx,%ds + movl 32(%esp),%ebx +2: movl %ebx,%es + movl 36(%esp),%ebx +3: movl %ebx,%fs + movl 40(%esp),%ebx +4: movl %ebx,%gs +/*# call install_normal_pf_handler */ + popal + addl $12,%esp + iret + + diff --git a/sys/i386/xen/locore.s b/sys/i386/xen/locore.s new file mode 100644 index 
0000000..59cdb547 --- /dev/null +++ b/sys/i386/xen/locore.s @@ -0,0 +1,361 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 + * $FreeBSD$ + * + * originally from: locore.s, by William F. 
Jolitz + * + * Substantially rewritten by David Greenman, Rod Grimes, + * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp + * and many others. + */ + +#include "opt_bootp.h" +#include "opt_compat.h" +#include "opt_nfsroot.h" +#include "opt_global.h" +#include "opt_pmap.h" + +#include <sys/syscall.h> +#include <sys/reboot.h> + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/psl.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> + +#define __ASSEMBLY__ +#include <xen/interface/elfnote.h> + +/* The defines below have been lifted out of <machine/xen-public/arch-x86_32.h> */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define KERNEL_CS FLAT_RING1_CS +#define KERNEL_DS FLAT_RING1_DS + +#include "assym.s" + +.section __xen_guest + .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" + .byte 0 + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) +#if 0 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") + +#ifdef PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) +#endif + ELFNOTE(Xen, 
XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) + + + +/* + * XXX + * + * Note: This version greatly munged to avoid various assembler errors + * that may be fixed in newer versions of gas. Perhaps newer versions + * will have more pleasant appearance. + */ + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). + */ + .globl PTmap,PTD,PTDpde + .set PTmap,(PTDPTDI << PDRSHIFT) + .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) + .set PTDpde,PTD + (PTDPTDI * PDESIZE) + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase,KERNBASE + .globl kernload + .set kernload,KERNLOAD + +/* + * Globals + */ + .data + ALIGN_DATA /* just to be sure */ + + .space 0x2000 /* space for tmpstk - temporary stack */ +tmpstk: + + .globl bootinfo +bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + + .globl KERNend +KERNend: .long 0 /* phys addr end of kernel (just after bss) */ + .globl physfree +physfree: .long 0 /* phys addr of next free page */ + + .globl IdlePTD +IdlePTD: .long 0 /* phys addr of kernel PTD */ + +#ifdef PAE + .globl IdlePDPT +IdlePDPT: .long 0 /* phys addr of kernel PDPT */ +#endif + +#ifdef SMP + .globl KPTphys +#endif +KPTphys: .long 0 /* phys addr of kernel page tables */ + .globl gdtset +gdtset: .long 0 /* GDT is valid */ + + .globl proc0kstack +proc0kstack: .long 0 /* address of proc 0 kstack space */ +p0kpa: .long 0 /* phys addr of proc0's STACK */ + +vm86phystk: .long 0 /* PA of vm86/bios stack */ + + .globl vm86paddr, vm86pa +vm86paddr: .long 0 /* address of vm86 region */ +vm86pa: .long 0 /* phys addr of vm86 region */ + +#ifdef PC98 + .globl pc98_system_parameter +pc98_system_parameter: + .space 0x240 +#endif + + .globl avail_space +avail_space: .long 0 + +/********************************************************************** + * + * Some handy macros + * + */ + +/* + * We're 
already in protected mode, so no remapping is needed. + */ +#define R(foo) (foo) + +#define ALLOCPAGES(foo) \ + movl R(physfree), %esi ; \ + movl $((foo)*PAGE_SIZE), %eax ; \ + addl %esi, %eax ; \ + movl %eax, R(physfree) ; \ + movl %esi, %edi ; \ + movl $((foo)*PAGE_SIZE),%ecx ; \ + xorl %eax,%eax ; \ + cld ; \ + rep ; \ + stosb + +/* + * fillkpt + * eax = page frame address + * ebx = index into page table + * ecx = how many pages to map + * base = base address of page dir/table + * prot = protection bits + */ +#define fillkpt(base, prot) \ + shll $PTESHIFT,%ebx ; \ + addl base,%ebx ; \ + orl $PG_V,%eax ; \ + orl prot,%eax ; \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $PTESIZE,%ebx ; /* next pte */ \ + loop 1b + +/* + * fillkptphys(prot) + * eax = physical address + * ecx = how many pages to map + * prot = protection bits + */ +#define fillkptphys(prot) \ + movl %eax, %ebx ; \ + shrl $PAGE_SHIFT, %ebx ; \ + fillkpt(R(KPTphys), prot) + +/* Temporary stack */ +.space 8192 +tmpstack: + .long tmpstack, KERNEL_DS + + .text + +.p2align 12, 0x90 + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + .cfi_startproc + .skip 0x1000 + .cfi_endproc + +/********************************************************************** + * + * This is where the bootblocks start us, set the ball rolling... 
+ * + */ +NON_GPROF_ENTRY(btext) + /* At the end of our stack, we shall have free space - so store it */ + movl %esp,%ebx + movl %ebx,R(avail_space) + + lss tmpstack,%esp + + pushl %esi + call initvalues + popl %esi + + /* Store the CPUID information */ + xorl %eax,%eax + cpuid # cpuid 0 + movl %eax,R(cpu_high) # highest capability + movl %ebx,R(cpu_vendor) # store vendor string + movl %edx,R(cpu_vendor+4) + movl %ecx,R(cpu_vendor+8) + movb $0,R(cpu_vendor+12) + + movl $1,%eax + cpuid # cpuid 1 + movl %eax,R(cpu_id) # store cpu_id + movl %ebx,R(cpu_procinfo) # store cpu_procinfo + movl %edx,R(cpu_feature) # store cpu_feature + movl %ecx,R(cpu_feature2) # store cpu_feature2 + rorl $8,%eax # extract family type + andl $15,%eax + cmpl $5,%eax + movl $CPU_686,R(cpu) + + movl proc0kstack,%eax + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + xorl %ebp,%ebp /* mark end of frames */ +#ifdef PAE + movl IdlePDPT,%esi +#else + movl IdlePTD,%esi +#endif + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + pushl physfree + call init386 + addl $4, %esp + call mi_startup + /* NOTREACHED */ + int $3 + +/* + * Signal trampoline, copied to top of user stack + */ +NON_GPROF_ENTRY(sigcode) + calll *SIGF_HANDLER(%esp) + leal SIGF_UC(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC_EFLAGS(%eax) + jne 1f + mov UC_GS(%eax), %gs /* restore %gs */ +1: + movl $SYS_sigreturn,%eax + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b + +#ifdef COMPAT_FREEBSD4 + ALIGN_TEXT +freebsd4_sigcode: + calll *SIGF_HANDLER(%esp) + leal SIGF_UC4(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC4_EFLAGS(%eax) + jne 1f + mov UC4_GS(%eax),%gs /* restore %gs */ +1: + movl $344,%eax /* 4.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. 
*/ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b +#endif + +#ifdef COMPAT_43 + ALIGN_TEXT +osigcode: + call *SIGF_HANDLER(%esp) /* call signal handler */ + lea SIGF_SC(%esp),%eax /* get sigcontext */ + pushl %eax + testl $PSL_VM,SC_PS(%eax) + jne 9f + movl SC_GS(%eax),%gs /* restore %gs */ +9: + movl $103,%eax /* 3.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ +0: jmp 0b +#endif /* COMPAT_43 */ + + ALIGN_TEXT +esigcode: + + .data + .globl szsigcode +szsigcode: + .long esigcode-sigcode +#ifdef COMPAT_FREEBSD4 + .globl szfreebsd4_sigcode +szfreebsd4_sigcode: + .long esigcode-freebsd4_sigcode +#endif +#ifdef COMPAT_43 + .globl szosigcode +szosigcode: + .long esigcode-osigcode +#endif diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c new file mode 100644 index 0000000..253cc40 --- /dev/null +++ b/sys/i386/xen/mp_machdep.c @@ -0,0 +1,1249 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * Copyright (c) 2008, by Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_apic.h" +#include "opt_cpu.h" +#include "opt_kstack_pages.h" +#include "opt_mp_watchdog.h" +#include "opt_pmap.h" +#include "opt_sched.h" +#include "opt_smp.h" + +#if !defined(lint) +#if !defined(SMP) +#error How did you get here? +#endif + +#ifndef DEV_APIC +#error The apic device is required for SMP, add "device apic" to your config file. +#endif +#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) +#error SMP not supported with CPU_DISABLE_CMPXCHG +#endif +#endif /* not lint */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cons.h> /* cngetc() */ +#include <sys/cpuset.h> +#ifdef GPROF +#include <sys/gmon.h> +#endif +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#include <x86/apicreg.h> +#include <machine/md_var.h> +#include <machine/mp_watchdog.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/pcpu.h> + + + +#include <machine/xen/xen-os.h> +#include 
<xen/evtchn.h> +#include <xen/xen_intr.h> +#include <xen/hypervisor.h> +#include <xen/interface/vcpu.h> + + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ + +extern struct pcpu __pcpu[]; + +static int bootAP; +static union descriptor *bootAPgdt; + +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +/* Free these after use */ +void *bootstacks[MAXCPU]; + +struct pcb stoppcbs[MAXCPU]; + +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; + +typedef void call_data_func_t(uintptr_t , uintptr_t); + +static u_int logical_cpus; +static volatile cpuset_t ipi_nmi_pending; + +/* used to hold the AP's until we are ready to release them */ +static struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +static volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. 
+ */ +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; +} static cpu_info[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; +int apic_cpuids[MAX_APIC_ID + 1]; + +/* Holds pending bitmap based IPIs per CPU */ +static volatile u_int cpu_ipi_pending[MAXCPU]; + +static int cpu_logical; +static int cpu_cores; + +static void assign_cpu_ids(void); +static void set_interrupt_apic_ids(void); +int start_all_aps(void); +static int start_ap(int apic_id); +static void release_aps(void *dummy); + +static u_int hyperthreading_cpus; +static cpuset_t hyperthreading_cpus_mask; + +extern void Xhypervisor_callback(void); +extern void failsafe_callback(void); +extern void pmap_lazyfix_action(void); + +struct cpu_group * +cpu_topo(void) +{ + if (cpu_cores == 0) + cpu_cores = 1; + if (cpu_logical == 0) + cpu_logical = 1; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); + } + /* + * No multi-core or hyper-threaded. + */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, + CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); +} + +/* + * Calculate usable address in base memory for AP trampoline code. 
+ */ +u_int +mp_bootaddress(u_int basemem) +{ + + return (basemem); +} + +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id > MAX_APIC_ID) { + panic("SMP: APIC ID %d too high", apic_id); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + if (mp_ncpus < MAXCPU) + mp_ncpus++; + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); +} + +void +cpu_mp_setmaxid(void) +{ + + mp_maxid = MAXCPU - 1; +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + CPU_SETOF(0, &all_cpus); + if (mp_ncpus == 0) { + /* + * No CPUs were found, so this must be a UP system. Setup + * the variables to represent a system with a single CPU + * with an id of 0. + */ + mp_ncpus = 1; + return (0); + } + + /* At least one CPU was found. */ + if (mp_ncpus == 1) { + /* + * One CPU was found, so this must be a UP system with + * an I/O APIC. + */ + return (0); + } + + /* At least two CPUs were found. */ + return (1); +} + +/* + * Initialize the IPI handlers and start up the AP's. + */ +void +cpu_mp_start(void) +{ + int i; + + /* Initialize the logical ID to APIC ID table. */ + for (i = 0; i < MAXCPU; i++) { + cpu_apic_ids[i] = -1; + cpu_ipi_pending[i] = 0; + } + + /* Set boot_cpu_id if needed. */ + if (boot_cpu_id == -1) { + boot_cpu_id = PCPU_GET(apic_id); + cpu_info[boot_cpu_id].cpu_bsp = 1; + } else + KASSERT(boot_cpu_id == PCPU_GET(apic_id), + ("BSP's APIC ID doesn't match boot_cpu_id")); + cpu_apic_ids[0] = boot_cpu_id; + apic_cpuids[boot_cpu_id] = 0; + + assign_cpu_ids(); + + /* Start each Application Processor */ + start_all_aps(); + + /* Setup the initial logical CPUs info. 
+ */
+	logical_cpus = 0;
+	CPU_ZERO(&logical_cpus_mask);
+	if (cpu_feature & CPUID_HTT)
+		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
+
+	set_interrupt_apic_ids();
+}
+
+
+static void
+iv_rendezvous(uintptr_t a, uintptr_t b)
+{
+	smp_rendezvous_action();	/* a and b are not used */
+}
+
+static void
+iv_invltlb(uintptr_t a, uintptr_t b)
+{
+	xen_tlb_flush();		/* a and b are not used */
+}
+
+static void
+iv_invlpg(uintptr_t a, uintptr_t b)
+{
+	xen_invlpg(a);			/* a is the address; b is not used */
+}
+
+static void
+iv_invlrng(uintptr_t a, uintptr_t b)
+{
+	vm_offset_t start = (vm_offset_t)a;
+	vm_offset_t end = (vm_offset_t)b;
+
+	/* One xen_invlpg() per PAGE_SIZE step over [start, end). */
+	while (start < end) {
+		xen_invlpg(start);
+		start += PAGE_SIZE;
+	}
+}
+
+
+static void
+iv_invlcache(uintptr_t a, uintptr_t b)
+{
+
+	wbinvd();
+	atomic_add_int(&smp_tlb_wait, 1);	/* bump the shared completion counter */
+}
+
+static void
+iv_lazypmap(uintptr_t a, uintptr_t b)
+{
+	pmap_lazyfix_action();
+	atomic_add_int(&smp_tlb_wait, 1);	/* bump the shared completion counter */
+}
+
+/*
+ * These start from "IPI offset" APIC_IPI_INTS
+ *
+ * Dispatch table for the iv_* handlers defined above.
+ * smp_call_function_interrupt() indexes it with (func_id - APIC_IPI_INTS),
+ * so entry 0 is the handler for func_id == APIC_IPI_INTS and the order of
+ * the entries is significant.  The table holds exactly the six handlers
+ * listed; any other index is outside the array.
+ */
+static call_data_func_t *ipi_vectors[6] =
+{
+	iv_rendezvous,
+	iv_invltlb,
+	iv_invlpg,
+	iv_invlrng,
+	iv_invlcache,
+	iv_lazypmap,
+};
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */ +static int +smp_reschedule_interrupt(void *unused) +{ + int cpu = PCPU_GET(cpuid); + u_int ipi_bitmap; + + ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); + + if (ipi_bitmap & (1 << IPI_PREEMPT)) { +#ifdef COUNT_IPIS + (*ipi_preempt_counts[cpu])++; +#endif + sched_preempt(curthread); + } + + if (ipi_bitmap & (1 << IPI_AST)) { +#ifdef COUNT_IPIS + (*ipi_ast_counts[cpu])++; +#endif + /* Nothing to do for AST */ + } + return (FILTER_HANDLED); +} + +struct _call_data { + uint16_t func_id; + uint16_t wait; + uintptr_t arg1; + uintptr_t arg2; + atomic_t started; + atomic_t finished; +}; + +static struct _call_data *call_data; + +static int +smp_call_function_interrupt(void *unused) +{ + call_data_func_t *func; + uintptr_t arg1 = call_data->arg1; + uintptr_t arg2 = call_data->arg2; + int wait = call_data->wait; + atomic_t *started = &call_data->started; + atomic_t *finished = &call_data->finished; + + /* We only handle function IPIs, not bitmap IPIs */ + if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR) + panic("invalid function id %u", call_data->func_id); + + func = ipi_vectors[call_data->func_id - APIC_IPI_INTS]; + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + (*func)(arg1, arg2); + + if (wait) { + mb(); + atomic_inc(finished); + } + atomic_add_int(&smp_tlb_wait, 1); + return (FILTER_HANDLED); +} + +/* + * Print various information about the SMP system hardware and setup. 
+ */ +void +cpu_mp_announce(void) +{ + int i, x; + + /* List CPUs */ + printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); + for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { + if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) + continue; + if (cpu_info[x].cpu_disabled) + printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); + else { + KASSERT(i < mp_ncpus, + ("mp_ncpus and actual cpus are out of whack")); + printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); + } + } +} + +static int +xen_smp_intr_init(unsigned int cpu) +{ + int rc; + unsigned int irq; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + sprintf(resched_name[cpu], "resched%u", cpu); + rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, + cpu, + resched_name[cpu], + smp_reschedule_interrupt, + INTR_TYPE_TTY, &irq); + + printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n", + cpu, irq, RESCHEDULE_VECTOR); + + per_cpu(resched_irq, cpu) = irq; + + sprintf(callfunc_name[cpu], "callfunc%u", cpu); + rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, + cpu, + callfunc_name[cpu], + smp_call_function_interrupt, + INTR_TYPE_TTY, &irq); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = irq; + + printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n", + cpu, irq, CALL_FUNCTION_VECTOR); + + + if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0)) + goto fail; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu)); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu)); + return rc; +} + +static void +xen_smp_intr_init_cpus(void *unused) +{ + int i; + + for (i = 0; i < mp_ncpus; i++) + xen_smp_intr_init(i); +} + +#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) + +/* + * AP CPU's call this to initialize themselves. + */ +void +init_secondary(void) +{ + vm_offset_t addr; + u_int cpuid; + int gsel_tss; + + + /* bootAP is set in start_ap() to our ID. 
*/ + PCPU_SET(currentldt, _default_ldt); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); +#if 0 + gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; +#endif + PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); +#if 0 + PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); + + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); +#endif + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + + /* + * Set to a known state: + * Set by mpboot.s: CR0_PG, CR0_PE + * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM + */ + /* + * signal our startup to the BSP. + */ + mp_naps++; + + /* Spin until the BSP releases the AP's. */ + while (!aps_ready) + ia32_pause(); + + /* BSP may have changed PTD while we were waiting */ + invltlb(); + for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) + invlpg(addr); + + /* set up FPU state on the AP */ + npxinit(); +#if 0 + + /* set up SSE registers */ + enable_sse(); +#endif +#if 0 && defined(PAE) + /* Enable the PTE no-execute bit. */ + if ((amd_feature & AMDID_NX) != 0) { + uint64_t msr; + + msr = rdmsr(MSR_EFER) | EFER_NXE; + wrmsr(MSR_EFER, msr); + } +#endif +#if 0 + /* A quick check from sanity claus */ + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + panic("cpuid mismatch! boom!!"); + } +#endif + + /* Initialize curthread. */ + KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); + PCPU_SET(curthread, PCPU_GET(idlethread)); + + mtx_lock_spin(&ap_boot_mtx); +#if 0 + + /* Init local apic for irq's */ + lapic_setup(1); +#endif + smp_cpus++; + + cpuid = PCPU_GET(cpuid); + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); + printf("SMP: AP CPU #%d Launched!\n", cpuid); + + /* Determine if we are a logical CPU. 
*/ + if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) + CPU_SET(cpuid, &logical_cpus_mask); + + /* Determine if we are a hyperthread. */ + if (hyperthreading_cpus > 1 && + PCPU_GET(apic_id) % hyperthreading_cpus != 0) + CPU_SET(cpuid, &hyperthreading_cpus_mask); +#if 0 + if (bootverbose) + lapic_dump("AP"); +#endif + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + smp_active = 1; /* historic */ + } + + mtx_unlock_spin(&ap_boot_mtx); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ia32_pause(); + + PCPU_SET(curthread, PCPU_GET(idlethread)); + + /* Start per-CPU event timers. */ + cpu_initclocks_ap(); + + /* enter the scheduler */ + sched_throw(NULL); + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * We tell the I/O APIC code about all the CPUs we want to receive + * interrupts. If we don't want certain CPUs to receive IRQs we + * can simply not tell the I/O APIC code about them in this function. + * We also do not tell it about the BSP since it tells itself about + * the BSP internally to work with UP kernels and on UP machines. + */ +static void +set_interrupt_apic_ids(void) +{ + u_int i, apic_id; + + for (i = 0; i < MAXCPU; i++) { + apic_id = cpu_apic_ids[i]; + if (apic_id == -1) + continue; + if (cpu_info[apic_id].cpu_bsp) + continue; + if (cpu_info[apic_id].cpu_disabled) + continue; + + /* Don't let hyperthreads service interrupts. */ + if (hyperthreading_cpus > 1 && + apic_id % hyperthreading_cpus != 0) + continue; + + intr_add_cpu(i); + } +} + +/* + * Assign logical CPU IDs to local APICs. + */ +static void +assign_cpu_ids(void) +{ + u_int i; + + /* Check for explicitly disabled CPUs. 
*/ + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) + continue; + + /* Don't use this CPU if it has been disabled by a tunable. */ + if (resource_disabled("lapic", i)) { + cpu_info[i].cpu_disabled = 1; + continue; + } + } + + /* + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 has already been assigned to the BSP, + * so we only have to assign IDs for APs. + */ + mp_ncpus = 1; + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || + cpu_info[i].cpu_disabled) + continue; + + if (mp_ncpus < MAXCPU) { + cpu_apic_ids[mp_ncpus] = i; + apic_cpuids[i] = mp_ncpus; + mp_ncpus++; + } else + cpu_info[i].cpu_disabled = 1; + } + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * start each AP in our list + */ +/* Lowest 1MB is already mapped: don't touch*/ +#define TMPMAP_START 1 +int +start_all_aps(void) +{ + int x,apic_id, cpu; + struct pcpu *pc; + + mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + + /* set up temporary P==V mapping for AP boot */ + /* XXX this is a hack, we should boot the AP on its own stack/PTD */ + + /* start each AP */ + for (cpu = 1; cpu < mp_ncpus; cpu++) { + apic_id = cpu_apic_ids[cpu]; + + + bootAP = cpu; + bootAPgdt = gdt + (512*cpu); + + /* Get per-cpu data */ + pc = &__pcpu[bootAP]; + pcpu_init(pc, bootAP, sizeof(struct pcpu)); + dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP); + pc->pc_apic_id = cpu_apic_ids[bootAP]; + pc->pc_prvspace = pc; + pc->pc_curthread = 0; + + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + + PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW); + bzero(bootAPgdt, PAGE_SIZE); + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); + PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); +#ifdef notyet + + if 
(HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { + apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); + acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); +#ifdef CONFIG_ACPI + if (acpiid != 0xff) + x86_acpiid_to_apicid[acpiid] = apicid; +#endif + } +#endif + + /* attempt to start the Application Processor */ + if (!start_ap(cpu)) { + printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); + /* better panic as the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + + CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ + } + + + pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); + + /* number of APs actually started */ + return mp_naps; +} + +extern uint8_t *pcpu_boot_stack; +extern trap_info_t trap_table[]; + +static void +smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + +extern struct rwlock pvh_global_lock; +extern int nkpt; +static void +cpu_initialize_context(unsigned int cpu) +{ + /* vcpu_guest_context_t is too large to allocate on the stack. 
+ * Hence we allocate statically and protect it with a lock */ + vm_page_t m[NPGPTD + 2]; + static vcpu_guest_context_t ctxt; + vm_offset_t boot_stack; + vm_offset_t newPTD; + vm_paddr_t ma[NPGPTD]; + int i; + + /* + * Page 0,[0-3] PTD + * Page 1, [4] boot stack + * Page [5] PDPT + * + */ + for (i = 0; i < NPGPTD + 2; i++) { + m[i] = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + + pmap_zero_page(m[i]); + + } + boot_stack = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + newPTD = kmem_alloc_nofault(kernel_map, NPGPTD * PAGE_SIZE); + ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V; + +#ifdef PAE + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); + for (i = 0; i < NPGPTD; i++) { + ((vm_paddr_t *)boot_stack)[i] = + ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V; + } +#endif + + /* + * Copy cpu0 IdlePTD to new IdlePTD - copying only + * kernel mappings + */ + pmap_qenter(newPTD, m, 4); + + memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), + (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), + nkpt*sizeof(vm_paddr_t)); + + pmap_qremove(newPTD, 4); + kmem_free(kernel_map, newPTD, 4 * PAGE_SIZE); + /* + * map actual idle stack to boot_stack + */ + pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); + + + xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1])); + rw_wlock(&pvh_global_lock); + for (i = 0; i < 4; i++) { + int pdir = (PTDPTDI + i) / NPDEPG; + int curoffset = (PTDPTDI + i) % NPDEPG; + + xen_queue_pt_update((vm_paddr_t) + ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), + ma[i]); + } + PT_UPDATES_FLUSH(); + rw_wunlock(&pvh_global_lock); + + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); + ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.user_regs.eip = (unsigned 
long)init_secondary; + ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ + + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + smp_trap_init(ctxt.trap_ctxt); + + ctxt.ldt_ents = 0; + ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); + ctxt.gdt_ents = 512; + +#ifdef __i386__ + ctxt.user_regs.esp = boot_stack + PAGE_SIZE; + + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = boot_stack + PAGE_SIZE; + + ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; + ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]); +#else /* __x86_64__ */ + ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); + ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); + ctxt.kernel_sp = idle->thread.rsp0; + + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); + + ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); +#endif + + printf("gdtpfn=%lx pdptpfn=%lx\n", + ctxt.gdt_frames[0], + ctxt.ctrlreg[3] >> PAGE_SHIFT); + + PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); + DELAY(3000); + PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); +} + +/* + * This function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It isn't pretty, + * but it seems to work. + */ + +int cpus; +static int +start_ap(int apic_id) +{ + int ms; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_naps; + + cpu_initialize_context(apic_id); + + /* Wait up to 5 seconds for it to start. 
*/ + for (ms = 0; ms < 5000; ms++) { + if (mp_naps > cpus) + return 1; /* return SUCCESS */ + DELAY(1000); + } + return 0; /* return FAILURE */ +} + +/* + * send an IPI to a specific CPU. + */ +static void +ipi_send_cpu(int cpu, u_int ipi) +{ + u_int bitmap, old_pending, new_pending; + + if (IPI_IS_BITMAPED(ipi)) { + bitmap = 1 << ipi; + ipi = IPI_BITMAP_VECTOR; + do { + old_pending = cpu_ipi_pending[cpu]; + new_pending = old_pending | bitmap; + } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], + old_pending, new_pending)); + if (!old_pending) + ipi_pcpu(cpu, RESCHEDULE_VECTOR); + } else { + KASSERT(call_data != NULL, ("call_data not set")); + ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); + } +} + +/* + * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + struct _call_data data; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_ipi_mtx); + KASSERT(call_data == NULL, ("call_data isn't null?!")); + call_data = &data; + call_data->func_id = vector; + call_data->arg1 = addr1; + call_data->arg2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + call_data = NULL; + mtx_unlock_spin(&smp_ipi_mtx); +} + +static void +smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int cpu, ncpu, othercpus; + struct _call_data data; + + othercpus = mp_ncpus - 1; + if (CPU_ISFULLSET(&mask)) { + if (othercpus < 1) + return; + } else { + CPU_CLR(PCPU_GET(cpuid), &mask); + if (CPU_EMPTY(&mask)) + return; + } + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_ipi_mtx); + KASSERT(call_data == NULL, ("call_data isn't null?!")); + call_data = &data; + call_data->func_id = vector; + call_data->arg1 = 
addr1; + call_data->arg2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (CPU_ISFULLSET(&mask)) { + ncpu = othercpus; + ipi_all_but_self(vector); + } else { + ncpu = 0; + while ((cpu = cpusetobj_ffs(&mask)) != 0) { + cpu--; + CPU_CLR(cpu, &mask); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, + vector); + ipi_send_cpu(cpu, vector); + ncpu++; + } + } + while (smp_tlb_wait < ncpu) + ia32_pause(); + call_data = NULL; + mtx_unlock_spin(&smp_ipi_mtx); +} + +void +smp_cache_flush(void) +{ + + if (smp_started) + smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); +} + +void +smp_invltlb(void) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); + } +} + +void +smp_invlpg(vm_offset_t addr) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); + } +} + +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ + + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); + } +} + +void +smp_masked_invltlb(cpuset_t mask) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); + } +} + +void +smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); + } +} + +void +smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); + } +} + +/* + * send an IPI to a set of cpus. + */ +void +ipi_selected(cpuset_t cpus, u_int ipi) +{ + int cpu; + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); + + while ((cpu = cpusetobj_ffs(&cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &cpus); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); + } +} + +/* + * send an IPI to a specific CPU. 
+ */ +void +ipi_cpu(int cpu, u_int ipi) +{ + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); + + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + cpuset_t other_cpus; + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + ipi_selected(other_cpus, ipi); +} + +int +ipi_nmi_handler() +{ + u_int cpuid; + + /* + * As long as there is not a simple way to know about a NMI's + * source, if the bitmask for the current CPU is present in + * the global pending bitword an IPI_STOP_HARD has been issued + * and should be handled. + */ + cpuid = PCPU_GET(cpuid); + if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) + return (1); + + CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); + cpustop_handler(); + return (0); +} + +/* + * Handle an IPI_STOP by saving our current context and spinning until we + * are resumed. 
+ */ +void +cpustop_handler(void) +{ + int cpu; + + cpu = PCPU_GET(cpuid); + + savectx(&stoppcbs[cpu]); + + /* Indicate that we are stopped */ + CPU_SET_ATOMIC(cpu, &stopped_cpus); + + /* Wait for restart */ + while (!CPU_ISSET(cpu, &started_cpus)) + ia32_pause(); + + CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_CLR_ATOMIC(cpu, &stopped_cpus); + + if (cpu == 0 && cpustop_restartfunc != NULL) { + cpustop_restartfunc(); + cpustop_restartfunc = NULL; + } +} + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +static void +release_aps(void *dummy __unused) +{ + + if (mp_ncpus == 1) + return; + atomic_store_rel_int(&aps_ready, 1); + while (smp_started == 0) + ia32_pause(); +} +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); +SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); + diff --git a/sys/i386/xen/mptable.c b/sys/i386/xen/mptable.c new file mode 100644 index 0000000..0c1efe8 --- /dev/null +++ b/sys/i386/xen/mptable.c @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> + +#include <xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/smp.h> +#include <xen/interface/vcpu.h> + + +static int mptable_probe(void); +static int mptable_probe_cpus(void); +static void mptable_register(void *dummy); +static int mptable_setup_local(void); +static int mptable_setup_io(void); + +static struct apic_enumerator mptable_enumerator = { + "MPTable", + mptable_probe, + mptable_probe_cpus, + mptable_setup_local, + mptable_setup_io +}; + +static int +mptable_probe(void) +{ + + return (-100); +} + +static int +mptable_probe_cpus(void) +{ + int i, rc; + + for (i = 0; i < MAXCPU; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_add(i, (i == 0)); + } + + return (0); +} + +/* + * Initialize the local APIC on the BSP. 
+ */ +static int +mptable_setup_local(void) +{ + + return (0); +} + +static int +mptable_setup_io(void) +{ + + return (0); +} + +static void +mptable_register(void *dummy __unused) +{ + + apic_register_enumerator(&mptable_enumerator); +} +SYSINIT(mptable_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, mptable_register, + NULL); diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c new file mode 100644 index 0000000..7f65a21 --- /dev/null +++ b/sys/i386/xen/pmap.c @@ -0,0 +1,4429 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. 
+ */ + +#include "opt_cpu.h" +#include "opt_pmap.h" +#include "opt_smp.h" +#include "opt_xbox.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sf_buf.h> +#include <sys/sx.h> +#include <sys/vmmeter.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#ifdef SMP +#include <sys/smp.h> +#else +#include <sys/cpuset.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/specialreg.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +#ifdef XBOX +#include <machine/xbox.h> +#endif + +#include <xen/interface/xen.h> +#include <xen/hypervisor.h> +#include <machine/xen/hypercall.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> + +#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#define DIAGNOSTIC + +#if !defined(DIAGNOSTIC) +#ifdef __GNUC_GNU_INLINE__ +#define PMAP_INLINE __attribute__((__gnu_inline__)) inline +#else +#define PMAP_INLINE extern inline +#endif +#else +#define PMAP_INLINE +#endif + +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) +#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) + +#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) +#define pmap_pte_w(pte) ((*(int *)pte & PG_W) 
!= 0) +#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) +#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) +#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) + +#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) + +#define HAMFISTED_LOCKING +#ifdef HAMFISTED_LOCKING +static struct mtx createdelete_lock; +#endif + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +int pgeflag = 0; /* PG_G or-in */ +int pseflag = 0; /* PG_PS or-in */ + +int nkpt; +vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; + +#ifdef PAE +pt_entry_t pg_nx; +#endif + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pat_works; /* Is page attribute table sane? */ + +/* + * This lock is defined as static in other pmap implementations. It cannot, + * however, be defined as static here, because it is (ab)used to serialize + * queued page table changes in other sources files. + */ +struct rwlock pvh_global_lock; + +/* + * Data for the pv entry allocation mechanism + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static int shpgperproc = PMAP_SHPGPERPROC; + +struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ +int pv_maxchunks; /* How many chunks we have KVA for */ +vm_offset_t pv_vafree; /* freelist stored in the PTE */ + +/* + * All those kernel PT submaps that BSD is so fond of + */ +struct sysmaps { + struct mtx lock; + pt_entry_t *CMAP1; + pt_entry_t *CMAP2; + caddr_t CADDR1; + caddr_t CADDR2; +}; +static struct sysmaps sysmaps_pcpu[MAXCPU]; +static pt_entry_t *CMAP3; +caddr_t ptvmmap = 0; +static caddr_t CADDR3; +struct msgbuf *msgbufp = 0; + +/* + * Crashdump maps. 
+ */ +static caddr_t crashdumpmap; + +static pt_entry_t *PMAP1 = 0, *PMAP2; +static pt_entry_t *PADDR1 = 0, *PADDR2; +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static void free_pv_chunk(struct pv_chunk *pc); +static void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); + +static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static void pmap_flush_page(vm_page_t m); +static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + vm_page_t *free); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, + vm_page_t *free); +static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, + vm_offset_t va); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); + +static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags); +static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free); +static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); +static void pmap_pte_release(pt_entry_t *pte); +static int 
pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); +static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr); + +static __inline void pagezero(void *page); + +CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); +CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); + +/* + * If you get an error here, then you set KVA_PAGES wrong! See the + * description of KVA_PAGES in sys/i386/include/pmap.h. It must be + * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. + */ +CTASSERT(KERNBASE % (1 << 24) == 0); + +void +pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) +{ + vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); + + switch (type) { + case SH_PD_SET_VA: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + xpmap_ptom(val & ~(PG_RW))); +#endif + xen_queue_pt_update(pdir_ma, + xpmap_ptom(val)); + break; + case SH_PD_SET_VA_MA: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, + val & ~(PG_RW)); +#endif + xen_queue_pt_update(pdir_ma, val); + break; + case SH_PD_SET_VA_CLEAR: +#if 0 + xen_queue_pt_update(shadow_pdir_ma, 0); +#endif + xen_queue_pt_update(pdir_ma, 0); + break; + } +} + +/* + * Bootstrap the system enough to run with virtual memory. + * + * On the i386 this is called after mapping has already been enabled + * and just syncs the pmap module with what has already been done. + * [We can't call it easily with mapping off since the kernel is not + * mapped with PA == VA, hence we would have to relocate every address + * from the linked base (virtual) address "KERNBASE" to the actual + * (physical) address starting relative to 0] + */ +void +pmap_bootstrap(vm_paddr_t firstaddr) +{ + vm_offset_t va; + pt_entry_t *pte, *unused; + struct sysmaps *sysmaps; + int i; + + /* + * Initialize the first available kernel virtual address. However, + * using "firstaddr" may waste a few pages of the kernel virtual + * address space, because locore may not have mapped every physical + * page that it allocated. 
Preferably, locore would provide a first + * unused virtual address in addition to "firstaddr". + */ + virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + + virtual_end = VM_MAX_KERNEL_ADDRESS; + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + PMAP_LOCK_INIT(kernel_pmap); + kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); +#ifdef PAE + kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); +#endif + CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ + TAILQ_INIT(&kernel_pmap->pm_pvchunk); + + /* + * Initialize the global pv list lock. + */ + rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE); + + LIST_INIT(&allpmaps); + mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + if (nkpt == 0) + nkpt = NKPT; + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +#define SYSMAP(c, p, v, n) \ + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); + + va = virtual_avail; + pte = vtopte(va); + + /* + * CMAP1/CMAP2 are used for zeroing and copying pages. + * CMAP3 is used for the idle process page zeroing. + */ + for (i = 0; i < MAXCPU; i++) { + sysmaps = &sysmaps_pcpu[i]; + mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); + SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) + SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) + PT_SET_MA(sysmaps->CADDR1, 0); + PT_SET_MA(sysmaps->CADDR2, 0); + } + SYSMAP(caddr_t, CMAP3, CADDR3, 1) + PT_SET_MA(CADDR3, 0); + + /* + * Crashdump maps. + */ + SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) + + /* + * ptvmmap is used for reading arbitrary physical pages via /dev/mem. + */ + SYSMAP(caddr_t, unused, ptvmmap, 1) + + /* + * msgbufp is used to map the system message buffer. 
+ */ + SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) + + /* + * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), + * respectively. + */ + SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) + SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) + + mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); + + virtual_avail = va; + + /* + * Leave in place an identity mapping (virt == phys) for the low 1 MB + * physical memory region that is used by the ACPI wakeup code. This + * mapping must not have PG_G set. + */ +#ifndef XEN + /* + * leave here deliberately to show that this is not supported + */ +#ifdef XBOX + /* FIXME: This is gross, but needed for the XBOX. Since we are in such + * an early stadium, we cannot yet neatly map video memory ... :-( + * Better fixes are very welcome! */ + if (!arch_i386_is_xbox) +#endif + for (i = 1; i < NKPT; i++) + PTD[i] = 0; + + /* Initialize the PAT MSR if present. */ + pmap_init_pat(); + + /* Turn on PG_G on kernel page(s) */ + pmap_set_pg(); +#endif + +#ifdef HAMFISTED_LOCKING + mtx_init(&createdelete_lock, "pmap create/delete", NULL, MTX_DEF); +#endif +} + +/* + * Setup the PAT MSR. + */ +void +pmap_init_pat(void) +{ + uint64_t pat_msr; + + /* Bail if this CPU doesn't implement PAT. */ + if (!(cpu_feature & CPUID_PAT)) + return; + + if (cpu_vendor_id != CPU_VENDOR_INTEL || + (CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) { + /* + * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. + * Program 4 and 5 as WP and WC. + * Leave 6 and 7 as UC and UC-. + */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); + pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | + PAT_VALUE(5, PAT_WRITE_COMBINING); + pat_works = 1; + } else { + /* + * Due to some Intel errata, we can only safely use the lower 4 + * PAT entries. Thus, just replace PAT Index 2 with WC instead + * of UC-. 
+ * + * Intel Pentium III Processor Specification Update + * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B + * or Mode C Paging) + * + * Intel Pentium IV Processor Specification Update + * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) + */ + pat_msr = rdmsr(MSR_PAT); + pat_msr &= ~PAT_MASK(2); + pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); + pat_works = 0; + } + wrmsr(MSR_PAT, pat_msr); +} + +/* + * Initialize a vm_page's machine-dependent fields. + */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + m->md.pat_mode = PAT_WRITE_BACK; +} + +/* + * ABuse the pte nodes for unmapped kva to thread a kva freelist through. + * Requirements: + * - Must deal with pages in order to ensure that none of the PG_* bits + * are ever set, PG_V in particular. + * - Assumes we can write to ptes without pte_store() atomic ops, even + * on PAE systems. This should be ok. + * - Assumes nothing will ever test these addresses for 0 to indicate + * no mapping instead of correctly checking PG_V. + * - Assumes a vm_offset_t will fit in a pte (true for i386). + * Because PG_V is never set, there can be no mappings to invalidate. 
+ */
+static int ptelist_count = 0;	/* number of free kvas currently stacked */
+static vm_offset_t
+pmap_ptelist_alloc(vm_offset_t *head)
+{
+	vm_offset_t va;
+	vm_offset_t *phead = (vm_offset_t *)*head;	/* base of the stack array */
+
+	if (ptelist_count == 0) {
+		printf("out of memory!!!!!!\n");
+		return (0);	/* Out of memory */
+	}
+	ptelist_count--;
+	va = phead[ptelist_count];	/* pop the most recently pushed kva */
+	return (va);
+}
+/*
+ * Push va onto the stack array whose base address is stored in *head.
+ */
+static void
+pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
+{
+	vm_offset_t *phead = (vm_offset_t *)*head;
+
+	phead[ptelist_count++] = va;
+}
+/*
+ * Build the kva freelist for npages pages starting at base.  The first
+ * nstackpages pages are given wired backing pages (entered with
+ * pmap_qenter) and hold the stack array itself; the addresses of the
+ * remaining pages are pushed as free entries, highest address first.
+ * NOTE(review): the vm_page_alloc() result is passed to pmap_qenter()
+ * without a NULL check.
+ */
+static void
+pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
+{
+	int i, nstackpages;
+	vm_offset_t va;
+	vm_page_t m;
+
+	nstackpages = (npages + PAGE_SIZE/sizeof(vm_offset_t) - 1)/ (PAGE_SIZE/sizeof(vm_offset_t));	/* pages needed to hold npages entries, rounded up */
+	for (i = 0; i < nstackpages; i++) {
+		va = (vm_offset_t)base + i * PAGE_SIZE;
+		m = vm_page_alloc(NULL, i,
+		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+		    VM_ALLOC_ZERO);
+		pmap_qenter(va, &m, 1);
+	}
+
+	*head = (vm_offset_t)base;
+	for (i = npages - 1; i >= nstackpages; i--) {
+		va = (vm_offset_t)base + i * PAGE_SIZE;
+		pmap_ptelist_free(head, va);
+	}
+}
+
+
+/*
+ *	Initialize the pmap module.
+ *	Called by vm_init, to initialize any structures that the pmap
+ *	system needs to map virtual memory.
+ *
+ *	Sizes the pv entry pool from the shpgperproc and pv_entries
+ *	tunables, reserves one page of kva per pv chunk, and seeds the
+ *	pv_vafree freelist with that kva.  Panics if the kva cannot be
+ *	reserved.
+ */
+void
+pmap_init(void)
+{
+
+	/*
+	 * Initialize the address space (zone) for the pv entries.  Set a
+	 * high water mark so that the system can recover from excessive
+	 * numbers of pv entries.
+	 */
+	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
+	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
+	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);	/* tunable overrides the computed default */
+	pv_entry_max = roundup(pv_entry_max, _NPCPV);
+	pv_entry_high_water = 9 * (pv_entry_max / 10);	/* 90% of the maximum */
+
+	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
+	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
+	    PAGE_SIZE * pv_maxchunks);
+	if (pv_chunkbase == NULL)
+		panic("pmap_init: not enough kvm for pv chunks");
+	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
+}
+
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
+	"Max number of PV entries");
+SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
+	"Page share factor per proc");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
+    "2/4MB page mapping counters");
+
+static u_long pmap_pde_mappings;
+SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
+    &pmap_pde_mappings, 0, "2/4MB page mappings");
+
+/***************************************************
+ * Low level helper routines.....
+ ***************************************************/
+
+/*
+ * Determine the appropriate bits to set in a PTE or PDE for a specified
+ * caching mode.
+ *
+ * mode is one of the PAT_* caching modes.  is_pde selects PG_PDE_PAT
+ * rather than PG_PTE_PAT as the PAT bit.  The return value is the OR of
+ * the PAT, PG_NC_PCD and PG_NC_PWT bits that encode the chosen PAT
+ * index.  Panics on a mode that is not handled below.
+ */
+int
+pmap_cache_bits(int mode, boolean_t is_pde)
+{
+	int pat_flag, pat_index, cache_bits;
+
+	/* The PAT bit is different for PTE's and PDE's. */
+	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+
+	/* If we don't support PAT, map extended modes to older ones. */
+	if (!(cpu_feature & CPUID_PAT)) {
+		switch (mode) {
+		case PAT_UNCACHEABLE:
+		case PAT_WRITE_THROUGH:
+		case PAT_WRITE_BACK:
+			break;
+		case PAT_UNCACHED:
+		case PAT_WRITE_COMBINING:
+		case PAT_WRITE_PROTECTED:
+			mode = PAT_UNCACHEABLE;
+			break;
+		}
+	}
+
+	/* Map the caching mode to a PAT index. */
+	if (pat_works) {
+		switch (mode) {
+		case PAT_UNCACHEABLE:
+			pat_index = 3;
+			break;
+		case PAT_WRITE_THROUGH:
+			pat_index = 1;
+			break;
+		case PAT_WRITE_BACK:
+			pat_index = 0;
+			break;
+		case PAT_UNCACHED:
+			pat_index = 2;
+			break;
+		case PAT_WRITE_COMBINING:
+			pat_index = 5;
+			break;
+		case PAT_WRITE_PROTECTED:
+			pat_index = 4;
+			break;
+		default:
+			panic("Unknown caching mode %d\n", mode);
+		}
+	} else {	/* pat_works == 0: only PAT indices 0 through 3 are used */
+		switch (mode) {
+		case PAT_UNCACHED:
+		case PAT_UNCACHEABLE:
+		case PAT_WRITE_PROTECTED:
+			pat_index = 3;
+			break;
+		case PAT_WRITE_THROUGH:
+			pat_index = 1;
+			break;
+		case PAT_WRITE_BACK:
+			pat_index = 0;
+			break;
+		case PAT_WRITE_COMBINING:
+			pat_index = 2;
+			break;
+		default:
+			panic("Unknown caching mode %d\n", mode);
+		}
+	}
+
+	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+	cache_bits = 0;
+	if (pat_index & 0x4)
+		cache_bits |= pat_flag;
+	if (pat_index & 0x2)
+		cache_bits |= PG_NC_PCD;
+	if (pat_index & 0x1)
+		cache_bits |= PG_NC_PWT;
+	return (cache_bits);
+}
+#ifdef SMP
+/*
+ * For SMP, these functions have to use the IPI mechanism for coherence.
+ *
+ * N.B.: Before calling any of the following TLB invalidation functions,
+ * the calling processor must ensure that all stores updating a non-
+ * kernel page table are globally performed.  Otherwise, another
+ * processor could cache an old, pre-update entry without being
+ * invalidated.  This can happen one of two ways: (1) The pmap becomes
+ * active on another processor after its pm_active field is checked by
+ * one of the following functions but before a store updating the page
+ * table is globally performed. (2) The pmap becomes active on another
+ * processor before its pm_active field is checked but due to
+ * speculative loads one of the following functions stills reads the
+ * pmap as inactive on the other processor.
+ *
+ * The kernel page table is exempt because its pm_active field is
+ * immutable.  The kernel page table is always active on every
+ * processor.
+ */
+void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+	cpuset_t other_cpus;
+	u_int cpuid;
+
+	CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x",
+	    pmap, va);
+
+	sched_pin();	/* cpuid read below must stay valid until sched_unpin() */
+	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+		invlpg(va);
+		smp_invlpg(va);
+	} else {
+		cpuid = PCPU_GET(cpuid);
+		other_cpus = all_cpus;
+		CPU_CLR(cpuid, &other_cpus);
+		if (CPU_ISSET(cpuid, &pmap->pm_active))
+			invlpg(va);
+		CPU_AND(&other_cpus, &pmap->pm_active);	/* other CPUs on which the pmap is active */
+		if (!CPU_EMPTY(&other_cpus))
+			smp_masked_invlpg(other_cpus, va);
+	}
+	sched_unpin();
+	PT_UPDATES_FLUSH();
+}
+/*
+ * Invalidate the TLB entries for every page in [sva, eva): locally with
+ * invlpg, and on other CPUs through smp_invlpg_range() or
+ * smp_masked_invlpg_range().
+ */
+void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	cpuset_t other_cpus;
+	vm_offset_t addr;
+	u_int cpuid;
+
+	CTR3(KTR_PMAP, "pmap_invalidate_page: pmap=%p eva=0x%x sva=0x%x",
+	    pmap, sva, eva);	/* NOTE(review): the label reads pmap_invalidate_page and names eva before sva, but the arguments are sva, eva */
+
+	sched_pin();
+	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+		for (addr = sva; addr < eva; addr += PAGE_SIZE)
+			invlpg(addr);
+		smp_invlpg_range(sva, eva);
+	} else {
+		cpuid = PCPU_GET(cpuid);
+		other_cpus = all_cpus;
+		CPU_CLR(cpuid, &other_cpus);
+		if (CPU_ISSET(cpuid, &pmap->pm_active))
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		CPU_AND(&other_cpus, &pmap->pm_active);
+		if (!CPU_EMPTY(&other_cpus))
+			smp_masked_invlpg_range(other_cpus, sva, eva);
+	}
+	sched_unpin();
+	PT_UPDATES_FLUSH();
+}
+/*
+ * Flush the whole TLB: invltlb on this CPU when the pmap is active here,
+ * and smp_invltlb() or smp_masked_invltlb() for the other CPUs.  Unlike
+ * the page and range variants, this one does not call
+ * PT_UPDATES_FLUSH().
+ */
+void
+pmap_invalidate_all(pmap_t pmap)
+{
+	cpuset_t other_cpus;
+	u_int cpuid;
+
+	CTR1(KTR_PMAP, "pmap_invalidate_page: pmap=%p", pmap);	/* NOTE(review): the label reads pmap_invalidate_page, not pmap_invalidate_all */
+
+	sched_pin();
+	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+		invltlb();
+		smp_invltlb();
+	} else {
+		cpuid = PCPU_GET(cpuid);
+		other_cpus = all_cpus;
+		CPU_CLR(cpuid, &other_cpus);
+		if (CPU_ISSET(cpuid, &pmap->pm_active))
+			invltlb();
+		CPU_AND(&other_cpus, &pmap->pm_active);
+		if (!CPU_EMPTY(&other_cpus))
+			smp_masked_invltlb(other_cpus);
+	}
+	sched_unpin();
+}
+/*
+ * Execute wbinvd on this CPU, then call smp_cache_flush().
+ */
+void
+pmap_invalidate_cache(void)
+{
+
+	sched_pin();
+	wbinvd();
+	smp_cache_flush();
+	sched_unpin();
+}
+#else /* !SMP */
+/*
+ * Normal, non-SMP, 486+ invalidation functions.
+ * We inline these within pmap.c for speed.
+ * Each one acts only when the pmap is the kernel pmap or has a
+ * non-empty pm_active set.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+	CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x",
+	    pmap, va);
+
+	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+		invlpg(va);
+	PT_UPDATES_FLUSH();
+}
+
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	vm_offset_t addr;
+
+	if (eva - sva > PAGE_SIZE)	/* trace only ranges larger than one page */
+		CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x",
+		    pmap, sva, eva);
+
+	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+		for (addr = sva; addr < eva; addr += PAGE_SIZE)
+			invlpg(addr);
+	PT_UPDATES_FLUSH();
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+	CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap);
+
+	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+		invltlb();
+}
+
+PMAP_INLINE void
+pmap_invalidate_cache(void)
+{
+
+	wbinvd();
+}
+#endif /* !SMP */
+
+#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)	/* bytes; ranges of at least this size are flushed globally */
+/*
+ * Flush the CPU caches for the address range [sva, eva).  Both bounds
+ * must be page aligned (asserted).  Nothing is done when the CPU
+ * advertises self-snoop (CPUID_SS).  Otherwise clflush is issued per
+ * cache line when CPUID_CLFSH is present and the range is smaller than
+ * PMAP_CLFLUSH_THRESHOLD, and pmap_invalidate_cache() is used as the
+ * fallback.
+ */
+void
+pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
+{
+
+	KASSERT((sva & PAGE_MASK) == 0,
+	    ("pmap_invalidate_cache_range: sva not page-aligned"));
+	KASSERT((eva & PAGE_MASK) == 0,
+	    ("pmap_invalidate_cache_range: eva not page-aligned"));
+
+	if (cpu_feature & CPUID_SS)
+		; /* If "Self Snoop" is supported, do nothing. */
+	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
+
+		/*
+		 * Otherwise, do per-cache line flush.  Use the mfence
+		 * instruction to ensure that previous stores are
+		 * included in the write-back.  The processor
+		 * propagates flush to other processors in the cache
+		 * coherence domain.
+		 */
+		mfence();
+		for (; sva < eva; sva += cpu_clflush_line_size)
+			clflush(sva);
+		mfence();
+	} else {
+
+		/*
+		 * No targeted cache flush methods are supported by CPU,
+		 * or the supplied range is not smaller than
+		 * PMAP_CLFLUSH_THRESHOLD (2MB).
+		 * Globally invalidate cache.
+		 */
+		pmap_invalidate_cache();
+	}
+}
+/*
+ * Flush the caches for an array of count pages.  Falls back to
+ * pmap_invalidate_cache() when clflush is unavailable or when count
+ * pages would reach PMAP_CLFLUSH_THRESHOLD bytes; otherwise each page is
+ * handed to pmap_flush_page().
+ */
+void
+pmap_invalidate_cache_pages(vm_page_t *pages, int count)
+{
+	int i;
+
+	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
+	    (cpu_feature & CPUID_CLFSH) == 0) {
+		pmap_invalidate_cache();
+	} else {
+		for (i = 0; i < count; i++)
+			pmap_flush_page(pages[i]);
+	}
+}
+
+/*
+ * Are we current address space or kernel?  N.B. We return FALSE when
+ * a pmap's page table is in use because a kernel thread is borrowing
+ * it.  The borrowed page table can change spontaneously, making any
+ * dependence on its continued use subject to a race condition.
+ */
+static __inline int
+pmap_is_current(pmap_t pmap)
+{
+
+	return (pmap == kernel_pmap ||
+	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
+	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
+}
+
+/*
+ * If the given pmap is not the current or kernel pmap, the returned pte must
+ * be released by passing it to pmap_pte_release().
+ *
+ * Returns the pde itself for a PG_PS mapping, and NULL when the pde is
+ * zero.  For a pmap that is not current, the page table page is mapped
+ * through PMAP2/PADDR2 and PMAP2mutex is left locked on return.
+ */
+pt_entry_t *
+pmap_pte(pmap_t pmap, vm_offset_t va)
+{
+	pd_entry_t newpf;
+	pd_entry_t *pde;
+
+	pde = pmap_pde(pmap, va);
+	if (*pde & PG_PS)
+		return (pde);
+	if (*pde != 0) {
+		/* are we current address space or kernel? */
+		if (pmap_is_current(pmap))
+			return (vtopte(va));
+		mtx_lock(&PMAP2mutex);
+		newpf = *pde & PG_FRAME;
+		if ((*PMAP2 & PG_FRAME) != newpf) {	/* remap only if the window points elsewhere */
+			PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M);
+			CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x",
+			    pmap, va, (*PMAP2 & 0xffffffff));
+		}
+		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
+	}
+	return (NULL);
+}
+
+/*
+ * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
+ * being NULL.
+ */
+static __inline void
+pmap_pte_release(pt_entry_t *pte)
+{
+
+	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) {	/* pte lies in the PADDR2 window */
+		CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx",
+		    *PMAP2);
+		rw_wlock(&pvh_global_lock);
+		PT_SET_VA(PMAP2, 0, TRUE);
+		rw_wunlock(&pvh_global_lock);
+		mtx_unlock(&PMAP2mutex);
+	}
+}
+/*
+ * Invalidate the TLB entry for the address caddr and flush any queued
+ * page table updates.
+ */
+static __inline void
+invlcaddr(void *caddr)
+{
+
+	invlpg((u_int)caddr);
+	PT_UPDATES_FLUSH();
+}
+
+/*
+ * Super fast pmap_pte routine best used when scanning
+ * the pv lists.  This eliminates many coarse-grained
+ * invltlb calls.  Note that many of the pv list
+ * scans are across different pmaps.  It is very wasteful
+ * to do an entire invltlb for checking a single mapping.
+ *
+ * If the given pmap is not the current pmap, pvh_global_lock
+ * must be held and curthread pinned to a CPU.
+ *
+ * Returns the pde itself for a PG_PS mapping and 0 when the pde is
+ * zero.  A pmap that is not current is reached through the PMAP1/PADDR1
+ * window; the PMAP1changed, PMAP1changedcpu and PMAP1unchanged counters
+ * record whether the window had to be remapped, only invalidated on a
+ * new CPU, or reused as is.
+ */
+static pt_entry_t *
+pmap_pte_quick(pmap_t pmap, vm_offset_t va)
+{
+	pd_entry_t newpf;
+	pd_entry_t *pde;
+
+	pde = pmap_pde(pmap, va);
+	if (*pde & PG_PS)
+		return (pde);
+	if (*pde != 0) {
+		/* are we current address space or kernel? */
+		if (pmap_is_current(pmap))
+			return (vtopte(va));
+		rw_assert(&pvh_global_lock, RA_WLOCKED);
+		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
+		newpf = *pde & PG_FRAME;
+		if ((*PMAP1 & PG_FRAME) != newpf) {
+			PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M);
+			CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x",
+			    pmap, va, (u_long)*PMAP1);
+
+#ifdef SMP
+			PMAP1cpu = PCPU_GET(cpuid);
+#endif
+			PMAP1changed++;
+		} else
+#ifdef SMP
+		if (PMAP1cpu != PCPU_GET(cpuid)) {	/* same frame, but the window was last used on another CPU */
+			PMAP1cpu = PCPU_GET(cpuid);
+			invlcaddr(PADDR1);
+			PMAP1changedcpu++;
+		} else
+#endif
+			PMAP1unchanged++;
+		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
+	}
+	return (0);
+}
+
+/*
+ *	Routine:	pmap_extract
+ *	Function:
+ *		Extract the physical page address associated
+ *		with the given map/virtual_address pair.
+ */
+vm_paddr_t
+pmap_extract(pmap_t pmap, vm_offset_t va)
+{
+	vm_paddr_t rtval;
+	pt_entry_t *pte;
+	pd_entry_t pde;
+	pt_entry_t pteval;
+
+	rtval = 0;	/* returned unchanged when the pde is zero */
+	PMAP_LOCK(pmap);
+	pde = pmap->pm_pdir[va >> PDRSHIFT];
+	if (pde != 0) {
+		if ((pde & PG_PS) != 0) {
+			rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK);	/* large page: frame from the pde, offset from va */
+			PMAP_UNLOCK(pmap);
+			return rtval;
+		}
+		pte = pmap_pte(pmap, va);
+		pteval = *pte ? xpmap_mtop(*pte) : 0;	/* convert only a non-zero pte */
+		rtval = (pteval & PG_FRAME) | (va & PAGE_MASK);	/* NOTE(review): with a zero pte this still yields va & PAGE_MASK, not 0 */
+		pmap_pte_release(pte);
+	}
+	PMAP_UNLOCK(pmap);
+	return (rtval);
+}
+
+/*
+ *	Routine:	pmap_extract_ma
+ *	Function:
+ *		Like pmap_extract, but returns machine address
+ *
+ *		The frame bits of the pde or pte are used as stored,
+ *		without the xpmap_mtop() conversion that pmap_extract()
+ *		applies.  Returns 0 when the pde is zero.
+ */
+vm_paddr_t
+pmap_extract_ma(pmap_t pmap, vm_offset_t va)
+{
+	vm_paddr_t rtval;
+	pt_entry_t *pte;
+	pd_entry_t pde;
+
+	rtval = 0;
+	PMAP_LOCK(pmap);
+	pde = pmap->pm_pdir[va >> PDRSHIFT];
+	if (pde != 0) {
+		if ((pde & PG_PS) != 0) {
+			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
+			PMAP_UNLOCK(pmap);
+			return rtval;
+		}
+		pte = pmap_pte(pmap, va);
+		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
+		pmap_pte_release(pte);
+	}
+	PMAP_UNLOCK(pmap);
+	return (rtval);
+}
+
+/*
+ *	Routine:	pmap_extract_and_hold
+ *	Function:
+ *		Atomically extract and hold the physical page
+ *		with the given pmap and virtual address pair
+ *		if that mapping permits the given protection.
+ */
+vm_page_t
+pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
+{
+	pd_entry_t pde;
+	pt_entry_t pte, *ptep;
+	vm_page_t m;
+	vm_paddr_t pa;
+
+	pa = 0;
+	m = NULL;	/* returned as is when no suitable mapping exists */
+	PMAP_LOCK(pmap);
+retry:
+	pde = PT_GET(pmap_pde(pmap, va));
+	if (pde != 0) {
+		if (pde & PG_PS) {
+			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {	/* writable, or write access not requested */
+				if (vm_page_pa_tryrelock(pmap, (pde &
+				    PG_PS_FRAME) | (va & PDRMASK), &pa))
+					goto retry;
+				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
+				    (va & PDRMASK));
+				vm_page_hold(m);
+			}
+		} else {
+			ptep = pmap_pte(pmap, va);
+			pte = PT_GET(ptep);
+			pmap_pte_release(ptep);	/* the pte value was copied above */
+			if (pte != 0 &&
+			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
+				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
+				    &pa))
+					goto retry;
+				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
+				vm_page_hold(m);
+			}
+		}
+	}
+	PA_UNLOCK_COND(pa);
+	PMAP_UNLOCK(pmap);
+	return (m);
+}
+
+/***************************************************
+ * Low level mapping routines.....
+ ***************************************************/
+
+/*
+ * Add a wired page to the kva.
+ * Note: not SMP coherent.
+ *
+ * This function may be used before pmap_bootstrap() is called.
+ *
+ * pa is passed through xpmap_ptom() before it is written, and the
+ * entry is created with PG_RW, PG_V and pgeflag set.
+ */
+void
+pmap_kenter(vm_offset_t va, vm_paddr_t pa)
+{
+
+	PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag);
+}
+/*
+ * Like pmap_kenter(), but ma is used as given, without an xpmap_ptom()
+ * conversion, and the entry is stored with pte_store_ma() through
+ * vtopte(va).
+ */
+void
+pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma)
+{
+	pt_entry_t *pte;
+
+	pte = vtopte(va);
+	pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag);
+}
+/*
+ * Enter a kernel mapping for va with the cache bits selected by mode.
+ * NOTE(review): unlike pmap_kenter(), pa is not passed through
+ * xpmap_ptom() here; confirm that callers supply a machine address.
+ */
+static __inline void
+pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
+{
+
+	PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
+}
+
+/*
+ * Remove a page from the kernel pagetables.
+ * Note: not SMP coherent.
+ *
+ * This function may be used before pmap_bootstrap() is called.
+ */
+PMAP_INLINE void
+pmap_kremove(vm_offset_t va)
+{
+	pt_entry_t *pte;
+
+	pte = vtopte(va);
+	PT_CLEAR_VA(pte, FALSE);
+}
+
+/*
+ *	Used to map a range of physical addresses into kernel
+ *	virtual address space.
+ *
+ *	The value passed in '*virt' is a suggested virtual address for
+ *	the mapping. Architectures which can support a direct-mapped
+ *	physical to virtual region can return the appropriate address
+ *	within that region, leaving '*virt' unchanged. Other
+ *	architectures should map the pages starting at '*virt' and
+ *	update '*virt' with the first usable address after the mapped
+ *	region.
+ */
+vm_offset_t
+pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
+{
+	vm_offset_t va, sva;
+
+	va = sva = *virt;
+	CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x",
+	    va, start, end, prot);
+	while (start < end) {	/* one pmap_kenter() per page; prot is only traced */
+		pmap_kenter(va, start);
+		va += PAGE_SIZE;
+		start += PAGE_SIZE;
+	}
+	pmap_invalidate_range(kernel_pmap, sva, va);
+	*virt = va;
+	return (sva);	/* first va of the mapping; *virt now points past it */
+}
+
+
+/*
+ * Add a list of wired pages to the kva
+ * this routine is only used for temporary
+ * kernel mappings that do not need to have
+ * page modification or references recorded.
+ * Note that old mappings are simply written
+ * over.  The page *must* be wired.
+ * Note: SMP coherent.  Uses a ranged shootdown IPI.
+ *
+ * In this Xen version the entries are installed with batched
+ * __HYPERVISOR_update_va_mapping multicalls, at most 16 entries per
+ * HYPERVISOR_multicall().  An entry whose old pte had PG_V set is
+ * submitted with UVMF_INVLPG|UVMF_ALL; no separate invalidation call
+ * is made from this function.
+ */
+void
+pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
+{
+	pt_entry_t *endpte, *pte;
+	vm_paddr_t pa;
+	vm_offset_t va = sva;
+	int mclcount = 0;	/* entries queued in mcl[] so far */
+	multicall_entry_t mcl[16];
+	multicall_entry_t *mclp = mcl;
+	int error;
+
+	CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count);
+	pte = vtopte(sva);
+	endpte = pte + count;
+	while (pte < endpte) {
+		pa = VM_PAGE_TO_MACH(*ma) | pgeflag | PG_RW | PG_V | PG_M | PG_A;
+
+		mclp->op = __HYPERVISOR_update_va_mapping;
+		mclp->args[0] = va;
+		mclp->args[1] = (uint32_t)(pa & 0xffffffff);	/* low 32 bits of the new pte */
+		mclp->args[2] = (uint32_t)(pa >> 32);	/* high 32 bits of the new pte */
+		mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0;
+
+		va += PAGE_SIZE;
+		pte++;
+		ma++;
+		mclp++;
+		mclcount++;
+		if (mclcount == 16) {	/* batch is full: submit it and start over */
+			error = HYPERVISOR_multicall(mcl, mclcount);
+			mclp = mcl;
+			mclcount = 0;
+			KASSERT(error == 0, ("bad multicall %d", error));
+		}
+	}
+	if (mclcount) {	/* submit the final partial batch */
+		error = HYPERVISOR_multicall(mcl, mclcount);
+		KASSERT(error == 0, ("bad multicall %d", error));
+	}
+
+#ifdef INVARIANTS
+	for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++)
+		KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE));
+#endif
+}
+
+/*
+ * This routine tears out page mappings from the
+ * kernel -- it is meant only for temporary mappings.
+ * Note: SMP coherent.  Uses a ranged shootdown IPI.
+ *
+ * The ptes are cleared one page at a time with pmap_kremove() while
+ * pvh_global_lock is write-locked and a critical section is held; the
+ * queued updates are then flushed before the range is invalidated.
+ */
+void
+pmap_qremove(vm_offset_t sva, int count)
+{
+	vm_offset_t va;
+
+	CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count);
+	va = sva;
+	rw_wlock(&pvh_global_lock);
+	critical_enter();
+	while (count-- > 0) {
+		pmap_kremove(va);
+		va += PAGE_SIZE;
+	}
+	PT_UPDATES_FLUSH();
+	pmap_invalidate_range(kernel_pmap, sva, va);
+	critical_exit();
+	rw_wunlock(&pvh_global_lock);
+}
+
+/***************************************************
+ * Page table page management routines.....
+ ***************************************************/
+static __inline void
+pmap_free_zero_pages(vm_page_t free)
+{
+	vm_page_t m;
+
+	while (free != NULL) {
+		m = free;
+		free = m->right;	/* the list is threaded through the right field */
+		vm_page_free_zero(m);
+	}
+}
+
+/*
+ * Decrements a page table page's wire count, which is used to record the
+ * number of valid page table entries within the page.  If the wire count
+ * drops to zero, then the page table page is unmapped.  Returns TRUE if the
+ * page table page was unmapped and FALSE otherwise.
+ */
+static inline boolean_t
+pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
+{
+
+	--m->wire_count;
+	if (m->wire_count == 0) {
+		_pmap_unwire_ptp(pmap, m, free);
+		return (TRUE);
+	} else
+		return (FALSE);
+}
+/*
+ * Retire the page table page m once its wire count has reached zero:
+ * unpin it, clear its page directory entry, zero the page, and chain it
+ * onto *free through the right field so that the caller can release it
+ * after TLB shootdown is complete.
+ */
+static void
+_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
+{
+	vm_offset_t pteva;
+
+	PT_UPDATES_FLUSH();
+	/*
+	 * unmap the page table page
+	 */
+	xen_pt_unpin(pmap->pm_pdir[m->pindex]);
+	/*
+	 * page *might* contain residual mapping :-/
+	 */
+	PD_CLEAR_VA(pmap, m->pindex, TRUE);
+	pmap_zero_page(m);
+	--pmap->pm_stats.resident_count;
+
+	/*
+	 * This is a release store so that the ordinary store unmapping
+	 * the page table page is globally performed before TLB shoot-
+	 * down is begun.
+	 */
+	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
+
+	/*
+	 * Invalidate the TLB entry for pteva, the address computed from
+	 * the pindex of this page table page, so that the removed
+	 * mapping takes effect immediately.
+	 */
+	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
+	pmap_invalidate_page(pmap, pteva);
+
+	/*
+	 * Put page on a list so that it is released after
+	 * *ALL* TLB shootdown is done
+	 */
+	m->right = *free;
+	*free = m;
+}
+
+/*
+ * After removing a page table entry, this routine is used to
+ * conditionally free the page, and manage the hold/wire counts.
+ * Returns 0 without touching anything for addresses at or above
+ * VM_MAXUSER_ADDRESS; otherwise returns the pmap_unwire_ptp() result.
+ */
+static int
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
+{
+	pd_entry_t ptepde;
+	vm_page_t mpte;
+
+	if (va >= VM_MAXUSER_ADDRESS)
+		return (0);
+	ptepde = PT_GET(pmap_pde(pmap, va));
+	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
+	return (pmap_unwire_ptp(pmap, mpte, free));
+}
+
+/*
+ * Initialize the pmap for the swapper process.
+ */
+void
+pmap_pinit0(pmap_t pmap)
+{
+
+	PMAP_LOCK_INIT(pmap);
+	/*
+	 * Since the page table directory is shared with the kernel pmap,
+	 * which is already included in the list "allpmaps", this pmap does
+	 * not need to be inserted into that list.
+	 */
+	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
+#ifdef PAE
+	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
+#endif
+	CPU_ZERO(&pmap->pm_active);
+	PCPU_SET(curpmap, pmap);
+	TAILQ_INIT(&pmap->pm_pvchunk);
+	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+}
+
+/*
+ * Initialize a preallocated and zeroed pmap structure,
+ * such as one in a vmspace structure.
+ *
+ * Returns 1 on success and 0 when kva for the page directory cannot be
+ * allocated.  Page allocation itself does not fail: the loop below
+ * waits with VM_WAIT until NPGPTD + 1 pages have been obtained.
+ */
+int
+pmap_pinit(pmap_t pmap)
+{
+	vm_page_t m, ptdpg[NPGPTD + 1];
+	int npgptd = NPGPTD + 1;
+	int i;
+
+#ifdef HAMFISTED_LOCKING
+	mtx_lock(&createdelete_lock);
+#endif
+
+	PMAP_LOCK_INIT(pmap);
+
+	/*
+	 * No need to allocate page table space yet but we do need a valid
+	 * page directory table.
+	 */
+	if (pmap->pm_pdir == NULL) {
+		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
+		    NBPTD);
+		if (pmap->pm_pdir == NULL) {
+			PMAP_LOCK_DESTROY(pmap);
+#ifdef HAMFISTED_LOCKING
+			mtx_unlock(&createdelete_lock);
+#endif
+			return (0);
+		}
+#ifdef PAE
+		pmap->pm_pdpt = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1);	/* NOTE(review): result is not checked for NULL */
+#endif
+	}
+
+	/*
+	 * allocate the page directory page(s)
+	 */
+	for (i = 0; i < npgptd;) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+		if (m == NULL)
+			VM_WAIT;
+		else {
+			ptdpg[i++] = m;
+		}
+	}
+
+	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
+
+	for (i = 0; i < NPGPTD; i++)
+		if ((ptdpg[i]->flags & PG_ZERO) == 0)
+			pagezero(pmap->pm_pdir + (i * NPDEPG));
+
+	mtx_lock_spin(&allpmaps_lock);
+	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
+	/* Copy the kernel page table directory entries. */
+	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
+	mtx_unlock_spin(&allpmaps_lock);
+
+#ifdef PAE
+	pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1);
+	if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0)
+		bzero(pmap->pm_pdpt, PAGE_SIZE);
+	for (i = 0; i < NPGPTD; i++) {
+		vm_paddr_t ma;
+
+		ma = VM_PAGE_TO_MACH(ptdpg[i]);
+		pmap->pm_pdpt[i] = ma | PG_V;
+
+	}
+#endif
+	for (i = 0; i < NPGPTD; i++) {
+		pt_entry_t *pd;
+		vm_paddr_t ma;
+
+		ma = VM_PAGE_TO_MACH(ptdpg[i]);
+		pd = pmap->pm_pdir + (i * NPDEPG);
+		PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW));	/* rewrite the kva mapping of this directory page without PG_RW */
+#if 0
+		xen_pgd_pin(ma);
+#endif
+	}
+
+#ifdef PAE
+	PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW);
+#endif
+	rw_wlock(&pvh_global_lock);
+	xen_flush_queue();
+	xen_pgdpt_pin(VM_PAGE_TO_MACH(ptdpg[NPGPTD]));
+	for (i = 0; i < NPGPTD; i++) {
+		vm_paddr_t ma = VM_PAGE_TO_MACH(ptdpg[i]);
+		PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE);
+	}
+	xen_flush_queue();
+	rw_wunlock(&pvh_global_lock);
+	CPU_ZERO(&pmap->pm_active);
+	TAILQ_INIT(&pmap->pm_pvchunk);
+	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
+#ifdef HAMFISTED_LOCKING
+	mtx_unlock(&createdelete_lock);
+#endif
+	return (1);
+}
+
+/*
+ * this routine is called if the page table page is not
+ * mapped correctly.
+ *
+ * Allocates, pins and installs a page table page at index ptepindex.
+ * Returns NULL when no page could be allocated; with M_WAITOK the pmap
+ * lock and pvh_global_lock are dropped around VM_WAIT and reacquired
+ * before returning.
+ */
+static vm_page_t
+_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
+{
+	vm_paddr_t ptema;
+	vm_page_t m;
+
+	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
+	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
+	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
+
+	/*
+	 * Allocate a page table page.
+	 */
+	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
+		if (flags & M_WAITOK) {
+			PMAP_UNLOCK(pmap);
+			rw_wunlock(&pvh_global_lock);
+			VM_WAIT;
+			rw_wlock(&pvh_global_lock);
+			PMAP_LOCK(pmap);
+		}
+
+		/*
+		 * Indicate the need to retry.  While waiting, the page table
+		 * page may have been allocated.
+		 */
+		return (NULL);
+	}
+	if ((m->flags & PG_ZERO) == 0)
+		pmap_zero_page(m);
+
+	/*
+	 * Map the pagetable page into the process address space, if
+	 * it isn't already there.
+	 */
+
+	pmap->pm_stats.resident_count++;
+
+	ptema = VM_PAGE_TO_MACH(m);
+	xen_pt_pin(ptema);
+	PT_SET_VA_MA(&pmap->pm_pdir[ptepindex],
+		(ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE);
+
+	KASSERT(pmap->pm_pdir[ptepindex],
+	    ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex));
+	return (m);
+}
+/*
+ * Return the page table page that maps va, taking an extra wire count
+ * on an existing page or allocating one through _pmap_allocpte().
+ * With M_WAITOK the lookup is retried until it succeeds; otherwise the
+ * _pmap_allocpte() result, possibly NULL, is returned.
+ */
+static vm_page_t
+pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
+{
+	u_int ptepindex;
+	pd_entry_t ptema;
+	vm_page_t m;
+
+	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
+	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
+	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
+
+	/*
+	 * Calculate pagetable page index
+	 */
+	ptepindex = va >> PDRSHIFT;
+retry:
+	/*
+	 * Get the page directory entry
+	 */
+	ptema = pmap->pm_pdir[ptepindex];
+
+	/*
+	 * This supports switching from a 4MB page to a
+	 * normal 4K page.
+	 */
+	if (ptema & PG_PS) {
+		/*
+		 * XXX
+		 */
+		pmap->pm_pdir[ptepindex] = 0;
+		ptema = 0;
+		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
+		pmap_invalidate_all(kernel_pmap);
+	}
+
+	/*
+	 * If the page table page is mapped, we just increment the
+	 * hold count, and activate it.
+	 */
+	if (ptema & PG_V) {
+		m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME);
+		m->wire_count++;
+	} else {
+		/*
+		 * Here if the pte page isn't mapped, or if it has
+		 * been deallocated.
+		 */
+		CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x",
+		    pmap, va, flags);
+		m = _pmap_allocpte(pmap, ptepindex, flags);
+		if (m == NULL && (flags & M_WAITOK))
+			goto retry;
+
+		KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex));	/* NOTE(review): also reached when m is NULL under M_NOWAIT */
+	}
+	return (m);
+}
+
+
+/***************************************************
+* Pmap allocation/deallocation routines.
+ ***************************************************/
+
+#ifdef SMP
+/*
+ * Deal with a SMP shootdown of other users of the pmap that we are
+ * trying to dispose of.  This can be a bit hairy.
+ */
+static cpuset_t *lazymask;	/* pm_active set of the pmap being fixed up */
+static u_int lazyptd;	/* page directory address compared against %cr3 */
+static volatile u_int lazywait;	/* set to 1 by pmap_lazyfix_action() when done */
+
+void	pmap_lazyfix_action(void);
+/*
+ * Run on the CPU that was sent IPI_LAZYPMAP by pmap_lazyfix(): if this
+ * CPU still has lazyptd loaded in %cr3, switch to the page directory of
+ * the current pcb, then clear this CPU from lazymask and signal
+ * completion through lazywait.
+ */
+void
+pmap_lazyfix_action(void)
+{
+
+#ifdef COUNT_IPIS
+	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
+#endif
+	if (rcr3() == lazyptd)
+		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
+	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
+	atomic_store_rel_int(&lazywait, 1);
+}
+/*
+ * Same fix-up as pmap_lazyfix_action(), performed directly on the
+ * calling CPU; lazywait is not touched.
+ */
+static void
+pmap_lazyfix_self(u_int cpuid)
+{
+
+	if (rcr3() == lazyptd)
+		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
+	CPU_CLR_ATOMIC(cpuid, lazymask);
+}
+
+/*
+ * Clear every CPU out of pmap->pm_active, one CPU per loop iteration.
+ * The local CPU is handled by pmap_lazyfix_self(); a remote CPU is sent
+ * IPI_LAZYPMAP and waited for by spinning on lazywait, for at most
+ * 50000000 iterations.  smp_ipi_mtx is held while the shared lazy*
+ * variables are in use.
+ */
+static void
+pmap_lazyfix(pmap_t pmap)
+{
+	cpuset_t mymask, mask;
+	u_int cpuid, spins;
+	int lsb;
+
+	mask = pmap->pm_active;
+	while (!CPU_EMPTY(&mask)) {
+		spins = 50000000;	/* bound on the busy-wait below */
+
+		/* Find least significant set bit. */
+		lsb = cpusetobj_ffs(&mask);
+		MPASS(lsb != 0);
+		lsb--;
+		CPU_SETOF(lsb, &mask);	/* mask now names only that one CPU */
+		mtx_lock_spin(&smp_ipi_mtx);
+#ifdef PAE
+		lazyptd = vtophys(pmap->pm_pdpt);
+#else
+		lazyptd = vtophys(pmap->pm_pdir);
+#endif
+		cpuid = PCPU_GET(cpuid);
+
+		/* Use a cpuset just for having an easy check. */
+		CPU_SETOF(cpuid, &mymask);
+		if (!CPU_CMP(&mask, &mymask)) {
+			lazymask = &pmap->pm_active;
+			pmap_lazyfix_self(cpuid);
+		} else {
+			atomic_store_rel_int((u_int *)&lazymask,
+			    (u_int)&pmap->pm_active);
+			atomic_store_rel_int(&lazywait, 0);
+			ipi_selected(mask, IPI_LAZYPMAP);
+			while (lazywait == 0) {
+				ia32_pause();
+				if (--spins == 0)
+					break;
+			}
+		}
+		mtx_unlock_spin(&smp_ipi_mtx);
+		if (spins == 0)
+			printf("pmap_lazyfix: spun for 50000000\n");
+		mask = pmap->pm_active;	/* re-read; loop until no CPU remains */
+	}
+}
+
+#else	/* SMP */
+
+/*
+ * Cleaning up on uniprocessor is easy.
For various reasons, we're
+ * unlikely to have to even execute this code, including the fact
+ * that the cleanup is deferred until the parent does a wait(2), which
+ * means that another userland process has run.
+ */
+static void
+pmap_lazyfix(pmap_t pmap)
+{
+	u_int cr3;
+
+	cr3 = vtophys(pmap->pm_pdir);
+	if (cr3 == rcr3()) {
+		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
+		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
+	}
+}
+#endif	/* SMP */
+
+/*
+ * Release any resources held by the given physical map.
+ * Called when a pmap initialized by pmap_pinit is being released.
+ * Should only be called if the map contains no valid mappings.
+ *
+ * Removes the pmap from allpmaps, unmaps the page directory pages from
+ * kva, and unwires and frees them.  The resident count must already be
+ * zero (asserted).
+ */
+void
+pmap_release(pmap_t pmap)
+{
+	vm_page_t m, ptdpg[2*NPGPTD+1];
+	vm_paddr_t ma;
+	int i;
+#ifdef PAE
+	int npgptd = NPGPTD + 1;
+#else
+	int npgptd = NPGPTD;
+#endif
+
+	KASSERT(pmap->pm_stats.resident_count == 0,
+	    ("pmap_release: pmap resident count %ld != 0",
+	    pmap->pm_stats.resident_count));
+	PT_UPDATES_FLUSH();
+
+#ifdef HAMFISTED_LOCKING
+	mtx_lock(&createdelete_lock);
+#endif
+
+	pmap_lazyfix(pmap);
+	mtx_lock_spin(&allpmaps_lock);
+	LIST_REMOVE(pmap, pm_list);
+	mtx_unlock_spin(&allpmaps_lock);
+
+	for (i = 0; i < NPGPTD; i++)
+		ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME);	/* look the pages up before the kva mapping is removed */
+	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
+#ifdef PAE
+	ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt));
+#endif
+
+	for (i = 0; i < npgptd; i++) {
+		m = ptdpg[i];
+		ma = VM_PAGE_TO_MACH(m);
+		/* unpinning L1 and L2 treated the same */
+#if 0
+		xen_pgd_unpin(ma);
+#else
+		if (i == NPGPTD)
+			xen_pgd_unpin(ma);
+#endif
+#ifdef PAE
+		if (i < NPGPTD)
+			KASSERT(VM_PAGE_TO_MACH(m) == (pmap->pm_pdpt[i] & PG_FRAME),
+			    ("pmap_release: got wrong ptd page"));
+#endif
+		m->wire_count--;
+		atomic_subtract_int(&cnt.v_wire_count, 1);
+		vm_page_free(m);
+	}
+#ifdef PAE
+	pmap_qremove((vm_offset_t)pmap->pm_pdpt, 1);
+#endif
+	PMAP_LOCK_DESTROY(pmap);
+
+#ifdef HAMFISTED_LOCKING
+	mtx_unlock(&createdelete_lock);
+#endif
+}
+/*
+ * Sysctl handler that reports VM_MAX_KERNEL_ADDRESS - KERNBASE.
+ */
+static int
+kvm_size(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
+
+	return (sysctl_handle_long(oidp, &ksize, 0, req));
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
+    0, 0, kvm_size, "IU", "Size of KVM");
+/*
+ * Sysctl handler that reports VM_MAX_KERNEL_ADDRESS - kernel_vm_end.
+ */
+static int
+kvm_free(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+
+	return (sysctl_handle_long(oidp, &kfree, 0, req));
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
+    0, 0, kvm_free, "IU", "Amount of KVM free");
+
+/*
+ * grow the number of kernel page table entries, if needed
+ *
+ * Advances kernel_vm_end in NBPDR steps until it reaches addr, which is
+ * first rounded up to NBPDR and clamped to kernel_map->max_offset.
+ * Each new page table page is installed in the kernel pmap and in every
+ * pmap on allpmaps.  Panics if a page cannot be allocated.  The caller
+ * must hold kernel_map->system_mtx (asserted).
+ */
+void
+pmap_growkernel(vm_offset_t addr)
+{
+	struct pmap *pmap;
+	vm_paddr_t ptppaddr;
+	vm_page_t nkpg;
+	pd_entry_t newpdir;
+
+	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
+	if (kernel_vm_end == 0) {	/* first call: count the page directory entries already present */
+		kernel_vm_end = KERNBASE;
+		nkpt = 0;
+		while (pdir_pde(PTD, kernel_vm_end)) {
+			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
+			nkpt++;
+			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+				kernel_vm_end = kernel_map->max_offset;
+				break;
+			}
+		}
+	}
+	addr = roundup2(addr, NBPDR);
+	if (addr - 1 >= kernel_map->max_offset)
+		addr = kernel_map->max_offset;
+	while (kernel_vm_end < addr) {
+		if (pdir_pde(PTD, kernel_vm_end)) {	/* already mapped: just advance */
+			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
+			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+				kernel_vm_end = kernel_map->max_offset;
+				break;
+			}
+			continue;
+		}
+
+		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
+		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+		    VM_ALLOC_ZERO);
+		if (nkpg == NULL)
+			panic("pmap_growkernel: no memory to grow kernel");
+
+		nkpt++;
+
+		if ((nkpg->flags & PG_ZERO) == 0)
+			pmap_zero_page(nkpg);
+		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
+		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
+		rw_wlock(&pvh_global_lock);
+		PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE);
+		mtx_lock_spin(&allpmaps_lock);
+		LIST_FOREACH(pmap, &allpmaps, pm_list)
+			PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE);
+
+		mtx_unlock_spin(&allpmaps_lock);
+		rw_wunlock(&pvh_global_lock);
+
+		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
+		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+			kernel_vm_end = kernel_map->max_offset;
+			break;
+		}
+	}
+}
+
+
+/***************************************************
+ * page management routines.
+ ***************************************************/
+
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
+/*
+ * Return the pv_chunk that contains pv by masking off the page offset
+ * bits of its address.
+ */
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
+#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
+
+static const uint32_t pc_freemask[_NPCM] = {
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE10
+};
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+	"Current number of pv entries");
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+	"Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+	"Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+	"Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+	"Number of times tried to get a chunk page but failed.");
+
+static long pv_entry_frees, pv_entry_allocs;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
&pv_entry_frees, 0, + "Current number of pv entry frees"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, + "Current number of pv entry allocs"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, + "Current number of spare pv entries"); +#endif + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + */ +static vm_page_t +pmap_pv_reclaim(pmap_t locked_pmap) +{ + struct pch newtail; + struct pv_chunk *pc; + pmap_t pmap; + pt_entry_t *pte, tpte; + pv_entry_t pv; + vm_offset_t va; + vm_page_t free, m, m_pc; + uint32_t inuse; + int bit, field, freed; + + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + pmap = NULL; + free = m_pc = NULL; + TAILQ_INIT(&newtail); + while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || + free == NULL)) { + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + if (pmap != pc->pc_pmap) { + if (pmap != NULL) { + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + pmap = pc->pc_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) + PMAP_LOCK(pmap); + else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { + pmap = NULL; + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + continue; + } + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. 
+ */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = bsfl(inuse); + pv = &pc->pc_pventry[field * 32 + bit]; + va = pv->pv_va; + pte = pmap_pte(pmap, va); + tpte = *pte; + if ((tpte & PG_W) == 0) + tpte = pte_load_clear(pte); + pmap_pte_release(pte); + if ((tpte & PG_W) != 0) + continue; + KASSERT(tpte != 0, + ("pmap_pv_reclaim: pmap %p va %x zero pte", + pmap, va)); + if ((tpte & PG_G) != 0) + pmap_invalidate_page(pmap, va); + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((tpte & PG_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt(pmap, va, &free); + freed++; + } + } + if (freed == 0) { + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + continue; + } + /* Every freed mapping is for a 4 KB page. */ + pmap->pm_stats.resident_count -= freed; + PV_STAT(pv_entry_frees += freed); + PV_STAT(pv_entry_spare += freed); + pv_entry_count -= freed; + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + for (field = 0; field < _NPCM; field++) + if (pc->pc_map[field] != pc_freemask[field]) { + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, + pc_list); + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + + /* + * One freed pv entry in locked_pmap is + * sufficient. + */ + if (pmap == locked_pmap) + goto out; + break; + } + if (field == _NPCM) { + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* Entire chunk is free; return it. 
*/ + m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); + break; + } + } +out: + TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); + if (pmap != NULL) { + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + if (m_pc == NULL && pv_vafree != 0 && free != NULL) { + m_pc = free; + free = m_pc->right; + /* Recycle a freed page table page. */ + m_pc->wire_count = 1; + atomic_add_int(&cnt.v_wire_count, 1); + } + pmap_free_zero_pages(free); + return (m_pc); +} + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int idx, field, bit; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 32; + bit = idx % 32; + pc->pc_map[field] |= 1ul << bit; + for (idx = 0; idx < _NPCM; idx++) + if (pc->pc_map[idx] != pc_freemask[idx]) { + /* + * 98% of the time, pc is already at the head of the + * list. If it isn't already, move it to the head. + */ + if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != + pc)) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, + pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_unwire(m, 0); + vm_page_free(m); + pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); +} + +/* + * get a new pv_entry, allocating a block from the system + * when needed. 
 */
/*
 * "try" selects the behaviour when no chunk page can be obtained:
 * TRUE returns NULL, FALSE falls back to pmap_pv_reclaim() and retries.
 * The caller must hold the pmap lock and pvh_global_lock (write).
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PV_STAT(pv_entry_allocs++);
	/* Counted up front; undone below if a "try" allocation fails. */
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing either the vm.pmap.shpgperproc or the "
			    "vm.pmap.pv_entry_max tunable.\n");
retry:
	/* Fast path: take a free entry from the pmap's first chunk. */
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfl(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/*
	 * Access to the ptelist "pv_vafree" is synchronized by the page
	 * queues lock.  If "pv_vafree" is currently non-empty, it will
	 * remain non-empty until pmap_ptelist_alloc() completes.
	 */
	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		/*
		 * Reclaim may free entries in this pmap's chunks without
		 * yielding a page; in that case rescan from "retry".
		 */
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	/* Map the new page at a free chunk address and initialize it. */
	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	if ((m->flags & PG_ZERO) == 0)
		pagezero(pc);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	/* Entry 0 was marked in use above and is handed to the caller. */
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

/*
 * Find the pv entry for (pmap, va) on pvh's pv list and unlink it.
 * Returns the entry, or the list iterator's end value if no entry
 * matched (pmap_pvh_free() below treats that as NULL).  The entry is
 * not freed here.  Requires pvh_global_lock (write).
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
			break;
		}
	}
	return (pv);
}

/*
 * Unlink the pv entry for (pmap, va) from pvh and free it.  The entry
 * is expected to exist (asserted).
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Drop the pv entry recording that page "m" is mapped at "va" in
 * "pmap".  If that was the page's last pv entry, PGA_WRITEABLE is
 * cleared on the page.  Requires pvh_global_lock (write).
 */
static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_aflag_clear(m, PGA_WRITEABLE);
}

/*
 * Conditionally create a pv entry.
 */
/*
 * Succeeds (TRUE) only while pv_entry_count is below
 * pv_entry_high_water and get_pv_entry() can supply an entry without
 * reclaiming; the new entry is appended to m's pv list.  Otherwise
 * returns FALSE and changes nothing.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 *
 * Clears the PTE at "ptq", updates the pmap's wired/resident counts,
 * propagates the modified/referenced bits of a managed page to its
 * vm_page and removes its pv entry.  Returns the result of
 * pmap_unuse_pt(); pmap_remove() stops scanning the current page
 * table page when it is non-zero.  Page table pages released along
 * the way are chained on *free.
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
{
	pt_entry_t oldpte;
	vm_page_t m;

	CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x",
	    pmap, (u_long)*ptq, va);

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Snapshot the old entry before clearing it. */
	oldpte = *ptq;
	PT_SET_VA_MA(ptq, 0, TRUE);
	KASSERT(oldpte != 0,
	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg, also don't support
	 * PG_G.
	 */
	if (oldpte & PG_G)
		pmap_invalidate_page(kernel_pmap, va);
	pmap->pm_stats.resident_count -= 1;
	if (oldpte & PG_MANAGED) {
		/* The PTE holds a machine address; convert before lookup. */
		m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME);
		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		if (oldpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt(pmap, va, free));
}

/*
 * Remove a single page from a process address space
 *
 * Does nothing if there is no valid PTE for "va".  The caller must hold
 * pvh_global_lock (write) and the pmap lock, and must have pinned the
 * current thread (all three are asserted).
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
{
	pt_entry_t *pte;

	CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x",
	    pmap, va);

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0)
		return;
	pmap_remove_pte(pmap, pte, va, free);
	pmap_invalidate_page(pmap, va);
	/* Clear the PMAP1 slot if it is still populated. */
	if (*PMAP1)
		PT_SET_MA(PADDR1, 0);

}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	vm_page_t free = NULL;
	int anyvalid;

	CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x",
	    pmap, sva, eva);

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	/* Set when a non-global mapping is removed; see "out" below. */
	anyvalid = 0;

	rw_wlock(&pvh_global_lock);
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if ((sva + PAGE_SIZE == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		/*
		 * pmap_remove_page() does its own invalidation and PMAP1
		 * cleanup, so skip straight to the unlock path.
		 */
		pmap_remove_page(pmap, sva, &free);
		goto out;
	}

	for (; sva < eva; sva = pdnxt) {
		u_int pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		/* Address wrapped past the top of the address space. */
		if (pdnxt < sva)
			pdnxt = eva;
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			PD_CLEAR_VA(pmap, pdirindex, TRUE);
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			anyvalid = 1;
			continue;
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if ((*pte & PG_V) == 0)
				continue;

			/*
			 * The TLB entry for a PG_G mapping is invalidated
			 * by pmap_remove_pte().
			 */
			if ((*pte & PG_G) == 0)
				anyvalid = 1;
			if (pmap_remove_pte(pmap, pte, sva, &free))
				break;
		}
	}
	PT_UPDATES_FLUSH();
	if (*PMAP1)
		PT_SET_VA_MA(PMAP1, 0, TRUE);
out:
	if (anyvalid)
		pmap_invalidate_all(pmap);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	/* Release the page table pages collected during the removal. */
	pmap_free_zero_pages(free);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */

/*
 * "m" must be a managed page (asserted).  Each pv entry on the page's
 * list is processed under its own pmap's lock; pvh_global_lock is held
 * for writing and the thread is pinned for the whole walk.
 */
void
pmap_remove_all(vm_page_t m)
{
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	vm_page_t free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	free = NULL;
	rw_wlock(&pvh_global_lock);
	sched_pin();
	/* Each iteration removes the head entry, so the list drains. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count--;
		pte = pmap_pte_quick(pmap, pv->pv_va);
		/* Snapshot the old entry, then clear the mapping. */
		tpte = *pte;
		PT_SET_VA_MA(pte, 0, TRUE);
		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
		    pmap, pv->pv_va));
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, &free);
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	/* No mappings remain, so the page can no longer be written. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	PT_UPDATES_FLUSH();
	if (*PMAP1)
		PT_SET_MA(PADDR1, 0);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	pmap_free_zero_pages(free);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 */
/*
 * Protection can only be reduced here.  A request without
 * VM_PROT_READ removes the range.  A request that would not take
 * anything away (write allowed without PAE; write and execute both
 * allowed with PAE) returns immediately.  Otherwise write permission
 * and, under PAE, execute permission are revoked as requested from
 * every valid mapping in [sva, eva).
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	int anychanged;

	CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x",
	    pmap, sva, eva, prot);

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

#ifdef PAE
	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
		return;
#else
	if (prot & VM_PROT_WRITE)
		return;
#endif

	/* Set when a non-global mapping changes; triggers a full flush. */
	anychanged = 0;

	rw_wlock(&pvh_global_lock);
	sched_pin();
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		pt_entry_t obits, pbits;
		u_int pdirindex;

		pdnxt = (sva + NBPDR) & ~PDRMASK;
		/* Address wrapped past the top of the address space. */
		if (pdnxt < sva)
			pdnxt = eva;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			if ((prot & VM_PROT_WRITE) == 0)
				pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
#ifdef PAE
			if ((prot & VM_PROT_EXECUTE) == 0)
				pmap->pm_pdir[pdirindex] |= pg_nx;
#endif
			anychanged = 1;
			continue;
		}

		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			vm_page_t m;

retry:
			/*
			 * Regardless of whether a pte is 32 or 64 bits in
			 * size, PG_RW, PG_A, and PG_M are among the least
			 * significant 32 bits.
			 */
			obits = pbits = *pte;
			if ((pbits & PG_V) == 0)
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				/*
				 * A managed page that was written through this
				 * mapping is marked dirty before PG_M is lost.
				 */
				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
				    (PG_MANAGED | PG_M | PG_RW)) {
					m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) &
					    PG_FRAME);
					vm_page_dirty(m);
				}
				pbits &= ~(PG_RW | PG_M);
			}
#ifdef PAE
			if ((prot & VM_PROT_EXECUTE) == 0)
				pbits |= pg_nx;
#endif

			if (pbits != obits) {
				obits = *pte;
				PT_SET_VA_MA(pte, pbits, TRUE);
				/*
				 * If the entry does not read back as written,
				 * start over for this pte.
				 */
				if (*pte != pbits)
					goto retry;
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = 1;
			}
		}
	}
	PT_UPDATES_FLUSH();
	if (*PMAP1)
		PT_SET_VA_MA(PMAP1, 0, TRUE);
	if (anychanged)
		pmap_invalidate_all(pmap);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
/*
 * In the code below "access" is only used for tracing.  "prot"
 * selects PG_RW (and, under PAE, pg_nx), and "wired" selects PG_W.
 * User addresses (below VM_MAXUSER_ADDRESS) get a page table page
 * allocated on demand with M_WAITOK.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
    vm_prot_t prot, boolean_t wired)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	pt_entry_t newpte, origpte;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	boolean_t invlva;

	CTR6(KTR_PMAP, "pmap_enter: pmap=%08p va=0x%08x access=0x%x ma=0x%08x prot=0x%x wired=%d",
	    pmap, va, access, VM_PAGE_TO_MACH(m), prot, wired);
	va = trunc_page(va);
	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
	    va));
	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
	    VM_OBJECT_LOCKED(m->object),
	    ("pmap_enter: page %p is not busy", m));

	mpte = NULL;

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		mpte = pmap_allocpte(pmap, va, M_WAITOK);
	}

	pde = pmap_pde(pmap, va);
	if ((*pde & PG_PS) != 0)
		panic("pmap_enter: attempted pmap_enter on 4MB page");
	pte = pmap_pte_quick(pmap, va);

	/*
	 * Page Directory table entry not valid, we need a new PT page
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
			(uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va);
	}

	pa = VM_PAGE_TO_PHYS(m);
	om = NULL;
	opa = origpte = 0;

#if 0
	KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx",
		pte, *pte));
#endif
	/* Convert a non-empty old entry from machine to physical form. */
	origpte = *pte;
	if (origpte)
		origpte = xpmap_mtop(origpte);
	opa = origpte & PG_FRAME;

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->wire_count--;

		if (origpte & PG_MANAGED) {
			om = m;
			pa |= PG_MANAGED;
		}
		goto validate;
	}

	pv = NULL;

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (origpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (origpte & PG_MANAGED) {
			/* Unlink the old page's pv entry; it may be reused. */
			om = PHYS_TO_VM_PAGE(opa);
			pv = pmap_pvh_remove(&om->md, pmap, va);
		} else if (va < VM_MAXUSER_ADDRESS)
			printf("va=0x%x is unmanaged :-( \n", va);

		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			     " va: 0x%x", va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
		    ("pmap_enter: managed mapping within the clean submap"));
		/* Reuse the entry taken from the old page, else allocate. */
		if (pv == NULL)
			pv = get_pv_entry(pmap, FALSE);
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
		pa |= PG_MANAGED;
	} else if (pv != NULL)
		free_pv_entry(pmap, pv);

	/*
	 * Increment counters
	 */
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | PG_V);
	if ((prot & VM_PROT_WRITE) != 0) {
		newpte |= PG_RW;
		if ((newpte & PG_MANAGED) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}
#ifdef PAE
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
#endif
	if (wired)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	critical_enter();
	/*
	 * if the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		if (origpte) {
			invlva = FALSE;
			/* Re-read the raw entry just before replacing it. */
			origpte = *pte;
			PT_SET_VA(pte, newpte | PG_A, FALSE);
			if (origpte & PG_A) {
				if (origpte & PG_MANAGED)
					vm_page_aflag_set(om, PGA_REFERENCED);
				if (opa != VM_PAGE_TO_PHYS(m))
					invlva = TRUE;
#ifdef PAE
				if ((origpte & PG_NX) == 0 &&
				    (newpte & PG_NX) != 0)
					invlva = TRUE;
#endif
			}
			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
				if ((origpte & PG_MANAGED) != 0)
					vm_page_dirty(om);
				if ((prot & VM_PROT_WRITE) == 0)
					invlva = TRUE;
			}
			if ((origpte & PG_MANAGED) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
			if (invlva)
				pmap_invalidate_page(pmap, va);
		} else{
			PT_SET_VA(pte, newpte | PG_A, FALSE);
		}

	}
	PT_UPDATES_FLUSH();
	critical_exit();
	if (*PMAP1)
		PT_SET_VA_MA(PMAP1, 0, TRUE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.
Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
/*
 * The PTE updates are queued by pmap_enter_quick_locked() into a
 * 16-entry multicall array, which is submitted to the hypervisor each
 * time it fills and once more at the end for any remainder.  The
 * object containing m_start must be locked by the caller (asserted).
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;
	multicall_entry_t mcl[16];
	multicall_entry_t *mclp = mcl;
	int error, count = 0;

	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
	/* Number of pages in [start, end). */
	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		/* mpte is passed back in on the next call. */
		mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m,
		    prot, mpte);
		m = TAILQ_NEXT(m, listq);
		/* Batch is full: submit it and start over. */
		if (count == 16) {
			error = HYPERVISOR_multicall(mcl, count);
			KASSERT(error == 0, ("bad multicall %d", error));
			mclp = mcl;
			count = 0;
		}
	}
	/* Submit whatever is left in the batch. */
	if (count) {
		error = HYPERVISOR_multicall(mcl, count);
		KASSERT(error == 0, ("bad multicall %d", error));
	}
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
+ */ + +void +pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + multicall_entry_t mcl, *mclp; + int count = 0; + mclp = &mcl; + + CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x", + pmap, va, m, prot); + + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pmap); + (void)pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL); + if (count) + HYPERVISOR_multicall(&mcl, count); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); +} + +#ifdef notyet +void +pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count) +{ + int i, error, index = 0; + multicall_entry_t mcl[16]; + multicall_entry_t *mclp = mcl; + + PMAP_LOCK(pmap); + for (i = 0; i < count; i++, addrs++, pages++, prots++) { + if (!pmap_is_prefaultable_locked(pmap, *addrs)) + continue; + + (void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL); + if (index == 16) { + error = HYPERVISOR_multicall(mcl, index); + mclp = mcl; + index = 0; + KASSERT(error == 0, ("bad multicall %d", error)); + } + } + if (index) { + error = HYPERVISOR_multicall(mcl, index); + KASSERT(error == 0, ("bad multicall %d", error)); + } + + PMAP_UNLOCK(pmap); +} +#endif + +static vm_page_t +pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte) +{ + pt_entry_t *pte; + vm_paddr_t pa; + vm_page_t free; + multicall_entry_t *mcl = *mclpp; + + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->oflags & VPO_UNMANAGED) != 0, + ("pmap_enter_quick_locked: managed mapping within the clean submap")); + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. 
+ */ + if (va < VM_MAXUSER_ADDRESS) { + u_int ptepindex; + pd_entry_t ptema; + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { + /* + * Get the page directory entry + */ + ptema = pmap->pm_pdir[ptepindex]; + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. + */ + if (ptema & PG_V) { + if (ptema & PG_PS) + panic("pmap_enter_quick: unexpected mapping into 4MB page"); + mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); + mpte->wire_count++; + } else { + mpte = _pmap_allocpte(pmap, ptepindex, + M_NOWAIT); + if (mpte == NULL) + return (mpte); + } + } + } else { + mpte = NULL; + } + + /* + * This call to vtopte makes the assumption that we are + * entering the page into the current pmap. In order to support + * quick entry into any pmap, one would likely use pmap_pte_quick. + * But that isn't as quick as vtopte. + */ + KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap")); + pte = vtopte(va); + if (*pte & PG_V) { + if (mpte != NULL) { + mpte->wire_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. 
+ */ + if ((m->oflags & VPO_UNMANAGED) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m)) { + if (mpte != NULL) { + free = NULL; + if (pmap_unwire_ptp(pmap, mpte, &free)) { + pmap_invalidate_page(pmap, va); + pmap_free_zero_pages(free); + } + + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap->pm_stats.resident_count++; + + pa = VM_PAGE_TO_PHYS(m); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + pa |= pg_nx; +#endif + +#if 0 + /* + * Now validate mapping with RO protection + */ + if ((m->oflags & VPO_UNMANAGED) != 0) + pte_store(pte, pa | PG_V | PG_U); + else + pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); +#else + /* + * Now validate mapping with RO protection + */ + if ((m->oflags & VPO_UNMANAGED) != 0) + pa = xpmap_ptom(pa | PG_V | PG_U); + else + pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED); + + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; + mcl->args[1] = (uint32_t)(pa & 0xffffffff); + mcl->args[2] = (uint32_t)(pa >> 32); + mcl->args[3] = 0; + *mclpp = mcl + 1; + *count = *count + 1; +#endif + return (mpte); +} + +/* + * Make a temporary mapping for a physical address. This is only intended + * to be used for panic dumps. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + vm_offset_t va; + vm_paddr_t ma = xpmap_ptom(pa); + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + PT_SET_MA(va, (ma & ~PAGE_MASK) | PG_V | pgeflag); + invlpg(va); + return ((void *)crashdumpmap); +} + +/* + * This code maps large physical mmap regions into the + * processor address space. Note that some shortcuts + * are taken, but the code works. 
 */
/*
 * Only device and scatter/gather objects are accepted (asserted).
 * Nothing is mapped unless superpages are enabled (pseflag), "addr"
 * and "size" are multiples of NBPDR, the object can be populated, and
 * the backing pages are physically contiguous, NBPDR-aligned and share
 * one memory attribute.  The object must be locked by the caller.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	pd_entry_t *pde;
	vm_paddr_t pa, ptepa;
	vm_page_t p;
	int pat_mode;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
	if (pseflag &&
	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
		if (!vm_object_populate(object, pindex, pindex + atop(size)))
			return;
		p = vm_page_lookup(object, pindex);
		KASSERT(p->valid == VM_PAGE_BITS_ALL,
		    ("pmap_object_init_pt: invalid page %p", p));
		/* Every later page must match the first page's attribute. */
		pat_mode = p->md.pat_mode;

		/*
		 * Abort the mapping if the first page is not physically
		 * aligned to a 2/4MB page boundary.
		 */
		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		/*
		 * Skip the first page.  Abort the mapping if the rest of
		 * the pages are not physically contiguous or have differing
		 * memory attributes.
		 */
		p = TAILQ_NEXT(p, listq);
		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
		    pa += PAGE_SIZE) {
			KASSERT(p->valid == VM_PAGE_BITS_ALL,
			    ("pmap_object_init_pt: invalid page %p", p));
			if (pa != VM_PAGE_TO_PHYS(p) ||
			    pat_mode != p->md.pat_mode)
				return;
			p = TAILQ_NEXT(p, listq);
		}

		/*
		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
		 * "size" is a multiple of 2/4M, adding the PAT setting to
		 * "pa" will not affect the termination of this loop.
		 */
		PMAP_LOCK(pmap);
		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
		    size; pa += NBPDR) {
			pde = pmap_pde(pmap, addr);
			/* Only fill directory slots that are still empty. */
			if (*pde == 0) {
				pde_store(pde, pa | PG_PS | PG_M | PG_A |
				    PG_U | PG_RW | PG_V);
				pmap->pm_stats.resident_count += NBPDR /
				    PAGE_SIZE;
				pmap_pde_mappings++;
			}
			/* Else continue on if the PDE is already valid. */
			addr += NBPDR;
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 *	Routine:	pmap_change_wiring
 *	Function:	Change the wiring attribute for a map/virtual-address
 *			pair.
 *	In/out conditions:
 *			The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pt_entry_t *pte;

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va);

	/* Only touch the PTE and the count when the state changes. */
	if (wired && !pmap_pte_w(pte)) {
		PT_SET_VA_MA((pte), *(pte) | PG_W, TRUE);
		pmap->pm_stats.wired_count++;
	} else if (!wired && pmap_pte_w(pte)) {
		PT_SET_VA_MA((pte), *(pte) & ~PG_W, TRUE);
		pmap->pm_stats.wired_count--;
	}

	/*
	 * Wiring is not a hardware characteristic so there is no need to
	 * invalidate TLB.
	 */
	pmap_pte_release(pte);
	PMAP_UNLOCK(pmap);
	rw_wunlock(&pvh_global_lock);
}



/*
 *	Copy the range specified by src_addr/len
 *	from the source map to the range dst_addr/len
 *	in the destination map.
 *
 *	This routine is only advisory and need not do anything.
 *
 *	Nothing is copied unless dst_addr equals src_addr and src_pmap
 *	is the current pmap.  Only managed mappings are copied, and the
 *	copy stops at the first failure to allocate a page table page
 *	or a pv entry.
 */

void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	vm_page_t   free;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;

	if (dst_addr != src_addr)
		return;

	if (!pmap_is_current(src_pmap)) {
		CTR2(KTR_PMAP,
		    "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx",
		    (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME));

		return;
	}
	CTR5(KTR_PMAP, "pmap_copy:  dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x",
	    dst_pmap, src_pmap, dst_addr, len, src_addr);

#ifdef HAMFISTED_LOCKING
	mtx_lock(&createdelete_lock);
#endif

	rw_wlock(&pvh_global_lock);
	/* Lock the two pmaps in address order. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	sched_pin();
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		pt_entry_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		pd_entry_t srcptepaddr;
		u_int ptepindex;

		KASSERT(addr < UPT_MIN_ADDRESS,
		    ("pmap_copy: invalid to pmap_copy page tables"));

		pdnxt = (addr + NBPDR) & ~PDRMASK;
		/* Address wrapped past the top of the address space. */
		if (pdnxt < addr)
			pdnxt = end_addr;
		ptepindex = addr >> PDRSHIFT;

		srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]);
		if (srcptepaddr == 0)
			continue;

		/* Large page: copy the directory entry itself, unwired. */
		if (srcptepaddr & PG_PS) {
			if (dst_pmap->pm_pdir[ptepindex] == 0) {
				PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE);
				dst_pmap->pm_stats.resident_count +=
				    NBPDR / PAGE_SIZE;
			}
			continue;
		}

		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
		KASSERT(srcmpte->wire_count > 0,
		    ("pmap_copy: source page table page is unused"));

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = vtopte(addr);
		while (addr < pdnxt) {
			pt_entry_t ptetemp;
			ptetemp = *src_pte;
			/*
			 * we only virtual copy managed pages
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				dstmpte = pmap_allocpte(dst_pmap, addr,
				    M_NOWAIT);
				if (dstmpte == NULL)
					goto out;
				dst_pte = pmap_pte_quick(dst_pmap, addr);
				if (*dst_pte == 0 &&
				    pmap_try_insert_pv_entry(dst_pmap, addr,
				    PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) {
					/*
					 * Clear the wired, modified, and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					KASSERT(ptetemp != 0, ("src_pte not set"));
					PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */);
					KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)),
					    ("no pmap copy expected: 0x%jx saw: 0x%jx",
						ptetemp &  ~(PG_W | PG_M | PG_A), *dst_pte));
					dst_pmap->pm_stats.resident_count++;
	 			} else {
					/*
					 * Destination slot in use or no pv
					 * entry: drop the page table page
					 * reference and give up.
					 */
					free = NULL;
					if (pmap_unwire_ptp(dst_pmap, dstmpte,
					    &free)) {
						pmap_invalidate_page(dst_pmap,
						    addr);
						pmap_free_zero_pages(free);
					}
					goto out;
				}
				/*
				 * Stop scanning this page table page once
				 * the destination holds as many references
				 * as the source.
				 */
				if (dstmpte->wire_count >= srcmpte->wire_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
		}
	}
out:
	PT_UPDATES_FLUSH();
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);

#ifdef HAMFISTED_LOCKING
	mtx_unlock(&createdelete_lock);
#endif
}

/*
 * Zero one page at "page", using sse2_pagezero() or i686_pagezero()
 * when the corresponding options are compiled in and the CPU
 * class/feature checks below pass, and bzero() otherwise.
 */
static __inline void
pagezero(void *page)
{
#if defined(I686_CPU)
	if (cpu_class == CPUCLASS_686) {
#if defined(CPU_ENABLE_SSE)
		if (cpu_feature & CPUID_SSE2)
			sse2_pagezero(page);
		else
#endif
			i686_pagezero(page);
	} else
#endif
		bzero(page, PAGE_SIZE);
}

/*
 *	pmap_zero_page zeros the specified hardware page by mapping
 *	the page into KVM and using bzero to clear its contents.
 *
 *	The temporary mapping uses the current CPU's CMAP2/CADDR2 slot,
 *	taken under that CPU's sysmaps lock; finding the slot already
 *	in use is a panic.
 */
void
pmap_zero_page(vm_page_t m)
{
	struct sysmaps *sysmaps;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	if (*sysmaps->CMAP2)
		panic("pmap_zero_page: CMAP2 busy");
	sched_pin();
	PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M);
	pagezero(sysmaps->CADDR2);
	/* Tear the temporary mapping down again. */
	PT_SET_MA(sysmaps->CADDR2, 0);
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
}

/*
 *	pmap_zero_page_area zeros the specified hardware page by mapping
 *	the page into KVM and using bzero to clear its contents.
 *
 *	off and size may not cover an area beyond a single hardware page.
+ */ +void +pmap_zero_page_area(vm_page_t m, int off, int size) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_zero_page_area: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); + + if (off == 0 && size == PAGE_SIZE) + pagezero(sysmaps->CADDR2); + else + bzero((char *)sysmaps->CADDR2 + off, size); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * pmap_zero_page_idle zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. This + * is intended to be called from the vm_pagezero process only and + * outside of Giant. + */ +void +pmap_zero_page_idle(vm_page_t m) +{ + + if (*CMAP3) + panic("pmap_zero_page_idle: CMAP3 busy"); + sched_pin(); + PT_SET_MA(CADDR3, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); + pagezero(CADDR3); + PT_SET_MA(CADDR3, 0); + sched_unpin(); +} + +/* + * pmap_copy_page copies the specified (machine independent) + * page by mapping the page into virtual memory and using + * bcopy to copy the page, one machine dependent page at a + * time. + */ +void +pmap_copy_page(vm_page_t src, vm_page_t dst) +{ + struct sysmaps *sysmaps; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1) + panic("pmap_copy_page: CMAP1 busy"); + if (*sysmaps->CMAP2) + panic("pmap_copy_page: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(src) | PG_A); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(dst) | PG_A | PG_M); + bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); + PT_SET_MA(sysmaps->CADDR1, 0); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. 
This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +boolean_t +pmap_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + pv_entry_t pv; + int loops = 0; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_page_exists_quick: page %p is not managed", m)); + rv = FALSE; + rw_wlock(&pvh_global_lock); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + rw_wunlock(&pvh_global_lock); + return (rv); +} + +/* + * pmap_page_wired_mappings: + * + * Return the number of managed mappings to the given physical page + * that are wired. + */ +int +pmap_page_wired_mappings(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + pmap_t pmap; + int count; + + count = 0; + if ((m->oflags & VPO_UNMANAGED) != 0) + return (count); + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + sched_unpin(); + rw_wunlock(&pvh_global_lock); + return (count); +} + +/* + * Returns TRUE if the given page is mapped. Otherwise, returns FALSE. + */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (FALSE); + return (!TAILQ_EMPTY(&m->md.pv_list)); +} + +/* + * Remove all pages from specified address space + * this aids process exit speeds. Also, this code + * is special cased for current process only, but + * can have the more generic (and slightly slower) + * mode enabled. This is much faster than pmap_remove + * in the case of running down an entire address space. 
+ */ +void +pmap_remove_pages(pmap_t pmap) +{ + pt_entry_t *pte, tpte; + vm_page_t m, free = NULL; + pv_entry_t pv; + struct pv_chunk *pc, *npc; + int field, idx; + int32_t bit; + uint32_t inuse, bitmask; + int allfree; + + CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); + + if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { + printf("warning: pmap_remove_pages called with non-current pmap\n"); + return; + } + rw_wlock(&pvh_global_lock); + KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); + PMAP_LOCK(pmap); + sched_pin(); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, + pc->pc_pmap)); + allfree = 1; + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = bsfl(inuse); + bitmask = 1UL << bit; + idx = field * 32 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = vtopte(pv->pv_va); + tpte = *pte ? xpmap_mtop(*pte) : 0; + + if (tpte == 0) { + printf( + "TPTE at %p IS ZERO @ VA %08x\n", + pte, pv->pv_va); + panic("bad pte"); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + allfree = 0; + continue; + } + + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); + + + PT_CLEAR_VA(pte, FALSE); + + /* + * Update the vm_page_t clean/reference bits. 
+ */ + if (tpte & PG_M) + vm_page_dirty(m); + + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + + pmap_unuse_pt(pmap, pv->pv_va, &free); + + /* Mark free */ + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc->pc_map[field] |= bitmask; + pmap->pm_stats.resident_count--; + } + } + PT_UPDATES_FLUSH(); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + + sched_unpin(); + pmap_invalidate_all(pmap); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(free); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +pmap_is_modified(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + pmap_t pmap; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_modified: page %p is not managed", m)); + rv = FALSE; + + /* + * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be + * concurrently set while the object is locked. Thus, if PGA_WRITEABLE + * is clear, no PTEs can have PG_M set. + */ + VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); + if ((m->oflags & VPO_BUSY) == 0 && + (m->aflags & PGA_WRITEABLE) == 0) + return (rv); + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + rv = (*pte & PG_M) != 0; + PMAP_UNLOCK(pmap); + if (rv) + break; + } + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + return (rv); +} + +/* + * pmap_is_prefaultable: + * + * Return whether or not the specified virtual address is elgible + * for prefault. 
+ */ +static boolean_t +pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr) +{ + pt_entry_t *pte; + boolean_t rv = FALSE; + + return (rv); + + if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) { + pte = vtopte(addr); + rv = (*pte == 0); + } + return (rv); +} + +boolean_t +pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) +{ + boolean_t rv; + + PMAP_LOCK(pmap); + rv = pmap_is_prefaultable_locked(pmap, addr); + PMAP_UNLOCK(pmap); + return (rv); +} + +boolean_t +pmap_is_referenced(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + pmap_t pmap; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_referenced: page %p is not managed", m)); + rv = FALSE; + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); + PMAP_UNLOCK(pmap); + if (rv) + break; + } + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + return (rv); +} + +void +pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + rw_wlock(&pvh_global_lock); + pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M))); + rw_wunlock(&pvh_global_lock); + PMAP_MARK_PRIV(xpmap_mtop(*pte)); + pmap_pte_release(pte); + } +} + +void +pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); + rw_wlock(&pvh_global_lock); + pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M)); + rw_wunlock(&pvh_global_lock); + pmap_pte_release(pte); + } +} + +/* + * Clear the write and modified bits in each of the given page's mappings. 
+ */ +void +pmap_remove_write(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t oldpte, *pte; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_write: page %p is not managed", m)); + + /* + * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by + * another thread while the object is locked. Thus, if PGA_WRITEABLE + * is clear, no page table entries need updating. + */ + VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); + if ((m->oflags & VPO_BUSY) == 0 && + (m->aflags & PGA_WRITEABLE) == 0) + return; + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); +retry: + oldpte = *pte; + if ((oldpte & PG_RW) != 0) { + vm_paddr_t newpte = oldpte & ~(PG_RW | PG_M); + + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. + */ + PT_SET_VA_MA(pte, newpte, TRUE); + if (*pte != newpte) + goto retry; + + if ((oldpte & PG_M) != 0) + vm_page_dirty(m); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); + rw_wunlock(&pvh_global_lock); +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. 
+ */ +int +pmap_ts_referenced(vm_page_t m) +{ + pv_entry_t pv, pvf, pvn; + pmap_t pmap; + pt_entry_t *pte; + int rtval = 0; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_ts_referenced: page %p is not managed", m)); + rw_wlock(&pvh_global_lock); + sched_pin(); + if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pvf = pv; + do { + pvn = TAILQ_NEXT(pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_A) != 0) { + PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + rtval++; + if (rtval > 4) + pvn = NULL; + } + PMAP_UNLOCK(pmap); + } while ((pv = pvn) != NULL && pv != pvf); + } + PT_UPDATES_FLUSH(); + if (*PMAP1) + PT_SET_MA(PADDR1, 0); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + return (rtval); +} + +/* + * Clear the modify bits on the specified physical page. + */ +void +pmap_clear_modify(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_modify: page %p is not managed", m)); + VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); + KASSERT((m->oflags & VPO_BUSY) == 0, + ("pmap_clear_modify: page %p is busy", m)); + + /* + * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. + * If the object containing the page is locked and the page is not + * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set. + */ + if ((m->aflags & PGA_WRITEABLE) == 0) + return; + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_M is among the least significant + * 32 bits. 
+ */ + PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + sched_unpin(); + rw_wunlock(&pvh_global_lock); +} + +/* + * pmap_clear_reference: + * + * Clear the reference bit on the specified physical page. + */ +void +pmap_clear_reference(vm_page_t m) +{ + pv_entry_t pv; + pmap_t pmap; + pt_entry_t *pte; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_reference: page %p is not managed", m)); + rw_wlock(&pvh_global_lock); + sched_pin(); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte_quick(pmap, pv->pv_va); + if ((*pte & PG_A) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_A is among the least significant + * 32 bits. + */ + PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + sched_unpin(); + rw_wunlock(&pvh_global_lock); +} + +/* + * Miscellaneous support routines follow + */ + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. 
+ */ +void * +pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) +{ + vm_offset_t va, offset; + vm_size_t tmpsize; + + offset = pa & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + pa = pa & PG_FRAME; + + if (pa < KERNLOAD && pa + size <= KERNLOAD) + va = KERNBASE + pa; + else + va = kmem_alloc_nofault(kernel_map, size); + if (!va) + panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); + + for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) + pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); + pmap_invalidate_range(kernel_pmap, va, va + tmpsize); + pmap_invalidate_cache_range(va, va + size); + return ((void *)(va + offset)); +} + +void * +pmap_mapdev(vm_paddr_t pa, vm_size_t size) +{ + + return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); +} + +void * +pmap_mapbios(vm_paddr_t pa, vm_size_t size) +{ + + return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); +} + +void +pmap_unmapdev(vm_offset_t va, vm_size_t size) +{ + vm_offset_t base, offset; + + if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) + return; + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + kmem_free(kernel_map, base, size); +} + +/* + * Sets the memory attribute for the specified page. + */ +void +pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + + m->md.pat_mode = ma; + if ((m->flags & PG_FICTITIOUS) != 0) + return; + + /* + * If "m" is a normal page, flush it from the cache. + * See pmap_invalidate_cache_range(). + * + * First, try to find an existing mapping of the page by sf + * buffer. sf_buf_invalidate_cache() modifies mapping and + * flushes the cache. + */ + if (sf_buf_invalidate_cache(m)) + return; + + /* + * If page is not mapped by sf buffer, but CPU does not + * support self snoop, map the page transient and do + * invalidation. In the worst case, whole cache is flushed by + * pmap_invalidate_cache_range(). 
+ */ + if ((cpu_feature & CPUID_SS) == 0) + pmap_flush_page(m); +} + +static void +pmap_flush_page(vm_page_t m) +{ + struct sysmaps *sysmaps; + vm_offset_t sva, eva; + + if ((cpu_feature & CPUID_CLFSH) != 0) { + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP2) + panic("pmap_flush_page: CMAP2 busy"); + sched_pin(); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | + VM_PAGE_TO_MACH(m) | PG_A | PG_M | + pmap_cache_bits(m->md.pat_mode, 0)); + invlcaddr(sysmaps->CADDR2); + sva = (vm_offset_t)sysmaps->CADDR2; + eva = sva + PAGE_SIZE; + + /* + * Use mfence despite the ordering implied by + * mtx_{un,}lock() because clflush is not guaranteed + * to be ordered by any other instruction. + */ + mfence(); + for (; sva < eva; sva += cpu_clflush_line_size) + clflush(sva); + mfence(); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); + } else + pmap_invalidate_cache(); +} + +/* + * Changes the specified virtual address range's memory type to that given by + * the parameter "mode". The specified virtual address range must be + * completely contained within either the kernel map. + * + * Returns zero if the change completed successfully, and either EINVAL or + * ENOMEM if the change failed. Specifically, EINVAL is returned if some part + * of the virtual address range was not mapped, and ENOMEM is returned if + * there was insufficient memory available to complete the change. + */ +int +pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) +{ + vm_offset_t base, offset, tmpva; + pt_entry_t *pte; + u_int opte, npte; + pd_entry_t *pde; + boolean_t changed; + + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + + /* Only supported on kernel virtual addresses. */ + if (base <= VM_MAXUSER_ADDRESS) + return (EINVAL); + + /* 4MB pages and pages that aren't mapped aren't supported. 
*/ + for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { + pde = pmap_pde(kernel_pmap, tmpva); + if (*pde & PG_PS) + return (EINVAL); + if ((*pde & PG_V) == 0) + return (EINVAL); + pte = vtopte(va); + if ((*pte & PG_V) == 0) + return (EINVAL); + } + + changed = FALSE; + + /* + * Ok, all the pages exist and are 4k, so run through them updating + * their cache mode. + */ + for (tmpva = base; size > 0; ) { + pte = vtopte(tmpva); + + /* + * The cache mode bits are all in the low 32-bits of the + * PTE, so we can just spin on updating the low 32-bits. + */ + do { + opte = *(u_int *)pte; + npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); + npte |= pmap_cache_bits(mode, 0); + PT_SET_VA_MA(pte, npte, TRUE); + } while (npte != opte && (*pte != npte)); + if (npte != opte) + changed = TRUE; + tmpva += PAGE_SIZE; + size -= PAGE_SIZE; + } + + /* + * Flush CPU caches to make sure any data isn't cached that + * shouldn't be, etc. + */ + if (changed) { + pmap_invalidate_range(kernel_pmap, base, tmpva); + pmap_invalidate_cache_range(base, tmpva); + } + return (0); +} + +/* + * perform the pmap work for mincore + */ +int +pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) +{ + pt_entry_t *ptep, pte; + vm_paddr_t pa; + int val; + + PMAP_LOCK(pmap); +retry: + ptep = pmap_pte(pmap, addr); + pte = (ptep != NULL) ? PT_GET(ptep) : 0; + pmap_pte_release(ptep); + val = 0; + if ((pte & PG_V) != 0) { + val |= MINCORE_INCORE; + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((pte & PG_A) != 0) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { + pa = pte & PG_FRAME; + /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
*/ + if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) + goto retry; + } else + PA_UNLOCK_COND(*locked_pa); + PMAP_UNLOCK(pmap); + return (val); +} + +void +pmap_activate(struct thread *td) +{ + pmap_t pmap, oldpmap; + u_int cpuid; + u_int32_t cr3; + + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + oldpmap = PCPU_GET(curpmap); + cpuid = PCPU_GET(cpuid); +#if defined(SMP) + CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); + CPU_SET_ATOMIC(cpuid, &pmap->pm_active); +#else + CPU_CLR(cpuid, &oldpmap->pm_active); + CPU_SET(cpuid, &pmap->pm_active); +#endif +#ifdef PAE + cr3 = vtophys(pmap->pm_pdpt); +#else + cr3 = vtophys(pmap->pm_pdir); +#endif + /* + * pmap_activate is for the current thread on the current cpu + */ + td->td_pcb->pcb_cr3 = cr3; + PT_UPDATES_FLUSH(); + load_cr3(cr3); + PCPU_SET(curpmap, pmap); + critical_exit(); +} + +void +pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. + */ +void +pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t superpage_offset; + + if (size < NBPDR) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & PDRMASK; + if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || + (*addr & PDRMASK) == superpage_offset) + return; + if ((*addr & PDRMASK) < superpage_offset) + *addr = (*addr & ~PDRMASK) + superpage_offset; + else + *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; +} + +void +pmap_suspend() +{ + pmap_t pmap; + int i, pdir, offset; + vm_paddr_t pdirma; + mmu_update_t mu[4]; + + /* + * We need to remove the recursive mapping structure from all + * our pmaps so that Xen doesn't get confused when it restores + * the page tables. The recursive map lives at page directory + * index PTDPTDI. 
#if defined(PMAP_DEBUG)
/*
 * Debugging aid: print every valid user mapping of the process with
 * the given pid (virtual address, page table entry and, when the frame
 * has a vm_page, its hold count, wire count and flags), two mappings
 * per output line.  Returns the number of mappings printed.  The walk
 * stops at the first address at or above VM_MIN_KERNEL_ADDRESS.
 */
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_pid != pid)
			continue;

		if (p->p_vmspace) {
			int i,j;
			index = 0;
			pmap = vmspace_pmap(p->p_vmspace);
			for (i = 0; i < NPDEPTD; i++) {
				pd_entry_t *pde;
				pt_entry_t *pte;
				/* Shift as unsigned: "i << PDRSHIFT" overflows int. */
				vm_offset_t base = (vm_offset_t)i << PDRSHIFT;

				pde = &pmap->pm_pdir[i];
				if (pde && pmap_pde_v(pde)) {
					for (j = 0; j < NPTEPG; j++) {
						vm_offset_t va = base +
						    ((vm_offset_t)j << PAGE_SHIFT);
						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
							if (index) {
								index = 0;
								printf("\n");
							}
							sx_sunlock(&allproc_lock);
							return (npte);
						}
						pte = pmap_pte(pmap, va);
						if (pte && pmap_pte_v(pte)) {
							pt_entry_t pa;
							vm_page_t m;
							pa = PT_GET(pte);
							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
							/*
							 * A frame without a vm_page
							 * has no counters to show.
							 */
							if (m != NULL)
								printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
								    va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
							else
								printf("va: 0x%x, pt: 0x%jx",
								    va, (uintmax_t)pa);
							npte++;
							index++;
							if (index >= 2) {
								index = 0;
								printf("\n");
							} else {
								printf(" ");
							}
						}
						/*
						 * Pair every pmap_pte() with
						 * pmap_pte_release(), as the
						 * other callers in this file do.
						 */
						if (pte != NULL)
							pmap_pte_release(pte);
					}
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return (npte);
}
#endif
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/time.h> + +#include <xen/xen_intr.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/pmap.h> +#include <xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/io/xenbus.h> +#include <xen/interface/vcpu.h> +#include <machine/cpu.h> + +#include <machine/xen/xen_clock_util.h> + +/* + * Read the current hypervisor start time (wall clock) from Xen. 
+ */ +void +xen_fetch_wallclock(struct timespec *ts) +{ + shared_info_t *s = HYPERVISOR_shared_info; + uint32_t ts_version; + + do { + ts_version = s->wc_version; + rmb(); + ts->tv_sec = s->wc_sec; + ts->tv_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (ts_version ^ s->wc_version)); +} + +/* + * Read the current hypervisor system uptime value from Xen. + */ +void +xen_fetch_uptime(struct timespec *ts) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info dst; + uint32_t pre_version, post_version; + + src = &s->vcpu_info[smp_processor_id()].time; + + spinlock_enter(); + do { + pre_version = dst.version = src->version; + rmb(); + dst.system_timestamp = src->system_time; + rmb(); + post_version = src->version; + } + while ((pre_version & 1) | (pre_version ^ post_version)); + + spinlock_exit(); + + ts->tv_sec = dst.system_timestamp / 1000000000; + ts->tv_nsec = dst.system_timestamp % 1000000000; +} diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c new file mode 100644 index 0000000..3b3da6f --- /dev/null +++ b/sys/i386/xen/xen_machdep.c @@ -0,0 +1,1260 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006,2008 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/rwlock.h> +#include <sys/sysproto.h> + +#include <machine/xen/xen-os.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/segments.h> +#include <machine/pcb.h> +#include <machine/stdarg.h> +#include <machine/vmparam.h> +#include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/asmacros.h> + + + +#include <xen/hypervisor.h> +#include <machine/xen/xenvar.h> +#include <machine/xen/xenfunc.h> +#include <machine/xen/xenpmap.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/memory.h> +#include <machine/xen/features.h> +#ifdef SMP +#include <machine/privatespace.h> +#endif + + +#include <vm/vm_page.h> + + +#define IDTVEC(name) __CONCAT(X,name) + +extern inthand_t +IDTVEC(div), IDTVEC(dbg), 
IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), + IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), + IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), + IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + + +int xendebug_flags; +start_info_t *xen_start_info; +shared_info_t *HYPERVISOR_shared_info; +xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; +xen_pfn_t *xen_phys_machine; +xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; +xen_pfn_t *xen_pfn_to_mfn_frame_list_list; +int preemptable, init_first; +extern unsigned int avail_space; + +void ni_cli(void); +void ni_sti(void); + + +void +ni_cli(void) +{ + CTR0(KTR_SPARE2, "ni_cli disabling interrupts"); + __asm__("pushl %edx;" + "pushl %eax;" + ); + __cli(); + __asm__("popl %eax;" + "popl %edx;" + ); +} + + +void +ni_sti(void) +{ + __asm__("pushl %edx;" + "pushl %esi;" + "pushl %eax;" + ); + __sti(); + __asm__("popl %eax;" + "popl %esi;" + "popl %edx;" + ); +} + +/* + * Modify the cmd_line by converting ',' to NULLs so that it is in a format + * suitable for the static env vars. 
+ */ +char * +xen_setbootenv(char *cmd_line) +{ + char *cmd_line_next; + + /* Skip leading spaces */ + for (; *cmd_line == ' '; cmd_line++); + + printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line); + + /* strsep() overwrites each ',' with a NUL as it walks the string. */ + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); + return cmd_line; +} + +/* + * Boot environment variable names and the RB_* flag each one sets in + * the value returned by xen_boothowto(). The table ends at the entry + * whose name is NULL. + */ +static struct +{ + const char *ev; + int mask; +} howto_names[] = { + {"boot_askname", RB_ASKNAME}, + {"boot_single", RB_SINGLE}, + {"boot_nosync", RB_NOSYNC}, + {"boot_halt", RB_HALT}, + {"boot_serial", RB_SERIAL}, + {"boot_cdrom", RB_CDROM}, + {"boot_gdb", RB_GDB}, + {"boot_gdb_pause", RB_RESERVED1}, + {"boot_verbose", RB_VERBOSE}, + {"boot_multicons", RB_MULTIPLE}, + {NULL, 0} +}; + +/* + * Build the boot flag word: OR together the mask of every howto_names + * entry whose variable is found by getenv(). The envp argument is not + * used. + * + * NOTE(review): the pointer returned by getenv() is only tested against + * NULL and never released -- confirm whether this getenv() allocates. + */ +int +xen_boothowto(char *envp) +{ + int i, howto = 0; + + /* get equivalents from the environment */ + for (i = 0; howto_names[i].ev != NULL; i++) + if (getenv(howto_names[i].ev) != NULL) + howto |= howto_names[i].mask; + return howto; +} + +#define PRINTK_BUFSIZE 1024 +/* + * Format a message and pass it to HYPERVISOR_console_write(). + * + * Output that does not fit in PRINTK_BUFSIZE bytes (including the + * terminating NUL) is truncated. The buffer is static, so all callers + * share it. + */ +void +printk(const char *fmt, ...) +{ + __va_list ap; + int retval; + static char buf[PRINTK_BUFSIZE]; + + va_start(ap, fmt); + retval = vsnprintf(buf, PRINTK_BUFSIZE, fmt, ap); + va_end(ap); + /* + * vsnprintf() returns the length the full output would have had, + * which can exceed the buffer; clamp it before using it as an index + * and as the length handed to the console. + */ + if (retval < 0) + return; + if (retval >= PRINTK_BUFSIZE) + retval = PRINTK_BUFSIZE - 1; + buf[retval] = 0; + (void)HYPERVISOR_console_write(buf, retval); +} + + +#define XPQUEUE_SIZE 128 + +struct mmu_log { + char *file; + int line; +}; + +#ifdef SMP +/* per-cpu queues and indices */ +#ifdef INVARIANTS +static struct mmu_log xpq_queue_log[MAX_VIRT_CPUS][XPQUEUE_SIZE]; +#endif + +static int xpq_idx[MAX_VIRT_CPUS]; +static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE]; + +#define XPQ_QUEUE_LOG xpq_queue_log[vcpu] +#define XPQ_QUEUE xpq_queue[vcpu] +#define XPQ_IDX xpq_idx[vcpu] +#define SET_VCPU() int vcpu = smp_processor_id() +#else + +static mmu_update_t xpq_queue[XPQUEUE_SIZE]; +#ifdef INVARIANTS +static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; +#endif +static int xpq_idx = 0; + +#define XPQ_QUEUE_LOG xpq_queue_log +#define XPQ_QUEUE xpq_queue +#define XPQ_IDX xpq_idx +#define SET_VCPU() +#endif /* !SMP 
*/ + +#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); + +#if 0 +static void +xen_dump_queue(void) +{ + int _xpq_idx = XPQ_IDX; + int i; + + if (_xpq_idx <= 1) + return; + + printk("xen_dump_queue(): %u entries\n", _xpq_idx); + for (i = 0; i < _xpq_idx; i++) { + printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + } +} +#endif + + +static __inline void +_xen_flush_queue(void) +{ + SET_VCPU(); + int _xpq_idx = XPQ_IDX; + int error, i; + +#ifdef INVARIANTS + if (__predict_true(gdtset)) + CRITICAL_ASSERT(curthread); +#endif + + XPQ_IDX = 0; + /* Make sure index is cleared first to avoid double updates. */ + error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE, + _xpq_idx, NULL, DOMID_SELF); + +#if 0 + if (__predict_true(gdtset)) + for (i = _xpq_idx; i > 0;) { + if (i >= 3) { + CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx " + "ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), + (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), + (XPQ_QUEUE[i-2].ptr & 0xffffffff), + (XPQ_QUEUE[i-3].val & 0xffffffff), + (XPQ_QUEUE[i-3].ptr & 0xffffffff)); + i -= 3; + } else if (i == 2) { + CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), + (XPQ_QUEUE[i-1].ptr & 0xffffffff), + (XPQ_QUEUE[i-2].val & 0xffffffff), + (XPQ_QUEUE[i-2].ptr & 0xffffffff)); + i = 0; + } else { + CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx", + (XPQ_QUEUE[i-1].val & 0xffffffff), + (XPQ_QUEUE[i-1].ptr & 0xffffffff)); + i = 0; + } + } +#endif + if (__predict_false(error < 0)) { + for (i = 0; i < _xpq_idx; i++) + printf("val: %llx ptr: %llx\n", + XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); + panic("Failed to execute MMU updates: %d", error); + } + +} + +void +xen_flush_queue(void) +{ + SET_VCPU(); + + if (__predict_true(gdtset)) + critical_enter(); + if (XPQ_IDX != 0) _xen_flush_queue(); + if (__predict_true(gdtset)) + critical_exit(); +} + +static __inline void +xen_increment_idx(void) +{ + SET_VCPU(); + + XPQ_IDX++; + if 
(__predict_false(XPQ_IDX == XPQUEUE_SIZE)) + xen_flush_queue(); +} + +void +xen_check_queue(void) +{ +#ifdef INVARIANTS + SET_VCPU(); + + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); +#endif +} + +void +xen_invlpg(vm_offset_t va) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.arg1.linear_addr = va & ~PAGE_MASK; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_load_cr3(u_int val) +{ + struct mmuext_op op; +#ifdef INVARIANTS + SET_VCPU(); + + KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); +#endif + op.cmd = MMUEXT_NEW_BASEPTR; + op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef KTR +static __inline u_int +rebp(void) +{ + u_int data; + + __asm __volatile("movl 4(%%ebp),%0" : "=r" (data)); + return (data); +} +#endif + +u_int +read_eflags(void) +{ + vcpu_info_t *_vcpu; + u_int eflags; + + eflags = _read_eflags(); + _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; + if (_vcpu->evtchn_upcall_mask) + eflags &= ~PSL_I; + + return (eflags); +} + +void +write_eflags(u_int eflags) +{ + u_int intr; + + CTR2(KTR_SPARE2, "%x xen_restore_flags eflags %x", rebp(), eflags); + intr = ((eflags & PSL_I) == 0); + __restore_flags(intr); + _write_eflags(eflags); +} + +void +xen_cli(void) +{ + CTR1(KTR_SPARE2, "%x xen_cli disabling interrupts", rebp()); + __cli(); +} + +void +xen_sti(void) +{ + CTR1(KTR_SPARE2, "%x xen_sti enabling interrupts", rebp()); + __sti(); +} + +u_int +xen_rcr2(void) +{ + + return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2); +} + +void +_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line) +{ + SET_VCPU(); + + if (__predict_true(gdtset)) + critical_enter(); + XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = pfn; +#ifdef INVARIANTS + XPQ_QUEUE_LOG[XPQ_IDX].file = file; + XPQ_QUEUE_LOG[XPQ_IDX].line = line; +#endif + 
xen_increment_idx(); + if (__predict_true(gdtset)) + critical_exit(); +} + +extern struct rwlock pvh_global_lock; + +void +_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) +{ + SET_VCPU(); + + if (__predict_true(gdtset)) + rw_assert(&pvh_global_lock, RA_WLOCKED); + + KASSERT((ptr & 7) == 0, ("misaligned update")); + + if (__predict_true(gdtset)) + critical_enter(); + + XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; + XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; +#ifdef INVARIANTS + XPQ_QUEUE_LOG[XPQ_IDX].file = file; + XPQ_QUEUE_LOG[XPQ_IDX].line = line; +#endif + xen_increment_idx(); + if (__predict_true(gdtset)) + critical_exit(); +} + +void +xen_pgdpt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pgd_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_pin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_pt_unpin(vm_paddr_t ma) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = ma >> PAGE_SHIFT; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_set_ldt(vm_paddr_t ptr, unsigned long len) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = ptr; + op.arg2.nr_ents = len; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, 
DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + xen_flush_queue(); + PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + vm_paddr_t pa; + pt_entry_t *ptp; + + ptp = vtopte((vm_offset_t)table); + pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); + if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) + panic("HYPERVISOR_update_descriptor failed\n"); +} + + +#if 0 +/* + * Bitmap is indexed by page number. If bit is set, the page is part of a + * xen_create_contiguous_region() area of memory. + */ +unsigned long *contiguous_bitmap; + +static void +contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] |= + ((1UL<<end_off)-1) & -(1UL<<start_off); + } else { + contiguous_bitmap[curr_idx] |= -(1UL<<start_off); + while ( ++curr_idx < end_idx ) + contiguous_bitmap[curr_idx] = ~0UL; + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; + } +} + +static void +contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] &= + -(1UL<<end_off) | ((1UL<<start_off)-1); + } else { + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; + while ( ++curr_idx != end_idx ) + contiguous_bitmap[curr_idx] = 0; + contiguous_bitmap[curr_idx] &= -(1UL<<end_off); + } +} 
+#endif + +/* Ensure multi-page extents are contiguous in machine memory. */ +int +xen_create_contiguous_region(vm_page_t pages, int npages) +{ + unsigned long mfn, i, flags; + int order; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + balloon_lock(flags); + + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + /* 1. give away machine pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + mfn = PFNTOMFN(pfn); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1); + } + + + /* 2. Get a new contiguous memory extent. */ + reservation.extent_order = order; + /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not + * running with a broxen driver XXXEN + */ + reservation.address_bits = 31; + if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) + goto fail; + + /* 3. Map the new extent in place of old pages. 
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + xen_machphys_update(mfn+i, pfn); + PFNTOMFN(pfn) = mfn+i; + } + + xen_tlb_flush(); + +#if 0 + contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); +#endif + + balloon_unlock(flags); + + return 0; + + fail: + reservation.extent_order = 0; + reservation.address_bits = 0; + + for (i = 0; i < (1 << order); i++) { + int pfn; + pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation) != 1); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); + + return ENOMEM; +} + +void +xen_destroy_contiguous_region(void *addr, int npages) +{ + unsigned long mfn, i, flags, order, pfn0; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + + pfn0 = vtophys(addr) >> PAGE_SHIFT; +#if 0 + scrub_pages(vstart, 1 << order); +#endif + /* can currently only handle power of two allocation */ + PANIC_IF(ffs(npages) != fls(npages)); + + /* 0. determine order */ + order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); + + balloon_lock(flags); + +#if 0 + contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); +#endif + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val = 0; + pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; + + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0)); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + PANIC_IF(HYPERVISOR_memory_op( + XENMEM_decrease_reservation, &reservation) != 1); + } + + /* 2. Map new pages in place of old pages. 
*/ + for (i = 0; i < (1 << order); i++) { + int pfn; + uint64_t new_val; + pfn = pfn0 + i; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1); + + new_val = mfn << PAGE_SHIFT; + PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE), + new_val, PG_KERNEL)); + xen_machphys_update(mfn, pfn); + PFNTOMFN(pfn) = mfn; + } + + xen_tlb_flush(); + + balloon_unlock(flags); +} + +extern vm_offset_t proc0kstack; +extern int vm86paddr, vm86phystk; +char *bootmem_start, *bootmem_current, *bootmem_end; + +pteinfo_t *pteinfo_list; +void initvalues(start_info_t *startinfo); + +struct xenstore_domain_interface; +extern struct xenstore_domain_interface *xen_store; + +char *console_page; + +void * +bootmem_alloc(unsigned int size) +{ + char *retptr; + + retptr = bootmem_current; + PANIC_IF(retptr + size > bootmem_end); + bootmem_current += size; + + return retptr; +} + +void +bootmem_free(void *ptr, unsigned int size) +{ + char *tptr; + + tptr = ptr; + PANIC_IF(tptr != bootmem_current - size || + bootmem_current - size < bootmem_start); + + bootmem_current -= size; +} + +#if 0 +static vm_paddr_t +xpmap_mtop2(vm_paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + ) | (mpa & ~PG_FRAME); +} + +static pd_entry_t +xpmap_get_bootpde(vm_paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; +} + +static pd_entry_t +xpmap_get_vbootpde(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return (pde & ~PG_FRAME) | + (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t 8* +xpmap_get_bootptep(vm_paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) == 0) + return (void *)-1; +#define PT_MASK 0x003ff000 /* page table address bits */ + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(vm_paddr_t va) +{ + + return 
xpmap_get_bootptep(va)[0]; +} +#endif + + +#ifdef ADD_ISA_HOLE +static void +shift_phys_machine(unsigned long *phys_machine, int nr_pages) +{ + + unsigned long *tmp_page, *current_page, *next_page; + int i; + + tmp_page = bootmem_alloc(PAGE_SIZE); + current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); + next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); + bcopy(phys_machine, tmp_page, PAGE_SIZE); + + while (current_page > phys_machine) { + /* save next page */ + bcopy(next_page, tmp_page, PAGE_SIZE); + /* shift down page */ + bcopy(current_page, next_page, PAGE_SIZE); + /* finish swap */ + bcopy(tmp_page, current_page, PAGE_SIZE); + + current_page -= (PAGE_SIZE/sizeof(unsigned long)); + next_page -= (PAGE_SIZE/sizeof(unsigned long)); + } + bootmem_free(tmp_page, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + xen_machphys_update(phys_machine[i], i); + } + memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); + +} +#endif /* ADD_ISA_HOLE */ + +/* + * Build a directory of the pages that make up our Physical to Machine + * mapping table. The Xen suspend/restore code uses this to find our + * mapping table. 
+ */ +static void +init_frame_list_list(void *arg) +{ + unsigned long nr_pages = xen_start_info->nr_pages; +#define FPP (PAGE_SIZE/sizeof(xen_pfn_t)) + int i, j, k; + + xen_pfn_to_mfn_frame_list_list = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + for (i = 0, j = 0, k = -1; i < nr_pages; + i += FPP, j++) { + if ((j & (FPP - 1)) == 0) { + k++; + xen_pfn_to_mfn_frame_list[k] = + malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + xen_pfn_to_mfn_frame_list_list[k] = + VTOMFN(xen_pfn_to_mfn_frame_list[k]); + j = 0; + } + xen_pfn_to_mfn_frame_list[k][j] = + VTOMFN(&xen_phys_machine[i]); + } + + HYPERVISOR_shared_info->arch.max_pfn = nr_pages; + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list + = VTOMFN(xen_pfn_to_mfn_frame_list_list); +} +SYSINIT(init_fll, SI_SUB_DEVFS, SI_ORDER_ANY, init_frame_list_list, NULL); + +extern unsigned long physfree; + +int pdir, curoffset; +extern int nkpt; + +extern uint32_t kernbase; + +void +initvalues(start_info_t *startinfo) +{ + vm_offset_t cur_space, cur_space_pt; + struct physdev_set_iopl set_iopl; + + int l3_pages, l2_pages, l1_pages, offset; + vm_paddr_t console_page_ma, xen_store_ma; + vm_offset_t tmpva; + vm_paddr_t shinfo; +#ifdef PAE + vm_paddr_t IdlePDPTma, IdlePDPTnewma; + vm_paddr_t IdlePTDnewma[4]; + pd_entry_t *IdlePDPTnew, *IdlePTDnew; + vm_paddr_t IdlePTDma[4]; +#else + vm_paddr_t IdlePTDma[1]; +#endif + unsigned long i; + int ncpus = MAXCPU; + + nkpt = min( + min( + max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt), + NPGPTD*NPDEPG - KPTDI), + (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT); + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); +#ifdef notyet + /* + * need to install handler + */ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); +#endif + xen_start_info = startinfo; + xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; + + IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); + l1_pages = 0; + +#ifdef PAE + l3_pages = 1; + l2_pages = 0; + IdlePDPT = 
(pd_entry_t *)startinfo->pt_base; + IdlePDPTma = VTOM(startinfo->pt_base); + for (i = (KERNBASE >> 30); + (i < 4) && (IdlePDPT[i] != 0); i++) + l2_pages++; + /* + * Note that only one page directory has been allocated at this point. + * Thus, if KERNBASE + */ + for (i = 0; i < l2_pages; i++) + IdlePTDma[i] = VTOM(IdlePTD + i*PAGE_SIZE); + + l2_pages = (l2_pages == 0) ? 1 : l2_pages; +#else + l3_pages = 0; + l2_pages = 1; +#endif + for (i = (((KERNBASE>>18) & PAGE_MASK)>>PAGE_SHIFT); + (i<l2_pages*NPDEPG) && (i<(VM_MAX_KERNEL_ADDRESS>>PDRSHIFT)); i++) { + + if (IdlePTD[i] == 0) + break; + l1_pages++; + } + + /* number of pages allocated after the pts + 1*/; + cur_space = xen_start_info->pt_base + + (l3_pages + l2_pages + l1_pages + 1)*PAGE_SIZE; + + printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space); + + printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n", + KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), + xen_start_info->nr_pt_frames); + xendebug_flags = 0; /* 0xffffffff; */ + +#ifdef ADD_ISA_HOLE + shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); +#endif + XENPRINTF("IdlePTD %p\n", IdlePTD); + XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " + "mod_start: 0x%lx mod_len: 0x%lx\n", + xen_start_info->nr_pages, xen_start_info->shared_info, + xen_start_info->flags, xen_start_info->pt_base, + xen_start_info->mod_start, xen_start_info->mod_len); + +#ifdef PAE + IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; + bzero(IdlePDPTnew, PAGE_SIZE); + + IdlePDPTnewma = VTOM(IdlePDPTnew); + IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; + bzero(IdlePTDnew, 4*PAGE_SIZE); + + for (i = 0; i < 4; i++) + IdlePTDnewma[i] = VTOM((uint8_t *)IdlePTDnew + i*PAGE_SIZE); + /* + * L3 + * + * Copy the 4 machine addresses of the new PTDs in to the PDPT + * + */ + for (i = 0; i < 4; i++) + IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; + + __asm__("nop;"); + /* + * + * re-map 
the new PDPT read-only + */ + PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); + /* + * + * Unpin the current PDPT + */ + xen_pt_unpin(IdlePDPTma); + +#endif /* PAE */ + + /* Map proc0's KSTACK */ + proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); + printk("proc0kstack=%u\n", proc0kstack); + + /* vm86/bios stack */ + cur_space += PAGE_SIZE; + + /* Map space for the vm86 region */ + vm86paddr = (vm_offset_t)cur_space; + cur_space += (PAGE_SIZE * 3); + + /* allocate 4 pages for bootmem allocator */ + bootmem_start = bootmem_current = (char *)cur_space; + cur_space += (4 * PAGE_SIZE); + bootmem_end = (char *)cur_space; + + /* allocate pages for gdt */ + gdt = (union descriptor *)cur_space; + cur_space += PAGE_SIZE*ncpus; + + /* allocate page for ldt */ + ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; + cur_space += PAGE_SIZE; + + /* unmap remaining pages from initial chunk + * + */ + for (tmpva = cur_space; tmpva < (((uint32_t)&kernbase) + (l1_pages<<PDRSHIFT)); + tmpva += PAGE_SIZE) { + bzero((char *)tmpva, PAGE_SIZE); + PT_SET_MA(tmpva, (vm_paddr_t)0); + } + + PT_UPDATES_FLUSH(); + + memcpy(((uint8_t *)IdlePTDnew) + ((unsigned int)(KERNBASE >> 18)), + ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK), + l1_pages*sizeof(pt_entry_t)); + + for (i = 0; i < 4; i++) { + PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE, + IdlePTDnewma[i] | PG_V); + } + xen_load_cr3(VTOP(IdlePDPTnew)); + xen_pgdpt_pin(VTOM(IdlePDPTnew)); + + /* allocate remainder of nkpt pages */ + cur_space_pt = cur_space; + for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt; + i++, cur_space += PAGE_SIZE) { + pdir = (offset + i) / NPDEPG; + curoffset = ((offset + i) % NPDEPG); + if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS) + break; + + /* + * make sure that all the initial page table pages + * have been zeroed + */ + PT_SET_MA(cur_space, VTOM(cur_space) | PG_V | PG_RW); + bzero((char *)cur_space, PAGE_SIZE); + PT_SET_MA(cur_space, (vm_paddr_t)0); + 
xen_pt_pin(VTOM(cur_space)); + xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + + curoffset*sizeof(vm_paddr_t)), + VTOM(cur_space) | PG_KERNEL); + PT_UPDATES_FLUSH(); + } + + for (i = 0; i < 4; i++) { + pdir = (PTDPTDI + i) / NPDEPG; + curoffset = (PTDPTDI + i) % NPDEPG; + + xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + + curoffset*sizeof(vm_paddr_t)), + IdlePTDnewma[i] | PG_V); + } + + PT_UPDATES_FLUSH(); + + IdlePTD = IdlePTDnew; + IdlePDPT = IdlePDPTnew; + IdlePDPTma = IdlePDPTnewma; + + HYPERVISOR_shared_info = (shared_info_t *)cur_space; + cur_space += PAGE_SIZE; + + xen_store = (struct xenstore_domain_interface *)cur_space; + cur_space += PAGE_SIZE; + + console_page = (char *)cur_space; + cur_space += PAGE_SIZE; + + /* + * shared_info is an unsigned long so this will randomly break if + * it is allocated above 4GB - I guess people are used to that + * sort of thing with Xen ... sigh + */ + shinfo = xen_start_info->shared_info; + PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL); + + printk("#4\n"); + + xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT); + PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL); + console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT); + PT_SET_MA(console_page, console_page_ma | PG_KERNEL); + + printk("#5\n"); + + set_iopl.iopl = 1; + PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl)); + printk("#6\n"); +#if 0 + /* add page table for KERNBASE */ + xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t), + VTOM(cur_space) | PG_KERNEL); + xen_flush_queue(); +#ifdef PAE + xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t), + VTOM(cur_space) | PG_V | PG_A); +#else + xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t), + VTOM(cur_space) | PG_V | PG_A); +#endif + xen_flush_queue(); + cur_space += PAGE_SIZE; + printk("#6\n"); +#endif /* 0 */ +#ifdef notyet + if (xen_start_info->flags & SIF_INITDOMAIN) { + /* Map first megabyte */ + for (i = 0; 
i < (256 << PAGE_SHIFT); i += PAGE_SIZE) + PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD); + xen_flush_queue(); + } +#endif + /* + * re-map kernel text read-only + * + */ + for (i = (((vm_offset_t)&btext) & ~PAGE_MASK); + i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE) + PT_SET_MA(i, VTOM(i) | PG_V | PG_A); + + printk("#7\n"); + physfree = VTOP(cur_space); + init_first = physfree >> PAGE_SHIFT; + IdlePTD = (pd_entry_t *)VTOP(IdlePTD); + IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT); + setup_xen_features(); + printk("#8, proc0kstack=%u\n", proc0kstack); +} + + +trap_info_t trap_table[] = { + { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, + { 1, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, + { 3, 3|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, + { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, + /* This is UPL on Linux and KPL on BSD */ + { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, + { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, + { 7, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, + /* + * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)}, + * no handler for double fault + */ + { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, + {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, + {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, + {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, + {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, + {14, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, + {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, + {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, + {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, + {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)}, + {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, + {0x80, 3, 
GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, + { 0, 0, 0, 0 } +}; + +/* + * Perform a multicall and check that individual calls succeeded. + * + * Panics if the batched hypercall itself returns non-zero or if any + * entry reports a negative result; otherwise returns 0. + */ +int +HYPERVISOR_multicall(struct multicall_entry * call_list, int nr_calls) +{ + int ret = 0; + int i; + + /* Perform the multicall. */ + PANIC_IF(_HYPERVISOR_multicall(call_list, nr_calls)); + + /* + * Check the results of individual hypercalls. Compare as signed: + * the Xen public interface declares the result field unsigned + * (confirm against xen/interface/xen.h), and an unsigned value is + * never below zero, so without the cast no failure would be counted. + */ + for (i = 0; i < nr_calls; i++) + if (unlikely((long)call_list[i].result < 0)) + ret++; + if (unlikely(ret > 0)) + panic("%d multicall(s) failed: cpu %d\n", + ret, smp_processor_id()); + + /* If we didn't panic already, everything succeeded. */ + return (0); +} + +/********** CODE WORTH KEEPING ABOVE HERE *****************/ + +void xen_failsafe_handler(void); + +/* Unconditionally panics; no recovery is attempted. */ +void +xen_failsafe_handler(void) +{ + + panic("xen_failsafe_handler called!\n"); +} + +void xen_handle_thread_switch(struct pcb *pcb); + +/* This is called by cpu_switch() when switching threads. */ +/* The pcb arg refers to the process control block of the */ +/* next thread which is to run */ +/* At most three multicall entries are issued: the stack switch */ +/* and one descriptor update each for fsd and gsd when they differ */ +void +xen_handle_thread_switch(struct pcb *pcb) +{ + uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0]; + uint32_t *b = (uint32_t *)&pcb->pcb_fsd; + multicall_entry_t mcl[3]; + int i = 0; + + /* Notify Xen of task switch */ + mcl[i].op = __HYPERVISOR_stack_switch; + mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL); + mcl[i++].args[1] = (unsigned long)pcb; + + /* Check for update of fsd */ + if (*a != *b || *(a+1) != *(b+1)) { + mcl[i].op = __HYPERVISOR_update_descriptor; + *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); + *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; + } + + /* Advance both pointers by one 8-byte descriptor. */ + a += 2; + b += 2; + + /* Check for update of gsd */ + if (*a != *b || *(a+1) != *(b+1)) { + mcl[i].op = __HYPERVISOR_update_descriptor; + *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); + *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; + } + + (void)HYPERVISOR_multicall(mcl, i); +} diff --git a/sys/i386/xen/xen_rtc.c 
b/sys/i386/xen/xen_rtc.c new file mode 100644 index 0000000..8e1e017 --- /dev/null +++ b/sys/i386/xen/xen_rtc.c @@ -0,0 +1,144 @@ +/*- + * Copyright (c) 2009 Adrian Chadd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/time.h> + +#include <xen/xen_intr.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/pmap.h> +#include <xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xenfunc.h> +#include <xen/interface/io/xenbus.h> +#include <xen/interface/vcpu.h> +#include <machine/cpu.h> + +#include <machine/xen/xen_clock_util.h> + +#include "clock_if.h" + +static int +xen_rtc_probe(device_t dev) +{ + device_set_desc(dev, "Xen Hypervisor Clock"); + printf("[XEN] xen_rtc_probe: probing Hypervisor RTC clock\n"); + if (! HYPERVISOR_shared_info) { + device_printf(dev, "No hypervisor shared page found; RTC can not start.\n"); + return (EINVAL); + } + return (0); +} + +static int +xen_rtc_attach(device_t dev) +{ + printf("[XEN] xen_rtc_attach: attaching Hypervisor RTC clock\n"); + clock_register(dev, 1000000); + return(0); +} + +static int +xen_rtc_settime(device_t dev __unused, struct timespec *ts) +{ + device_printf(dev, "[XEN] xen_rtc_settime\n"); + /* + * Don't return EINVAL here; just silently fail if the domain isn't privileged enough + * to set the TOD. + */ + return(0); +} + +/* + * The Xen time structures document the hypervisor start time and the + * uptime-since-hypervisor-start (in nsec.) They need to be combined + * in order to calculate a TOD clock. 
+ */ +static int +xen_rtc_gettime(device_t dev, struct timespec *ts) +{ + struct timespec w_ts, u_ts; + + device_printf(dev, "[XEN] xen_rtc_gettime\n"); + xen_fetch_wallclock(&w_ts); + device_printf(dev, "[XEN] xen_rtc_gettime: wallclock %ld sec; %ld nsec\n", (long int) w_ts.tv_sec, (long int) w_ts.tv_nsec); + xen_fetch_uptime(&u_ts); + device_printf(dev, "[XEN] xen_rtc_gettime: uptime %ld sec; %ld nsec\n", (long int) u_ts.tv_sec, (long int) u_ts.tv_nsec); + + timespecclear(ts); + timespecadd(ts, &w_ts); + timespecadd(ts, &u_ts); + + device_printf(dev, "[XEN] xen_rtc_gettime: TOD %ld sec; %ld nsec\n", (long int) ts->tv_sec, (long int) ts->tv_nsec); + + return(0); +} + +static void +xen_rtc_identify(driver_t *drv, device_t parent) +{ + BUS_ADD_CHILD(parent, 0, "rtc", 0); +} + +static device_method_t xen_rtc_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xen_rtc_probe), + DEVMETHOD(device_attach, xen_rtc_attach), + DEVMETHOD(device_identify, xen_rtc_identify), + + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + + /* clock interface */ + DEVMETHOD(clock_gettime, xen_rtc_gettime), + DEVMETHOD(clock_settime, xen_rtc_settime), + + { 0, 0 } +}; + + +static driver_t xen_rtc_driver = { + "rtc", + xen_rtc_methods, + 0 +}; + +static devclass_t xen_rtc_devclass; + +DRIVER_MODULE(rtc, nexus, xen_rtc_driver, xen_rtc_devclass, 0, 0); |