diff options
author | Steve French <sfrench@us.ibm.com> | 2008-02-15 21:06:08 +0000 |
---|---|---|
committer | Steve French <sfrench@us.ibm.com> | 2008-02-15 21:06:08 +0000 |
commit | 0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41 (patch) | |
tree | b80b1d344ec24cad28b057ef803cebac9434be01 /arch/x86 | |
parent | 70eff55d2d979cca700aa6906494f0c474f3f7ff (diff) | |
parent | 101142c37be8e5af9b847860219217e6b958c739 (diff) | |
download | op-kernel-dev-0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41.zip op-kernel-dev-0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41.tar.gz |
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/x86')
84 files changed, 1078 insertions, 820 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4348211..3be2305 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,8 +18,11 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KVM + config GENERIC_LOCKBREAK def_bool n @@ -52,6 +55,10 @@ config HAVE_LATENCYTOP_SUPPORT config SEMAPHORE_SLEEPERS def_bool y +config FAST_CMPXCHG_LOCAL + bool + default y + config MMU def_bool y @@ -98,6 +105,9 @@ config ARCH_HAS_ILOG2_U32 config ARCH_HAS_ILOG2_U64 def_bool n +config ARCH_HAS_CPU_IDLE_WAIT + def_bool y + config GENERIC_CALIBRATE_DELAY def_bool y @@ -105,11 +115,12 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config ARCH_HAS_CPU_RELAX + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 -select HAVE_KVM - config ARCH_HIBERNATION_POSSIBLE def_bool y depends on !SMP || !X86_VOYAGER @@ -129,6 +140,9 @@ config AUDIT_ARCH bool default X86_64 +config ARCH_SUPPORTS_AOUT + def_bool y + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool @@ -415,7 +429,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m) + depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. @@ -631,7 +645,6 @@ config TOSHIBA config I8K tristate "Dell laptop support" - depends on X86_32 ---help--- This adds a driver to safely access the System Management Mode of the CPU on the Dell Inspiron 8000. The System Management Mode @@ -1571,7 +1584,7 @@ config IA32_EMULATION config IA32_AOUT tristate "IA32 a.out support" - depends on IA32_EMULATION + depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT help Support old a.out binaries in the 32bit emulation. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e1e3af..864affc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -34,13 +34,9 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -comment "Page alloc debug is incompatible with Software Suspend on i386" - depends on DEBUG_KERNEL && HIBERNATION - depends on X86_32 - config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && X86_32 + depends on DEBUG_KERNEL help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types @@ -220,9 +216,9 @@ config DEBUG_BOOT_PARAMS This option will cause struct boot_params to be exported via debugfs. config CPA_DEBUG - bool "CPA self test code" + bool "CPA self-test code" depends on DEBUG_KERNEL help - Do change_page_attr self tests at boot. + Do change_page_attr() self-tests every 30 seconds. endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 364865b..204af43 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -191,8 +191,10 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -ifeq ($(CONFIG_X86_32),y) +# suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ + +ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ endif diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 1846514..b1bdc4c 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -3,3 +3,5 @@ bzImage setup setup.bin setup.elf +cpustr.h +mkcpustr diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 1a09f93..7e7e890 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -33,8 +33,8 @@ static int skip_atoi(const char **s) #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ -#define SPECIAL 32 /* 0x */ -#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ #define do_div(n,base) ({ \ int __res; \ @@ -45,12 +45,16 @@ __res; }) static char *number(char *str, long num, int base, int size, int precision, int type) { - char c, sign, tmp[66]; - const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + char tmp[66]; + char c, sign, locase; int i; - if (type & LARGE) - digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + /* locase = 0 or 0x20. ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters */ + locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) @@ -81,7 +85,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = digits[do_div(num, base)]; + tmp[i++] = (digits[do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; @@ -95,7 +99,7 @@ static char *number(char *str, long num, int base, int size, int precision, *str++ = '0'; else if (base == 16) { *str++ = '0'; - *str++ = digits[33]; + *str++ = ('X' | locase); } } if (!(type & LEFT)) @@ -244,9 +248,9 @@ int vsprintf(char *buf, const char *fmt, va_list args) base = 8; break; - case 'X': - flags |= LARGE; case 'x': + flags |= SMALL; + case 'X': base = 16; break; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 77562e7..3df340b 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1421,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9e2b0ef..eef98cb 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1346,7 +1346,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index e4c1207..58cccb6 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -172,8 +172,7 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, has_dumped = 1; current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(current->comm)); - dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - - ((unsigned long)(&dump))); + dump.u_ar0 = offsetof(struct user32, regs); dump.signal = signr; dump_thread32(regs, &dump); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 21dc1a0..76ec0f8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,8 +84,6 @@ ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_PM) += suspend_64.o - obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d2a5843..680b730 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -78,7 +78,6 @@ int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; int acpi_strict; -EXPORT_SYMBOL(acpi_strict); u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -106,7 +105,7 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; #ifdef CONFIG_X86_64 /* rely on all ACPI tables being in the direct mapping */ -char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) { if (!phys_addr || !size) return NULL; @@ -131,7 +130,7 @@ char *__acpi_map_table(unsigned long phys_addr, unsigned long size) * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and * count idx down while incrementing the phys address. */ -char *__acpi_map_table(unsigned long phys, unsigned long size) +char *__init __acpi_map_table(unsigned long phys, unsigned long size) { unsigned long base, offset, mapped_size; int idx; @@ -490,8 +489,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) return irq; } -EXPORT_SYMBOL(acpi_register_gsi); - /* * ACPI based hotplug support for CPU */ @@ -587,25 +584,6 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); -static unsigned long __init -acpi_scan_rsdp(unsigned long start, unsigned long length) -{ - unsigned long offset = 0; - unsigned long sig_len = sizeof("RSD PTR ") - 1; - - /* - * Scan all 16-byte boundaries of the physical memory region for the - * RSDP signature. - */ - for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len)) - continue; - return (start + offset); - } - - return 0; -} - static int __init acpi_parse_sbf(struct acpi_table_header *table) { struct acpi_table_boot *sb; @@ -748,27 +726,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) return 0; } -unsigned long __init acpi_find_rsdp(void) -{ - unsigned long rsdp_phys = 0; - - if (efi_enabled) { - if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) - return efi.acpi20; - else if (efi.acpi != EFI_INVALID_TABLE_ADDR) - return efi.acpi; - } - /* - * Scan memory looking for the RSDP signature. First search EBDA (low - * memory) paragraphs and then search upper memory (E0000-FFFFF). - */ - rsdp_phys = acpi_scan_rsdp(0, 0x400); - if (!rsdp_phys) - rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000); - - return rsdp_phys; -} - #ifdef CONFIG_X86_LOCAL_APIC /* * Parse LAPIC entries in MADT diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 10b6717..8ca3557 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -126,6 +126,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d " "state\n", cx->type); } + snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + cx->address); out: set_cpus_allowed(current, saved_mask); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index a25db51..324eb0c 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -46,6 +46,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) buf[1] = 1; buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + /* + * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so + * that OSPM is capable of native ACPI throttling software + * coordination using BIOS supplied _TSD info. + */ + buf[2] |= ACPI_PDC_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d9313d9..f86a3c4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -637,7 +637,7 @@ void __init early_cpu_init(void) } /* Make sure %fs is initialized properly in idle threads */ -struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 151eda0..cb7a571 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -29,7 +29,7 @@ config X86_ACPI_CPUFREQ config ELAN_CPUFREQ tristate "AMD Elan SC400 and SC410" select CPU_FREQ_TABLE - depends on X86_32 && X86_ELAN + depends on X86_ELAN ---help--- This adds the CPUFreq driver for AMD Elan SC400 and SC410 processors. @@ -45,7 +45,7 @@ config ELAN_CPUFREQ config SC520_CPUFREQ tristate "AMD Elan SC520" select CPU_FREQ_TABLE - depends on X86_32 && X86_ELAN + depends on X86_ELAN ---help--- This adds the CPUFreq driver for AMD Elan SC520 processor. diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index 326a4c8..39f8cb1 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -23,6 +23,7 @@ #define EPS_BRAND_C7 1 #define EPS_BRAND_EDEN 2 #define EPS_BRAND_C3 3 +#define EPS_BRAND_C7D 4 struct eps_cpu_data { u32 fsb; @@ -54,6 +55,7 @@ static int eps_set_state(struct eps_cpu_data *centaur, { struct cpufreq_freqs freqs; u32 lo, hi; + u8 current_multiplier, current_voltage; int err = 0; int i; @@ -93,6 +95,15 @@ postchange: rdmsr(MSR_IA32_PERF_STATUS, lo, hi); freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", + current_multiplier); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); return err; } @@ -141,9 +152,10 @@ static int eps_cpu_init(struct cpufreq_policy *policy) u8 current_multiplier, current_voltage; u8 max_multiplier, max_voltage; u8 min_multiplier, min_voltage; - u8 brand; + u8 brand = 0; u32 fsb; struct eps_cpu_data *centaur; + struct cpuinfo_x86 *c = &cpu_data(0); struct cpufreq_frequency_table *f_table; int k, step, voltage; int ret; @@ -153,21 +165,36 @@ static int eps_cpu_init(struct cpufreq_policy *policy) return -ENODEV; /* Check brand */ - printk("eps: Detected VIA "); - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_INFO "eps: Detected VIA "); + + switch (c->x86_model) { + case 10: + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_CONT "Model A "); + break; + case 13: + rdmsr(0x1154, lo, hi); + brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; + printk(KERN_CONT "Model D "); + break; + } + switch(brand) { case EPS_BRAND_C7M: - printk("C7-M\n"); + printk(KERN_CONT "C7-M\n"); break; case EPS_BRAND_C7: - printk("C7\n"); + printk(KERN_CONT "C7\n"); break; case EPS_BRAND_EDEN: - printk("Eden\n"); + printk(KERN_CONT "Eden\n"); + break; + case EPS_BRAND_C7D: + printk(KERN_CONT "C7-D\n"); break; case EPS_BRAND_C3: - printk("C3\n"); + printk(KERN_CONT "C3\n"); return -ENODEV; break; } @@ -179,7 +206,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy) /* Can be locked at 0 */ rdmsrl(MSR_IA32_MISC_ENABLE, val); if (!(val & 1 << 16)) { - printk("eps: Can't enable Enhanced PowerSaver\n"); + printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); return -ENODEV; } } @@ -187,19 +214,19 @@ static int eps_cpu_init(struct cpufreq_policy *policy) /* Print voltage and multiplier */ rdmsr(MSR_IA32_PERF_STATUS, lo, hi); current_voltage = lo & 0xff; - printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700); + printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700); current_multiplier = (lo >> 8) & 0xff; - printk("eps: Current multiplier = %d\n", current_multiplier); + printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); /* Print limits */ max_voltage = hi & 0xff; - printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); + printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); max_multiplier = (hi >> 8) & 0xff; - printk("eps: Highest multiplier = %d\n", max_multiplier); + printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); min_voltage = (hi >> 16) & 0xff; - printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); + printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); min_multiplier = (hi >> 24) & 0xff; - printk("eps: Lowest multiplier = %d\n", min_multiplier); + printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); /* Sanity checks */ if (current_multiplier == 0 || max_multiplier == 0 @@ -208,7 +235,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy) if (current_multiplier > max_multiplier || max_multiplier <= min_multiplier) return -EINVAL; - if (current_voltage > 0x1c || max_voltage > 0x1c) + if (current_voltage > 0x1f || max_voltage > 0x1f) return -EINVAL; if (max_voltage < min_voltage) return -EINVAL; @@ -310,7 +337,7 @@ static int __init eps_init(void) /* This driver will work only on Centaur C7 processors with * Enhanced SpeedStep/PowerSaver registers */ if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model != 10) + || c->x86 != 6 || c->x86_model < 10) return -ENODEV; if (!cpu_has(c, X86_FEATURE_EST)) return -ENODEV; diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 2ed7db2..9d9eae8 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -181,8 +181,8 @@ static __init struct pci_dev *gx_detect_chipset(void) struct pci_dev *gx_pci = NULL; /* check if CPU is a MediaGX or a Geode. */ - if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { + if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && + (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { dprintk("error: no MediaGX/Geode processor found!\n"); return NULL; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index b5a9863..0a61159 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -460,7 +460,7 @@ static int powernow_decode_bios (int maxfid, int startvid) latency = psb->settlingtime; if (latency < 100) { - printk (KERN_INFO PFX "BIOS set settling time to %d microseconds." + printk(KERN_INFO PFX "BIOS set settling time to %d microseconds. " "Should be at least 100. Correcting.\n", latency); latency = 100; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a052273..c99d59d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -578,10 +578,9 @@ static void print_basics(struct powernow_k8_data *data) for (j = 0; j < data->numps; j++) { if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) { if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n", + printk(KERN_INFO PFX " %d : pstate %d (%d MHz)\n", j, - (data->powernow_table[j].index & 0xff00) >> 8, - (data->powernow_table[j].index & 0xff0000) >> 16, + data->powernow_table[j].index, data->powernow_table[j].frequency/1000); } else { printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", @@ -827,7 +826,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - u32 hi = 0, lo = 0; index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { @@ -1236,8 +1234,10 @@ static unsigned int powernowk8_get (unsigned int cpu) struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; + unsigned int first; - data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu))); + first = first_cpu(per_cpu(cpu_core_map, cpu)); + data = per_cpu(powernow_data, first); if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index afd2b52..ab48cfe 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -47,7 +47,7 @@ struct powernow_k8_data { #define CPUID_XFAM 0x0ff00000 /* extended family */ #define CPUID_XFAM_K8 0 #define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x00080000 +#define CPUID_XMOD_REV_MASK 0x000c0000 #define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ #define CPUID_USE_XFAM_XMOD 0x00000f00 #define CPUID_GET_MAX_CAPABILITIES 0x80000000 diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 76c3ab0..98d4fdb 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -189,10 +189,7 @@ static unsigned int pentium4_get_frequency(void) printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); /* Multiplier. */ - if (c->x86_model < 2) - mult = msr_lo >> 27; - else - mult = msr_lo >> 24; + mult = msr_lo >> 24; dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 404a6a2..7139b02 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -83,8 +83,6 @@ static char cyrix_model_mult2[] __cpuinitdata = "12233445"; * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP */ -extern void calibrate_delay(void) __init; - static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 24885be..9b7e01d 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -118,7 +118,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } /* Mutex protecting device creation against CPU hotplug */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1e27b69..b6e136f 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -659,7 +659,7 @@ static __init int amd_special_default_mtrr(void) */ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { - unsigned long i, base, size, highest_addr = 0, def, dummy; + unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 trim_start, trim_size; @@ -682,28 +682,27 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - base <<= PAGE_SHIFT; - size <<= PAGE_SHIFT; - if (highest_addr < base + size) - highest_addr = base + size; + if (highest_pfn < base + size) + highest_pfn = base + size; } /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_addr) { + if (!highest_pfn) { printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); WARN_ON(1); return 0; } - if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + if (highest_pfn < end_pfn) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %LdMB of RAM.\n", - (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + " all of memory, losing %luMB of RAM.\n", + (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_addr; + trim_start = highest_pfn; + trim_start <<= PAGE_SHIFT; trim_size = end_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 32dd62b..0c0eeb1 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -384,9 +384,6 @@ static void __init runtime_code_page_mkexec(void) efi_memory_desc_t *md; void *p; - if (!(__supported_pte_mask & _PAGE_NX)) - return; - /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; @@ -394,7 +391,7 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT); + set_memory_x(md->virt_addr, md->num_pages); } } @@ -428,9 +425,6 @@ void __init efi_enter_virtual_mode(void) else va = efi_ioremap(md->phys_addr, size); - if (md->attribute & EFI_MEMORY_WB) - set_memory_uc(md->virt_addr, size); - md->virt_addr = (u64) (unsigned long) va; if (!va) { @@ -439,6 +433,9 @@ void __init efi_enter_virtual_mode(void) continue; } + if (!(md->attribute & EFI_MEMORY_WB)) + set_memory_uc(md->virt_addr, md->num_pages); + systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; @@ -476,7 +473,8 @@ void __init efi_enter_virtual_mode(void) efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.set_virtual_address_map = virt_efi_set_virtual_address_map; - runtime_code_page_mkexec(); + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 09d5c23..d143a1e 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -35,6 +35,7 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/efi.h> +#include <asm/cacheflush.h> static pgd_t save_pgd __initdata; static unsigned long efi_flags __initdata; @@ -43,22 +44,15 @@ static void __init early_mapping_set_exec(unsigned long start, unsigned long end, int executable) { - pte_t *kpte; - unsigned int level; - - while (start < end) { - kpte = lookup_address((unsigned long)__va(start), &level); - BUG_ON(!kpte); - if (executable) - set_pte(kpte, pte_mkexec(*kpte)); - else - set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ - __supported_pte_mask)); - if (level == PG_LEVEL_4K) - start = (start + PAGE_SIZE) & PAGE_MASK; - else - start = (start + PMD_SIZE) & PMD_MASK; - } + unsigned long num_pages; + + start &= PMD_MASK; + end = (end + PMD_SIZE - 1) & PMD_MASK; + num_pages = (end - start) >> PAGE_SHIFT; + if (executable) + set_memory_x((unsigned long)__va(start), num_pages); + else + set_memory_nx((unsigned long)__va(start), num_pages); } static void __init early_runtime_code_mapping_set_exec(int executable) @@ -74,7 +68,7 @@ static void __init early_runtime_code_mapping_set_exec(int executable) md = p; if (md->type == EFI_RUNTIME_SERVICES_CODE) { unsigned long end; - end = md->phys_addr + (md->num_pages << PAGE_SHIFT); + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); early_mapping_set_exec(md->phys_addr, end, executable); } } @@ -84,8 +78,8 @@ void __init efi_call_phys_prelog(void) { unsigned long vaddress; - local_irq_save(efi_flags); early_runtime_code_mapping_set_exec(1); + local_irq_save(efi_flags); vaddress = (unsigned long)__va(0x0UL); save_pgd = *pgd_offset_k(0x0UL); set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); @@ -98,9 +92,9 @@ void __init efi_call_phys_epilog(void) * After the lock is released, the original page table is restored. */ set_pgd(pgd_offset_k(0x0UL), save_pgd); - early_runtime_code_mapping_set_exec(0); __flush_tlb_all(); local_irq_restore(efi_flags); + early_runtime_code_mapping_set_exec(0); } void __init efi_reserve_bootmem(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index be5c31d..824e21b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,8 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -1: INTERRUPT_RETURN +ENTRY(irq_return) + INTERRUPT_RETURN .section .fixup,"ax" iret_exc: pushl $0 # no error code @@ -418,7 +419,7 @@ iret_exc: .previous .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long irq_return,iret_exc .previous CFI_RESTORE_STATE @@ -865,20 +866,16 @@ nmi_espfix_stack: RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 -1: INTERRUPT_RETURN + jmp irq_return CFI_ENDPROC -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous KPROBE_END(nmi) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) -1: iret + iret .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long native_iret, iret_exc .previous END(native_iret) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bea8474..6be39a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -581,25 +581,41 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 0,8,0 -iret_label: -#ifdef CONFIG_PARAVIRT + RESTORE_ARGS 0,8,0 + +ENTRY(irq_return) INTERRUPT_RETURN -#endif + + .section __ex_table, "a" + .quad irq_return, bad_iret + .previous + +#ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq .section __ex_table,"a" .quad native_iret, bad_iret .previous +#endif + .section .fixup,"ax" - /* force a signal here? this matches i386 behaviour */ - /* running with kernel gs */ bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - jmp do_exit + /* + * The iret traps when the %cs or %ss being restored is bogus. + * We've lost the original trap vector and error code. + * #GPF is the most likely one to get for an invalid selector. + * So pretend we completed the iret and took the #GPF in user mode. + * + * We are now running with the kernel GS after exception recovery. + * But error_entry expects us to have user GS to match the user %cs, + * so swap back. + */ + pushq $0 + + SWAPGS + jmp general_protection + .previous /* edi: workmask, edx: work */ @@ -796,7 +812,7 @@ paranoid_swapgs\trace: SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - INTERRUPT_RETURN + jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -911,7 +927,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq irq_return(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index 9c7f7d3..9dad6ca 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -163,14 +163,11 @@ EXPORT_SYMBOL_GPL(geode_gpio_setup_event); static int __init geode_southbridge_init(void) { - int timers; - if (!is_geode()) return -ENODEV; init_lbars(); - timers = geode_mfgpt_detect(); - printk(KERN_INFO "geode: %d MFGPT timers available.\n", timers); + (void) mfgpt_timer_setup(); return 0; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 5d8c573..74ef4a4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,6 +19,10 @@ #include <asm/thread_info.h> #include <asm/asm-offsets.h> #include <asm/setup.h> +#include <asm/processor-flags.h> + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) /* * References to members of the new_cpu_data structure. @@ -80,10 +84,6 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_ */ .section .text.head,"ax",@progbits ENTRY(startup_32) - /* check to see if KEEP_SEGMENTS flag is meaningful */ - cmpw $0x207, BP_version(%esi) - jb 1f - /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ testb $(1<<6), BP_loadflags(%esi) @@ -92,7 +92,7 @@ ENTRY(startup_32) /* * Set segments to known values. */ -1: lgdt boot_gdt_descr - __PAGE_OFFSET + lgdt pa(boot_gdt_descr) movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es @@ -105,8 +105,8 @@ ENTRY(startup_32) */ cld xorl %eax,%eax - movl $__bss_start - __PAGE_OFFSET,%edi - movl $__bss_stop - __PAGE_OFFSET,%ecx + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx rep ; stosl @@ -118,31 +118,32 @@ ENTRY(startup_32) * (kexec on panic case). Hence copy out the parameters before initializing * page tables. */ - movl $(boot_params - __PAGE_OFFSET),%edi + movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld rep movsl - movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi + movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No comand line - movl $(boot_command_line - __PAGE_OFFSET),%edi + movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl 1: #ifdef CONFIG_PARAVIRT - cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET) + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) jb default_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ - movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax + movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax jae bad_subarch - movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax + movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax @@ -170,17 +171,68 @@ num_subarch_entries = (. - subarch_entries) / 4 * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. * - * Warning: don't use %esi or the stack in this code. However, %esp - * can be used as a GPR if you really need it... + * Note that the stack is not yet set up! */ -page_pde_offset = (__PAGE_OFFSET >> 20); +#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ +#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ default_entry: - movl $(pg0 - __PAGE_OFFSET), %edi - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode swapper_pg_dir is statically defined to contain enough + * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD + * entries to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at + * this stage. + */ + +#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(pg0), %edi + movl $pa(swapper_pg_pmd), %edx + movl $PTE_ATTR, %eax +10: + leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables. + */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + cmpl %ebp,%eax + jb 10b +1: + movl %edi,pa(init_pg_tables_end) + + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(pg0), %edi + movl $pa(swapper_pg_dir), %edx + movl $PTE_ATTR, %eax 10: - leal 0x007(%edi),%ecx /* Create PDE entry */ + leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -189,19 +241,20 @@ default_entry: stosl addl $0x1000,%eax loop 11b - /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ - /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ - leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables; the +0x007 is + * the attribute bits + */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b - movl %edi,(init_pg_tables_end - __PAGE_OFFSET) - - /* Do an early initialization of the fixmap area */ - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax - addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ - movl %eax, 4092(%edx) + movl %edi,pa(init_pg_tables_end) + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_dir+0xffc) +#endif jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -241,7 +294,7 @@ ENTRY(startup_32_smp) * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. */ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET +#define cr4_bits pa(mmu_cr4_features) movl cr4_bits,%edx andl %edx,%edx jz 6f @@ -276,10 +329,10 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $swapper_pg_dir-__PAGE_OFFSET,%eax + movl $pa(swapper_pg_dir),%eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax - orl $0x80000000,%eax + orl $X86_CR0_PG,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: @@ -552,16 +605,44 @@ ENTRY(_stext) */ .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm +#ifdef CONFIG_X86_PAE +ENTRY(swapper_pg_pmd) + .fill 1024*KPMDS,4,0 +#else ENTRY(swapper_pg_dir) .fill 1024,4,0 -ENTRY(swapper_pg_pmd) +#endif +ENTRY(swapper_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 - /* * This starts the data section. */ +#ifdef CONFIG_X86_PAE +.section ".data.page_aligned","wa" + /* Page-aligned for the benefit of paravirt? */ + .align PAGE_SIZE_asm +ENTRY(swapper_pg_dir) + .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE_asm /* needs to be page-sized too */ +#endif + .data ENTRY(stack_start) .long init_thread_union+THREAD_SIZE diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4f283ad..09b38d5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -250,18 +250,13 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ -#ifndef CONFIG_HOTPLUG_CPU - .pushsection .init.data -#endif + __CPUINITDATA .align 8 - .globl initial_code -initial_code: + ENTRY(initial_code) .quad x86_64_start_kernel -#ifndef CONFIG_HOTPLUG_CPU - .popsection -#endif - .globl init_rsp -init_rsp: + __FINITDATA + + ENTRY(init_rsp) .quad init_thread_union+THREAD_SIZE-8 bad_address: diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index ef62b07..8540abe 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - * !using_apic_timer decisions in do_timer_interrupt_hook() */ -struct clock_event_device pit_clockevent = { +static struct clock_event_device pit_clockevent = { .name = "pit", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1cfd60..d0b234c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -151,7 +151,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE +#ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a1fef42..236d2f8 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -234,5 +234,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(init_level4_pgt); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 219f86e..027fc06 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -12,48 +12,37 @@ */ /* - * We are using the 32Khz input clock - its the only one that has the + * We are using the 32.768kHz input clock - it's the only one that has the * ranges we find desirable. The following table lists the suitable - * divisors and the associated hz, minimum interval - * and the maximum interval: + * divisors and the associated Hz, minimum interval and the maximum interval: * - * Divisor Hz Min Delta (S) Max Delta (S) - * 1 32000 .0005 2.048 - * 2 16000 .001 4.096 - * 4 8000 .002 8.192 - * 8 4000 .004 16.384 - * 16 2000 .008 32.768 - * 32 1000 .016 65.536 - * 64 500 .032 131.072 - * 128 250 .064 262.144 - * 256 125 .128 524.288 + * Divisor Hz Min Delta (s) Max Delta (s) + * 1 32768 .00048828125 2.000 + * 2 16384 .0009765625 4.000 + * 4 8192 .001953125 8.000 + * 8 4096 .00390625 16.000 + * 16 2048 .0078125 32.000 + * 32 1024 .015625 64.000 + * 64 512 .03125 128.000 + * 128 256 .0625 256.000 + * 256 128 .125 512.000 */ #include <linux/kernel.h> #include <linux/interrupt.h> -#include <linux/module.h> #include <asm/geode.h> -#define F_AVAIL 0x01 - static struct mfgpt_timer_t { - int flags; - struct module *owner; + unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; /* Selected from the table above */ #define MFGPT_DIVISOR 16 #define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32000 / MFGPT_DIVISOR) +#define MFGPT_HZ (32768 / MFGPT_DIVISOR) #define MFGPT_PERIODIC (MFGPT_HZ / HZ) -#ifdef CONFIG_GEODE_MFGPT_TIMER -static int __init mfgpt_timer_setup(void); -#else -#define mfgpt_timer_setup() (0) -#endif - /* Allow for disabling of MFGPTs */ static int disable; static int __init mfgpt_disable(char *s) @@ -85,28 +74,37 @@ __setup("mfgptfix", mfgpt_fix); * In other cases (such as with VSAless OpenFirmware), the system firmware * leaves timers available for us to use. */ -int __init geode_mfgpt_detect(void) + + +static int timers = -1; + +static void geode_mfgpt_detect(void) { - int count = 0, i; + int i; u16 val; + timers = 0; + if (disable) { - printk(KERN_INFO "geode-mfgpt: Skipping MFGPT setup\n"); - return 0; + printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); + goto done; + } + + if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { + printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); + goto done; } for (i = 0; i < MFGPT_MAX_TIMERS; i++) { val = geode_mfgpt_read(i, MFGPT_REG_SETUP); if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].flags = F_AVAIL; - count++; + mfgpt_timers[i].avail = 1; + timers++; } } - /* set up clock event device, if desired */ - i = mfgpt_timer_setup(); - - return count; +done: + printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); } int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) @@ -183,36 +181,41 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) return 0; } -static int mfgpt_get(int timer, struct module *owner) +static int mfgpt_get(int timer) { - mfgpt_timers[timer].flags &= ~F_AVAIL; - mfgpt_timers[timer].owner = owner; + mfgpt_timers[timer].avail = 0; printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); return timer; } -int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) +int geode_mfgpt_alloc_timer(int timer, int domain) { int i; - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) - return -ENODEV; + if (timers == -1) { + /* timers haven't been detected yet */ + geode_mfgpt_detect(); + } + + if (!timers) + return -1; + if (timer >= MFGPT_MAX_TIMERS) - return -EIO; + return -1; if (timer < 0) { /* Try to find an available timer */ for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].flags & F_AVAIL) - return mfgpt_get(i, owner); + if (mfgpt_timers[i].avail) + return mfgpt_get(i); if (i == 5 && domain == MFGPT_DOMAIN_WORKING) break; } } else { /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].flags & F_AVAIL) - return mfgpt_get(timer, owner); + if (mfgpt_timers[timer].avail) + return mfgpt_get(timer); } /* No timers available - too bad */ @@ -244,10 +247,11 @@ static int __init mfgpt_setup(char *str) } __setup("mfgpt_irq=", mfgpt_setup); -static inline void mfgpt_disable_timer(u16 clock) +static void mfgpt_disable_timer(u16 clock) { - u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP); - geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN); + /* avoid races by clearing CMP1 and CMP2 unconditionally */ + geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | + MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); } static int mfgpt_next_event(unsigned long, struct clock_event_device *); @@ -263,7 +267,7 @@ static struct clock_event_device mfgpt_clockevent = { .shift = 32 }; -static inline void mfgpt_start_timer(u16 clock, u16 delta) +static void mfgpt_start_timer(u16 delta) { geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); @@ -278,21 +282,25 @@ static void mfgpt_set_mode(enum clock_event_mode mode, mfgpt_disable_timer(mfgpt_event_clock); if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(mfgpt_event_clock, MFGPT_PERIODIC); + mfgpt_start_timer(MFGPT_PERIODIC); mfgpt_tick_mode = mode; } static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) { - mfgpt_start_timer(mfgpt_event_clock, delta); + mfgpt_start_timer(delta); return 0; } -/* Assume (foolishly?), that this interrupt was due to our tick */ - static irqreturn_t mfgpt_tick(int irq, void *dev_id) { + u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); + + /* See if the interrupt was for us */ + if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) + return IRQ_NONE; + /* Turn off the clock (and clear the event) */ mfgpt_disable_timer(mfgpt_event_clock); @@ -320,13 +328,12 @@ static struct irqaction mfgptirq = { .name = "mfgpt-timer" }; -static int __init mfgpt_timer_setup(void) +int __init mfgpt_timer_setup(void) { int timer, ret; u16 val; - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING, - THIS_MODULE); + timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); if (timer < 0) { printk(KERN_ERR "mfgpt-timer: Could not allocate a MFPGT timer\n"); @@ -363,7 +370,7 @@ static int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGT timer as a clock event.\n"); + "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); clockevents_register_device(&mfgpt_clockevent); return 0; diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 67009cd..f349e68 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -736,7 +736,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", mpf, virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -751,7 +752,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size); + reserve_bootmem(mpf->mpf_physptr, size, + BOOTMEM_DEFAULT); } mpf_found = mpf; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 65f6acb..faf3229 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -749,6 +749,15 @@ void __init gart_iommu_init(void) */ set_memory_np((unsigned long)__va(iommu_bus_base), iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); /* * Try to workaround a bug (thanks to BenH) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index dabdbef..a7d50a5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/interrupt.h> #include <linux/utsname.h> #include <linux/delay.h> @@ -539,55 +538,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, return err; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - u16 gs; - -/* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.bx = regs->bx; - dump->regs.cx = regs->cx; - dump->regs.dx = regs->dx; - dump->regs.si = regs->si; - dump->regs.di = regs->di; - dump->regs.bp = regs->bp; - dump->regs.ax = regs->ax; - dump->regs.ds = (u16)regs->ds; - dump->regs.es = (u16)regs->es; - dump->regs.fs = (u16)regs->fs; - savesegment(gs,gs); - dump->regs.orig_ax = regs->orig_ax; - dump->regs.ip = regs->ip; - dump->regs.cs = (u16)regs->cs; - dump->regs.flags = regs->flags; - dump->regs.sp = regs->sp; - dump->regs.ss = (u16)regs->ss; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -} -EXPORT_SYMBOL(dump_thread); - #ifdef CONFIG_SECCOMP static void hard_disable_TSC(void) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 137a861..b0cc8f0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/interrupt.h> #include <linux/utsname.h> #include <linux/delay.h> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 96286df..702c33ef 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -103,9 +103,26 @@ static int set_segment_reg(struct task_struct *task, if (invalid_selector(value)) return -EIO; - if (offset != offsetof(struct user_regs_struct, gs)) + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + + default: *pt_regs_access(task_pt_regs(task), offset) = value; - else { + break; + + case offsetof(struct user_regs_struct, gs): task->thread.gs = value; if (task == current) /* @@ -227,12 +244,16 @@ static int set_segment_reg(struct task_struct *task, * Can't actually change these in 64-bit mode. */ case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->cs = value; #endif break; case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->ss = value; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 3cd7a2d..c47208f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -11,7 +11,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; - u32 word; + u16 word; /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) @@ -26,8 +26,11 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); - /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " @@ -380,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + default: break; } } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 5818dc2..7fd6ac4 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -326,7 +326,7 @@ static inline void kb_wait(void) } } -void machine_emergency_restart(void) +static void native_machine_emergency_restart(void) { int i; @@ -376,7 +376,7 @@ void machine_emergency_restart(void) } } -void machine_shutdown(void) +static void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP @@ -420,7 +420,7 @@ void machine_shutdown(void) #endif } -void machine_restart(char *__unused) +static void native_machine_restart(char *__unused) { printk("machine restart\n"); @@ -429,11 +429,11 @@ void machine_restart(char *__unused) machine_emergency_restart(); } -void machine_halt(void) +static void native_machine_halt(void) { } -void machine_power_off(void) +static void native_machine_power_off(void) { if (pm_power_off) { if (!reboot_force) @@ -443,9 +443,35 @@ void machine_power_off(void) } struct machine_ops machine_ops = { - .power_off = machine_power_off, - .shutdown = machine_shutdown, - .emergency_restart = machine_emergency_restart, - .restart = machine_restart, - .halt = machine_halt + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt }; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 62adc5f..691ab4c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -154,7 +154,11 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); +#ifndef CONFIG_X86_PAE unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; @@ -390,7 +394,7 @@ static void __init reserve_ebda_region(void) unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -484,7 +488,8 @@ static void __init reserve_crashkernel(void) (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); + reserve_bootmem(crash_base, crash_size, + BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); @@ -525,7 +530,7 @@ static void __init reserve_initrd(void) } if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ - reserve_bootmem(ramdisk_image, ramdisk_size); + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; return; @@ -536,7 +541,7 @@ static void __init reserve_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_bootmem(ramdisk_here, ramdisk_size); + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -606,13 +611,14 @@ void __init setup_bootmem_allocator(void) * bootmem allocator with an invalid RAM area. */ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), + BOOTMEM_DEFAULT); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); /* reserve EBDA region, it's a 4K region */ reserve_ebda_region(); @@ -622,7 +628,7 @@ void __init setup_bootmem_allocator(void) unless you have no PS/2 mouse plugged in. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 == 6) - reserve_bootmem(0xa0000 - 4096, 4096); + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT); #ifdef CONFIG_SMP /* @@ -630,7 +636,7 @@ void __init setup_bootmem_allocator(void) * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); #endif #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c8939df..c0d8208 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -15,7 +15,6 @@ #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/delay.h> @@ -189,7 +188,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); - reserve_bootmem(bootmap, bootmap_size); + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } #endif @@ -220,28 +219,35 @@ static inline void copy_edd(void) #ifdef CONFIG_KEXEC static void __init reserve_crashkernel(void) { - unsigned long long free_mem; + unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; - free_mem = - ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; - ret = parse_crashkernel(boot_command_line, free_mem, + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret == 0 && crash_size) { - if (crash_base > 0) { - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(free_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); - } else + if (crash_base <= 0) { printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; } } #else diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 5787a0c..579b9b7 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -202,8 +202,6 @@ valid_k7: ; } -extern void calibrate_delay(void); - static atomic_t init_deasserted; static void __cpuinit smp_callin(void) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 2bf6903..b72e613 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -274,7 +274,7 @@ int __init get_memcfg_from_srat(void) int tables = 0; int i = 0; - rsdp_address = acpi_find_rsdp(); + rsdp_address = acpi_os_get_root_pointer(); if (!rsdp_address) { printk("%s: System description tables not found\n", __FUNCTION__); diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 36c100c..10b8a6f 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -139,7 +139,6 @@ static int test_NX(void) * Until then, don't run them to avoid too many people getting scared * by the error message */ -#if 0 #ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ @@ -152,6 +151,7 @@ static int test_NX(void) } #endif +#if 0 /* Test 4: Check if the .data section of a module is executable */ if (test_address(&test_data)) { printk(KERN_ERR "test_nx: .data section is executable\n"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 4c16377..c29e235 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -10,8 +10,8 @@ * of the License. */ #include <linux/module.h> +#include <asm/cacheflush.h> #include <asm/sections.h> -extern int rodata_test_data; int rodata_test(void) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 0380795..c737849 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -77,7 +77,7 @@ unsigned long __init native_calculate_cpu_khz(void) reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } local_irq_save(flags); - /* start meauring cycles, incrementing from 0 */ + /* start measuring cycles, incrementing from 0 */ wrmsrl(MSR_K7_PERFCTR0 + i, 0); wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); rdtscl(tsc_start); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index e6757aa..a40051b 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL(arch_register_cpu); void arch_unregister_cpu(int num) { - return unregister_cpu(&per_cpu(cpu_devices, num).cpu); + unregister_cpu(&per_cpu(cpu_devices, num).cpu); } EXPORT_SYMBOL(arch_unregister_cpu); #else diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 3cf72977..b22c01e 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1176,17 +1176,12 @@ void __init trap_init(void) #endif set_trap_gate(19,&simd_coprocessor_error); + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generate a build-time error if the alignment is wrong. + */ + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generates a compile-time "error: zero width for bit-field" if - * the alignment is wrong. - */ - struct fxsrAlignAssert { - int _:!(offsetof(struct task_struct, - thread.i387.fxsave) & 15); - }; - printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index efc66df..0454666 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -84,7 +84,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - preempt_disable(); + inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -95,7 +95,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - preempt_enable_no_resched(); + dec_preempt_count(); } int kstack_depth_to_print = 12; diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index aad9d95..4535e6d 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -12,8 +12,10 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/timex.h> #include <linux/preempt.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/delay.h> @@ -63,7 +65,7 @@ void use_tsc_delay(void) delay_fn = delay_tsc; } -int read_current_timer(unsigned long *timer_val) +int __devinit read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { rdtscl(*timer_val); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index 45cdd3f..bbc6105 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -10,8 +10,10 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/timex.h> #include <linux/preempt.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/delay.h> #include <asm/msr.h> @@ -20,7 +22,7 @@ #include <asm/smp.h> #endif -int read_current_timer(unsigned long *timer_value) +int __devinit read_current_timer(unsigned long *timer_value) { rdtscll(*timer_value); return 0; diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index dffa786..3cc8eb2 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -444,8 +444,6 @@ static __u32 __init setup_trampoline(void) static void __init start_secondary(void *unused) { __u8 cpuid = hard_smp_processor_id(); - /* external functions not defined in the headers */ - extern void calibrate_delay(void); cpu_init(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 04b1d20..c394ca0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -391,7 +391,8 @@ unsigned long __init setup_memory(void) void __init numa_kva_reserve(void) { if (kva_pages) - reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages)); + reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages), + BOOTMEM_DEFAULT); } void __init zone_sizes_init(void) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ad8b973..fdc6674 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -186,7 +186,7 @@ static int bad_address(void *p) } #endif -void dump_pagetable(unsigned long address) +static void dump_pagetable(unsigned long address) { #ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; @@ -428,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + /* * Handle a spurious fault caused by a stale TLB entry. This allows * us to lazily refresh the TLB when increasing the permissions of a @@ -457,20 +467,21 @@ static int spurious_fault(unsigned long address, if (!pud_present(*pud)) return 0; + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) - return 0; - - return 1; + return spurious_fault_check(error_code, pte); } /* @@ -947,11 +958,12 @@ void vmalloc_sync_all(void) for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -960,7 +972,7 @@ void vmalloc_sync_all(void) else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); set_bit(pgd_index(address), insync); } if (address == start) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d1bc040..ee1091a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -46,6 +46,8 @@ #include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/paravirt.h> +#include <asm/setup.h> +#include <asm/cacheflush.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -328,44 +330,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { -#ifdef CONFIG_X86_PAE - int i; + unsigned long pfn, va; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* - * Init entries of the first-level page table to the - * zero page, if they haven't already been set up. - * - * In a normal native boot, we'll be running on a - * pagetable rooted in swapper_pg_dir, but not in PAE - * mode, so this will end up clobbering the mappings - * for the lower 24Mbytes of the address space, - * without affecting the kernel address space. + * Remove any mappings which extend past the end of physical + * memory from the boot time page table: */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) - set_pgd(&base[i], - __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); - - /* Make sure kernel address space is empty so that a pagetable - will be allocated for it. */ - memset(&base[USER_PTRS_PER_PGD], 0, - KERNEL_PGD_PTRS * sizeof(pgd_t)); -#else + for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); + pgd = base + pgd_index(va); + if (!pgd_present(*pgd)) + break; + + pud = pud_offset(pgd, va); + pmd = pmd_offset(pud, va); + if (!pmd_present(*pmd)) + break; + + pte = pte_offset_kernel(pmd, va); + if (!pte_present(*pte)) + break; + + pte_clear(NULL, va, pte); + } paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); -#endif } void __init native_pagetable_setup_done(pgd_t *base) { -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&base[0], base[USER_PTRS_PER_PGD]); -#endif } /* @@ -374,9 +370,8 @@ void __init native_pagetable_setup_done(pgd_t *base) * the boot process. * * If we're booting on native hardware, this will be a pagetable - * constructed in arch/i386/kernel/head.S, and not running in PAE mode - * (even if we'll end up running in PAE). The root of the pagetable - * will be swapper_pg_dir. + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. * * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may @@ -537,14 +532,6 @@ void __init paging_init(void) load_cr3(swapper_pg_dir); -#ifdef CONFIG_X86_PAE - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif __flush_tlb_all(); kmap_init(); @@ -675,13 +662,11 @@ void __init mem_init(void) BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + cpa_init(); + /* * Subtle. SMP is doing it's boot stuff late (because it has to * fork idle threads) - but it also needs low mappings for the diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3a98d6f..a4a9ccc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -45,6 +45,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> +#include <asm/cacheflush.h> const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); @@ -528,13 +529,15 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); + + cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; + unsigned long addr = begin; - if (begin >= end) + if (addr >= end) return; /* @@ -549,7 +552,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) #else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), @@ -591,10 +594,17 @@ void mark_rodata_ro(void) if (end <= start) return; - set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + set_memory_nx(start, (end - start) >> PAGE_SHIFT); rodata_test(); @@ -637,9 +647,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); #else - reserve_bootmem(phys, len); + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ee6648f..9f42d7e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -260,41 +260,48 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata unsigned long bm_pte[1024] +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __attribute__((aligned(PAGE_SIZE))); -static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { - return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; } -static inline unsigned long * __init early_ioremap_pte(unsigned long addr) +static inline pte_t * __init early_ioremap_pte(unsigned long addr) { - return bm_pte + ((addr >> PAGE_SHIFT) & 1023); + return &bm_pte[pte_index(addr)]; } void __init early_ioremap_init(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = __pa(bm_pte) | _PAGE_TABLE; + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + /* - * The boot-ioremap range spans multiple pgds, for which + * The boot-ioremap range spans multiple pmds, for which * we are not prepared: */ - if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); - printk(KERN_WARNING "pgd %p != %p\n", - pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", - fix_to_virt(FIX_BTMAP_BEGIN)); + fix_to_virt(FIX_BTMAP_BEGIN)); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", - fix_to_virt(FIX_BTMAP_END)); + fix_to_virt(FIX_BTMAP_END)); printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", @@ -304,28 +311,29 @@ void __init early_ioremap_init(void) void __init early_ioremap_clear(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_clear()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = 0; - paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + pmd_clear(pmd); + paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } void __init early_ioremap_reset(void) { enum fixed_addresses idx; - unsigned long *pte, phys, addr; + unsigned long addr, phys; + pte_t *pte; after_paging_init = 1; for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { addr = fix_to_virt(idx); pte = early_ioremap_pte(addr); - if (*pte & _PAGE_PRESENT) { - phys = *pte & PAGE_MASK; + if (pte_present(*pte)) { + phys = pte_val(*pte) & PAGE_MASK; set_fixmap(idx, phys); } } @@ -334,7 +342,8 @@ void __init early_ioremap_reset(void) static void __init __early_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { - unsigned long *pte, addr = __fix_to_virt(idx); + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; if (idx >= __end_of_fixed_addresses) { BUG(); @@ -342,9 +351,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } pte = early_ioremap_pte(addr); if (pgprot_val(flags)) - *pte = (phys & PAGE_MASK) | pgprot_val(flags); + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else - *pte = 0; + pte_clear(NULL, addr, pte); __flush_tlb_one(addr); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 5a02bf4..1aecc65 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -238,9 +238,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, + BOOTMEM_DEFAULT); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages<<PAGE_SHIFT); + bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 398f3a5..75f1b10 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -5,6 +5,7 @@ * and compares page tables forwards and afterwards. */ #include <linux/bootmem.h> +#include <linux/kthread.h> #include <linux/random.h> #include <linux/kernel.h> #include <linux/init.h> @@ -14,8 +15,13 @@ #include <asm/pgtable.h> #include <asm/kdebug.h> +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + enum { - NTEST = 4000, + NTEST = 400, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -31,10 +37,9 @@ struct split_state { long min_exec, max_exec; }; -static __init int print_split(struct split_state *s) +static int print_split(struct split_state *s) { long i, expected, missed = 0; - int printed = 0; int err = 0; s->lpg = s->gpg = s->spg = s->exec = 0; @@ -47,12 +52,6 @@ static __init int print_split(struct split_state *s) pte = lookup_address(addr, &level); if (!pte) { - if (!printed) { - dump_pagetable(addr); - printk(KERN_INFO "CPA %lx no pte level %d\n", - addr, level); - printed = 1; - } missed++; i++; continue; @@ -82,10 +81,13 @@ static __init int print_split(struct split_state *s) s->max_exec = addr; } } - printk(KERN_INFO - "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", - s->spg, s->lpg, s->gpg, s->exec, - s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; if (expected != i) { @@ -96,11 +98,11 @@ static __init int print_split(struct split_state *s) return err; } -static unsigned long __initdata addr[NTEST]; -static unsigned int __initdata len[NTEST]; +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; /* Change the global bit on random pages in the direct mapping */ -static __init int exercise_pageattr(void) +static int pageattr_test(void) { struct split_state sa, sb, sc; unsigned long *bm; @@ -110,7 +112,8 @@ static __init int exercise_pageattr(void) int i, k; int err; - printk(KERN_INFO "CPA exercising pageattr\n"); + if (print) + printk(KERN_INFO "CPA self-test:\n"); bm = vmalloc((max_pfn_mapped + 7) / 8); if (!bm) { @@ -186,7 +189,6 @@ static __init int exercise_pageattr(void) failed += print_split(&sb); - printk(KERN_INFO "CPA reverting everything\n"); for (i = 0; i < NTEST; i++) { if (!addr[i]) continue; @@ -214,12 +216,40 @@ static __init int exercise_pageattr(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + printk(KERN_ERR "NOT PASSED. Please report.\n"); WARN_ON(1); + return -EINVAL; } else { - printk(KERN_INFO "CPA selftests PASSED\n"); + if (print) + printk(KERN_INFO "ok.\n"); + } + + return 0; +} + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); return 0; } -module_init(exercise_pageattr); + +module_init(start_pageattr_test); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 16ce841..4119379 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/interrupt.h> #include <asm/e820.h> #include <asm/processor.h> @@ -167,8 +168,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) pgprot_val(forbidden) |= _PAGE_NX; - -#ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) @@ -179,7 +178,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(__start_rodata), virt_to_highmap(__end_rodata))) pgprot_val(forbidden) |= _PAGE_RW; -#endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); @@ -194,7 +192,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) * or when the present bit is not set. Otherwise we would return a * pointer to a nonexisting mapping. */ -pte_t *lookup_address(unsigned long address, int *level) +pte_t *lookup_address(unsigned long address, unsigned int *level) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; @@ -255,21 +253,11 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags; + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int level, do_split = 1; - - /* - * An Athlon 64 X2 showed hard hangs if we tried to preserve - * largepages and changed the PSE entry from RW to RO. - * - * As AMD CPUs have a long series of erratas in this area, - * (and none of the known ones seem to explain this hang), - * disable this code until the hang can be debugged: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return 1; + int i, do_split = 1; + unsigned int level; spin_lock_irqsave(&pgd_lock, flags); /* @@ -287,8 +275,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, break; #ifdef CONFIG_X86_64 case PG_LEVEL_1G: - psize = PMD_PAGE_SIZE; - pmask = PMD_PAGE_MASK; + psize = PUD_PAGE_SIZE; + pmask = PUD_PAGE_MASK; break; #endif default: @@ -316,6 +304,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, new_prot = static_protections(new_prot, address); /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + addr = address + PAGE_SIZE; + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) { + pgprot_t chk_prot = static_protections(new_prot, addr); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + + /* * If there are no changes, return. maxpages has been updated * above: */ @@ -349,23 +350,103 @@ out_unlock: return do_split; } +static LIST_HEAD(page_pool); +static unsigned long pool_size, pool_pages, pool_low; +static unsigned long pool_used, pool_failed, pool_refill; + +static void cpa_fill_pool(void) +{ + struct page *p; + gfp_t gfp = GFP_KERNEL; + + /* Do not allocate from interrupt context */ + if (in_irq() || irqs_disabled()) + return; + /* + * Check unlocked. I does not matter when we have one more + * page in the pool. The bit lock avoids recursive pool + * allocations: + */ + if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + return; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * We could do: + * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; + * but this fails on !PREEMPT kernels + */ + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; +#endif + + while (pool_pages < pool_size) { + p = alloc_pages(gfp, 0); + if (!p) { + pool_failed++; + break; + } + spin_lock_irq(&pgd_lock); + list_add(&p->lru, &page_pool); + pool_pages++; + spin_unlock_irq(&pgd_lock); + } + clear_bit_unlock(0, &pool_refill); +} + +#define SHIFT_MB (20 - PAGE_SHIFT) +#define ROUND_MB_GB ((1 << 10) - 1) +#define SHIFT_MB_GB 10 +#define POOL_PAGES_PER_GB 16 + +void __init cpa_init(void) +{ + struct sysinfo si; + unsigned long gb; + + si_meminfo(&si); + /* + * Calculate the number of pool pages: + * + * Convert totalram (nr of pages) to MiB and round to the next + * GiB. Shift MiB to Gib and multiply the result by + * POOL_PAGES_PER_GB: + */ + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + pool_low = pool_size; + + cpa_fill_pool(); + printk(KERN_DEBUG + "CPA: page pool initialized %lu of %lu pages preallocated\n", + pool_pages, pool_size); +} + static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; - gfp_t gfp_flags = GFP_KERNEL; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; struct page *base; -#ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = GFP_ATOMIC | __GFP_NOWARN; -#endif - base = alloc_pages(gfp_flags, 0); - if (!base) + /* + * Get a page from the pool. The pool list is protected by the + * pgd_lock, which we have to take anyway for the split + * operation: + */ + spin_lock_irqsave(&pgd_lock, flags); + if (list_empty(&page_pool)) { + spin_unlock_irqrestore(&pgd_lock, flags); return -ENOMEM; + } + + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; - spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page * up for us already: @@ -410,17 +491,24 @@ static int split_large_page(pte_t *kpte, unsigned long address) base = NULL; out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) { + list_add(&base->lru, &page_pool); + pool_pages++; + } else + pool_used++; spin_unlock_irqrestore(&pgd_lock, flags); - if (base) - __free_pages(base, 0); - return 0; } static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { - int level, do_split, err; + int do_split, err; + unsigned int level; struct page *kpte_page; pte_t *kpte; @@ -600,6 +688,15 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) return 0; + /* Ensure we are PAGE_SIZE aligned */ + if (addr & ~PAGE_MASK) { + addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; @@ -612,7 +709,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * Check whether we really changed something: */ if (!cpa.flushtlb) - return ret; + goto out; /* * No need to flush, when we did not set any of the caching @@ -631,6 +728,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, else cpa_flush_all(cache); +out: + cpa_fill_pool(); return ret; } @@ -771,8 +870,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable) return; /* - * The return value is ignored - the calls cannot fail, - * large pages are disabled at boot time: + * The return value is ignored as the calls cannot fail. + * Large pages are kept enabled at boot time, and are + * split up quickly with DEBUG_PAGEALLOC. If a splitup + * fails here (due to temporary memory shortage) no damage + * is done because we just keep the largepage intact up + * to the next attempt when it will likely be split up: */ if (enable) __set_pages_p(page, numpages); @@ -784,6 +887,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + /* + * Try to refill the page pool here. We can do this only after + * the tlb flush. + */ + cpa_fill_pool(); } #endif diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6c19146..73aba71 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -183,7 +183,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; @@ -192,6 +192,8 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) #else pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); #endif + if (pte) + pgtable_page_ctor(pte); return pte; } @@ -365,6 +367,7 @@ void check_pgt_cache(void) void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { + pgtable_page_dtor(pte); paravirt_release_pt(page_to_pfn(pte)); tlb_remove_page(tlb, pte); } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 65416f8..ecd91ea 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -488,7 +488,8 @@ void __init srat_reserve_add_area(int nodeid) printk(KERN_INFO "SRAT: This will cost you %Lu MB of " "pre-allocated memory.\n", (unsigned long long)total_mb); reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start); + nodes_add[nodeid].end - nodes_add[nodeid].start, + BOOTMEM_DEFAULT); } } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 52deabc7..b7c67a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -26,16 +26,37 @@ int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; struct pci_raw_ops *raw_pci_ops; +struct pci_raw_ops *raw_pci_ext_ops; + +int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 *val) +{ + if (reg < 256 && raw_pci_ops) + return raw_pci_ops->read(domain, bus, devfn, reg, len, val); + if (raw_pci_ext_ops) + return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); + return -EINVAL; +} + +int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 val) +{ + if (reg < 256 && raw_pci_ops) + return raw_pci_ops->write(domain, bus, devfn, reg, len, val); + if (raw_pci_ext_ops) + return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); + return -EINVAL; +} static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value) { - return raw_pci_ops->read(pci_domain_nr(bus), bus->number, + return raw_pci_read(pci_domain_nr(bus), bus->number, devfn, where, size, value); } static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value) { - return raw_pci_ops->write(pci_domain_nr(bus), bus->number, + return raw_pci_write(pci_domain_nr(bus), bus->number, devfn, where, size, value); } diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 431c9a5..42f3e4c 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -14,7 +14,7 @@ #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) -int pci_conf1_read(unsigned int seg, unsigned int bus, +static int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; @@ -45,7 +45,7 @@ int pci_conf1_read(unsigned int seg, unsigned int bus, return 0; } -int pci_conf1_write(unsigned int seg, unsigned int bus, +static int pci_conf1_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 74d30ff..a5ef5f5 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -215,7 +215,8 @@ static int quirk_aspm_offset[MAX_PCIEROOT << 3]; static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value) { - return raw_pci_ops->read(0, bus->number, devfn, where, size, value); + return raw_pci_read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); } /* @@ -231,7 +232,8 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh if ((offset) && (where == offset)) value = value & 0xfffffffc; - return raw_pci_ops->write(0, bus->number, devfn, where, size, value); + return raw_pci_write(pci_domain_nr(bus), bus->number, + devfn, where, size, value); } static struct pci_ops quirk_pcie_aspm_ops = { diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 5565d70..e041ced 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -22,7 +22,7 @@ static void __devinit pcibios_fixup_peer_bridges(void) if (pci_find_bus(0, n)) continue; for (devfn = 0; devfn < 256; devfn += 8) { - if (!raw_pci_ops->read(0, n, devfn, PCI_VENDOR_ID, 2, &l) && + if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) && l != 0x0000 && l != 0xffff) { DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l); printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 4df637e..8d54df4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -22,46 +22,13 @@ #define MMCONFIG_APER_MIN (2 * 1024*1024) #define MMCONFIG_APER_MAX (256 * 1024*1024) -DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); - /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static void __init unreachable_devices(void) -{ - int i, bus; - /* Use the max bus number from ACPI here? */ - for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) { - for (i = 0; i < 32; i++) { - unsigned int devfn = PCI_DEVFN(i, 0); - u32 val1, val2; - - pci_conf1_read(0, bus, devfn, 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - - if (pci_mmcfg_arch_reachable(0, bus, devfn)) { - raw_pci_ops->read(0, bus, devfn, 0, 4, &val2); - if (val1 == val2) - continue; - } - set_bit(i + 32 * bus, pci_mmcfg_fallback_slots); - printk(KERN_NOTICE "PCI: No mmconfig possible on device" - " %02x:%02x\n", bus, i); - } - } -} - static const char __init *pci_mmcfg_e7520(void) { u32 win; - pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); + pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); win = win & 0xf000; if(win == 0x0000 || win == 0xf000) @@ -86,7 +53,7 @@ static const char __init *pci_mmcfg_intel_945(void) pci_mmcfg_config_num = 1; - pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar); + pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar); /* Enable bit */ if (!(pciexbar & 1)) @@ -151,7 +118,7 @@ static int __init pci_mmcfg_check_hostbridge(void) int i; const char *name; - pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l); + pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0, 4, &l); vendor = l & 0xffff; device = (l >> 16) & 0xffff; @@ -270,8 +237,6 @@ void __init pci_mmcfg_init(int type) return; if (pci_mmcfg_arch_init()) { - if (type == 1) - unreachable_devices(); if (known_bridge) pci_mmcfg_insert_resources(IORESOURCE_BUSY); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 1bf5816..081816a 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -30,10 +30,6 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) struct acpi_mcfg_allocation *cfg; int cfg_num; - if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && - test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots)) - return 0; - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = &pci_mmcfg_config[cfg_num]; if (cfg->pci_segment == seg && @@ -68,13 +64,13 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, u32 base; if ((bus > 255) || (devfn > 255) || (reg > 4095)) { - *value = -1; +err: *value = -1; return -EINVAL; } base = get_base_addr(seg, bus, devfn); if (!base) - return pci_conf1_read(seg,bus,devfn,reg,len,value); + goto err; spin_lock_irqsave(&pci_config_lock, flags); @@ -107,7 +103,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, base = get_base_addr(seg, bus, devfn); if (!base) - return pci_conf1_write(seg,bus,devfn,reg,len,value); + return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); @@ -134,15 +130,9 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, - unsigned int devfn) -{ - return get_base_addr(seg, bus, devfn) != 0; -} - int __init pci_mmcfg_arch_init(void) { - printk(KERN_INFO "PCI: Using MMCONFIG\n"); - raw_pci_ops = &pci_mmcfg; + printk(KERN_INFO "PCI: Using MMCONFIG for extended config space\n"); + raw_pci_ext_ops = &pci_mmcfg; return 1; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 4095e4d..9207fd4 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -40,9 +40,7 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; - if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && - test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) - return NULL; + addr = get_virt(seg, bus); if (!addr) return NULL; @@ -56,13 +54,13 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { - *value = -1; +err: *value = -1; return -EINVAL; } addr = pci_dev_base(seg, bus, devfn); if (!addr) - return pci_conf1_read(seg,bus,devfn,reg,len,value); + goto err; switch (len) { case 1: @@ -90,7 +88,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, addr = pci_dev_base(seg, bus, devfn); if (!addr) - return pci_conf1_write(seg,bus,devfn,reg,len,value); + return -EINVAL; switch (len) { case 1: @@ -126,12 +124,6 @@ static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) return addr; } -int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, - unsigned int devfn) -{ - return pci_dev_base(seg, bus, devfn) != NULL; -} - int __init pci_mmcfg_arch_init(void) { int i; @@ -152,6 +144,6 @@ int __init pci_mmcfg_arch_init(void) return 0; } } - raw_pci_ops = &pci_mmcfg; + raw_pci_ext_ops = &pci_mmcfg; return 1; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index ac56d39..3431518 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -85,10 +85,17 @@ extern spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern int pci_conf1_write(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 value); -extern int pci_conf1_read(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 *value); +struct pci_raw_ops { + int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 *val); + int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 val); +}; + +extern struct pci_raw_ops *raw_pci_ops; +extern struct pci_raw_ops *raw_pci_ext_ops; + +extern struct pci_raw_ops pci_direct_conf1; extern int pci_direct_probe(void); extern void pci_direct_init(int type); @@ -98,13 +105,6 @@ extern void pcibios_sort(void); /* pci-mmconfig.c */ -/* Verify the first 16 busses. We assume that systems with more busses - get MCFG right. */ -#define PCI_MMCFG_MAX_CHECK_BUS 16 -extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); - -extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, - unsigned int devfn); extern int __init pci_mmcfg_arch_init(void); /* diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 8ecb1c7..c2df4e9 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -13,9 +13,6 @@ #include "pci.h" - -extern struct pci_raw_ops pci_direct_conf1; - static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } static void pci_visws_disable_irq(struct pci_dev *dev) { } diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index d764ec9..9ff4d5b 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_PM) += cpu.o -obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu_32.c index efcf620..7f9c6da 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu_32.c @@ -40,7 +40,7 @@ static void __save_processor_state(struct saved_context *ctxt) savesegment(ss, ctxt->ss); /* - * control registers + * control registers */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/power/cpu_64.c index 7ac7130..66bdfb5 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/power/cpu_64.c @@ -1,8 +1,9 @@ /* - * Suspend support specific for i386. + * Suspend and hibernation support for x86-64 * * Distribute under GPLv2 * + * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ @@ -14,9 +15,6 @@ #include <asm/pgtable.h> #include <asm/mtrr.h> -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - static void fix_processor_context(void); struct saved_context saved_context; @@ -63,7 +61,7 @@ static void __save_processor_state(struct saved_context *ctxt) mtrr_save_fixed_ranges(NULL); /* - * control registers + * control registers */ rdmsrl(MSR_EFER, ctxt->efer); ctxt->cr0 = read_cr0(); @@ -166,155 +164,3 @@ static void fix_processor_context(void) loaddebug(¤t->thread, 7); } } - -#ifdef CONFIG_HIBERNATION -/* Defined in arch/x86_64/kernel/suspend_asm.S */ -extern int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3; - -pgd_t *temp_level4_pgt; - -void *relocated_restore_code; - -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; -} - -static int set_up_temporary_mappings(void) -{ - unsigned long start, end, next; - int error; - - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) - return -ENOMEM; - - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - - /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); - } - return 0; -} - -int swsusp_arch_resume(void) -{ - int error; - - /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) - return error; - - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); - - restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -struct restore_data_record { - unsigned long jump_address; - unsigned long cr3; - unsigned long magic; -}; - -#define RESTORE_MAGIC 0x0123456789ABCDEFUL - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = restore_jump_address; - rdr->cr3 = restore_cr3; - rdr->magic = RESTORE_MAGIC; - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; -} -#endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86/power/suspend.c b/arch/x86/power/hibernate_32.c index a0020b9..f2b6e3f 100644 --- a/arch/x86/power/suspend.c +++ b/arch/x86/power/hibernate_32.c @@ -1,5 +1,5 @@ /* - * Suspend support specific for i386 - temporary page tables + * Hibernation support specific for i386 - temporary page tables * * Distribute under GPLv2 * @@ -13,7 +13,7 @@ #include <asm/page.h> #include <asm/pgtable.h> -/* Defined in arch/i386/power/swsusp.S */ +/* Defined in hibernate_asm_32.S */ extern int restore_image(void); /* References to section boundaries */ @@ -23,7 +23,7 @@ extern const void __nosave_begin, __nosave_end; pgd_t *resume_pg_dir; /* The following three functions are based on the analogous code in - * arch/i386/mm/init.c + * arch/x86/mm/init_32.c */ /* diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c new file mode 100644 index 0000000..b542355 --- /dev/null +++ b/arch/x86/power/hibernate_64.c @@ -0,0 +1,169 @@ +/* + * Hibernation support for x86-64 + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ + +#include <linux/smp.h> +#include <linux/suspend.h> +#include <asm/proto.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mtrr.h> + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* Defined in hibernate_asm_64.S */ +extern int restore_image(void); + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + +pgd_t *temp_level4_pgt; + +void *relocated_restore_code; + +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } + return 0; +} + +int swsusp_arch_resume(void) +{ + int error; + + /* We have got enough memory and from now on we cannot recover */ + if ((error = set_up_temporary_mappings())) + return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + + restore_image(); + return 0; +} + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +struct restore_data_record { + unsigned long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; +} diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/hibernate_asm_32.S index 53662e0..b95aa6c 100644 --- a/arch/x86/power/swsusp.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -1,7 +1,6 @@ .text -/* Originally gcc generated, modified by hand - * +/* * This may not use any stack, nor any variable that is not "NoSave": * * Its rewriting one kernel image with another. What is stack in "old" diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/power/hibernate_asm_64.S index aeb9a4d..1deb3244 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -1,7 +1,12 @@ -/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> +/* + * Hibernation support for x86-64 * * Distribute under GPLv2. * + * Copyright 2007 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright 2005 Andi Kleen <ak@suse.de> + * Copyright 2004 Pavel Machek <pavel@suse.cz> + * * swsusp_arch_resume must not use any stack or any nonlocal variables while * copying pages: * @@ -9,7 +14,7 @@ * image could very well be data page in "new" image, and overwriting * your own stack under you is bad idea. */ - + .text #include <linux/linkage.h> #include <asm/segment.h> diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index d28dda5..f385a4b 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -7,7 +7,7 @@ VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_COMPAT) := y vdso-install-$(VDSO64-y) += vdso.so -vdso-install-$(VDSO32-y) += $(vdso32-y:=.so) +vdso-install-$(VDSO32-y) += $(vdso32-images) # files to link into the vdso @@ -63,6 +63,8 @@ vdso32.so-$(CONFIG_X86_32) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter +vdso32-images = $(vdso32.so-y:%=vdso32-%.so) + CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 @@ -71,21 +73,21 @@ VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so) +targets += $(vdso32-images) $(vdso32-images:=.dbg) targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) -extra-y += $(vdso32.so-y:%=vdso32-%.so) +extra-y += $(vdso32-images) -$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so) +$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32 +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 -$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o +$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o $(call if_changed,vdso) # Make vdso32-*-syms.lds from each image, and then make sure they match. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index de647bc..49e5358 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -798,6 +798,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base) * added to the table can be prepared properly for Xen. */ xen_write_cr3(__pa(base)); + + /* Unpin initial Xen pagetable */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(xen_start_info->pt_base))); } static __init void xen_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 45aa771..0144395 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,7 +58,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { - int level; + unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & PAGE_MASK; @@ -71,7 +71,7 @@ void make_lowmem_page_readonly(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); @@ -86,7 +86,7 @@ void make_lowmem_page_readwrite(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3721fd..c39e1a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -217,17 +217,17 @@ unsigned long long xen_sched_clock(void) /* Get the CPU speed from Xen */ unsigned long xen_cpu_khz(void) { - u64 cpu_khz = 1000000ULL << 32; + u64 xen_khz = 1000000ULL << 32; const struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(cpu_khz, info->tsc_to_system_mul); + do_div(xen_khz, info->tsc_to_system_mul); if (info->tsc_shift < 0) - cpu_khz <<= -info->tsc_shift; + xen_khz <<= -info->tsc_shift; else - cpu_khz >>= info->tsc_shift; + xen_khz >>= info->tsc_shift; - return cpu_khz; + return xen_khz; } /* |