summaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig70
-rw-r--r--arch/x86_64/Kconfig.debug19
-rw-r--r--arch/x86_64/defconfig98
-rw-r--r--arch/x86_64/ia32/ia32_aout.c4
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c4
-rw-r--r--arch/x86_64/ia32/ia32_ioctl.c131
-rw-r--r--arch/x86_64/ia32/ia32_signal.c6
-rw-r--r--arch/x86_64/kernel/Makefile3
-rw-r--r--arch/x86_64/kernel/aperture.c2
-rw-r--r--arch/x86_64/kernel/apic.c10
-rw-r--r--arch/x86_64/kernel/e820.c3
-rw-r--r--arch/x86_64/kernel/entry.S3
-rw-r--r--arch/x86_64/kernel/head.S77
-rw-r--r--arch/x86_64/kernel/head64.c14
-rw-r--r--arch/x86_64/kernel/i8259.c12
-rw-r--r--arch/x86_64/kernel/io_apic.c80
-rw-r--r--arch/x86_64/kernel/kprobes.c191
-rw-r--r--arch/x86_64/kernel/mce.c27
-rw-r--r--arch/x86_64/kernel/mce_amd.c538
-rw-r--r--arch/x86_64/kernel/mpparse.c23
-rw-r--r--arch/x86_64/kernel/pci-gart.c12
-rw-r--r--arch/x86_64/kernel/pci-nommu.c2
-rw-r--r--arch/x86_64/kernel/process.c123
-rw-r--r--arch/x86_64/kernel/ptrace.c43
-rw-r--r--arch/x86_64/kernel/reboot.c7
-rw-r--r--arch/x86_64/kernel/setup.c118
-rw-r--r--arch/x86_64/kernel/setup64.c6
-rw-r--r--arch/x86_64/kernel/signal.c17
-rw-r--r--arch/x86_64/kernel/smp.c7
-rw-r--r--arch/x86_64/kernel/smpboot.c120
-rw-r--r--arch/x86_64/kernel/suspend.c92
-rw-r--r--arch/x86_64/kernel/suspend_asm.S17
-rw-r--r--arch/x86_64/kernel/sys_x86_64.c14
-rw-r--r--arch/x86_64/kernel/time.c35
-rw-r--r--arch/x86_64/kernel/traps.c44
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c3
-rw-r--r--arch/x86_64/lib/bitops.c66
-rw-r--r--arch/x86_64/lib/clear_page.S38
-rw-r--r--arch/x86_64/lib/copy_page.S87
-rw-r--r--arch/x86_64/lib/memcpy.S93
-rw-r--r--arch/x86_64/lib/memset.S94
-rw-r--r--arch/x86_64/mm/fault.c19
-rw-r--r--arch/x86_64/mm/init.c129
-rw-r--r--arch/x86_64/mm/ioremap.c4
-rw-r--r--arch/x86_64/mm/k8topology.c1
-rw-r--r--arch/x86_64/mm/numa.c130
-rw-r--r--arch/x86_64/mm/pageattr.c2
-rw-r--r--arch/x86_64/mm/srat.c6
-rw-r--r--arch/x86_64/oprofile/Kconfig6
50 files changed, 1546 insertions, 1106 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 21afa69..6ece645 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -226,22 +226,42 @@ config SCHED_SMT
source "kernel/Kconfig.preempt"
-config K8_NUMA
- bool "K8 NUMA support"
- select NUMA
+config NUMA
+ bool "Non Uniform Memory Access (NUMA) Support"
depends on SMP
help
- Enable NUMA (Non Unified Memory Architecture) support for
- AMD Opteron Multiprocessor systems. The kernel will try to allocate
- memory used by a CPU on the local memory controller of the CPU
- and add some more NUMA awareness to the kernel.
- This code is recommended on all multiprocessor Opteron systems
- and normally doesn't hurt on others.
+ Enable NUMA (Non Uniform Memory Access) support. The kernel
+ will try to allocate memory used by a CPU on the local memory
+ controller of the CPU and add some more NUMA awareness to the kernel.
+ This code is recommended on all multiprocessor Opteron systems.
+ If the system is EM64T, you should say N unless your system is EM64T
+ NUMA.
+
+config K8_NUMA
+ bool "Old style AMD Opteron NUMA detection"
+ depends on NUMA
+ default y
+ help
+ Enable K8 NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD K8 system. This uses an old
+ method to read the NUMA configurtion directly from the builtin
+ Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+ instead, which also takes priority if both are compiled in.
+
+# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
+
+config X86_64_ACPI_NUMA
+ bool "ACPI NUMA detection"
+ depends on NUMA
+ select ACPI
+ select ACPI_NUMA
+ default y
+ help
+ Enable ACPI SRAT based node topology detection.
config NUMA_EMU
- bool "NUMA emulation support"
- select NUMA
- depends on SMP
+ bool "NUMA emulation"
+ depends on NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -252,9 +272,6 @@ config ARCH_DISCONTIGMEM_ENABLE
depends on NUMA
default y
-config NUMA
- bool
- default n
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -374,6 +391,14 @@ config X86_MCE_INTEL
Additional support for intel specific MCE features such as
the thermal monitor.
+config X86_MCE_AMD
+ bool "AMD MCE features"
+ depends on X86_MCE && X86_LOCAL_APIC
+ default y
+ help
+ Additional support for AMD specific MCE features such as
+ the DRAM Error Threshold.
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if EMBEDDED
default "0x100000"
@@ -502,7 +527,7 @@ config IA32_EMULATION
left.
config IA32_AOUT
- bool "IA32 a.out support"
+ tristate "IA32 a.out support"
depends on IA32_EMULATION
help
Support old a.out binaries in the 32bit emulation.
@@ -532,8 +557,21 @@ source "drivers/firmware/Kconfig"
source fs/Kconfig
+menu "Instrumentation Support"
+ depends on EXPERIMENTAL
+
source "arch/x86_64/oprofile/Kconfig"
+config KPROBES
+ bool "Kprobes (EXPERIMENTAL)"
+ help
+ Kprobes allows you to trap at almost any kernel address and
+ execute a callback function. register_kprobe() establishes
+ a probepoint and specifies the callback. Kprobes is useful
+ for kernel debugging, non-intrusive instrumentation and testing.
+ If in doubt, say "N".
+endmenu
+
source "arch/x86_64/Kconfig.debug"
source "security/Kconfig"
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index 9cf1410..e2c6e64a 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -2,15 +2,6 @@ menu "Kernel hacking"
source "lib/Kconfig.debug"
-# !SMP for now because the context switch early causes GPF in segment reloading
-# and the GS base checking does the wrong thing then, causing a hang.
-config CHECKING
- bool "Additional run-time checks"
- depends on DEBUG_KERNEL && !SMP
- help
- Enables some internal consistency checks for kernel debugging.
- You should normally say N.
-
config INIT_DEBUG
bool "Debug __init statements"
depends on DEBUG_KERNEL
@@ -33,16 +24,6 @@ config IOMMU_DEBUG
options. See Documentation/x86_64/boot-options.txt for more
details.
-config KPROBES
- bool "Kprobes"
- depends on DEBUG_KERNEL
- help
- Kprobes allows you to trap at almost any kernel address and
- execute a callback function. register_kprobe() establishes
- a probepoint and specifies the callback. Kprobes is useful
- for kernel debugging, non-intrusive instrumentation and testing.
- If in doubt, say "N".
-
config IOMMU_LEAK
bool "IOMMU leak tracing"
depends on DEBUG_KERNEL
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index f8db7e5..5d56542 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.13-git11
-# Mon Sep 12 16:16:16 2005
+# Linux kernel version: 2.6.14-git7
+# Sat Nov 5 15:55:50 2005
#
CONFIG_X86_64=y
CONFIG_64BIT=y
@@ -35,7 +35,7 @@ CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
CONFIG_SYSCTL=y
# CONFIG_AUDIT is not set
-# CONFIG_HOTPLUG is not set
+CONFIG_HOTPLUG=y
CONFIG_KOBJECT_UEVENT=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -93,10 +93,11 @@ CONFIG_PREEMPT_NONE=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_BKL=y
+CONFIG_NUMA=y
CONFIG_K8_NUMA=y
+CONFIG_X86_64_ACPI_NUMA=y
# CONFIG_NUMA_EMU is not set
CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
-CONFIG_NUMA=y
CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_SELECT_MEMORY_MODEL=y
@@ -107,9 +108,10 @@ CONFIG_DISCONTIGMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_NEED_MULTIPLE_NODES=y
# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
-CONFIG_HAVE_DEC_LOCK=y
CONFIG_NR_CPUS=32
+CONFIG_HOTPLUG_CPU=y
CONFIG_HPET_TIMER=y
CONFIG_X86_PM_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
@@ -117,6 +119,7 @@ CONFIG_GART_IOMMU=y
CONFIG_SWIOTLB=y
CONFIG_X86_MCE=y
CONFIG_X86_MCE_INTEL=y
+CONFIG_X86_MCE_AMD=y
CONFIG_PHYSICAL_START=0x100000
# CONFIG_KEXEC is not set
CONFIG_SECCOMP=y
@@ -136,11 +139,15 @@ CONFIG_PM=y
# CONFIG_PM_DEBUG is not set
CONFIG_SOFTWARE_SUSPEND=y
CONFIG_PM_STD_PARTITION=""
+CONFIG_SUSPEND_SMP=y
#
# ACPI (Advanced Configuration and Power Interface) Support
#
CONFIG_ACPI=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_SLEEP_PROC_SLEEP=y
CONFIG_ACPI_AC=y
CONFIG_ACPI_BATTERY=y
CONFIG_ACPI_BUTTON=y
@@ -148,6 +155,7 @@ CONFIG_ACPI_BUTTON=y
CONFIG_ACPI_HOTKEY=m
CONFIG_ACPI_FAN=y
CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
CONFIG_ACPI_NUMA=y
# CONFIG_ACPI_ASUS is not set
@@ -158,7 +166,7 @@ CONFIG_ACPI_BLACKLIST_YEAR=2001
CONFIG_ACPI_EC=y
CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
-# CONFIG_ACPI_CONTAINER is not set
+CONFIG_ACPI_CONTAINER=y
#
# CPU Frequency scaling
@@ -293,7 +301,6 @@ CONFIG_IPV6=y
# Network testing
#
# CONFIG_NET_PKTGEN is not set
-# CONFIG_NETFILTER_NETLINK is not set
# CONFIG_HAMRADIO is not set
# CONFIG_IRDA is not set
# CONFIG_BT is not set
@@ -312,6 +319,11 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y
# CONFIG_DEBUG_DRIVER is not set
#
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
+#
# Memory Technology Devices (MTD)
#
# CONFIG_MTD is not set
@@ -354,6 +366,11 @@ CONFIG_IOSCHED_NOOP=y
# CONFIG_IOSCHED_AS is not set
CONFIG_IOSCHED_DEADLINE=y
CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
# CONFIG_ATA_OVER_ETH is not set
#
@@ -450,6 +467,7 @@ CONFIG_BLK_DEV_SD=y
CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_FC_ATTRS is not set
# CONFIG_SCSI_ISCSI_ATTRS is not set
+# CONFIG_SCSI_SAS_ATTRS is not set
#
# SCSI low-level drivers
@@ -469,20 +487,24 @@ CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
# CONFIG_MEGARAID_NEWGEN is not set
# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_MEGARAID_SAS is not set
CONFIG_SCSI_SATA=y
# CONFIG_SCSI_SATA_AHCI is not set
# CONFIG_SCSI_SATA_SVW is not set
CONFIG_SCSI_ATA_PIIX=y
# CONFIG_SCSI_SATA_MV is not set
-# CONFIG_SCSI_SATA_NV is not set
-# CONFIG_SCSI_SATA_PROMISE is not set
+CONFIG_SCSI_SATA_NV=y
+# CONFIG_SCSI_PDC_ADMA is not set
# CONFIG_SCSI_SATA_QSTOR is not set
+# CONFIG_SCSI_SATA_PROMISE is not set
# CONFIG_SCSI_SATA_SX4 is not set
# CONFIG_SCSI_SATA_SIL is not set
+# CONFIG_SCSI_SATA_SIL24 is not set
# CONFIG_SCSI_SATA_SIS is not set
# CONFIG_SCSI_SATA_ULI is not set
CONFIG_SCSI_SATA_VIA=y
# CONFIG_SCSI_SATA_VITESSE is not set
+CONFIG_SCSI_SATA_INTEL_COMBINED=y
# CONFIG_SCSI_BUSLOGIC is not set
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_EATA is not set
@@ -525,6 +547,7 @@ CONFIG_BLK_DEV_DM=y
CONFIG_FUSION=y
CONFIG_FUSION_SPI=y
# CONFIG_FUSION_FC is not set
+# CONFIG_FUSION_SAS is not set
CONFIG_FUSION_MAX_SGE=128
# CONFIG_FUSION_CTL is not set
@@ -564,6 +587,7 @@ CONFIG_NET_ETHERNET=y
CONFIG_MII=y
# CONFIG_HAPPYMEAL is not set
# CONFIG_SUNGEM is not set
+# CONFIG_CASSINI is not set
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=y
# CONFIG_TYPHOON is not set
@@ -740,7 +764,43 @@ CONFIG_LEGACY_PTY_COUNT=256
#
# Watchdog Cards
#
-# CONFIG_WATCHDOG is not set
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_ACQUIRE_WDT is not set
+# CONFIG_ADVANTECH_WDT is not set
+# CONFIG_ALIM1535_WDT is not set
+# CONFIG_ALIM7101_WDT is not set
+# CONFIG_SC520_WDT is not set
+# CONFIG_EUROTECH_WDT is not set
+# CONFIG_IB700_WDT is not set
+# CONFIG_IBMASR is not set
+# CONFIG_WAFER_WDT is not set
+# CONFIG_I6300ESB_WDT is not set
+# CONFIG_I8XX_TCO is not set
+# CONFIG_SC1200_WDT is not set
+# CONFIG_60XX_WDT is not set
+# CONFIG_SBC8360_WDT is not set
+# CONFIG_CPU5_WDT is not set
+# CONFIG_W83627HF_WDT is not set
+# CONFIG_W83877F_WDT is not set
+# CONFIG_W83977F_WDT is not set
+# CONFIG_MACHZ_WDT is not set
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
CONFIG_HW_RANDOM=y
# CONFIG_NVRAM is not set
CONFIG_RTC=y
@@ -767,6 +827,7 @@ CONFIG_MAX_RAW_DEVS=256
# TPM devices
#
# CONFIG_TCG_TPM is not set
+# CONFIG_TELCLOCK is not set
#
# I2C support
@@ -783,6 +844,7 @@ CONFIG_MAX_RAW_DEVS=256
#
CONFIG_HWMON=y
# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_HDAPS is not set
# CONFIG_HWMON_DEBUG_CHIP is not set
#
@@ -886,12 +948,15 @@ CONFIG_USB_UHCI_HCD=y
# USB Device Class drivers
#
# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
-# CONFIG_USB_BLUETOOTH_TTY is not set
# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
#
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
@@ -924,6 +989,7 @@ CONFIG_USB_HIDINPUT=y
# CONFIG_USB_XPAD is not set
# CONFIG_USB_ATI_REMOTE is not set
# CONFIG_USB_KEYSPAN_REMOTE is not set
+# CONFIG_USB_APPLETOUCH is not set
#
# USB Imaging devices
@@ -1005,7 +1071,7 @@ CONFIG_USB_MON=y
#
# CONFIG_EDD is not set
# CONFIG_DELL_RBU is not set
-CONFIG_DCDBAS=m
+# CONFIG_DCDBAS is not set
#
# File systems
@@ -1037,7 +1103,7 @@ CONFIG_INOTIFY=y
# CONFIG_QUOTA is not set
CONFIG_DNOTIFY=y
CONFIG_AUTOFS_FS=y
-# CONFIG_AUTOFS4_FS is not set
+CONFIG_AUTOFS4_FS=y
# CONFIG_FUSE_FS is not set
#
@@ -1068,7 +1134,7 @@ CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
-# CONFIG_RELAYFS_FS is not set
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
@@ -1186,7 +1252,9 @@ CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_DEBUG_KOBJECT is not set
# CONFIG_DEBUG_INFO is not set
CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
# CONFIG_FRAME_POINTER is not set
+# CONFIG_RCU_TORTURE_TEST is not set
CONFIG_INIT_DEBUG=y
# CONFIG_IOMMU_DEBUG is not set
CONFIG_KPROBES=y
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3e6780f..3bf58af 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -36,9 +36,6 @@
#undef WARN_OLD
#undef CORE_DUMP /* probably broken */
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_top, int exec_stack);
-
static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
static int load_aout_library(struct file*);
@@ -314,7 +311,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
current->mm->free_area_cache = TASK_UNMAPPED_BASE;
current->mm->cached_hole_size = 0;
- set_mm_counter(current->mm, rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index d9161e3..830feb2 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -335,7 +335,8 @@ static void elf32_init(struct pt_regs *regs)
me->thread.es = __USER_DS;
}
-int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
+ int executable_stack)
{
unsigned long stack_base;
struct vm_area_struct *mpnt;
@@ -389,6 +390,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int exec
return 0;
}
+EXPORT_SYMBOL(ia32_setup_arg_pages);
static unsigned long
elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
index 419758f..e335bd0 100644
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -12,40 +12,11 @@
#define INCLUDES
#include <linux/syscalls.h>
#include "compat_ioctl.c"
-#include <asm/mtrr.h>
#include <asm/ia32.h>
#define CODE
#include "compat_ioctl.c"
-#ifndef TIOCGDEV
-#define TIOCGDEV _IOR('T',0x32, unsigned int)
-#endif
-static int tiocgdev(unsigned fd, unsigned cmd, unsigned int __user *ptr)
-{
-
- struct file *file;
- struct tty_struct *real_tty;
- int fput_needed, ret;
-
- file = fget_light(fd, &fput_needed);
- if (!file)
- return -EBADF;
-
- ret = -EINVAL;
- if (file->f_op->ioctl != tty_ioctl)
- goto out;
- real_tty = (struct tty_struct *)file->private_data;
- if (!real_tty)
- goto out;
-
- ret = put_user(new_encode_dev(tty_devnum(real_tty)), ptr);
-
-out:
- fput_light(file, fput_needed);
- return ret;
-}
-
#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */
#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */
@@ -85,90 +56,6 @@ static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
return sys_ioctl(fd,cmd,arg);
}
-/* /proc/mtrr ioctls */
-
-
-struct mtrr_sentry32
-{
- compat_ulong_t base; /* Base address */
- compat_uint_t size; /* Size of region */
- compat_uint_t type; /* Type of region */
-};
-
-struct mtrr_gentry32
-{
- compat_ulong_t regnum; /* Register number */
- compat_uint_t base; /* Base address */
- compat_uint_t size; /* Size of region */
- compat_uint_t type; /* Type of region */
-};
-
-#define MTRR_IOCTL_BASE 'M'
-
-#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
-#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
-#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
-#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
-#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
-#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
-#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
-#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
-#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
-#define MTRRIOC32_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
-
-
-static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
- struct mtrr_gentry g;
- struct mtrr_sentry s;
- int get = 0, err = 0;
- struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg;
- mm_segment_t oldfs = get_fs();
-
- switch (cmd) {
-#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break
-#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break
- SET(ADD);
- SET(SET);
- SET(DEL);
- GET(GET);
- SET(KILL);
- SET(ADD_PAGE);
- SET(SET_PAGE);
- SET(DEL_PAGE);
- GET(GET_PAGE);
- SET(KILL_PAGE);
- }
-
- if (get) {
- err = get_user(g.regnum, &g32->regnum);
- err |= get_user(g.base, &g32->base);
- err |= get_user(g.size, &g32->size);
- err |= get_user(g.type, &g32->type);
-
- arg = (unsigned long)&g;
- } else {
- struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg;
- err = get_user(s.base, &s32->base);
- err |= get_user(s.size, &s32->size);
- err |= get_user(s.type, &s32->type);
-
- arg = (unsigned long)&s;
- }
- if (err) return err;
-
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd, cmd, arg);
- set_fs(oldfs);
-
- if (!err && get) {
- err = put_user(g.base, &g32->base);
- err |= put_user(g.size, &g32->size);
- err |= put_user(g.regnum, &g32->regnum);
- err |= put_user(g.type, &g32->type);
- }
- return err;
-}
#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) },
#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
@@ -177,15 +64,8 @@ struct ioctl_trans ioctl_start[] = {
#include <linux/compat_ioctl.h>
#define DECLARES
#include "compat_ioctl.c"
-COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS)
-COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
-COMPATIBLE_IOCTL(BLKRASET)
-COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(FIOQSIZE)
/* And these ioctls need translation */
-HANDLE_IOCTL(TIOCGDEV, tiocgdev)
/* realtime device */
HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl)
HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
@@ -193,17 +73,6 @@ HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
/* take care of sizeof(sizeof()) breakage */
-/* mtrr */
-HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32)
};
int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 66e2821..0903cc1 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -425,7 +425,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
rsp = (unsigned long) ka->sa.sa_restorer;
}
- return (void __user *)((rsp - frame_size) & -8UL);
+ rsp -= frame_size;
+ /* Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+ rsp = ((rsp + 4) & -16ul) - 4;
+ return (void __user *) rsp;
}
int ia32_setup_frame(int sig, struct k_sigaction *ka,
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bcdd0a8..fe4cbd1 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
obj-$(CONFIG_X86_MCE) += mce.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_X86_MSR) += msr.o
@@ -27,7 +28,6 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
-obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
@@ -41,7 +41,6 @@ CFLAGS_vsyscall.o := $(PROFILING) -g0
bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/mach-default/topology.o
-swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 962ad48..c7f4fdd 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -196,7 +196,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
void __init iommu_hole_init(void)
{
int fix, num;
- u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
int valid_agp = 0;
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index b6e7715..18691ce 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -833,6 +833,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
}
+#ifdef CONFIG_X86_MCE_AMD
+void setup_threshold_lvt(unsigned long lvt_off)
+{
+ unsigned int v = 0;
+ unsigned long reg = (lvt_off << 4) + 0x500;
+ v |= THRESHOLD_APIC_VECTOR;
+ apic_write(reg, v);
+}
+#endif /* CONFIG_X86_MCE_AMD */
+
#undef APIC_DIVISOR
/*
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index ab3f87a..17579a1 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -23,8 +23,7 @@
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
-
-extern char _end[];
+#include <asm/sections.h>
/*
* PFN of last memory page.
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7937971..9ff4204 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -612,6 +612,9 @@ retint_kernel:
ENTRY(thermal_interrupt)
apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+ENTRY(threshold_interrupt)
+ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+
#ifdef CONFIG_SMP
ENTRY(reschedule_interrupt)
apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 4592bf2..1529096 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -12,6 +12,7 @@
#include <linux/linkage.h>
#include <linux/threads.h>
+#include <linux/init.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/page.h>
@@ -70,7 +71,7 @@ startup_32:
movl %eax, %cr4
/* Setup early boot stage 4 level pagetables */
- movl $(init_level4_pgt - __START_KERNEL_map), %eax
+ movl $(boot_level4_pgt - __START_KERNEL_map), %eax
movl %eax, %cr3
/* Setup EFER (Extended Feature Enable Register) */
@@ -113,7 +114,7 @@ startup_64:
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(boot_level4_pgt - __START_KERNEL_map), %rax
movq %rax, %cr3
/* Check if nx is implemented */
@@ -240,20 +241,10 @@ ljumpvector:
ENTRY(stext)
ENTRY(_stext)
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
- .fill 255,8,0
- .quad 0x000000000000a007 + __PHYSICAL_START
- .fill 254,8,0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
+ /* This gets initialized in x86_64_start_kernel */
+ .fill 512,8,0
.org 0x2000
ENTRY(level3_ident_pgt)
@@ -270,26 +261,26 @@ ENTRY(level3_kernel_pgt)
.org 0x4000
ENTRY(level2_ident_pgt)
/* 40MB for bootup. */
- .quad 0x0000000000000183
- .quad 0x0000000000200183
- .quad 0x0000000000400183
- .quad 0x0000000000600183
- .quad 0x0000000000800183
- .quad 0x0000000000A00183
- .quad 0x0000000000C00183
- .quad 0x0000000000E00183
- .quad 0x0000000001000183
- .quad 0x0000000001200183
- .quad 0x0000000001400183
- .quad 0x0000000001600183
- .quad 0x0000000001800183
- .quad 0x0000000001A00183
- .quad 0x0000000001C00183
- .quad 0x0000000001E00183
- .quad 0x0000000002000183
- .quad 0x0000000002200183
- .quad 0x0000000002400183
- .quad 0x0000000002600183
+ .quad 0x0000000000000083
+ .quad 0x0000000000200083
+ .quad 0x0000000000400083
+ .quad 0x0000000000600083
+ .quad 0x0000000000800083
+ .quad 0x0000000000A00083
+ .quad 0x0000000000C00083
+ .quad 0x0000000000E00083
+ .quad 0x0000000001000083
+ .quad 0x0000000001200083
+ .quad 0x0000000001400083
+ .quad 0x0000000001600083
+ .quad 0x0000000001800083
+ .quad 0x0000000001A00083
+ .quad 0x0000000001C00083
+ .quad 0x0000000001E00083
+ .quad 0x0000000002000083
+ .quad 0x0000000002200083
+ .quad 0x0000000002400083
+ .quad 0x0000000002600083
/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
.globl temp_boot_pmds
temp_boot_pmds:
@@ -350,6 +341,24 @@ ENTRY(wakeup_level4_pgt)
.quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
+#ifndef CONFIG_HOTPLUG_CPU
+ __INITDATA
+#endif
+ /*
+ * This default setting generates an ident mapping at address 0x100000
+ * and a mapping for the kernel that precisely maps virtual address
+ * 0xffffffff80000000 to physical address 0x000000. (always using
+ * 2Mbyte large pages provided by PAE mode)
+ */
+ .align PAGE_SIZE
+ENTRY(boot_level4_pgt)
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
+ .fill 255,8,0
+ .quad 0x000000000000a007 + __PHYSICAL_START
+ .fill 254,8,0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
+
.data
.align 16
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cf6ab14..b675c5a 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -19,14 +19,15 @@
#include <asm/bootsetup.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
{
- extern char __bss_start[], __bss_end[];
memset(__bss_start, 0,
- (unsigned long) __bss_end - (unsigned long) __bss_start);
+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
}
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
@@ -75,8 +76,6 @@ static void __init setup_boot_cpu_data(void)
boot_cpu_data.x86_mask = eax & 0xf;
}
-extern char _end[];
-
void __init x86_64_start_kernel(char * real_mode_data)
{
char *s;
@@ -86,6 +85,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
set_intr_gate(i, early_idt_handler);
asm volatile("lidt %0" :: "m" (idt_descr));
clear_bss();
+
+ /*
+ * switch to init_level4_pgt from boot_level4_pgt
+ */
+ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+
pda_init(0);
copy_bootdata(real_mode_data);
#ifdef CONFIG_SMP
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index b2a238b..6e5101a 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -492,9 +492,10 @@ void invalidate_interrupt5(void);
void invalidate_interrupt6(void);
void invalidate_interrupt7(void);
void thermal_interrupt(void);
+void threshold_interrupt(void);
void i8254_timer_resume(void);
-static void setup_timer(void)
+static void setup_timer_hardware(void)
{
outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
@@ -505,17 +506,17 @@ static void setup_timer(void)
static int timer_resume(struct sys_device *dev)
{
- setup_timer();
+ setup_timer_hardware();
return 0;
}
void i8254_timer_resume(void)
{
- setup_timer();
+ setup_timer_hardware();
}
static struct sysdev_class timer_sysclass = {
- set_kset_name("timer"),
+ set_kset_name("timer_pit"),
.resume = timer_resume,
};
@@ -580,6 +581,7 @@ void __init init_IRQ(void)
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
#endif
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
@@ -594,7 +596,7 @@ void __init init_IRQ(void)
* Set the clock to HZ Hz, we already have a valid
* vector now:
*/
- setup_timer();
+ setup_timer_hardware();
if (!acpi_ioapic)
setup_irq(2, &irq2);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c8eee20..97154ab 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -57,7 +57,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
*/
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
/*
@@ -85,6 +85,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
int pin; \
struct irq_pin_list *entry = irq_2_pin + irq; \
\
+ BUG_ON(irq >= NR_IRQS); \
for (;;) { \
unsigned int reg; \
pin = entry->pin; \
@@ -127,6 +128,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
}
#endif
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -137,6 +140,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
static int first_free_entry = NR_IRQS;
struct irq_pin_list *entry = irq_2_pin + irq;
+ BUG_ON(irq >= NR_IRQS);
while (entry->next)
entry = irq_2_pin + entry->next;
@@ -144,7 +148,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry->next = first_free_entry;
entry = irq_2_pin + entry->next;
if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: whoops");
+ panic("io_apic.c: ran out of irq_2_pin entries!");
}
entry->apic = apic;
entry->pin = pin;
@@ -420,6 +424,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
best_guess = irq;
}
}
+ BUG_ON(best_guess >= NR_IRQS);
return best_guess;
}
@@ -610,6 +615,64 @@ static inline int irq_trigger(int idx)
return MPBIOS_trigger(idx);
}
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+ int i, tries, vector;
+
+ BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+ if (platform_legacy_irq(gsi))
+ return gsi;
+
+ if (gsi_2_irq[gsi] != 0xFF)
+ return (int)gsi_2_irq[gsi];
+
+ tries = NR_IRQS;
+ try_again:
+ vector = assign_irq_vector(gsi);
+
+ /*
+ * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+ * use of vector and if found, return that IRQ. However, we never want
+ * to share legacy IRQs, which usually have a different trigger mode
+ * than PCI.
+ */
+ for (i = 0; i < NR_IRQS; i++)
+ if (IO_APIC_VECTOR(i) == vector)
+ break;
+ if (platform_legacy_irq(i)) {
+ if (--tries >= 0) {
+ IO_APIC_VECTOR(i) = 0;
+ goto try_again;
+ }
+ panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+ }
+ if (i < NR_IRQS) {
+ gsi_2_irq[gsi] = i;
+ printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+ gsi, vector, i);
+ return i;
+ }
+
+ i = next_irq++;
+ BUG_ON(i >= NR_IRQS);
+ gsi_2_irq[gsi] = i;
+ IO_APIC_VECTOR(i) = vector;
+ printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+ gsi, vector, i);
+ return i;
+}
+
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
@@ -639,6 +702,7 @@ static int pin_2_irq(int idx, int apic, int pin)
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
+ irq = gsi_irq_sharing(irq);
break;
}
default:
@@ -648,6 +712,7 @@ static int pin_2_irq(int idx, int apic, int pin)
break;
}
}
+ BUG_ON(irq >= NR_IRQS);
/*
* PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -663,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin)
}
}
}
+ BUG_ON(irq >= NR_IRQS);
return irq;
}
@@ -690,8 +756,8 @@ int assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
- BUG_ON(irq >= NR_IRQ_VECTORS);
- if (IO_APIC_VECTOR(irq) > 0)
+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
return IO_APIC_VECTOR(irq);
next:
current_vector += 8;
@@ -699,9 +765,8 @@ next:
goto next;
if (current_vector >= FIRST_SYSTEM_VECTOR) {
- offset++;
- if (!(offset%8))
- return -ENOSPC;
+ /* If we run out of vectors on large boxen, must share them. */
+ offset = (offset + 1) % 8;
current_vector = FIRST_DEVICE_VECTOR + offset;
}
@@ -1917,6 +1982,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
entry.polarity = active_high_low;
entry.mask = 1; /* Disabled (masked) */
+ irq = gsi_irq_sharing(irq);
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index df08c43..dddeb67 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -34,7 +34,6 @@
#include <linux/config.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
-#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
@@ -44,17 +43,10 @@
#include <asm/kdebug.h>
static DECLARE_MUTEX(kprobe_mutex);
-
-static struct kprobe *current_kprobe;
-static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
-static struct kprobe *kprobe_prev;
-static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
-static struct pt_regs jprobe_saved_regs;
-static long *jprobe_saved_rsp;
void jprobe_return_end(void);
-/* copy of the kernel stack at the probe fire time */
-static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
/*
* returns non-zero if opcode modifies the interrupt flag.
@@ -77,9 +69,9 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
/* insn: must be on special executable page on x86_64. */
- up(&kprobe_mutex);
- p->ainsn.insn = get_insn_slot();
down(&kprobe_mutex);
+ p->ainsn.insn = get_insn_slot();
+ up(&kprobe_mutex);
if (!p->ainsn.insn) {
return -ENOMEM;
}
@@ -231,34 +223,35 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
- up(&kprobe_mutex);
- free_insn_slot(p->ainsn.insn);
down(&kprobe_mutex);
+ free_insn_slot(p->ainsn.insn);
+ up(&kprobe_mutex);
}
-static inline void save_previous_kprobe(void)
+static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- kprobe_prev = current_kprobe;
- kprobe_status_prev = kprobe_status;
- kprobe_old_rflags_prev = kprobe_old_rflags;
- kprobe_saved_rflags_prev = kprobe_saved_rflags;
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
+ kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
}
-static inline void restore_previous_kprobe(void)
+static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- current_kprobe = kprobe_prev;
- kprobe_status = kprobe_status_prev;
- kprobe_old_rflags = kprobe_old_rflags_prev;
- kprobe_saved_rflags = kprobe_saved_rflags_prev;
+ __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
+ kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
}
-static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs)
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
- current_kprobe = p;
- kprobe_saved_rflags = kprobe_old_rflags
+ __get_cpu_var(current_kprobe) = p;
+ kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
= (regs->eflags & (TF_MASK | IF_MASK));
if (is_IF_modifier(p->ainsn.insn))
- kprobe_saved_rflags &= ~IF_MASK;
+ kcb->kprobe_saved_rflags &= ~IF_MASK;
}
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -272,6 +265,7 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->rip = (unsigned long)p->ainsn.insn;
}
+/* Called with kretprobe_lock held */
void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
struct pt_regs *regs)
{
@@ -292,32 +286,30 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
}
}
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
int __kprobes kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
int ret = 0;
kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+ struct kprobe_ctlblk *kcb;
- /* We're in an interrupt, but this is clear and BUG()-safe. */
+ /*
+ * We don't want to be preempted for the entire
+ * duration of kprobe processing
+ */
preempt_disable();
+ kcb = get_kprobe_ctlblk();
/* Check we're not actually recursing */
if (kprobe_running()) {
- /* We *are* holding lock here, so this is safe.
- Disarm the probe we just hit, and ignore it. */
p = get_kprobe(addr);
if (p) {
- if (kprobe_status == KPROBE_HIT_SS &&
+ if (kcb->kprobe_status == KPROBE_HIT_SS &&
*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
regs->eflags &= ~TF_MASK;
- regs->eflags |= kprobe_saved_rflags;
- unlock_kprobes();
+ regs->eflags |= kcb->kprobe_saved_rflags;
goto no_kprobe;
- } else if (kprobe_status == KPROBE_HIT_SSDONE) {
+ } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/* TODO: Provide re-entrancy from
* post_kprobes_handler() and avoid exception
* stack corruption while single-stepping on
@@ -325,6 +317,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
*/
arch_disarm_kprobe(p);
regs->rip = (unsigned long)p->addr;
+ reset_current_kprobe();
ret = 1;
} else {
/* We have reentered the kprobe_handler(), since
@@ -334,27 +327,24 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
* of the new probe without calling any user
* handlers.
*/
- save_previous_kprobe();
- set_current_kprobe(p, regs);
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
p->nmissed++;
prepare_singlestep(p, regs);
- kprobe_status = KPROBE_REENTER;
+ kcb->kprobe_status = KPROBE_REENTER;
return 1;
}
} else {
- p = current_kprobe;
+ p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
goto ss_probe;
}
}
- /* If it's not ours, can't be delete race, (we hold lock). */
goto no_kprobe;
}
- lock_kprobes();
p = get_kprobe(addr);
if (!p) {
- unlock_kprobes();
if (*addr != BREAKPOINT_INSTRUCTION) {
/*
* The breakpoint instruction was removed right
@@ -372,8 +362,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
goto no_kprobe;
}
- kprobe_status = KPROBE_HIT_ACTIVE;
- set_current_kprobe(p, regs);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
@@ -381,7 +371,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
ss_probe:
prepare_singlestep(p, regs);
- kprobe_status = KPROBE_HIT_SS;
+ kcb->kprobe_status = KPROBE_HIT_SS;
return 1;
no_kprobe:
@@ -409,9 +399,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
struct kretprobe_instance *ri = NULL;
struct hlist_head *head;
struct hlist_node *node, *tmp;
- unsigned long orig_ret_address = 0;
+ unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+ spin_lock_irqsave(&kretprobe_lock, flags);
head = kretprobe_inst_table_head(current);
/*
@@ -450,13 +441,14 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
regs->rip = orig_ret_address;
- unlock_kprobes();
+ reset_current_kprobe();
+ spin_unlock_irqrestore(&kretprobe_lock, flags);
preempt_enable_no_resched();
/*
* By returning a non-zero value, we are telling
- * kprobe_handler() that we have handled unlocking
- * and re-enabling preemption.
+ * kprobe_handler() that we don't want the post_handler
+ * to run (and have re-enabled preemption)
*/
return 1;
}
@@ -483,7 +475,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
*/
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p,
+ struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
unsigned long *tos = (unsigned long *)regs->rsp;
unsigned long next_rip = 0;
@@ -498,7 +491,7 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
switch (*insn) {
case 0x9c: /* pushfl */
*tos &= ~(TF_MASK | IF_MASK);
- *tos |= kprobe_old_rflags;
+ *tos |= kcb->kprobe_old_rflags;
break;
case 0xc3: /* ret/lret */
case 0xcb:
@@ -537,30 +530,28 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
}
}
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function. And we hold kprobe lock.
- */
int __kprobes post_kprobe_handler(struct pt_regs *regs)
{
- if (!kprobe_running())
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (!cur)
return 0;
- if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
- kprobe_status = KPROBE_HIT_SSDONE;
- current_kprobe->post_handler(current_kprobe, regs, 0);
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
}
- resume_execution(current_kprobe, regs);
- regs->eflags |= kprobe_saved_rflags;
+ resume_execution(cur, regs, kcb);
+ regs->eflags |= kcb->kprobe_saved_rflags;
/* Restore the original saved kprobes variables and continue. */
- if (kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe();
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
goto out;
- } else {
- unlock_kprobes();
}
+ reset_current_kprobe();
out:
preempt_enable_no_resched();
@@ -575,18 +566,19 @@ out:
return 1;
}
-/* Interrupts disabled, kprobe_lock held. */
int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
- if (current_kprobe->fault_handler
- && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
return 1;
- if (kprobe_status & KPROBE_HIT_SS) {
- resume_execution(current_kprobe, regs);
- regs->eflags |= kprobe_old_rflags;
+ if (kcb->kprobe_status & KPROBE_HIT_SS) {
+ resume_execution(cur, regs, kcb);
+ regs->eflags |= kcb->kprobe_old_rflags;
- unlock_kprobes();
+ reset_current_kprobe();
preempt_enable_no_resched();
}
return 0;
@@ -599,39 +591,41 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct die_args *args = (struct die_args *)data;
+ int ret = NOTIFY_DONE;
+
switch (val) {
case DIE_INT3:
if (kprobe_handler(args->regs))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
break;
case DIE_DEBUG:
if (post_kprobe_handler(args->regs))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
break;
case DIE_GPF:
- if (kprobe_running() &&
- kprobe_fault_handler(args->regs, args->trapnr))
- return NOTIFY_STOP;
- break;
case DIE_PAGE_FAULT:
+ /* kprobe_running() needs smp_processor_id() */
+ preempt_disable();
if (kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
+ preempt_enable();
break;
default:
break;
}
- return NOTIFY_DONE;
+ return ret;
}
int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- jprobe_saved_regs = *regs;
- jprobe_saved_rsp = (long *) regs->rsp;
- addr = (unsigned long)jprobe_saved_rsp;
+ kcb->jprobe_saved_regs = *regs;
+ kcb->jprobe_saved_rsp = (long *) regs->rsp;
+ addr = (unsigned long)(kcb->jprobe_saved_rsp);
/*
* As Linus pointed out, gcc assumes that the callee
* owns the argument space and could overwrite it, e.g.
@@ -639,7 +633,8 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* we also save and restore enough stack bytes to cover
* the argument area.
*/
- memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
+ memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+ MIN_STACK_SIZE(addr));
regs->eflags &= ~IF_MASK;
regs->rip = (unsigned long)(jp->entry);
return 1;
@@ -647,36 +642,40 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
void __kprobes jprobe_return(void)
{
- preempt_enable_no_resched();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
asm volatile (" xchg %%rbx,%%rsp \n"
" int3 \n"
" .globl jprobe_return_end \n"
" jprobe_return_end: \n"
" nop \n"::"b"
- (jprobe_saved_rsp):"memory");
+ (kcb->jprobe_saved_rsp):"memory");
}
int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->rip - 1);
- unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
+ unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
struct jprobe *jp = container_of(p, struct jprobe, kp);
if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
- if ((long *)regs->rsp != jprobe_saved_rsp) {
+ if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
struct pt_regs *saved_regs =
- container_of(jprobe_saved_rsp, struct pt_regs, rsp);
+ container_of(kcb->jprobe_saved_rsp,
+ struct pt_regs, rsp);
printk("current rsp %p does not match saved rsp %p\n",
- (long *)regs->rsp, jprobe_saved_rsp);
+ (long *)regs->rsp, kcb->jprobe_saved_rsp);
printk("Saved registers for jprobe %p\n", jp);
show_registers(saved_regs);
printk("Current registers\n");
show_registers(regs);
BUG();
}
- *regs = jprobe_saved_regs;
- memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
+ *regs = kcb->jprobe_saved_regs;
+ memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
MIN_STACK_SIZE(stack_addr));
+ preempt_enable_no_resched();
return 1;
}
return 0;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 08203b0..183dc61 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
-static int mce_bootlog;
+static int mce_bootlog = 1;
/*
* Lockless MCE logging infrastructure.
@@ -54,9 +54,12 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;
mce->finished = 0;
- smp_wmb();
+ wmb();
for (;;) {
entry = rcu_dereference(mcelog.next);
+ /* The rmb forces the compiler to reload next in each
+ iteration */
+ rmb();
for (;;) {
/* When the buffer fills up discard new entries. Assume
that the earlier errors are the more interesting. */
@@ -69,6 +72,7 @@ void mce_log(struct mce *mce)
entry++;
continue;
}
+ break;
}
smp_rmb();
next = entry + 1;
@@ -76,9 +80,9 @@ void mce_log(struct mce *mce)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- smp_wmb();
+ wmb();
mcelog.entry[entry].finished = 1;
- smp_wmb();
+ wmb();
if (!test_and_set_bit(0, &console_logged))
notify_user = 1;
@@ -343,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
clear_bit(10, &bank[4]);
+ /* Lots of broken BIOS around that don't clear them
+ by default and leave crap in there. Don't log. */
+ mce_bootlog = 0;
}
+
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
@@ -352,6 +360,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
break;
+ case X86_VENDOR_AMD:
+ mce_amd_feature_init(c);
+ break;
default:
break;
}
@@ -491,16 +502,16 @@ static int __init mcheck_disable(char *str)
/* mce=off disables machine check. Note you can reenable it later
using sysfs.
mce=TOLERANCELEVEL (number, see above)
- mce=bootlog Log MCEs from before booting. Disabled by default to work
- around buggy BIOS that leave bogus MCEs. */
+ mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
if (*str == '=')
str++;
if (!strcmp(str, "off"))
mce_dont_init = 1;
- else if (!strcmp(str, "bootlog"))
- mce_bootlog = 1;
+ else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+ mce_bootlog = str[0] == 'b';
else if (isdigit(str[0]))
get_option(&str, &tolerant);
else
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
new file mode 100644
index 0000000..1f76175
--- /dev/null
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -0,0 +1,538 @@
+/*
+ * (c) 2005 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ *
+ * Support : jacob.shin@amd.com
+ *
+ * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
+ * MC4_MISC0 exists per physical processor.
+ *
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/percpu.h>
+
+#define PFX "mce_threshold: "
+#define VERSION "version 1.00.9"
+#define NR_BANKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_OVERFLOW 0x0001000000000000L
+
+struct threshold_bank {
+ unsigned int cpu;
+ u8 bank;
+ u8 interrupt_enable;
+ u16 threshold_limit;
+ struct kobject kobj;
+};
+
+static struct threshold_bank threshold_defaults = {
+ .interrupt_enable = 0,
+ .threshold_limit = THRESHOLD_MAX,
+};
+
+#ifdef CONFIG_SMP
+static unsigned char shared_bank[NR_BANKS] = {
+ 0, 0, 0, 0, 1
+};
+#endif
+
+static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+
+/*
+ * CPU Initialization
+ */
+
+/* must be called with correct cpu affinity */
+static void threshold_restart_bank(struct threshold_bank *b,
+ int reset, u16 old_limit)
+{
+ u32 mci_misc_hi, mci_misc_lo;
+
+ rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+
+ if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ reset = 1; /* limit cannot be lower than err count */
+
+ if (reset) { /* reset err count and overflow bit */
+ mci_misc_hi =
+ (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ (THRESHOLD_MAX - b->threshold_limit);
+ } else if (old_limit) { /* change limit w/o reset */
+ int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+ (old_limit - b->threshold_limit);
+ mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ b->interrupt_enable ?
+ (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+ (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+
+ mci_misc_hi |= MASK_COUNT_EN_HI;
+ wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+}
+
+void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ int bank;
+ u32 mci_misc_lo, mci_misc_hi;
+ unsigned int cpu = smp_processor_id();
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
+
+ /* !valid, !counter present, bios locked */
+ if (!(mci_misc_hi & MASK_VALID_HI) ||
+ !(mci_misc_hi & MASK_VALID_HI >> 1) ||
+ (mci_misc_hi & MASK_VALID_HI >> 2))
+ continue;
+
+ per_cpu(bank_map, cpu) |= (1 << bank);
+
+#ifdef CONFIG_SMP
+ if (shared_bank[bank] && cpu_core_id[cpu])
+ continue;
+#endif
+
+ setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
+ threshold_defaults.cpu = cpu;
+ threshold_defaults.bank = bank;
+ threshold_restart_bank(&threshold_defaults, 0, 0);
+ }
+}
+
+/*
+ * APIC Interrupt Handler
+ */
+
+/*
+ * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
+ * the interrupt goes off when error_count reaches threshold_limit.
+ * the handler will simply log mcelog w/ software defined bank number.
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+ int bank;
+ struct mce m;
+
+ ack_APIC_irq();
+ irq_enter();
+
+ memset(&m, 0, sizeof(m));
+ rdtscll(m.tsc);
+ m.cpu = smp_processor_id();
+
+ /* assume first bank caused it */
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ m.bank = MCE_THRESHOLD_BASE + bank;
+ rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
+
+ if (m.misc & MASK_OVERFLOW) {
+ mce_log(&m);
+ goto out;
+ }
+ }
+ out:
+ irq_exit();
+}
+
+/*
+ * Sysfs Interface
+ */
+
+static struct sysdev_class threshold_sysclass = {
+ set_kset_name("threshold"),
+};
+
+static DEFINE_PER_CPU(struct sys_device, device_threshold);
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t(*show) (struct threshold_bank *, char *);
+ ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
+};
+
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
+static cpumask_t affinity_set(unsigned int cpu)
+{
+ cpumask_t oldmask = current->cpus_allowed;
+ cpumask_t newmask = CPU_MASK_NONE;
+ cpu_set(cpu, newmask);
+ set_cpus_allowed(current, newmask);
+ return oldmask;
+}
+
+static void affinity_restore(cpumask_t oldmask)
+{
+ set_cpus_allowed(current, oldmask);
+}
+
+#define SHOW_FIELDS(name) \
+ static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
+ { \
+ return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+ }
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t store_interrupt_enable(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ char *end;
+ cpumask_t oldmask;
+ unsigned long new = simple_strtoul(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ b->interrupt_enable = !!new;
+
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 0, 0);
+ affinity_restore(oldmask);
+
+ return end - buf;
+}
+
+static ssize_t store_threshold_limit(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ char *end;
+ cpumask_t oldmask;
+ u16 old;
+ unsigned long new = simple_strtoul(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+ old = b->threshold_limit;
+ b->threshold_limit = new;
+
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 0, old);
+ affinity_restore(oldmask);
+
+ return end - buf;
+}
+
+static ssize_t show_error_count(struct threshold_bank *b, char *buf)
+{
+ u32 high, low;
+ cpumask_t oldmask;
+ oldmask = affinity_set(b->cpu);
+ rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
+ affinity_restore(oldmask);
+ return sprintf(buf, "%x\n",
+ (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+}
+
+static ssize_t store_error_count(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ cpumask_t oldmask;
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 1, 0);
+ affinity_restore(oldmask);
+ return 1;
+}
+
+#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+};
+
+#define ATTR_FIELDS(name) \
+ static struct threshold_attr name = \
+ THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
+
+ATTR_FIELDS(interrupt_enable);
+ATTR_FIELDS(threshold_limit);
+ATTR_FIELDS(error_count);
+
+static struct attribute *default_attrs[] = {
+ &interrupt_enable.attr,
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL
+};
+
+#define to_bank(k) container_of(k,struct threshold_bank,kobj)
+#define to_attr(a) container_of(a,struct threshold_attr,attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_bank *b = to_bank(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+ ret = a->show ? a->show(b, buf) : -EIO;
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_bank *b = to_bank(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+ return ret;
+}
+
+static struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
+};
+
+/* symlinks sibling shared banks to first core. first core owns dir/files. */
+static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
+{
+ int err = 0;
+ struct threshold_bank *b = 0;
+
+#ifdef CONFIG_SMP
+ if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */
+ char name[16];
+ unsigned lcpu = first_cpu(cpu_core_map[cpu]);
+ if (cpu_core_id[lcpu])
+ goto out; /* first core not up yet */
+
+ b = per_cpu(threshold_banks, lcpu)[bank];
+ if (!b)
+ goto out;
+ sprintf(name, "bank%i", bank);
+ err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
+ &b->kobj, name);
+ if (err)
+ goto out;
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ goto out;
+ }
+#endif
+
+ b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memset(b, 0, sizeof(struct threshold_bank));
+
+ b->cpu = cpu;
+ b->bank = bank;
+ b->interrupt_enable = 0;
+ b->threshold_limit = THRESHOLD_MAX;
+ kobject_set_name(&b->kobj, "bank%i", bank);
+ b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
+ b->kobj.ktype = &threshold_ktype;
+
+ err = kobject_register(&b->kobj);
+ if (err) {
+ kfree(b);
+ goto out;
+ }
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ out:
+ return err;
+}
+
+/* create dir/files for all valid threshold banks */
+static __cpuinit int threshold_create_device(unsigned int cpu)
+{
+ int bank;
+ int err = 0;
+
+ per_cpu(device_threshold, cpu).id = cpu;
+ per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
+ err = sysdev_register(&per_cpu(device_threshold, cpu));
+ if (err)
+ goto out;
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto out;
+ }
+ out:
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * let's be hotplug friendly.
+ * in case of multiple core processors, the first core always takes ownership
+ * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
+ */
+
+/* cpu hotplug call removes all symlinks before first core dies */
+static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+{
+ struct threshold_bank *b;
+ char name[16];
+
+ b = per_cpu(threshold_banks, cpu)[bank];
+ if (!b)
+ return;
+ if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
+ sprintf(name, "bank%i", bank);
+ sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
+ per_cpu(threshold_banks, cpu)[bank] = 0;
+ } else {
+ kobject_unregister(&b->kobj);
+ kfree(per_cpu(threshold_banks, cpu)[bank]);
+ }
+}
+
+static __cpuinit void threshold_remove_device(unsigned int cpu)
+{
+ int bank;
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ threshold_remove_bank(cpu, bank);
+ }
+ sysdev_unregister(&per_cpu(device_threshold, cpu));
+}
+
+/* link all existing siblings when first core comes up */
+static __cpuinit int threshold_create_symlinks(unsigned int cpu)
+{
+ int bank, err = 0;
+ unsigned int lcpu = 0;
+
+ if (cpu_core_id[cpu])
+ return 0;
+ for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+ if (lcpu == cpu)
+ continue;
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ if (!shared_bank[bank])
+ continue;
+ err = threshold_create_bank(lcpu, bank);
+ }
+ }
+ return err;
+}
+
+/* remove all symlinks before first core dies. */
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+ int bank;
+ unsigned int lcpu = 0;
+ if (cpu_core_id[cpu])
+ return;
+ for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+ if (lcpu == cpu)
+ continue;
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ if (!shared_bank[bank])
+ continue;
+ threshold_remove_bank(lcpu, bank);
+ }
+ }
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static __cpuinit void threshold_create_symlinks(unsigned int cpu)
+{
+}
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+}
+static void threshold_remove_device(unsigned int cpu)
+{
+}
+#endif
+
+/* get notified when a cpu comes on/off */
+static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ /* cpu was unsigned int to begin with */
+ unsigned int cpu = (unsigned long)hcpu;
+
+ if (cpu >= NR_CPUS)
+ goto out;
+
+ switch (action) {
+ case CPU_ONLINE:
+ threshold_create_device(cpu);
+ threshold_create_symlinks(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ threshold_remove_symlinks(cpu);
+ break;
+ case CPU_DOWN_FAILED:
+ threshold_create_symlinks(cpu);
+ break;
+ case CPU_DEAD:
+ threshold_remove_device(cpu);
+ break;
+ default:
+ break;
+ }
+ out:
+ return NOTIFY_OK;
+}
+
+static struct notifier_block threshold_cpu_notifier = {
+ .notifier_call = threshold_cpu_callback,
+};
+
+static __init int threshold_init_device(void)
+{
+ int err;
+ int lcpu = 0;
+
+ err = sysdev_class_register(&threshold_sysclass);
+ if (err)
+ goto out;
+
+ /* to hit CPUs online before the notifier is up */
+ for_each_online_cpu(lcpu) {
+ err = threshold_create_device(lcpu);
+ if (err)
+ goto out;
+ }
+ register_cpu_notifier(&threshold_cpu_notifier);
+
+ out:
+ return err;
+}
+
+device_initcall(threshold_init_device);
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index f16d38d..1105250 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -42,7 +42,7 @@ int acpi_found_madt;
* Various Linux-internal data structures created from the
* MP-table.
*/
-int apic_version [MAX_APICS];
+unsigned char apic_version [MAX_APICS];
unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
@@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0;
/* Processor that is doing the boot up */
unsigned int boot_cpu_id = -1U;
/* Internal processor count */
-static unsigned int num_processors = 0;
+unsigned int num_processors __initdata = 0;
+
+unsigned disabled_cpus __initdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
@@ -106,11 +108,14 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info (struct mpc_config_processor *m)
{
- int ver, cpu;
+ int cpu;
+ unsigned char ver;
static int found_bsp=0;
- if (!(m->mpc_cpuflag & CPU_ENABLED))
+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
return;
+ }
printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
m->mpc_apicid,
@@ -129,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
}
cpu = num_processors++;
-
- if (m->mpc_apicid > MAX_APICS) {
+
+#if MAX_APICS < 255
+ if ((int)m->mpc_apicid > MAX_APICS) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
m->mpc_apicid, MAX_APICS);
return;
}
+#endif
ver = m->mpc_apicver;
physid_set(m->mpc_apicid, phys_cpu_present_map);
@@ -218,7 +225,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ if (++mp_irq_entries >= MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
@@ -549,7 +556,7 @@ void __init get_smp_config (void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
+ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
smp_found_config = 0;
printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index cf0a031..2e28e85 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -187,7 +187,7 @@ static void flush_gart(struct device *dev)
/* Allocate DMA memory on node near device */
noinline
-static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
+static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
struct page *page;
int node;
@@ -204,7 +204,7 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
*/
void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
- unsigned gfp)
+ gfp_t gfp)
{
void *memory;
unsigned long dma_mask = 0;
@@ -220,6 +220,12 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
uses the normal dma_mask for alloc_coherent. */
dma_mask &= *dev->dma_mask;
+ /* Why <=? Even when the mask is smaller than 4GB it is often larger
+ than 16MB and in this case we have a chance of finding fitting memory
+ in the next higher zone first. If not retry with true GFP_DMA. -AK */
+ if (dma_mask <= 0xffffffff)
+ gfp |= GFP_DMA32;
+
again:
memory = dma_alloc_pages(dev, gfp, get_order(size));
if (memory == NULL)
@@ -245,7 +251,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
if (!(gfp & GFP_DMA)) {
- gfp |= GFP_DMA;
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
return NULL;
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 67d90b8..5a981dc 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -24,7 +24,7 @@ EXPORT_SYMBOL(iommu_sac_force);
*/
void *dma_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, unsigned gfp)
+ dma_addr_t *dma_handle, gfp_t gfp)
{
void *ret;
u64 mask;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index b5a89c0..7519fc5 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt);
*/
void default_idle(void)
{
+ local_irq_enable();
+
if (!atomic_read(&hlt_counter)) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb__after_clear_bit();
+ while (!need_resched()) {
+ local_irq_disable();
+ if (!need_resched())
+ safe_halt();
+ else
+ local_irq_enable();
+ }
+ set_thread_flag(TIF_POLLING_NRFLAG);
+ } else {
+ while (!need_resched())
+ cpu_relax();
}
}
@@ -102,30 +112,16 @@ void default_idle(void)
*/
static void poll_idle (void)
{
- int oldval;
-
local_irq_enable();
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0,%1;"
- "rep; nop;"
- "je 2b;"
- : :
- "i" (_TIF_NEED_RESCHED),
- "m" (current_thread_info()->flags));
- clear_thread_flag(TIF_POLLING_NRFLAG);
- } else {
- set_need_resched();
- }
+ asm volatile(
+ "2:"
+ "testl %0,%1;"
+ "rep; nop;"
+ "je 2b;"
+ : :
+ "i" (_TIF_NEED_RESCHED),
+ "m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
@@ -148,7 +144,8 @@ void cpu_idle_wait(void)
do {
ssleep(1);
for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+ if (cpu_isset(cpu, map) &&
+ !per_cpu(cpu_idle_state, cpu))
cpu_clear(cpu, map);
}
cpus_and(map, map, cpu_online_map);
@@ -187,6 +184,8 @@ static inline void play_dead(void)
*/
void cpu_idle (void)
{
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
@@ -204,7 +203,9 @@ void cpu_idle (void)
idle();
}
+ preempt_enable_no_resched();
schedule();
+ preempt_disable();
}
}
@@ -219,15 +220,12 @@ static void mwait_idle(void)
{
local_irq_enable();
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ while (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (need_resched())
+ break;
+ __mwait(0, 0);
}
}
@@ -278,7 +276,8 @@ void __show_regs(struct pt_regs * regs)
system_utsname.version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -352,13 +351,6 @@ void flush_thread(void)
struct task_struct *tsk = current;
struct thread_info *t = current_thread_info();
- /*
- * Remove function-return probe instances associated with this task
- * and put them back on the free list. Do not insert an exit probe for
- * this function, it will be disabled by kprobe_flush_task if you do.
- */
- kprobe_flush_task(tsk);
-
if (t->flags & _TIF_ABI_PENDING)
t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
@@ -430,15 +422,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
struct pt_regs * childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+ childregs = ((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
*childregs = *regs;
childregs->rax = 0;
childregs->rsp = rsp;
- if (rsp == ~0UL) {
+ if (rsp == ~0UL)
childregs->rsp = (unsigned long)childregs;
- }
p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
@@ -460,7 +451,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
}
/*
@@ -497,7 +489,8 @@ out:
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
*/
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
@@ -568,7 +561,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
- write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+ write_pda(kernelstack,
+ (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
/*
* Now maybe reload the debug registers
@@ -649,7 +643,9 @@ asmlinkage long sys_fork(struct pt_regs *regs)
return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
newsp = regs->rsp;
@@ -685,7 +681,8 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
fp = *(u64 *)(p->thread.rsp);
do {
- if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
return 0;
rip = *(u64 *)(fp+8);
if (!in_sched_functions(rip))
@@ -720,8 +717,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+ load_gs_index(0);
+ ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
@@ -738,7 +735,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
@@ -748,8 +745,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
put_cpu();
@@ -758,9 +755,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
unsigned long base;
if (task->thread.fsindex == FS_TLS_SEL)
base = read_32bit_tls(task, FS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_FS_BASE, base);
- } else
+ else
base = task->thread.fs;
ret = put_user(base, (unsigned long __user *)addr);
break;
@@ -769,9 +766,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
unsigned long base;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
- } else
+ else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index bbf64b5..a87b6ce 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -313,48 +313,11 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
}
-asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
- struct task_struct *child;
long i, ret;
unsigned ui;
- /* This lock_kernel fixes a subtle race with suid exec */
- lock_kernel();
- ret = -EPERM;
- if (request == PTRACE_TRACEME) {
- /* are we already being traced? */
- if (current->ptrace & PT_PTRACED)
- goto out;
- ret = security_ptrace(current->parent, current);
- if (ret)
- goto out;
- /* set the ptrace bit in the process flags. */
- current->ptrace |= PT_PTRACED;
- ret = 0;
- goto out;
- }
- ret = -ESRCH;
- read_lock(&tasklist_lock);
- child = find_task_by_pid(pid);
- if (child)
- get_task_struct(child);
- read_unlock(&tasklist_lock);
- if (!child)
- goto out;
-
- ret = -EPERM;
- if (pid == 1) /* you may not mess with init */
- goto out_tsk;
-
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
- goto out_tsk;
- }
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
- if (ret < 0)
- goto out_tsk;
-
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -608,10 +571,6 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
ret = ptrace_request(child, request, addr, data);
break;
}
-out_tsk:
- put_task_struct(child);
-out:
- unlock_kernel();
return ret;
}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 47f9568..75235ed 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -77,6 +77,7 @@ static inline void kb_wait(void)
void machine_shutdown(void)
{
+ unsigned long flags;
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
int reboot_cpu_id;
@@ -98,7 +99,7 @@ void machine_shutdown(void)
smp_send_stop();
#endif
- local_irq_disable();
+ local_irq_save(flags);
#ifndef CONFIG_SMP
disable_local_APIC();
@@ -106,7 +107,7 @@ void machine_shutdown(void)
disable_IO_APIC();
- local_irq_enable();
+ local_irq_restore(flags);
}
void machine_emergency_restart(void)
@@ -120,7 +121,7 @@ void machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
- for (i=0; i<100; i++) {
+ for (i=0; i<10; i++) {
kb_wait();
udelay(50);
outb(0xfe,0x64); /* pulse reset low */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 238f73e..750e01d 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -60,6 +60,7 @@
#include <asm/setup.h>
#include <asm/mach_apic.h>
#include <asm/numa.h>
+#include <asm/sections.h>
/*
* Machine setup..
@@ -103,7 +104,6 @@ struct edid_info edid_info;
struct e820map e820;
extern int root_mountflags;
-extern char _text, _etext, _edata, _end;
char command_line[COMMAND_LINE_SIZE];
@@ -412,7 +412,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long bootmap_size, bootmap;
- memory_present(0, start_pfn, end_pfn);
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
if (bootmap == -1L)
@@ -571,6 +570,8 @@ void __init setup_arch(char **cmdline_p)
init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+ zap_low_mappings(0);
+
#ifdef CONFIG_ACPI
/*
* Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
@@ -657,8 +658,6 @@ void __init setup_arch(char **cmdline_p)
}
#endif
- sparse_init();
-
paging_init();
check_ioapic();
@@ -793,7 +792,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
bits = 0;
- while ((1 << bits) < c->x86_num_cores)
+ while ((1 << bits) < c->x86_max_cores)
bits++;
/* Low order bits define the core id (index of core in socket) */
@@ -823,16 +822,14 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
if (!node_online(node))
node = nearby_node(apicid);
}
- cpu_to_node[cpu] = node;
+ numa_set_node(cpu, node);
printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
- cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
+ cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
#endif
#endif
}
-#define HWCR 0xc0010015
-
static int __init init_amd(struct cpuinfo_x86 *c)
{
int r;
@@ -841,14 +838,18 @@ static int __init init_amd(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned long value;
- // Disable TLB flush filter by setting HWCR.FFDIS:
- // bit 6 of msr C001_0015
- //
- // Errata 63 for SH-B3 steppings
- // Errata 122 for all(?) steppings
- rdmsrl(HWCR, value);
- value |= 1 << 6;
- wrmsrl(HWCR, value);
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 15) {
+ rdmsrl(MSR_K8_HWCR, value);
+ value |= 1 << 6;
+ wrmsrl(MSR_K8_HWCR, value);
+ }
#endif
/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
@@ -873,9 +874,9 @@ static int __init init_amd(struct cpuinfo_x86 *c)
display_cacheinfo(c);
if (c->extended_cpuid_level >= 0x80000008) {
- c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
- if (c->x86_num_cores & (c->x86_num_cores - 1))
- c->x86_num_cores = 1;
+ c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ if (c->x86_max_cores & (c->x86_max_cores - 1))
+ c->x86_max_cores = 1;
amd_detect_cmp(c);
}
@@ -887,54 +888,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
- int index_msb, tmp;
+ int index_msb, core_bits;
int cpu = smp_processor_id();
-
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ c->apicid = phys_pkg_id(0);
+
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
- cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
-
+
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
- index_msb = 31;
- /*
- * At this point we only support two siblings per
- * processor package.
- */
+ } else if (smp_num_siblings > 1 ) {
+
if (smp_num_siblings > NR_CPUS) {
printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
smp_num_siblings = 1;
return;
}
- tmp = smp_num_siblings;
- while ((tmp & 0x80000000 ) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+
+ index_msb = get_count_order(smp_num_siblings);
phys_proc_id[cpu] = phys_pkg_id(index_msb);
-
+
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
phys_proc_id[cpu]);
- smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- tmp = smp_num_siblings;
- index_msb = 31;
- while ((tmp & 0x80000000) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+ index_msb = get_count_order(smp_num_siblings) ;
+
+ core_bits = get_count_order(c->x86_max_cores);
- cpu_core_id[cpu] = phys_pkg_id(index_msb);
+ cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
- if (c->x86_num_cores > 1)
+ if (c->x86_max_cores > 1)
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
cpu_core_id[cpu]);
}
@@ -965,16 +956,15 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
static void srat_detect_node(void)
{
#ifdef CONFIG_NUMA
- unsigned apicid, node;
+ unsigned node;
int cpu = smp_processor_id();
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
- apicid = phys_proc_id[cpu];
- node = apicid_to_node[apicid];
+ node = apicid_to_node[hard_smp_processor_id()];
if (node == NUMA_NO_NODE)
node = 0;
- cpu_to_node[cpu] = node;
+ numa_set_node(cpu, node);
if (acpi_numa > 0)
printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
@@ -992,13 +982,18 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
unsigned eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ /* CPUID workaround for Intel 0F34 CPU */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 0xF && c->x86_model == 0x3 &&
+ c->x86_mask == 0x4)
+ c->x86_phys_bits = 36;
}
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
if (c->x86 >= 15)
set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
- c->x86_num_cores = intel_num_cpu_cores(c);
+ c->x86_max_cores = intel_num_cpu_cores(c);
srat_detect_node();
}
@@ -1036,7 +1031,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_clflush_size = 64;
c->x86_cache_alignment = c->x86_clflush_size;
- c->x86_num_cores = 1;
+ c->x86_max_cores = 1;
c->extended_cpuid_level = 0;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -1059,10 +1054,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf) {
+ if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
- }
if (c->x86_capability[0] & (1<<19))
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
} else {
@@ -1212,7 +1207,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
- "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
+ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1270,13 +1265,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
#ifdef CONFIG_SMP
- if (smp_num_siblings * c->x86_num_cores > 1) {
+ if (smp_num_siblings * c->x86_max_cores > 1) {
int cpu = c - cpu_data;
seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
- seq_printf(m, "siblings\t: %d\n",
- c->x86_num_cores * smp_num_siblings);
+ seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
- seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
}
#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index bd33be2..06dc354 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -87,6 +87,10 @@ void __init setup_per_cpu_areas(void)
int i;
unsigned long size;
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
+
/* Copy section for each CPU (we discard the original) */
size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
#ifdef CONFIG_MODULES
@@ -137,7 +141,6 @@ void pda_init(int cpu)
panic("cannot allocate irqstack for cpu %d", cpu);
}
- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
pda->irqstackptr += IRQSTACKSIZE-64;
}
@@ -193,6 +196,7 @@ void __cpuinit cpu_init (void)
/* CPU 0 is initialised in head64.c */
if (cpu != 0) {
pda_init(cpu);
+ zap_low_mappings(cpu);
} else
estacks = boot_exception_stacks;
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d642fbf..5876df1 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
COPY(r14);
COPY(r15);
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ {
+ unsigned cs;
+ err |= __get_user(cs, &sc->cs);
+ regs->cs = cs | 3; /* Force into user mode */
+ }
+
{
unsigned int tmpflags;
err |= __get_user(tmpflags, &sc->eflags);
@@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
{
int err = 0;
+ err |= __put_user(regs->cs, &sc->cs);
err |= __put_user(0, &sc->gs);
err |= __put_user(0, &sc->fs);
@@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->rsp = (unsigned long)frame;
+ /* Set up the CS register to run signal handlers in 64-bit mode,
+ even if the handler happens to be interrupting 32-bit code. */
+ regs->cs = __USER_CS;
+
+ /* This, by contrast, has nothing to do with segment registers -
+ see include/asm-x86_64/uaccess.h for details. */
set_fs(USER_DS);
+
regs->eflags &= ~TF_MASK;
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9db9dda..cfc3d9d 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -28,8 +28,6 @@
#include <asm/proto.h>
#include <asm/apicdef.h>
-#define __cpuinit __init
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -452,13 +450,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
void smp_stop_cpu(void)
{
+ unsigned long flags;
/*
* Remove this CPU:
*/
cpu_clear(smp_processor_id(), cpu_online_map);
- local_irq_disable();
+ local_irq_save(flags);
disable_local_APIC();
- local_irq_enable();
+ local_irq_restore(flags);
}
static void smp_really_stop_cpu(void *dummy)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index e12d7ba..683c33f 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -64,9 +64,8 @@
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-EXPORT_SYMBOL(phys_proc_id);
-EXPORT_SYMBOL(cpu_core_id);
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
@@ -89,7 +88,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
/* Set when the idlers are all forked */
int smp_threads_ready;
+/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+
+/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
@@ -436,30 +438,59 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
static inline void set_cpu_sibling_map(int cpu)
{
int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
- for_each_cpu(i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i] &&
+ cpu_core_id[cpu] == cpu_core_id[i]) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
- if (current_cpu_data.x86_num_cores > 1) {
- for_each_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_set(i, cpu_core_map[cpu]);
- cpu_set(cpu, cpu_core_map[i]);
- }
- }
- } else {
+ if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
}
}
@@ -474,6 +505,7 @@ void __cpuinit start_secondary(void)
* things done here to the most necessary things.
*/
cpu_init();
+ preempt_disable();
smp_callin();
/* otherwise gcc will move up the smp_processor_id before the cpu_init */
@@ -880,6 +912,9 @@ static __init void disable_smp(void)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+int additional_cpus __initdata = -1;
+
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
@@ -888,14 +923,38 @@ static __init void disable_smp(void)
* cpu_present_map on the other hand can change dynamically.
* In case when cpu_hotplug is not compiled, then we resort to current
* behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
* - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - otherwise use half of the available CPUs or 2, whatever is more.
+ * - The user can overwrite it with additional_cpus=NUM
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
*/
-static void prefill_possible_map(void)
+__init void prefill_possible_map(void)
{
int i;
- for (i = 0; i < NR_CPUS; i++)
+ int possible;
+
+ if (additional_cpus == -1) {
+ if (disabled_cpus > 0) {
+ additional_cpus = disabled_cpus;
+ } else {
+ additional_cpus = num_processors / 2;
+ if (additional_cpus == 0)
+ additional_cpus = 2;
+ }
+ }
+ possible = num_processors + additional_cpus;
+ if (possible > NR_CPUS)
+ possible = NR_CPUS;
+
+ printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+ possible,
+ max_t(int, possible - num_processors, 0));
+
+ for (i = 0; i < possible; i++)
cpu_set(i, cpu_possible_map);
}
#endif
@@ -966,10 +1025,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
nmi_watchdog_default();
current_cpu_data = boot_cpu_data;
current_thread_info()->cpu = 0; /* needed? */
-
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
-#endif
+ set_cpu_sibling_map(0);
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1013,8 +1069,6 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
- cpu_set(0, cpu_sibling_map[0]);
- cpu_set(0, cpu_core_map[0]);
per_cpu(cpu_state, me) = CPU_ONLINE;
}
@@ -1067,9 +1121,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
*/
void __init smp_cpus_done(unsigned int max_cpus)
{
-#ifndef CONFIG_HOTPLUG_CPU
- zap_low_mappings();
-#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
@@ -1086,15 +1137,24 @@ void __init smp_cpus_done(unsigned int max_cpus)
static void remove_siblinginfo(int cpu)
{
int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
- for_each_cpu_mask(sibling, cpu_core_map[cpu])
- cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
phys_proc_id[cpu] = BAD_APICID;
cpu_core_id[cpu] = BAD_APICID;
+ cpu_clear(cpu, cpu_sibling_setup_map);
}
void remove_cpu_from_maps(void)
@@ -1158,6 +1218,12 @@ void __cpu_die(unsigned int cpu)
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
+static __init int setup_additional_cpus(char *s)
+{
+ return get_option(&s, &additional_cpus);
+}
+__setup("additional_cpus=", setup_additional_cpus);
+
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebb9abf..fd2bef7 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -11,6 +11,8 @@
#include <linux/smp.h>
#include <linux/suspend.h>
#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
struct saved_context saved_context;
@@ -61,13 +63,12 @@ void save_processor_state(void)
__save_processor_state(&saved_context);
}
-static void
-do_fpu_end(void)
+static void do_fpu_end(void)
{
- /* restore FPU regs if necessary */
- /* Do it out of line so that gcc does not move cr0 load to some stupid place */
- kernel_fpu_end();
- mxcsr_feature_mask_init();
+ /*
+ * Restore FPU regs if necessary
+ */
+ kernel_fpu_end();
}
void __restore_processor_state(struct saved_context *ctxt)
@@ -140,4 +141,83 @@ void fix_processor_context(void)
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
+
+pgd_t *temp_level4_pgt;
+
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+ long i, j;
+
+ i = pud_index(address);
+ pud = pud + i;
+ for (; i < PTRS_PER_PUD; pud++, i++) {
+ unsigned long paddr;
+ pmd_t *pmd;
+
+ paddr = address + i*PUD_SIZE;
+ if (paddr >= end)
+ break;
+
+ pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+ unsigned long pe;
+
+ if (paddr >= end)
+ break;
+ pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+ pe &= __supported_pte_mask;
+ set_pmd(pmd, __pmd(pe));
+ }
+ }
+ return 0;
+}
+
+static int set_up_temporary_mappings(void)
+{
+ unsigned long start, end, next;
+ int error;
+
+ temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!temp_level4_pgt)
+ return -ENOMEM;
+
+ /* It is safe to reuse the original kernel mapping */
+ set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+ init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+ /* Set up the direct mapping from scratch */
+ start = (unsigned long)pfn_to_kaddr(0);
+ end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+ for (; start < end; start = next) {
+ pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+ if (!pud)
+ return -ENOMEM;
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+ if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+ return error;
+ set_pgd(temp_level4_pgt + pgd_index(start),
+ mk_kernel_pgd(__pa(pud)));
+ }
+ return 0;
+}
+
+int swsusp_arch_resume(void)
+{
+ int error;
+ /* We have got enough memory and from now on we cannot recover */
+ if ((error = set_up_temporary_mappings()))
+ return error;
+ restore_image();
+ return 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 4d659e9..320b6fb 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend)
call swsusp_save
ret
-ENTRY(swsusp_arch_resume)
- /* set up cr3 */
- leaq init_level4_pgt(%rip),%rax
- subq $__START_KERNEL_map,%rax
- movq %rax,%cr3
-
+ENTRY(restore_image)
+ /* switch to temporary page tables */
+ movq $__PAGE_OFFSET, %rdx
+ movq temp_level4_pgt(%rip), %rax
+ subq %rdx, %rax
+ movq %rax, %cr3
+ /* Flush TLB */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
andq $~(1<<7), %rdx # PGE
@@ -69,6 +70,10 @@ loop:
movq pbe_next(%rdx), %rdx
jmp loop
done:
+ /* go back to the original page tables */
+ leaq init_level4_pgt(%rip), %rax
+ subq $__START_KERNEL_map, %rax
+ movq %rax, %cr3
/* Flush TLB, including "global" things (vmalloc) */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index cc7821c..6449ea8 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name)
err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
-
-asmlinkage long sys_time64(long __user * tloc)
-{
- struct timeval now;
- int i;
-
- do_gettimeofday(&now);
- i = now.tv_sec;
- if (tloc) {
- if (put_user(i,tloc))
- i = -EFAULT;
- }
- return i;
-}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2373cb8..fdaddc4 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,6 @@
#include <asm/apic.h>
#endif
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
#ifdef CONFIG_CPU_FREQ
static void cpufreq_delayed_get(void);
#endif
@@ -481,9 +477,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
static unsigned int cyc2ns_scale;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
{
- cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+ cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
@@ -655,7 +651,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
vxtime.tsc_quot = (1000L << 32) / cpu_khz;
}
- set_cyc2ns_scale(cpu_khz_ref / 1000);
+ set_cyc2ns_scale(cpu_khz_ref);
return 0;
}
@@ -939,7 +935,7 @@ void __init time_init(void)
rdtscll_sync(&vxtime.last_tsc);
setup_irq(0, &irq0);
- set_cyc2ns_scale(cpu_khz / 1000);
+ set_cyc2ns_scale(cpu_khz);
#ifndef CONFIG_SMP
time_init_gtod();
@@ -959,9 +955,6 @@ static __init int unsynchronized_tsc(void)
are handled in the OEM check above. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
return 0;
- /* All in a single socket - should be synchronized */
- if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
- return 0;
#endif
/* Assume multi socket systems are not synchronized */
return num_online_cpus() > 1;
@@ -1096,6 +1089,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
static unsigned long PIE_count;
static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
int is_hpet_enabled(void)
{
@@ -1132,10 +1126,12 @@ int hpet_rtc_timer_init(void)
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
- cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
return 1;
@@ -1145,8 +1141,12 @@ static void hpet_rtc_timer_reinit(void)
{
unsigned int cfg, cnt;
- if (!(PIE_on | AIE_on | UIE_on))
+ if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
return;
+ }
if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
hpet_rtc_int_freq = PIE_freq;
@@ -1154,15 +1154,10 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_readl(HPET_T1_CMP);
+ cnt = hpet_t1_cmp;
cnt += hpet_tick*HZ/hpet_rtc_int_freq;
hpet_writel(cnt, HPET_T1_CMP);
-
- cfg = hpet_readl(HPET_T1_CFG);
- cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T1_CFG);
-
- return;
+ hpet_t1_cmp = cnt;
}
/*
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b5e09e6..bf337f4 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
{
conditional_sti(regs);
-#ifdef CONFIG_CHECKING
- {
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
- regs->rip);
- }
- }
-#endif
-
if (user_mode(regs)) {
struct task_struct *tsk = current;
@@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
{
conditional_sti(regs);
-#ifdef CONFIG_CHECKING
- {
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- oops_in_progress++;
- printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
- oops_in_progress--;
- }
- }
-#endif
-
if (user_mode(regs)) {
struct task_struct *tsk = current;
@@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
struct task_struct *tsk = current;
siginfo_t info;
-#ifdef CONFIG_CHECKING
- {
- /* RED-PEN interaction with debugger - could destroy gs */
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
- }
- }
-#endif
-
get_debugreg(condition, 6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
@@ -888,6 +848,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+
/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6dd642c..58b1921 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -50,7 +50,7 @@ SECTIONS
*(.bss.page_aligned)
*(.bss)
}
- __bss_end = .;
+ __bss_stop = .;
. = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index fd99ddd00..4a54221 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -203,3 +203,6 @@ EXPORT_SYMBOL(flush_tlb_page);
#endif
EXPORT_SYMBOL(cpu_khz);
+
+EXPORT_SYMBOL(load_gs_index);
+
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
index a29fb75..95b6d96 100644
--- a/arch/x86_64/lib/bitops.c
+++ b/arch/x86_64/lib/bitops.c
@@ -5,19 +5,23 @@
#undef find_first_bit
#undef find_next_bit
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
{
long d0, d1, d2;
long res;
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the je
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_zero_bit() does when offset and size are at the
+ * same word and it fails to find a zero itself.
+ */
+ size += 63;
+ size >>= 6;
if (!size)
return 0;
asm volatile(
@@ -30,12 +34,30 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
" shlq $3,%%rdi\n"
" addq %%rdi,%%rdx"
:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
- [addr] "r" (addr) : "memory");
+ :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+ [addr] "S" (addr) : "memory");
+ /*
+ * Any register would do for [addr] above, but GCC tends to
+ * prefer rbx over rsi, even though rsi is readily available
+ * and doesn't have to be saved.
+ */
return res;
}
/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_zero_bit (addr, size);
+}
+
+/**
* find_next_zero_bit - find the first zero bit in a memory region
* @addr: The address to base the search on
* @offset: The bitnumber to start searching at
@@ -43,7 +65,7 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
*/
long find_next_zero_bit (const unsigned long * addr, long size, long offset)
{
- unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+ const unsigned long * p = addr + (offset >> 6);
unsigned long set = 0;
unsigned long res, bit = offset&63;
@@ -63,8 +85,8 @@ long find_next_zero_bit (const unsigned long * addr, long size, long offset)
/*
* No zero yet, search remaining full words for a zero
*/
- res = find_first_zero_bit ((const unsigned long *)p,
- size - 64 * (p - (unsigned long *) addr));
+ res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
return (offset + set + res);
}
@@ -74,6 +96,19 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
long d0, d1;
long res;
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the jz
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_bit() does when offset and size are at the same
+ * word and it fails to find a one itself.
+ */
+ size += 63;
+ size >>= 6;
+ if (!size)
+ return 0;
asm volatile(
" repe; scasq\n"
" jz 1f\n"
@@ -83,8 +118,7 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
" shlq $3,%%rdi\n"
" addq %%rdi,%%rax"
:"=a" (res), "=&c" (d0), "=&D" (d1)
- :"0" (0ULL),
- "1" ((size + 63) >> 6), "2" (addr),
+ :"0" (0ULL), "1" (size), "2" (addr),
[addr] "r" (addr) : "memory");
return res;
}
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 30a9da4..43d9fa1 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
.globl clear_page
.p2align 4
clear_page:
- xorl %eax,%eax
- movl $4096/64,%ecx
- .p2align 4
-.Lloop:
- decl %ecx
-#define PUT(x) movq %rax,x*8(%rdi)
- movq %rax,(%rdi)
- PUT(1)
- PUT(2)
- PUT(3)
- PUT(4)
- PUT(5)
- PUT(6)
- PUT(7)
- leaq 64(%rdi),%rdi
- jnz .Lloop
- nop
- ret
-clear_page_end:
-
- /* C stepping K8 run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad clear_page_c
- .byte X86_FEATURE_K8_C
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
-clear_page_c_end:
- .previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index dd3aa47..621a197 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
.globl copy_page
.p2align 4
copy_page:
- subq $3*8,%rsp
- movq %rbx,(%rsp)
- movq %r12,1*8(%rsp)
- movq %r13,2*8(%rsp)
-
- movl $(4096/64)-5,%ecx
- .p2align 4
-.Loop64:
- dec %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- prefetcht0 5*64(%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
- jnz .Loop64
-
- movl $5,%ecx
- .p2align 4
-.Loop2:
- decl %ecx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
-
- jnz .Loop2
-
- movq (%rsp),%rbx
- movq 1*8(%rsp),%r12
- movq 2*8(%rsp),%r13
- addq $3*8,%rsp
- ret
-
- /* C stepping K8 run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad copy_page_c
- .byte X86_FEATURE_K8_C
- .byte copy_page_c_end-copy_page_c
- .byte copy_page_c_end-copy_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
-copy_page_c_end:
- .previous
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c4649..92dd805 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
*
* Output:
* rax original destination
+ *
+ * TODO: check best memcpy for PSC
*/
.globl __memcpy
@@ -18,95 +20,6 @@
.p2align 4
__memcpy:
memcpy:
- pushq %rbx
- movq %rdi,%rax
-
- movl %edx,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
- .p2align 4
-.Lloop_64:
- decl %ecx
-
- movq (%rsi),%r11
- movq 8(%rsi),%r8
-
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
-
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
-
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
-
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
-
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
-.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- .p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
- incq %rdi
- incq %rsi
- decl %ecx
- jnz .Lloop_1
-
-.Lende:
- popq %rbx
- ret
-.Lfinal:
-
- /* C stepping K8 run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
- .section .altinstructions,"a"
- .align 8
- .quad memcpy
- .quad memcpy_c
- .byte X86_FEATURE_K8_C
- .byte .Lfinal-memcpy
- .byte memcpy_c_end-memcpy_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi source
- * rdx count
- */
-memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
rep
movsb
ret
-memcpy_c_end:
- .previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c406..2aa48f2 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
.p2align 4
memset:
__memset:
- movq %rdi,%r10
- movq %rdx,%r11
-
- /* expand byte value */
- movzbl %sil,%ecx
- movabs $0x0101010101010101,%rax
- mul %rcx /* with rax, clobbers rdx */
-
- /* align dst */
- movl %edi,%r9d
- andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
-
- movl %r11d,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
- .p2align 4
-.Lloop_64:
- decl %ecx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
- /* Handle tail in loops. The loops should be faster than hard
- to predict jump tables. */
- .p2align 4
-.Lhandle_tail:
- movl %r11d,%ecx
- andl $63&(~7),%ecx
- jz .Lhandle_7
- shrl $3,%ecx
- .p2align 4
-.Lloop_8:
- decl %ecx
- movq %rax,(%rdi)
- leaq 8(%rdi),%rdi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %r11d,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
- decl %ecx
- movb %al,(%rdi)
- leaq 1(%rdi),%rdi
- jnz .Lloop_1
-
-.Lende:
- movq %r10,%rax
- ret
-
-.Lbad_alignment:
- cmpq $7,%r11
- jbe .Lhandle_7
- movq %rax,(%rdi) /* unaligned store */
- movq $8,%r8
- subq %r9,%r8
- addq %r8,%rdi
- subq %r8,%r11
- jmp .Lafter_bad_alignment
-
- /* C stepping K8 run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad memset
- .quad memset_c
- .byte X86_FEATURE_K8_C
- .byte memset_c_end-memset_c
- .byte memset_c_end-memset_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi value
- * rdx count
- */
-memset_c:
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
stosb
movq %r9,%rax
ret
-memset_c_end:
- .previous
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872..3a63707 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -308,18 +308,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
unsigned long flags;
siginfo_t info;
-#ifdef CONFIG_CHECKING
- {
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
- }
- }
-#endif
-
/* get the address */
__asm__("movq %%cr2,%0":"=r" (address));
if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -571,3 +559,10 @@ do_sigbus:
force_sig_info(SIGBUS, &info, tsk);
return;
}
+
+static int __init enable_pagefaulttrace(char *str)
+{
+ page_fault_trace = 1;
+ return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a8..286f6a6 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,7 @@
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
+#include <linux/pci.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -36,16 +37,13 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
+#include <asm/sections.h>
#ifndef Dprintk
#define Dprintk(x...)
#endif
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
-
-extern char _stext[];
+static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -86,9 +84,6 @@ void show_mem(void)
/* References to section boundaries */
-extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
-
int after_bootmem;
static void *spp_getpage(void)
@@ -308,42 +303,81 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
table_end<<PAGE_SHIFT);
}
-extern struct x8664_pda cpu_pda[NR_CPUS];
+void __cpuinit zap_low_mappings(int cpu)
+{
+ if (cpu == 0) {
+ pgd_t *pgd = pgd_offset_k(0UL);
+ pgd_clear(pgd);
+ } else {
+ /*
+ * For AP's, zap the low identity mappings by changing the cr3
+ * to init_level4_pgt and doing local flush tlb all
+ */
+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+ }
+ __flush_tlb_all();
+}
-/* Assumes all CPUs still execute in init_mm */
-void zap_low_mappings(void)
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+ unsigned long start_pfn, unsigned long end_pfn)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- flush_tlb_all();
+ int i;
+ unsigned long w;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ z[i] = 0;
+
+ if (start_pfn < MAX_DMA_PFN)
+ z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+ if (start_pfn < MAX_DMA32_PFN) {
+ unsigned long dma32_pfn = MAX_DMA32_PFN;
+ if (dma32_pfn > end_pfn)
+ dma32_pfn = end_pfn;
+ z[ZONE_DMA32] = dma32_pfn - start_pfn;
+ }
+ z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+ /* Remove lower zones from higher ones. */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (z[i])
+ z[i] -= w;
+ w += z[i];
+ }
+
+ /* Compute holes */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ unsigned long s = w;
+ w += z[i];
+ h[i] = e820_hole_size(s, w);
+ }
+
+ /* Add the space pace needed for mem_map to the holes too. */
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
+
+ /* The 16MB DMA zone has the kernel and other misc mappings.
+ Account them too */
+ if (h[ZONE_DMA]) {
+ h[ZONE_DMA] += dma_reserve;
+ if (h[ZONE_DMA] >= z[ZONE_DMA]) {
+ printk(KERN_WARNING
+ "Kernel too large and filling up ZONE_DMA?\n");
+ h[ZONE_DMA] = z[ZONE_DMA];
+ }
+ }
}
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- {
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
- unsigned int max_dma;
-
- memset(zones_size, 0, sizeof(zones_size));
- memset(holes, 0, sizeof(holes));
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- if (end_pfn < max_dma) {
- zones_size[ZONE_DMA] = end_pfn;
- holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
- } else {
- zones_size[ZONE_DMA] = max_dma;
- holes[ZONE_DMA] = e820_hole_size(0, max_dma);
- zones_size[ZONE_NORMAL] = end_pfn - max_dma;
- holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
- }
- free_area_init_node(0, NODE_DATA(0), zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
- }
- return;
+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+ size_zones(zones, holes, 0, end_pfn);
+ free_area_init_node(0, NODE_DATA(0), zones,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif
@@ -438,19 +472,16 @@ void __init mem_init(void)
datasize >> 10,
initsize >> 10);
+#ifdef CONFIG_SMP
/*
- * Subtle. SMP is doing its boot stuff late (because it has to
- * fork idle threads) - but it also needs low mappings for the
- * protected-mode entry to work. We zap these entries only after
- * the WP-bit has been tested.
+ * Sync boot_level4_pgt mappings with the init_level4_pgt
+ * except for the low identity mappings which are already zapped
+ * in init_level4_pgt. This sync-up is essential for AP's bringup
*/
-#ifndef CONFIG_SMP
- zap_low_mappings();
+ memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}
-extern char __initdata_begin[], __initdata_end[];
-
void free_initmem(void)
{
unsigned long addr;
@@ -464,7 +495,7 @@ void free_initmem(void)
totalram_pages++;
}
memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
- printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+ printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -491,6 +522,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
#else
reserve_bootmem(phys, len);
#endif
+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+ dma_reserve += len / PAGE_SIZE;
}
int kern_addr_valid(unsigned long addr)
@@ -532,10 +565,6 @@ extern int exception_trace, page_fault_trace;
static ctl_table debug_table2[] = {
{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
proc_dointvec },
-#ifdef CONFIG_CHECKING
- { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
- proc_dointvec },
-#endif
{ 0, }
};
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 6972df4..ecf7acb 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
if (address >= end)
BUG();
do {
- pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+ pte_t * pte = pte_alloc_kernel(pmd, address);
if (!pte)
return -ENOMEM;
remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
flush_cache_all();
if (address >= end)
BUG();
- spin_lock(&init_mm.page_table_lock);
do {
pud_t *pud;
pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgd++;
} while (address && (address < end));
- spin_unlock(&init_mm.page_table_lock);
flush_tlb_all();
return error;
}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 65417b0..a5663e0b 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
limit >>= 16;
limit <<= 24;
limit |= (1<<24)-1;
+ limit++;
if (limit > end_pfn << PAGE_SHIFT)
limit = end_pfn << PAGE_SHIFT;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 80a49d9..a828a01 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,57 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodmap[] too small (of shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init populate_memnodemap(
+ const struct node *nodes, int numnodes, int shift)
{
int i;
- int shift = 20;
- unsigned long addr,maxend=0;
-
- for (i = 0; i < numnodes; i++)
- if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
- maxend = nodes[i].end;
+ int res = -1;
+ unsigned long addr, end;
- while ((1UL << shift) < (maxend / NODEMAPSIZE))
- shift++;
-
- printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
- shift,maxend);
- memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+ memset(memnodemap, 0xff, sizeof(memnodemap));
for (i = 0; i < numnodes; i++) {
- if (nodes[i].start == nodes[i].end)
+ addr = nodes[i].start;
+ end = nodes[i].end;
+ if (addr >= end)
continue;
- for (addr = nodes[i].start;
- addr < nodes[i].end;
- addr += (1UL << shift)) {
- if (memnodemap[addr >> shift] != 0xff) {
- printk(KERN_INFO
- "Your memory is not aligned you need to rebuild your kernel "
- "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
- shift,addr);
+ if ((end >> shift) >= NODEMAPSIZE)
+ return 0;
+ do {
+ if (memnodemap[addr >> shift] != 0xff)
return -1;
- }
memnodemap[addr >> shift] = i;
- }
+ addr += (1 << shift);
+ } while (addr < end);
+ res = 1;
}
+ return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+ int shift = 20;
+
+ while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+ shift++;
+
+ printk(KERN_DEBUG "Using %d for the hash shift.\n",
+ shift);
+
+ if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+ printk(KERN_INFO
+ "Your memory is not aligned you need to rebuild your kernel "
+ "with a bigger NODEMAPSIZE shift=%d\n",
+ shift);
+ return -1;
+ }
return shift;
}
@@ -94,7 +113,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
- memory_present(nodeid, start_pfn, end_pfn);
nodedata_phys = find_e820_area(start, end, pgdat_size);
if (nodedata_phys == -1L)
panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -132,29 +150,14 @@ void __init setup_node_zones(int nodeid)
unsigned long start_pfn, end_pfn;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
- unsigned long dma_end_pfn;
- memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
- memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ start_pfn = node_start_pfn(nodeid);
+ end_pfn = node_end_pfn(nodeid);
- start_pfn = node_start_pfn(nodeid);
- end_pfn = node_end_pfn(nodeid);
+ Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+ nodeid, start_pfn, end_pfn);
- Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
- /* All nodes > 0 have a zero length zone DMA */
- dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- if (start_pfn < dma_end_pfn) {
- zones[ZONE_DMA] = dma_end_pfn - start_pfn;
- holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
- zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
- } else {
- zones[ZONE_NORMAL] = end_pfn - start_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
- }
-
+ size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
}
@@ -167,18 +170,16 @@ void __init numa_init_array(void)
mapping. To avoid this fill in the mapping for all possible
CPUs, as the number of CPUs is not known yet.
We round robin the existing nodes. */
- rr = 0;
+ rr = first_node(node_online_map);
for (i = 0; i < NR_CPUS; i++) {
if (cpu_to_node[i] != NUMA_NO_NODE)
continue;
+ numa_set_node(i, rr);
rr = next_node(rr, node_online_map);
if (rr == MAX_NUMNODES)
rr = first_node(node_online_map);
- cpu_to_node[i] = rr;
- rr++;
}
- set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
}
#ifdef CONFIG_NUMA_EMU
@@ -207,8 +208,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
if (i == numa_fake-1)
sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
nodes[i].end = nodes[i].start + sz;
- if (i != numa_fake-1)
- nodes[i].end--;
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
i,
nodes[i].start, nodes[i].end,
@@ -259,18 +258,22 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
nodes_clear(node_online_map);
node_set_online(0);
for (i = 0; i < NR_CPUS; i++)
- cpu_to_node[i] = 0;
+ numa_set_node(i, 0);
node_to_cpumask[0] = cpumask_of_cpu(0);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
- /* BP is initialized elsewhere */
- if (cpu)
- set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+ set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}
+void __cpuinit numa_set_node(int cpu, int node)
+{
+ cpu_pda[cpu].nodenumber = node;
+ cpu_to_node[cpu] = node;
+}
+
unsigned long __init numa_free_all_bootmem(void)
{
int i;
@@ -281,9 +284,26 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
+#ifdef CONFIG_SPARSEMEM
+static void __init arch_sparse_init(void)
+{
+ int i;
+
+ for_each_online_node(i)
+ memory_present(i, node_start_pfn(i), node_end_pfn(i));
+
+ sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
void __init paging_init(void)
{
int i;
+
+ arch_sparse_init();
+
for_each_online_node(i) {
setup_node_zones(i);
}
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 94862e1..b90e8fe 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -220,8 +220,6 @@ void global_flush_tlb(void)
down_read(&init_mm.mmap_sem);
df = xchg(&df_list, NULL);
up_read(&init_mm.mmap_sem);
- if (!df)
- return;
flush_map((df && !df->next) ? df->address : 0);
for (; df; df = next_df) {
next_df = df->next;
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844..33340bd 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -71,8 +71,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
nd->start = nd->end;
}
if (nd->end > end) {
- if (!(end & 0xfff))
- end--;
nd->end = end;
if (nd->start > nd->end)
nd->start = nd->end;
@@ -166,8 +164,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
if (nd->end < end)
nd->end = end;
}
- if (!(nd->end & 0xfff))
- nd->end--;
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
}
@@ -203,7 +199,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
if (cpu_to_node[i] == NUMA_NO_NODE)
continue;
if (!node_isset(cpu_to_node[i], nodes_parsed))
- cpu_to_node[i] = NUMA_NO_NODE;
+ numa_set_node(i, NUMA_NO_NODE);
}
numa_init_array();
return 0;
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig
index 5ade198..d8a8408 100644
--- a/arch/x86_64/oprofile/Kconfig
+++ b/arch/x86_64/oprofile/Kconfig
@@ -1,7 +1,3 @@
-
-menu "Profiling support"
- depends on EXPERIMENTAL
-
config PROFILING
bool "Profiling support (EXPERIMENTAL)"
help
@@ -19,5 +15,3 @@ config OPROFILE
If unsure, say N.
-endmenu
-
OpenPOWER on IntegriCloud