summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-class-cxl8
-rw-r--r--Documentation/features/perf/perf-regs/arch-support.txt2
-rw-r--r--Documentation/features/perf/perf-stackdump/arch-support.txt2
-rw-r--r--Documentation/powerpc/eeh-pci-error-recovery.txt2
-rw-r--r--MAINTAINERS13
-rw-r--r--arch/powerpc/Kconfig7
-rw-r--r--arch/powerpc/Kconfig.debug8
-rw-r--r--arch/powerpc/boot/Makefile6
-rw-r--r--arch/powerpc/boot/dts/fsl/gef_ppc9a.dts4
-rw-r--r--arch/powerpc/boot/dts/fsl/gef_sbc310.dts22
-rw-r--r--arch/powerpc/boot/dts/fsl/gef_sbc610.dts4
-rw-r--r--arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts24
-rw-r--r--arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts24
-rw-r--r--arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi41
-rw-r--r--arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi1
-rw-r--r--arch/powerpc/boot/dts/fsl/sbc8641d.dts23
-rw-r--r--arch/powerpc/boot/dts/fsl/t1023si-post.dtsi2
-rw-r--r--arch/powerpc/boot/dts/fsl/t1040si-post.dtsi2
-rw-r--r--arch/powerpc/boot/dts/fsl/t104xrdb.dtsi2
-rw-r--r--arch/powerpc/boot/dts/fsl/t208xrdb.dtsi2
-rw-r--r--arch/powerpc/include/asm/book3s/32/hash.h3
-rw-r--r--arch/powerpc/include/asm/book3s/32/mmu-hash.h6
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgalloc.h109
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h136
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h215
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h485
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb-radix.h14
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h79
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h137
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgalloc.h207
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-4k.h53
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h64
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h818
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix-4k.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix-64k.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h232
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-hash.h41
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h33
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h76
-rw-r--r--arch/powerpc/include/asm/book3s/pgalloc.h19
-rw-r--r--arch/powerpc/include/asm/hugetlb.h14
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h36
-rw-r--r--arch/powerpc/include/asm/machdep.h1
-rw-r--r--arch/powerpc/include/asm/mmu.h51
-rw-r--r--arch/powerpc/include/asm/mmu_context.h29
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgalloc.h (renamed from arch/powerpc/include/asm/pgalloc-32.h)0
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgalloc.h (renamed from arch/powerpc/include/asm/pgalloc-64.h)92
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h10
-rw-r--r--arch/powerpc/include/asm/nohash/pgalloc.h23
-rw-r--r--arch/powerpc/include/asm/opal-api.h16
-rw-r--r--arch/powerpc/include/asm/page.h14
-rw-r--r--arch/powerpc/include/asm/page_64.h5
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h41
-rw-r--r--arch/powerpc/include/asm/pgalloc.h19
-rw-r--r--arch/powerpc/include/asm/pgtable-be-types.h92
-rw-r--r--arch/powerpc/include/asm/pgtable-types.h58
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h2
-rw-r--r--arch/powerpc/include/asm/ppc-pci.h6
-rw-r--r--arch/powerpc/include/asm/ppc_asm.h3
-rw-r--r--arch/powerpc/include/asm/pte-common.h26
-rw-r--r--arch/powerpc/include/asm/reg.h3
-rw-r--r--arch/powerpc/include/asm/tlbflush.h3
-rw-r--r--arch/powerpc/include/uapi/asm/perf_regs.h50
-rw-r--r--arch/powerpc/kernel/asm-offsets.c4
-rw-r--r--arch/powerpc/kernel/btext.c2
-rw-r--r--arch/powerpc/kernel/cputable.c2
-rw-r--r--arch/powerpc/kernel/eeh.c9
-rw-r--r--arch/powerpc/kernel/eeh_driver.c46
-rw-r--r--arch/powerpc/kernel/eeh_event.c2
-rw-r--r--arch/powerpc/kernel/eeh_pe.c2
-rw-r--r--arch/powerpc/kernel/entry_64.S16
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S163
-rw-r--r--arch/powerpc/kernel/ftrace.c10
-rw-r--r--arch/powerpc/kernel/head_64.S13
-rw-r--r--arch/powerpc/kernel/ibmebus.c2
-rw-r--r--arch/powerpc/kernel/isa-bridge.c4
-rw-r--r--arch/powerpc/kernel/machine_kexec.c18
-rw-r--r--arch/powerpc/kernel/machine_kexec_64.c15
-rw-r--r--arch/powerpc/kernel/mce.c2
-rw-r--r--arch/powerpc/kernel/mce_power.c13
-rw-r--r--arch/powerpc/kernel/misc_32.S6
-rw-r--r--arch/powerpc/kernel/nvram_64.c12
-rw-r--r--arch/powerpc/kernel/pci-hotplug.c47
-rw-r--r--arch/powerpc/kernel/pci_64.c5
-rw-r--r--arch/powerpc/kernel/pci_dn.c66
-rw-r--r--arch/powerpc/kernel/process.c17
-rw-r--r--arch/powerpc/kernel/prom.c2
-rw-r--r--arch/powerpc/kernel/rtasd.c2
-rw-r--r--arch/powerpc/kernel/setup-common.c6
-rw-r--r--arch/powerpc/kernel/swsusp.c2
-rw-r--r--arch/powerpc/kernel/time.c1
-rw-r--r--arch/powerpc/kernel/vio.c4
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c11
-rw-r--r--arch/powerpc/kvm/book3s_hv.c6
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c12
-rw-r--r--arch/powerpc/kvm/book3s_pr.c6
-rw-r--r--arch/powerpc/lib/copy_32.S2
-rw-r--r--arch/powerpc/lib/sstep.c5
-rw-r--r--arch/powerpc/lib/xor_vmx.c10
-rw-r--r--arch/powerpc/mm/Makefile10
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c2
-rw-r--r--arch/powerpc/mm/hash64_4k.c29
-rw-r--r--arch/powerpc/mm/hash64_64k.c71
-rw-r--r--arch/powerpc/mm/hash_native_64.c11
-rw-r--r--arch/powerpc/mm/hash_utils_64.c190
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c22
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c29
-rw-r--r--arch/powerpc/mm/hugetlbpage-radix.c87
-rw-r--r--arch/powerpc/mm/hugetlbpage.c20
-rw-r--r--arch/powerpc/mm/init_64.c73
-rw-r--r--arch/powerpc/mm/mem.c40
-rw-r--r--arch/powerpc/mm/mmap.c110
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c (renamed from arch/powerpc/mm/mmu_context_hash64.c)52
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c6
-rw-r--r--arch/powerpc/mm/mmu_decl.h5
-rw-r--r--arch/powerpc/mm/pgtable-book3e.c122
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c118
-rw-r--r--arch/powerpc/mm/pgtable-hash64.c342
-rw-r--r--arch/powerpc/mm/pgtable-radix.c526
-rw-r--r--arch/powerpc/mm/pgtable.c24
-rw-r--r--arch/powerpc/mm/pgtable_64.c557
-rw-r--r--arch/powerpc/mm/slb.c1
-rw-r--r--arch/powerpc/mm/slb_low.S54
-rw-r--r--arch/powerpc/mm/slice.c20
-rw-r--r--arch/powerpc/mm/tlb-radix.c251
-rw-r--r--arch/powerpc/mm/tlb_hash64.c6
-rw-r--r--arch/powerpc/perf/Makefile2
-rw-r--r--arch/powerpc/perf/callchain.c2
-rw-r--r--arch/powerpc/perf/perf_regs.c104
-rw-r--r--arch/powerpc/perf/power8-events-list.h40
-rw-r--r--arch/powerpc/perf/power8-pmu.c41
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype11
-rw-r--r--arch/powerpc/platforms/cell/spu_base.c9
-rw-r--r--arch/powerpc/platforms/cell/spufs/fault.c4
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c69
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c283
-rw-r--r--arch/powerpc/platforms/powernv/opal-hmi.c8
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c953
-rw-r--r--arch/powerpc/platforms/powernv/pci.c19
-rw-r--r--arch/powerpc/platforms/powernv/pci.h72
-rw-r--r--arch/powerpc/platforms/powernv/setup.c5
-rw-r--r--arch/powerpc/platforms/ps3/htab.c2
-rw-r--r--arch/powerpc/platforms/ps3/spu.c4
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c225
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c24
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c20
-rw-r--r--arch/powerpc/platforms/pseries/lparcfg.c3
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c4
-rw-r--r--arch/powerpc/platforms/pseries/msi.c4
-rw-r--r--arch/powerpc/platforms/pseries/pci_dlpar.c32
-rw-r--r--arch/powerpc/platforms/pseries/reconfig.c5
-rw-r--r--arch/powerpc/platforms/pseries/setup.c4
-rw-r--r--arch/powerpc/sysdev/fsl_pci.c24
-rw-r--r--arch/powerpc/sysdev/mpic.c9
-rw-r--r--arch/powerpc/xmon/Makefile2
-rw-r--r--arch/powerpc/xmon/spr_access.S45
-rw-r--r--arch/powerpc/xmon/xmon.c138
-rw-r--r--drivers/cpufreq/pmac32-cpufreq.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c5
-rw-r--r--drivers/infiniband/hw/qib/qib_pcie.c6
-rw-r--r--drivers/macintosh/rack-meter.c5
-rw-r--r--drivers/macintosh/via-pmu.c4
-rw-r--r--drivers/misc/cxl/api.c28
-rw-r--r--drivers/misc/cxl/context.c3
-rw-r--r--drivers/misc/cxl/cxl.h15
-rw-r--r--drivers/misc/cxl/fault.c10
-rw-r--r--drivers/misc/cxl/guest.c78
-rw-r--r--drivers/misc/cxl/native.c29
-rw-r--r--drivers/misc/cxl/pci.c64
-rw-r--r--drivers/misc/cxl/sysfs.c10
-rw-r--r--drivers/of/base.c3
-rw-r--r--drivers/pci/hotplug/rpadlpar_core.c8
-rw-r--r--drivers/pci/hotplug/rpaphp_core.c4
-rw-r--r--drivers/pci/hotplug/rpaphp_pci.c4
-rw-r--r--drivers/pcmcia/electra_cf.c2
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c3
-rw-r--r--include/misc/cxl.h8
-rw-r--r--kernel/trace/ftrace.c12
-rw-r--r--tools/perf/arch/powerpc/include/perf_regs.h69
-rw-r--r--tools/perf/arch/powerpc/util/Build2
-rw-r--r--tools/perf/arch/powerpc/util/perf_regs.c49
-rw-r--r--tools/perf/arch/powerpc/util/unwind-libunwind.c96
-rw-r--r--tools/perf/config/Makefile6
-rw-r--r--tools/perf/util/perf_regs.c8
-rw-r--r--tools/testing/selftests/powerpc/Makefile1
-rw-r--r--tools/testing/selftests/powerpc/context_switch/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/context_switch/Makefile10
-rw-r--r--tools/testing/selftests/powerpc/context_switch/cp_abort.c110
-rw-r--r--tools/testing/selftests/powerpc/mm/subpage_prot.c18
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb.c1
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c1
-rw-r--r--tools/testing/selftests/powerpc/reg.h (renamed from tools/testing/selftests/powerpc/pmu/ebb/reg.h)18
-rw-r--r--tools/testing/selftests/powerpc/tm/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/tm/Makefile3
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-fork.c42
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-resched-dscr.c16
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-stack.c4
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tar.c90
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tmspr.c143
-rw-r--r--tools/testing/selftests/powerpc/utils.h8
200 files changed, 6908 insertions, 3128 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index 7fd737e..4ba0a2a 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -233,3 +233,11 @@ Description: read/write
0 = don't trust, the image may be different (default)
1 = trust that the image will not change.
Users: https://github.com/ibm-capi/libcxl
+
+What: /sys/class/cxl/<card>/psl_timebase_synced
+Date: March 2016
+Contact: linuxppc-dev@lists.ozlabs.org
+Description: read only
+ Returns 1 if the psl timebase register is synchronized
+ with the core timebase register, 0 otherwise.
+Users: https://github.com/ibm-capi/libcxl
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt
index e2b4a78..f179b1f 100644
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -27,7 +27,7 @@
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
| s390: | TODO |
| score: | TODO |
| sh: | TODO |
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt
index 3dc24b0..85777c5 100644
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -27,7 +27,7 @@
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
| s390: | TODO |
| score: | TODO |
| sh: | TODO |
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index 9d4e33d..6781892 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -12,7 +12,7 @@ Overview:
The IBM POWER-based pSeries and iSeries computers include PCI bus
controller chips that have extended capabilities for detecting and
reporting a large variety of PCI bus error conditions. These features
-go under the name of "EEH", for "Extended Error Handling". The EEH
+go under the name of "EEH", for "Enhanced Error Handling". The EEH
hardware features allow PCI bus errors to be cleared and a PCI
card to be "rebooted", without also having to reboot the operating
system.
diff --git a/MAINTAINERS b/MAINTAINERS
index 65f3277a..2c670ba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6675,6 +6675,19 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
S: Supported
F: Documentation/powerpc/
F: arch/powerpc/
+F: drivers/char/tpm/tpm_ibmvtpm*
+F: drivers/crypto/nx/
+F: drivers/crypto/vmx/
+F: drivers/net/ethernet/ibm/ibmveth.*
+F: drivers/net/ethernet/ibm/ibmvnic.*
+F: drivers/pci/hotplug/rpa*
+F: drivers/scsi/ibmvscsi/
+N: opal
+N: /pmac
+N: powermac
+N: powernv
+N: [^a-z0-9]ps3
+N: pseries
LINUX FOR POWER MACINTOSH
M: Benjamin Herrenschmidt <benh@kernel.crashing.org>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a18a0dc..f0403b5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,6 +116,8 @@ config PPC
select GENERIC_ATOMIC64 if PPC32
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select HAVE_PERF_EVENTS
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
select ARCH_WANT_IPC_PARSE_VERSION
@@ -606,9 +608,9 @@ endchoice
config FORCE_MAX_ZONEORDER
int "Maximum zone order"
- range 9 64 if PPC64 && PPC_64K_PAGES
+ range 8 9 if PPC64 && PPC_64K_PAGES
default "9" if PPC64 && PPC_64K_PAGES
- range 13 64 if PPC64 && !PPC_64K_PAGES
+ range 9 13 if PPC64 && !PPC_64K_PAGES
default "13" if PPC64 && !PPC_64K_PAGES
range 9 64 if PPC32 && PPC_16K_PAGES
default "9" if PPC32 && PPC_16K_PAGES
@@ -795,7 +797,6 @@ config 4xx_SOC
config FSL_LBC
bool "Freescale Local Bus support"
- depends on FSL_SOC
help
Enables reporting of errors from the Freescale local bus
controller. Also contains some common code used by
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 638f9ce..d3fcf7e 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -19,14 +19,6 @@ config PPC_WERROR
depends on !PPC_DISABLE_WERROR
default y
-config STRICT_MM_TYPECHECKS
- bool "Do extra type checking on mm types"
- default n
- help
- This option turns on extra type checking for some mm related types.
-
- If you don't know what this means, say N.
-
config PRINT_STACK_DEPTH
int "Stack depth to print" if DEBUG_KERNEL
default 64
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 6116510..8fe78a3 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -362,9 +362,6 @@ $(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
$(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
-$(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits)
- $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb)
-
$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
$(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
@@ -381,6 +378,9 @@ $(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
$(obj)/%.dtb: $(src)/dts/%.dts FORCE
$(call if_changed_dep,dtc)
+$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE
+ $(call if_changed_dep,dtc)
+
# If there isn't a platform selected then just strip the vmlinux.
ifeq (,$(image-y))
image-y := vmlinux.strip
diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
index 0424fc2..c88d4ef 100644
--- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
@@ -211,6 +211,10 @@
0x0 0x00400000>;
};
};
+
+ pci1: pcie@fef09000 {
+ status = "disabled";
+ };
};
/include/ "mpc8641si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts
index 84b3d38..8385157 100644
--- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts
@@ -24,10 +24,6 @@
model = "GEF_SBC310";
compatible = "gef,sbc310";
- aliases {
- pci1 = &pci1;
- };
-
memory {
device_type = "memory";
reg = <0x0 0x40000000>; // set by uboot
@@ -223,29 +219,11 @@
};
pci1: pcie@fef09000 {
- compatible = "fsl,mpc8641-pcie";
- device_type = "pci";
- #size-cells = <2>;
- #address-cells = <3>;
reg = <0xfef09000 0x1000>;
- bus-range = <0x0 0xff>;
ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000
0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>;
- clock-frequency = <100000000>;
- interrupts = <0x19 0x2 0 0>;
- interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
- interrupt-map = <
- 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2
- 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2
- 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2
- 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2
- >;
pcie@0 {
- reg = <0 0 0 0 0>;
- #size-cells = <2>;
- #address-cells = <3>;
- device_type = "pci";
ranges = <0x02000000 0x0 0xc0000000
0x02000000 0x0 0xc0000000
0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts
index 974446a..ff423ab 100644
--- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts
@@ -209,6 +209,10 @@
0x0 0x00400000>;
};
};
+
+ pci1: pcie@fef09000 {
+ status = "disabled";
+ };
};
/include/ "mpc8641si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
index 554001f..11bea3e 100644
--- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
+++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
@@ -15,10 +15,6 @@
model = "MPC8641HPCN";
compatible = "fsl,mpc8641hpcn";
- aliases {
- pci1 = &pci1;
- };
-
memory {
device_type = "memory";
reg = <0x00000000 0x40000000>; // 1G at 0x0
@@ -359,29 +355,11 @@
};
pci1: pcie@ffe09000 {
- compatible = "fsl,mpc8641-pcie";
- device_type = "pci";
- #size-cells = <2>;
- #address-cells = <3>;
reg = <0xffe09000 0x1000>;
- bus-range = <0 0xff>;
ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>;
- clock-frequency = <100000000>;
- interrupts = <25 2 0 0>;
- interrupt-map-mask = <0xf800 0 0 7>;
- interrupt-map = <
- /* IDSEL 0x0 */
- 0x0000 0 0 1 &mpic 4 1
- 0x0000 0 0 2 &mpic 5 1
- 0x0000 0 0 3 &mpic 6 1
- 0x0000 0 0 4 &mpic 7 1
- >;
+
pcie@0 {
- reg = <0 0 0 0 0>;
- #size-cells = <2>;
- #address-cells = <3>;
- device_type = "pci";
ranges = <0x02000000 0x0 0xa0000000
0x02000000 0x0 0xa0000000
0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
index fec5867..7ff6204 100644
--- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
+++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
@@ -17,10 +17,6 @@
#address-cells = <2>;
#size-cells = <2>;
- aliases {
- pci1 = &pci1;
- };
-
memory {
device_type = "memory";
reg = <0x0 0x00000000 0x0 0x40000000>; // 1G at 0x0
@@ -326,29 +322,11 @@
};
pci1: pcie@fffe09000 {
- compatible = "fsl,mpc8641-pcie";
- device_type = "pci";
- #size-cells = <2>;
- #address-cells = <3>;
reg = <0x0f 0xffe09000 0x0 0x1000>;
- bus-range = <0x0 0xff>;
ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000
0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>;
- clock-frequency = <100000000>;
- interrupts = <25 2 0 0>;
- interrupt-map-mask = <0xf800 0 0 7>;
- interrupt-map = <
- /* IDSEL 0x0 */
- 0x0000 0 0 1 &mpic 4 1
- 0x0000 0 0 2 &mpic 5 1
- 0x0000 0 0 3 &mpic 6 1
- 0x0000 0 0 4 &mpic 7 1
- >;
+
pcie@0 {
- reg = <0 0 0 0 0>;
- #size-cells = <2>;
- #address-cells = <3>;
- device_type = "pci";
ranges = <0x02000000 0x0 0xe0000000
0x02000000 0x0 0xe0000000
0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
index 70889d8..eeb7c65 100644
--- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
@@ -102,19 +102,46 @@
bus-range = <0x0 0xff>;
clock-frequency = <100000000>;
interrupts = <24 2 0 0>;
- interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
- interrupt-map = <
- 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1
- 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1
- 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1
- 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1
- >;
+ pcie@0 {
+ reg = <0 0 0 0 0>;
+ #interrupt-cells = <1>;
+ #size-cells = <2>;
+ #address-cells = <3>;
+ device_type = "pci";
+ interrupts = <24 2 0 0>;
+ interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+ interrupt-map = <
+ 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0
+ >;
+ };
+};
+
+&pci1 {
+ compatible = "fsl,mpc8641-pcie";
+ device_type = "pci";
+ #size-cells = <2>;
+ #address-cells = <3>;
+ bus-range = <0x0 0xff>;
+ clock-frequency = <100000000>;
+ interrupts = <25 2 0 0>;
pcie@0 {
reg = <0 0 0 0 0>;
+ #interrupt-cells = <1>;
#size-cells = <2>;
#address-cells = <3>;
device_type = "pci";
+ interrupts = <25 2 0 0>;
+ interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+ interrupt-map = <
+ 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0
+ 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0
+ >;
};
};
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
index 9e03328..7c6db6f 100644
--- a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
@@ -25,6 +25,7 @@
serial0 = &serial0;
serial1 = &serial1;
pci0 = &pci0;
+ pci1 = &pci1;
};
cpus {
diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts
index 0a9733c..75870a1 100644
--- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts
+++ b/arch/powerpc/boot/dts/fsl/sbc8641d.dts
@@ -19,10 +19,6 @@
model = "SBC8641D";
compatible = "wind,sbc8641";
- aliases {
- pci1 = &pci1;
- };
-
memory {
device_type = "memory";
reg = <0x00000000 0x20000000>; // 512M at 0x0
@@ -165,30 +161,11 @@
};
pci1: pcie@f8009000 {
- compatible = "fsl,mpc8641-pcie";
- device_type = "pci";
- #size-cells = <2>;
- #address-cells = <3>;
reg = <0xf8009000 0x1000>;
- bus-range = <0 0xff>;
ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>;
- clock-frequency = <100000000>;
- interrupts = <25 2 0 0>;
- interrupt-map-mask = <0xf800 0 0 7>;
- interrupt-map = <
- /* IDSEL 0x0 */
- 0x0000 0 0 1 &mpic 4 1
- 0x0000 0 0 2 &mpic 5 1
- 0x0000 0 0 3 &mpic 6 1
- 0x0000 0 0 4 &mpic 7 1
- >;
pcie@0 {
- reg = <0 0 0 0 0>;
- #size-cells = <2>;
- #address-cells = <3>;
- device_type = "pci";
ranges = <0x02000000 0x0 0xa0000000
0x02000000 0x0 0xa0000000
0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
index 99e421d..6e0b489 100644
--- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -263,7 +263,7 @@
};
rcpm: global-utilities@e2000 {
- compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0";
+ compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1";
reg = <0xe2000 0x1000>;
};
diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
index e0f4da5..507649e 100644
--- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
@@ -472,7 +472,7 @@
};
rcpm: global-utilities@e2000 {
- compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0";
+ compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1";
reg = <0xe2000 0x1000>;
};
diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
index 72691ef..7c4afdb 100644
--- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
@@ -109,7 +109,7 @@
flash@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "micron,n25q512a", "jedec,spi-nor";
+ compatible = "micron,n25q512ax3", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <10000000>; /* input clock */
};
diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
index dc93268..ff87e67 100644
--- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
@@ -113,7 +113,7 @@
flash@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "micron,n25q512a", "jedec,spi-nor";
+ compatible = "micron,n25q512ax3", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <10000000>; /* input clock */
};
diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h
index 264b754..880db13 100644
--- a/arch/powerpc/include/asm/book3s/32/hash.h
+++ b/arch/powerpc/include/asm/book3s/32/hash.h
@@ -39,8 +39,5 @@
#define _PMD_PRESENT_MASK (PAGE_MASK)
#define _PMD_BAD (~PAGE_MASK)
-/* Hash table based platforms need atomic updates of the linux PTE */
-#define PTE_ATOMIC_UPDATES 1
-
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 16f513e..b82e063 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_MMU_HASH32_H_
-#define _ASM_POWERPC_MMU_HASH32_H_
+#ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
+#define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
/*
* 32-bit hash table MMU support
*/
@@ -90,4 +90,4 @@ typedef struct {
#define mmu_virtual_psize MMU_PAGE_4K
#define mmu_linear_psize MMU_PAGE_256M
-#endif /* _ASM_POWERPC_MMU_HASH32_H_ */
+#endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
new file mode 100644
index 0000000..a235019
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -0,0 +1,109 @@
+#ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_32_PGALLOC_H
+
+#include <linux/threads.h>
+
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE 0
+
+extern void __bad_pte(pmd_t *pmd);
+
+extern pgd_t *pgd_alloc(struct mm_struct *mm);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+/*
+ * We don't have any real pmd's, and this code never triggers because
+ * the pgd will always be present..
+ */
+/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */
+#define pmd_free(mm, x) do { } while (0)
+#define __pmd_free_tlb(tlb,x,a) do { } while (0)
+/* #define pgd_populate(mm, pmd, pte) BUG() */
+
+#ifndef CONFIG_BOOKE
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+ pte_t *pte)
+{
+ *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pte_page)
+{
+ *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#else
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+ pte_t *pte)
+{
+ *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pte_page)
+{
+ *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+ pgtable_page_dtor(ptepage);
+ __free_page(ptepage);
+}
+
+static inline void pgtable_free(void *table, unsigned index_size)
+{
+ BUG_ON(index_size); /* 32-bit doesn't use this */
+ free_page((unsigned long)table);
+}
+
+#define check_pgt_cache() do { } while (0)
+
+#ifdef CONFIG_SMP
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+ void *table, int shift)
+{
+ unsigned long pgf = (unsigned long)table;
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= shift;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+static inline void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ pgtable_free(table, shift);
+}
+#else
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+ void *table, int shift)
+{
+ pgtable_free(table, shift);
+}
+#endif
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+ unsigned long address)
+{
+ tlb_flush_pgtable(tlb, address);
+ pgtable_page_dtor(table);
+ pgtable_free_tlb(tlb, page_address(table), 0);
+}
+#endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 5f08a08..1af837c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -5,58 +5,31 @@
* for each page table entry. The PMD and PGD level use a 32b record for
* each entry by assuming that each entry is page aligned.
*/
-#define PTE_INDEX_SIZE 9
-#define PMD_INDEX_SIZE 7
-#define PUD_INDEX_SIZE 9
-#define PGD_INDEX_SIZE 9
+#define H_PTE_INDEX_SIZE 9
+#define H_PMD_INDEX_SIZE 7
+#define H_PUD_INDEX_SIZE 9
+#define H_PGD_INDEX_SIZE 9
#ifndef __ASSEMBLY__
-#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
-#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
-#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
-#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
-#endif /* __ASSEMBLY__ */
-
-#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
-#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
-#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
-
-/* PMD_SHIFT determines what a second-level page table entry can map */
-#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
-#define PMD_SIZE (1UL << PMD_SHIFT)
-#define PMD_MASK (~(PMD_SIZE-1))
+#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
+#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE)
+#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE)
+#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
/* With 4k base page size, hugepage PTEs go at the PMD level */
#define MIN_HUGEPTE_SHIFT PMD_SHIFT
-/* PUD_SHIFT determines what a third-level page table entry can map */
-#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE)
-#define PUD_SIZE (1UL << PUD_SHIFT)
-#define PUD_MASK (~(PUD_SIZE-1))
-
-/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
-#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE)
-#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
-
-/* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS 0
-/* Bits to mask out from a PUD to get to the PMD page */
-#define PUD_MASKED_BITS 0
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS 0
-
/* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
- _PAGE_F_SECOND | _PAGE_F_GIX)
-
-/* shift to put page number into pte */
-#define PTE_RPN_SHIFT (12)
-#define PTE_RPN_SIZE (45) /* gives 57-bit real addresses */
-
-#define _PAGE_4K_PFN 0
-#ifndef __ASSEMBLY__
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
+ H_PAGE_F_SECOND | H_PAGE_F_GIX)
+/*
+ * Not supported by 4k linux page size
+ */
+#define H_PAGE_4K_PFN 0x0
+#define H_PAGE_THP_HUGE 0x0
+#define H_PAGE_COMBO 0x0
+#define H_PTE_FRAG_NR 0
+#define H_PTE_FRAG_SIZE_SHIFT 0
/*
* On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
*/
@@ -64,37 +37,76 @@
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
#ifdef CONFIG_HUGETLB_PAGE
-/*
- * For 4k page size, we support explicit hugepage via hugepd
- */
-static inline int pmd_huge(pmd_t pmd)
+static inline int hash__hugepd_ok(hugepd_t hpd)
+{
+ /*
+ * if it is not a pte and have hugepd shift mask
+ * set, then it is a hugepd directory pointer
+ */
+ if (!(hpd.pd & _PAGE_PTE) &&
+ ((hpd.pd & HUGEPD_SHIFT_MASK) != 0))
+ return true;
+ return false;
+}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+ BUG();
+ return NULL;
+}
+
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
{
+ BUG();
return 0;
}
-static inline int pud_huge(pud_t pud)
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+ int index)
{
+ BUG();
return 0;
}
-static inline int pgd_huge(pgd_t pgd)
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+ unsigned int index, unsigned int hidx)
+{
+ BUG();
+}
+
+static inline int hash__pmd_trans_huge(pmd_t pmd)
{
return 0;
}
-#define pgd_huge pgd_huge
-static inline int hugepd_ok(hugepd_t hpd)
+static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
- /*
- * if it is not a pte and have hugepd shift mask
- * set, then it is a hugepd directory pointer
- */
- if (!(hpd.pd & _PAGE_PTE) &&
- ((hpd.pd & HUGEPD_SHIFT_MASK) != 0))
- return true;
- return false;
+ BUG();
+ return 0;
}
-#define is_hugepd(hpd) (hugepd_ok(hpd))
+
+static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
+{
+ BUG();
+ return pmd;
+}
+
+extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp,
+ unsigned long clr, unsigned long set);
+extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
+extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
+extern int hash__has_transparent_hugepage(void);
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0a7956a..5aae4f5 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -1,73 +1,44 @@
#ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
#define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
-#define PTE_INDEX_SIZE 8
-#define PMD_INDEX_SIZE 5
-#define PUD_INDEX_SIZE 5
-#define PGD_INDEX_SIZE 12
-
-#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
-#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
-#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
+#define H_PTE_INDEX_SIZE 8
+#define H_PMD_INDEX_SIZE 5
+#define H_PUD_INDEX_SIZE 5
+#define H_PGD_INDEX_SIZE 12
/* With 4k base page size, hugepage PTEs go at the PMD level */
#define MIN_HUGEPTE_SHIFT PAGE_SHIFT
-/* PMD_SHIFT determines what a second-level page table entry can map */
-#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
-#define PMD_SIZE (1UL << PMD_SHIFT)
-#define PMD_MASK (~(PMD_SIZE-1))
-
-/* PUD_SHIFT determines what a third-level page table entry can map */
-#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE)
-#define PUD_SIZE (1UL << PUD_SHIFT)
-#define PUD_MASK (~(PUD_SIZE-1))
-
-/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
-#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE)
-#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
-
-#define _PAGE_COMBO 0x00001000 /* this is a combo 4k page */
-#define _PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */
/*
- * Used to track subpage group valid if _PAGE_COMBO is set
- * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
*/
-#define _PAGE_COMBO_VALID (_PAGE_F_GIX | _PAGE_F_SECOND)
+#define H_PAGE_THP_HUGE H_PAGE_4K_PFN
-/* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \
- _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO)
-
-/* Shift to put page number into pte.
- *
- * That gives us a max RPN of 41 bits, which means a max of 57 bits
- * of addressable physical space, or 53 bits for the special 4k PFNs.
+/*
+ * Used to track subpage group valid if H_PAGE_COMBO is set
+ * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
*/
-#define PTE_RPN_SHIFT (16)
-#define PTE_RPN_SIZE (41)
+#define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND)
+/* PTE flags to conserve for HPTE identification */
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
+ H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
/*
* we support 16 fragments per PTE page of 64K size.
*/
-#define PTE_FRAG_NR 16
+#define H_PTE_FRAG_NR 16
/*
* We use a 2K PTE page fragment and another 2K for storing
* real_pte_t hash index
*/
-#define PTE_FRAG_SIZE_SHIFT 12
+#define H_PTE_FRAG_SIZE_SHIFT 12
#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
-/* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS 0xc0000000000000ffUL
-/* Bits to mask out from a PUD to get to the PMD page */
-#define PUD_MASKED_BITS 0xc0000000000000ffUL
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS 0xc0000000000000ffUL
-
#ifndef __ASSEMBLY__
+#include <asm/errno.h>
/*
* With 64K pages on hash table, we have a special PTE format that
@@ -83,9 +54,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
rpte.pte = pte;
rpte.hidx = 0;
- if (pte_val(pte) & _PAGE_COMBO) {
+ if (pte_val(pte) & H_PAGE_COMBO) {
/*
- * Make sure we order the hidx load against the _PAGE_COMBO
+ * Make sure we order the hidx load against the H_PAGE_COMBO
* check. The store side ordering is done in __hash_page_4K
*/
smp_rmb();
@@ -97,9 +68,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
{
- if ((pte_val(rpte.pte) & _PAGE_COMBO))
+ if ((pte_val(rpte.pte) & H_PAGE_COMBO))
return (rpte.hidx >> (index<<2)) & 0xf;
- return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf;
+ return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
}
#define __rpte_to_pte(r) ((r).pte)
@@ -122,79 +93,32 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
#define pte_iterate_hashed_end() } while(0); } } while(0)
#define pte_pagesize_index(mm, addr, pte) \
- (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
-
-#define remap_4k_pfn(vma, addr, pfn, prot) \
- (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL : \
- remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, \
- __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
-
-#define PTE_TABLE_SIZE PTE_FRAG_SIZE
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE))
-#else
-#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
-#endif
-#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
-#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
-
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- *
- * Defined in such a way that we can optimize away code block at build time
- * if CONFIG_HUGETLB_PAGE=n.
- */
-static inline int pmd_huge(pmd_t pmd)
-{
- /*
- * leaf pte for huge page
- */
- return !!(pmd_val(pmd) & _PAGE_PTE);
-}
-
-static inline int pud_huge(pud_t pud)
-{
- /*
- * leaf pte for huge page
- */
- return !!(pud_val(pud) & _PAGE_PTE);
-}
+ (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
-static inline int pgd_huge(pgd_t pgd)
+extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t prot)
{
- /*
- * leaf pte for huge page
- */
- return !!(pgd_val(pgd) & _PAGE_PTE);
+ if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) {
+ WARN(1, "remap_4k_pfn called with wrong pfn value\n");
+ return -EINVAL;
+ }
+ return remap_pfn_range(vma, addr, pfn, PAGE_SIZE,
+ __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN));
}
-#define pgd_huge pgd_huge
-#ifdef CONFIG_DEBUG_VM
-extern int hugepd_ok(hugepd_t hpd);
-#define is_hugepd(hpd) (hugepd_ok(hpd))
+#define H_PTE_TABLE_SIZE PTE_FRAG_SIZE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \
+ (sizeof(unsigned long) << PMD_INDEX_SIZE))
#else
-/*
- * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
- * need to setup hugepage directory for them. Our pte and page directory format
- * enable us to have this enabled.
- */
-static inline int hugepd_ok(hugepd_t hpd)
-{
- return 0;
-}
-#define is_hugepd(pdep) 0
-#endif /* CONFIG_DEBUG_VM */
-
-#endif /* CONFIG_HUGETLB_PAGE */
+#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
+#endif
+#define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
+#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
- unsigned long addr,
- pmd_t *pmdp,
- unsigned long clr,
- unsigned long set);
static inline char *get_hpte_slot_array(pmd_t *pmdp)
{
/*
@@ -253,50 +177,35 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
* that for explicit huge pages.
*
*/
-static inline int pmd_trans_huge(pmd_t pmd)
+static inline int hash__pmd_trans_huge(pmd_t pmd)
{
- return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) ==
- (_PAGE_PTE | _PAGE_THP_HUGE));
+ return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
+ (_PAGE_PTE | H_PAGE_THP_HUGE));
}
-static inline int pmd_large(pmd_t pmd)
+static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
- return !!(pmd_val(pmd) & _PAGE_PTE);
+ return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
{
- return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
-
-#define __HAVE_ARCH_PMD_SAME
-static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
- return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
-}
-
-static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- unsigned long old;
-
- if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
- return 0;
- old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
- return ((old & _PAGE_ACCESSED) != 0);
-}
-
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
-
- if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
- return;
-
- pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
+ return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE));
}
+extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp,
+ unsigned long clr, unsigned long set);
+extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
+extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
+extern int hash__has_transparent_hugepage(void);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index d0ee6fc..f61cad3 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -13,48 +13,12 @@
* We could create separate kernel read-only if we used the 3 PP bits
* combinations that newer processors provide but we currently don't.
*/
-#define _PAGE_BIT_SWAP_TYPE 0
-
-#define _PAGE_EXEC 0x00001 /* execute permission */
-#define _PAGE_RW 0x00002 /* read & write access allowed */
-#define _PAGE_READ 0x00004 /* read access allowed */
-#define _PAGE_USER 0x00008 /* page may be accessed by userspace */
-#define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */
-/* M (memory coherence) is always set in the HPTE, so we don't need it here */
-#define _PAGE_COHERENT 0x0
-#define _PAGE_NO_CACHE 0x00020 /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x00040 /* W: cache write-through */
-#define _PAGE_DIRTY 0x00080 /* C: page changed */
-#define _PAGE_ACCESSED 0x00100 /* R: page referenced */
-#define _PAGE_SPECIAL 0x00400 /* software: special page */
-#define _PAGE_BUSY 0x00800 /* software: PTE & hash are busy */
-
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY 0x200 /* software: software dirty tracking */
-#else
-#define _PAGE_SOFT_DIRTY 0x000
-#endif
-
-#define _PAGE_F_GIX_SHIFT 57
-#define _PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */
-#define _PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */
-#define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */
-#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
- _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY)
-
+#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */
+#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS
+#define H_PAGE_F_GIX_SHIFT 57
+#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */
+#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */
#ifdef CONFIG_PPC_64K_PAGES
#include <asm/book3s/64/hash-64k.h>
@@ -65,29 +29,33 @@
/*
* Size of EA range mapped by our pagetables.
*/
-#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
- PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
-#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
+#define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \
+ H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT)
+#define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1)
+/*
+ * only with hash we need to use the second half of pmd page table
+ * to store pointer to deposited pgtable_t
+ */
+#define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1)
#else
-#define PMD_CACHE_INDEX PMD_INDEX_SIZE
+#define H_PMD_CACHE_INDEX H_PMD_INDEX_SIZE
#endif
/*
* Define the address range of the kernel non-linear virtual area
*/
-#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000)
+#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000)
+#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000)
/*
* The vmalloc space starts at the beginning of that region, and
* occupies half of it on hash CPUs and a quarter of it on Book3E
* (we keep a quarter for the virtual memmap)
*/
-#define VMALLOC_START KERN_VIRT_START
-#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE)
+#define H_VMALLOC_START H_KERN_VIRT_START
+#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1)
+#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE)
/*
* Region IDs
@@ -96,7 +64,7 @@
#define REGION_MASK (0xfUL << REGION_SHIFT)
#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT)
-#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START))
+#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START))
#define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET))
#define VMEMMAP_REGION_ID (0xfUL) /* Server only */
#define USER_REGION_ID (0UL)
@@ -105,381 +73,97 @@
* Defines the address of the vmemap area, in its own region on
* hash table CPUs.
*/
-#define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT)
+#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT)
#ifdef CONFIG_PPC_MM_SLICES
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#endif /* CONFIG_PPC_MM_SLICES */
-/* No separate kernel read-only */
-#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
-#define _PAGE_KERNEL_RO _PAGE_KERNEL_RW
-#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
-
-/* Strong Access Ordering */
-#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT)
-
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE 0
/* PTEIDX nibble */
#define _PTEIDX_SECONDARY 0x8
#define _PTEIDX_GROUP_IX 0x7
-/* Hash table based platforms need atomic updates of the linux PTE */
-#define PTE_ATOMIC_UPDATES 1
-#define _PTE_NONE_MASK _PAGE_HPTEFLAGS
-/*
- * The mask convered by the RPN must be a ULL on 32-bit platforms with
- * 64-bit PTEs
- */
-#define PTE_RPN_MASK (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT)
-/*
- * _PAGE_CHG_MASK masks of bits that are to be preserved across
- * pgprot changes
- */
-#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
- _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY)
-/*
- * Mask of bits returned by pte_pgprot()
- */
-#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
- _PAGE_WRITETHRU | _PAGE_4K_PFN | \
- _PAGE_USER | _PAGE_ACCESSED | \
- _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC | \
- _PAGE_SOFT_DIRTY)
-/*
- * We define 2 sets of base prot bits, one for basic pages (ie,
- * cacheable kernel and user pages) and one for non cacheable
- * pages. We always set _PAGE_COHERENT when SMP is enabled or
- * the processor might need it for DMA coherency.
- */
-#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
-#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
-
-/* Permission masks used to generate the __P and __S table,
- *
- * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
- *
- * Write permissions imply read permissions for now (we could make write-only
- * pages on BookE but we don't bother for now). Execute permission control is
- * possible on platforms that define _PAGE_EXEC
- *
- * Note due to the way vm flags are laid out, the bits are XWR
- */
-#define PAGE_NONE __pgprot(_PAGE_BASE)
-#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
-#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
- _PAGE_EXEC)
-#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_X
-#define __P101 PAGE_READONLY_X
-#define __P110 PAGE_COPY_X
-#define __P111 PAGE_COPY_X
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_X
-#define __S101 PAGE_READONLY_X
-#define __S110 PAGE_SHARED_X
-#define __S111 PAGE_SHARED_X
-
-/* Permission masks used for kernel mappings */
-#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
-#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
- _PAGE_NO_CACHE)
-#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
- _PAGE_NO_CACHE | _PAGE_GUARDED)
-#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
-#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
-#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-
-/* Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
- defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX
-#endif
-
-/* Make modules code happy. We don't set RO yet */
-#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
-#define PAGE_AGP (PAGE_KERNEL_NC)
-
-#define PMD_BAD_BITS (PTE_TABLE_SIZE-1)
-#define PUD_BAD_BITS (PMD_TABLE_SIZE-1)
+#define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1)
+#define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1)
#ifndef __ASSEMBLY__
-#define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS)
-#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS)
-
-#define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS)
-#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS)
-
-/* Pointers in the page table tree are physical addresses */
-#define __pgtable_ptr_val(ptr) __pa(ptr)
-
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
-#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
-#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
-#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
+#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS)
+#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS)
+static inline int hash__pgd_bad(pgd_t pgd)
+{
+ return (pgd_val(pgd) == 0);
+}
extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
/* Atomic PTE updates */
-static inline unsigned long pte_update(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, unsigned long clr,
- unsigned long set,
- int huge)
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long clr,
+ unsigned long set,
+ int huge)
{
- unsigned long old, tmp;
+ __be64 old_be, tmp_be;
+ unsigned long old;
__asm__ __volatile__(
"1: ldarx %0,0,%3 # pte_update\n\
- andi. %1,%0,%6\n\
+ and. %1,%0,%6\n\
bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%7\n\
stdcx. %1,0,%3 \n\
bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*ptep)
- : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
+ : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep)
+ : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
+ "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
/* huge pages use the old page table lock */
if (!huge)
assert_pte_locked(mm, addr);
- if (old & _PAGE_HASHPTE)
+ old = be64_to_cpu(old_be);
+ if (old & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, ptep, old, huge);
return old;
}
-static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- unsigned long old;
-
- if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
- return 0;
- old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
- return (old & _PAGE_ACCESSED) != 0;
-}
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(__vma, __addr, __ptep) \
-({ \
- int __r; \
- __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
- __r; \
-})
-
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
-
- if ((pte_val(*ptep) & _PAGE_RW) == 0)
- return;
-
- pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- if ((pte_val(*ptep) & _PAGE_RW) == 0)
- return;
-
- pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(__vma, __address, __ptep) \
-({ \
- int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \
- __ptep); \
- __young; \
-})
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
- return __pte(old);
-}
-
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t * ptep)
-{
- pte_update(mm, addr, ptep, ~0UL, 0, 0);
-}
-
-
/* Set the dirty and/or accessed bits atomically in a linux PTE, this
* function doesn't need to flush the hash entry
*/
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry)
{
- unsigned long bits = pte_val(entry) &
- (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC |
- _PAGE_SOFT_DIRTY);
+ __be64 old, tmp, val, mask;
- unsigned long old, tmp;
+ mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE |
+ _PAGE_EXEC | _PAGE_SOFT_DIRTY);
+
+ val = pte_raw(entry) & mask;
__asm__ __volatile__(
"1: ldarx %0,0,%4\n\
- andi. %1,%0,%6\n\
+ and. %1,%0,%6\n\
bne- 1b \n\
or %0,%3,%0\n\
stdcx. %0,0,%4\n\
bne- 1b"
:"=&r" (old), "=&r" (tmp), "=m" (*ptep)
- :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
+ :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY))
:"cc");
}
-static inline int pgd_bad(pgd_t pgd)
-{
- return (pgd_val(pgd) == 0);
-}
-
-#define __HAVE_ARCH_PTE_SAME
-#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
-static inline unsigned long pgd_page_vaddr(pgd_t pgd)
-{
- return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS);
-}
-
-
-/* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);}
-static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); }
-static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); }
-static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); }
-static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
-static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
-
-#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-static inline bool pte_soft_dirty(pte_t pte)
-{
- return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
-}
-static inline pte_t pte_mksoft_dirty(pte_t pte)
-{
- return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_clear_soft_dirty(pte_t pte)
-{
- return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
-}
-#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
-
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * These work without NUMA balancing but the kernel does not care. See the
- * comment in include/asm-generic/pgtable.h . On powerpc, this will only
- * work for user pages and always return true for kernel pages.
- */
-static inline int pte_protnone(pte_t pte)
-{
- return (pte_val(pte) &
- (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
-static inline int pte_present(pte_t pte)
-{
- return !!(pte_val(pte) & _PAGE_PRESENT);
-}
-
-/* Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * Even if PTEs can be unsigned long long, a PFN is always an unsigned
- * long for now.
- */
-static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
-{
- return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) |
- pgprot_val(pgprot));
-}
-
-static inline unsigned long pte_pfn(pte_t pte)
-{
- return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT;
-}
-
-/* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte)
-{
- return __pte(pte_val(pte) & ~_PAGE_RW);
-}
-
-static inline pte_t pte_mkclean(pte_t pte)
-{
- return __pte(pte_val(pte) & ~_PAGE_DIRTY);
-}
-
-static inline pte_t pte_mkold(pte_t pte)
-{
- return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkwrite(pte_t pte)
-{
- return __pte(pte_val(pte) | _PAGE_RW);
-}
-
-static inline pte_t pte_mkdirty(pte_t pte)
+static inline int hash__pte_same(pte_t pte_a, pte_t pte_b)
{
- return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
}
-static inline pte_t pte_mkyoung(pte_t pte)
+static inline int hash__pte_none(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- return __pte(pte_val(pte) | _PAGE_SPECIAL);
-}
-
-static inline pte_t pte_mkhuge(pte_t pte)
-{
- return pte;
-}
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
- return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+ return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0;
}
/* This low level function performs the actual PTE insertion
@@ -487,8 +171,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
* an horrible mess that I'm not going to try to clean up now but
* I'm keeping it in one place rather than spread around
*/
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte, int percpu)
+static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, int percpu)
{
/*
* Anything else just stores the PTE normally. That covers all 64-bit
@@ -497,53 +181,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
*ptep = pte;
}
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-
-#define _PAGE_CACHE_CTL (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
- _PAGE_WRITETHRU)
-
-#define pgprot_noncached pgprot_noncached
-static inline pgprot_t pgprot_noncached(pgprot_t prot)
-{
- return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
- _PAGE_NO_CACHE | _PAGE_GUARDED);
-}
-
-#define pgprot_noncached_wc pgprot_noncached_wc
-static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
-{
- return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
- _PAGE_NO_CACHE);
-}
-
-#define pgprot_cached pgprot_cached
-static inline pgprot_t pgprot_cached(pgprot_t prot)
-{
- return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
- _PAGE_COHERENT);
-}
-
-#define pgprot_cached_wthru pgprot_cached_wthru
-static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
-{
- return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
- _PAGE_COHERENT | _PAGE_WRITETHRU);
-}
-
-#define pgprot_cached_noncoherent pgprot_cached_noncoherent
-static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
-{
- return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
-}
-
-#define pgprot_writecombine pgprot_writecombine
-static inline pgprot_t pgprot_writecombine(pgprot_t prot)
-{
- return pgprot_noncached_wc(prot);
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, unsigned long old_pmd);
@@ -556,6 +193,14 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+extern int hash__map_kernel_page(unsigned long ea, unsigned long pa,
+ unsigned long flags);
+extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys);
+extern void hash__vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size);
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
new file mode 100644
index 0000000..60f4764
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+/*
+ * For radix we want generic code to handle hugetlb. But then if we want
+ * both hash and radix to be enabled together we need to workaround the
+ * limitations.
+ */
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags);
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 0cea480..290157e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_MMU_HASH64_H_
-#define _ASM_POWERPC_MMU_HASH64_H_
+#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
+#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
/*
* PowerPC64 memory management structures
*
@@ -78,6 +78,10 @@
#define HPTE_V_SECONDARY ASM_CONST(0x0000000000000002)
#define HPTE_V_VALID ASM_CONST(0x0000000000000001)
+/*
+ * ISA 3.0 have a different HPTE format.
+ */
+#define HPTE_R_3_0_SSIZE_SHIFT 58
#define HPTE_R_PP0 ASM_CONST(0x8000000000000000)
#define HPTE_R_TS ASM_CONST(0x4000000000000000)
#define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000)
@@ -115,6 +119,7 @@
#define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */
#define POWER8_TLB_SETS 512 /* # sets in POWER8 TLB */
#define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */
+#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
#ifndef __ASSEMBLY__
@@ -127,24 +132,6 @@ extern struct hash_pte *htab_address;
extern unsigned long htab_size_bytes;
extern unsigned long htab_hash_mask;
-/*
- * Page size definition
- *
- * shift : is the "PAGE_SHIFT" value for that page size
- * sllp : is a bit mask with the value of SLB L || LP to be or'ed
- * directly to a slbmte "vsid" value
- * penc : is the HPTE encoding mask for the "LP" field:
- *
- */
-struct mmu_psize_def
-{
- unsigned int shift; /* number of bits */
- int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
- unsigned int tlbiel; /* tlbiel supported for that page size */
- unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
- unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
-};
-extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
static inline int shift_to_mmu_psize(unsigned int shift)
{
@@ -210,11 +197,6 @@ static inline int segment_shift(int ssize)
/*
* The current system page and segment sizes
*/
-extern int mmu_linear_psize;
-extern int mmu_virtual_psize;
-extern int mmu_vmalloc_psize;
-extern int mmu_vmemmap_psize;
-extern int mmu_io_psize;
extern int mmu_kernel_ssize;
extern int mmu_highuser_ssize;
extern u16 mmu_slb_size;
@@ -247,7 +229,8 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
*/
v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
v <<= HPTE_V_AVPN_SHIFT;
- v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
return v;
}
@@ -271,8 +254,12 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
* aligned for the requested page size
*/
static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
- int actual_psize)
+ int actual_psize, int ssize)
{
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
+
/* A 4K page needs no special encoding */
if (actual_psize == MMU_PAGE_4K)
return pa & HPTE_R_RPN;
@@ -476,7 +463,7 @@ extern void slb_set_size(u16 size);
add rt,rt,rx
/* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE (PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41)
#ifndef __ASSEMBLY__
@@ -512,38 +499,6 @@ static inline void subpage_prot_free(struct mm_struct *mm) {}
static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
#endif /* CONFIG_PPC_SUBPAGE_PROT */
-typedef unsigned long mm_context_id_t;
-struct spinlock;
-
-typedef struct {
- mm_context_id_t id;
- u16 user_psize; /* page size index */
-
-#ifdef CONFIG_PPC_MM_SLICES
- u64 low_slices_psize; /* SLB page size encodings */
- unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-#else
- u16 sllp; /* SLB page size encoding */
-#endif
- unsigned long vdso_base;
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- struct subpage_prot_table spt;
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
-#ifdef CONFIG_PPC_ICSWX
- struct spinlock *cop_lockp; /* guard acop and cop_pid */
- unsigned long acop; /* mask of enabled coprocessor types */
- unsigned int cop_pid; /* pid value used with coprocessors */
-#endif /* CONFIG_PPC_ICSWX */
-#ifdef CONFIG_PPC_64K_PAGES
- /* for 4K PTE fragment support */
- void *pte_frag;
-#endif
-#ifdef CONFIG_SPAPR_TCE_IOMMU
- struct list_head iommu_group_mem_list;
-#endif
-} mm_context_t;
-
-
#if 0
/*
* The code below is equivalent to this function for arguments
@@ -579,7 +534,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
/*
* Bad address. We return VSID 0 for that
*/
- if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+ if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
return 0;
if (ssize == MMU_SEGSIZE_256M)
@@ -613,4 +568,4 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size);
#endif /* __ASSEMBLY__ */
-#endif /* _ASM_POWERPC_MMU_HASH64_H_ */
+#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
new file mode 100644
index 0000000..5854263
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_
+#define _ASM_POWERPC_BOOK3S_64_MMU_H_
+
+#ifndef __ASSEMBLY__
+/*
+ * Page size definition
+ *
+ * shift : is the "PAGE_SHIFT" value for that page size
+ * sllp : is a bit mask with the value of SLB L || LP to be or'ed
+ * directly to a slbmte "vsid" value
+ * penc : is the HPTE encoding mask for the "LP" field:
+ *
+ */
+struct mmu_psize_def {
+ unsigned int shift; /* number of bits */
+ int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
+ unsigned int tlbiel; /* tlbiel supported for that page size */
+ unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
+ union {
+ unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
+ unsigned long ap; /* Ap encoding used by PowerISA 3.0 */
+ };
+};
+extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+
+#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX)
+
+#endif /* __ASSEMBLY__ */
+
+/* 64-bit classic hash table MMU */
+#include <asm/book3s/64/mmu-hash.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * ISA 3.0 partiton and process table entry format
+ */
+struct prtb_entry {
+ __be64 prtb0;
+ __be64 prtb1;
+};
+extern struct prtb_entry *process_tb;
+
+struct patb_entry {
+ __be64 patb0;
+ __be64 patb1;
+};
+extern struct patb_entry *partition_tb;
+
+#define PATB_HR (1UL << 63)
+#define PATB_GR (1UL << 63)
+#define RPDB_MASK 0x0ffffffffffff00fUL
+#define RPDB_SHIFT (1UL << 8)
+/*
+ * Limit process table to PAGE_SIZE table. This
+ * also limit the max pid we can support.
+ * MAX_USER_CONTEXT * 16 bytes of space.
+ */
+#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4)
+/*
+ * Power9 currently only support 64K partition table size.
+ */
+#define PATB_SIZE_SHIFT 16
+
+typedef unsigned long mm_context_id_t;
+struct spinlock;
+
+typedef struct {
+ mm_context_id_t id;
+ u16 user_psize; /* page size index */
+
+#ifdef CONFIG_PPC_MM_SLICES
+ u64 low_slices_psize; /* SLB page size encodings */
+ unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+#else
+ u16 sllp; /* SLB page size encoding */
+#endif
+ unsigned long vdso_base;
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ struct subpage_prot_table spt;
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
+#ifdef CONFIG_PPC_ICSWX
+ struct spinlock *cop_lockp; /* guard acop and cop_pid */
+ unsigned long acop; /* mask of enabled coprocessor types */
+ unsigned int cop_pid; /* pid value used with coprocessors */
+#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+ /* for 4K PTE fragment support */
+ void *pte_frag;
+#endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ struct list_head iommu_group_mem_list;
+#endif
+} mm_context_t;
+
+/*
+ * The current system page and segment sizes
+ */
+extern int mmu_linear_psize;
+extern int mmu_virtual_psize;
+extern int mmu_vmalloc_psize;
+extern int mmu_vmemmap_psize;
+extern int mmu_io_psize;
+
+/* MMU initialization */
+extern void radix_init_native(void);
+extern void hash__early_init_mmu(void);
+extern void radix__early_init_mmu(void);
+static inline void early_init_mmu(void)
+{
+ if (radix_enabled())
+ return radix__early_init_mmu();
+ return hash__early_init_mmu();
+}
+extern void hash__early_init_mmu_secondary(void);
+extern void radix__early_init_mmu_secondary(void);
+static inline void early_init_mmu_secondary(void)
+{
+ if (radix_enabled())
+ return radix__early_init_mmu_secondary();
+ return hash__early_init_mmu_secondary();
+}
+
+extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size);
+extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size);
+static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ if (radix_enabled())
+ return radix__setup_initial_memory_limit(first_memblock_base,
+ first_memblock_size);
+ return hash__setup_initial_memory_limit(first_memblock_base,
+ first_memblock_size);
+}
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
new file mode 100644
index 0000000..488279e
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -0,0 +1,207 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+
+struct vmemmap_backing {
+ struct vmemmap_backing *list;
+ unsigned long phys;
+ unsigned long virt_addr;
+};
+extern struct vmemmap_backing *vmemmap_list;
+
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation. For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer. In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value. This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf
+
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({ \
+ BUG_ON(!(shift)); \
+ pgtable_cache[(shift) - 1]; \
+ })
+
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int);
+extern void pte_fragment_free(unsigned long *, int);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
+#ifdef CONFIG_SMP
+extern void __tlb_remove_table(void *_table);
+#endif
+
+static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_64K_PAGES
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#else
+ struct page *page;
+ page = alloc_pages(PGALLOC_GFP, 4);
+ if (!page)
+ return NULL;
+ return (pgd_t *) page_address(page);
+#endif
+}
+
+static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_PPC_64K_PAGES
+ free_page((unsigned long)pgd);
+#else
+ free_pages((unsigned long)pgd, 4);
+#endif
+}
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ if (radix_enabled())
+ return radix__pgd_alloc(mm);
+ return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ if (radix_enabled())
+ return radix__pgd_free(mm, pgd);
+ kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS);
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+ GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS);
+}
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long address)
+{
+ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE);
+}
+
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
+ GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long address)
+{
+ return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+ pte_t *pte)
+{
+ pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ pgtable_t pte_page)
+{
+ pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
+}
+
+static inline pgtable_t pmd_pgtable(pmd_t pmd)
+{
+ return (pgtable_t)pmd_page_vaddr(pmd);
+}
+
+#ifdef CONFIG_PPC_4K_PAGES
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+ unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct page *page;
+ pte_t *pte;
+
+ pte = pte_alloc_one_kernel(mm, address);
+ if (!pte)
+ return NULL;
+ page = virt_to_page(pte);
+ if (!pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ return pte;
+}
+#else /* if CONFIG_PPC_64K_PAGES */
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+ unsigned long address)
+{
+ return (pte_t *)pte_fragment_alloc(mm, address, 1);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+ unsigned long address)
+{
+ return (pgtable_t)pte_fragment_alloc(mm, address, 0);
+}
+#endif
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ pte_fragment_free((unsigned long *)pte, 1);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+ pte_fragment_free((unsigned long *)ptepage, 0);
+}
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+ unsigned long address)
+{
+ tlb_flush_pgtable(tlb, address);
+ pgtable_free_tlb(tlb, table, 0);
+}
+
+#define check_pgt_cache() do { } while (0)
+
+#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
new file mode 100644
index 0000000..71e9abc
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
+#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
+/*
+ * hash 4k can't share hugetlb and also doesn't support THP
+ */
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_HUGETLB_PAGE
+static inline int pmd_huge(pmd_t pmd)
+{
+ /*
+ * leaf pte for huge page
+ */
+ if (radix_enabled())
+ return !!(pmd_val(pmd) & _PAGE_PTE);
+ return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+ /*
+ * leaf pte for huge page
+ */
+ if (radix_enabled())
+ return !!(pud_val(pud) & _PAGE_PTE);
+ return 0;
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page
+ */
+ if (radix_enabled())
+ return !!(pgd_val(pgd) & _PAGE_PTE);
+ return 0;
+}
+#define pgd_huge pgd_huge
+/*
+ * With radix , we have hugepage ptes in the pud and pmd entries. We don't
+ * need to setup hugepage directory for them. Our pte and page directory format
+ * enable us to have this enabled.
+ */
+static inline int hugepd_ok(hugepd_t hpd)
+{
+ if (radix_enabled())
+ return 0;
+ return hash__hugepd_ok(hpd);
+}
+#define is_hugepd(hpd) (hugepd_ok(hpd))
+#endif /* CONFIG_HUGETLB_PAGE */
+#endif /* __ASSEMBLY__ */
+
+#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
new file mode 100644
index 0000000..cb2d0a5
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -0,0 +1,64 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
+#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ *
+ * Defined in such a way that we can optimize away code block at build time
+ * if CONFIG_HUGETLB_PAGE=n.
+ */
+static inline int pmd_huge(pmd_t pmd)
+{
+ /*
+ * leaf pte for huge page
+ */
+ return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline int pud_huge(pud_t pud)
+{
+ /*
+ * leaf pte for huge page
+ */
+ return !!(pud_val(pud) & _PAGE_PTE);
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page
+ */
+ return !!(pgd_val(pgd) & _PAGE_PTE);
+}
+#define pgd_huge pgd_huge
+
+#ifdef CONFIG_DEBUG_VM
+extern int hugepd_ok(hugepd_t hpd);
+#define is_hugepd(hpd) (hugepd_ok(hpd))
+#else
+/*
+ * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
+ * need to setup hugepage directory for them. Our pte and page directory format
+ * enable us to have this enabled.
+ */
+static inline int hugepd_ok(hugepd_t hpd)
+{
+ return 0;
+}
+#define is_hugepd(pdep) 0
+#endif /* CONFIG_DEBUG_VM */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t prot)
+{
+ if (radix_enabled())
+ BUG();
+ return hash__remap_4k_pfn(vma, addr, pfn, prot);
+}
+#endif /* __ASSEMBLY__ */
+#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8fe6f6b4..88a5eca 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1,13 +1,247 @@
#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
#define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
+
+/*
+ * Common bits between hash and Radix page table
+ */
+#define _PAGE_BIT_SWAP_TYPE 0
+
+#define _PAGE_EXEC 0x00001 /* execute permission */
+#define _PAGE_WRITE 0x00002 /* write access allowed */
+#define _PAGE_READ 0x00004 /* read access allowed */
+#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE)
+#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
+#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */
+#define _PAGE_SAO 0x00010 /* Strong access order */
+#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */
+#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */
+#define _PAGE_DIRTY 0x00080 /* C: page changed */
+#define _PAGE_ACCESSED 0x00100 /* R: page referenced */
/*
- * This file contains the functions and defines necessary to modify and use
- * the ppc64 hashed page table.
+ * Software bits
*/
+#define _RPAGE_SW0 0x2000000000000000UL
+#define _RPAGE_SW1 0x00800
+#define _RPAGE_SW2 0x00400
+#define _RPAGE_SW3 0x00200
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */
+#else
+#define _PAGE_SOFT_DIRTY 0x00000
+#endif
+#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
+
+
+#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */
+/*
+ * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
+ * Instead of fixing all of them, add an alternate define which
+ * maps CI pte mapping.
+ */
+#define _PAGE_NO_CACHE _PAGE_TOLERANT
+/*
+ * We support 57 bit real address in pte. Clear everything above 57, and
+ * every thing below PAGE_SHIFT;
+ */
+#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK))
+/*
+ * set of bits not changed in pmd_modify. Even though we have hash specific bits
+ * in here, on radix we expect them to be zero.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \
+ _PAGE_SOFT_DIRTY)
+/*
+ * user access blocked by key
+ */
+#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY)
+#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ)
+#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \
+ _PAGE_RW | _PAGE_EXEC)
+/*
+ * No page size encoding in the linux PTE
+ */
+#define _PAGE_PSIZE 0
+/*
+ * _PAGE_CHG_MASK masks of bits that are to be preserved across
+ * pgprot changes
+ */
+#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
+ _PAGE_SOFT_DIRTY)
+/*
+ * Mask of bits returned by pte_pgprot()
+ */
+#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
+ H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
+ _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \
+ _PAGE_SOFT_DIRTY)
+/*
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
+ */
+#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE (_PAGE_BASE_NC)
+
+/* Permission masks used to generate the __P and __S table,
+ *
+ * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
+ *
+ * Write permissions imply read permissions for now (we could make write-only
+ * pages on BookE but we don't bother for now). Execute permission control is
+ * possible on platforms that define _PAGE_EXEC
+ *
+ * Note due to the way vm flags are laid out, the bits are XWR
+ */
+#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED)
+#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW)
+#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC)
+#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ)
+#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
+#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ)
+#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
+
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_X
+#define __P101 PAGE_READONLY_X
+#define __P110 PAGE_COPY_X
+#define __P111 PAGE_COPY_X
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_X
+#define __S101 PAGE_READONLY_X
+#define __S110 PAGE_SHARED_X
+#define __S111 PAGE_SHARED_X
+
+/* Permission masks used for kernel mappings */
+#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
+#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+ _PAGE_TOLERANT)
+#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+ _PAGE_NON_IDEMPOTENT)
+#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
+#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
+#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
+
+/*
+ * Protection used for kernel text. We want the debuggers to be able to
+ * set breakpoints anywhere, so don't write protect the kernel text
+ * on platforms where such control is possible.
+ */
+#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \
+ defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
+#define PAGE_KERNEL_TEXT PAGE_KERNEL_X
+#else
+#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX
+#endif
+
+/* Make modules code happy. We don't set RO yet */
+#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
+#define PAGE_AGP (PAGE_KERNEL_NC)
+
+#ifndef __ASSEMBLY__
+/*
+ * page table defines
+ */
+extern unsigned long __pte_index_size;
+extern unsigned long __pmd_index_size;
+extern unsigned long __pud_index_size;
+extern unsigned long __pgd_index_size;
+extern unsigned long __pmd_cache_index;
+#define PTE_INDEX_SIZE __pte_index_size
+#define PMD_INDEX_SIZE __pmd_index_size
+#define PUD_INDEX_SIZE __pud_index_size
+#define PGD_INDEX_SIZE __pgd_index_size
+#define PMD_CACHE_INDEX __pmd_cache_index
+/*
+ * Because of use of pte fragments and THP, size of page table
+ * are not always derived out of index size above.
+ */
+extern unsigned long __pte_table_size;
+extern unsigned long __pmd_table_size;
+extern unsigned long __pud_table_size;
+extern unsigned long __pgd_table_size;
+#define PTE_TABLE_SIZE __pte_table_size
+#define PMD_TABLE_SIZE __pmd_table_size
+#define PUD_TABLE_SIZE __pud_table_size
+#define PGD_TABLE_SIZE __pgd_table_size
+
+extern unsigned long __pmd_val_bits;
+extern unsigned long __pud_val_bits;
+extern unsigned long __pgd_val_bits;
+#define PMD_VAL_BITS __pmd_val_bits
+#define PUD_VAL_BITS __pud_val_bits
+#define PGD_VAL_BITS __pgd_val_bits
+
+extern unsigned long __pte_frag_nr;
+#define PTE_FRAG_NR __pte_frag_nr
+extern unsigned long __pte_frag_size_shift;
+#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
+#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
+/*
+ * Pgtable size used by swapper, init in asm code
+ */
+#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
+
+#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
+#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE (1UL << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE-1))
+
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE (1UL << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE)
+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE-1))
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS 0xc0000000000000ffUL
+/* Bits to mask out from a PUD to get to the PMD page */
+#define PUD_MASKED_BITS 0xc0000000000000ffUL
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS 0xc0000000000000ffUL
+
+extern unsigned long __vmalloc_start;
+extern unsigned long __vmalloc_end;
+#define VMALLOC_START __vmalloc_start
+#define VMALLOC_END __vmalloc_end
+
+extern unsigned long __kernel_virt_start;
+extern unsigned long __kernel_virt_size;
+#define KERN_VIRT_START __kernel_virt_start
+#define KERN_VIRT_SIZE __kernel_virt_size
+extern struct page *vmemmap;
+extern unsigned long ioremap_bot;
+#endif /* __ASSEMBLY__ */
#include <asm/book3s/64/hash.h>
-#include <asm/barrier.h>
+#include <asm/book3s/64/radix.h>
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/book3s/64/pgtable-64k.h>
+#else
+#include <asm/book3s/64/pgtable-4k.h>
+#endif
+
+#include <asm/barrier.h>
/*
* The second half of the kernel virtual space is used for IO mappings,
* it's itself carved into the PIO region (ISA and PHB IO space) and
@@ -26,8 +260,6 @@
#define IOREMAP_BASE (PHB_IO_END)
#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE)
-#define vmemmap ((struct page *)VMEMMAP_BASE)
-
/* Advertise special mapping type for AGP */
#define HAVE_PAGE_AGP
@@ -45,7 +277,7 @@
#define __real_pte(e,p) ((real_pte_t){(e)})
#define __rpte_to_pte(r) ((r).pte)
-#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT)
+#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT)
#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \
do { \
@@ -62,6 +294,327 @@
#endif /* __real_pte */
+static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long clr,
+ unsigned long set, int huge)
+{
+ if (radix_enabled())
+ return radix__pte_update(mm, addr, ptep, clr, set, huge);
+ return hash__pte_update(mm, addr, ptep, clr, set, huge);
+}
+/*
+ * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update.
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same
+ * function for both hash and radix.
+ */
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long old;
+
+ if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+ return 0;
+ old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+ return (old & _PAGE_ACCESSED) != 0;
+}
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(__vma, __addr, __ptep) \
+({ \
+ int __r; \
+ __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
+ __r; \
+})
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+
+ if ((pte_val(*ptep) & _PAGE_WRITE) == 0)
+ return;
+
+ pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ if ((pte_val(*ptep) & _PAGE_WRITE) == 0)
+ return;
+
+ pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
+ return __pte(old);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t * ptep)
+{
+ pte_update(mm, addr, ptep, ~0UL, 0, 0);
+}
+static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);}
+static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); }
+static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); }
+static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_soft_dirty(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
+}
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) ==
+ (_PAGE_PRESENT | _PAGE_PRIVILEGED);
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pte_present(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_PRESENT);
+}
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+ return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) |
+ pgprot_val(pgprot));
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+ return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT;
+}
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_WRITE);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ /*
+ * write implies read, hence set both
+ */
+ return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ /* FIXME!! check whether this need to be a conditional */
+ return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
+
+static inline bool pte_user(pte_t pte)
+{
+ return !(pte_val(pte) & _PAGE_PRIVILEGED);
+}
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() do { \
+ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
+ /* \
+ * Don't have overlapping bits with _PAGE_HPTEFLAGS \
+ * We filter HPTEFLAGS on set_pte. \
+ */ \
+ BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
+ } while (0)
+/*
+ * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
+ */
+#define SWP_TYPE_BITS 5
+#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
+ & ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << _PAGE_BIT_SWAP_TYPE) \
+ | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
+/*
+ * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
+ * swap type and offset we get from swap and convert that to pte to find a
+ * matching pte in linux page table.
+ * Clear bits not found in swap entries here.
+ */
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
+#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE)
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
+#else
+#define _PAGE_SWP_SOFT_DIRTY 0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+static inline bool pte_swp_soft_dirty(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+}
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
+static inline bool check_pte_access(unsigned long access, unsigned long ptev)
+{
+ /*
+ * This check for _PAGE_RWX and _PAGE_PRESENT bits
+ */
+ if (access & ~ptev)
+ return false;
+ /*
+ * This check for access to privilege space
+ */
+ if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED))
+ return false;
+
+ return true;
+}
+/*
+ * Generic functions with hash/radix callbacks
+ */
+
+static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+ if (radix_enabled())
+ return radix__ptep_set_access_flags(ptep, entry);
+ return hash__ptep_set_access_flags(ptep, entry);
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t pte_a, pte_t pte_b)
+{
+ if (radix_enabled())
+ return radix__pte_same(pte_a, pte_b);
+ return hash__pte_same(pte_a, pte_b);
+}
+
+static inline int pte_none(pte_t pte)
+{
+ if (radix_enabled())
+ return radix__pte_none(pte);
+ return hash__pte_none(pte);
+}
+
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, int percpu)
+{
+ if (radix_enabled())
+ return radix__set_pte_at(mm, addr, ptep, pte, percpu);
+ return hash__set_pte_at(mm, addr, ptep, pte, percpu);
+}
+
+#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
+
+#define pgprot_noncached pgprot_noncached
+static inline pgprot_t pgprot_noncached(pgprot_t prot)
+{
+ return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+ _PAGE_NON_IDEMPOTENT);
+}
+
+#define pgprot_noncached_wc pgprot_noncached_wc
+static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
+{
+ return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+ _PAGE_TOLERANT);
+}
+
+#define pgprot_cached pgprot_cached
+static inline pgprot_t pgprot_cached(pgprot_t prot)
+{
+ return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL));
+}
+
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ return pgprot_noncached_wc(prot);
+}
+/*
+ * check a pte mapping have cache inhibited property
+ */
+static inline bool pte_ci(pte_t pte)
+{
+ unsigned long pte_v = pte_val(pte);
+
+ if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) ||
+ ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT))
+ return true;
+ return false;
+}
+
static inline void pmd_set(pmd_t *pmdp, unsigned long val)
{
*pmdp = __pmd(val);
@@ -75,6 +628,13 @@ static inline void pmd_clear(pmd_t *pmdp)
#define pmd_none(pmd) (!pmd_val(pmd))
#define pmd_present(pmd) (!pmd_none(pmd))
+static inline int pmd_bad(pmd_t pmd)
+{
+ if (radix_enabled())
+ return radix__pmd_bad(pmd);
+ return hash__pmd_bad(pmd);
+}
+
static inline void pud_set(pud_t *pudp, unsigned long val)
{
*pudp = __pud(val);
@@ -100,6 +660,15 @@ static inline pud_t pte_pud(pte_t pte)
return __pud(pte_val(pte));
}
#define pud_write(pud) pte_write(pud_pte(pud))
+
+static inline int pud_bad(pud_t pud)
+{
+ if (radix_enabled())
+ return radix__pud_bad(pud);
+ return hash__pud_bad(pud);
+}
+
+
#define pgd_write(pgd) pte_write(pgd_pte(pgd))
static inline void pgd_set(pgd_t *pgdp, unsigned long val)
{
@@ -124,8 +693,27 @@ static inline pgd_t pte_pgd(pte_t pte)
return __pgd(pte_val(pte));
}
+static inline int pgd_bad(pgd_t pgd)
+{
+ if (radix_enabled())
+ return radix__pgd_bad(pgd);
+ return hash__pgd_bad(pgd);
+}
+
extern struct page *pgd_page(pgd_t pgd);
+/* Pointers in the page table tree are physical addresses */
+#define __pgtable_ptr_val(ptr) __pa(ptr)
+
+#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS)
+#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS)
+#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS)
+
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
+#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
+#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
+
/*
* Find an entry in a page-table-directory. We combine the address region
* (the high order N bits) and the pgd portion of the address.
@@ -156,74 +744,42 @@ extern struct page *pgd_page(pgd_t pgd);
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
-/* Encode and de-code a swap entry */
-#define MAX_SWAPFILES_CHECK() do { \
- BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
- /* \
- * Don't have overlapping bits with _PAGE_HPTEFLAGS \
- * We filter HPTEFLAGS on set_pte. \
- */ \
- BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
- BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
- } while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
-#define SWP_TYPE_BITS 5
-#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
- & ((1UL << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT)
-#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << _PAGE_BIT_SWAP_TYPE) \
- | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)})
-/*
- * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
- * swap type and offset we get from swap and convert that to pte to find a
- * matching pte in linux page table.
- * Clear bits not found in swap entries here.
- */
-#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
-#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE)
-
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
-#else
-#define _PAGE_SWP_SOFT_DIRTY 0UL
-#endif /* CONFIG_MEM_SOFT_DIRTY */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
+void pgtable_cache_init(void);
-#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+static inline int map_kernel_page(unsigned long ea, unsigned long pa,
+ unsigned long flags)
{
- return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+ if (radix_enabled()) {
+#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM)
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift;
+ WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE");
+#endif
+ return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE);
+ }
+ return hash__map_kernel_page(ea, pa, flags);
}
-static inline bool pte_swp_soft_dirty(pte_t pte)
+
+static inline int __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
{
- return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+ if (radix_enabled())
+ return radix__vmemmap_create_mapping(start, page_size, phys);
+ return hash__vmemmap_create_mapping(start, page_size, phys);
}
-static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline void vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
{
- return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+ if (radix_enabled())
+ return radix__vmemmap_remove_mapping(start, page_size);
+ return hash__vmemmap_remove_mapping(start, page_size);
}
-#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
-
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
-void pgtable_cache_init(void);
-
+#endif
struct page *realmode_pfn_to_page(unsigned long pfn);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
-extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
-extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
-extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd);
-#define has_transparent_hugepage has_transparent_hugepage
-extern int has_transparent_hugepage(void);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-
static inline pte_t pmd_pte(pmd_t pmd)
{
return __pte(pmd_val(pmd));
@@ -238,7 +794,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
{
return (pte_t *)pmd;
}
-
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
@@ -265,9 +820,87 @@ static inline int pmd_protnone(pmd_t pmd)
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd);
+extern int hash__has_transparent_hugepage(void);
+static inline int has_transparent_hugepage(void)
+{
+ if (radix_enabled())
+ return radix__has_transparent_hugepage();
+ return hash__has_transparent_hugepage();
+}
+#define has_transparent_hugepage has_transparent_hugepage
+
+static inline unsigned long
+pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
+ unsigned long clr, unsigned long set)
+{
+ if (radix_enabled())
+ return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set);
+ return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
+}
+/*
+ * For radix we should always find H_PAGE_HASHPTE zero. Hence
+ * the below will work for radix too
+ */
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ unsigned long old;
+
+ if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+ return 0;
+ old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
+ return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+
+ if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0)
+ return;
+
+ pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ if (radix_enabled())
+ return radix__pmd_trans_huge(pmd);
+ return hash__pmd_trans_huge(pmd);
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ if (radix_enabled())
+ return radix__pmd_same(pmd_a, pmd_b);
+ return hash__pmd_same(pmd_a, pmd_b);
+}
+
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
- return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE));
+ if (radix_enabled())
+ return radix__pmd_mkhuge(pmd);
+ return hash__pmd_mkhuge(pmd);
}
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
@@ -278,37 +911,63 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ if (radix_enabled())
+ return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+ return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
+}
-extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ if (radix_enabled())
+ return radix__pmdp_collapse_flush(vma, address, pmdp);
+ return hash__pmdp_collapse_flush(vma, address, pmdp);
+}
#define pmdp_collapse_flush pmdp_collapse_flush
#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable);
+static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
+ pmd_t *pmdp, pgtable_t pgtable)
+{
+ if (radix_enabled())
+ return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+ return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+}
+
#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
+ pmd_t *pmdp)
+{
+ if (radix_enabled())
+ return radix__pgtable_trans_huge_withdraw(mm, pmdp);
+ return hash__pgtable_trans_huge_withdraw(mm, pmdp);
+}
#define __HAVE_ARCH_PMDP_INVALIDATE
extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-extern void pmdp_huge_split_prepare(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
+static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ if (radix_enabled())
+ return radix__pmdp_huge_split_prepare(vma, address, pmdp);
+ return hash__pmdp_huge_split_prepare(vma, address, pmdp);
+}
#define pmd_move_must_withdraw pmd_move_must_withdraw
struct spinlock;
static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
struct spinlock *old_pmd_ptl)
{
+ if (radix_enabled())
+ return false;
/*
* Archs like ppc64 use pgtable to store per pmd
* specific information. So when we switch the pmd,
@@ -316,5 +975,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
*/
return true;
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h
new file mode 100644
index 0000000..7c3b1fe
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H
+#define _ASM_POWERPC_PGTABLE_RADIX_4K_H
+
+/*
+ * For 4K page size supported index is 13/9/9/9
+ */
+#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */
+#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */
+#define RADIX_PUD_INDEX_SIZE 9
+#define RADIX_PGD_INDEX_SIZE 13
+
+#endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h
new file mode 100644
index 0000000..82dc355
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H
+#define _ASM_POWERPC_PGTABLE_RADIX_64K_H
+
+/*
+ * For 64K page size supported index is 13/9/9/5
+ */
+#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */
+#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */
+#define RADIX_PUD_INDEX_SIZE 9
+#define RADIX_PGD_INDEX_SIZE 13
+
+#endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
new file mode 100644
index 0000000..937d4e2
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -0,0 +1,232 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_H
+#define _ASM_POWERPC_PGTABLE_RADIX_H
+
+#ifndef __ASSEMBLY__
+#include <asm/cmpxchg.h>
+#endif
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/book3s/64/radix-64k.h>
+#else
+#include <asm/book3s/64/radix-4k.h>
+#endif
+
+/* An empty PTE can still have a R or C writeback */
+#define RADIX_PTE_NONE_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+
+/* Bits to set in a RPMD/RPUD/RPGD */
+#define RADIX_PMD_VAL_BITS (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE)
+#define RADIX_PUD_VAL_BITS (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE)
+#define RADIX_PGD_VAL_BITS (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE)
+
+/* Don't have anything in the reserved bits and leaf bits */
+#define RADIX_PMD_BAD_BITS 0x60000000000000e0UL
+#define RADIX_PUD_BAD_BITS 0x60000000000000e0UL
+#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL
+
+/*
+ * Size of EA range mapped by our pagetables.
+ */
+#define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE + \
+ RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT)
+#define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE)
+
+/*
+ * We support 52 bit address space, Use top bit for kernel
+ * virtual mapping. Also make sure kernel fit in the top
+ * quadrant.
+ *
+ * +------------------+
+ * +------------------+ Kernel virtual map (0xc008000000000000)
+ * | |
+ * | |
+ * | |
+ * 0b11......+------------------+ Kernel linear map (0xc....)
+ * | |
+ * | 2 quadrant |
+ * | |
+ * 0b10......+------------------+
+ * | |
+ * | 1 quadrant |
+ * | |
+ * 0b01......+------------------+
+ * | |
+ * | 0 quadrant |
+ * | |
+ * 0b00......+------------------+
+ *
+ *
+ * 3rd quadrant expanded:
+ * +------------------------------+
+ * | |
+ * | |
+ * | |
+ * +------------------------------+ Kernel IO map end (0xc010000000000000)
+ * | |
+ * | |
+ * | 1/2 of virtual map |
+ * | |
+ * | |
+ * +------------------------------+ Kernel IO map start
+ * | |
+ * | 1/4 of virtual map |
+ * | |
+ * +------------------------------+ Kernel vmemap start
+ * | |
+ * | 1/4 of virtual map |
+ * | |
+ * +------------------------------+ Kernel virt start (0xc008000000000000)
+ * | |
+ * | |
+ * | |
+ * +------------------------------+ Kernel linear (0xc.....)
+ */
+
+#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000)
+#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000)
+
+/*
+ * The vmalloc space starts at the beginning of that region, and
+ * occupies a quarter of it on radix config.
+ * (we keep a quarter for the virtual memmap)
+ */
+#define RADIX_VMALLOC_START RADIX_KERN_VIRT_START
+#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2)
+#define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE)
+/*
+ * Defines the address of the vmemap area, in its own region on
+ * hash table CPUs.
+ */
+#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END)
+
+#ifndef __ASSEMBLY__
+#define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE)
+#define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE)
+#define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
+#define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
+
+static inline unsigned long radix__pte_update(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long clr,
+ unsigned long set,
+ int huge)
+{
+ pte_t pte;
+ unsigned long old_pte, new_pte;
+
+ do {
+ pte = READ_ONCE(*ptep);
+ old_pte = pte_val(pte);
+ new_pte = (old_pte | set) & ~clr;
+
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /* We already do a sync in cmpxchg, is ptesync needed ?*/
+ asm volatile("ptesync" : : : "memory");
+ /* huge pages use the old page table lock */
+ if (!huge)
+ assert_pte_locked(mm, addr);
+
+ return old_pte;
+}
+
+/*
+ * Set the dirty and/or accessed bits atomically in a linux PTE, this
+ * function doesn't need to invalidate tlb.
+ */
+static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+ pte_t pte;
+ unsigned long old_pte, new_pte;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+ do {
+ pte = READ_ONCE(*ptep);
+ old_pte = pte_val(pte);
+ new_pte = old_pte | set;
+
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /* We already do a sync in cmpxchg, is ptesync needed ?*/
+ asm volatile("ptesync" : : : "memory");
+}
+
+static inline int radix__pte_same(pte_t pte_a, pte_t pte_b)
+{
+ return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0);
+}
+
+static inline int radix__pte_none(pte_t pte)
+{
+ return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0;
+}
+
+static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, int percpu)
+{
+ *ptep = pte;
+ asm volatile("ptesync" : : : "memory");
+}
+
+static inline int radix__pmd_bad(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS);
+}
+
+static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0);
+}
+
+static inline int radix__pud_bad(pud_t pud)
+{
+ return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
+}
+
+
+static inline int radix__pgd_bad(pgd_t pgd)
+{
+ return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+static inline int radix__pmd_trans_huge(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) | _PAGE_PTE);
+}
+static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ /* Nothing to do for radix. */
+ return;
+}
+
+extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set);
+extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
+extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
+extern int radix__has_transparent_hugepage(void);
+#endif
+
+extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys);
+extern void radix__vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size);
+
+extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags, unsigned int psz);
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 1b753f9..f12ddf5 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -1,8 +1,6 @@
#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
-#define MMU_NO_CONTEXT 0
-
/*
* TLB flushing for 64-bit hash-MMU CPUs
*/
@@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
static inline void arch_enter_lazy_mmu_mode(void)
{
- struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch;
+ if (radix_enabled())
+ return;
+ batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch;
+
+ if (radix_enabled())
+ return;
+ batch = this_cpu_ptr(&ppc64_tlb_batch);
if (batch->index)
__flush_tlb_pending(batch);
@@ -52,40 +57,42 @@ extern void flush_hash_range(unsigned long number, int local);
extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
pmd_t *pmdp, unsigned int psize, int ssize,
unsigned long flags);
-
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
+static inline void hash__local_flush_tlb_mm(struct mm_struct *mm)
{
}
-static inline void flush_tlb_mm(struct mm_struct *mm)
+static inline void hash__flush_tlb_mm(struct mm_struct *mm)
{
}
-static inline void local_flush_tlb_page(struct vm_area_struct *vma,
- unsigned long vmaddr)
+static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
{
}
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long vmaddr)
+static inline void hash__flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
{
}
-static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
- unsigned long vmaddr)
+static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma,
+ unsigned long vmaddr)
{
}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static inline void hash__flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
{
}
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
+static inline void hash__flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
{
}
+
+struct mmu_gather;
+extern void hash__tlb_flush(struct mmu_gather *tlb);
/* Private function for use by PCI IO mapping code */
extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
unsigned long end);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
new file mode 100644
index 0000000..13ef388
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
+#define _ASM_POWERPC_TLBFLUSH_RADIX_H
+
+struct vm_area_struct;
+struct mm_struct;
+struct mmu_gather;
+
+static inline int mmu_get_ap(int psize)
+{
+ return mmu_psize_defs[psize].ap;
+}
+
+extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end);
+extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
+extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ unsigned long ap, int nid);
+extern void radix__tlb_flush(struct mmu_gather *tlb);
+#ifdef CONFIG_SMP
+extern void radix__flush_tlb_mm(struct mm_struct *mm);
+extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ unsigned long ap, int nid);
+#else
+#define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm)
+#define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr)
+#define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i)
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
new file mode 100644
index 0000000..d98424a
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -0,0 +1,76 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
+#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
+
+#define MMU_NO_CONTEXT ~0UL
+
+
+#include <asm/book3s/64/tlbflush-hash.h>
+#include <asm/book3s/64/tlbflush-radix.h>
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ return radix__flush_tlb_range(vma, start, end);
+ return hash__flush_tlb_range(vma, start, end);
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+{
+ if (radix_enabled())
+ return radix__flush_tlb_kernel_range(start, end);
+ return hash__flush_tlb_kernel_range(start, end);
+}
+
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+ if (radix_enabled())
+ return radix__local_flush_tlb_mm(mm);
+ return hash__local_flush_tlb_mm(mm);
+}
+
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+ if (radix_enabled())
+ return radix__local_flush_tlb_page(vma, vmaddr);
+ return hash__local_flush_tlb_page(vma, vmaddr);
+}
+
+static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+ if (radix_enabled())
+ return radix__flush_tlb_page(vma, vmaddr);
+ return hash__flush_tlb_page_nohash(vma, vmaddr);
+}
+
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+ if (radix_enabled())
+ return radix__tlb_flush(tlb);
+ return hash__tlb_flush(tlb);
+}
+
+#ifdef CONFIG_SMP
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (radix_enabled())
+ return radix__flush_tlb_mm(mm);
+ return hash__flush_tlb_mm(mm);
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+ if (radix_enabled())
+ return radix__flush_tlb_page(vma, vmaddr);
+ return hash__flush_tlb_page(vma, vmaddr);
+}
+#else
+#define flush_tlb_mm(mm) local_flush_tlb_mm(mm)
+#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr)
+#endif /* CONFIG_SMP */
+
+#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/include/asm/book3s/pgalloc.h b/arch/powerpc/include/asm/book3s/pgalloc.h
new file mode 100644
index 0000000..54f591e
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/pgalloc.h
@@ -0,0 +1,19 @@
+#ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_PGALLOC_H
+
+#include <linux/mm.h>
+
+extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
+static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
+ unsigned long address)
+{
+
+}
+
+#ifdef CONFIG_PPC64
+#include <asm/book3s/64/pgalloc.h>
+#else
+#include <asm/book3s/32/pgalloc.h>
+#endif
+
+#endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 42814f0..e2d9f49 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -8,6 +8,8 @@
extern struct kmem_cache *hugepte_cache;
#ifdef CONFIG_PPC_BOOK3S_64
+
+#include <asm/book3s/64/hugetlb-radix.h>
/*
* This should work for other subarchs too. But right now we use the
* new format only for 64bit book3s
@@ -31,7 +33,19 @@ static inline unsigned int hugepd_shift(hugepd_t hpd)
{
return mmu_psize_to_shift(hugepd_mmu_psize(hpd));
}
+static inline void flush_hugetlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+ if (radix_enabled())
+ return radix__flush_hugetlb_page(vma, vmaddr);
+}
+static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+ if (radix_enabled())
+ return radix__local_flush_hugetlb_page(vma, vmaddr);
+}
#else
static inline pte_t *hugepd_page(hugepd_t hpd)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 7529aab..1f4497f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -276,19 +276,24 @@ static inline unsigned long hpte_make_readonly(unsigned long ptel)
return ptel;
}
-static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
+static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci)
{
- unsigned int wimg = ptel & HPTE_R_WIMG;
+ unsigned int wimg = hptel & HPTE_R_WIMG;
/* Handle SAO */
if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
cpu_has_feature(CPU_FTR_ARCH_206))
wimg = HPTE_R_M;
- if (!io_type)
+ if (!is_ci)
return wimg == HPTE_R_M;
-
- return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type;
+ /*
+ * if host is mapped cache inhibited, make sure hptel also have
+ * cache inhibited.
+ */
+ if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */
+ return false;
+ return !!(wimg & HPTE_R_I);
}
/*
@@ -305,9 +310,9 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
*/
old_pte = READ_ONCE(*ptep);
/*
- * wait until _PAGE_BUSY is clear then set it atomically
+ * wait until H_PAGE_BUSY is clear then set it atomically
*/
- if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) {
+ if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) {
cpu_relax();
continue;
}
@@ -319,27 +324,12 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
if (writing && pte_write(old_pte))
new_pte = pte_mkdirty(new_pte);
- if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep,
- pte_val(old_pte),
- pte_val(new_pte))) {
+ if (pte_xchg(ptep, old_pte, new_pte))
break;
- }
}
return new_pte;
}
-
-/* Return HPTE cache control bits corresponding to Linux pte bits */
-static inline unsigned long hpte_cache_bits(unsigned long pte_val)
-{
-#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W
- return pte_val & (HPTE_R_W | HPTE_R_I);
-#else
- return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) +
- ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0);
-#endif
-}
-
static inline bool hpte_read_permission(unsigned long pp, unsigned long key)
{
if (key)
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index fd22442..6bdcd0d 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -256,6 +256,7 @@ struct machdep_calls {
#ifdef CONFIG_ARCH_RANDOM
int (*get_random_seed)(unsigned long *v);
#endif
+ int (*update_partition_table)(u64);
};
extern void e500_idle(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 8ca1c98..e53ebeb 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -88,6 +88,11 @@
*/
#define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000)
+/*
+ * Radix page table available
+ */
+#define MMU_FTR_RADIX ASM_CONST(0x80000000)
+
/* MMU feature bit sets for various CPUs */
#define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \
MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2
@@ -110,9 +115,25 @@
DECLARE_PER_CPU(int, next_tlbcam_idx);
#endif
+enum {
+ MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx |
+ MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E |
+ MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS |
+ MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX |
+ MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU |
+ MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS |
+ MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
+ MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
+ MMU_FTR_1T_SEGMENT |
+#ifdef CONFIG_PPC_RADIX_MMU
+ MMU_FTR_RADIX |
+#endif
+ 0,
+};
+
static inline int mmu_has_feature(unsigned long feature)
{
- return (cur_cpu_spec->mmu_features & feature);
+ return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature);
}
static inline void mmu_clear_feature(unsigned long feature)
@@ -122,13 +143,6 @@ static inline void mmu_clear_feature(unsigned long feature)
extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup;
-/* MMU initialization */
-extern void early_init_mmu(void);
-extern void early_init_mmu_secondary(void);
-
-extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size);
-
#ifdef CONFIG_PPC64
/* This is our real memory area size on ppc64 server, on embedded, we
* make it match the size our of bolted TLB area
@@ -181,10 +195,20 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
#define MMU_PAGE_COUNT 15
-#if defined(CONFIG_PPC_STD_MMU_64)
-/* 64-bit classic hash table MMU */
-#include <asm/book3s/64/mmu-hash.h>
-#elif defined(CONFIG_PPC_STD_MMU_32)
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/book3s/64/mmu.h>
+#else /* CONFIG_PPC_BOOK3S_64 */
+
+#ifndef __ASSEMBLY__
+/* MMU initialization */
+extern void early_init_mmu(void);
+extern void early_init_mmu_secondary(void);
+extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size);
+#endif /* __ASSEMBLY__ */
+#endif
+
+#if defined(CONFIG_PPC_STD_MMU_32)
/* 32-bit classic hash table MMU */
#include <asm/book3s/32/mmu-hash.h>
#elif defined(CONFIG_40x)
@@ -201,6 +225,9 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
# include <asm/mmu-8xx.h>
#endif
+#ifndef radix_enabled
+#define radix_enabled() (0)
+#endif
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 4eaab40..9d2cd0c 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -33,16 +33,27 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
#endif
-
-extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
extern void set_context(unsigned long id, pgd_t *pgd);
#ifdef CONFIG_PPC_BOOK3S_64
+extern void radix__switch_mmu_context(struct mm_struct *prev,
+ struct mm_struct *next);
+static inline void switch_mmu_context(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ if (radix_enabled())
+ return radix__switch_mmu_context(prev, next);
+ return switch_slb(tsk, next);
+}
+
extern int __init_new_context(void);
extern void __destroy_context(int context_id);
static inline void mmu_context_init(void) { }
#else
+extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
extern unsigned long __init_new_context(void);
extern void __destroy_context(unsigned long context_id);
extern void mmu_context_init(void);
@@ -88,17 +99,11 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (cpu_has_feature(CPU_FTR_ALTIVEC))
asm volatile ("dssall");
#endif /* CONFIG_ALTIVEC */
-
- /* The actual HW switching method differs between the various
- * sub architectures.
+ /*
+ * The actual HW switching method differs between the various
+ * sub architectures. Out of line for now
*/
-#ifdef CONFIG_PPC_STD_MMU_64
- switch_slb(tsk, next);
-#else
- /* Out of line for now */
- switch_mmu_context(prev, next);
-#endif
-
+ switch_mmu_context(prev, next, tsk);
}
#define deactivate_mm(tsk,mm) do { } while (0)
diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 76d6b9e..76d6b9e 100644
--- a/arch/powerpc/include/asm/pgalloc-32.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index 8d5fc3a..0c12a3b 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -53,7 +53,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
#ifndef CONFIG_PPC_64K_PAGES
-#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD))
+#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
@@ -68,19 +68,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- pud_set(pud, __pgtable_ptr_val(pmd));
+ pud_set(pud, (unsigned long)pmd);
}
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
pte_t *pte)
{
- pmd_set(pmd, __pgtable_ptr_val(pte));
+ pmd_set(pmd, (unsigned long)pte);
}
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t pte_page)
{
- pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page)));
+ pmd_set(pmd, (unsigned long)page_address(pte_page));
}
#define pmd_pgtable(pmd) pmd_page(pmd)
@@ -119,119 +119,65 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
__free_page(ptepage);
}
-static inline void pgtable_free(void *table, unsigned index_size)
-{
- if (!index_size)
- free_page((unsigned long)table);
- else {
- BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(index_size), table);
- }
-}
-
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
#ifdef CONFIG_SMP
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
- void *table, int shift)
-{
- unsigned long pgf = (unsigned long)table;
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- pgf |= shift;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-static inline void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- pgtable_free(table, shift);
-}
-#else /* !CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
- void *table, int shift)
-{
- pgtable_free(table, shift);
-}
-#endif /* CONFIG_SMP */
-
+extern void __tlb_remove_table(void *_table);
+#endif
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
unsigned long address)
{
tlb_flush_pgtable(tlb, address);
- pgtable_page_dtor(table);
pgtable_free_tlb(tlb, page_address(table), 0);
}
#else /* if CONFIG_PPC_64K_PAGES */
-extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
-extern void page_table_free(struct mm_struct *, unsigned long *, int);
+extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int);
+extern void pte_fragment_free(unsigned long *, int);
extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
#ifdef CONFIG_SMP
extern void __tlb_remove_table(void *_table);
#endif
-#ifndef __PAGETABLE_PUD_FOLDED
-/* book3s 64 is 4 level page table */
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
- pgd_set(pgd, __pgtable_ptr_val(pud));
-}
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
- GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
-}
-#endif
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
- pud_set(pud, __pgtable_ptr_val(pmd));
-}
+#define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd)
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
pte_t *pte)
{
- pmd_set(pmd, __pgtable_ptr_val(pte));
+ pmd_set(pmd, (unsigned long)pte);
}
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t pte_page)
{
- pmd_set(pmd, __pgtable_ptr_val(pte_page));
+ pmd_set(pmd, (unsigned long)pte_page);
}
static inline pgtable_t pmd_pgtable(pmd_t pmd)
{
- return (pgtable_t)pmd_page_vaddr(pmd);
+ return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS);
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
unsigned long address)
{
- return (pte_t *)page_table_alloc(mm, address, 1);
+ return (pte_t *)pte_fragment_alloc(mm, address, 1);
}
static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
unsigned long address)
{
- return (pgtable_t)page_table_alloc(mm, address, 0);
+ return (pgtable_t)pte_fragment_alloc(mm, address, 0);
}
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- page_table_free(mm, (unsigned long *)pte, 1);
+ pte_fragment_fre((unsigned long *)pte, 1);
}
static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
{
- page_table_free(mm, (unsigned long *)ptepage, 0);
+ pte_fragment_free((unsigned long *)ptepage, 0);
}
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
@@ -255,11 +201,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
#define __pmd_free_tlb(tlb, pmd, addr) \
pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
-#ifndef __PAGETABLE_PUD_FOLDED
+#ifndef CONFIG_PPC_64K_PAGES
#define __pud_free_tlb(tlb, pud, addr) \
pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
-#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* CONFIG_PPC_64K_PAGES */
#define check_pgt_cache() do { } while (0)
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 10debb9..d4d808c 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -108,9 +108,6 @@
#ifndef __ASSEMBLY__
/* pte_clear moved to later in this file */
-/* Pointers in the page table tree are virtual addresses */
-#define __pgtable_ptr_val(ptr) ((unsigned long)(ptr))
-
#define PMD_BAD_BITS (PTE_TABLE_SIZE-1)
#define PUD_BAD_BITS (PMD_TABLE_SIZE-1)
@@ -362,6 +359,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
void pgtable_cache_init(void);
+extern int map_kernel_page(unsigned long ea, unsigned long pa,
+ unsigned long flags);
+extern int __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys);
+extern void vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size);
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
new file mode 100644
index 0000000..b39ec95
--- /dev/null
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_POWERPC_NOHASH_PGALLOC_H
+#define _ASM_POWERPC_NOHASH_PGALLOC_H
+
+#include <linux/mm.h>
+
+extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
+#ifdef CONFIG_PPC64
+extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address);
+#else
+/* 44x etc which is BOOKE not BOOK3E */
+static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
+ unsigned long address)
+{
+
+}
+#endif /* !CONFIG_PPC_BOOK3E */
+
+#ifdef CONFIG_PPC64
+#include <asm/nohash/64/pgalloc.h>
+#else
+#include <asm/nohash/32/pgalloc.h>
+#endif
+#endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index f8faaae..9bb8ddf 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -368,16 +368,16 @@ enum OpalLPCAddressType {
};
enum opal_msg_type {
- OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc,
+ OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc,
* additional params function-specific
*/
- OPAL_MSG_MEM_ERR,
- OPAL_MSG_EPOW,
- OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */
- OPAL_MSG_HMI_EVT,
- OPAL_MSG_DPO,
- OPAL_MSG_PRD,
- OPAL_MSG_OCC,
+ OPAL_MSG_MEM_ERR = 1,
+ OPAL_MSG_EPOW = 2,
+ OPAL_MSG_SHUTDOWN = 3, /* params[0] = 1 reboot, 0 shutdown */
+ OPAL_MSG_HMI_EVT = 4,
+ OPAL_MSG_DPO = 5,
+ OPAL_MSG_PRD = 6,
+ OPAL_MSG_OCC = 7,
OPAL_MSG_TYPE_MAX,
};
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index ab3d897..51db3a3 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -288,7 +288,11 @@ extern long long virt_phys_offset;
#ifndef __ASSEMBLY__
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/pgtable-be-types.h>
+#else
#include <asm/pgtable-types.h>
+#endif
typedef struct { signed long pd; } hugepd_t;
@@ -312,12 +316,20 @@ void arch_free_page(struct page *page, int order);
#endif
struct vm_area_struct;
-
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * For BOOK3s 64 with 4k and 64K linux page size
+ * we want to use pointers, because the page table
+ * actually store pfn
+ */
+typedef pte_t *pgtable_t;
+#else
#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64)
typedef pte_t *pgtable_t;
#else
typedef struct page *pgtable_t;
#endif
+#endif
#include <asm-generic/memory_model.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index d908a46..dd5f071 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -93,7 +93,7 @@ extern u64 ppc64_pft_size;
#define SLICE_LOW_TOP (0x100000000ul)
#define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT)
-#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
+#define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT)
#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
@@ -128,8 +128,6 @@ extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize);
extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize);
-#define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT)
-
#endif /* __ASSEMBLY__ */
#else
#define slice_init()
@@ -151,7 +149,6 @@ do { \
#define slice_set_range_psize(mm, start, len, psize) \
slice_set_user_psize((mm), (psize))
-#define slice_mm_new_context(mm) 1
#endif /* CONFIG_PPC_MM_SLICES */
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index f5056e3..467c0b0 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -17,33 +17,34 @@ struct device_node;
* PCI controller operations
*/
struct pci_controller_ops {
- void (*dma_dev_setup)(struct pci_dev *dev);
+ void (*dma_dev_setup)(struct pci_dev *pdev);
void (*dma_bus_setup)(struct pci_bus *bus);
- int (*probe_mode)(struct pci_bus *);
+ int (*probe_mode)(struct pci_bus *bus);
/* Called when pci_enable_device() is called. Returns true to
* allow assignment/enabling of the device. */
- bool (*enable_device_hook)(struct pci_dev *);
+ bool (*enable_device_hook)(struct pci_dev *pdev);
- void (*disable_device)(struct pci_dev *);
+ void (*disable_device)(struct pci_dev *pdev);
- void (*release_device)(struct pci_dev *);
+ void (*release_device)(struct pci_dev *pdev);
/* Called during PCI resource reassignment */
- resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
- void (*reset_secondary_bus)(struct pci_dev *dev);
+ resource_size_t (*window_alignment)(struct pci_bus *bus,
+ unsigned long type);
+ void (*reset_secondary_bus)(struct pci_dev *pdev);
#ifdef CONFIG_PCI_MSI
- int (*setup_msi_irqs)(struct pci_dev *dev,
+ int (*setup_msi_irqs)(struct pci_dev *pdev,
int nvec, int type);
- void (*teardown_msi_irqs)(struct pci_dev *dev);
+ void (*teardown_msi_irqs)(struct pci_dev *pdev);
#endif
- int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask);
- u64 (*dma_get_required_mask)(struct pci_dev *dev);
+ int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
+ u64 (*dma_get_required_mask)(struct pci_dev *pdev);
- void (*shutdown)(struct pci_controller *);
+ void (*shutdown)(struct pci_controller *hose);
};
/*
@@ -208,14 +209,14 @@ struct pci_dn {
#ifdef CONFIG_EEH
struct eeh_dev *edev; /* eeh device */
#endif
-#define IODA_INVALID_PE (-1)
+#define IODA_INVALID_PE 0xFFFFFFFF
#ifdef CONFIG_PPC_POWERNV
- int pe_number;
+ unsigned int pe_number;
int vf_index; /* VF index in the PF */
#ifdef CONFIG_PCI_IOV
u16 vfs_expanded; /* number of VFs IOV BAR expanded */
u16 num_vfs; /* number of VFs enabled*/
- int *pe_num_map; /* PE# for the first VF PE or array */
+ unsigned int *pe_num_map; /* PE# for the first VF PE or array */
bool m64_single_mode; /* Use M64 BAR in Single Mode */
#define IODA_INVALID_M64 (-1)
int (*m64_map)[PCI_SRIOV_NUM_BARS];
@@ -234,7 +235,9 @@ extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
extern void remove_dev_pci_data(struct pci_dev *pdev);
-extern void *update_dn_pci_info(struct device_node *dn, void *data);
+extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
+ struct device_node *dn);
+extern void pci_remove_device_node_info(struct device_node *dn);
static inline int pci_device_from_OF_node(struct device_node *np,
u8 *bus, u8 *devfn)
@@ -256,13 +259,13 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn)
#endif
/** Find the bus corresponding to the indicated device node */
-extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn);
+extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn);
/** Remove all of the PCI devices under this bus */
-extern void pcibios_remove_pci_devices(struct pci_bus *bus);
+extern void pci_hp_remove_devices(struct pci_bus *bus);
/** Discover new pci devices under this bus, and add them */
-extern void pcibios_add_pci_devices(struct pci_bus *bus);
+extern void pci_hp_add_devices(struct pci_bus *bus);
extern void isa_bridge_find_early(struct pci_controller *hose);
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index fc3ee06..0413457 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -1,25 +1,12 @@
#ifndef _ASM_POWERPC_PGALLOC_H
#define _ASM_POWERPC_PGALLOC_H
-#ifdef __KERNEL__
#include <linux/mm.h>
-#ifdef CONFIG_PPC_BOOK3E
-extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address);
-#else /* CONFIG_PPC_BOOK3E */
-static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
- unsigned long address)
-{
-}
-#endif /* !CONFIG_PPC_BOOK3E */
-
-extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
-
-#ifdef CONFIG_PPC64
-#include <asm/pgalloc-64.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/book3s/pgalloc.h>
#else
-#include <asm/pgalloc-32.h>
+#include <asm/nohash/pgalloc.h>
#endif
-#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
new file mode 100644
index 0000000..e2bf2086
--- /dev/null
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -0,0 +1,92 @@
+#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H
+#define _ASM_POWERPC_PGTABLE_BE_TYPES_H
+
+#include <asm/cmpxchg.h>
+
+/* PTE level */
+typedef struct { __be64 pte; } pte_t;
+#define __pte(x) ((pte_t) { cpu_to_be64(x) })
+static inline unsigned long pte_val(pte_t x)
+{
+ return be64_to_cpu(x.pte);
+}
+
+static inline __be64 pte_raw(pte_t x)
+{
+ return x.pte;
+}
+
+/* PMD level */
+#ifdef CONFIG_PPC64
+typedef struct { __be64 pmd; } pmd_t;
+#define __pmd(x) ((pmd_t) { cpu_to_be64(x) })
+static inline unsigned long pmd_val(pmd_t x)
+{
+ return be64_to_cpu(x.pmd);
+}
+
+static inline __be64 pmd_raw(pmd_t x)
+{
+ return x.pmd;
+}
+
+/*
+ * 64 bit hash always use 4 level table. Everybody else use 4 level
+ * only for 4K page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
+typedef struct { __be64 pud; } pud_t;
+#define __pud(x) ((pud_t) { cpu_to_be64(x) })
+static inline unsigned long pud_val(pud_t x)
+{
+ return be64_to_cpu(x.pud);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC64 */
+
+/* PGD level */
+typedef struct { __be64 pgd; } pgd_t;
+#define __pgd(x) ((pgd_t) { cpu_to_be64(x) })
+static inline unsigned long pgd_val(pgd_t x)
+{
+ return be64_to_cpu(x.pgd);
+}
+
+/* Page protection bits */
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) })
+
+/*
+ * With hash config 64k pages additionally define a bigger "real PTE" type that
+ * gathers the "second half" part of the PTE for pseudo 64k pages
+ */
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
+typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
+#else
+typedef struct { pte_t pte; } real_pte_t;
+#endif
+
+static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
+{
+ unsigned long *p = (unsigned long *)ptep;
+ __be64 prev;
+
+ prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old),
+ (__force unsigned long)pte_raw(new));
+
+ return pte_raw(old) == prev;
+}
+
+static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new)
+{
+ unsigned long *p = (unsigned long *)pmdp;
+ __be64 prev;
+
+ prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old),
+ (__force unsigned long)pmd_raw(new));
+
+ return pmd_raw(old) == prev;
+}
+
+#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 43140f8..e7f4f3e 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -1,9 +1,6 @@
#ifndef _ASM_POWERPC_PGTABLE_TYPES_H
#define _ASM_POWERPC_PGTABLE_TYPES_H
-#ifdef CONFIG_STRICT_MM_TYPECHECKS
-/* These are used to make use of C type-checking. */
-
/* PTE level */
typedef struct { pte_basic_t pte; } pte_t;
#define __pte(x) ((pte_t) { (x) })
@@ -48,49 +45,6 @@ typedef struct { unsigned long pgprot; } pgprot_t;
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) })
-#else
-
-/*
- * .. while these make it easier on the compiler
- */
-
-typedef pte_basic_t pte_t;
-#define __pte(x) (x)
-static inline pte_basic_t pte_val(pte_t pte)
-{
- return pte;
-}
-
-#ifdef CONFIG_PPC64
-typedef unsigned long pmd_t;
-#define __pmd(x) (x)
-static inline unsigned long pmd_val(pmd_t pmd)
-{
- return pmd;
-}
-
-#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
-typedef unsigned long pud_t;
-#define __pud(x) (x)
-static inline unsigned long pud_val(pud_t pud)
-{
- return pud;
-}
-#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
-#endif /* CONFIG_PPC64 */
-
-typedef unsigned long pgd_t;
-#define __pgd(x) (x)
-static inline unsigned long pgd_val(pgd_t pgd)
-{
- return pgd;
-}
-
-typedef unsigned long pgprot_t;
-#define pgprot_val(x) (x)
-#define __pgprot(x) (x)
-
-#endif /* CONFIG_STRICT_MM_TYPECHECKS */
/*
* With hash config 64k pages additionally define a bigger "real PTE" type that
* gathers the "second half" part of the PTE for pseudo 64k pages
@@ -100,4 +54,16 @@ typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
#else
typedef struct { pte_t pte; } real_pte_t;
#endif
+
+#ifdef CONFIG_PPC_STD_MMU_64
+#include <asm/cmpxchg.h>
+
+static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
+{
+ unsigned long *p = (unsigned long *)ptep;
+
+ return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new));
+}
+#endif
+
#endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 7ab04fc..1d035c1 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -131,6 +131,7 @@
/* sorted alphabetically */
#define PPC_INST_BHRBE 0x7c00025c
#define PPC_INST_CLRBHRB 0x7c00035c
+#define PPC_INST_CP_ABORT 0x7c00068c
#define PPC_INST_DCBA 0x7c0005ec
#define PPC_INST_DCBA_MASK 0xfc0007fe
#define PPC_INST_DCBAL 0x7c2005ec
@@ -285,6 +286,7 @@
#endif
/* Deal with instructions that older assemblers aren't aware of */
+#define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT)
#define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \
__PPC_RA(a) | __PPC_RB(b))
#define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index ca0c5bf..8753e4e 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -33,9 +33,9 @@ extern struct pci_dev *isa_bridge_pcidev; /* may be NULL if no ISA bus */
struct device_node;
struct pci_dn;
-typedef void *(*traverse_func)(struct device_node *me, void *data);
-void *traverse_pci_devices(struct device_node *start, traverse_func pre,
- void *data);
+void *pci_traverse_device_nodes(struct device_node *start,
+ void *(*fn)(struct device_node *, void *),
+ void *data);
void *traverse_pci_dn(struct pci_dn *root,
void *(*fn)(struct pci_dn *, void *),
void *data);
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 499d9f8..2b31632 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -427,7 +427,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
li r4,1024; \
mtctr r4; \
lis r4,KERNELBASE@h; \
+ .machine push; \
+ .machine "power4"; \
0: tlbie r4; \
+ .machine pop; \
addi r4,r4,0x1000; \
bdnz 0b
#endif
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 1ec67b0..2eeaf80 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -76,6 +76,16 @@
*/
#ifndef __ASSEMBLY__
extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
+
+/*
+ * Don't just check for any non zero bits in __PAGE_USER, since for book3e
+ * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
+ * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too.
+ */
+static inline bool pte_user(pte_t pte)
+{
+ return (pte_val(pte) & _PAGE_USER) == _PAGE_USER;
+}
#endif /* __ASSEMBLY__ */
/* Location of the PFN in the PTE. Most 32-bit platforms use the same
@@ -184,13 +194,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
/* Make modules code happy. We don't set RO yet */
#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
-/*
- * Don't just check for any non zero bits in __PAGE_USER, since for book3e
- * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
- * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too.
- */
-#define pte_user(val) ((val & _PAGE_USER) == _PAGE_USER)
-
/* Advertise special mapping type for AGP */
#define PAGE_AGP (PAGE_KERNEL_NC)
#define HAVE_PAGE_AGP
@@ -198,3 +201,12 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
/* Advertise support for _PAGE_SPECIAL */
#define __HAVE_ARCH_PTE_SPECIAL
+#ifndef _PAGE_READ
+/* if not defined, we should not find _PAGE_WRITE too */
+#define _PAGE_READ 0
+#define _PAGE_WRITE _PAGE_RW
+#endif
+
+#ifndef H_PAGE_4K_PFN
+#define H_PAGE_4K_PFN 0
+#endif
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f5f4c66..c1e82e9 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -347,6 +347,7 @@
#define LPCR_LPES_SH 2
#define LPCR_RMI 0x00000002 /* real mode is cache inhibit */
#define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */
+#define LPCR_UPRT 0x00400000 /* Use Process Table (ISA 3) */
#ifndef SPRN_LPID
#define SPRN_LPID 0x13F /* Logical Partition Identifier */
#endif
@@ -587,6 +588,7 @@
#define SPRN_PIR 0x3FF /* Processor Identification Register */
#endif
#define SPRN_TIR 0x1BE /* Thread Identification Register */
+#define SPRN_PTCR 0x1D0 /* Partition table control Register */
#define SPRN_PSPB 0x09F /* Problem State Priority Boost reg */
#define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */
#define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */
@@ -1182,6 +1184,7 @@
#define PVR_970GX 0x0045
#define PVR_POWER7p 0x004A
#define PVR_POWER8E 0x004B
+#define PVR_POWER8NVL 0x004C
#define PVR_POWER8 0x004D
#define PVR_BE 0x0070
#define PVR_PA6T 0x0090
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 9f77f85..1b38eea 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
#elif defined(CONFIG_PPC_STD_MMU_32)
+#define MMU_NO_CONTEXT (0)
/*
* TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
*/
@@ -78,7 +79,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
}
#elif defined(CONFIG_PPC_STD_MMU_64)
-#include <asm/book3s/64/tlbflush-hash.h>
+#include <asm/book3s/64/tlbflush.h>
#else
#error Unsupported MMU type
#endif
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
new file mode 100644
index 0000000..6a93209
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -0,0 +1,50 @@
+#ifndef _UAPI_ASM_POWERPC_PERF_REGS_H
+#define _UAPI_ASM_POWERPC_PERF_REGS_H
+
+enum perf_event_powerpc_regs {
+ PERF_REG_POWERPC_R0,
+ PERF_REG_POWERPC_R1,
+ PERF_REG_POWERPC_R2,
+ PERF_REG_POWERPC_R3,
+ PERF_REG_POWERPC_R4,
+ PERF_REG_POWERPC_R5,
+ PERF_REG_POWERPC_R6,
+ PERF_REG_POWERPC_R7,
+ PERF_REG_POWERPC_R8,
+ PERF_REG_POWERPC_R9,
+ PERF_REG_POWERPC_R10,
+ PERF_REG_POWERPC_R11,
+ PERF_REG_POWERPC_R12,
+ PERF_REG_POWERPC_R13,
+ PERF_REG_POWERPC_R14,
+ PERF_REG_POWERPC_R15,
+ PERF_REG_POWERPC_R16,
+ PERF_REG_POWERPC_R17,
+ PERF_REG_POWERPC_R18,
+ PERF_REG_POWERPC_R19,
+ PERF_REG_POWERPC_R20,
+ PERF_REG_POWERPC_R21,
+ PERF_REG_POWERPC_R22,
+ PERF_REG_POWERPC_R23,
+ PERF_REG_POWERPC_R24,
+ PERF_REG_POWERPC_R25,
+ PERF_REG_POWERPC_R26,
+ PERF_REG_POWERPC_R27,
+ PERF_REG_POWERPC_R28,
+ PERF_REG_POWERPC_R29,
+ PERF_REG_POWERPC_R30,
+ PERF_REG_POWERPC_R31,
+ PERF_REG_POWERPC_NIP,
+ PERF_REG_POWERPC_MSR,
+ PERF_REG_POWERPC_ORIG_R3,
+ PERF_REG_POWERPC_CTR,
+ PERF_REG_POWERPC_LINK,
+ PERF_REG_POWERPC_XER,
+ PERF_REG_POWERPC_CCR,
+ PERF_REG_POWERPC_SOFTE,
+ PERF_REG_POWERPC_TRAP,
+ PERF_REG_POWERPC_DAR,
+ PERF_REG_POWERPC_DSISR,
+ PERF_REG_POWERPC_MAX,
+};
+#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c9370d4..9ea0955 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,11 @@ int main(void)
DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
#endif
+#ifdef MAX_PGD_TABLE_SIZE
+ DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE);
+#else
DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
+#endif
DEFINE(PTE_SIZE, sizeof(pte_t));
#ifdef CONFIG_KVM
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index 41c011c..8275858 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -162,7 +162,7 @@ void btext_map(void)
offset = ((unsigned long) dispDeviceBase) - base;
size = dispDeviceRowBytes * dispDeviceRect[3] + offset
+ dispDeviceRect[0];
- vbase = __ioremap(base, size, _PAGE_NO_CACHE);
+ vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
if (vbase == 0)
return;
logicalDisplayBase = vbase + offset;
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 6c662b8..eeeacf6 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -63,7 +63,6 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec);
-extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_pa6t(void);
extern void __restore_cpu_ppc970(void);
extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
@@ -72,7 +71,6 @@ extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_power8(void);
extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_power9(void);
-extern void __restore_cpu_a2(void);
extern void __flush_tlb_power7(unsigned int action);
extern void __flush_tlb_power8(unsigned int action);
extern void __flush_tlb_power9(unsigned int action);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 6544017..c9bc78e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -48,7 +48,7 @@
/** Overview:
- * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * EEH, or "Enhanced Error Handling" is a PCI bridge technology for
* dealing with PCI bus errors that can't be dealt with within the
* usual PCI framework, except by check-stopping the CPU. Systems
* that are designed for high-availability/reliability cannot afford
@@ -1068,7 +1068,7 @@ void eeh_add_device_early(struct pci_dn *pdn)
struct pci_controller *phb;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
- if (!edev || !eeh_enabled())
+ if (!edev)
return;
if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
@@ -1336,14 +1336,11 @@ static int eeh_pe_change_owner(struct eeh_pe *pe)
id->subdevice != pdev->subsystem_device)
continue;
- goto reset;
+ return eeh_pe_reset_and_recover(pe);
}
}
return eeh_unfreeze_pe(pe, true);
-
-reset:
- return eeh_pe_reset_and_recover(pe);
}
/**
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index fb6207d..2714a3b 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -171,6 +171,16 @@ static void *eeh_dev_save_state(void *data, void *userdata)
if (!edev)
return NULL;
+ /*
+ * We cannot access the config space on some adapters.
+ * Otherwise, it will cause fenced PHB. We don't save
+ * the content in their config space and will restore
+ * from the initial config space saved when the EEH
+ * device is created.
+ */
+ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
+ return NULL;
+
pdev = eeh_dev_to_pci_dev(edev);
if (!pdev)
return NULL;
@@ -312,6 +322,19 @@ static void *eeh_dev_restore_state(void *data, void *userdata)
if (!edev)
return NULL;
+ /*
+ * The content in the config space isn't saved because
+ * the blocked config space on some adapters. We have
+ * to restore the initial saved config space when the
+ * EEH device is created.
+ */
+ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
+ if (list_is_last(&edev->list, &edev->pe->edevs))
+ eeh_pe_restore_bars(edev->pe);
+
+ return NULL;
+ }
+
pdev = eeh_dev_to_pci_dev(edev);
if (!pdev)
return NULL;
@@ -552,7 +575,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
int eeh_pe_reset_and_recover(struct eeh_pe *pe)
{
- int result, ret;
+ int ret;
/* Bail if the PE is being recovered */
if (pe->state & EEH_PE_RECOVERING)
@@ -564,9 +587,6 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
/* Save states */
eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
- /* Report error */
- eeh_pe_dev_traverse(pe, eeh_report_error, &result);
-
/* Issue reset */
ret = eeh_reset_pe(pe);
if (ret) {
@@ -581,15 +601,9 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
return ret;
}
- /* Notify completion of reset */
- eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
-
/* Restore device state */
eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
- /* Resume */
- eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
-
/* Clear recovery mode */
eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
@@ -621,7 +635,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
* We don't remove the corresponding PE instances because
* we need the information afterwords. The attached EEH
* devices are expected to be attached soon when calling
- * into pcibios_add_pci_devices().
+ * into pci_hp_add_devices().
*/
eeh_pe_state_mark(pe, EEH_PE_KEEP);
if (bus) {
@@ -630,7 +644,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
} else {
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
pci_lock_rescan_remove();
- pcibios_remove_pci_devices(bus);
+ pci_hp_remove_devices(bus);
pci_unlock_rescan_remove();
}
} else if (frozen_bus) {
@@ -681,7 +695,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
if (pe->type & EEH_PE_VF)
eeh_add_virt_device(edev, NULL);
else
- pcibios_add_pci_devices(bus);
+ pci_hp_add_devices(bus);
} else if (frozen_bus && rmv_data->removed) {
pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
ssleep(5);
@@ -691,7 +705,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
if (pe->type & EEH_PE_VF)
eeh_add_virt_device(edev, NULL);
else
- pcibios_add_pci_devices(frozen_bus);
+ pci_hp_add_devices(frozen_bus);
}
eeh_pe_state_clear(pe, EEH_PE_KEEP);
@@ -896,7 +910,7 @@ perm_error:
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
pci_lock_rescan_remove();
- pcibios_remove_pci_devices(frozen_bus);
+ pci_hp_remove_devices(frozen_bus);
pci_unlock_rescan_remove();
}
}
@@ -981,7 +995,7 @@ static void eeh_handle_special_event(void)
bus = eeh_pe_bus_get(phb_pe);
eeh_pe_dev_traverse(pe,
eeh_report_failure, NULL);
- pcibios_remove_pci_devices(bus);
+ pci_hp_remove_devices(bus);
}
pci_unlock_rescan_remove();
}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 4eefb6e..82e7327 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -36,7 +36,7 @@
static DEFINE_SPINLOCK(eeh_eventlist_lock);
static struct semaphore eeh_eventlist_sem;
-LIST_HEAD(eeh_eventlist);
+static LIST_HEAD(eeh_eventlist);
/**
* eeh_event_handler - Dispatch EEH events.
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index eea48d8..f0520da 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -249,7 +249,7 @@ static void *__eeh_pe_get(void *data, void *flag)
} else {
if (edev->pe_config_addr &&
(edev->pe_config_addr == pe->addr))
- return pe;
+ return pe;
}
/* Try BDF address */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 39a79c8..73e461a 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -37,6 +37,7 @@
#include <asm/hw_irq.h>
#include <asm/context_tracking.h>
#include <asm/tm.h>
+#include <asm/ppc-opcode.h>
/*
* System calls.
@@ -509,6 +510,14 @@ BEGIN_FTR_SECTION
ldarx r6,0,r1
END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
+BEGIN_FTR_SECTION
+/*
+ * A cp_abort (copy paste abort) here ensures that when context switching, a
+ * copy from one process can't leak into the paste of another.
+ */
+ PPC_CP_ABORT
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
#ifdef CONFIG_PPC_BOOK3S
/* Cancel all explict user streams as they will have no use after context
* switch and will stop the HW from creating streams itself
@@ -520,7 +529,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
std r6,PACACURRENT(r13) /* Set new 'current' */
ld r8,KSP(r4) /* new stack pointer */
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_PPC_STD_MMU_64
+BEGIN_MMU_FTR_SECTION
+ b 2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
BEGIN_FTR_SECTION
clrrdi r6,r8,28 /* get its ESID */
clrrdi r9,r1,28 /* get current sp ESID */
@@ -566,7 +578,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
slbmte r7,r0
isync
2:
-#endif /* !CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_PPC_STD_MMU_64 */
CURRENT_THREAD_INFO(r7, r8) /* base of new stack */
/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..4c94406 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -189,7 +189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#endif /* CONFIG_PPC_P7_NAP */
EXCEPTION_PROLOG_0(PACA_EXMC)
BEGIN_FTR_SECTION
- b machine_check_pSeries_early
+ b machine_check_powernv_early
FTR_SECTION_ELSE
b machine_check_pSeries_0
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
@@ -209,11 +209,6 @@ data_access_slb_pSeries:
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
std r3,PACA_EXSLB+EX_R3(r13)
mfspr r3,SPRN_DAR
-#ifdef __DISABLED__
- /* Keep that around for when we re-implement dynamic VSIDs */
- cmpdi r3,0
- bge slb_miss_user_pseries
-#endif /* __DISABLED__ */
mfspr r12,SPRN_SRR1
#ifndef CONFIG_RELOCATABLE
b slb_miss_realmode
@@ -240,11 +235,6 @@ instruction_access_slb_pSeries:
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
std r3,PACA_EXSLB+EX_R3(r13)
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
-#ifdef __DISABLED__
- /* Keep that around for when we re-implement dynamic VSIDs */
- cmpdi r3,0
- bge slb_miss_user_pseries
-#endif /* __DISABLED__ */
mfspr r12,SPRN_SRR1
#ifndef CONFIG_RELOCATABLE
b slb_miss_realmode
@@ -443,7 +433,7 @@ denorm_exception_hv:
.align 7
/* moved from 0x200 */
-machine_check_pSeries_early:
+machine_check_powernv_early:
BEGIN_FTR_SECTION
EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
/*
@@ -709,34 +699,6 @@ system_reset_fwnmi:
#endif /* CONFIG_PPC_PSERIES */
-#ifdef __DISABLED__
-/*
- * This is used for when the SLB miss handler has to go virtual,
- * which doesn't happen for now anymore but will once we re-implement
- * dynamic VSIDs for shared page tables
- */
-slb_miss_user_pseries:
- std r10,PACA_EXGEN+EX_R10(r13)
- std r11,PACA_EXGEN+EX_R11(r13)
- std r12,PACA_EXGEN+EX_R12(r13)
- GET_SCRATCH0(r10)
- ld r11,PACA_EXSLB+EX_R9(r13)
- ld r12,PACA_EXSLB+EX_R3(r13)
- std r10,PACA_EXGEN+EX_R13(r13)
- std r11,PACA_EXGEN+EX_R9(r13)
- std r12,PACA_EXGEN+EX_R3(r13)
- clrrdi r12,r13,32
- mfmsr r10
- mfspr r11,SRR0 /* save SRR0 */
- ori r12,r12,slb_miss_user_common@l /* virt addr of handler */
- ori r10,r10,MSR_IR|MSR_DR|MSR_RI
- mtspr SRR0,r12
- mfspr r12,SRR1 /* and SRR1 */
- mtspr SRR1,r10
- rfid
- b . /* prevent spec. execution */
-#endif /* __DISABLED__ */
-
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
kvmppc_skip_interrupt:
/*
@@ -764,11 +726,10 @@ kvmppc_skip_Hinterrupt:
#endif
/*
- * Code from here down to __end_handlers is invoked from the
- * exception prologs above. Because the prologs assemble the
- * addresses of these handlers using the LOAD_HANDLER macro,
- * which uses an ori instruction, these handlers must be in
- * the first 64k of the kernel image.
+ * Ensure that any handlers that get invoked from the exception prologs
+ * above are below the first 64KB (0x10000) of the kernel image because
+ * the prologs assemble the addresses of these handlers using the
+ * LOAD_HANDLER macro, which uses an ori instruction.
*/
/*** Common interrupt handlers ***/
@@ -953,11 +914,6 @@ hv_facility_unavailable_relon_trampoline:
#endif
STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
- /* Other future vectors */
- .align 7
- .globl __end_interrupts
-__end_interrupts:
-
.align 7
system_call_entry:
b system_call_common
@@ -983,7 +939,13 @@ data_access_common:
ld r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
li r5,0x300
+ std r3,_DAR(r1)
+ std r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
b do_hash_page /* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+ b handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
.align 7
.globl h_data_storage_common
@@ -1008,73 +970,15 @@ instruction_access_common:
ld r3,_NIP(r1)
andis. r4,r12,0x5820
li r5,0x400
+ std r3,_DAR(r1)
+ std r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
b do_hash_page /* Try to handle as hpte fault */
-
- STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
-
-/*
- * Here is the common SLB miss user that is used when going to virtual
- * mode for SLB misses, that is currently not used
- */
-#ifdef __DISABLED__
- .align 7
- .globl slb_miss_user_common
-slb_miss_user_common:
- mflr r10
- std r3,PACA_EXGEN+EX_DAR(r13)
- stw r9,PACA_EXGEN+EX_CCR(r13)
- std r10,PACA_EXGEN+EX_LR(r13)
- std r11,PACA_EXGEN+EX_SRR0(r13)
- bl slb_allocate_user
-
- ld r10,PACA_EXGEN+EX_LR(r13)
- ld r3,PACA_EXGEN+EX_R3(r13)
- lwz r9,PACA_EXGEN+EX_CCR(r13)
- ld r11,PACA_EXGEN+EX_SRR0(r13)
- mtlr r10
- beq- slb_miss_fault
-
- andi. r10,r12,MSR_RI /* check for unrecoverable exception */
- beq- unrecov_user_slb
- mfmsr r10
-
-.machine push
-.machine "power4"
- mtcrf 0x80,r9
-.machine pop
-
- clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */
- mtmsrd r10,1
-
- mtspr SRR0,r11
- mtspr SRR1,r12
-
- ld r9,PACA_EXGEN+EX_R9(r13)
- ld r10,PACA_EXGEN+EX_R10(r13)
- ld r11,PACA_EXGEN+EX_R11(r13)
- ld r12,PACA_EXGEN+EX_R12(r13)
- ld r13,PACA_EXGEN+EX_R13(r13)
- rfid
- b .
-
-slb_miss_fault:
- EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN)
- ld r4,PACA_EXGEN+EX_DAR(r13)
- li r5,0
- std r4,_DAR(r1)
- std r5,_DSISR(r1)
+MMU_FTR_SECTION_ELSE
b handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
-unrecov_user_slb:
- EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
- RECONCILE_IRQ_STATE(r10, r11)
- bl save_nvgprs
-1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl unrecoverable_exception
- b 1b
-
-#endif /* __DISABLED__ */
-
+ STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
/*
* Machine check is different because we use a different
@@ -1230,10 +1134,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
- .align 7
- .globl __end_handlers
-__end_handlers:
-
/* Equivalents to the above handlers for relocation-on interrupt vectors */
STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
@@ -1244,6 +1144,17 @@ __end_handlers:
STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+ /*
+ * The __end_interrupts marker must be past the out-of-line (OOL)
+ * handlers, so that they are copied to real address 0x100 when running
+ * a relocatable kernel. This ensures they can be reached from the short
+ * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch
+ * directly, without using LOAD_HANDLER().
+ */
+ .align 7
+ .globl __end_interrupts
+__end_interrupts:
+
#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
* Data area reserved for FWNMI option.
@@ -1476,8 +1387,11 @@ slb_miss_realmode:
stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
std r10,PACA_EXSLB+EX_LR(r13) /* save LR */
+#ifdef CONFIG_PPC_STD_MMU_64
+BEGIN_MMU_FTR_SECTION
bl slb_allocate_realmode
-
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX)
+#endif
/* All done -- return from exception. */
ld r10,PACA_EXSLB+EX_LR(r13)
@@ -1485,7 +1399,9 @@ slb_miss_realmode:
lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
mtlr r10
-
+BEGIN_MMU_FTR_SECTION
+ b 2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
andi. r10,r12,MSR_RI /* check for unrecoverable exception */
beq- 2f
@@ -1536,9 +1452,7 @@ power4_fixup_nap:
*/
.align 7
do_hash_page:
- std r3,_DAR(r1)
- std r4,_DSISR(r1)
-
+#ifdef CONFIG_PPC_STD_MMU_64
andis. r0,r4,0xa410 /* weird error? */
bne- handle_page_fault /* if not, try to insert a HPTE */
andis. r0,r4,DSISR_DABRMATCH@h
@@ -1566,6 +1480,7 @@ do_hash_page:
/* Error */
blt- 13f
+#endif /* CONFIG_PPC_STD_MMU_64 */
/* Here we have a page fault that hash_page can't handle. */
handle_page_fault:
@@ -1592,6 +1507,7 @@ handle_dabr_fault:
12: b ret_from_except_lite
+#ifdef CONFIG_PPC_STD_MMU_64
/* We have a page fault that hash_page could handle but HV refused
* the PTE insertion
*/
@@ -1601,6 +1517,7 @@ handle_dabr_fault:
ld r4,_DAR(r1)
bl low_hash_fault
b ret_from_except
+#endif
/*
* We come here as a result of a DSI at a point where we don't want
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 9dac18da..1123a4d 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr)
return sys_call_table[nr*2];
}
#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
+
+#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2)
+char *arch_ftrace_match_adjust(char *str, const char *search)
+{
+ if (str[0] == '.' && search[0] != '.')
+ return str + 1;
+ else
+ return str;
+}
+#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 4286775..2d14774 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -973,13 +973,16 @@ start_here_common:
* This stuff goes at the beginning of the bss, which is page-aligned.
*/
.section ".bss"
+/*
+ * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K.
+ * We will need to find a better way to fix this
+ */
+ .align 16
- .align PAGE_SHIFT
+ .globl swapper_pg_dir
+swapper_pg_dir:
+ .space PGD_TABLE_SIZE
.globl empty_zero_page
empty_zero_page:
.space PAGE_SIZE
-
- .globl swapper_pg_dir
-swapper_pg_dir:
- .space PGD_TABLE_SIZE
diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c
index ac86c53..a89f4f7 100644
--- a/arch/powerpc/kernel/ibmebus.c
+++ b/arch/powerpc/kernel/ibmebus.c
@@ -408,7 +408,7 @@ static ssize_t modalias_show(struct device *dev,
return len+1;
}
-struct device_attribute ibmebus_bus_device_attrs[] = {
+static struct device_attribute ibmebus_bus_device_attrs[] = {
__ATTR_RO(devspec),
__ATTR_RO(name),
__ATTR_RO(modalias),
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 0f19970..ae13161 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -109,14 +109,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
size = 0x10000;
__ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
- size, _PAGE_NO_CACHE|_PAGE_GUARDED);
+ size, pgprot_val(pgprot_noncached(__pgprot(0))));
return;
inval_range:
printk(KERN_ERR "no ISA IO ranges or unexpected isa range, "
"mapping 64k\n");
__ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
- 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED);
+ 0x10000, pgprot_val(pgprot_noncached(__pgprot(0))));
}
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 015ae55..2694d07 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -228,17 +228,12 @@ static struct property memory_limit_prop = {
static void __init export_crashk_values(struct device_node *node)
{
- struct property *prop;
-
/* There might be existing crash kernel properties, but we can't
* be sure what's in them, so remove them. */
- prop = of_find_property(node, "linux,crashkernel-base", NULL);
- if (prop)
- of_remove_property(node, prop);
-
- prop = of_find_property(node, "linux,crashkernel-size", NULL);
- if (prop)
- of_remove_property(node, prop);
+ of_remove_property(node, of_find_property(node,
+ "linux,crashkernel-base", NULL));
+ of_remove_property(node, of_find_property(node,
+ "linux,crashkernel-size", NULL));
if (crashk_res.start != 0) {
crashk_base = cpu_to_be_ulong(crashk_res.start),
@@ -258,16 +253,13 @@ static void __init export_crashk_values(struct device_node *node)
static int __init kexec_setup(void)
{
struct device_node *node;
- struct property *prop;
node = of_find_node_by_path("/chosen");
if (!node)
return -ENOENT;
/* remove any stale properties so ours can be found */
- prop = of_find_property(node, kernel_end_prop.name, NULL);
- if (prop)
- of_remove_property(node, prop);
+ of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
/* information needed by userspace when using default_machine_kexec */
kernel_end = cpu_to_be_ulong(__pa(_end));
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 0fbd75d..b8c202d 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -76,6 +76,7 @@ int default_machine_kexec_prepare(struct kimage *image)
* end of the blocked region (begin >= high). Use the
* boolean identity !(a || b) === (!a && !b).
*/
+#ifdef CONFIG_PPC_STD_MMU_64
if (htab_address) {
low = __pa(htab_address);
high = low + htab_size_bytes;
@@ -88,6 +89,7 @@ int default_machine_kexec_prepare(struct kimage *image)
return -ETXTBSY;
}
}
+#endif /* CONFIG_PPC_STD_MMU_64 */
/* We also should not overwrite the tce tables */
for_each_node_by_type(node, "pci") {
@@ -381,7 +383,7 @@ void default_machine_kexec(struct kimage *image)
/* NOTREACHED */
}
-#ifndef CONFIG_PPC_BOOK3E
+#ifdef CONFIG_PPC_STD_MMU_64
/* Values we need to export to the second kernel via the device tree. */
static unsigned long htab_base;
static unsigned long htab_size;
@@ -401,7 +403,6 @@ static struct property htab_size_prop = {
static int __init export_htab_values(void)
{
struct device_node *node;
- struct property *prop;
/* On machines with no htab htab_address is NULL */
if (!htab_address)
@@ -412,12 +413,8 @@ static int __init export_htab_values(void)
return -ENODEV;
/* remove any stale propertys so ours can be found */
- prop = of_find_property(node, htab_base_prop.name, NULL);
- if (prop)
- of_remove_property(node, prop);
- prop = of_find_property(node, htab_size_prop.name, NULL);
- if (prop)
- of_remove_property(node, prop);
+ of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL));
+ of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL));
htab_base = cpu_to_be64(__pa(htab_address));
of_add_property(node, &htab_base_prop);
@@ -428,4 +425,4 @@ static int __init export_htab_values(void)
return 0;
}
late_initcall(export_htab_values);
-#endif /* !CONFIG_PPC_BOOK3E */
+#endif /* CONFIG_PPC_STD_MMU_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 671fd51..ef267fd 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -37,7 +37,7 @@ static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
static void machine_check_process_queued_event(struct irq_work *work);
-struct irq_work mce_event_process_work = {
+static struct irq_work mce_event_process_work = {
.func = machine_check_process_queued_event,
};
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index ee62b19..7353991 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,11 +72,15 @@ void __flush_tlb_power8(unsigned int action)
void __flush_tlb_power9(unsigned int action)
{
+ if (radix_enabled())
+ flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+
flush_tlb_206(POWER9_TLB_SETS_HASH, action);
}
/* flush SLBs and reload */
+#ifdef CONFIG_PPC_STD_MMU_64
static void flush_and_reload_slb(void)
{
struct slb_shadow *slb;
@@ -110,6 +114,7 @@ static void flush_and_reload_slb(void)
asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
}
}
+#endif
static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
{
@@ -120,6 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
* reset the error bits whenever we handle them so that at the end
* we can check whether we handled all of them or not.
* */
+#ifdef CONFIG_PPC_STD_MMU_64
if (dsisr & slb_error_bits) {
flush_and_reload_slb();
/* reset error bits */
@@ -131,6 +137,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
/* reset error bits */
dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
}
+#endif
/* Any other errors we don't understand? */
if (dsisr & 0xffffffffUL)
handled = 0;
@@ -150,6 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1)
switch (P7_SRR1_MC_IFETCH(srr1)) {
case 0:
break;
+#ifdef CONFIG_PPC_STD_MMU_64
case P7_SRR1_MC_IFETCH_SLB_PARITY:
case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
/* flush and reload SLBs for SLB errors. */
@@ -162,6 +170,7 @@ static long mce_handle_common_ierror(uint64_t srr1)
handled = 1;
}
break;
+#endif
default:
break;
}
@@ -175,10 +184,12 @@ static long mce_handle_ierror_p7(uint64_t srr1)
handled = mce_handle_common_ierror(srr1);
+#ifdef CONFIG_PPC_STD_MMU_64
if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
flush_and_reload_slb();
handled = 1;
}
+#endif
return handled;
}
@@ -321,10 +332,12 @@ static long mce_handle_ierror_p8(uint64_t srr1)
handled = mce_handle_common_ierror(srr1);
+#ifdef CONFIG_PPC_STD_MMU_64
if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
flush_and_reload_slb();
handled = 1;
}
+#endif
return handled;
}
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index bf5160f..285ca8c 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -599,12 +599,6 @@ _GLOBAL(__bswapdi2)
mr r4,r10
blr
-_GLOBAL(abs)
- srawi r4,r3,31
- xor r3,r3,r4
- sub r3,r3,r4
- blr
-
#ifdef CONFIG_SMP
_GLOBAL(start_secondary_resume)
/* Reset stack */
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 0cab9e8..856f9a7 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -15,8 +15,6 @@
* parsing code.
*/
-#include <linux/module.h>
-
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -1231,12 +1229,4 @@ static int __init nvram_init(void)
return rc;
}
-
-static void __exit nvram_cleanup(void)
-{
- misc_deregister( &nvram_dev );
-}
-
-module_init(nvram_init);
-module_exit(nvram_cleanup);
-MODULE_LICENSE("GPL");
+device_initcall(nvram_init);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 59c4361..2d71269 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -21,6 +21,35 @@
#include <asm/firmware.h>
#include <asm/eeh.h>
+static struct pci_bus *find_bus_among_children(struct pci_bus *bus,
+ struct device_node *dn)
+{
+ struct pci_bus *child = NULL;
+ struct pci_bus *tmp;
+
+ if (pci_bus_to_OF_node(bus) == dn)
+ return bus;
+
+ list_for_each_entry(tmp, &bus->children, node) {
+ child = find_bus_among_children(tmp, dn);
+ if (child)
+ break;
+ }
+
+ return child;
+}
+
+struct pci_bus *pci_find_bus_by_node(struct device_node *dn)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ if (!pdn || !pdn->phb || !pdn->phb->bus)
+ return NULL;
+
+ return find_bus_among_children(pdn->phb->bus, dn);
+}
+EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
+
/**
* pcibios_release_device - release PCI device
* @dev: PCI device
@@ -38,20 +67,20 @@ void pcibios_release_device(struct pci_dev *dev)
}
/**
- * pcibios_remove_pci_devices - remove all devices under this bus
+ * pci_hp_remove_devices - remove all devices under this bus
* @bus: the indicated PCI bus
*
* Remove all of the PCI devices under this bus both from the
* linux pci device tree, and from the powerpc EEH address cache.
*/
-void pcibios_remove_pci_devices(struct pci_bus *bus)
+void pci_hp_remove_devices(struct pci_bus *bus)
{
struct pci_dev *dev, *tmp;
struct pci_bus *child_bus;
/* First go down child busses */
list_for_each_entry(child_bus, &bus->children, node)
- pcibios_remove_pci_devices(child_bus);
+ pci_hp_remove_devices(child_bus);
pr_debug("PCI: Removing devices on bus %04x:%02x\n",
pci_domain_nr(bus), bus->number);
@@ -60,11 +89,10 @@ void pcibios_remove_pci_devices(struct pci_bus *bus)
pci_stop_and_remove_bus_device(dev);
}
}
-
-EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+EXPORT_SYMBOL_GPL(pci_hp_remove_devices);
/**
- * pcibios_add_pci_devices - adds new pci devices to bus
+ * pci_hp_add_devices - adds new pci devices to bus
* @bus: the indicated PCI bus
*
* This routine will find and fixup new pci devices under
@@ -74,7 +102,7 @@ EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
* is how this routine differs from other, similar pcibios
* routines.)
*/
-void pcibios_add_pci_devices(struct pci_bus * bus)
+void pci_hp_add_devices(struct pci_bus *bus)
{
int slotno, mode, pass, max;
struct pci_dev *dev;
@@ -92,7 +120,8 @@ void pcibios_add_pci_devices(struct pci_bus * bus)
if (mode == PCI_PROBE_DEVTREE) {
/* use ofdt-based probe */
of_rescan_bus(dn, bus);
- } else if (mode == PCI_PROBE_NORMAL) {
+ } else if (mode == PCI_PROBE_NORMAL &&
+ dn->child && PCI_DN(dn->child)) {
/*
* Use legacy probe. In the partial hotplug case, we
* probably have grandchildren devices unplugged. So
@@ -114,4 +143,4 @@ void pcibios_add_pci_devices(struct pci_bus * bus)
}
pcibios_finish_adding_to_bus(bus);
}
-EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
+EXPORT_SYMBOL_GPL(pci_hp_add_devices);
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 60bb187..3759df5 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -38,7 +38,7 @@
* ISA drivers use hard coded offsets. If no ISA bus exists nothing
* is mapped on the first 64K of IO space
*/
-unsigned long pci_io_base = ISA_IO_BASE;
+unsigned long pci_io_base;
EXPORT_SYMBOL(pci_io_base);
static int __init pcibios_init(void)
@@ -47,6 +47,7 @@ static int __init pcibios_init(void)
printk(KERN_INFO "PCI: Probing PCI hardware\n");
+ pci_io_base = ISA_IO_BASE;
/* For now, override phys_mem_access_prot. If we need it,g
* later, we may move that initialization to each ppc_md
*/
@@ -159,7 +160,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
/* Establish the mapping */
if (__ioremap_at(phys_page, area->addr, size_page,
- _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)
+ pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)
return -ENOMEM;
/* Fixup hose IO resource */
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 38102cb..ecdccce 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -282,13 +282,9 @@ void remove_dev_pci_data(struct pci_dev *pdev)
#endif /* CONFIG_PCI_IOV */
}
-/*
- * Traverse_func that inits the PCI fields of the device node.
- * NOTE: this *must* be done before read/write config to the device.
- */
-void *update_dn_pci_info(struct device_node *dn, void *data)
+struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
+ struct device_node *dn)
{
- struct pci_controller *phb = data;
const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
const __be32 *regs;
struct device_node *parent;
@@ -299,7 +295,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
return NULL;
dn->data = pdn;
pdn->node = dn;
- pdn->phb = phb;
+ pdn->phb = hose;
#ifdef CONFIG_PPC_POWERNV
pdn->pe_number = IODA_INVALID_PE;
#endif
@@ -331,8 +327,32 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
if (pdn->parent)
list_add_tail(&pdn->list, &pdn->parent->child_list);
- return NULL;
+ return pdn;
}
+EXPORT_SYMBOL_GPL(pci_add_device_node_info);
+
+void pci_remove_device_node_info(struct device_node *dn)
+{
+ struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
+#ifdef CONFIG_EEH
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+ if (edev)
+ edev->pdn = NULL;
+#endif
+
+ if (!pdn)
+ return;
+
+ WARN_ON(!list_empty(&pdn->child_list));
+ list_del(&pdn->list);
+ if (pdn->parent)
+ of_node_put(pdn->parent->node);
+
+ dn->data = NULL;
+ kfree(pdn);
+}
+EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
/*
* Traverse a device tree stopping each PCI device in the tree.
@@ -352,8 +372,9 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
* one of these nodes we also assume its siblings are non-pci for
* performance.
*/
-void *traverse_pci_devices(struct device_node *start, traverse_func pre,
- void *data)
+void *pci_traverse_device_nodes(struct device_node *start,
+ void *(*fn)(struct device_node *, void *),
+ void *data)
{
struct device_node *dn, *nextdn;
void *ret;
@@ -368,8 +389,11 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
if (classp)
class = of_read_number(classp, 1);
- if (pre && ((ret = pre(dn, data)) != NULL))
- return ret;
+ if (fn) {
+ ret = fn(dn, data);
+ if (ret)
+ return ret;
+ }
/* If we are a PCI bridge, go down */
if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI ||
@@ -391,6 +415,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
}
return NULL;
}
+EXPORT_SYMBOL_GPL(pci_traverse_device_nodes);
static struct pci_dn *pci_dn_next_one(struct pci_dn *root,
struct pci_dn *pdn)
@@ -432,6 +457,18 @@ void *traverse_pci_dn(struct pci_dn *root,
return NULL;
}
+static void *add_pdn(struct device_node *dn, void *data)
+{
+ struct pci_controller *hose = data;
+ struct pci_dn *pdn;
+
+ pdn = pci_add_device_node_info(hose, dn);
+ if (!pdn)
+ return ERR_PTR(-ENOMEM);
+
+ return NULL;
+}
+
/**
* pci_devs_phb_init_dynamic - setup pci devices under this PHB
* phb: pci-to-host bridge (top-level bridge connecting to cpu)
@@ -446,8 +483,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
struct pci_dn *pdn;
/* PHB nodes themselves must not match */
- update_dn_pci_info(dn, phb);
- pdn = dn->data;
+ pdn = pci_add_device_node_info(phb, dn);
if (pdn) {
pdn->devfn = pdn->busno = -1;
pdn->vendor_id = pdn->device_id = pdn->class_code = 0;
@@ -456,7 +492,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
}
/* Update dn->phb ptrs for new phb and children devices */
- traverse_pci_devices(dn, update_dn_pci_info, phb);
+ pci_traverse_device_nodes(dn, add_pdn, phb);
}
/**
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 2a9280b..ea8a28f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -38,6 +38,7 @@
#include <linux/random.h>
#include <linux/hw_breakpoint.h>
#include <linux/uaccess.h>
+#include <linux/elf-randomize.h>
#include <asm/pgtable.h>
#include <asm/io.h>
@@ -55,6 +56,7 @@
#include <asm/firmware.h>
#endif
#include <asm/code-patching.h>
+#include <asm/exec.h>
#include <asm/livepatch.h>
#include <linux/kprobes.h>
@@ -1077,7 +1079,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
}
#endif /* CONFIG_PPC64 */
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
batch = this_cpu_ptr(&ppc64_tlb_batch);
if (batch->active) {
current_thread_info()->local_flags |= _TLF_LAZY_MMU;
@@ -1085,7 +1087,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
__flush_tlb_pending(batch);
batch->active = 0;
}
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
switch_booke_debug_regs(&new->thread.debug);
@@ -1131,7 +1133,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
last = _switch(old_thread, new_thread);
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
batch = this_cpu_ptr(&ppc64_tlb_batch);
@@ -1140,8 +1142,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
if (current_thread_info()->task->thread.regs)
restore_math(current_thread_info()->task->thread.regs);
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */
return last;
}
@@ -1376,6 +1377,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
unsigned long sp_vsid;
unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
+ if (radix_enabled())
+ return;
+
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T)
<< SLB_VSID_SHIFT_1T;
@@ -1924,7 +1928,8 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
* the heap, we can put it above 1TB so it is backed by a 1TB
* segment. Otherwise the heap will be in the bottom 1TB
* which always uses 256MB segments and this may result in a
- * performance penalty.
+ * performance penalty. We don't need to worry about radix. For
+ * radix, mmu_highuser_ssize remains unchanged from 256MB.
*/
if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T))
base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index a15fe1d..946e34ff 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -34,6 +34,7 @@
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <linux/cpu.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -167,6 +168,7 @@ static struct ibm_pa_feature {
*/
{CPU_FTR_TM_COMP, 0, 0,
PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0},
+ {0, MMU_FTR_RADIX, 0, 0, 40, 0, 0},
};
static void __init scan_features(unsigned long node, const unsigned char *ftrs,
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index aa610ce..c638e24 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -442,7 +442,7 @@ static void do_event_scan(void)
}
static void rtas_event_scan(struct work_struct *w);
-DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
/*
* Delay should be at least one second since some machines have problems if
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 44c8d03..8ca79b7 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -128,9 +128,7 @@ void machine_restart(char *cmd)
machine_shutdown();
if (ppc_md.restart)
ppc_md.restart(cmd);
-#ifdef CONFIG_SMP
smp_send_stop();
-#endif
printk(KERN_EMERG "System Halted, OK to turn off power\n");
local_irq_disable();
while (1) ;
@@ -141,9 +139,7 @@ void machine_power_off(void)
machine_shutdown();
if (pm_power_off)
pm_power_off();
-#ifdef CONFIG_SMP
smp_send_stop();
-#endif
printk(KERN_EMERG "System Halted, OK to turn off power\n");
local_irq_disable();
while (1) ;
@@ -159,9 +155,7 @@ void machine_halt(void)
machine_shutdown();
if (ppc_md.halt)
ppc_md.halt();
-#ifdef CONFIG_SMP
smp_send_stop();
-#endif
printk(KERN_EMERG "System Halted, OK to turn off power\n");
local_irq_disable();
while (1) ;
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 6669b17..6ae9bd5 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -31,6 +31,6 @@ void save_processor_state(void)
void restore_processor_state(void)
{
#ifdef CONFIG_PPC32
- switch_mmu_context(current->active_mm, current->active_mm);
+ switch_mmu_context(current->active_mm, current->active_mm, NULL);
#endif
}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 81b0900..3ed9a5a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -55,6 +55,7 @@
#include <linux/delay.h>
#include <linux/irq_work.h>
#include <linux/clk-provider.h>
+#include <linux/suspend.h>
#include <asm/trace.h>
#include <asm/io.h>
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 5f8dcda..8d7358f 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -87,7 +87,7 @@ struct vio_cmo_dev_entry {
* @curr: bytes currently allocated
* @high: high water mark for IO data usage
*/
-struct vio_cmo {
+static struct vio_cmo {
spinlock_t lock;
struct delayed_work balance_q;
struct list_head device_list;
@@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev)
return dma_iommu_ops.get_required_mask(dev);
}
-struct dma_map_ops vio_dma_mapping_ops = {
+static struct dma_map_ops vio_dma_mapping_ops = {
.alloc = vio_dma_iommu_alloc_coherent,
.free = vio_dma_iommu_free_coherent,
.mmap = dma_direct_mmap_coherent,
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index c7b78d8..05f09ae 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -447,7 +447,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct revmap_entry *rev;
struct page *page, *pages[1];
long index, ret, npages;
- unsigned long is_io;
+ bool is_ci;
unsigned int writing, write_ok;
struct vm_area_struct *vma;
unsigned long rcbits;
@@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
smp_rmb();
ret = -EFAULT;
- is_io = 0;
+ is_ci = false;
pfn = 0;
page = NULL;
pte_size = PAGE_SIZE;
@@ -521,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
pfn = vma->vm_pgoff +
((hva - vma->vm_start) >> PAGE_SHIFT);
pte_size = psize;
- is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+ is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
write_ok = vma->vm_flags & VM_WRITE;
}
up_read(&current->mm->mmap_sem);
@@ -558,10 +558,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
/* Check WIMG vs. the actual page we're accessing */
- if (!hpte_cache_flags_ok(r, is_io)) {
- if (is_io)
+ if (!hpte_cache_flags_ok(r, is_ci)) {
+ if (is_ci)
goto out_put;
-
/*
* Allow guest to map emulated device memory as
* uncacheable, but actually make it cacheable.
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 9324355..e20beae 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3272,6 +3272,12 @@ static int kvmppc_core_check_processor_compat_hv(void)
if (!cpu_has_feature(CPU_FTR_HVMODE) ||
!cpu_has_feature(CPU_FTR_ARCH_206))
return -EIO;
+ /*
+ * Disable KVM for Power9, untill the required bits merged.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return -EIO;
+
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4cb8db0..99b4e9d 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -175,7 +175,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
unsigned long g_ptel;
struct kvm_memory_slot *memslot;
unsigned hpage_shift;
- unsigned long is_io;
+ bool is_ci;
unsigned long *rmap;
pte_t *ptep;
unsigned int writing;
@@ -199,7 +199,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
gfn = gpa >> PAGE_SHIFT;
memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
pa = 0;
- is_io = ~0ul;
+ is_ci = false;
rmap = NULL;
if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
/* Emulated MMIO - mark this with key=31 */
@@ -250,7 +250,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
if (writing && !pte_write(pte))
/* make the actual HPTE be read-only */
ptel = hpte_make_readonly(ptel);
- is_io = hpte_cache_bits(pte_val(pte));
+ is_ci = pte_ci(pte);
pa = pte_pfn(pte) << PAGE_SHIFT;
pa |= hva & (host_pte_size - 1);
pa |= gpa & ~PAGE_MASK;
@@ -267,9 +267,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
else
pteh |= HPTE_V_ABSENT;
- /* Check WIMG */
- if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
- if (is_io)
+ /*If we had host pte mapping then Check WIMG */
+ if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
+ if (is_ci)
return H_PARAMETER;
/*
* Allow guest to map emulated device memory as
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 8129b0d..8e4f64f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1713,7 +1713,11 @@ static void kvmppc_core_destroy_vm_pr(struct kvm *kvm)
static int kvmppc_core_check_processor_compat_pr(void)
{
- /* we are always compatible */
+ /*
+ * Disable KVM for Power9 untill the required bits merged.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return -EIO;
return 0;
}
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index c44df2d..99f37f2 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -217,7 +217,7 @@ _GLOBAL(memcpy)
bdnz 40b
65: blr
-_GLOBAL(generic_memcpy)
+generic_memcpy:
srwi. r7,r5,3
addi r6,r3,-4
addi r4,r4,-4
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..3362299 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
}
}
#endif
+ break; /* illegal instruction */
case 31:
switch ((instr >> 1) & 0x3ff) {
@@ -1818,9 +1819,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
case 4:
__get_user_asmx(val, op.ea, err, "lwarx");
break;
+#ifdef __powerpc64__
case 8:
__get_user_asmx(val, op.ea, err, "ldarx");
break;
+#endif
default:
return 0;
}
@@ -1841,9 +1844,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
case 4:
__put_user_asmx(op.val, op.ea, err, "stwcx.", cr);
break;
+#ifdef __powerpc64__
case 8:
__put_user_asmx(op.val, op.ea, err, "stdcx.", cr);
break;
+#endif
default:
return 0;
}
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
index 07f49f1..f9de69a 100644
--- a/arch/powerpc/lib/xor_vmx.c
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -17,7 +17,17 @@
*
* Author: Anton Blanchard <anton@au.ibm.com>
*/
+
+/*
+ * Sparse (as at v0.5.0) gets very, very confused by this file.
+ * Make it a bit simpler for it.
+ */
+#if !defined(__CHECKER__)
#include <altivec.h>
+#else
+#define vec_xor(a, b) a ^ b
+#define vector __attribute__((vector_size(16)))
+#endif
#include <linux/preempt.h>
#include <linux/export.h>
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index adfee3f..f2cea6d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,10 +13,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
-obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o
-obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \
- mmu_context_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
+obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o
+obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
+obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o
ifeq ($(CONFIG_PPC_STD_MMU_64),y)
obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o
obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o
@@ -33,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-y += hugetlbpage.o
ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index a1b2713..139dec4 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
TLBCAM[index].MAS7 = (u64)phys >> 32;
/* Below is unlikely -- only for large user pages or similar */
- if (pte_user(flags)) {
+ if (pte_user(__pte(flags))) {
TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
}
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 47d1b26..6333b27 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -34,21 +34,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
old_pte = pte_val(pte);
/* If PTE busy, retry the access */
- if (unlikely(old_pte & _PAGE_BUSY))
+ if (unlikely(old_pte & H_PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
- if (unlikely(access & ~old_pte))
+ if (unlikely(!check_pte_access(access, old_pte)))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access. Since this is 4K insert of 64K page size
- * also add _PAGE_COMBO
+ * also add H_PAGE_COMBO
*/
- new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
- } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
/*
* PP bits. _PAGE_USER is already PP bit 0x2, so we only
* need to add in 0x1 if it's a read-only user page
@@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
vpn = hpt_vpn(ea, vsid, ssize);
- if (unlikely(old_pte & _PAGE_HASHPTE)) {
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
/*
* There MIGHT be an HPTE for this pte
*/
hash = hpt_hash(vpn, shift, ssize);
- if (old_pte & _PAGE_F_SECOND)
+ if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+ slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
MMU_PAGE_4K, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
- if (likely(!(old_pte & _PAGE_HASHPTE))) {
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
hash = hpt_hash(vpn, shift, ssize);
@@ -115,9 +115,10 @@ repeat:
MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
return -1;
}
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
- new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+ (H_PAGE_F_SECOND | H_PAGE_F_GIX);
}
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index b2d659c..16644e1 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
unsigned long g_idx;
unsigned long ptev = pte_val(rpte.pte);
- g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT;
+ g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT;
index = index >> 2;
if (g_idx & (0x1 << index))
return true;
@@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in
{
unsigned long g_idx;
- if (!(ptev & _PAGE_COMBO))
+ if (!(ptev & H_PAGE_COMBO))
return ptev;
index = index >> 2;
g_idx = 0x1 << index;
- return ptev | (g_idx << _PAGE_F_GIX_SHIFT);
+ return ptev | (g_idx << H_PAGE_F_GIX_SHIFT);
}
int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
@@ -66,21 +66,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
old_pte = pte_val(pte);
/* If PTE busy, retry the access */
- if (unlikely(old_pte & _PAGE_BUSY))
+ if (unlikely(old_pte & H_PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
- if (unlikely(access & ~old_pte))
+ if (unlikely(!check_pte_access(access, old_pte)))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access. Since this is 4K insert of 64K page size
- * also add _PAGE_COMBO
+ * also add H_PAGE_COMBO
*/
- new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO;
- if (access & _PAGE_RW)
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+ if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
- } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
/*
* Handle the subpage protection bits
*/
@@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
/*
*None of the sub 4k page is hashed
*/
- if (!(old_pte & _PAGE_HASHPTE))
+ if (!(old_pte & H_PAGE_HASHPTE))
goto htab_insert_hpte;
/*
* Check if the pte was already inserted into the hash table
* as a 64k HW page, and invalidate the 64k HPTE if so.
*/
- if (!(old_pte & _PAGE_COMBO)) {
+ if (!(old_pte & H_PAGE_COMBO)) {
flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
/*
* clear the old slot details from the old and new pte.
* On hash insert failure we use old pte value and we don't
* want slot information there if we have a insert failure.
*/
- old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
- new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
+ old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
+ new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
goto htab_insert_hpte;
}
/*
@@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
if (ret == -1)
goto htab_insert_hpte;
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
htab_insert_hpte:
/*
- * handle _PAGE_4K_PFN case
+ * handle H_PAGE_4K_PFN case
*/
- if (old_pte & _PAGE_4K_PFN) {
+ if (old_pte & H_PAGE_4K_PFN) {
/*
* All the sub 4k page have the same
* physical address.
@@ -199,20 +199,20 @@ repeat:
}
/*
* Insert slot number & secondary bit in PTE second half,
- * clear _PAGE_BUSY and set appropriate HPTE slot bit
- * Since we have _PAGE_BUSY set on ptep, we can be sure
+ * clear H_PAGE_BUSY and set appropriate HPTE slot bit
+ * Since we have H_PAGE_BUSY set on ptep, we can be sure
* nobody is undating hidx.
*/
hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
rpte.hidx &= ~(0xfUL << (subpg_index << 2));
*hidxp = rpte.hidx | (slot << (subpg_index << 2));
new_pte = mark_subptegroup_valid(new_pte, subpg_index);
- new_pte |= _PAGE_HASHPTE;
+ new_pte |= H_PAGE_HASHPTE;
/*
* check __real_pte for details on matching smp_rmb()
*/
smp_wmb();
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize)
{
-
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -235,27 +234,26 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
old_pte = pte_val(pte);
/* If PTE busy, retry the access */
- if (unlikely(old_pte & _PAGE_BUSY))
+ if (unlikely(old_pte & H_PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
- if (unlikely(access & ~old_pte))
+ if (unlikely(!check_pte_access(access, old_pte)))
return 1;
/*
* Check if PTE has the cache-inhibit bit set
* If so, bail out and refault as a 4k page
*/
if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
- unlikely(old_pte & _PAGE_NO_CACHE))
+ unlikely(pte_ci(pte)))
return 0;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access.
*/
- new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
- } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
rflags = htab_convert_pte_flags(new_pte);
@@ -264,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
vpn = hpt_vpn(ea, vsid, ssize);
- if (unlikely(old_pte & _PAGE_HASHPTE)) {
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
/*
* There MIGHT be an HPTE for this pte
*/
hash = hpt_hash(vpn, shift, ssize);
- if (old_pte & _PAGE_F_SECOND)
+ if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+ slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
MMU_PAGE_64K, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
- if (likely(!(old_pte & _PAGE_HASHPTE))) {
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
hash = hpt_hash(vpn, shift, ssize);
@@ -319,9 +317,10 @@ repeat:
MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
return -1;
}
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
- new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+ (H_PAGE_F_SECOND | H_PAGE_F_GIX);
}
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 8eaac81..d873f65 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
return -1;
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+ hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
@@ -719,6 +719,12 @@ static void native_flush_hash_range(unsigned long number, int local)
local_irq_restore(flags);
}
+static int native_update_partition_table(u64 patb1)
+{
+ partition_tb->patb1 = cpu_to_be64(patb1);
+ return 0;
+}
+
void __init hpte_init_native(void)
{
ppc_md.hpte_invalidate = native_hpte_invalidate;
@@ -729,4 +735,7 @@ void __init hpte_init_native(void)
ppc_md.hpte_clear_all = native_hpte_clear;
ppc_md.flush_hash_range = native_flush_hash_range;
ppc_md.hugepage_invalidate = native_hugepage_invalidate;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ ppc_md.update_partition_table = native_update_partition_table;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7635b1c..5926896 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -167,16 +167,22 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
if ((pteflags & _PAGE_EXEC) == 0)
rflags |= HPTE_R_N;
/*
- * PP bits:
+ * PPP bits:
* Linux uses slb key 0 for kernel and 1 for user.
- * kernel areas are mapped with PP=00
- * and there is no kernel RO (_PAGE_KERNEL_RO).
- * User area is mapped with PP=0x2 for read/write
- * or PP=0x3 for read-only (including writeable but clean pages).
+ * kernel RW areas are mapped with PPP=0b000
+ * User area is mapped with PPP=0b010 for read/write
+ * or PPP=0b011 for read-only (including writeable but clean pages).
*/
- if (pteflags & _PAGE_USER) {
- rflags |= 0x2;
- if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY)))
+ if (pteflags & _PAGE_PRIVILEGED) {
+ /*
+ * Kernel read only mapped with ppp bits 0b110
+ */
+ if (!(pteflags & _PAGE_WRITE))
+ rflags |= (HPTE_R_PP0 | 0x2);
+ } else {
+ if (pteflags & _PAGE_RWX)
+ rflags |= 0x2;
+ if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
/*
@@ -186,12 +192,13 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
/*
* Add in WIG bits
*/
- if (pteflags & _PAGE_WRITETHRU)
- rflags |= HPTE_R_W;
- if (pteflags & _PAGE_NO_CACHE)
+
+ if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
rflags |= HPTE_R_I;
- if (pteflags & _PAGE_GUARDED)
- rflags |= HPTE_R_G;
+ if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT)
+ rflags |= (HPTE_R_I | HPTE_R_G);
+ if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+ rflags |= (HPTE_R_I | HPTE_R_W);
return rflags;
}
@@ -669,6 +676,41 @@ int remove_section_mapping(unsigned long start, unsigned long end)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+ unsigned long pteg_count)
+{
+ unsigned long ps_field;
+ unsigned long htab_size;
+ unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+
+ /*
+ * slb llp encoding for the page size used in VPM real mode.
+ * We can ignore that for lpid 0
+ */
+ ps_field = 0;
+ htab_size = __ilog2(pteg_count) - 11;
+
+ BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
+ partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+ MEMBLOCK_ALLOC_ANYWHERE));
+
+ /* Initialize the Partition Table with no entries */
+ memset((void *)partition_tb, 0, patb_size);
+ partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
+ /*
+ * FIXME!! This should be done via update_partition table
+ * For now UPRT is 0 for us.
+ */
+ partition_tb->patb1 = 0;
+ DBG("Partition table %p\n", partition_tb);
+ /*
+ * update partition table control register,
+ * 64 K size.
+ */
+ mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+
+}
+
static void __init htab_initialize(void)
{
unsigned long table;
@@ -737,8 +779,11 @@ static void __init htab_initialize(void)
/* Initialize the HPT with no entries */
memset((void *)table, 0, htab_size_bytes);
- /* Set SDR1 */
- mtspr(SPRN_SDR1, _SDR1);
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ /* Set SDR1 */
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ hash_init_partition_table(table, pteg_count);
}
prot = pgprot_val(PAGE_KERNEL);
@@ -823,8 +868,38 @@ static void __init htab_initialize(void)
#undef KB
#undef MB
-void __init early_init_mmu(void)
+void __init hash__early_init_mmu(void)
{
+ /*
+ * initialize page table size
+ */
+ __pte_frag_nr = H_PTE_FRAG_NR;
+ __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+
+ __pte_index_size = H_PTE_INDEX_SIZE;
+ __pmd_index_size = H_PMD_INDEX_SIZE;
+ __pud_index_size = H_PUD_INDEX_SIZE;
+ __pgd_index_size = H_PGD_INDEX_SIZE;
+ __pmd_cache_index = H_PMD_CACHE_INDEX;
+ __pte_table_size = H_PTE_TABLE_SIZE;
+ __pmd_table_size = H_PMD_TABLE_SIZE;
+ __pud_table_size = H_PUD_TABLE_SIZE;
+ __pgd_table_size = H_PGD_TABLE_SIZE;
+ /*
+ * 4k use hugepd format, so for hash set then to
+ * zero
+ */
+ __pmd_val_bits = 0;
+ __pud_val_bits = 0;
+ __pgd_val_bits = 0;
+
+ __kernel_virt_start = H_KERN_VIRT_START;
+ __kernel_virt_size = H_KERN_VIRT_SIZE;
+ __vmalloc_start = H_VMALLOC_START;
+ __vmalloc_end = H_VMALLOC_END;
+ vmemmap = (struct page *)H_VMEMMAP_BASE;
+ ioremap_bot = IOREMAP_BASE;
+
/* Initialize the MMU Hash table and create the linear mapping
* of memory. Has to be done before SLB initialization as this is
* currently where the page size encoding is obtained.
@@ -836,12 +911,16 @@ void __init early_init_mmu(void)
}
#ifdef CONFIG_SMP
-void early_init_mmu_secondary(void)
+void hash__early_init_mmu_secondary(void)
{
/* Initialize hash table for that CPU */
- if (!firmware_has_feature(FW_FEATURE_LPAR))
- mtspr(SPRN_SDR1, _SDR1);
-
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ mtspr(SPRN_PTCR,
+ __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ }
/* Initialize SLB */
slb_initialize();
}
@@ -920,7 +999,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
* Userspace sets the subpage permissions using the subpage_prot system call.
*
* Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
+ * _PAGE_RWX: no access.
*/
static int subpage_protection(struct mm_struct *mm, unsigned long ea)
{
@@ -946,8 +1025,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
/* extract 2-bit bitfield for this 4k subpage */
spp >>= 30 - 2 * ((ea >> 12) & 0xf);
- /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
- spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
+ /*
+ * 0 -> full premission
+ * 1 -> Read only
+ * 2 -> no access.
+ * We return the flag that need to be cleared.
+ */
+ spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
return spp;
}
@@ -1084,7 +1168,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
/* Pre-check access permissions (will be re-checked atomically
* in __hash_page_XX but this pre-check is a fast path
*/
- if (access & ~pte_val(*ptep)) {
+ if (!check_pte_access(access, pte_val(*ptep))) {
DBG_LOW(" no access !\n");
rc = 1;
goto bail;
@@ -1122,8 +1206,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
#endif
/* Do actual hashing */
#ifdef CONFIG_PPC_64K_PAGES
- /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
- if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+ /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
demote_segment_4k(mm, ea);
psize = MMU_PAGE_4K;
}
@@ -1131,8 +1215,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
/* If this PTE is non-cacheable and we have restrictions on
* using non cacheable large pages, then we switch to 4k
*/
- if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
- (pte_val(*ptep) & _PAGE_NO_CACHE)) {
+ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
if (user_region) {
demote_segment_4k(mm, ea);
psize = MMU_PAGE_4K;
@@ -1209,7 +1292,7 @@ EXPORT_SYMBOL_GPL(hash_page);
int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
unsigned long dsisr)
{
- unsigned long access = _PAGE_PRESENT;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ;
unsigned long flags = 0;
struct mm_struct *mm = current->mm;
@@ -1220,14 +1303,18 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
flags |= HPTE_NOHPTE_UPDATE;
if (dsisr & DSISR_ISSTORE)
- access |= _PAGE_RW;
+ access |= _PAGE_WRITE;
/*
- * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
- * accessing a userspace segment (even from the kernel). We assume
- * kernel addresses always have the high bit set.
+ * We set _PAGE_PRIVILEGED only when
+ * kernel mode access kernel space.
+ *
+ * _PAGE_PRIVILEGED is NOT set
+ * 1) when kernel mode access user space
+ * 2) user space access kernel space.
*/
+ access |= _PAGE_PRIVILEGED;
if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID))
- access |= _PAGE_USER;
+ access &= ~_PAGE_PRIVILEGED;
if (trap == 0x400)
access |= _PAGE_EXEC;
@@ -1235,6 +1322,30 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
return hash_page_mm(mm, ea, access, trap, flags);
}
+#ifdef CONFIG_PPC_MM_SLICES
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ int psize = get_slice_psize(mm, ea);
+
+ /* We only prefault standard pages for now */
+ if (unlikely(psize != mm->context.user_psize))
+ return false;
+
+ /*
+ * Don't prefault if subpage protection is enabled for the EA.
+ */
+ if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+ return false;
+
+ return true;
+}
+#else
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ return true;
+}
+#endif
+
void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap)
{
@@ -1247,11 +1358,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
BUG_ON(REGION_ID(ea) != USER_REGION_ID);
-#ifdef CONFIG_PPC_MM_SLICES
- /* We only prefault standard pages for now */
- if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
+ if (!should_hash_preload(mm, ea))
return;
-#endif
DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
" trap=%lx\n", mm, mm->pgd, ea, access, trap);
@@ -1282,13 +1390,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
- /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
+ /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
* care of it once we actually try to access the page.
* That way we don't have to duplicate all of the logic for segment
* page size demotion here
*/
- if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
goto out_exit;
#endif /* CONFIG_PPC_64K_PAGES */
@@ -1570,7 +1678,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
/* We don't currently support the first MEMBLOCK not mapping 0
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index eb2accd..ba3fc22 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -37,20 +37,20 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
old_pmd = pmd_val(pmd);
/* If PMD busy, retry the access */
- if (unlikely(old_pmd & _PAGE_BUSY))
+ if (unlikely(old_pmd & H_PAGE_BUSY))
return 0;
/* If PMD permissions don't match, take page fault */
- if (unlikely(access & ~old_pmd))
+ if (unlikely(!check_pte_access(access, old_pmd)))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access
*/
- new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
+ new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
new_pmd |= _PAGE_DIRTY;
- } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
- old_pmd, new_pmd));
+ } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
rflags = htab_convert_pte_flags(new_pmd);
#if 0
@@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* base page size. This is because demote_segment won't flush
* hash page table entries.
*/
- if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) {
+ if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
ssize, flags);
/*
@@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
hash = hpt_hash(vpn, shift, ssize);
/* insert new entry */
pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
- new_pmd |= _PAGE_HASHPTE;
+ new_pmd |= H_PAGE_HASHPTE;
repeat:
hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
@@ -169,17 +169,17 @@ repeat:
mark_hpte_slot_valid(hpte_slot_array, index, slot);
}
/*
- * Mark the pte with _PAGE_COMBO, if we are trying to hash it with
+ * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
* base page size 4k.
*/
if (psize == MMU_PAGE_4K)
- new_pmd |= _PAGE_COMBO;
+ new_pmd |= H_PAGE_COMBO;
/*
* The hpte valid is stored in the pgtable whose address is in the
* second half of the PMD. Order this against clearing of the busy bit in
* huge pmd.
*/
smp_wmb();
- *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+ *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
return 0;
}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 8555fce..3058560 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -47,18 +47,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
do {
old_pte = pte_val(*ptep);
/* If PTE busy, retry the access */
- if (unlikely(old_pte & _PAGE_BUSY))
+ if (unlikely(old_pte & H_PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
- if (unlikely(access & ~old_pte))
+ if (unlikely(!check_pte_access(access, old_pte)))
return 1;
+
/* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access */
- new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
- } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
+ } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
rflags = htab_convert_pte_flags(new_pte);
sz = ((1UL) << shift);
@@ -68,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
/* Check if pte already has an hpte (case 2) */
- if (unlikely(old_pte & _PAGE_HASHPTE)) {
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
/* There MIGHT be an HPTE for this pte */
unsigned long hash, slot;
hash = hpt_hash(vpn, shift, ssize);
- if (old_pte & _PAGE_F_SECOND)
+ if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+ slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
mmu_psize, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
- if (likely(!(old_pte & _PAGE_HASHPTE))) {
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
unsigned long hash = hpt_hash(vpn, shift, ssize);
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
/* clear HPTE slot informations in new PTE */
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
mmu_psize, ssize);
@@ -105,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
return -1;
}
- new_pte |= (slot << _PAGE_F_GIX_SHIFT) &
- (_PAGE_F_SECOND | _PAGE_F_GIX);
+ new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+ (H_PAGE_F_SECOND | H_PAGE_F_GIX);
}
/*
* No need to use ldarx/stdcx here
*/
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
new file mode 100644
index 0000000..1e11559
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -0,0 +1,87 @@
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/mman.h>
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ unsigned long ap, shift;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ shift = huge_page_shift(hstate);
+ if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+ ap = mmu_get_ap(MMU_PAGE_2M);
+ else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+ ap = mmu_get_ap(MMU_PAGE_1G);
+ else {
+ WARN(1, "Wrong huge page shift\n");
+ return ;
+ }
+ radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ unsigned long ap, shift;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ shift = huge_page_shift(hstate);
+ if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+ ap = mmu_get_ap(MMU_PAGE_2M);
+ else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+ ap = mmu_get_ap(MMU_PAGE_1G);
+ else {
+ WARN(1, "Wrong huge page shift\n");
+ return ;
+ }
+ radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+}
+
+/*
+ * A vairant of hugetlb_get_unmapped_area doing topdown search
+ * FIXME!! should we do as x86 does or non hugetlb area does ?
+ * ie, use topdown or not based on mmap_is_legacy check ?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED) {
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ return addr;
+ }
+
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+ /*
+ * We are always doing an topdown search here. Slice code
+ * does that too.
+ */
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a4a90a8..5aac1a3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+ if (radix_enabled())
+ return radix__hugetlb_get_unmapped_area(file, addr, len,
+ pgoff, flags);
return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif
@@ -719,14 +722,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
-
- return 1UL << mmu_psize_to_shift(psize);
-#else
+ /* With radix we don't use slice, so derive it from vma*/
+ if (!radix_enabled())
+ return 1UL << mmu_psize_to_shift(psize);
+#endif
if (!is_vm_hugetlb_page(vma))
return PAGE_SIZE;
return huge_page_size(hstate_vma(vma));
-#endif
}
static inline bool is_power_of_4(unsigned long x)
@@ -825,7 +828,7 @@ static int __init hugetlbpage_init(void)
{
int psize;
- if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -865,6 +868,9 @@ static int __init hugetlbpage_init(void)
HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_2M].shift)
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
+
return 0;
}
@@ -1005,9 +1011,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
end = pte_end;
pte = READ_ONCE(*ptep);
- mask = _PAGE_PRESENT | _PAGE_USER;
+ mask = _PAGE_PRESENT | _PAGE_READ;
if (write)
- mask |= _PAGE_RW;
+ mask |= _PAGE_WRITE;
if ((pte_val(pte) & mask) != mask)
return 0;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index ba65566..33709bd 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -66,11 +66,11 @@
#include "mmu_decl.h"
#ifdef CONFIG_PPC_STD_MMU_64
-#if PGTABLE_RANGE > USER_VSID_RANGE
+#if H_PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
#warning TASK_SIZE is smaller than it needs to be.
#endif
#endif /* CONFIG_PPC_STD_MMU_64 */
@@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
return 0;
}
-/* On hash-based CPUs, the vmemmap is bolted in the hash table.
- *
- * On Book3E CPUs, the vmemmap is currently mapped in the top half of
- * the vmalloc space using normal page tables, though the size of
- * pages encoded in the PTEs can be different
- */
-
-#ifdef CONFIG_PPC_BOOK3E
-static int __meminit vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- /* Create a PTE encoding without page size */
- unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
- _PAGE_KERNEL_RW;
-
- /* PTEs only contain page size encodings up to 32M */
- BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
-
- /* Encode the size in the PTE */
- flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
-
- /* For each PTE for that area, map things. Note that we don't
- * increment phys because all PTEs are of the large size and
- * thus must have the low bits clear
- */
- for (i = 0; i < page_size; i += PAGE_SIZE)
- BUG_ON(map_kernel_page(start + i, phys, flags));
-
- return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void vmemmap_remove_mapping(unsigned long start,
- unsigned long page_size)
-{
-}
-#endif
-#else /* CONFIG_PPC_BOOK3E */
-static int __meminit vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- int rc = htab_bolt_mapping(start, start + page_size, phys,
- pgprot_val(PAGE_KERNEL),
- mmu_vmemmap_psize, mmu_kernel_ssize);
- if (rc < 0) {
- int rc2 = htab_remove_mapping(start, start + page_size,
- mmu_vmemmap_psize,
- mmu_kernel_ssize);
- BUG_ON(rc2 && (rc2 != -ENOENT));
- }
- return rc;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void vmemmap_remove_mapping(unsigned long start,
- unsigned long page_size)
-{
- int rc = htab_remove_mapping(start, start + page_size,
- mmu_vmemmap_psize,
- mmu_kernel_ssize);
- BUG_ON((rc < 0) && (rc != -ENOENT));
- WARN_ON(rc == -ENOENT);
-}
-#endif
-
-#endif /* CONFIG_PPC_BOOK3E */
-
struct vmemmap_backing *vmemmap_list;
static struct vmemmap_backing *next;
static int num_left;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ac79dbd..2fd57fa 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -68,12 +68,15 @@ pte_t *kmap_pte;
EXPORT_SYMBOL(kmap_pte);
pgprot_t kmap_prot;
EXPORT_SYMBOL(kmap_prot);
+#define TOP_ZONE ZONE_HIGHMEM
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
}
+#else
+#define TOP_ZONE ZONE_NORMAL
#endif
int page_is_ram(unsigned long pfn)
@@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
*/
int dma_pfn_limit_to_zone(u64 pfn_limit)
{
- enum zone_type top_zone = ZONE_NORMAL;
int i;
-#ifdef CONFIG_HIGHMEM
- top_zone = ZONE_HIGHMEM;
-#endif
-
- for (i = top_zone; i >= 0; i--) {
+ for (i = TOP_ZONE; i >= 0; i--) {
if (max_zone_pfns[i] <= pfn_limit)
return i;
}
@@ -289,7 +287,6 @@ void __init paging_init(void)
{
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
- enum zone_type top_zone;
#ifdef CONFIG_PPC32
unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -313,13 +310,9 @@ void __init paging_init(void)
(long int)((top_of_ram - total_ram) >> 20));
#ifdef CONFIG_HIGHMEM
- top_zone = ZONE_HIGHMEM;
limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
-#else
- top_zone = ZONE_NORMAL;
#endif
-
- limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
+ limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT);
zone_limits_final = true;
free_area_init_nodes(max_zone_pfns);
@@ -498,7 +491,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
* We don't need to worry about _PAGE_PRESENT here because we are
* called with either mm->page_table_lock held or ptl lock held
*/
- unsigned long access = 0, trap;
+ unsigned long access, trap;
+
+ if (radix_enabled())
+ return;
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
@@ -511,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
*
* We also avoid filling the hash if not coming from a fault
*/
- if (current->thread.regs == NULL)
- return;
- trap = TRAP(current->thread.regs);
- if (trap == 0x400)
- access |= _PAGE_EXEC;
- else if (trap != 0x300)
+
+ trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+ switch (trap) {
+ case 0x300:
+ access = 0UL;
+ break;
+ case 0x400:
+ access = _PAGE_EXEC;
+ break;
+ default:
return;
+ }
+
hash_preload(vma->vm_mm, address, access, trap);
#endif /* CONFIG_PPC_STD_MMU */
#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 4087705..2f1e443 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -26,6 +26,9 @@
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched.h>
+#include <linux/elf-randomize.h>
+#include <linux/security.h>
+#include <linux/mman.h>
/*
* Top of mmap area (just below the process stack).
@@ -78,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd)
return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
+#ifdef CONFIG_PPC_RADIX_MMU
+/*
+ * Same function as generic code used only for radix, because we don't need to overload
+ * the generic one. But we will have to duplicate, because hash select
+ * HAVE_ARCH_UNMAPPED_AREA
+ */
+static unsigned long
+radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct vm_unmapped_area_info info;
+
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long
+radix__arch_get_unmapped_area_topdown(struct file *filp,
+ const unsigned long addr0,
+ const unsigned long len,
+ const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = mm->mmap_base;
+ info.align_mask = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
+ unsigned long random_factor)
+{
+ if (mmap_is_legacy()) {
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = radix__arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor);
+ mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
+ }
+}
+#else
+/* dummy */
+extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
+ unsigned long random_factor);
+#endif
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
@@ -89,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
+ if (radix_enabled())
+ return radix__arch_pick_mmap_layout(mm, random_factor);
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 9ca6fe1..227b2a6 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -58,6 +58,17 @@ again:
return index;
}
EXPORT_SYMBOL_GPL(__init_new_context);
+static int radix__init_new_context(struct mm_struct *mm, int index)
+{
+ unsigned long rts_field;
+
+ /*
+ * set the process table entry,
+ */
+ rts_field = 3ull << PPC_BITLSHIFT(2);
+ process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
+ return 0;
+}
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
@@ -67,13 +78,27 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
if (index < 0)
return index;
- /* The old code would re-promote on fork, we don't do that
- * when using slices as it could cause problem promoting slices
- * that have been forced down to 4K
- */
- if (slice_mm_new_context(mm))
- slice_set_user_psize(mm, mmu_virtual_psize);
- subpage_prot_init_new_context(mm);
+ if (radix_enabled()) {
+ radix__init_new_context(mm, index);
+ } else {
+
+ /* The old code would re-promote on fork, we don't do that
+ * when using slices as it could cause problem promoting slices
+ * that have been forced down to 4K
+ *
+ * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+ * explicitly against context.id == 0. This ensures that we
+ * properly initialize context slice details for newly allocated
+ * mm's (which will have id == 0) and don't alter context slice
+ * inherited via fork (which will have id != 0).
+ *
+ * We should not be calling init_new_context() on init_mm. Hence a
+ * check against 0 is ok.
+ */
+ if (mm->context.id == 0)
+ slice_set_user_psize(mm, mmu_virtual_psize);
+ subpage_prot_init_new_context(mm);
+ }
mm->context.id = index;
#ifdef CONFIG_PPC_ICSWX
mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
@@ -144,8 +169,19 @@ void destroy_context(struct mm_struct *mm)
mm->context.cop_lockp = NULL;
#endif /* CONFIG_PPC_ICSWX */
+ if (radix_enabled())
+ process_tb[mm->context.id].prtb1 = 0;
+ else
+ subpage_prot_free(mm);
destroy_pagetable_page(mm);
__destroy_context(mm->context.id);
- subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
}
+
+#ifdef CONFIG_PPC_RADIX_MMU
+void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+ mtspr(SPRN_PID, next->context.id);
+ asm volatile("isync": : :"memory");
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 986afbc..7d95bc4 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -226,7 +226,8 @@ static void context_check_map(void)
static void context_check_map(void) { }
#endif
-void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
{
unsigned int i, id, cpu = smp_processor_id();
unsigned long *map;
@@ -334,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
mm->context.active = 0;
#ifdef CONFIG_PPC_MM_SLICES
- if (slice_mm_new_context(mm))
- slice_set_user_psize(mm, mmu_virtual_psize);
+ slice_set_user_psize(mm, mmu_virtual_psize);
#endif
return 0;
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index bfb7c0b..6af6532 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask;
#endif /* CONFIG_PPC32 */
-#ifdef CONFIG_PPC64
-extern int map_kernel_page(unsigned long ea, unsigned long pa,
- unsigned long flags);
-#endif /* CONFIG_PPC64 */
-
extern unsigned long ioremap_bot;
extern unsigned long __max_low_memory;
extern phys_addr_t __initial_memory_limit_addr;
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c
new file mode 100644
index 0000000..a229893
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-book3e.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/memblock.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/dma.h>
+
+#include "mmu_decl.h"
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+int __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding without page size */
+ unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+ _PAGE_KERNEL_RW;
+
+ /* PTEs only contain page size encodings up to 32M */
+ BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+ /* Encode the size in the PTE */
+ flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+ /* For each PTE for that area, map things. Note that we don't
+ * increment phys because all PTEs are of the large size and
+ * thus must have the low bits clear
+ */
+ for (i = 0; i < page_size; i += PAGE_SIZE)
+ BUG_ON(map_kernel_page(start + i, phys, flags));
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
+{
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+static __ref void *early_alloc_pgtable(unsigned long size)
+{
+ void *pt;
+
+ pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
+ memset(pt, 0, size);
+
+ return pt;
+}
+
+/*
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+{
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+ __pgprot(flags)));
+ } else {
+ pgdp = pgd_offset_k(ea);
+#ifndef __PAGETABLE_PUD_FOLDED
+ if (pgd_none(*pgdp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+ BUG_ON(pudp == NULL);
+ pgd_populate(&init_mm, pgdp, pudp);
+ }
+#endif /* !__PAGETABLE_PUD_FOLDED */
+ pudp = pud_offset(pgdp, ea);
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+ BUG_ON(pmdp == NULL);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE);
+ BUG_ON(ptep == NULL);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+ __pgprot(flags)));
+ }
+
+ smp_wmb();
+ return 0;
+}
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
new file mode 100644
index 0000000..eb44511
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+#include <trace/events/thp.h>
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+ /*
+ * Since we are not supporting SW TLB systems, we don't
+ * have any thing similar to flush_tlb_page_nohash()
+ */
+ }
+ return changed;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+ assert_spin_locked(&mm->page_table_lock);
+ WARN_ON(!pmd_trans_huge(pmd));
+#endif
+ trace_hugepage_set_pmd(addr, pmd_val(pmd));
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * This ensures that generic code that rely on IRQ disabling
+ * to prevent a parallel THP split work as expected.
+ */
+ kick_all_cpus_sync();
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pmdv;
+
+ pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+ return pmd_set_protbits(__pmd(pmdv), pgprot);
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ unsigned long pmdv;
+
+ pmdv = pmd_val(pmd);
+ pmdv &= _HPAGE_CHG_MASK;
+ return pmd_set_protbits(__pmd(pmdv), newprot);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ return;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
new file mode 100644
index 0000000..c23e286
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ */
+int __meminit hash__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ int rc = htab_bolt_mapping(start, start + page_size, phys,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize, mmu_kernel_ssize);
+ if (rc < 0) {
+ int rc2 = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON(rc2 && (rc2 != -ENOENT));
+ }
+ return rc;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void hash__vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
+{
+ int rc = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON((rc < 0) && (rc != -ENOENT));
+ WARN_ON(rc == -ENOENT);
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+{
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+ __pgprot(flags)));
+ } else {
+ /*
+ * If the mm subsystem is not fully up, we cannot create a
+ * linux page table entry for this mapping. Simply bolt an
+ * entry in the hardware page table.
+ *
+ */
+ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+ mmu_io_psize, mmu_kernel_ssize)) {
+ printk(KERN_ERR "Failed to do bolted mapping IO "
+ "memory at %016lx !\n", pa);
+ return -ENOMEM;
+ }
+ }
+
+ smp_wmb();
+ return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ __be64 old_be, tmp;
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&mm->page_table_lock);
+#endif
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ and. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
+ "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
+ : "cc" );
+
+ old = be64_to_cpu(old_be);
+
+ trace_hugepage_update(addr, old, clr, set);
+ if (old & H_PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp, old);
+ return old;
+}
+
+pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+ assert_spin_locked(&mm->page_table_lock);
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(&mm->page_table_lock);
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
+
+ /*
+ * We can't mark the pmd none here, because that will cause a race
+ * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
+ * we spilt, but at the same time we wan't rest of the ppc64 code
+ * not to insert hash pte on this, because we will be modifying
+ * the deposited pgtable in the caller of this function. Hence
+ * clear the _PAGE_USER so that we move the fault handling to
+ * higher level function and that will serialize against ptl.
+ * We need to flush existing hash pte entries here even though,
+ * the translation is still valid, because we will withdraw
+ * pgtable_t after this.
+ */
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long old_pmd)
+{
+ int ssize;
+ unsigned int psize;
+ unsigned long vsid;
+ unsigned long flags = 0;
+ const struct cpumask *tmp;
+
+ /* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+ psize = get_slice_psize(mm, addr);
+ BUG_ON(psize == MMU_PAGE_16M);
+#endif
+ if (old_pmd & H_PAGE_COMBO)
+ psize = MMU_PAGE_4K;
+ else
+ psize = MMU_PAGE_64K;
+
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ tmp = cpumask_of(smp_processor_id());
+ if (cpumask_equal(mm_cpumask(mm), tmp))
+ flags |= HPTE_LOCAL_UPDATE;
+
+ return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ /*
+ * Serialize against find_linux_pte_or_hugepte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_linux_pte_or_hugepage to finish.
+ */
+ kick_all_cpus_sync();
+ return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segement
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
new file mode 100644
index 0000000..18b2c11
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -0,0 +1,526 @@
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/sched.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+
+#include <trace/events/thp.h>
+
+static int native_update_partition_table(u64 patb1)
+{
+ partition_tb->patb1 = cpu_to_be64(patb1);
+ return 0;
+}
+
+static __ref void *early_alloc_pgtable(unsigned long size)
+{
+ void *pt;
+
+ pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
+ memset(pt, 0, size);
+
+ return pt;
+}
+
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size)
+{
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ /*
+ * Make sure task size is correct as per the max adddr
+ */
+ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ if (map_page_size == PMD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ } else {
+ pgdp = pgd_offset_k(ea);
+ if (pgd_none(*pgdp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+ BUG_ON(pudp == NULL);
+ pgd_populate(&init_mm, pgdp, pudp);
+ }
+ pudp = pud_offset(pgdp, ea);
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+ BUG_ON(pmdp == NULL);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (map_page_size == PMD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE);
+ BUG_ON(ptep == NULL);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+ }
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
+ smp_wmb();
+ return 0;
+}
+
+static void __init radix_init_pgtable(void)
+{
+ int loop_count;
+ u64 base, end, start_addr;
+ unsigned long rts_field;
+ struct memblock_region *reg;
+ unsigned long linear_page_size;
+
+ /* We don't support slb for radix */
+ mmu_slb_size = 0;
+ /*
+ * Create the linear mapping, using standard page size for now
+ */
+ loop_count = 0;
+ for_each_memblock(memory, reg) {
+
+ start_addr = reg->base;
+
+redo:
+ if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
+ linear_page_size = PUD_SIZE;
+ else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
+ linear_page_size = PMD_SIZE;
+ else
+ linear_page_size = PAGE_SIZE;
+
+ base = _ALIGN_UP(start_addr, linear_page_size);
+ end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);
+
+ pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
+ (unsigned long)base, (unsigned long)end,
+ linear_page_size);
+
+ while (base < end) {
+ radix__map_kernel_page((unsigned long)__va(base),
+ base, PAGE_KERNEL_X,
+ linear_page_size);
+ base += linear_page_size;
+ }
+ /*
+ * map the rest using lower page size
+ */
+ if (end < reg->base + reg->size) {
+ start_addr = end;
+ loop_count++;
+ goto redo;
+ }
+ }
+ /*
+ * Allocate Partition table and process table for the
+ * host.
+ */
+ BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large.");
+ process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
+ /*
+ * Fill in the process table.
+ * we support 52 bits, hence 52-28 = 24, 11000
+ */
+ rts_field = 3ull << PPC_BITLSHIFT(2);
+ process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
+ /*
+ * Fill in the partition table. We are suppose to use effective address
+ * of process table here. But our linear mapping also enable us to use
+ * physical address here.
+ */
+ ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR);
+ pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
+}
+
+static void __init radix_init_partition_table(void)
+{
+ unsigned long rts_field;
+ /*
+ * we support 52 bits, hence 52-28 = 24, 11000
+ */
+ rts_field = 3ull << PPC_BITLSHIFT(2);
+
+ BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
+ partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
+ partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
+ RADIX_PGD_INDEX_SIZE | PATB_HR);
+ printk("Partition table %p\n", partition_tb);
+
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+ /*
+ * update partition table control register,
+ * 64 K size.
+ */
+ mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+void __init radix_init_native(void)
+{
+ ppc_md.update_partition_table = native_update_partition_table;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x15:
+ idx = MMU_PAGE_2M;
+ break;
+ case 0x1e:
+ idx = MMU_PAGE_1G;
+ break;
+ }
+ return idx;
+}
+
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ int shift, idx;
+ unsigned int ap;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ for (; size >= 4; size -= 4, ++prop) {
+
+ struct mmu_psize_def *def;
+
+ /* top 3 bit is AP encoding */
+ shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
+ ap = be32_to_cpu(prop[0]) >> 29;
+ pr_info("Page size sift = %d AP=0x%x\n", shift, ap);
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ def = &mmu_psize_defs[idx];
+ def->shift = shift;
+ def->ap = ap;
+ }
+
+ /* needed ? */
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+ return 1;
+}
+
+static void __init radix_init_page_sizes(void)
+{
+ int rc;
+
+ /*
+ * Try to find the available page sizes in the device-tree
+ */
+ rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
+ if (rc != 0) /* Found */
+ goto found;
+ /*
+ * let's assume we have page 4k and 64k support
+ */
+ mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+ mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+ mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+ mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+found:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ if (mmu_psize_defs[MMU_PAGE_2M].shift) {
+ /*
+ * map vmemmap using 2M if available
+ */
+ mmu_vmemmap_psize = MMU_PAGE_2M;
+ }
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+ return;
+}
+
+void __init radix__early_init_mmu(void)
+{
+ unsigned long lpcr;
+ /*
+ * setup LPCR UPRT based on mmu_features
+ */
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ /* PAGE_SIZE mappings */
+ mmu_virtual_psize = MMU_PAGE_64K;
+#else
+ mmu_virtual_psize = MMU_PAGE_4K;
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /* vmemmap mapping */
+ mmu_vmemmap_psize = mmu_virtual_psize;
+#endif
+ /*
+ * initialize page table size
+ */
+ __pte_index_size = RADIX_PTE_INDEX_SIZE;
+ __pmd_index_size = RADIX_PMD_INDEX_SIZE;
+ __pud_index_size = RADIX_PUD_INDEX_SIZE;
+ __pgd_index_size = RADIX_PGD_INDEX_SIZE;
+ __pmd_cache_index = RADIX_PMD_INDEX_SIZE;
+ __pte_table_size = RADIX_PTE_TABLE_SIZE;
+ __pmd_table_size = RADIX_PMD_TABLE_SIZE;
+ __pud_table_size = RADIX_PUD_TABLE_SIZE;
+ __pgd_table_size = RADIX_PGD_TABLE_SIZE;
+
+ __pmd_val_bits = RADIX_PMD_VAL_BITS;
+ __pud_val_bits = RADIX_PUD_VAL_BITS;
+ __pgd_val_bits = RADIX_PGD_VAL_BITS;
+
+ __kernel_virt_start = RADIX_KERN_VIRT_START;
+ __kernel_virt_size = RADIX_KERN_VIRT_SIZE;
+ __vmalloc_start = RADIX_VMALLOC_START;
+ __vmalloc_end = RADIX_VMALLOC_END;
+ vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
+ ioremap_bot = IOREMAP_BASE;
+ /*
+ * For now radix also use the same frag size
+ */
+ __pte_frag_nr = H_PTE_FRAG_NR;
+ __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+
+ radix_init_page_sizes();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ radix_init_partition_table();
+
+ radix_init_pgtable();
+}
+
+void radix__early_init_mmu_secondary(void)
+{
+ unsigned long lpcr;
+ /*
+ * setup LPCR UPRT based on mmu_features
+ */
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
+ /*
+ * update partition table control register, 64 K size.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ mtspr(SPRN_PTCR,
+ __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+ /*
+ * We limit the allocation that depend on ppc64_rma_size
+ * to first_memblock_size. We also clamp it to 1GB to
+ * avoid some funky things such as RTAS bugs.
+ *
+ * On radix config we really don't have a limitation
+ * on real mode access. But keeping it as above works
+ * well enough.
+ */
+ ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+ /*
+ * Finally limit subsequent allocations. We really don't want
+ * to limit the memblock allocations to rma_size. FIXME!! should
+ * we even limit at all ?
+ */
+ memblock_set_current_limit(first_memblock_base + first_memblock_size);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding */
+ unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+
+ BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size));
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+ /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!radix__pmd_trans_huge(*pmdp));
+ assert_spin_locked(&mm->page_table_lock);
+#endif
+
+ old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+ trace_hugepage_update(addr, old, clr, set);
+
+ return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*FIXME!! Verify whether we need this kick below */
+ kick_all_cpus_sync();
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. Inorder to save the deposisted
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ struct list_head *lh = (struct list_head *) pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pte_t *ptep;
+ pgtable_t pgtable;
+ struct list_head *lh;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ *ptep = __pte(0);
+ ptep++;
+ *ptep = __pte(0);
+ return pgtable;
+}
+
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ unsigned long old;
+
+ old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * Serialize against find_linux_pte_or_hugepte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_linux_pte_or_hugepage to finish.
+ */
+ kick_all_cpus_sync();
+ return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+ /* For radix 2M at PMD level means thp */
+ if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+ return 1;
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index de37ff4..88a3075 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -38,14 +38,25 @@ static inline int is_exec_fault(void)
/* We only try to do i/d cache coherency on stuff that looks like
* reasonably "normal" PTEs. We currently require a PTE to be present
- * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
* on userspace PTEs
*/
static inline int pte_looks_normal(pte_t pte)
{
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+ if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) {
+ if (pte_ci(pte))
+ return 0;
+ if (pte_user(pte))
+ return 1;
+ }
+ return 0;
+#else
return (pte_val(pte) &
- (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
- (_PAGE_PRESENT | _PAGE_USER);
+ (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+ (_PAGE_PRESENT | _PAGE_USER);
+#endif
}
static struct page *maybe_pte_to_page(pte_t pte)
@@ -71,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte)
static pte_t set_pte_filter(pte_t pte)
{
+ if (radix_enabled())
+ return pte;
+
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
cpu_has_feature(CPU_FTR_NOEXECUTE))) {
@@ -177,8 +191,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
* _PAGE_PRESENT, but we can be sure that it is not in hpte.
* Hence we can use set_pte_at for them.
*/
- VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
- (_PAGE_PRESENT | _PAGE_USER));
+ VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));
+
/*
* Add the pte bit when tryint set a pte
*/
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3471060..e009e06 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -55,104 +55,63 @@
#include "mmu_decl.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/thp.h>
-
-/* Some sanity checking */
-#if TASK_SIZE_USER64 > PGTABLE_RANGE
-#error TASK_SIZE_USER64 exceeds pagetable range
-#endif
-
#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
-unsigned long ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PPC_MMU_NOHASH
-static __ref void *early_alloc_pgtable(unsigned long size)
-{
- void *pt;
-
- pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
- memset(pt, 0, size);
-
- return pt;
-}
-#endif /* CONFIG_PPC_MMU_NOHASH */
-
+#ifdef CONFIG_PPC_BOOK3S_64
/*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
+ * partition table and process table for ISA 3.0
*/
-int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
-{
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- if (slab_is_available()) {
- pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
- if (!pudp)
- return -ENOMEM;
- pmdp = pmd_alloc(&init_mm, pudp, ea);
- if (!pmdp)
- return -ENOMEM;
- ptep = pte_alloc_kernel(pmdp, ea);
- if (!ptep)
- return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
- } else {
-#ifdef CONFIG_PPC_MMU_NOHASH
- pgdp = pgd_offset_k(ea);
-#ifdef PUD_TABLE_SIZE
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
- BUG_ON(pudp == NULL);
- pgd_populate(&init_mm, pgdp, pudp);
- }
-#endif /* PUD_TABLE_SIZE */
- pudp = pud_offset(pgdp, ea);
- if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
- BUG_ON(pmdp == NULL);
- pud_populate(&init_mm, pudp, pmdp);
- }
- pmdp = pmd_offset(pudp, ea);
- if (!pmd_present(*pmdp)) {
- ptep = early_alloc_pgtable(PAGE_SIZE);
- BUG_ON(ptep == NULL);
- pmd_populate_kernel(&init_mm, pmdp, ptep);
- }
- ptep = pte_offset_kernel(pmdp, ea);
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
-#else /* CONFIG_PPC_MMU_NOHASH */
- /*
- * If the mm subsystem is not fully up, we cannot create a
- * linux page table entry for this mapping. Simply bolt an
- * entry in the hardware page table.
- *
- */
- if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
- mmu_io_psize, mmu_kernel_ssize)) {
- printk(KERN_ERR "Failed to do bolted mapping IO "
- "memory at %016lx !\n", pa);
- return -ENOMEM;
- }
-#endif /* !CONFIG_PPC_MMU_NOHASH */
- }
-
- smp_wmb();
- return 0;
-}
-
+struct prtb_entry *process_tb;
+struct patb_entry *partition_tb;
+/*
+ * page table size
+ */
+unsigned long __pte_index_size;
+EXPORT_SYMBOL(__pte_index_size);
+unsigned long __pmd_index_size;
+EXPORT_SYMBOL(__pmd_index_size);
+unsigned long __pud_index_size;
+EXPORT_SYMBOL(__pud_index_size);
+unsigned long __pgd_index_size;
+EXPORT_SYMBOL(__pgd_index_size);
+unsigned long __pmd_cache_index;
+EXPORT_SYMBOL(__pmd_cache_index);
+unsigned long __pte_table_size;
+EXPORT_SYMBOL(__pte_table_size);
+unsigned long __pmd_table_size;
+EXPORT_SYMBOL(__pmd_table_size);
+unsigned long __pud_table_size;
+EXPORT_SYMBOL(__pud_table_size);
+unsigned long __pgd_table_size;
+EXPORT_SYMBOL(__pgd_table_size);
+unsigned long __pmd_val_bits;
+EXPORT_SYMBOL(__pmd_val_bits);
+unsigned long __pud_val_bits;
+EXPORT_SYMBOL(__pud_val_bits);
+unsigned long __pgd_val_bits;
+EXPORT_SYMBOL(__pgd_val_bits);
+unsigned long __kernel_virt_start;
+EXPORT_SYMBOL(__kernel_virt_start);
+unsigned long __kernel_virt_size;
+EXPORT_SYMBOL(__kernel_virt_size);
+unsigned long __vmalloc_start;
+EXPORT_SYMBOL(__vmalloc_start);
+unsigned long __vmalloc_end;
+EXPORT_SYMBOL(__vmalloc_end);
+struct page *vmemmap;
+EXPORT_SYMBOL(vmemmap);
+unsigned long __pte_frag_nr;
+EXPORT_SYMBOL(__pte_frag_nr);
+unsigned long __pte_frag_size_shift;
+EXPORT_SYMBOL(__pte_frag_size_shift);
+unsigned long ioremap_bot;
+#else /* !CONFIG_PPC_BOOK3S_64 */
+unsigned long ioremap_bot = IOREMAP_BASE;
+#endif
/**
* __ioremap_at - Low level function to establish the page tables
@@ -167,12 +126,8 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
if ((flags & _PAGE_PRESENT) == 0)
flags |= pgprot_val(PAGE_KERNEL);
- /* Non-cacheable page cannot be coherent */
- if (flags & _PAGE_NO_CACHE)
- flags &= ~_PAGE_COHERENT;
-
/* We don't support the 4K PFN hack with ioremap */
- if (flags & _PAGE_4K_PFN)
+ if (flags & H_PAGE_4K_PFN)
return NULL;
WARN_ON(pa & ~PAGE_MASK);
@@ -253,7 +208,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
- unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+ unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
@@ -263,7 +218,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
- unsigned long flags = _PAGE_NO_CACHE;
+ unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
@@ -277,11 +232,20 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
void *caller = __builtin_return_address(0);
/* writeable implies dirty for kernel addresses */
- if (flags & _PAGE_RW)
+ if (flags & _PAGE_WRITE)
flags |= _PAGE_DIRTY;
- /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- flags &= ~(_PAGE_USER | _PAGE_EXEC);
+ /* we don't want to let _PAGE_EXEC leak out */
+ flags &= ~_PAGE_EXEC;
+ /*
+ * Force kernel mapping.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64)
+ flags |= _PAGE_PRIVILEGED;
+#else
+ flags &= ~_PAGE_USER;
+#endif
+
#ifdef _PAGE_BAP_SR
/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
@@ -411,7 +375,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
return (pte_t *)ret;
}
-pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
pte_t *pte;
@@ -421,8 +385,9 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
return __alloc_for_cache(mm, kernel);
}
+#endif /* CONFIG_PPC_64K_PAGES */
-void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+void pte_fragment_free(unsigned long *table, int kernel)
{
struct page *page = virt_to_page(table);
if (put_page_testzero(page)) {
@@ -433,15 +398,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
}
#ifdef CONFIG_SMP
-static void page_table_free_rcu(void *table)
-{
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
-}
-
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
unsigned long pgf = (unsigned long)table;
@@ -458,7 +414,7 @@ void __tlb_remove_table(void *_table)
if (!shift)
/* PTE page needs special handling */
- page_table_free_rcu(table);
+ pte_fragment_free(table, 0);
else {
BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
kmem_cache_free(PGT_CACHE(shift), table);
@@ -469,385 +425,10 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
if (!shift) {
/* PTE page needs special handling */
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
+ pte_fragment_free(table, 0);
} else {
BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
kmem_cache_free(PGT_CACHE(shift), table);
}
}
#endif
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp, pmd_t entry, int dirty)
-{
- int changed;
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
- changed = !pmd_same(*(pmdp), entry);
- if (changed) {
- __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
- /*
- * Since we are not supporting SW TLB systems, we don't
- * have any thing similar to flush_tlb_page_nohash()
- */
- }
- return changed;
-}
-
-unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long clr,
- unsigned long set)
-{
-
- unsigned long old, tmp;
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- andc %1,%0,%4 \n\
- or %1,%1,%7\n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd((old & ~clr) | set);
-#endif
- trace_hugepage_update(addr, old, clr, set);
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(mm, addr, pmdp, old);
- return old;
-}
-
-pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(pmd_trans_huge(*pmdp));
-
- pmd = *pmdp;
- pmd_clear(pmdp);
- /*
- * Wait for all pending hash_page to finish. This is needed
- * in case of subpage collapse. When we collapse normal pages
- * to hugepage, we first clear the pmd, then invalidate all
- * the PTE entries. The assumption here is that any low level
- * page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
- * hash_page with local ptep pointer value. Such a hash page
- * can result in adding new HPTE entries for normal subpages.
- * That means we could be modifying the page content as we
- * copy them to a huge page. So wait for parallel hash_page
- * to finish before invalidating HPTE entries. We can do this
- * by sending an IPI to all the cpus and executing a dummy
- * function there.
- */
- kick_all_cpus_sync();
- /*
- * Now invalidate the hpte entries in the range
- * covered by pmd. This make sure we take a
- * fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
- * hence wait for collapse to complete. Without this
- * the __collapse_huge_page_copy can result in copying
- * the old content.
- */
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
- return pmd;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
-{
- pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
- /*
- * we store the pgtable in the second half of PMD
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- *pgtable_slot = pgtable;
- /*
- * expose the deposited pgtable to other cpus.
- * before we set the hugepage PTE at pmd level
- * hash fault code looks at the deposted pgtable
- * to store hash index values.
- */
- smp_wmb();
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
- pgtable_t pgtable;
- pgtable_t *pgtable_slot;
-
- assert_spin_locked(&mm->page_table_lock);
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Once we withdraw, mark the entry NULL.
- */
- *pgtable_slot = NULL;
- /*
- * We store HPTE information in the deposited PTE fragment.
- * zero out the content on withdraw.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- return pgtable;
-}
-
-void pmdp_huge_split_prepare(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
-
- /*
- * We can't mark the pmd none here, because that will cause a race
- * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
- * we spilt, but at the same time we wan't rest of the ppc64 code
- * not to insert hash pte on this, because we will be modifying
- * the deposited pgtable in the caller of this function. Hence
- * clear the _PAGE_USER so that we move the fault handling to
- * higher level function and that will serialize against ptl.
- * We need to flush existing hash pte entries here even though,
- * the translation is still valid, because we will withdraw
- * pgtable_t after this.
- */
- pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
-}
-
-
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
- WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
- (_PAGE_PRESENT | _PAGE_USER));
- assert_spin_locked(&mm->page_table_lock);
- WARN_ON(!pmd_trans_huge(pmd));
-#endif
- trace_hugepage_set_pmd(addr, pmd_val(pmd));
- return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
-}
-
-/*
- * We use this to invalidate a pmdp entry before switching from a
- * hugepte to regular pmd entry.
- */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
-
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- */
- kick_all_cpus_sync();
-}
-
-/*
- * A linux hugepage PMD was changed and the corresponding hash table entries
- * neesd to be flushed.
- */
-void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long old_pmd)
-{
- int ssize;
- unsigned int psize;
- unsigned long vsid;
- unsigned long flags = 0;
- const struct cpumask *tmp;
-
- /* get the base page size,vsid and segment size */
-#ifdef CONFIG_DEBUG_VM
- psize = get_slice_psize(mm, addr);
- BUG_ON(psize == MMU_PAGE_16M);
-#endif
- if (old_pmd & _PAGE_COMBO)
- psize = MMU_PAGE_4K;
- else
- psize = MMU_PAGE_64K;
-
- if (!is_kernel_addr(addr)) {
- ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
- } else {
- vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
- ssize = mmu_kernel_ssize;
- }
-
- tmp = cpumask_of(smp_processor_id());
- if (cpumask_equal(mm_cpumask(mm), tmp))
- flags |= HPTE_LOCAL_UPDATE;
-
- return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
-}
-
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-{
- return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
-}
-
-pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
-{
- unsigned long pmdv;
-
- pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK;
- return pmd_set_protbits(__pmd(pmdv), pgprot);
-}
-
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
- return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- unsigned long pmdv;
-
- pmdv = pmd_val(pmd);
- pmdv &= _HPAGE_CHG_MASK;
- return pmd_set_protbits(__pmd(pmdv), newprot);
-}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- return;
-}
-
-pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t old_pmd;
- pgtable_t pgtable;
- unsigned long old;
- pgtable_t *pgtable_slot;
-
- old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
- old_pmd = __pmd(old);
- /*
- * We have pmd == none and we are holding page_table_lock.
- * So we can safely go and clear the pgtable hash
- * index info.
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Let's zero out old valid and hash index details
- * hash fault look at them.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- /*
- * Serialize against find_linux_pte_or_hugepte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_linux_pte_or_hugepage to finish.
- */
- kick_all_cpus_sync();
- return old_pmd;
-}
-
-int has_transparent_hugepage(void)
-{
-
- BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
- "hugepages can't be allocated by the buddy allocator");
-
- BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
- "We need more than 2 pages to do deferred thp split");
-
- if (!mmu_has_feature(MMU_FTR_16M_PAGE))
- return 0;
- /*
- * We support THP only if PMD_SIZE is 16MB.
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
- return 0;
- /*
- * We need to make sure that we support 16MB hugepage in a segement
- * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
- * of 64K.
- */
- /*
- * If we have 64K HPTE, we will be using that by default
- */
- if (mmu_psize_defs[MMU_PAGE_64K].shift &&
- (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
- return 0;
- /*
- * Ok we only have 4K HPTE
- */
- if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
- return 0;
-
- return 1;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 825b687..48fc28b 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -32,7 +32,6 @@ enum slb_index {
};
extern void slb_allocate_realmode(unsigned long ea);
-extern void slb_allocate_user(unsigned long ea);
static void slb_allocate(unsigned long ea)
{
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 736d18b..dfdb90c 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode)
* check for bad kernel/user address
* (ea & ~REGION_MASK) >= PGTABLE_RANGE
*/
- rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+ rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4)
bne- 8f
srdi r9,r3,60 /* get region */
@@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap:
* can be demoted from 64K -> 4K dynamically on some machines
*/
clrldi r11,r10,48
- cmpldi r11,(VMALLOC_SIZE >> 28) - 1
+ cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1
bgt 5f
lhz r11,PACAVMALLOCSLLP(r13)
b 6f
@@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
-#ifdef __DISABLED__
-
-/* void slb_allocate_user(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- * r3 = faulting address, r13 = PACA
- * r9, r10, r11 are clobbered by this function
- * No other registers are examined or changed.
- *
- * It is called with translation enabled in order to be able to walk the
- * page tables. This is not currently used.
- */
-_GLOBAL(slb_allocate_user)
- /* r3 = faulting address */
- srdi r10,r3,28 /* get esid */
-
- crset 4*cr7+lt /* set "user" flag for later */
-
- /* check if we fit in the range covered by the pagetables*/
- srdi. r9,r3,PGTABLE_EADDR_SIZE
- crnot 4*cr0+eq,4*cr0+eq
- beqlr
-
- /* now we need to get to the page tables in order to get the page
- * size encoding from the PMD. In the future, we'll be able to deal
- * with 1T segments too by getting the encoding from the PGD instead
- */
- ld r9,PACAPGDIR(r13)
- cmpldi cr0,r9,0
- beqlr
- rlwinm r11,r10,8,25,28
- ldx r9,r9,r11 /* get pgd_t */
- cmpldi cr0,r9,0
- beqlr
- rlwinm r11,r10,3,17,28
- ldx r9,r9,r11 /* get pmd_t */
- cmpldi cr0,r9,0
- beqlr
-
- /* build vsid flags */
- andi. r11,r9,SLB_VSID_LLP
- ori r11,r11,SLB_VSID_USER
-
- /* get context to calculate proto-VSID */
- ld r9,PACACONTEXTID(r13)
- /* fall through slb_finish_load */
-
-#endif /* __DISABLED__ */
-
-
/*
* Finish loading of an SLB entry and return
*
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 42954f0..2b27458 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -37,8 +37,8 @@
#include <asm/hugetlb.h>
/* some sanity checks */
-#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error PGTABLE_RANGE exceeds slice_mask high_slices size
+#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
+#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
#endif
static DEFINE_SPINLOCK(slice_convert_lock);
@@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Sanity checks */
BUG_ON(mm->task_size == 0);
+ VM_BUG_ON(radix_enabled());
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
@@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
unsigned char *hpsizes;
int index, mask_index;
+ /*
+ * Radix doesn't use slice, but can get enabled along with MMU_SLICE
+ */
+ if (radix_enabled()) {
+#ifdef CONFIG_PPC_64K_PAGES
+ return MMU_PAGE_64K;
+#else
+ return MMU_PAGE_4K;
+#endif
+ }
if (addr < SLICE_LOW_TOP) {
u64 lpsizes;
lpsizes = mm->context.low_slices_psize;
@@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
+ VM_BUG_ON(radix_enabled());
spin_lock_irqsave(&slice_convert_lock, flags);
old_psize = mm->context.user_psize;
@@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
{
struct slice_mask mask = slice_range_to_mask(start, len);
+ VM_BUG_ON(radix_enabled());
slice_convert(mm, mask, psize);
}
@@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
struct slice_mask mask, available;
unsigned int psize = mm->context.user_psize;
+ if (radix_enabled())
+ return 0;
+
mask = slice_range_to_mask(addr, len);
available = slice_mask_for_size(mm, psize);
#ifdef CONFIG_PPC_64K_PAGES
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
new file mode 100644
index 0000000..0fdaf93
--- /dev/null
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -0,0 +1,251 @@
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+
+static inline void __tlbiel_pid(unsigned long pid, int set)
+{
+ unsigned long rb,rs,ric,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* raidx format */
+ ric = 2; /* invalidate all the caches */
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
+ "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ asm volatile("ptesync": : :"memory");
+}
+
+/*
+ * We use 128 set in radix mode and 256 set in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid)
+{
+ int set;
+
+ for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+ __tlbiel_pid(pid, set);
+ }
+ return;
+}
+
+static inline void _tlbie_pid(unsigned long pid)
+{
+ unsigned long rb,rs,ric,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* raidx format */
+ ric = 2; /* invalidate all the caches */
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
+ "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ unsigned long rb,rs,ric,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* raidx format */
+ ric = 0; /* no cluster flush yet */
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
+ "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ asm volatile("ptesync": : :"memory");
+}
+
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ unsigned long rb,rs,ric,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* raidx format */
+ ric = 0; /* no cluster flush yet */
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
+ "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+/*
+ * Base TLB flushing operations:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * - local_* variants of page and mm only apply to the current
+ * processor
+ */
+void radix__local_flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbiel_pid(pid);
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_mm);
+
+void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ unsigned long ap, int nid)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm ? mm->context.id : 0;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbiel_va(vmaddr, pid, ap);
+ preempt_enable();
+}
+
+void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ /* need the return fix for nohash.c */
+ if (vma && is_vm_hugetlb_page(vma))
+ return __local_flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_ap(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_page);
+
+#ifdef CONFIG_SMP
+static int mm_is_core_local(struct mm_struct *mm)
+{
+ return cpumask_subset(mm_cpumask(mm),
+ topology_sibling_cpumask(smp_processor_id()));
+}
+
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto no_context;
+
+ if (!mm_is_core_local(mm)) {
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+ _tlbie_pid(pid);
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
+ } else
+ _tlbiel_pid(pid);
+no_context:
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__flush_tlb_mm);
+
+void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ unsigned long ap, int nid)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm ? mm->context.id : 0;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto bail;
+ if (!mm_is_core_local(mm)) {
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+ _tlbie_va(vmaddr, pid, ap);
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
+ } else
+ _tlbiel_va(vmaddr, pid, ap);
+bail:
+ preempt_enable();
+}
+
+void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (vma && is_vm_hugetlb_page(vma))
+ return flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_ap(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(radix__flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+ _tlbie_pid(0);
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. Because
+ * we use this in code path where we don' track the page size.
+ */
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+ struct mm_struct *mm = vma->vm_mm;
+ radix__flush_tlb_mm(mm);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+ struct mm_struct *mm = tlb->mm;
+ radix__flush_tlb_mm(mm);
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index f7b8039..4517aa4 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
batch->index = 0;
}
-void tlb_flush(struct mmu_gather *tlb)
+void hash__tlb_flush(struct mmu_gather *tlb)
{
struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
@@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
pte = pte_val(*ptep);
if (is_thp)
trace_hugepage_invalidate(start, pte);
- if (!(pte & _PAGE_HASHPTE))
+ if (!(pte & H_PAGE_HASHPTE))
continue;
if (unlikely(is_thp))
hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
@@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
start_pte = pte_offset_map(pmd, addr);
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
- if (pteval & _PAGE_HASHPTE)
+ if (pteval & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index f9c083a..77b6394 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -1,6 +1,6 @@
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-obj-$(CONFIG_PERF_EVENTS) += callchain.o
+obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o
obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o
obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 22d9015..26d37e6 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -137,7 +137,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
offset = addr & ((1UL << shift) - 1);
pte = READ_ONCE(*ptep);
- if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+ if (!pte_present(pte) || !pte_user(pte))
goto err_out;
pfn = pte_pfn(pte);
if (!page_is_ram(pfn))
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
new file mode 100644
index 0000000..d24a8a3
--- /dev/null
+++ b/arch/powerpc/perf/perf_regs.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2016 Anju T, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/ptrace.h>
+#include <asm/perf_regs.h>
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+
+static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R1, gpr[1]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R2, gpr[2]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R3, gpr[3]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R4, gpr[4]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R5, gpr[5]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R6, gpr[6]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R7, gpr[7]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R8, gpr[8]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R9, gpr[9]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr),
+#ifdef CONFIG_PPC64
+ PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe),
+#else
+ PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq),
+#endif
+ PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar),
+ PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr),
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+#ifdef CONFIG_PPC64
+ if (!test_tsk_thread_flag(task, TIF_32BIT))
+ return PERF_SAMPLE_REGS_ABI_64;
+ else
+#endif
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
index 741b77e..3a2e6e8 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -49,3 +49,43 @@ EVENT(PM_L3_PREF_ALL, 0x4e052)
EVENT(PM_DTLB_MISS, 0x300fc)
/* ITLB Reloaded */
EVENT(PM_ITLB_MISS, 0x400fc)
+/* Run_Instructions */
+EVENT(PM_RUN_INST_CMPL, 0x500fa)
+/* Alternate event code for PM_RUN_INST_CMPL */
+EVENT(PM_RUN_INST_CMPL_ALT, 0x400fa)
+/* Run_cycles */
+EVENT(PM_RUN_CYC, 0x600f4)
+/* Alternate event code for Run_cycles */
+EVENT(PM_RUN_CYC_ALT, 0x200f4)
+/* Marked store completed */
+EVENT(PM_MRK_ST_CMPL, 0x10134)
+/* Alternate event code for Marked store completed */
+EVENT(PM_MRK_ST_CMPL_ALT, 0x301e2)
+/* Marked two path branch */
+EVENT(PM_BR_MRK_2PATH, 0x10138)
+/* Alternate event code for PM_BR_MRK_2PATH */
+EVENT(PM_BR_MRK_2PATH_ALT, 0x40138)
+/* L3 castouts in Mepf state */
+EVENT(PM_L3_CO_MEPF, 0x18082)
+/* Alternate event code for PM_L3_CO_MEPF */
+EVENT(PM_L3_CO_MEPF_ALT, 0x3e05e)
+/* Data cache was reloaded from a location other than L2 due to a marked load */
+EVENT(PM_MRK_DATA_FROM_L2MISS, 0x1d14e)
+/* Alternate event code for PM_MRK_DATA_FROM_L2MISS */
+EVENT(PM_MRK_DATA_FROM_L2MISS_ALT, 0x401e8)
+/* Alternate event code for PM_CMPLU_STALL */
+EVENT(PM_CMPLU_STALL_ALT, 0x1e054)
+/* Two path branch */
+EVENT(PM_BR_2PATH, 0x20036)
+/* Alternate event code for PM_BR_2PATH */
+EVENT(PM_BR_2PATH_ALT, 0x40036)
+/* # PPC Dispatched */
+EVENT(PM_INST_DISP, 0x200f2)
+/* Alternate event code for PM_INST_DISP */
+EVENT(PM_INST_DISP_ALT, 0x300f2)
+/* Marked filter Match */
+EVENT(PM_MRK_FILT_MATCH, 0x2013c)
+/* Alternate event code for PM_MRK_FILT_MATCH */
+EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e)
+/* Alternate event code for PM_LD_MISS_L1 */
+EVENT(PM_LD_MISS_L1_ALT, 0x400f0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 690d918..7cf3b43 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -274,7 +274,8 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
/* Ignore Linux defined bits when checking event below */
base_event = event & ~EVENT_LINUX_MASK;
- if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4)
+ if (pmc >= 5 && base_event != PM_RUN_INST_CMPL &&
+ base_event != PM_RUN_CYC)
return -1;
mask |= CNST_PMC_MASK(pmc);
@@ -488,17 +489,17 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
/* Table of alternatives, sorted by column 0 */
static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */
- { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */
- { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */
- { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */
- { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */
- { 0x20036, 0x40036 }, /* PM_BR_2PATH */
- { 0x200f2, 0x300f2 }, /* PM_INST_DISP */
- { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */
- { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */
- { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */
- { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */
+ { PM_MRK_ST_CMPL, PM_MRK_ST_CMPL_ALT },
+ { PM_BR_MRK_2PATH, PM_BR_MRK_2PATH_ALT },
+ { PM_L3_CO_MEPF, PM_L3_CO_MEPF_ALT },
+ { PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L2MISS_ALT },
+ { PM_CMPLU_STALL_ALT, PM_CMPLU_STALL },
+ { PM_BR_2PATH, PM_BR_2PATH_ALT },
+ { PM_INST_DISP, PM_INST_DISP_ALT },
+ { PM_RUN_CYC_ALT, PM_RUN_CYC },
+ { PM_MRK_FILT_MATCH, PM_MRK_FILT_MATCH_ALT },
+ { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT },
+ { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL },
};
/*
@@ -546,17 +547,17 @@ static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[])
j = num_alt;
for (i = 0; i < num_alt; ++i) {
switch (alt[i]) {
- case 0x1e: /* PM_CYC */
- alt[j++] = 0x600f4; /* PM_RUN_CYC */
+ case PM_CYC:
+ alt[j++] = PM_RUN_CYC;
break;
- case 0x600f4: /* PM_RUN_CYC */
- alt[j++] = 0x1e;
+ case PM_RUN_CYC:
+ alt[j++] = PM_CYC;
break;
- case 0x2: /* PM_PPC_CMPL */
- alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */
+ case PM_INST_CMPL:
+ alt[j++] = PM_RUN_INST_CMPL;
break;
- case 0x500fa: /* PM_RUN_INST_CMPL */
- alt[j++] = 0x2; /* PM_PPC_CMPL */
+ case PM_RUN_INST_CMPL:
+ alt[j++] = PM_INST_CMPL;
break;
}
}
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 142dff5..77e9b8d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,7 +72,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
- select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select ARCH_SUPPORTS_NUMA_BALANCING
select IRQ_WORK
@@ -331,6 +331,15 @@ config PPC_STD_MMU_64
def_bool y
depends on PPC_STD_MMU && PPC64
+config PPC_RADIX_MMU
+ bool "Radix MMU Support"
+ depends on PPC_BOOK3S_64
+ default y
+ help
+ Enable support for the Power ISA 3.0 Radix style MMU. Currently this
+ is only implemented by IBM Power9 CPUs, if you don't have one of them
+ you can probably disable this.
+
config PPC_MMU_NOHASH
def_bool y
depends on !PPC_STD_MMU
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index f7af74f..3cbe38f 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -24,7 +24,7 @@
#include <linux/interrupt.h>
#include <linux/list.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/wait.h>
@@ -197,7 +197,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr)
(REGION_ID(ea) != USER_REGION_ID)) {
spin_unlock(&spu->register_lock);
- ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr);
+ ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr);
spin_lock(&spu->register_lock);
if (!ret) {
@@ -805,7 +805,4 @@ static int __init init_spu_base(void)
out:
return ret;
}
-module_init(init_spu_base);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>");
+device_initcall(init_spu_base);
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index d98f845..e29e4d5 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -141,8 +141,8 @@ int spufs_handle_class1(struct spu_context *ctx)
/* we must not hold the lock when entering copro_handle_mm_fault */
spu_release(ctx);
- access = (_PAGE_PRESENT | _PAGE_USER);
- access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL;
+ access = (_PAGE_PRESENT | _PAGE_READ);
+ access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL;
local_irq_save(flags);
ret = hash_page(ea, access, 0x300, dsisr);
local_irq_restore(flags);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 950b3e5..9226df1 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -75,7 +75,7 @@ static int pnv_eeh_init(void)
* and P7IOC separately. So we should regard
* PE#0 as valid for PHB3 and P7IOC.
*/
- if (phb->ioda.reserved_pe != 0)
+ if (phb->ioda.reserved_pe_idx != 0)
eeh_add_flag(EEH_VALID_PE_ZERO);
break;
@@ -1009,8 +1009,9 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
static int pnv_eeh_reset(struct eeh_pe *pe, int option)
{
struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb;
struct pci_bus *bus;
- int ret;
+ int64_t rc;
/*
* For PHB reset, we always have complete reset. For those PEs whose
@@ -1026,45 +1027,39 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
* reset. The side effect is that EEH core has to clear the frozen
* state explicitly after BAR restore.
*/
- if (pe->type & EEH_PE_PHB) {
- ret = pnv_eeh_phb_reset(hose, option);
- } else {
- struct pnv_phb *phb;
- s64 rc;
+ if (pe->type & EEH_PE_PHB)
+ return pnv_eeh_phb_reset(hose, option);
- /*
- * The frozen PE might be caused by PAPR error injection
- * registers, which are expected to be cleared after hitting
- * frozen PE as stated in the hardware spec. Unfortunately,
- * that's not true on P7IOC. So we have to clear it manually
- * to avoid recursive EEH errors during recovery.
- */
- phb = hose->private_data;
- if (phb->model == PNV_PHB_MODEL_P7IOC &&
- (option == EEH_RESET_HOT ||
- option == EEH_RESET_FUNDAMENTAL)) {
- rc = opal_pci_reset(phb->opal_id,
- OPAL_RESET_PHB_ERROR,
- OPAL_ASSERT_RESET);
- if (rc != OPAL_SUCCESS) {
- pr_warn("%s: Failure %lld clearing "
- "error injection registers\n",
- __func__, rc);
- return -EIO;
- }
+ /*
+ * The frozen PE might be caused by PAPR error injection
+ * registers, which are expected to be cleared after hitting
+ * frozen PE as stated in the hardware spec. Unfortunately,
+ * that's not true on P7IOC. So we have to clear it manually
+ * to avoid recursive EEH errors during recovery.
+ */
+ phb = hose->private_data;
+ if (phb->model == PNV_PHB_MODEL_P7IOC &&
+ (option == EEH_RESET_HOT ||
+ option == EEH_RESET_FUNDAMENTAL)) {
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PHB_ERROR,
+ OPAL_ASSERT_RESET);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld clearing error injection registers\n",
+ __func__, rc);
+ return -EIO;
}
-
- bus = eeh_pe_bus_get(pe);
- if (pe->type & EEH_PE_VF)
- ret = pnv_eeh_reset_vf_pe(pe, option);
- else if (pci_is_root_bus(bus) ||
- pci_is_root_bus(bus->parent))
- ret = pnv_eeh_root_reset(hose, option);
- else
- ret = pnv_eeh_bridge_reset(bus->self, option);
}
- return ret;
+ bus = eeh_pe_bus_get(pe);
+ if (pe->type & EEH_PE_VF)
+ return pnv_eeh_reset_vf_pe(pe, option);
+
+ if (pci_is_root_bus(bus) ||
+ pci_is_root_bus(bus->parent))
+ return pnv_eeh_root_reset(hose, option);
+
+ return pnv_eeh_bridge_reset(bus->self, option);
}
/**
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 7229acd..0459e10 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
+#include <linux/iommu.h>
#include <asm/iommu.h>
#include <asm/pnv-pci.h>
@@ -25,8 +26,6 @@
* Other types of TCE cache invalidation are not functional in the
* hardware.
*/
-#define TCE_KILL_INVAL_ALL PPC_BIT(0)
-
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
return PCI_DN(dn)->pcidev;
@@ -138,22 +137,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
struct pnv_ioda_pe *pe;
struct pci_dn *pdn;
- if (npe->flags & PNV_IODA_PE_PEER) {
- pe = npe->peers[0];
- pdev = pe->pdev;
- } else {
- pdev = pnv_pci_get_gpu_dev(npe->pdev);
- if (!pdev)
- return NULL;
+ pdev = pnv_pci_get_gpu_dev(npe->pdev);
+ if (!pdev)
+ return NULL;
- pdn = pci_get_pdn(pdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return NULL;
+ pdn = pci_get_pdn(pdev);
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return NULL;
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
- pe = &phb->ioda.pe_array[pdn->pe_number];
- }
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+ pe = &phb->ioda.pe_array[pdn->pe_number];
if (gpdev)
*gpdev = pdev;
@@ -161,92 +155,70 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
return pe;
}
-void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe)
+long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+ struct iommu_table *tbl)
{
struct pnv_phb *phb = npe->phb;
+ int64_t rc;
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+ const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+ const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+ pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
+ start_addr, start_addr + win_size - 1,
+ IOMMU_PAGE_SIZE(tbl));
+
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ npe->pe_number,
+ npe->pe_number,
+ tbl->it_indirect_levels + 1,
+ __pa(tbl->it_base),
+ size << 3,
+ IOMMU_PAGE_SIZE(tbl));
+ if (rc) {
+ pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
+ return rc;
+ }
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
- if (WARN_ON(phb->type != PNV_PHB_NPU ||
- !phb->ioda.tce_inval_reg ||
- !(npe->flags & PNV_IODA_PE_DEV)))
- return;
+ /* Add the table to the list so its TCE cache will get invalidated */
+ pnv_pci_link_table_and_group(phb->hose->node, num,
+ tbl, &npe->table_group);
- mb(); /* Ensure previous TCE table stores are visible */
- __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL),
- phb->ioda.tce_inval_reg);
+ return 0;
}
-void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
- struct iommu_table *tbl,
- unsigned long index,
- unsigned long npages,
- bool rm)
+long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
{
struct pnv_phb *phb = npe->phb;
+ int64_t rc;
- /* We can only invalidate the whole cache on NPU */
- unsigned long val = TCE_KILL_INVAL_ALL;
-
- if (WARN_ON(phb->type != PNV_PHB_NPU ||
- !phb->ioda.tce_inval_reg ||
- !(npe->flags & PNV_IODA_PE_DEV)))
- return;
-
- mb(); /* Ensure previous TCE table stores are visible */
- if (rm)
- __raw_rm_writeq(cpu_to_be64(val),
- (__be64 __iomem *) phb->ioda.tce_inval_reg_phys);
- else
- __raw_writeq(cpu_to_be64(val),
- phb->ioda.tce_inval_reg);
-}
-
-void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe)
-{
- struct pnv_ioda_pe *gpe;
- struct pci_dev *gpdev;
- int i, avail = -1;
-
- if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
- return;
-
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return;
-
- for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
- /* Nothing to do if the PE is already connected. */
- if (gpe->peers[i] == npe)
- return;
+ pe_info(npe, "Removing DMA window\n");
- if (!gpe->peers[i])
- avail = i;
+ rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+ npe->pe_number,
+ 0/* levels */, 0/* table address */,
+ 0/* table size */, 0/* page size */);
+ if (rc) {
+ pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
+ return rc;
}
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
- if (WARN_ON(avail < 0))
- return;
-
- gpe->peers[avail] = npe;
- gpe->flags |= PNV_IODA_PE_PEER;
+ pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
+ &npe->table_group);
- /*
- * We assume that the NPU devices only have a single peer PE
- * (the GPU PCIe device PE).
- */
- npe->peers[0] = gpe;
- npe->flags |= PNV_IODA_PE_PEER;
+ return 0;
}
/*
- * For the NPU we want to point the TCE table at the same table as the
- * real PCI device.
+ * Enables 32 bit DMA on NPU.
*/
-static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
+static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
{
- struct pnv_phb *phb = npe->phb;
struct pci_dev *gpdev;
struct pnv_ioda_pe *gpe;
- void *addr;
- unsigned int size;
int64_t rc;
/*
@@ -260,14 +232,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
if (!gpe)
return;
- addr = (void *)gpe->table_group.tables[0]->it_base;
- size = gpe->table_group.tables[0]->it_size << 3;
- rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
- npe->pe_number, 1, __pa(addr),
- size, 0x1000);
- if (rc != OPAL_SUCCESS)
- pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n",
- __func__, rc, phb->hose->global_number, npe->pe_number);
+ rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
/*
* We don't initialise npu_pe->tce32_table as we always use
@@ -277,72 +242,120 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
}
/*
- * Enable/disable bypass mode on the NPU. The NPU only supports one
+ * Enables bypass mode on the NPU. The NPU only supports one
* window per link, so bypass needs to be explicitly enabled or
* disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
* active at the same time.
*/
-int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable)
+static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
{
struct pnv_phb *phb = npe->phb;
int64_t rc = 0;
+ phys_addr_t top = memblock_end_of_DRAM();
if (phb->type != PNV_PHB_NPU || !npe->pdev)
return -EINVAL;
- if (enable) {
- /* Enable the bypass window */
- phys_addr_t top = memblock_end_of_DRAM();
-
- npe->tce_bypass_base = 0;
- top = roundup_pow_of_two(top);
- dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
- npe->pe_number);
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- npe->tce_bypass_base, top);
- } else {
- /*
- * Disable the bypass window by replacing it with the
- * TCE32 window.
- */
- pnv_npu_disable_bypass(npe);
- }
+ rc = pnv_npu_unset_window(npe, 0);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ /* Enable the bypass window */
+
+ top = roundup_pow_of_two(top);
+ dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
+ npe->pe_number);
+ rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+ npe->pe_number, npe->pe_number,
+ 0 /* bypass base */, top);
+
+ if (rc == OPAL_SUCCESS)
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
return rc;
}
-int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
{
- struct pci_controller *hose = pci_bus_to_host(npdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct pci_dn *pdn = pci_get_pdn(npdev);
- struct pnv_ioda_pe *npe, *gpe;
- struct pci_dev *gpdev;
- uint64_t top;
- bool bypass = false;
+ int i;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ struct pnv_ioda_pe *npe;
+ struct pci_dev *npdev;
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return -ENXIO;
+ for (i = 0; ; ++i) {
+ npdev = pnv_pci_get_npu_dev(gpdev, i);
- /* We only do bypass if it's enabled on the linked device */
- npe = &phb->ioda.pe_array[pdn->pe_number];
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return -ENODEV;
+ if (!npdev)
+ break;
+
+ pdn = pci_get_pdn(npdev);
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return;
+
+ phb = pci_bus_to_host(npdev->bus)->private_data;
+
+ /* We only do bypass if it's enabled on the linked device */
+ npe = &phb->ioda.pe_array[pdn->pe_number];
+
+ if (bypass) {
+ dev_info(&npdev->dev,
+ "Using 64-bit DMA iommu bypass\n");
+ pnv_npu_dma_set_bypass(npe);
+ } else {
+ dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+ pnv_npu_dma_set_32(npe);
+ }
+ }
+}
- if (gpe->tce_bypass_enabled) {
- top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1;
- bypass = (dma_mask >= top);
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+{
+ struct pnv_phb *phb = npe->phb;
+ int64_t rc;
+
+ /*
+ * Note: NPU has just a single TVE in the hardware which means that
+ * while used by the kernel, it can have either 32bit window or
+ * DMA bypass but never both. So we deconfigure 32bit window only
+ * if it was enabled at the moment of ownership change.
+ */
+ if (npe->table_group.tables[0]) {
+ pnv_npu_unset_window(npe, 0);
+ return;
}
- if (bypass)
- dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n");
- else
- dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+ /* Disable bypass */
+ rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+ npe->pe_number, npe->pe_number,
+ 0 /* bypass base */, 0);
+ if (rc) {
+ pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+ return;
+ }
+ pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+}
- pnv_npu_dma_set_bypass(npe, bypass);
- *npdev->dev.dma_mask = dma_mask;
+struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+{
+ struct pnv_phb *phb = npe->phb;
+ struct pci_bus *pbus = phb->hose->bus;
+ struct pci_dev *npdev, *gpdev = NULL, *gptmp;
+ struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- return 0;
+ if (!gpe || !gpdev)
+ return NULL;
+
+ list_for_each_entry(npdev, &pbus->devices, bus_list) {
+ gptmp = pnv_pci_get_gpu_dev(npdev);
+
+ if (gptmp != gpdev)
+ continue;
+
+ pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
+ iommu_group_add_device(gpe->table_group.group, &npdev->dev);
+ }
+
+ return gpe;
}
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index d000f4e..c0a8201 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -150,15 +150,17 @@ static void print_nx_checkstop_reason(const char *level,
static void print_checkstop_reason(const char *level,
struct OpalHMIEvent *hmi_evt)
{
- switch (hmi_evt->u.xstop_error.xstop_type) {
+ uint8_t type = hmi_evt->u.xstop_error.xstop_type;
+ switch (type) {
case CHECKSTOP_TYPE_CORE:
print_core_checkstop_reason(level, hmi_evt);
break;
case CHECKSTOP_TYPE_NX:
print_nx_checkstop_reason(level, hmi_evt);
break;
- case CHECKSTOP_TYPE_UNKNOWN:
- printk("%s Unknown Malfunction Alert.\n", level);
+ default:
+ printk("%s Unknown Malfunction Alert of type %d\n",
+ level, type);
break;
}
}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c5baaf3..3a5ea82 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -48,15 +48,16 @@
#include "powernv.h"
#include "pci.h"
-/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */
+#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */
+#define PNV_IODA1_DMA32_SEGSIZE 0x10000000
#define POWERNV_IOMMU_DEFAULT_LEVELS 1
#define POWERNV_IOMMU_MAX_LEVELS 5
static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
-static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
{
struct va_format vaf;
@@ -87,13 +88,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
va_end(args);
}
-#define pe_err(pe, fmt, ...) \
- pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
-#define pe_warn(pe, fmt, ...) \
- pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
-#define pe_info(pe, fmt, ...) \
- pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
-
static bool pnv_iommu_bypass_disabled __read_mostly;
static int __init iommu_setup(char *str)
@@ -122,9 +116,17 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
}
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
+{
+ phb->ioda.pe_array[pe_no].phb = phb;
+ phb->ioda.pe_array[pe_no].pe_number = pe_no;
+
+ return &phb->ioda.pe_array[pe_no];
+}
+
static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
- if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+ if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
pr_warn("%s: Invalid PE %d on PHB#%x\n",
__func__, pe_no, phb->hose->global_number);
return;
@@ -134,32 +136,31 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
pr_debug("%s: PE %d was reserved on PHB#%x\n",
__func__, pe_no, phb->hose->global_number);
- phb->ioda.pe_array[pe_no].phb = phb;
- phb->ioda.pe_array[pe_no].pe_number = pe_no;
+ pnv_ioda_init_pe(phb, pe_no);
}
-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
{
unsigned long pe;
do {
pe = find_next_zero_bit(phb->ioda.pe_alloc,
- phb->ioda.total_pe, 0);
- if (pe >= phb->ioda.total_pe)
- return IODA_INVALID_PE;
+ phb->ioda.total_pe_num, 0);
+ if (pe >= phb->ioda.total_pe_num)
+ return NULL;
} while(test_and_set_bit(pe, phb->ioda.pe_alloc));
- phb->ioda.pe_array[pe].phb = phb;
- phb->ioda.pe_array[pe].pe_number = pe;
- return pe;
+ return pnv_ioda_init_pe(phb, pe);
}
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
- WARN_ON(phb->ioda.pe_array[pe].pdev);
+ struct pnv_phb *phb = pe->phb;
+
+ WARN_ON(pe->pdev);
- memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
- clear_bit(pe, phb->ioda.pe_alloc);
+ memset(pe, 0, sizeof(struct pnv_ioda_pe));
+ clear_bit(pe->pe_number, phb->ioda.pe_alloc);
}
/* The default M64 BAR is shared by all PEs */
@@ -199,13 +200,13 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
* expected to be 0 or last one of PE capabicity.
*/
r = &phb->hose->mem_resources[1];
- if (phb->ioda.reserved_pe == 0)
+ if (phb->ioda.reserved_pe_idx == 0)
r->start += phb->ioda.m64_segsize;
- else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
+ else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
r->end -= phb->ioda.m64_segsize;
else
pr_warn(" Cannot strip M64 segment for reserved PE#%d\n",
- phb->ioda.reserved_pe);
+ phb->ioda.reserved_pe_idx);
return 0;
@@ -219,7 +220,7 @@ fail:
return -EIO;
}
-static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
unsigned long *pe_bitmap)
{
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -246,22 +247,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
}
}
-static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus,
- unsigned long *pe_bitmap,
- bool all)
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+ struct resource *r;
+ int index;
+
+ /*
+ * There are 16 M64 BARs, each of which has 8 segments. So
+ * there are as many M64 segments as the maximum number of
+ * PEs, which is 128.
+ */
+ for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+ unsigned long base, segsz = phb->ioda.m64_segsize;
+ int64_t rc;
+
+ base = phb->ioda.m64_base +
+ index * PNV_IODA1_M64_SEGS * segsz;
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index, base, 0,
+ PNV_IODA1_M64_SEGS * segsz);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn(" Error %lld setting M64 PHB#%d-BAR#%d\n",
+ rc, phb->hose->global_number, index);
+ goto fail;
+ }
+
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index,
+ OPAL_ENABLE_M64_SPLIT);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn(" Error %lld enabling M64 PHB#%d-BAR#%d\n",
+ rc, phb->hose->global_number, index);
+ goto fail;
+ }
+ }
+
+ /*
+ * Exclude the segment used by the reserved PE, which
+ * is expected to be 0 or last supported PE#.
+ */
+ r = &phb->hose->mem_resources[1];
+ if (phb->ioda.reserved_pe_idx == 0)
+ r->start += phb->ioda.m64_segsize;
+ else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+ r->end -= phb->ioda.m64_segsize;
+ else
+ WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
+ phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+ return 0;
+
+fail:
+ for ( ; index >= 0; index--)
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+ return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+ unsigned long *pe_bitmap,
+ bool all)
{
struct pci_dev *pdev;
list_for_each_entry(pdev, &bus->devices, bus_list) {
- pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap);
+ pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
if (all && pdev->subordinate)
- pnv_ioda2_reserve_m64_pe(pdev->subordinate,
- pe_bitmap, all);
+ pnv_ioda_reserve_m64_pe(pdev->subordinate,
+ pe_bitmap, all);
}
}
-static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
struct pci_controller *hose = pci_bus_to_host(bus);
struct pnv_phb *phb = hose->private_data;
@@ -271,28 +330,28 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
/* Root bus shouldn't use M64 */
if (pci_is_root_bus(bus))
- return IODA_INVALID_PE;
+ return NULL;
/* Allocate bitmap */
- size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+ size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
pe_alloc = kzalloc(size, GFP_KERNEL);
if (!pe_alloc) {
pr_warn("%s: Out of memory !\n",
__func__);
- return IODA_INVALID_PE;
+ return NULL;
}
/* Figure out reserved PE numbers by the PE */
- pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all);
+ pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
/*
* the current bus might not own M64 window and that's all
* contributed by its child buses. For the case, we needn't
* pick M64 dependent PE#.
*/
- if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
+ if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
kfree(pe_alloc);
- return IODA_INVALID_PE;
+ return NULL;
}
/*
@@ -301,10 +360,11 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
*/
master_pe = NULL;
i = -1;
- while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
- phb->ioda.total_pe) {
+ while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
+ phb->ioda.total_pe_num) {
pe = &phb->ioda.pe_array[i];
+ phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
if (!master_pe) {
pe->flags |= PNV_IODA_PE_MASTER;
INIT_LIST_HEAD(&pe->slaves);
@@ -314,10 +374,30 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
pe->master = master_pe;
list_add_tail(&pe->list, &master_pe->slaves);
}
+
+ /*
+ * P7IOC supports M64DT, which helps mapping M64 segment
+ * to one particular PE#. However, PHB3 has fixed mapping
+ * between M64 segment and PE#. In order to have same logic
+ * for P7IOC and PHB3, we enforce fixed mapping between M64
+ * segment and PE# on P7IOC.
+ */
+ if (phb->type == PNV_PHB_IODA1) {
+ int64_t rc;
+
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_M64_WINDOW_TYPE,
+ pe->pe_number / PNV_IODA1_M64_SEGS,
+ pe->pe_number % PNV_IODA1_M64_SEGS);
+ if (rc != OPAL_SUCCESS)
+ pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
+ __func__, rc, phb->hose->global_number,
+ pe->pe_number);
+ }
}
kfree(pe_alloc);
- return master_pe->pe_number;
+ return master_pe;
}
static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
@@ -328,8 +408,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
const u32 *r;
u64 pci_addr;
- /* FIXME: Support M64 for P7IOC */
- if (phb->type != PNV_PHB_IODA2) {
+ if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
pr_info(" Not support M64 window\n");
return;
}
@@ -355,7 +434,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
hose->mem_offset[1] = res->start - pci_addr;
phb->ioda.m64_size = resource_size(res);
- phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
+ phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
phb->ioda.m64_base = pci_addr;
pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
@@ -363,9 +442,12 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
/* Use last M64 BAR to cover M64 window */
phb->ioda.m64_bar_idx = 15;
- phb->init_m64 = pnv_ioda2_init_m64;
- phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
- phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
+ if (phb->type == PNV_PHB_IODA1)
+ phb->init_m64 = pnv_ioda1_init_m64;
+ else
+ phb->init_m64 = pnv_ioda2_init_m64;
+ phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
+ phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
}
static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -456,7 +538,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
s64 rc;
/* Sanity check on PE number */
- if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
+ if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
return OPAL_EEH_STOPPED_PERM_UNAVAIL;
/*
@@ -808,44 +890,6 @@ out:
return 0;
}
-static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe)
-{
- struct pnv_ioda_pe *lpe;
-
- list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
- if (lpe->dma_weight < pe->dma_weight) {
- list_add_tail(&pe->dma_link, &lpe->dma_link);
- return;
- }
- }
- list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
-}
-
-static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
-{
- /* This is quite simplistic. The "base" weight of a device
- * is 10. 0 means no DMA is to be accounted for it.
- */
-
- /* If it's a bridge, no DMA */
- if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
- return 0;
-
- /* Reduce the weight of slow USB controllers */
- if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_EHCI)
- return 3;
-
- /* Increase the weight of RAID (includes Obsidian) */
- if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
- return 15;
-
- /* Default */
- return 10;
-}
-
#ifdef CONFIG_PCI_IOV
static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
{
@@ -919,7 +963,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
struct pnv_phb *phb = hose->private_data;
struct pci_dn *pdn = pci_get_pdn(dev);
struct pnv_ioda_pe *pe;
- int pe_num;
if (!pdn) {
pr_err("%s: Device tree node not associated properly\n",
@@ -929,8 +972,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
if (pdn->pe_number != IODA_INVALID_PE)
return NULL;
- pe_num = pnv_ioda_alloc_pe(phb);
- if (pe_num == IODA_INVALID_PE) {
+ pe = pnv_ioda_alloc_pe(phb);
+ if (!pe) {
pr_warning("%s: Not enough PE# available, disabling device\n",
pci_name(dev));
return NULL;
@@ -943,14 +986,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
*
* At some point we want to remove the PDN completely anyways
*/
- pe = &phb->ioda.pe_array[pe_num];
pci_dev_get(dev);
pdn->pcidev = dev;
- pdn->pe_number = pe_num;
+ pdn->pe_number = pe->pe_number;
pe->flags = PNV_IODA_PE_DEV;
pe->pdev = dev;
pe->pbus = NULL;
- pe->tce32_seg = -1;
pe->mve_number = -1;
pe->rid = dev->bus->number << 8 | pdn->devfn;
@@ -958,23 +999,15 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
if (pnv_ioda_configure_pe(phb, pe)) {
/* XXX What do we do here ? */
- if (pe_num)
- pnv_ioda_free_pe(phb, pe_num);
+ pnv_ioda_free_pe(pe);
pdn->pe_number = IODA_INVALID_PE;
pe->pdev = NULL;
pci_dev_put(dev);
return NULL;
}
- /* Assign a DMA weight to the device */
- pe->dma_weight = pnv_ioda_dma_weight(dev);
- if (pe->dma_weight != 0) {
- phb->ioda.dma_weight += pe->dma_weight;
- phb->ioda.dma_pe_count++;
- }
-
- /* Link the PE */
- pnv_ioda_link_pe_by_weight(phb, pe);
+ /* Put PE to the list */
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
return pe;
}
@@ -993,7 +1026,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
}
pdn->pcidev = dev;
pdn->pe_number = pe->pe_number;
- pe->dma_weight += pnv_ioda_dma_weight(dev);
if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
pnv_ioda_setup_same_PE(dev->subordinate, pe);
}
@@ -1005,49 +1037,44 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
* subordinate PCI devices and buses. The second type of PE is normally
* orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
*/
-static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
struct pci_controller *hose = pci_bus_to_host(bus);
struct pnv_phb *phb = hose->private_data;
- struct pnv_ioda_pe *pe;
- int pe_num = IODA_INVALID_PE;
+ struct pnv_ioda_pe *pe = NULL;
/* Check if PE is determined by M64 */
if (phb->pick_m64_pe)
- pe_num = phb->pick_m64_pe(bus, all);
+ pe = phb->pick_m64_pe(bus, all);
/* The PE number isn't pinned by M64 */
- if (pe_num == IODA_INVALID_PE)
- pe_num = pnv_ioda_alloc_pe(phb);
+ if (!pe)
+ pe = pnv_ioda_alloc_pe(phb);
- if (pe_num == IODA_INVALID_PE) {
+ if (!pe) {
pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
__func__, pci_domain_nr(bus), bus->number);
- return;
+ return NULL;
}
- pe = &phb->ioda.pe_array[pe_num];
pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
pe->pbus = bus;
pe->pdev = NULL;
- pe->tce32_seg = -1;
pe->mve_number = -1;
pe->rid = bus->busn_res.start << 8;
- pe->dma_weight = 0;
if (all)
pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
- bus->busn_res.start, bus->busn_res.end, pe_num);
+ bus->busn_res.start, bus->busn_res.end, pe->pe_number);
else
pe_info(pe, "Secondary bus %d associated with PE#%d\n",
- bus->busn_res.start, pe_num);
+ bus->busn_res.start, pe->pe_number);
if (pnv_ioda_configure_pe(phb, pe)) {
/* XXX What do we do here ? */
- if (pe_num)
- pnv_ioda_free_pe(phb, pe_num);
+ pnv_ioda_free_pe(pe);
pe->pbus = NULL;
- return;
+ return NULL;
}
/* Associate it with all child devices */
@@ -1056,16 +1083,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
/* Put PE to the list */
list_add_tail(&pe->list, &phb->ioda.pe_list);
- /* Account for one DMA PE if at least one DMA capable device exist
- * below the bridge
- */
- if (pe->dma_weight != 0) {
- phb->ioda.dma_weight += pe->dma_weight;
- phb->ioda.dma_pe_count++;
- }
-
- /* Link the PE */
- pnv_ioda_link_pe_by_weight(phb, pe);
+ return pe;
}
static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
@@ -1088,7 +1106,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
* same GPU get assigned the same PE.
*/
gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
- for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) {
+ for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
pe = &phb->ioda.pe_array[pe_num];
if (!pe->pdev)
continue;
@@ -1106,7 +1124,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
npu_pdn->pcidev = npu_pdev;
npu_pdn->pe_number = pe_num;
- pe->dma_weight += pnv_ioda_dma_weight(npu_pdev);
phb->ioda.pe_rmap[rid] = pe->pe_number;
/* Map the PE to this link */
@@ -1378,7 +1395,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
pnv_ioda_deconfigure_pe(phb, pe);
- pnv_ioda_free_pe(phb, pe->pe_number);
+ pnv_ioda_free_pe(pe);
}
}
@@ -1387,6 +1404,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
struct pci_bus *bus;
struct pci_controller *hose;
struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
struct pci_dn *pdn;
struct pci_sriov *iov;
u16 num_vfs, i;
@@ -1411,8 +1429,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
/* Release PE numbers */
if (pdn->m64_single_mode) {
for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] != IODA_INVALID_PE)
- pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+ if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+ continue;
+
+ pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+ pnv_ioda_free_pe(pe);
}
} else
bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1454,7 +1475,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
pe->flags = PNV_IODA_PE_VF;
pe->pbus = NULL;
pe->parent_dev = pdev;
- pe->tce32_seg = -1;
pe->mve_number = -1;
pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
pci_iov_virtfn_devfn(pdev, vf_index);
@@ -1466,8 +1486,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
if (pnv_ioda_configure_pe(phb, pe)) {
/* XXX What do we do here ? */
- if (pe_num)
- pnv_ioda_free_pe(phb, pe_num);
+ pnv_ioda_free_pe(pe);
pe->pdev = NULL;
continue;
}
@@ -1486,6 +1505,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
struct pci_bus *bus;
struct pci_controller *hose;
struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
struct pci_dn *pdn;
int ret;
u16 i;
@@ -1528,18 +1548,20 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
/* Calculate available PE for required VFs */
if (pdn->m64_single_mode) {
for (i = 0; i < num_vfs; i++) {
- pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
- if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
+ pe = pnv_ioda_alloc_pe(phb);
+ if (!pe) {
ret = -EBUSY;
goto m64_failed;
}
+
+ pdn->pe_num_map[i] = pe->pe_number;
}
} else {
mutex_lock(&phb->ioda.pe_alloc_mutex);
*pdn->pe_num_map = bitmap_find_next_zero_area(
- phb->ioda.pe_alloc, phb->ioda.total_pe,
+ phb->ioda.pe_alloc, phb->ioda.total_pe_num,
0, num_vfs, 0);
- if (*pdn->pe_num_map >= phb->ioda.total_pe) {
+ if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
mutex_unlock(&phb->ioda.pe_alloc_mutex);
dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
kfree(pdn->pe_num_map);
@@ -1577,8 +1599,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
m64_failed:
if (pdn->m64_single_mode) {
for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] != IODA_INVALID_PE)
- pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+ if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+ continue;
+
+ pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+ pnv_ioda_free_pe(pe);
}
} else
bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1640,8 +1665,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
struct pnv_ioda_pe *pe;
uint64_t top;
bool bypass = false;
- struct pci_dev *linked_npu_dev;
- int i;
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return -ENODEV;;
@@ -1662,15 +1685,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
*pdev->dev.dma_mask = dma_mask;
/* Update peer npu devices */
- if (pe->flags & PNV_IODA_PE_PEER)
- for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
- if (!pe->peers[i])
- continue;
-
- linked_npu_dev = pe->peers[i]->pdev;
- if (dma_get_mask(&linked_npu_dev->dev) != dma_mask)
- dma_set_mask(&linked_npu_dev->dev, dma_mask);
- }
+ pnv_npu_try_dma_set_bypass(pdev, bypass);
return 0;
}
@@ -1811,28 +1826,34 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.get = pnv_tce_get,
};
-static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+#define TCE_KILL_INVAL_ALL PPC_BIT(0)
+#define TCE_KILL_INVAL_PE PPC_BIT(1)
+#define TCE_KILL_INVAL_TCE PPC_BIT(2)
+
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+ const unsigned long val = TCE_KILL_INVAL_ALL;
+
+ mb(); /* Ensure previous TCE table stores are visible */
+ if (rm)
+ __raw_rm_writeq(cpu_to_be64(val),
+ (__be64 __iomem *)
+ phb->ioda.tce_inval_reg_phys);
+ else
+ __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+}
+
+static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
{
/* 01xb - invalidate TCEs that match the specified PE# */
- unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+ unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
struct pnv_phb *phb = pe->phb;
- struct pnv_ioda_pe *npe;
- int i;
if (!phb->ioda.tce_inval_reg)
return;
mb(); /* Ensure above stores are visible */
__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
-
- if (pe->flags & PNV_IODA_PE_PEER)
- for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
- npe = pe->peers[i];
- if (!npe || npe->phb->type != PNV_PHB_NPU)
- continue;
-
- pnv_npu_tce_invalidate_entire(npe);
- }
}
static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
@@ -1842,7 +1863,7 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
unsigned long start, end, inc;
/* We'll invalidate DMA address in PE scope */
- start = 0x2ull << 60;
+ start = TCE_KILL_INVAL_TCE;
start |= (pe_number & 0xFF);
end = start;
@@ -1867,28 +1888,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
struct iommu_table_group_link *tgl;
list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
- struct pnv_ioda_pe *npe;
struct pnv_ioda_pe *pe = container_of(tgl->table_group,
struct pnv_ioda_pe, table_group);
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
pe->phb->ioda.tce_inval_reg;
- int i;
+ if (pe->phb->type == PNV_PHB_NPU) {
+ /*
+ * The NVLink hardware does not support TCE kill
+ * per TCE entry so we have to invalidate
+ * the entire cache for it.
+ */
+ pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
+ continue;
+ }
pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
invalidate, tbl->it_page_shift,
index, npages);
-
- if (pe->flags & PNV_IODA_PE_PEER)
- /* Invalidate PEs using the same TCE table */
- for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
- npe = pe->peers[i];
- if (!npe || npe->phb->type != PNV_PHB_NPU)
- continue;
-
- pnv_npu_tce_invalidate(npe, tbl, index,
- npages, rm);
- }
}
}
@@ -1945,56 +1962,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.free = pnv_ioda2_table_free,
};
-static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe, unsigned int base,
- unsigned int segs)
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+ unsigned int *weight = (unsigned int *)data;
+
+ /* This is quite simplistic. The "base" weight of a device
+ * is 10. 0 means no DMA is to be accounted for it.
+ */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return 0;
+
+ if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+ *weight += 3;
+ else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+ *weight += 15;
+ else
+ *weight += 10;
+
+ return 0;
+}
+
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+ unsigned int weight = 0;
+
+ /* SRIOV VF has same DMA32 weight as its PF */
+#ifdef CONFIG_PCI_IOV
+ if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+ pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
+ return weight;
+ }
+#endif
+
+ if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+ pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
+ } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+ struct pci_dev *pdev;
+
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
+ pnv_pci_ioda_dev_dma_weight(pdev, &weight);
+ } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
+ pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
+ }
+
+ return weight;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
{
struct page *tce_mem = NULL;
struct iommu_table *tbl;
- unsigned int i;
+ unsigned int weight, total_weight = 0;
+ unsigned int tce32_segsz, base, segs, avail, i;
int64_t rc;
void *addr;
/* XXX FIXME: Handle 64-bit only DMA devices */
/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
/* XXX FIXME: Allocate multi-level tables on PHB3 */
+ weight = pnv_pci_ioda_pe_dma_weight(pe);
+ if (!weight)
+ return;
- /* We shouldn't already have a 32-bit DMA associated */
- if (WARN_ON(pe->tce32_seg >= 0))
+ pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+ &total_weight);
+ segs = (weight * phb->ioda.dma32_count) / total_weight;
+ if (!segs)
+ segs = 1;
+
+ /*
+ * Allocate contiguous DMA32 segments. We begin with the expected
+ * number of segments. With one more attempt, the number of DMA32
+ * segments to be allocated is decreased by one until one segment
+ * is allocated successfully.
+ */
+ do {
+ for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+ for (avail = 0, i = base; i < base + segs; i++) {
+ if (phb->ioda.dma32_segmap[i] ==
+ IODA_INVALID_PE)
+ avail++;
+ }
+
+ if (avail == segs)
+ goto found;
+ }
+ } while (--segs);
+
+ if (!segs) {
+ pe_warn(pe, "No available DMA32 segments\n");
return;
+ }
+found:
tbl = pnv_pci_table_alloc(phb->hose->node);
iommu_register_group(&pe->table_group, phb->hose->global_number,
pe->pe_number);
pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
/* Grab a 32-bit TCE table */
- pe->tce32_seg = base;
+ pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+ weight, total_weight, base, segs);
pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
- (base << 28), ((base + segs) << 28) - 1);
+ base * PNV_IODA1_DMA32_SEGSIZE,
+ (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
/* XXX Currently, we allocate one big contiguous table for the
* TCEs. We only really need one chunk per 256M of TCE space
* (ie per segment) but that's an optimization for later, it
* requires some added smarts with our get/put_tce implementation
+ *
+ * Each TCE page is 4KB in size and each TCE entry occupies 8
+ * bytes
*/
+ tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
- get_order(TCE32_TABLE_SIZE * segs));
+ get_order(tce32_segsz * segs));
if (!tce_mem) {
pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
goto fail;
}
addr = page_address(tce_mem);
- memset(addr, 0, TCE32_TABLE_SIZE * segs);
+ memset(addr, 0, tce32_segsz * segs);
/* Configure HW */
for (i = 0; i < segs; i++) {
rc = opal_pci_map_pe_dma_window(phb->opal_id,
pe->pe_number,
base + i, 1,
- __pa(addr) + TCE32_TABLE_SIZE * i,
- TCE32_TABLE_SIZE, 0x1000);
+ __pa(addr) + tce32_segsz * i,
+ tce32_segsz, IOMMU_PAGE_SIZE_4K);
if (rc) {
pe_err(pe, " Failed to configure 32-bit TCE table,"
" err %ld\n", rc);
@@ -2002,9 +2103,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
}
}
+ /* Setup DMA32 segment mapping */
+ for (i = base; i < base + segs; i++)
+ phb->ioda.dma32_segmap[i] = pe->pe_number;
+
/* Setup linux iommu table */
- pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
- base << 28, IOMMU_PAGE_SHIFT_4K);
+ pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+ base * PNV_IODA1_DMA32_SEGSIZE,
+ IOMMU_PAGE_SHIFT_4K);
/* OPAL variant of P7IOC SW invalidated TCEs */
if (phb->ioda.tce_inval_reg)
@@ -2031,10 +2137,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
return;
fail:
/* XXX Failure: Try to fallback to 64-bit only ? */
- if (pe->tce32_seg >= 0)
- pe->tce32_seg = -1;
if (tce_mem)
- __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+ __free_pages(tce_mem, get_order(tce32_segsz * segs));
if (tbl) {
pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
iommu_free_table(tbl, "pnv");
@@ -2075,7 +2179,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
pnv_pci_link_table_and_group(phb->hose->node, num,
tbl, &pe->table_group);
- pnv_pci_ioda2_tce_invalidate_entire(pe);
+ pnv_pci_ioda2_tce_invalidate_pe(pe);
return 0;
}
@@ -2219,7 +2323,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
if (ret)
pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
else
- pnv_pci_ioda2_tce_invalidate_entire(pe);
+ pnv_pci_ioda2_tce_invalidate_pe(pe);
pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
@@ -2288,6 +2392,116 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.take_ownership = pnv_ioda2_take_ownership,
.release_ownership = pnv_ioda2_release_ownership,
};
+
+static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe **ptmppe = opaque;
+ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return 0;
+
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+ if (phb->type != PNV_PHB_NPU)
+ return 0;
+
+ *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
+
+ return 1;
+}
+
+/*
+ * This returns PE of associated NPU.
+ * This assumes that NPU is in the same IOMMU group with GPU and there is
+ * no other PEs.
+ */
+static struct pnv_ioda_pe *gpe_table_group_to_npe(
+ struct iommu_table_group *table_group)
+{
+ struct pnv_ioda_pe *npe = NULL;
+ int ret = iommu_group_for_each_dev(table_group->group, &npe,
+ gpe_table_group_to_npe_cb);
+
+ BUG_ON(!ret || !npe);
+
+ return npe;
+}
+
+static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
+ int num, struct iommu_table *tbl)
+{
+ long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
+
+ if (ret)
+ return ret;
+
+ ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
+ if (ret)
+ pnv_pci_ioda2_unset_window(table_group, num);
+
+ return ret;
+}
+
+static long pnv_pci_ioda2_npu_unset_window(
+ struct iommu_table_group *table_group,
+ int num)
+{
+ long ret = pnv_pci_ioda2_unset_window(table_group, num);
+
+ if (ret)
+ return ret;
+
+ return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
+}
+
+static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+{
+ /*
+ * Detach NPU first as pnv_ioda2_take_ownership() will destroy
+ * the iommu_table if 32bit DMA is enabled.
+ */
+ pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
+ pnv_ioda2_take_ownership(table_group);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
+ .get_table_size = pnv_pci_ioda2_get_table_size,
+ .create_table = pnv_pci_ioda2_create_table,
+ .set_window = pnv_pci_ioda2_npu_set_window,
+ .unset_window = pnv_pci_ioda2_npu_unset_window,
+ .take_ownership = pnv_ioda2_npu_take_ownership,
+ .release_ownership = pnv_ioda2_release_ownership,
+};
+
+static void pnv_pci_ioda_setup_iommu_api(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *gpe;
+
+ /*
+ * Now we have all PHBs discovered, time to add NPU devices to
+ * the corresponding IOMMU groups.
+ */
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->type != PNV_PHB_NPU)
+ continue;
+
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ gpe = pnv_pci_npu_setup_iommu(pe);
+ if (gpe)
+ gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+ }
+ }
+}
+#else /* !CONFIG_IOMMU_API */
+static void pnv_pci_ioda_setup_iommu_api(void) { };
#endif
static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
@@ -2443,10 +2657,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
{
int64_t rc;
- /* We shouldn't already have a 32-bit DMA associated */
- if (WARN_ON(pe->tce32_seg >= 0))
- return;
-
/* TVE #1 is selected by PCI address bit 59 */
pe->tce_bypass_base = 1ull << 59;
@@ -2454,7 +2664,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
pe->pe_number);
/* The PE will reserve all possible 32-bits space */
- pe->tce32_seg = 0;
pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
phb->ioda.m32_pci_base);
@@ -2470,11 +2679,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
#endif
rc = pnv_pci_ioda2_setup_default_config(pe);
- if (rc) {
- if (pe->tce32_seg >= 0)
- pe->tce32_seg = -1;
+ if (rc)
return;
- }
if (pe->flags & PNV_IODA_PE_DEV)
iommu_add_device(&pe->pdev->dev);
@@ -2485,47 +2691,24 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
static void pnv_ioda_setup_dma(struct pnv_phb *phb)
{
struct pci_controller *hose = phb->hose;
- unsigned int residual, remaining, segs, tw, base;
struct pnv_ioda_pe *pe;
+ unsigned int weight;
/* If we have more PE# than segments available, hand out one
* per PE until we run out and let the rest fail. If not,
* then we assign at least one segment per PE, plus more based
* on the amount of devices under that PE
*/
- if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
- residual = 0;
- else
- residual = phb->ioda.tce32_count -
- phb->ioda.dma_pe_count;
-
- pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
- hose->global_number, phb->ioda.tce32_count);
- pr_info("PCI: %d PE# for a total weight of %d\n",
- phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+ pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n",
+ hose->global_number, phb->ioda.dma32_count);
pnv_pci_ioda_setup_opal_tce_kill(phb);
- /* Walk our PE list and configure their DMA segments, hand them
- * out one base segment plus any residual segments based on
- * weight
- */
- remaining = phb->ioda.tce32_count;
- tw = phb->ioda.dma_weight;
- base = 0;
- list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
- if (!pe->dma_weight)
- continue;
- if (!remaining) {
- pe_warn(pe, "No DMA32 resources available\n");
+ /* Walk our PE list and configure their DMA segments */
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ weight = pnv_pci_ioda_pe_dma_weight(pe);
+ if (!weight)
continue;
- }
- segs = 1;
- if (residual) {
- segs += ((pe->dma_weight * residual) + (tw / 2)) / tw;
- if (segs > remaining)
- segs = remaining;
- }
/*
* For IODA2 compliant PHB3, we needn't care about the weight.
@@ -2533,12 +2716,9 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
* the specific PE.
*/
if (phb->type == PNV_PHB_IODA1) {
- pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
- pe->dma_weight, segs);
- pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+ pnv_pci_ioda1_setup_dma_pe(phb, pe);
} else if (phb->type == PNV_PHB_IODA2) {
pe_info(pe, "Assign DMA32 space\n");
- segs = 0;
pnv_pci_ioda2_setup_dma_pe(phb, pe);
} else if (phb->type == PNV_PHB_NPU) {
/*
@@ -2548,9 +2728,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
* as the PHB3 TVT.
*/
}
-
- remaining -= segs;
- base += segs;
}
}
@@ -2858,7 +3035,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
pdn->m64_single_mode = false;
total_vfs = pci_sriov_get_totalvfs(pdev);
- mul = phb->ioda.total_pe;
+ mul = phb->ioda.total_pe_num;
total_vf_bar_sz = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -2929,19 +3106,72 @@ truncate_iov:
}
#endif /* CONFIG_PCI_IOV */
+static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
+ struct resource *res)
+{
+ struct pnv_phb *phb = pe->phb;
+ struct pci_bus_region region;
+ int index;
+ int64_t rc;
+
+ if (!res || !res->flags || res->start > res->end)
+ return;
+
+ if (res->flags & IORESOURCE_IO) {
+ region.start = res->start - phb->ioda.io_pci_base;
+ region.end = res->end - phb->ioda.io_pci_base;
+ index = region.start / phb->ioda.io_segsize;
+
+ while (index < phb->ioda.total_pe_num &&
+ region.start <= region.end) {
+ phb->ioda.io_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.io_segsize;
+ index++;
+ }
+ } else if ((res->flags & IORESOURCE_MEM) &&
+ !pnv_pci_is_mem_pref_64(res->flags)) {
+ region.start = res->start -
+ phb->hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ region.end = res->end -
+ phb->hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ index = region.start / phb->ioda.m32_segsize;
+
+ while (index < phb->ioda.total_pe_num &&
+ region.start <= region.end) {
+ phb->ioda.m32_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.m32_segsize;
+ index++;
+ }
+ }
+}
+
/*
* This function is supposed to be called on basis of PE from top
* to bottom style. So the the I/O or MMIO segment assigned to
* parent PE could be overrided by its child PEs if necessary.
*/
-static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
- struct pnv_ioda_pe *pe)
+static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
{
- struct pnv_phb *phb = hose->private_data;
- struct pci_bus_region region;
- struct resource *res;
- int i, index;
- int rc;
+ struct pci_dev *pdev;
+ int i;
/*
* NOTE: We only care PCI bus based PE for now. For PCI
@@ -2950,57 +3180,20 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
*/
BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
- pci_bus_for_each_resource(pe->pbus, res, i) {
- if (!res || !res->flags ||
- res->start > res->end)
- continue;
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+ pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
- if (res->flags & IORESOURCE_IO) {
- region.start = res->start - phb->ioda.io_pci_base;
- region.end = res->end - phb->ioda.io_pci_base;
- index = region.start / phb->ioda.io_segsize;
-
- while (index < phb->ioda.total_pe &&
- region.start <= region.end) {
- phb->ioda.io_segmap[index] = pe->pe_number;
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
- if (rc != OPAL_SUCCESS) {
- pr_err("%s: OPAL error %d when mapping IO "
- "segment #%d to PE#%d\n",
- __func__, rc, index, pe->pe_number);
- break;
- }
-
- region.start += phb->ioda.io_segsize;
- index++;
- }
- } else if ((res->flags & IORESOURCE_MEM) &&
- !pnv_pci_is_mem_pref_64(res->flags)) {
- region.start = res->start -
- hose->mem_offset[0] -
- phb->ioda.m32_pci_base;
- region.end = res->end -
- hose->mem_offset[0] -
- phb->ioda.m32_pci_base;
- index = region.start / phb->ioda.m32_segsize;
-
- while (index < phb->ioda.total_pe &&
- region.start <= region.end) {
- phb->ioda.m32_segmap[index] = pe->pe_number;
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
- if (rc != OPAL_SUCCESS) {
- pr_err("%s: OPAL error %d when mapping M32 "
- "segment#%d to PE#%d",
- __func__, rc, index, pe->pe_number);
- break;
- }
-
- region.start += phb->ioda.m32_segsize;
- index++;
- }
- }
+ /*
+ * If the PE contains all subordinate PCI buses, the
+ * windows of the child bridges should be mapped to
+ * the PE as well.
+ */
+ if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
+ continue;
+ for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+ pnv_ioda_setup_pe_res(pe,
+ &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
}
}
@@ -3018,7 +3211,7 @@ static void pnv_pci_ioda_setup_seg(void)
continue;
list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- pnv_ioda_setup_pe_seg(hose, pe);
+ pnv_ioda_setup_pe_seg(pe);
}
}
}
@@ -3035,6 +3228,8 @@ static void pnv_pci_ioda_setup_DMA(void)
phb = hose->private_data;
phb->initialized = 1;
}
+
+ pnv_pci_ioda_setup_iommu_api();
}
static void pnv_pci_ioda_create_dbgfs(void)
@@ -3056,27 +3251,6 @@ static void pnv_pci_ioda_create_dbgfs(void)
#endif /* CONFIG_DEBUG_FS */
}
-static void pnv_npu_ioda_fixup(void)
-{
- bool enable_bypass;
- struct pci_controller *hose, *tmp;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
-
- list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
- phb = hose->private_data;
- if (phb->type != PNV_PHB_NPU)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
- enable_bypass = dma_get_mask(&pe->pdev->dev) ==
- DMA_BIT_MASK(64);
- pnv_npu_init_dma_pe(pe);
- pnv_npu_dma_set_bypass(pe, enable_bypass);
- }
- }
-}
-
static void pnv_pci_ioda_fixup(void)
{
pnv_pci_ioda_setup_PEs();
@@ -3089,9 +3263,6 @@ static void pnv_pci_ioda_fixup(void)
eeh_init();
eeh_addr_cache_build();
#endif
-
- /* Link NPU IODA tables to their PCI devices. */
- pnv_npu_ioda_fixup();
}
/*
@@ -3195,12 +3366,6 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
return true;
}
-static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
- u32 devfn)
-{
- return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
-}
-
static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
{
struct pnv_phb *phb = hose->private_data;
@@ -3210,31 +3375,39 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
}
static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
- .dma_bus_setup = pnv_pci_dma_bus_setup,
+ .dma_dev_setup = pnv_pci_dma_dev_setup,
+ .dma_bus_setup = pnv_pci_dma_bus_setup,
#ifdef CONFIG_PCI_MSI
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
+ .setup_msi_irqs = pnv_setup_msi_irqs,
+ .teardown_msi_irqs = pnv_teardown_msi_irqs,
#endif
- .enable_device_hook = pnv_pci_enable_device_hook,
- .window_alignment = pnv_pci_window_alignment,
- .reset_secondary_bus = pnv_pci_reset_secondary_bus,
- .dma_set_mask = pnv_pci_ioda_dma_set_mask,
- .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
- .shutdown = pnv_pci_ioda_shutdown,
+ .enable_device_hook = pnv_pci_enable_device_hook,
+ .window_alignment = pnv_pci_window_alignment,
+ .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+ .dma_set_mask = pnv_pci_ioda_dma_set_mask,
+ .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
+ .shutdown = pnv_pci_ioda_shutdown,
};
+static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+ dev_err_once(&npdev->dev,
+ "%s operation unsupported for NVLink devices\n",
+ __func__);
+ return -EPERM;
+}
+
static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
+ .dma_dev_setup = pnv_pci_dma_dev_setup,
#ifdef CONFIG_PCI_MSI
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
+ .setup_msi_irqs = pnv_setup_msi_irqs,
+ .teardown_msi_irqs = pnv_teardown_msi_irqs,
#endif
- .enable_device_hook = pnv_pci_enable_device_hook,
- .window_alignment = pnv_pci_window_alignment,
- .reset_secondary_bus = pnv_pci_reset_secondary_bus,
- .dma_set_mask = pnv_npu_dma_set_mask,
- .shutdown = pnv_pci_ioda_shutdown,
+ .enable_device_hook = pnv_pci_enable_device_hook,
+ .window_alignment = pnv_pci_window_alignment,
+ .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+ .dma_set_mask = pnv_npu_dma_set_mask,
+ .shutdown = pnv_pci_ioda_shutdown,
};
static void __init pnv_pci_init_ioda_phb(struct device_node *np,
@@ -3242,10 +3415,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
{
struct pci_controller *hose;
struct pnv_phb *phb;
- unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+ unsigned long size, m64map_off, m32map_off, pemap_off;
+ unsigned long iomap_off = 0, dma32map_off = 0;
const __be64 *prop64;
const __be32 *prop32;
int len;
+ unsigned int segno;
u64 phb_id;
void *aux;
long rc;
@@ -3306,13 +3481,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
pr_err(" Failed to map registers !\n");
/* Initialize more IODA stuff */
- phb->ioda.total_pe = 1;
+ phb->ioda.total_pe_num = 1;
prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
if (prop32)
- phb->ioda.total_pe = be32_to_cpup(prop32);
+ phb->ioda.total_pe_num = be32_to_cpup(prop32);
prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
if (prop32)
- phb->ioda.reserved_pe = be32_to_cpup(prop32);
+ phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
/* Parse 64-bit MMIO range */
pnv_ioda_parse_m64_window(phb);
@@ -3321,36 +3496,58 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
/* FW Has already off top 64k of M32 space (MSI space) */
phb->ioda.m32_size += 0x10000;
- phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+ phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
phb->ioda.io_size = hose->pci_io_size;
- phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+ phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+ /* Calculate how many 32-bit TCE segments we have */
+ phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+ PNV_IODA1_DMA32_SEGSIZE;
+
/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
- size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+ size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+ sizeof(unsigned long));
+ m64map_off = size;
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
m32map_off = size;
- size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
if (phb->type == PNV_PHB_IODA1) {
iomap_off = size;
- size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+ dma32map_off = size;
+ size += phb->ioda.dma32_count *
+ sizeof(phb->ioda.dma32_segmap[0]);
}
pemap_off = size;
- size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+ size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
aux = memblock_virt_alloc(size, 0);
phb->ioda.pe_alloc = aux;
+ phb->ioda.m64_segmap = aux + m64map_off;
phb->ioda.m32_segmap = aux + m32map_off;
- if (phb->type == PNV_PHB_IODA1)
+ for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+ phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+ phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+ }
+ if (phb->type == PNV_PHB_IODA1) {
phb->ioda.io_segmap = aux + iomap_off;
+ for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+ phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+ phb->ioda.dma32_segmap = aux + dma32map_off;
+ for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+ phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+ }
phb->ioda.pe_array = aux + pemap_off;
- set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
+ set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc);
- INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
INIT_LIST_HEAD(&phb->ioda.pe_list);
mutex_init(&phb->ioda.pe_list_mutex);
/* Calculate how many 32-bit TCE segments we have */
- phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+ phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+ PNV_IODA1_DMA32_SEGSIZE;
#if 0 /* We should really do that ... */
rc = opal_pci_set_phb_mem_window(opal->phb_id,
@@ -3362,7 +3559,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
#endif
pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
- phb->ioda.total_pe, phb->ioda.reserved_pe,
+ phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
phb->ioda.m32_size, phb->ioda.m32_segsize);
if (phb->ioda.m64_size)
pr_info(" M64: 0x%lx [segment=0x%lx]\n",
@@ -3377,12 +3574,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->freeze_pe = pnv_ioda_freeze_pe;
phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
- /* Setup RID -> PE mapping function */
- phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
-
- /* Setup TCEs */
- phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-
/* Setup MSI support */
pnv_pci_init_ioda_msis(phb);
@@ -3395,10 +3586,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
*/
ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
- if (phb->type == PNV_PHB_NPU)
+ if (phb->type == PNV_PHB_NPU) {
hose->controller_ops = pnv_npu_ioda_controller_ops;
- else
+ } else {
+ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
hose->controller_ops = pnv_pci_ioda_controller_ops;
+ }
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 73c8dc2..1d92bd9 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -39,9 +39,6 @@
/* Delay in usec */
#define PCI_RESET_DELAY_US 3000000
-#define cfg_dbg(fmt...) do { } while(0)
-//#define cfg_dbg(fmt...) printk(fmt)
-
#ifdef CONFIG_PCI_MSI
int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
{
@@ -370,7 +367,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
struct pnv_phb *phb = pdn->phb->private_data;
u8 fstate;
__be16 pcierr;
- int pe_no;
+ unsigned int pe_no;
s64 rc;
/*
@@ -380,7 +377,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
*/
pe_no = pdn->pe_number;
if (pe_no == IODA_INVALID_PE) {
- pe_no = phb->ioda.reserved_pe;
+ pe_no = phb->ioda.reserved_pe_idx;
}
/*
@@ -402,8 +399,8 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
}
}
- cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
- (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+ pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+ (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
/* Clear the frozen state if applicable */
if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
@@ -451,8 +448,8 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
return PCIBIOS_FUNC_NOT_SUPPORTED;
}
- cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
- __func__, pdn->busno, pdn->devfn, where, size, *val);
+ pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ __func__, pdn->busno, pdn->devfn, where, size, *val);
return PCIBIOS_SUCCESSFUL;
}
@@ -462,8 +459,8 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
struct pnv_phb *phb = pdn->phb->private_data;
u32 bdfn = (pdn->busno << 8) | pdn->devfn;
- cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
- pdn->busno, pdn->devfn, where, size, val);
+ pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ __func__, pdn->busno, pdn->devfn, where, size, val);
switch (size) {
case 1:
opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 3f814f3..7dee25e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -24,7 +24,6 @@ enum pnv_phb_model {
#define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
-#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */
/* Data associated with a PE, including IOMMU tracking etc.. */
struct pnv_phb;
@@ -32,9 +31,6 @@ struct pnv_ioda_pe {
unsigned long flags;
struct pnv_phb *phb;
-#define PNV_IODA_MAX_PEER_PES 8
- struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES];
-
/* A PE can be associated with a single device or an
* entire bus (& children). In the former case, pdev
* is populated, in the later case, pbus is.
@@ -53,14 +49,7 @@ struct pnv_ioda_pe {
/* PE number */
unsigned int pe_number;
- /* "Weight" assigned to the PE for the sake of DMA resource
- * allocations
- */
- unsigned int dma_weight;
-
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
- int tce32_seg;
- int tce32_segcount;
struct iommu_table_group table_group;
/* 64-bit TCE bypass region */
@@ -78,7 +67,6 @@ struct pnv_ioda_pe {
struct list_head slaves;
/* Link in list of PE#s */
- struct list_head dma_link;
struct list_head list;
};
@@ -110,19 +98,18 @@ struct pnv_phb {
unsigned int is_64, struct msi_msg *msg);
void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
void (*fixup_phb)(struct pci_controller *hose);
- u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
int (*init_m64)(struct pnv_phb *phb);
void (*reserve_m64_pe)(struct pci_bus *bus,
unsigned long *pe_bitmap, bool all);
- int (*pick_m64_pe)(struct pci_bus *bus, bool all);
+ struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
struct {
/* Global bridge info */
- unsigned int total_pe;
- unsigned int reserved_pe;
+ unsigned int total_pe_num;
+ unsigned int reserved_pe_idx;
/* 32-bit MMIO window */
unsigned int m32_size;
@@ -141,15 +128,19 @@ struct pnv_phb {
unsigned int io_segsize;
unsigned int io_pci_base;
- /* PE allocation bitmap */
- unsigned long *pe_alloc;
- /* PE allocation mutex */
+ /* PE allocation */
struct mutex pe_alloc_mutex;
+ unsigned long *pe_alloc;
+ struct pnv_ioda_pe *pe_array;
/* M32 & IO segment maps */
+ unsigned int *m64_segmap;
unsigned int *m32_segmap;
unsigned int *io_segmap;
- struct pnv_ioda_pe *pe_array;
+
+ /* DMA32 segment maps - IODA1 only */
+ unsigned int dma32_count;
+ unsigned int *dma32_segmap;
/* IRQ chip */
int irq_chip_init;
@@ -167,20 +158,6 @@ struct pnv_phb {
*/
unsigned char pe_rmap[0x10000];
- /* 32-bit TCE tables allocation */
- unsigned long tce32_count;
-
- /* Total "weight" for the sake of DMA resources
- * allocation
- */
- unsigned int dma_weight;
- unsigned int dma_pe_count;
-
- /* Sorted list of used PE's, sorted at
- * boot for resource allocation purposes
- */
- struct list_head pe_dma_list;
-
/* TCE cache invalidate registers (physical and
* remapped)
*/
@@ -236,16 +213,23 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+ const char *fmt, ...);
+#define pe_err(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+
/* Nvlink functions */
-extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe);
-extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
- struct iommu_table *tbl,
- unsigned long index,
- unsigned long npages,
- bool rm);
-extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
-extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
-extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled);
-extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask);
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+ struct iommu_table *tbl);
+extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
+extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
+extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 1acb0c7..ee6430b 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -273,7 +273,10 @@ static int __init pnv_probe(void)
if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
return 0;
- hpte_init_native();
+ if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled())
+ radix_init_native();
+ else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64))
+ hpte_init_native();
if (firmware_has_feature(FW_FEATURE_OPAL))
pnv_setup_machdep_opal();
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 2f95d33..c9a3e67 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
vflags &= ~HPTE_V_SECONDARY;
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
+ hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
spin_lock_irqsave(&ps3_htab_lock, flags);
diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index a0bca05..492b257 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu)
static int __init setup_areas(struct spu *spu)
{
struct table {char* name; unsigned long addr; unsigned long size;};
- static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3;
+ unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
sizeof(struct spe_shadow),
@@ -216,7 +216,7 @@ static int __init setup_areas(struct spu *spu)
}
spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys,
- LS_SIZE, _PAGE_NO_CACHE);
+ LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
if (!spu->local_store) {
pr_debug("%s:%d: ioremap local_store failed\n",
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index e9ff44c..2ce1385 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -116,6 +116,155 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn)
return new_prop;
}
+static void dlpar_update_drconf_property(struct device_node *dn,
+ struct property *prop)
+{
+ struct of_drconf_cell *lmbs;
+ u32 num_lmbs, *p;
+ int i;
+
+ /* Convert the property back to BE */
+ p = prop->value;
+ num_lmbs = *p;
+ *p = cpu_to_be32(*p);
+ p++;
+
+ lmbs = (struct of_drconf_cell *)p;
+ for (i = 0; i < num_lmbs; i++) {
+ lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
+ lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
+ lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
+ }
+
+ rtas_hp_event = true;
+ of_update_property(dn, prop);
+ rtas_hp_event = false;
+}
+
+static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+ struct device_node *dn;
+ struct property *prop;
+ struct of_drconf_cell *lmbs;
+ u32 *p, num_lmbs;
+ int i;
+
+ dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!dn)
+ return -ENODEV;
+
+ prop = dlpar_clone_drconf_property(dn);
+ if (!prop) {
+ of_node_put(dn);
+ return -ENODEV;
+ }
+
+ p = prop->value;
+ num_lmbs = *p++;
+ lmbs = (struct of_drconf_cell *)p;
+
+ for (i = 0; i < num_lmbs; i++) {
+ if (lmbs[i].drc_index == lmb->drc_index) {
+ lmbs[i].flags = lmb->flags;
+ lmbs[i].aa_index = lmb->aa_index;
+
+ dlpar_update_drconf_property(dn, prop);
+ break;
+ }
+ }
+
+ of_node_put(dn);
+ return 0;
+}
+
+static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb)
+{
+ struct device_node *parent, *lmb_node, *dr_node;
+ const u32 *lmb_assoc;
+ const u32 *assoc_arrays;
+ u32 aa_index;
+ int aa_arrays, aa_array_entries, aa_array_sz;
+ int i;
+
+ parent = of_find_node_by_path("/");
+ if (!parent)
+ return -ENODEV;
+
+ lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
+ parent);
+ of_node_put(parent);
+ if (!lmb_node)
+ return -EINVAL;
+
+ lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
+ if (!lmb_assoc) {
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!dr_node) {
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ assoc_arrays = of_get_property(dr_node,
+ "ibm,associativity-lookup-arrays",
+ NULL);
+ of_node_put(dr_node);
+ if (!assoc_arrays) {
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ /* The ibm,associativity-lookup-arrays property is defined to be
+ * a 32-bit value specifying the number of associativity arrays
+ * followed by a 32-bitvalue specifying the number of entries per
+ * array, followed by the associativity arrays.
+ */
+ aa_arrays = be32_to_cpu(assoc_arrays[0]);
+ aa_array_entries = be32_to_cpu(assoc_arrays[1]);
+ aa_array_sz = aa_array_entries * sizeof(u32);
+
+ aa_index = -1;
+ for (i = 0; i < aa_arrays; i++) {
+ int indx = (i * aa_array_entries) + 2;
+
+ if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz))
+ continue;
+
+ aa_index = i;
+ break;
+ }
+
+ dlpar_free_cc_nodes(lmb_node);
+ return aa_index;
+}
+
+static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+ int aa_index;
+
+ lmb->flags |= DRCONF_MEM_ASSIGNED;
+
+ aa_index = lookup_lmb_associativity_index(lmb);
+ if (aa_index < 0) {
+ pr_err("Couldn't find associativity index for drc index %x\n",
+ lmb->drc_index);
+ return aa_index;
+ }
+
+ lmb->aa_index = aa_index;
+ return dlpar_update_device_tree_lmb(lmb);
+}
+
+static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+ lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+ lmb->aa_index = 0xffffffff;
+ return dlpar_update_device_tree_lmb(lmb);
+}
+
static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
{
unsigned long section_nr;
@@ -243,8 +392,8 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
memblock_remove(lmb->base_addr, block_sz);
dlpar_release_drc(lmb->drc_index);
+ dlpar_remove_device_tree_lmb(lmb);
- lmb->flags &= ~DRCONF_MEM_ASSIGNED;
return 0;
}
@@ -384,43 +533,32 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop)
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb)
{
struct memory_block *mem_block;
unsigned long block_sz;
int nid, rc;
- if (lmb->flags & DRCONF_MEM_ASSIGNED)
- return -EINVAL;
-
block_sz = memory_block_size_bytes();
- rc = dlpar_acquire_drc(lmb->drc_index);
- if (rc)
- return rc;
-
/* Find the node id for this address */
nid = memory_add_physaddr_to_nid(lmb->base_addr);
/* Add the memory */
rc = add_memory(nid, lmb->base_addr, block_sz);
- if (rc) {
- dlpar_release_drc(lmb->drc_index);
+ if (rc)
return rc;
- }
/* Register this block of memory */
rc = memblock_add(lmb->base_addr, block_sz);
if (rc) {
remove_memory(nid, lmb->base_addr, block_sz);
- dlpar_release_drc(lmb->drc_index);
return rc;
}
mem_block = lmb_to_memblock(lmb);
if (!mem_block) {
remove_memory(nid, lmb->base_addr, block_sz);
- dlpar_release_drc(lmb->drc_index);
return -EINVAL;
}
@@ -428,7 +566,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
put_device(&mem_block->dev);
if (rc) {
remove_memory(nid, lmb->base_addr, block_sz);
- dlpar_release_drc(lmb->drc_index);
return rc;
}
@@ -436,6 +573,34 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
return 0;
}
+static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+{
+ int rc;
+
+ if (lmb->flags & DRCONF_MEM_ASSIGNED)
+ return -EINVAL;
+
+ rc = dlpar_acquire_drc(lmb->drc_index);
+ if (rc)
+ return rc;
+
+ rc = dlpar_add_device_tree_lmb(lmb);
+ if (rc) {
+ pr_err("Couldn't update device tree for drc index %x\n",
+ lmb->drc_index);
+ dlpar_release_drc(lmb->drc_index);
+ return rc;
+ }
+
+ rc = dlpar_add_lmb_memory(lmb);
+ if (rc) {
+ dlpar_remove_device_tree_lmb(lmb);
+ dlpar_release_drc(lmb->drc_index);
+ }
+
+ return rc;
+}
+
static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop)
{
struct of_drconf_cell *lmbs;
@@ -536,31 +701,6 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop)
return rc;
}
-static void dlpar_update_drconf_property(struct device_node *dn,
- struct property *prop)
-{
- struct of_drconf_cell *lmbs;
- u32 num_lmbs, *p;
- int i;
-
- /* Convert the property back to BE */
- p = prop->value;
- num_lmbs = *p;
- *p = cpu_to_be32(*p);
- p++;
-
- lmbs = (struct of_drconf_cell *)p;
- for (i = 0; i < num_lmbs; i++) {
- lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
- lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
- lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
- }
-
- rtas_hp_event = true;
- of_update_property(dn, prop);
- rtas_hp_event = false;
-}
-
int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
{
struct device_node *dn;
@@ -608,10 +748,7 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
break;
}
- if (rc)
- dlpar_free_drconf_property(prop);
- else
- dlpar_update_drconf_property(dn, prop);
+ dlpar_free_drconf_property(prop);
dlpar_memory_out:
of_node_put(dn);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index bd98ce2..b7dfc13 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -912,7 +912,8 @@ machine_arch_initcall(pseries, find_existing_ddw_windows);
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
struct ddw_query_response *query)
{
- struct eeh_dev *edev;
+ struct device_node *dn;
+ struct pci_dn *pdn;
u32 cfg_addr;
u64 buid;
int ret;
@@ -923,11 +924,10 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
* Retrieve them from the pci device, not the node with the
* dma-window property
*/
- edev = pci_dev_to_eeh_dev(dev);
- cfg_addr = edev->config_addr;
- if (edev->pe_config_addr)
- cfg_addr = edev->pe_config_addr;
- buid = edev->phb->buid;
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ buid = pdn->phb->buid;
+ cfg_addr = (pdn->busno << 8) | pdn->devfn;
ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
@@ -941,7 +941,8 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
struct ddw_create_response *create, int page_shift,
int window_shift)
{
- struct eeh_dev *edev;
+ struct device_node *dn;
+ struct pci_dn *pdn;
u32 cfg_addr;
u64 buid;
int ret;
@@ -952,11 +953,10 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
* Retrieve them from the pci device, not the node with the
* dma-window property
*/
- edev = pci_dev_to_eeh_dev(dev);
- cfg_addr = edev->config_addr;
- if (edev->pe_config_addr)
- cfg_addr = edev->pe_config_addr;
- buid = edev->phb->buid;
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ buid = pdn->phb->buid;
+ cfg_addr = (pdn->busno << 8) | pdn->devfn;
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 2415a0d..7f6100d 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -89,18 +89,21 @@ void vpa_init(int cpu)
"%lx failed with %ld\n", cpu, hwcpu, addr, ret);
return;
}
+
+#ifdef CONFIG_PPC_STD_MMU_64
/*
* PAPR says this feature is SLB-Buffer but firmware never
* reports that. All SPLPAR support SLB shadow buffer.
*/
- addr = __pa(paca[cpu].slb_shadow_ptr);
- if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ addr = __pa(paca[cpu].slb_shadow_ptr);
ret = register_slb_shadow(hwcpu, addr);
if (ret)
pr_err("WARNING: SLB shadow buffer registration for "
"cpu %d (hw %d) of area %lx failed with %ld\n",
cpu, hwcpu, addr, ret);
}
+#endif /* CONFIG_PPC_STD_MMU_64 */
/*
* Register dispatch trace log, if one has been allocated.
@@ -123,6 +126,8 @@ void vpa_init(int cpu)
}
}
+#ifdef CONFIG_PPC_STD_MMU_64
+
static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
unsigned long vpn, unsigned long pa,
unsigned long rflags, unsigned long vflags,
@@ -139,7 +144,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
hpte_group, vpn, pa, rflags, vflags, psize);
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+ hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
@@ -152,10 +157,6 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
/* Exact = 0 */
flags = 0;
- /* Make pHyp happy */
- if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
- hpte_r &= ~HPTE_R_M;
-
if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
flags |= H_COALESCE_CAND;
@@ -659,6 +660,8 @@ static void pSeries_set_page_state(struct page *page, int order,
void arch_free_page(struct page *page, int order)
{
+ if (radix_enabled())
+ return;
if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
return;
@@ -666,7 +669,8 @@ void arch_free_page(struct page *page, int order)
}
EXPORT_SYMBOL(arch_free_page);
-#endif
+#endif /* CONFIG_PPC_SMLPAR */
+#endif /* CONFIG_PPC_STD_MMU_64 */
#ifdef CONFIG_TRACEPOINTS
#ifdef HAVE_JUMP_LABEL
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index c9fecf0..afa05a2 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -484,8 +484,9 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
seq_printf(m, "shared_processor_mode=%d\n",
lppaca_shared_proc(get_lppaca()));
+#ifdef CONFIG_PPC_STD_MMU_64
seq_printf(m, "slb_size=%d\n", mmu_slb_size);
-
+#endif
parse_em_data(m);
return 0;
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index ceb18d3..a560a98 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -191,8 +191,8 @@ static int update_dt_node(__be32 phandle, s32 scope)
break;
case 0x80000000:
- prop = of_find_property(dn, prop_name, NULL);
- of_remove_property(dn, prop);
+ of_remove_property(dn, of_find_property(dn,
+ prop_name, NULL));
prop = NULL;
break;
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 272e9ec..543a638 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -305,7 +305,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request)
memset(&counts, 0, sizeof(struct msi_counts));
/* Work out how many devices we have below this PE */
- traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+ pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts);
if (counts.num_devices == 0) {
pr_err("rtas_msi: found 0 devices under PE for %s\n",
@@ -320,7 +320,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request)
/* else, we have some more calculating to do */
counts.requestor = pci_device_to_OF_node(dev);
counts.request = request;
- traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+ pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts);
/* If the quota isn't an integer multiple of the total, we can
* use the remainder as spare MSIs for anyone that wants them. */
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index 5d4a3df..906dbaa 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -34,38 +34,6 @@
#include "pseries.h"
-static struct pci_bus *
-find_bus_among_children(struct pci_bus *bus,
- struct device_node *dn)
-{
- struct pci_bus *child = NULL;
- struct pci_bus *tmp;
- struct device_node *busdn;
-
- busdn = pci_bus_to_OF_node(bus);
- if (busdn == dn)
- return bus;
-
- list_for_each_entry(tmp, &bus->children, node) {
- child = find_bus_among_children(tmp, dn);
- if (child)
- break;
- };
- return child;
-}
-
-struct pci_bus *
-pcibios_find_pci_bus(struct device_node *dn)
-{
- struct pci_dn *pdn = dn->data;
-
- if (!pdn || !pdn->phb || !pdn->phb->bus)
- return NULL;
-
- return find_bus_among_children(pdn->phb->bus, dn);
-}
-EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
-
struct pci_controller *init_phb_dynamic(struct device_node *dn)
{
struct pci_controller *phb;
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 7c7fcc0..cc66c49 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -303,7 +303,6 @@ static int do_remove_property(char *buf, size_t bufsize)
{
struct device_node *np;
char *tmp;
- struct property *prop;
buf = parse_node(buf, bufsize, &np);
if (!np)
@@ -316,9 +315,7 @@ static int do_remove_property(char *buf, size_t bufsize)
if (strlen(buf) == 0)
return -EINVAL;
- prop = of_find_property(np, buf, NULL);
-
- return of_remove_property(np, prop);
+ return of_remove_property(np, of_find_property(np, buf, NULL));
}
static int do_update_property(char *buf, size_t bufsize)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 6e944fc..9883bc7 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -235,6 +235,8 @@ static void __init pseries_discover_pic(void)
for_each_node_by_name(np, "interrupt-controller") {
typep = of_get_property(np, "compatible", NULL);
+ if (!typep)
+ continue;
if (strstr(typep, "open-pic")) {
pSeries_mpic_node = of_node_get(np);
ppc_md.init_IRQ = pseries_mpic_init_IRQ;
@@ -265,7 +267,7 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act
pdn = parent ? PCI_DN(parent) : NULL;
if (pdn) {
/* Create pdn and EEH device */
- update_dn_pci_info(np, pdn->phb);
+ pci_add_device_node_info(pdn->phb, np);
eeh_dev_init(PCI_DN(np), pdn->phb);
}
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 85729f4..0ef9df4 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -37,6 +37,7 @@
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
#include <asm/machdep.h>
+#include <asm/mpc85xx.h>
#include <asm/disassemble.h>
#include <asm/ppc-opcode.h>
#include <sysdev/fsl_soc.h>
@@ -527,6 +528,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary)
u8 hdr_type, progif;
struct device_node *dev;
struct ccsr_pci __iomem *pci;
+ u16 temp;
+ u32 svr = mfspr(SPRN_SVR);
dev = pdev->dev.of_node;
@@ -596,6 +599,27 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary)
PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS;
if (fsl_pcie_check_link(hose))
hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
+ } else {
+ /*
+ * Set PBFR(PCI Bus Function Register)[10] = 1 to
+ * disable the combining of crossing cacheline
+ * boundary requests into one burst transaction.
+ * PCI-X operation is not affected.
+ * Fix erratum PCI 5 on MPC8548
+ */
+#define PCI_BUS_FUNCTION 0x44
+#define PCI_BUS_FUNCTION_MDS 0x400 /* Master disable streaming */
+ if (((SVR_SOC_VER(svr) == SVR_8543) ||
+ (SVR_SOC_VER(svr) == SVR_8545) ||
+ (SVR_SOC_VER(svr) == SVR_8547) ||
+ (SVR_SOC_VER(svr) == SVR_8548)) &&
+ !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) {
+ early_read_config_word(hose, 0, 0,
+ PCI_BUS_FUNCTION, &temp);
+ temp |= PCI_BUS_FUNCTION_MDS;
+ early_write_config_word(hose, 0, 0,
+ PCI_BUS_FUNCTION, temp);
+ }
}
printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index afe3c7c..7de45b2 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -2004,8 +2004,15 @@ static struct syscore_ops mpic_syscore_ops = {
static int mpic_init_sys(void)
{
+ int rc;
+
register_syscore_ops(&mpic_syscore_ops);
- subsys_system_register(&mpic_subsys, NULL);
+ rc = subsys_system_register(&mpic_subsys, NULL);
+ if (rc) {
+ unregister_syscore_ops(&mpic_syscore_ops);
+ pr_err("mpic: Failed to register subsystem!\n");
+ return rc;
+ }
return 0;
}
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index 436062d..0b2f771 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -7,7 +7,7 @@ UBSAN_SANITIZE := n
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-obj-y += xmon.o nonstdio.o
+obj-y += xmon.o nonstdio.o spr_access.o
ifdef CONFIG_XMON_DISASSEMBLY
obj-y += ppc-dis.o ppc-opc.o
diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S
new file mode 100644
index 0000000..84ad742
--- /dev/null
+++ b/arch/powerpc/xmon/spr_access.S
@@ -0,0 +1,45 @@
+#include <asm/ppc_asm.h>
+
+/* unsigned long xmon_mfspr(sprn, default_value) */
+_GLOBAL(xmon_mfspr)
+ ld r5, .Lmfspr_table@got(r2)
+ b xmon_mxspr
+
+/* void xmon_mtspr(sprn, new_value) */
+_GLOBAL(xmon_mtspr)
+ ld r5, .Lmtspr_table@got(r2)
+ b xmon_mxspr
+
+/*
+ * r3 = sprn
+ * r4 = default or new value
+ * r5 = table base
+ */
+xmon_mxspr:
+ /*
+ * To index into the table of mxsprs we need:
+ * i = (sprn & 0x3ff) * 8
+ * or using rwlinm:
+ * i = (sprn << 3) & (0x3ff << 3)
+ */
+ rlwinm r3, r3, 3, 0x3ff << 3
+ add r5, r5, r3
+ mtctr r5
+ mr r3, r4 /* put default_value in r3 for mfspr */
+ bctr
+
+.Lmfspr_table:
+ spr = 0
+ .rept 1024
+ mfspr r3, spr
+ blr
+ spr = spr + 1
+ .endr
+
+.Lmtspr_table:
+ spr = 0
+ .rept 1024
+ mtspr spr, r4
+ blr
+ spr = spr + 1
+ .endr
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 942796f..c5e1551 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -86,6 +86,7 @@ static char tmpstr[128];
static long bus_error_jmp[JMP_BUF_LEN];
static int catch_memory_errors;
+static int catch_spr_faults;
static long *xmon_fault_jmp[NR_CPUS];
/* Breakpoint stuff */
@@ -147,7 +148,7 @@ void getstring(char *, int);
static void flush_input(void);
static int inchar(void);
static void take_input(char *);
-static unsigned long read_spr(int);
+static int read_spr(int, unsigned long *);
static void write_spr(int, unsigned long);
static void super_regs(void);
static void remove_bpts(void);
@@ -250,6 +251,9 @@ Commands:\n\
sdi # disassemble spu local store for spu # (in hex)\n"
#endif
" S print special registers\n\
+ Sa print all SPRs\n\
+ Sr # read SPR #\n\
+ Sw #v write v to SPR #\n\
t print backtrace\n\
x exit monitor and recover\n\
X exit monitor and don't recover\n"
@@ -442,6 +446,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
#ifdef CONFIG_SMP
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, &cpus_in_xmon)) {
+ /*
+ * We catch SPR read/write faults here because the 0x700, 0xf60
+ * etc. handlers don't call debugger_fault_handler().
+ */
+ if (catch_spr_faults)
+ longjmp(bus_error_jmp, 1);
get_output_lock();
excprint(regs);
printf("cpu 0x%x: Exception %lx %s in xmon, "
@@ -1635,89 +1645,87 @@ static void cacheflush(void)
catch_memory_errors = 0;
}
-static unsigned long
-read_spr(int n)
+extern unsigned long xmon_mfspr(int spr, unsigned long default_value);
+extern void xmon_mtspr(int spr, unsigned long value);
+
+static int
+read_spr(int n, unsigned long *vp)
{
- unsigned int instrs[2];
- unsigned long (*code)(void);
unsigned long ret = -1UL;
-#ifdef CONFIG_PPC64
- unsigned long opd[3];
-
- opd[0] = (unsigned long)instrs;
- opd[1] = 0;
- opd[2] = 0;
- code = (unsigned long (*)(void)) opd;
-#else
- code = (unsigned long (*)(void)) instrs;
-#endif
-
- /* mfspr r3,n; blr */
- instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6);
- instrs[1] = 0x4e800020;
- store_inst(instrs);
- store_inst(instrs+1);
+ int ok = 0;
if (setjmp(bus_error_jmp) == 0) {
- catch_memory_errors = 1;
+ catch_spr_faults = 1;
sync();
- ret = code();
+ ret = xmon_mfspr(n, *vp);
sync();
- /* wait a little while to see if we get a machine check */
- __delay(200);
- n = size;
+ *vp = ret;
+ ok = 1;
}
+ catch_spr_faults = 0;
- return ret;
+ return ok;
}
static void
write_spr(int n, unsigned long val)
{
- unsigned int instrs[2];
- unsigned long (*code)(unsigned long);
-#ifdef CONFIG_PPC64
- unsigned long opd[3];
-
- opd[0] = (unsigned long)instrs;
- opd[1] = 0;
- opd[2] = 0;
- code = (unsigned long (*)(unsigned long)) opd;
-#else
- code = (unsigned long (*)(unsigned long)) instrs;
-#endif
-
- instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6);
- instrs[1] = 0x4e800020;
- store_inst(instrs);
- store_inst(instrs+1);
-
if (setjmp(bus_error_jmp) == 0) {
- catch_memory_errors = 1;
+ catch_spr_faults = 1;
sync();
- code(val);
+ xmon_mtspr(n, val);
sync();
- /* wait a little while to see if we get a machine check */
- __delay(200);
- n = size;
+ } else {
+ printf("SPR 0x%03x (%4d) Faulted during write\n", n, n);
}
+ catch_spr_faults = 0;
}
static unsigned long regno;
extern char exc_prolog;
extern char dec_exc;
+static void dump_one_spr(int spr, bool show_unimplemented)
+{
+ unsigned long val;
+
+ val = 0xdeadbeef;
+ if (!read_spr(spr, &val)) {
+ printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr);
+ return;
+ }
+
+ if (val == 0xdeadbeef) {
+ /* Looks like read was a nop, confirm */
+ val = 0x0badcafe;
+ if (!read_spr(spr, &val)) {
+ printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr);
+ return;
+ }
+
+ if (val == 0x0badcafe) {
+ if (show_unimplemented)
+ printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr);
+ return;
+ }
+ }
+
+ printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val);
+}
+
static void super_regs(void)
{
int cmd;
- unsigned long val;
+ int spr;
cmd = skipbl();
- if (cmd == '\n') {
+
+ switch (cmd) {
+ case '\n': {
unsigned long sp, toc;
asm("mr %0,1" : "=r" (sp) :);
asm("mr %0,2" : "=r" (toc) :);
@@ -1730,21 +1738,29 @@ static void super_regs(void)
mfspr(SPRN_DEC), mfspr(SPRN_SPRG2));
printf("sp = "REG" sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3));
printf("toc = "REG" dar = "REG"\n", toc, mfspr(SPRN_DAR));
-
return;
}
-
- scanhex(&regno);
- switch (cmd) {
- case 'w':
- val = read_spr(regno);
+ case 'w': {
+ unsigned long val;
+ scanhex(&regno);
+ val = 0;
+ read_spr(regno, &val);
scanhex(&val);
write_spr(regno, val);
- /* fall through */
+ dump_one_spr(regno, true);
+ break;
+ }
case 'r':
- printf("spr %lx = %lx\n", regno, read_spr(regno));
+ scanhex(&regno);
+ dump_one_spr(regno, true);
+ break;
+ case 'a':
+ /* dump ALL SPRs */
+ for (spr = 1; spr < 1024; ++spr)
+ dump_one_spr(spr, false);
break;
}
+
scannl();
}
@@ -2913,7 +2929,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid,
printf("%s", after);
}
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
void dump_segments(void)
{
int i;
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index b7b576e..ff44016 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -300,7 +300,7 @@ static int pmu_set_cpu_speed(int low_speed)
_set_L3CR(save_l3cr);
/* Restore userland MMU context */
- switch_mmu_context(NULL, current->active_mm);
+ switch_mmu_context(NULL, current->active_mm, NULL);
#ifdef DEBUG_FREQ
printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1));
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 24f4a78..ff946d5 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -824,10 +824,7 @@ static int mmap_piobufs(struct vm_area_struct *vma,
phys = dd->physaddr + piobufs;
#if defined(__powerpc__)
- /* There isn't a generic way to specify writethrough mappings */
- pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
- pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
- pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#endif
/*
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 4758a38..6abe1c6 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev,
addr = pci_resource_start(pdev, 0);
len = pci_resource_len(pdev, 0);
-#if defined(__powerpc__)
- /* There isn't a generic way to specify writethrough mappings */
- dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU);
-#else
dd->kregbase = ioremap_nocache(addr, len);
-#endif
-
if (!dd->kregbase)
return -ENOMEM;
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index caaec65..465c522 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -154,8 +154,8 @@ static void rackmeter_do_pause(struct rackmeter *rm, int pause)
DBDMA_DO_STOP(rm->dma_regs);
return;
}
- memset(rdma->buf1, 0, SAMPLE_COUNT & sizeof(u32));
- memset(rdma->buf2, 0, SAMPLE_COUNT & sizeof(u32));
+ memset(rdma->buf1, 0, ARRAY_SIZE(rdma->buf1));
+ memset(rdma->buf2, 0, ARRAY_SIZE(rdma->buf2));
rm->dma_buf_v->mark = 0;
@@ -227,6 +227,7 @@ static void rackmeter_do_timer(struct work_struct *work)
total_idle_ticks = get_cpu_idle_time(cpu);
idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle);
+ idle_ticks = min(idle_ticks, total_ticks);
rcpu->prev_idle = total_idle_ticks;
/* We do a very dumb calculation to update the LEDs for now,
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 01ee736..f8b6d14 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1851,7 +1851,7 @@ static int powerbook_sleep_grackle(void)
_set_L2CR(save_l2cr);
/* Restore userland MMU context */
- switch_mmu_context(NULL, current->active_mm);
+ switch_mmu_context(NULL, current->active_mm, NULL);
/* Power things up */
pmu_unlock();
@@ -1940,7 +1940,7 @@ powerbook_sleep_Core99(void)
_set_L3CR(save_l3cr);
/* Restore userland MMU context */
- switch_mmu_context(NULL, current->active_mm);
+ switch_mmu_context(NULL, current->active_mm, NULL);
/* Tell PMU we are ready */
pmu_unlock();
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 2107c94..6d228cc 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -68,15 +68,6 @@ struct cxl_context *cxl_get_context(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(cxl_get_context);
-struct device *cxl_get_phys_dev(struct pci_dev *dev)
-{
- struct cxl_afu *afu;
-
- afu = cxl_pci_to_afu(dev);
-
- return afu->adapter->dev.parent;
-}
-
int cxl_release_context(struct cxl_context *ctx)
{
if (ctx->status >= STARTED)
@@ -192,6 +183,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
ctx->pid = get_task_pid(task, PIDTYPE_PID);
ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
kernel = false;
+ ctx->real_mode = false;
}
cxl_ctx_get();
@@ -228,6 +220,24 @@ void cxl_set_master(struct cxl_context *ctx)
}
EXPORT_SYMBOL_GPL(cxl_set_master);
+int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode)
+{
+ if (ctx->status == STARTED) {
+ /*
+ * We could potentially update the PE and issue an update LLCMD
+ * to support this, but it doesn't seem to have a good use case
+ * since it's trivial to just create a second kernel context
+ * with different translation modes, so until someone convinces
+ * me otherwise:
+ */
+ return -EBUSY;
+ }
+
+ ctx->real_mode = real_mode;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_set_translation_mode);
+
/* wrappers around afu_* file ops which are EXPORTED */
int cxl_fd_open(struct inode *inode, struct file *file)
{
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 7edea9c..26d206b 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -297,8 +297,7 @@ static void reclaim_ctx(struct rcu_head *rcu)
if (ctx->kernelapi)
kfree(ctx->mapping);
- if (ctx->irq_bitmap)
- kfree(ctx->irq_bitmap);
+ kfree(ctx->irq_bitmap);
/* Drop ref to the afu device taken during cxl_context_init */
cxl_afu_put(ctx->afu);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 73dc2a3..4fe5078 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -178,15 +178,6 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_PSL_SR_An_MP (1ull << (63-62)) /* Master Process */
#define CXL_PSL_SR_An_LE (1ull << (63-63)) /* Little Endian */
-/****** CXL_PSL_LLCMD_An ****************************************************/
-#define CXL_LLCMD_TERMINATE 0x0001000000000000ULL
-#define CXL_LLCMD_REMOVE 0x0002000000000000ULL
-#define CXL_LLCMD_SUSPEND 0x0003000000000000ULL
-#define CXL_LLCMD_RESUME 0x0004000000000000ULL
-#define CXL_LLCMD_ADD 0x0005000000000000ULL
-#define CXL_LLCMD_UPDATE 0x0006000000000000ULL
-#define CXL_LLCMD_HANDLE_MASK 0x000000000000ffffULL
-
/****** CXL_PSL_ID_An ****************************************************/
#define CXL_PSL_ID_An_F (1ull << (63-31))
#define CXL_PSL_ID_An_L (1ull << (63-30))
@@ -376,11 +367,13 @@ struct cxl_afu_native {
};
struct cxl_afu_guest {
+ struct cxl_afu *parent;
u64 handle;
phys_addr_t p2n_phys;
u64 p2n_size;
int max_ints;
- struct mutex recovery_lock;
+ bool handle_err;
+ struct delayed_work work_err;
int previous_state;
};
@@ -524,6 +517,7 @@ struct cxl_context {
bool pe_inserted;
bool master;
bool kernel;
+ bool real_mode;
bool pending_irq;
bool pending_fault;
bool pending_afu_err;
@@ -580,6 +574,7 @@ struct cxl {
bool perst_loads_image;
bool perst_select_user;
bool perst_same_image;
+ bool psl_timebase_synced;
};
int cxl_pci_alloc_one_irq(struct cxl *adapter);
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 9a8650b..377e650 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -149,11 +149,13 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
* update_mmu_cache() will not have loaded the hash since current->trap
* is not a 0x400 or 0x300, so just call hash_page_mm() here.
*/
- access = _PAGE_PRESENT;
+ access = _PAGE_PRESENT | _PAGE_READ;
if (dsisr & CXL_PSL_DSISR_An_S)
- access |= _PAGE_RW;
- if ((!ctx->kernel) || ~(dar & (1ULL << 63)))
- access |= _PAGE_USER;
+ access |= _PAGE_WRITE;
+
+ access |= _PAGE_PRIVILEGED;
+ if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
+ access &= ~_PAGE_PRIVILEGED;
if (dsisr & DSISR_NOHPTE)
inv_flags |= HPTE_NOHPTE_UPDATE;
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index 8213372..bc8d0b9 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -178,6 +178,9 @@ static int afu_read_error_state(struct cxl_afu *afu, int *state_out)
u64 state;
int rc = 0;
+ if (!afu)
+ return -EIO;
+
rc = cxl_h_read_error_state(afu->guest->handle, &state);
if (!rc) {
WARN_ON(state != H_STATE_NORMAL &&
@@ -552,6 +555,17 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+
+ /*
+ * Ensure we have at least one interrupt allocated to take faults for
+ * kernel contexts that may not have allocated any AFU IRQs at all:
+ */
+ if (ctx->irqs.range[0] == 0) {
+ rc = afu_register_irqs(ctx, 0);
+ if (rc)
+ goto out_free;
+ }
+
for (r = 0; r < CXL_IRQ_RANGES; r++) {
for (i = 0; i < ctx->irqs.range[r]; i++) {
if (r == 0 && i == 0) {
@@ -597,6 +611,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
enable_afu_irqs(ctx);
}
+out_free:
free_page((u64)elem);
return rc;
}
@@ -605,6 +620,9 @@ static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u
{
pr_devel("in %s\n", __func__);
+ if (ctx->real_mode)
+ return -EPERM;
+
ctx->kernel = kernel;
if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
return attach_afu_directed(ctx, wed, amr);
@@ -818,7 +836,6 @@ static int afu_update_state(struct cxl_afu *afu)
switch (cur_state) {
case H_STATE_NORMAL:
afu->guest->previous_state = cur_state;
- rc = 1;
break;
case H_STATE_DISABLE:
@@ -834,7 +851,6 @@ static int afu_update_state(struct cxl_afu *afu)
pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
pci_channel_io_normal);
pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
- rc = 1;
}
afu->guest->previous_state = 0;
break;
@@ -859,39 +875,30 @@ static int afu_update_state(struct cxl_afu *afu)
return rc;
}
-static int afu_do_recovery(struct cxl_afu *afu)
+static void afu_handle_errstate(struct work_struct *work)
{
- int rc;
+ struct cxl_afu_guest *afu_guest =
+ container_of(to_delayed_work(work), struct cxl_afu_guest, work_err);
- /* many threads can arrive here, in case of detach_all for example.
- * Only one needs to drive the recovery
- */
- if (mutex_trylock(&afu->guest->recovery_lock)) {
- rc = afu_update_state(afu);
- mutex_unlock(&afu->guest->recovery_lock);
- return rc;
- }
- return 0;
+ if (!afu_update_state(afu_guest->parent) &&
+ afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE)
+ return;
+
+ if (afu_guest->handle_err == true)
+ schedule_delayed_work(&afu_guest->work_err,
+ msecs_to_jiffies(3000));
}
static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu)
{
int state;
- if (afu) {
- if (afu_read_error_state(afu, &state) ||
- state != H_STATE_NORMAL) {
- if (afu_do_recovery(afu) > 0) {
- /* check again in case we've just fixed it */
- if (!afu_read_error_state(afu, &state) &&
- state == H_STATE_NORMAL)
- return true;
- }
- return false;
- }
+ if (afu && (!afu_read_error_state(afu, &state))) {
+ if (state == H_STATE_NORMAL)
+ return true;
}
- return true;
+ return false;
}
static int afu_properties_look_ok(struct cxl_afu *afu)
@@ -929,8 +936,6 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
return -ENOMEM;
}
- mutex_init(&afu->guest->recovery_lock);
-
if ((rc = dev_set_name(&afu->dev, "afu%i.%i",
adapter->adapter_num,
slice)))
@@ -986,6 +991,15 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
afu->enabled = true;
+ /*
+ * wake up the cpu periodically to check the state
+ * of the AFU using "afu" stored in the guest structure.
+ */
+ afu->guest->parent = afu;
+ afu->guest->handle_err = true;
+ INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate);
+ schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000));
+
if ((rc = cxl_pci_vphb_add(afu)))
dev_info(&afu->dev, "Can't register vPHB\n");
@@ -1014,6 +1028,10 @@ void cxl_guest_remove_afu(struct cxl_afu *afu)
if (!afu)
return;
+ /* flush and stop pending job */
+ afu->guest->handle_err = false;
+ flush_delayed_work(&afu->guest->work_err);
+
cxl_pci_vphb_remove(afu);
cxl_sysfs_afu_remove(afu);
@@ -1101,6 +1119,12 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic
adapter->dev.release = release_adapter;
dev_set_drvdata(&pdev->dev, adapter);
+ /*
+ * Hypervisor controls PSL timebase initialization (p1 register).
+ * On FW840, PSL is initialized.
+ */
+ adapter->psl_timebase_synced = true;
+
if ((rc = cxl_of_read_adapter_handle(adapter, np)))
goto err1;
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index ecf7557..55d8a14 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -186,16 +186,25 @@ static int spa_max_procs(int spa_size)
int cxl_alloc_spa(struct cxl_afu *afu)
{
+ unsigned spa_size;
+
/* Work out how many pages to allocate */
afu->native->spa_order = 0;
do {
afu->native->spa_order++;
- afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE;
+ spa_size = (1 << afu->native->spa_order) * PAGE_SIZE;
+
+ if (spa_size > 0x100000) {
+ dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
+ afu->native->spa_max_procs, afu->native->spa_size);
+ afu->num_procs = afu->native->spa_max_procs;
+ break;
+ }
+
+ afu->native->spa_size = spa_size;
afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size);
} while (afu->native->spa_max_procs < afu->num_procs);
- WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */
-
if (!(afu->native->spa = (struct cxl_process_element *)
__get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) {
pr_err("cxl_alloc_spa: Unable to allocate scheduled process area\n");
@@ -486,8 +495,9 @@ static u64 calculate_sr(struct cxl_context *ctx)
if (mfspr(SPRN_LPCR) & LPCR_TC)
sr |= CXL_PSL_SR_An_TC;
if (ctx->kernel) {
- sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF);
- sr |= CXL_PSL_SR_An_HV;
+ if (!ctx->real_mode)
+ sr |= CXL_PSL_SR_An_R;
+ sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
} else {
sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
sr &= ~(CXL_PSL_SR_An_HV);
@@ -526,6 +536,15 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+ /*
+ * Ensure we have the multiplexed PSL interrupt set up to take faults
+ * for kernel contexts that may not have allocated any AFU IRQs at all:
+ */
+ if (ctx->irqs.range[0] == 0) {
+ ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+ ctx->irqs.range[0] = 1;
+ }
+
for (r = 0; r < CXL_IRQ_RANGES; r++) {
ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 2844e97..a08fcc8 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -21,6 +21,7 @@
#include <asm/msi_bitmap.h>
#include <asm/pnv-pci.h>
#include <asm/io.h>
+#include <asm/reg.h>
#include "cxl.h"
#include <misc/cxl.h>
@@ -321,12 +322,43 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
#undef show_reg
}
+#define CAPP_UNIT0_ID 0xBA
+#define CAPP_UNIT1_ID 0XBE
+
+static u64 get_capp_unit_id(struct device_node *np)
+{
+ u32 phb_index;
+
+ /*
+ * For chips other than POWER8NVL, we only have CAPP 0,
+ * irrespective of which PHB is used.
+ */
+ if (!pvr_version_is(PVR_POWER8NVL))
+ return CAPP_UNIT0_ID;
+
+ /*
+ * For POWER8NVL, assume CAPP 0 is attached to PHB0 and
+ * CAPP 1 is attached to PHB1.
+ */
+ if (of_property_read_u32(np, "ibm,phb-index", &phb_index))
+ return 0;
+
+ if (phb_index == 0)
+ return CAPP_UNIT0_ID;
+
+ if (phb_index == 1)
+ return CAPP_UNIT1_ID;
+
+ return 0;
+}
+
static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev *dev)
{
struct device_node *np;
const __be32 *prop;
u64 psl_dsnctl;
u64 chipid;
+ u64 capp_unit_id;
if (!(np = pnv_pci_get_phb_node(dev)))
return -ENODEV;
@@ -336,10 +368,19 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev
if (!np)
return -ENODEV;
chipid = be32_to_cpup(prop);
+ capp_unit_id = get_capp_unit_id(np);
of_node_put(np);
+ if (!capp_unit_id) {
+ pr_err("cxl: invalid capp unit id\n");
+ return -ENODEV;
+ }
+ psl_dsnctl = 0x0000900000000000ULL; /* pteupd ttype, scdone */
+ psl_dsnctl |= (0x2ULL << (63-38)); /* MMIO hang pulse: 256 us */
/* Tell PSL where to route data to */
- psl_dsnctl = 0x02E8900002000000ULL | (chipid << (63-5));
+ psl_dsnctl |= (chipid << (63-5));
+ psl_dsnctl |= (capp_unit_id << (63-13));
+
cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl);
cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL);
/* snoop write mask */
@@ -355,22 +396,24 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev
#define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6))
#define _2048_250MHZ_CYCLES 1
-static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
+static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
{
u64 psl_tb;
int delta;
unsigned int retry = 0;
struct device_node *np;
+ adapter->psl_timebase_synced = false;
+
if (!(np = pnv_pci_get_phb_node(dev)))
- return -ENODEV;
+ return;
/* Do not fail when CAPP timebase sync is not supported by OPAL */
of_node_get(np);
if (! of_get_property(np, "ibm,capp-timebase-sync", NULL)) {
of_node_put(np);
- pr_err("PSL: Timebase sync: OPAL support missing\n");
- return 0;
+ dev_info(&dev->dev, "PSL timebase inactive: OPAL support missing\n");
+ return;
}
of_node_put(np);
@@ -389,8 +432,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
do {
msleep(1);
if (retry++ > 5) {
- pr_err("PSL: Timebase sync: giving up!\n");
- return -EIO;
+ dev_info(&dev->dev, "PSL timebase can't synchronize\n");
+ return;
}
psl_tb = cxl_p1_read(adapter, CXL_PSL_Timebase);
delta = mftb() - psl_tb;
@@ -398,7 +441,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
delta = -delta;
} while (tb_to_ns(delta) > 16000);
- return 0;
+ adapter->psl_timebase_synced = true;
+ return;
}
static int init_implementation_afu_regs(struct cxl_afu *afu)
@@ -1144,8 +1188,8 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
if ((rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON)))
goto err;
- if ((rc = cxl_setup_psl_timebase(adapter, dev)))
- goto err;
+ /* Ignore error, adapter init is not dependant on timebase sync */
+ cxl_setup_psl_timebase(adapter, dev);
if ((rc = cxl_native_register_psl_err_irq(adapter)))
goto err;
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index 25913c0..b043c20 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -57,6 +57,15 @@ static ssize_t image_loaded_show(struct device *device,
return scnprintf(buf, PAGE_SIZE, "factory\n");
}
+static ssize_t psl_timebase_synced_show(struct device *device,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl *adapter = to_cxl_adapter(device);
+
+ return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced);
+}
+
static ssize_t reset_adapter_store(struct device *device,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -142,6 +151,7 @@ static struct device_attribute adapter_attrs[] = {
__ATTR_RO(psl_revision),
__ATTR_RO(base_image),
__ATTR_RO(image_loaded),
+ __ATTR_RO(psl_timebase_synced),
__ATTR_RW(load_image_on_perst),
__ATTR_RW(perst_reloads_same_image),
__ATTR(reset, S_IWUSR, NULL, reset_adapter_store),
diff --git a/drivers/of/base.c b/drivers/of/base.c
index b299de2..64018eb 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1777,6 +1777,9 @@ int of_remove_property(struct device_node *np, struct property *prop)
unsigned long flags;
int rc;
+ if (!prop)
+ return -ENODEV;
+
mutex_lock(&of_mutex);
raw_spin_lock_irqsave(&devtree_lock, flags);
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c
index b46b57d..dc67f39 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -175,7 +175,7 @@ static int dlpar_add_pci_slot(char *drc_name, struct device_node *dn)
struct pci_dev *dev;
struct pci_controller *phb;
- if (pcibios_find_pci_bus(dn))
+ if (pci_find_bus_by_node(dn))
return -EINVAL;
/* Add pci bus */
@@ -212,7 +212,7 @@ static int dlpar_remove_phb(char *drc_name, struct device_node *dn)
struct pci_dn *pdn;
int rc = 0;
- if (!pcibios_find_pci_bus(dn))
+ if (!pci_find_bus_by_node(dn))
return -EINVAL;
/* If pci slot is hotpluggable, use hotplug to remove it */
@@ -356,7 +356,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
pci_lock_rescan_remove();
- bus = pcibios_find_pci_bus(dn);
+ bus = pci_find_bus_by_node(dn);
if (!bus) {
ret = -EINVAL;
goto out;
@@ -380,7 +380,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
}
/* Remove all devices below slot */
- pcibios_remove_pci_devices(bus);
+ pci_hp_remove_devices(bus);
/* Unmap PCI IO space */
if (pcibios_unmap_io_space(bus)) {
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index 611f605..8d13202 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -404,7 +404,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
if (state == PRESENT) {
pci_lock_rescan_remove();
- pcibios_add_pci_devices(slot->bus);
+ pci_hp_add_devices(slot->bus);
pci_unlock_rescan_remove();
slot->state = CONFIGURED;
} else if (state == EMPTY) {
@@ -426,7 +426,7 @@ static int disable_slot(struct hotplug_slot *hotplug_slot)
return -EINVAL;
pci_lock_rescan_remove();
- pcibios_remove_pci_devices(slot->bus);
+ pci_hp_remove_devices(slot->bus);
pci_unlock_rescan_remove();
vm_unmap_aliases();
diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c
index 7836d69..ea41ea1 100644
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -93,7 +93,7 @@ int rpaphp_enable_slot(struct slot *slot)
if (rc)
return rc;
- bus = pcibios_find_pci_bus(slot->dn);
+ bus = pci_find_bus_by_node(slot->dn);
if (!bus) {
err("%s: no pci_bus for dn %s\n", __func__, slot->dn->full_name);
return -EINVAL;
@@ -116,7 +116,7 @@ int rpaphp_enable_slot(struct slot *slot)
}
if (list_empty(&bus->devices))
- pcibios_add_pci_devices(bus);
+ pci_hp_add_devices(bus);
if (!list_empty(&bus->devices)) {
info->adapter_status = CONFIGURED;
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index 61cf61a..4d7bc3f 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -228,7 +228,7 @@ static int electra_cf_probe(struct platform_device *ofdev)
if (!cf->mem_base || !cf->io_virt || !cf->gpio_base ||
(__ioremap_at(io.start, cf->io_virt, cf->io_size,
- _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)) {
+ pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) {
dev_err(device, "can't ioremap ranges\n");
status = -ENOMEM;
goto fail1;
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 0582b72..3054e3f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -1188,7 +1188,8 @@ static int tce_iommu_attach_group(void *iommu_data,
goto unlock_exit;
}
table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
- if (table_group_tmp->ops != table_group->ops) {
+ if (table_group_tmp->ops->create_table !=
+ table_group->ops->create_table) {
pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
iommu_group_id(iommu_group),
iommu_group_id(tcegrp->grp));
diff --git a/include/misc/cxl.h b/include/misc/cxl.h
index 7d5e261..56560c5 100644
--- a/include/misc/cxl.h
+++ b/include/misc/cxl.h
@@ -127,6 +127,14 @@ int cxl_afu_reset(struct cxl_context *ctx);
void cxl_set_master(struct cxl_context *ctx);
/*
+ * Sets the context to use real mode memory accesses to operate with
+ * translation disabled. Note that this only makes sense for kernel contexts
+ * under bare metal, and will not work with virtualisation. May only be
+ * performed on stopped contexts.
+ */
+int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode);
+
+/*
* Map and unmap the AFU Problem Space area. The amount and location mapped
* depends on if this context is a master or slave.
*/
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e8d792..a6c8252 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3456,11 +3456,23 @@ struct ftrace_glob {
int type;
};
+/*
+ * If symbols in an architecture don't correspond exactly to the user-visible
+ * name of what they represent, it is possible to define this function to
+ * perform the necessary adjustments.
+*/
+char * __weak arch_ftrace_match_adjust(char *str, const char *search)
+{
+ return str;
+}
+
static int ftrace_match(char *str, struct ftrace_glob *g)
{
int matched = 0;
int slen;
+ str = arch_ftrace_match_adjust(str, g->search);
+
switch (g->type) {
case MATCH_FULL:
if (strcmp(str, g->search) == 0)
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
new file mode 100644
index 0000000..75de0e9
--- /dev/null
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -0,0 +1,69 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+#define PERF_REGS_MAX PERF_REG_POWERPC_MAX
+#ifdef __powerpc64__
+ #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64
+#else
+ #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32
+#endif
+
+#define PERF_REG_IP PERF_REG_POWERPC_NIP
+#define PERF_REG_SP PERF_REG_POWERPC_R1
+
+static const char *reg_names[] = {
+ [PERF_REG_POWERPC_R0] = "r0",
+ [PERF_REG_POWERPC_R1] = "r1",
+ [PERF_REG_POWERPC_R2] = "r2",
+ [PERF_REG_POWERPC_R3] = "r3",
+ [PERF_REG_POWERPC_R4] = "r4",
+ [PERF_REG_POWERPC_R5] = "r5",
+ [PERF_REG_POWERPC_R6] = "r6",
+ [PERF_REG_POWERPC_R7] = "r7",
+ [PERF_REG_POWERPC_R8] = "r8",
+ [PERF_REG_POWERPC_R9] = "r9",
+ [PERF_REG_POWERPC_R10] = "r10",
+ [PERF_REG_POWERPC_R11] = "r11",
+ [PERF_REG_POWERPC_R12] = "r12",
+ [PERF_REG_POWERPC_R13] = "r13",
+ [PERF_REG_POWERPC_R14] = "r14",
+ [PERF_REG_POWERPC_R15] = "r15",
+ [PERF_REG_POWERPC_R16] = "r16",
+ [PERF_REG_POWERPC_R17] = "r17",
+ [PERF_REG_POWERPC_R18] = "r18",
+ [PERF_REG_POWERPC_R19] = "r19",
+ [PERF_REG_POWERPC_R20] = "r20",
+ [PERF_REG_POWERPC_R21] = "r21",
+ [PERF_REG_POWERPC_R22] = "r22",
+ [PERF_REG_POWERPC_R23] = "r23",
+ [PERF_REG_POWERPC_R24] = "r24",
+ [PERF_REG_POWERPC_R25] = "r25",
+ [PERF_REG_POWERPC_R26] = "r26",
+ [PERF_REG_POWERPC_R27] = "r27",
+ [PERF_REG_POWERPC_R28] = "r28",
+ [PERF_REG_POWERPC_R29] = "r29",
+ [PERF_REG_POWERPC_R30] = "r30",
+ [PERF_REG_POWERPC_R31] = "r31",
+ [PERF_REG_POWERPC_NIP] = "nip",
+ [PERF_REG_POWERPC_MSR] = "msr",
+ [PERF_REG_POWERPC_ORIG_R3] = "orig_r3",
+ [PERF_REG_POWERPC_CTR] = "ctr",
+ [PERF_REG_POWERPC_LINK] = "link",
+ [PERF_REG_POWERPC_XER] = "xer",
+ [PERF_REG_POWERPC_CCR] = "ccr",
+ [PERF_REG_POWERPC_SOFTE] = "softe",
+ [PERF_REG_POWERPC_TRAP] = "trap",
+ [PERF_REG_POWERPC_DAR] = "dar",
+ [PERF_REG_POWERPC_DSISR] = "dsisr"
+};
+
+static inline const char *perf_reg_name(int id)
+{
+ return reg_names[id];
+}
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index c8fe207..90ad64b 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,6 +1,8 @@
libperf-y += header.o
libperf-y += sym-handling.o
libperf-y += kvm-stat.o
+libperf-y += perf_regs.o
libperf-$(CONFIG_DWARF) += dwarf-regs.o
libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
+libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
new file mode 100644
index 0000000..a3c3e1c
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -0,0 +1,49 @@
+#include "../../perf.h"
+#include "../../util/perf_regs.h"
+
+const struct sample_reg sample_reg_masks[] = {
+ SMPL_REG(r0, PERF_REG_POWERPC_R0),
+ SMPL_REG(r1, PERF_REG_POWERPC_R1),
+ SMPL_REG(r2, PERF_REG_POWERPC_R2),
+ SMPL_REG(r3, PERF_REG_POWERPC_R3),
+ SMPL_REG(r4, PERF_REG_POWERPC_R4),
+ SMPL_REG(r5, PERF_REG_POWERPC_R5),
+ SMPL_REG(r6, PERF_REG_POWERPC_R6),
+ SMPL_REG(r7, PERF_REG_POWERPC_R7),
+ SMPL_REG(r8, PERF_REG_POWERPC_R8),
+ SMPL_REG(r9, PERF_REG_POWERPC_R9),
+ SMPL_REG(r10, PERF_REG_POWERPC_R10),
+ SMPL_REG(r11, PERF_REG_POWERPC_R11),
+ SMPL_REG(r12, PERF_REG_POWERPC_R12),
+ SMPL_REG(r13, PERF_REG_POWERPC_R13),
+ SMPL_REG(r14, PERF_REG_POWERPC_R14),
+ SMPL_REG(r15, PERF_REG_POWERPC_R15),
+ SMPL_REG(r16, PERF_REG_POWERPC_R16),
+ SMPL_REG(r17, PERF_REG_POWERPC_R17),
+ SMPL_REG(r18, PERF_REG_POWERPC_R18),
+ SMPL_REG(r19, PERF_REG_POWERPC_R19),
+ SMPL_REG(r20, PERF_REG_POWERPC_R20),
+ SMPL_REG(r21, PERF_REG_POWERPC_R21),
+ SMPL_REG(r22, PERF_REG_POWERPC_R22),
+ SMPL_REG(r23, PERF_REG_POWERPC_R23),
+ SMPL_REG(r24, PERF_REG_POWERPC_R24),
+ SMPL_REG(r25, PERF_REG_POWERPC_R25),
+ SMPL_REG(r26, PERF_REG_POWERPC_R26),
+ SMPL_REG(r27, PERF_REG_POWERPC_R27),
+ SMPL_REG(r28, PERF_REG_POWERPC_R28),
+ SMPL_REG(r29, PERF_REG_POWERPC_R29),
+ SMPL_REG(r30, PERF_REG_POWERPC_R30),
+ SMPL_REG(r31, PERF_REG_POWERPC_R31),
+ SMPL_REG(nip, PERF_REG_POWERPC_NIP),
+ SMPL_REG(msr, PERF_REG_POWERPC_MSR),
+ SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3),
+ SMPL_REG(ctr, PERF_REG_POWERPC_CTR),
+ SMPL_REG(link, PERF_REG_POWERPC_LINK),
+ SMPL_REG(xer, PERF_REG_POWERPC_XER),
+ SMPL_REG(ccr, PERF_REG_POWERPC_CCR),
+ SMPL_REG(softe, PERF_REG_POWERPC_SOFTE),
+ SMPL_REG(trap, PERF_REG_POWERPC_TRAP),
+ SMPL_REG(dar, PERF_REG_POWERPC_DAR),
+ SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
+ SMPL_REG_END
+};
diff --git a/tools/perf/arch/powerpc/util/unwind-libunwind.c b/tools/perf/arch/powerpc/util/unwind-libunwind.c
new file mode 100644
index 0000000..9e15f92
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/unwind-libunwind.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2016 Chandan Kumar, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <errno.h>
+#include <libunwind.h>
+#include <asm/perf_regs.h>
+#include "../../util/unwind.h"
+#include "../../util/debug.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+ switch (regnum) {
+ case UNW_PPC64_R0:
+ return PERF_REG_POWERPC_R0;
+ case UNW_PPC64_R1:
+ return PERF_REG_POWERPC_R1;
+ case UNW_PPC64_R2:
+ return PERF_REG_POWERPC_R2;
+ case UNW_PPC64_R3:
+ return PERF_REG_POWERPC_R3;
+ case UNW_PPC64_R4:
+ return PERF_REG_POWERPC_R4;
+ case UNW_PPC64_R5:
+ return PERF_REG_POWERPC_R5;
+ case UNW_PPC64_R6:
+ return PERF_REG_POWERPC_R6;
+ case UNW_PPC64_R7:
+ return PERF_REG_POWERPC_R7;
+ case UNW_PPC64_R8:
+ return PERF_REG_POWERPC_R8;
+ case UNW_PPC64_R9:
+ return PERF_REG_POWERPC_R9;
+ case UNW_PPC64_R10:
+ return PERF_REG_POWERPC_R10;
+ case UNW_PPC64_R11:
+ return PERF_REG_POWERPC_R11;
+ case UNW_PPC64_R12:
+ return PERF_REG_POWERPC_R12;
+ case UNW_PPC64_R13:
+ return PERF_REG_POWERPC_R13;
+ case UNW_PPC64_R14:
+ return PERF_REG_POWERPC_R14;
+ case UNW_PPC64_R15:
+ return PERF_REG_POWERPC_R15;
+ case UNW_PPC64_R16:
+ return PERF_REG_POWERPC_R16;
+ case UNW_PPC64_R17:
+ return PERF_REG_POWERPC_R17;
+ case UNW_PPC64_R18:
+ return PERF_REG_POWERPC_R18;
+ case UNW_PPC64_R19:
+ return PERF_REG_POWERPC_R19;
+ case UNW_PPC64_R20:
+ return PERF_REG_POWERPC_R20;
+ case UNW_PPC64_R21:
+ return PERF_REG_POWERPC_R21;
+ case UNW_PPC64_R22:
+ return PERF_REG_POWERPC_R22;
+ case UNW_PPC64_R23:
+ return PERF_REG_POWERPC_R23;
+ case UNW_PPC64_R24:
+ return PERF_REG_POWERPC_R24;
+ case UNW_PPC64_R25:
+ return PERF_REG_POWERPC_R25;
+ case UNW_PPC64_R26:
+ return PERF_REG_POWERPC_R26;
+ case UNW_PPC64_R27:
+ return PERF_REG_POWERPC_R27;
+ case UNW_PPC64_R28:
+ return PERF_REG_POWERPC_R28;
+ case UNW_PPC64_R29:
+ return PERF_REG_POWERPC_R29;
+ case UNW_PPC64_R30:
+ return PERF_REG_POWERPC_R30;
+ case UNW_PPC64_R31:
+ return PERF_REG_POWERPC_R31;
+ case UNW_PPC64_LR:
+ return PERF_REG_POWERPC_LINK;
+ case UNW_PPC64_CTR:
+ return PERF_REG_POWERPC_CTR;
+ case UNW_PPC64_XER:
+ return PERF_REG_POWERPC_XER;
+ case UNW_PPC64_NIP:
+ return PERF_REG_POWERPC_NIP;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+ return -EINVAL;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 1e46277..5ad0255 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -23,6 +23,12 @@ $(call detected_var,ARCH)
NO_PERF_REGS := 1
+# Additional ARCH settings for ppc
+ifeq ($(ARCH),powerpc)
+ NO_PERF_REGS := 0
+ LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
+endif
+
# Additional ARCH settings for x86
ifeq ($(ARCH),x86)
$(call detected,CONFIG_X86)
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 6b8eb13..c4023f2 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -12,18 +12,18 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
int i, idx = 0;
u64 mask = regs->mask;
- if (regs->cache_mask & (1 << id))
+ if (regs->cache_mask & (1ULL << id))
goto out;
- if (!(mask & (1 << id)))
+ if (!(mask & (1ULL << id)))
return -EINVAL;
for (i = 0; i < id; i++) {
- if (mask & (1 << i))
+ if (mask & (1ULL << i))
idx++;
}
- regs->cache_mask |= (1 << id);
+ regs->cache_mask |= (1ULL << id);
regs->cache_regs[id] = regs->regs[idx];
out:
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index b08f77c..4ca83fe 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -14,6 +14,7 @@ export CFLAGS
SUB_DIRS = benchmarks \
copyloops \
+ context_switch \
dscr \
mm \
pmu \
diff --git a/tools/testing/selftests/powerpc/context_switch/.gitignore b/tools/testing/selftests/powerpc/context_switch/.gitignore
new file mode 100644
index 0000000..c1431af
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/.gitignore
@@ -0,0 +1 @@
+cp_abort
diff --git a/tools/testing/selftests/powerpc/context_switch/Makefile b/tools/testing/selftests/powerpc/context_switch/Makefile
new file mode 100644
index 0000000..e164d14
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/Makefile
@@ -0,0 +1,10 @@
+TEST_PROGS := cp_abort
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+ rm -f $(TEST_PROGS)
diff --git a/tools/testing/selftests/powerpc/context_switch/cp_abort.c b/tools/testing/selftests/powerpc/context_switch/cp_abort.c
new file mode 100644
index 0000000..5a5b55a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/cp_abort.c
@@ -0,0 +1,110 @@
+/*
+ * Adapted from Anton Blanchard's context switch microbenchmark.
+ *
+ * Copyright 2009, Anton Blanchard, IBM Corporation.
+ * Copyright 2016, Mikey Neuling, Chris Smart, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This program tests the copy paste abort functionality of a P9
+ * (or later) by setting up two processes on the same CPU, one
+ * which executes the copy instruction and the other which
+ * executes paste.
+ *
+ * The paste instruction should never succeed, as the cp_abort
+ * instruction is called by the kernel during a context switch.
+ *
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <sched.h>
+
+#define READ_FD 0
+#define WRITE_FD 1
+
+#define NUM_LOOPS 1000
+
+/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define PASTE(RA, RB, L, RC) \
+ .long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31))
+
+int paste(void *i)
+{
+ int cr;
+
+ asm volatile(str(PASTE(0, %1, 1, 1))";"
+ "mfcr %0;"
+ : "=r" (cr)
+ : "b" (i)
+ : "memory"
+ );
+ return cr;
+}
+
+/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define COPY(RA, RB, L) \
+ .long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10))
+
+void copy(void *i)
+{
+ asm volatile(str(COPY(0, %0, 1))";"
+ :
+ : "b" (i)
+ : "memory"
+ );
+}
+
+int test_cp_abort(void)
+{
+ /* 128 bytes for a full cache line */
+ char buf[128] __cacheline_aligned;
+ cpu_set_t cpuset;
+ int fd1[2], fd2[2], pid;
+ char c;
+
+ /* only run this test on a P9 or later */
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00));
+
+ /*
+ * Run both processes on the same CPU, so that copy is more likely
+ * to leak into a paste.
+ */
+ CPU_ZERO(&cpuset);
+ CPU_SET(pick_online_cpu(), &cpuset);
+ FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset));
+
+ FAIL_IF(pipe(fd1) || pipe(fd2));
+
+ pid = fork();
+ FAIL_IF(pid < 0);
+
+ if (!pid) {
+ for (int i = 0; i < NUM_LOOPS; i++) {
+ FAIL_IF((write(fd1[WRITE_FD], &c, 1)) != 1);
+ FAIL_IF((read(fd2[READ_FD], &c, 1)) != 1);
+ /* A paste succeeds if CR0 EQ bit is set */
+ FAIL_IF(paste(buf) & 0x20000000);
+ }
+ } else {
+ for (int i = 0; i < NUM_LOOPS; i++) {
+ FAIL_IF((read(fd1[READ_FD], &c, 1)) != 1);
+ copy(buf);
+ FAIL_IF((write(fd2[WRITE_FD], &c, 1) != 1));
+ }
+ }
+ return 0;
+
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_cp_abort, "cp_abort");
+}
diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c
index 440180f..35ade74 100644
--- a/tools/testing/selftests/powerpc/mm/subpage_prot.c
+++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c
@@ -73,7 +73,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write)
want_fault |= (subpage == ((page + 1) % 16));
if (faulted != want_fault) {
- printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
+ printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
addr, page, subpage, write,
want_fault ? "fault" : "pass",
faulted ? "fault" : "pass");
@@ -82,7 +82,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write)
if (faulted) {
if (dar != addr) {
- printf("Fault expected at 0x%p and happened at 0x%p !\n",
+ printf("Fault expected at %p and happened at %p !\n",
addr, dar);
}
faulted = 0;
@@ -162,7 +162,7 @@ int test_anon(void)
mallocblock = (void *)align;
- printf("allocated malloc block of 0x%lx bytes at 0x%p\n",
+ printf("allocated malloc block of 0x%lx bytes at %p\n",
mallocsize, mallocblock);
printf("testing malloc block...\n");
@@ -197,7 +197,7 @@ int test_file(void)
perror("failed to map file");
return 1;
}
- printf("allocated %s for 0x%lx bytes at 0x%p\n",
+ printf("allocated %s for 0x%lx bytes at %p\n",
file_name, filesize, fileblock);
printf("testing file map...\n");
@@ -207,14 +207,16 @@ int test_file(void)
int main(int argc, char *argv[])
{
- test_harness(test_anon, "subpage_prot_anon");
+ int rc;
+
+ rc = test_harness(test_anon, "subpage_prot_anon");
+ if (rc)
+ return rc;
if (argc > 1)
file_name = argv[1];
else
file_name = "tempfile";
- test_harness(test_file, "subpage_prot_file");
-
- return 0;
+ return test_harness(test_file, "subpage_prot_file");
}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
index e67452f..46681fe 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
@@ -15,7 +15,6 @@
#include <sys/ioctl.h>
#include "trace.h"
-#include "reg.h"
#include "ebb.h"
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
index 5b1188f..f923228 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
@@ -7,7 +7,6 @@
#include <stdlib.h>
#include "ebb.h"
-#include "reg.h"
/*
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg.h b/tools/testing/selftests/powerpc/reg.h
index 5921b0d..65bfdee 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/reg.h
+++ b/tools/testing/selftests/powerpc/reg.h
@@ -9,12 +9,12 @@
#define __stringify_1(x) #x
#define __stringify(x) __stringify_1(x)
-#define mfspr(rn) ({unsigned long rval; \
- asm volatile("mfspr %0," __stringify(rn) \
- : "=r" (rval)); rval; })
-#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : \
- : "r" ((unsigned long)(v)) \
- : "memory")
+#define mfspr(rn) ({unsigned long rval; \
+ asm volatile("mfspr %0," _str(rn) \
+ : "=r" (rval)); rval; })
+#define mtspr(rn, v) asm volatile("mtspr " _str(rn) ",%0" : \
+ : "r" ((unsigned long)(v)) \
+ : "memory")
#define mb() asm volatile("sync" : : : "memory");
@@ -46,4 +46,10 @@
#define SPRN_SDAR 781
#define SPRN_SIER 768
+#define SPRN_TEXASR 0x82
+#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */
+#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */
+#define TEXASR_FS 0x08000000
+#define SPRN_TAR 0x32f
+
#endif /* _SELFTESTS_POWERPC_REG_H */
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 7d0f14b..bb942db 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -3,3 +3,6 @@ tm-syscall
tm-signal-msr-resv
tm-signal-stack
tm-vmxcopy
+tm-fork
+tm-tar
+tm-tmspr
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 737f72c..d0505db 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr
all: $(TEST_PROGS)
@@ -6,6 +6,7 @@ $(TEST_PROGS): ../harness.c ../utils.c
tm-syscall: tm-syscall-asm.S
tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
+tm-tmspr: CFLAGS += -pthread
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c
new file mode 100644
index 0000000..8d48579
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-fork.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Edited: Rashmica Gupta, Nov 2015
+ *
+ * This test does a fork syscall inside a transaction. Basic sniff test
+ * to see if we can enter the kernel during a transaction.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int test_fork(void)
+{
+ SKIP_IF(!have_htm());
+
+ asm __volatile__(
+ "tbegin.;"
+ "blt 1f; "
+ "li 0, 2;" /* fork syscall */
+ "sc ;"
+ "tend.;"
+ "1: ;"
+ : : : "memory", "r0");
+ /* If we reach here, we've passed. Otherwise we've probably crashed
+ * the kernel */
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_fork, "tm_fork");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
index 8fde93d..d9c49f4 100644
--- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
@@ -31,12 +31,6 @@
#include "utils.h"
#include "tm.h"
-#define TBEGIN ".long 0x7C00051D ;"
-#define TEND ".long 0x7C00055D ;"
-#define TCHECK ".long 0x7C00059C ;"
-#define TSUSPEND ".long 0x7C0005DD ;"
-#define TRESUME ".long 0x7C2005DD ;"
-#define SPRN_TEXASR 0x82
#define SPRN_DSCR 0x03
int test_body(void)
@@ -55,13 +49,13 @@ int test_body(void)
"mtspr %[sprn_dscr], 3;"
/* start and suspend a transaction */
- TBEGIN
+ "tbegin.;"
"beq 1f;"
- TSUSPEND
+ "tsuspend.;"
/* hard loop until the transaction becomes doomed */
"2: ;"
- TCHECK
+ "tcheck 0;"
"bc 4, 0, 2b;"
/* record DSCR and TEXASR */
@@ -70,8 +64,8 @@ int test_body(void)
"mfspr 3, %[sprn_texasr];"
"std 3, %[texasr];"
- TRESUME
- TEND
+ "tresume.;"
+ "tend.;"
"li %[rv], 0;"
"1: ;"
: [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
index e44a238..1f0eb56 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
@@ -60,9 +60,9 @@ int tm_signal_stack()
exit(1);
asm volatile("li 1, 0 ;" /* stack ptr == NULL */
"1:"
- ".long 0x7C00051D ;" /* tbegin */
+ "tbegin.;"
"beq 1b ;" /* retry forever */
- ".long 0x7C0005DD ; ;" /* tsuspend */
+ "tsuspend.;"
"ld 2, 0(1) ;" /* trigger segv" */
: : : "memory");
diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c
new file mode 100644
index 0000000..2d2fcc2b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tar.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ * Original: Michael Neuling 19/7/2013
+ * Edited: Rashmica Gupta 01/12/2015
+ *
+ * Do some transactions, see if the tar is corrupted.
+ * If the transaction is aborted, the TAR should be rolled back to the
+ * checkpointed value before the transaction began. The value written to
+ * TAR in suspended mode should only remain in TAR if the transaction
+ * completes.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int num_loops = 10000;
+
+int test_tar(void)
+{
+ int i;
+
+ SKIP_IF(!have_htm());
+
+ for (i = 0; i < num_loops; i++)
+ {
+ uint64_t result = 0;
+ asm __volatile__(
+ "li 7, 1;"
+ "mtspr %[tar], 7;" /* tar = 1 */
+ "tbegin.;"
+ "beq 3f;"
+ "li 4, 0x7000;" /* Loop lots, to use time */
+ "2:;" /* Start loop */
+ "li 7, 2;"
+ "mtspr %[tar], 7;" /* tar = 2 */
+ "tsuspend.;"
+ "li 7, 3;"
+ "mtspr %[tar], 7;" /* tar = 3 */
+ "tresume.;"
+ "subi 4, 4, 1;"
+ "cmpdi 4, 0;"
+ "bne 2b;"
+ "tend.;"
+
+ /* Transaction sucess! TAR should be 3 */
+ "mfspr 7, %[tar];"
+ "ori %[res], 7, 4;" // res = 3|4 = 7
+ "b 4f;"
+
+ /* Abort handler. TAR should be rolled back to 1 */
+ "3:;"
+ "mfspr 7, %[tar];"
+ "ori %[res], 7, 8;" // res = 1|8 = 9
+ "4:;"
+
+ : [res]"=r"(result)
+ : [tar]"i"(SPRN_TAR)
+ : "memory", "r0", "r4", "r7");
+
+ /* If result is anything else other than 7 or 9, the tar
+ * value must have been corrupted. */
+ if ((result != 7) && (result != 9))
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ /* A low number of iterations (eg 100) can cause a false pass */
+ if (argc > 1) {
+ if (strcmp(argv[1], "-h") == 0) {
+ printf("Syntax:\n\t%s [<num loops>]\n",
+ argv[0]);
+ return 1;
+ } else {
+ num_loops = atoi(argv[1]);
+ }
+ }
+
+ printf("Starting, %d loops\n", num_loops);
+
+ return test_harness(test_tar, "tm_tar");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
new file mode 100644
index 0000000..2bda81c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Original: Michael Neuling 3/4/2014
+ * Modified: Rashmica Gupta 8/12/2015
+ *
+ * Check if any of the Transaction Memory SPRs get corrupted.
+ * - TFIAR - stores address of location of transaction failure
+ * - TFHAR - stores address of software failure handler (if transaction
+ * fails)
+ * - TEXASR - lots of info about the transacion(s)
+ *
+ * (1) create more threads than cpus
+ * (2) in each thread:
+ * (a) set TFIAR and TFHAR a unique value
+ * (b) loop for awhile, continually checking to see if
+ * either register has been corrupted.
+ *
+ * (3) Loop:
+ * (a) begin transaction
+ * (b) abort transaction
+ * (c) check TEXASR to see if FS has been corrupted
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int num_loops = 10000;
+int passed = 1;
+
+void tfiar_tfhar(void *in)
+{
+ int i, cpu;
+ unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd;
+ cpu_set_t cpuset;
+
+ CPU_ZERO(&cpuset);
+ cpu = (unsigned long)in >> 1;
+ CPU_SET(cpu, &cpuset);
+ sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+ /* TFIAR: Last bit has to be high so userspace can read register */
+ tfiar = ((unsigned long)in) + 1;
+ tfiar += 2;
+ mtspr(SPRN_TFIAR, tfiar);
+
+ /* TFHAR: Last two bits are reserved */
+ tfhar = ((unsigned long)in);
+ tfhar &= ~0x3UL;
+ tfhar += 4;
+ mtspr(SPRN_TFHAR, tfhar);
+
+ for (i = 0; i < num_loops; i++) {
+ tfhar_rd = mfspr(SPRN_TFHAR);
+ tfiar_rd = mfspr(SPRN_TFIAR);
+ if ( (tfhar != tfhar_rd) || (tfiar != tfiar_rd) ) {
+ passed = 0;
+ return;
+ }
+ }
+ return;
+}
+
+void texasr(void *in)
+{
+ unsigned long i;
+ uint64_t result = 0;
+
+ for (i = 0; i < num_loops; i++) {
+ asm __volatile__(
+ "tbegin.;"
+ "beq 3f ;"
+ "tabort. 0 ;"
+ "tend.;"
+
+ /* Abort handler */
+ "3: ;"
+ ::: "memory");
+
+ /* Check the TEXASR */
+ result = mfspr(SPRN_TEXASR);
+ if ((result & TEXASR_FS) == 0) {
+ passed = 0;
+ return;
+ }
+ }
+ return;
+}
+
+int test_tmspr()
+{
+ pthread_t thread;
+ int thread_num;
+ unsigned long i;
+
+ SKIP_IF(!have_htm());
+
+ /* To cause some context switching */
+ thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN);
+
+ /* Test TFIAR and TFHAR */
+ for (i = 0 ; i < thread_num ; i += 2){
+ if (pthread_create(&thread, NULL, (void*)tfiar_tfhar, (void *)i))
+ return EXIT_FAILURE;
+ }
+ if (pthread_join(thread, NULL) != 0)
+ return EXIT_FAILURE;
+
+ /* Test TEXASR */
+ for (i = 0 ; i < thread_num ; i++){
+ if (pthread_create(&thread, NULL, (void*)texasr, (void *)i))
+ return EXIT_FAILURE;
+ }
+ if (pthread_join(thread, NULL) != 0)
+ return EXIT_FAILURE;
+
+ if (passed)
+ return 0;
+ else
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc > 1) {
+ if (strcmp(argv[1], "-h") == 0) {
+ printf("Syntax:\t [<num loops>]\n");
+ return 0;
+ } else {
+ num_loops = atoi(argv[1]);
+ }
+ }
+ return test_harness(test_tmspr, "tm_tmspr");
+}
diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h
index 175ac6a..a985cfa 100644
--- a/tools/testing/selftests/powerpc/utils.h
+++ b/tools/testing/selftests/powerpc/utils.h
@@ -6,9 +6,12 @@
#ifndef _SELFTESTS_POWERPC_UTILS_H
#define _SELFTESTS_POWERPC_UTILS_H
+#define __cacheline_aligned __attribute__((aligned(128)))
+
#include <stdint.h>
#include <stdbool.h>
#include <linux/auxvec.h>
+#include "reg.h"
/* Avoid headaches with PRI?64 - just use %ll? always */
typedef unsigned long long u64;
@@ -54,4 +57,9 @@ do { \
#define _str(s) #s
#define str(s) _str(s)
+/* POWER9 feature */
+#ifndef PPC_FEATURE2_ARCH_3_00
+#define PPC_FEATURE2_ARCH_3_00 0x00800000
+#endif
+
#endif /* _SELFTESTS_POWERPC_UTILS_H */
OpenPOWER on IntegriCloud