86 files changed, 2173 insertions, 779 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf49..9795b59 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -7,16 +7,19 @@ config ARM64
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
 	select ACPI_MCFG if ACPI
 	select ACPI_SPCR_TABLE if ACPI
+	select ACPI_PPTT if ACPI
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
@@ -105,7 +108,6 @@ config ARM64
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -133,6 +135,8 @@ config ARM64
 	select IRQ_FORCED_THREADING
 	select MODULES_USE_ELF_RELA
 	select MULTI_IRQ_HANDLER
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 	select NO_BOOTMEM
 	select OF
 	select OF_EARLY_FLATTREE
@@ -142,6 +146,7 @@ config ARM64
 	select POWER_SUPPLY
 	select REFCOUNT_FULL
 	select SPARSE_IRQ
+	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	help
@@ -150,9 +155,6 @@ config ARM64
 config 64BIT
 	def_bool y
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool y
-
 config MMU
 	def_bool y
 
@@ -237,24 +239,9 @@ config ZONE_DMA32
 config HAVE_GENERIC_GUP
 	def_bool y
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-
-config NEED_DMA_MAP_STATE
-	def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config SMP
 	def_bool y
 
-config SWIOTLB
-	def_bool y
-
-config IOMMU_HELPER
-	def_bool SWIOTLB
-
 config KERNEL_MODE_NEON
 	def_bool y
 
@@ -938,6 +925,15 @@ config HARDEN_EL2_VECTORS
 
 	  If unsure, say Y.
 
+config ARM64_SSBD
+	bool "Speculative Store Bypass Disable" if EXPERT
+	default y
+	help
+	  This enables mitigation of the bypassing of previous stores
+	  by speculative loads.
+
+	  If unsure, say Y.
+
 menuconfig ARMV8_DEPRECATED
 	bool "Emulate deprecated/obsolete ARMv8 instructions"
 	depends on COMPAT
@@ -1049,6 +1045,7 @@ config ARM64_PAN
 
 config ARM64_LSE_ATOMICS
 	bool "Atomic instructions"
+	default y
 	help
 	  As part of the Large System Extensions, ARMv8.1 introduces new
 	  atomic instructions that are designed specifically to scale in
@@ -1057,7 +1054,8 @@ config ARM64_LSE_ATOMICS
 	  Say Y here to make use of these instructions for the in-kernel
 	  atomic routines. This incurs a small overhead on CPUs that do
 	  not support these instructions and requires the kernel to be
-	  built with binutils >= 2.25.
+	  built with binutils >= 2.25 in order for the new instructions
+	  to be used.
 
 config ARM64_VHE
 	bool "Enable support for Virtualization Host Extensions (VHE)"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 87f7d2f..3c353b4 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -78,7 +78,7 @@ LDFLAGS		+= -maarch64linux
 UTS_MACHINE	:= aarch64
 endif
 
-CHECKFLAGS	+= -D__aarch64__ -m64
+CHECKFLAGS	+= -D__aarch64__
 
 ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
 KBUILD_LDFLAGS_MODULE	+= -T $(srctree)/arch/arm64/kernel/module.lds
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
index c89d0c30..e6b0593 100644
--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
@@ -17,6 +17,7 @@
 /dts-v1/;
 #include <dt-bindings/reset/altr,rst-mgr-s10.h>
 #include <dt-bindings/gpio/gpio.h>
+#include <dt-bindings/clock/stratix10-clock.h>
 
 / {
 	compatible = "altr,socfpga-stratix10";
@@ -92,9 +93,32 @@
 		interrupt-parent = <&intc>;
 		ranges = <0 0 0 0xffffffff>;
 
-		clkmgr@ffd1000 {
-			compatible = "altr,clk-mgr";
+		clkmgr: clock-controller@ffd10000 {
+			compatible = "intel,stratix10-clkmgr";
 			reg = <0xffd10000 0x1000>;
+			#clock-cells = <1>;
+		};
+
+		clocks {
+			cb_intosc_hs_div2_clk: cb-intosc-hs-div2-clk {
+				#clock-cells = <0>;
+				compatible = "fixed-clock";
+			};
+
+			cb_intosc_ls_clk: cb-intosc-ls-clk {
+				#clock-cells = <0>;
+				compatible = "fixed-clock";
+			};
+
+			f2s_free_clk: f2s-free-clk {
+				#clock-cells = <0>;
+				compatible = "fixed-clock";
+			};
+
+			osc1: osc1 {
+				#clock-cells = <0>;
+				compatible = "fixed-clock";
+			};
 		};
 
 		gmac0: ethernet@ff800000 {
@@ -105,6 +129,8 @@
 			mac-address = [00 00 00 00 00 00];
 			resets = <&rst EMAC0_RESET>;
 			reset-names = "stmmaceth";
+			clocks = <&clkmgr STRATIX10_EMAC0_CLK>;
+			clock-names = "stmmaceth";
 			status = "disabled";
 		};
 
@@ -116,6 +142,8 @@
 			mac-address = [00 00 00 00 00 00];
 			resets = <&rst EMAC1_RESET>;
 			reset-names = "stmmaceth";
+			clocks = <&clkmgr STRATIX10_EMAC1_CLK>;
+			clock-names = "stmmaceth";
 			status = "disabled";
 		};
 
@@ -127,6 +155,8 @@
 			mac-address = [00 00 00 00 00 00];
 			resets = <&rst EMAC2_RESET>;
 			reset-names = "stmmaceth";
+			clocks = <&clkmgr STRATIX10_EMAC2_CLK>;
+			clock-names = "stmmaceth";
 			status = "disabled";
 		};
 
@@ -177,6 +207,7 @@
 			reg = <0xffc02800 0x100>;
 			interrupts = <0 103 4>;
 			resets = <&rst I2C0_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -187,6 +218,7 @@
 			reg = <0xffc02900 0x100>;
 			interrupts = <0 104 4>;
 			resets = <&rst I2C1_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -197,6 +229,7 @@
 			reg = <0xffc02a00 0x100>;
 			interrupts = <0 105 4>;
 			resets = <&rst I2C2_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -207,6 +240,7 @@
 			reg = <0xffc02b00 0x100>;
 			interrupts = <0 106 4>;
 			resets = <&rst I2C3_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -217,6 +251,7 @@
 			reg = <0xffc02c00 0x100>;
 			interrupts = <0 107 4>;
 			resets = <&rst I2C4_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -229,6 +264,9 @@
 			fifo-depth = <0x400>;
 			resets = <&rst SDMMC_RESET>;
 			reset-names = "reset";
+			clocks = <&clkmgr STRATIX10_L4_MP_CLK>,
+				 <&clkmgr STRATIX10_SDMMC_CLK>;
+			clock-names = "biu", "ciu";
 			status = "disabled";
 		};
 
@@ -237,6 +275,25 @@
 			reg = <0xffe00000 0x100000>;
 		};
 
+		pdma: pdma@ffda0000 {
+			compatible = "arm,pl330", "arm,primecell";
+			reg = <0xffda0000 0x1000>;
+			interrupts = <0 81 4>,
+				     <0 82 4>,
+				     <0 83 4>,
+				     <0 84 4>,
+				     <0 85 4>,
+				     <0 86 4>,
+				     <0 87 4>,
+				     <0 88 4>,
+				     <0 89 4>;
+			#dma-cells = <1>;
+			#dma-channels = <8>;
+			#dma-requests = <32>;
+			clocks = <&clkmgr STRATIX10_L4_MAIN_CLK>;
+			clock-names = "apb_pclk";
+		};
+
 		rst: rstmgr@ffd11000 {
 			#reset-cells = <1>;
 			compatible = "altr,rst-mgr";
@@ -288,24 +345,32 @@
 			compatible = "snps,dw-apb-timer";
 			interrupts = <0 113 4>;
 			reg = <0xffc03000 0x100>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
+			clock-names = "timer";
 		};
 
 		timer1: timer1@ffc03100 {
 			compatible = "snps,dw-apb-timer";
 			interrupts = <0 114 4>;
 			reg = <0xffc03100 0x100>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
+			clock-names = "timer";
 		};
 
 		timer2: timer2@ffd00000 {
 			compatible = "snps,dw-apb-timer";
 			interrupts = <0 115 4>;
 			reg = <0xffd00000 0x100>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
+			clock-names = "timer";
 		};
 
 		timer3: timer3@ffd00100 {
 			compatible = "snps,dw-apb-timer";
 			interrupts = <0 116 4>;
 			reg = <0xffd00100 0x100>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
+			clock-names = "timer";
 		};
 
 		uart0: serial0@ffc02000 {
@@ -315,6 +380,7 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			resets = <&rst UART0_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -325,6 +391,7 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			resets = <&rst UART1_RESET>;
+			clocks = <&clkmgr STRATIX10_L4_SP_CLK>;
 			status = "disabled";
 		};
 
@@ -387,5 +454,17 @@
 			resets = <&rst WATCHDOG3_RESET>;
 			status = "disabled";
 		};
+
+		eccmgr {
+			compatible = "altr,socfpga-s10-ecc-manager";
+			interrupts = <0 15 4>, <0 95 4>;
+			interrupt-controller;
+			#interrupt-cells = <2>;
+
+			sdramedac {
+				compatible = "altr,sdram-edac-s10";
+				interrupts = <16 4>, <48 4>;
+			};
+		};
 	};
 };
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
index eaf13fe..f9b1ef1 100644
--- a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
+++ b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
@@ -50,6 +50,21 @@
 		/* We expect the bootloader to fill in the reg */
 		reg = <0 0 0 0>;
 	};
+
+	ref_033v: 033-v-ref {
+		compatible = "regulator-fixed";
+		regulator-name = "0.33V";
+		regulator-min-microvolt = <330000>;
+		regulator-max-microvolt = <330000>;
+	};
+
+	soc {
+		clocks {
+			osc1 {
+				clock-frequency = <25000000>;
+			};
+		};
+	};
 };
 
 &gpio1 {
@@ -79,7 +94,7 @@
 			rxd2-skew-ps = <420>; /* 0ps */
 			rxd3-skew-ps = <420>; /* 0ps */
 			txen-skew-ps = <0>; /* -420ps */
-			txc-skew-ps = <1860>; /* 960ps */
+			txc-skew-ps = <900>; /* 0ps */
 			rxdv-skew-ps = <420>; /* 0ps */
 			rxc-skew-ps = <1680>; /* 780ps */
 		};
@@ -105,3 +120,30 @@
 &watchdog0 {
 	status = "okay";
 };
+
+&i2c1 {
+	status = "okay";
+	clock-frequency = <100000>;
+
+	adc@14 {
+		compatible = "lltc,ltc2497";
+		reg = <0x14>;
+		vref-supply = <&ref_033v>;
+	};
+
+	temp@4c {
+		compatible = "maxim,max1619";
+		reg = <0x4c>;
+	};
+
+	eeprom@51 {
+		compatible = "atmel,24c32";
+		reg = <0x51>;
+		pagesize = <32>;
+	};
+
+	rtc@68 {
+		compatible = "dallas,ds1339";
+		reg = <0x68>;
+	};
+};
diff --git a/arch/arm64/boot/dts/exynos/exynos5433.dtsi b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
index c0231d0..1ad8677 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
@@ -1317,7 +1317,7 @@
 			reg = <0x14d60000 0x100>;
 			dmas = <&pdma0 31 &pdma0 30>;
 			dma-names = "tx", "rx";
-			interrupts = <GIC_SPI 435 IRQ_TYPE_NONE>;
+			interrupts = <GIC_SPI 435 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&cmu_peric CLK_PCLK_I2S1>,
 				 <&cmu_peric CLK_PCLK_I2S1>,
 				 <&cmu_peric CLK_SCLK_I2S1>;
diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts
index 724a0d3..edb4ee0 100644
--- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts
+++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts
@@ -299,7 +299,6 @@
 		/* GPIO blocks 16 thru 19 do not appear to be routed to pins */
 
 		dwmmc_0: dwmmc0@f723d000 {
-			max-frequency = <150000000>;
 			cap-mmc-highspeed;
 			mmc-hs200-1_8v;
 			non-removable;
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
index 48cad79..ed2f123 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
@@ -38,9 +38,10 @@
 			compatible = "marvell,armada-7k-pp22";
 			reg = <0x0 0x100000>, <0x129000 0xb000>;
 			clocks = <&CP110_LABEL(clk) 1 3>, <&CP110_LABEL(clk) 1 9>,
-				 <&CP110_LABEL(clk) 1 5>, <&CP110_LABEL(clk) 1 18>;
+				 <&CP110_LABEL(clk) 1 5>, <&CP110_LABEL(clk) 1 6>,
+				 <&CP110_LABEL(clk) 1 18>;
 			clock-names = "pp_clk", "gop_clk",
-				      "mg_clk", "axi_clk";
+				      "mg_clk", "mg_core_clk", "axi_clk";
 			marvell,system-controller = <&CP110_LABEL(syscon0)>;
 			status = "disabled";
 			dma-coherent;
@@ -141,6 +142,8 @@
 			#size-cells = <0>;
 			compatible = "marvell,xmdio";
 			reg = <0x12a600 0x10>;
+			clocks = <&CP110_LABEL(clk) 1 5>,
+				 <&CP110_LABEL(clk) 1 6>, <&CP110_LABEL(clk) 1 18>;
 			status = "disabled";
 		};
 
diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
index a8baad7..13f57ff 100644
--- a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
@@ -46,7 +46,7 @@
 				compatible = "ethernet-phy-ieee802.3-c22";
 				reg = <0x0>;
 				interrupt-parent = <&gpio>;
-				interrupts = <TEGRA_MAIN_GPIO(M, 5) IRQ_TYPE_LEVEL_HIGH>;
+				interrupts = <TEGRA_MAIN_GPIO(M, 5) IRQ_TYPE_LEVEL_LOW>;
 			};
 		};
 	};
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
index 24552f1..6a57387 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
@@ -36,4 +36,30 @@
 			drive-strength = <2>;	/* 2 MA */
 		};
 	};
+
+	blsp1_uart1_default: blsp1_uart1_default {
+		mux {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			function = "blsp_uart2";
+		};
+
+		config {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			drive-strength = <16>;
+			bias-disable;
+		};
+	};
+
+	blsp1_uart1_sleep: blsp1_uart1_sleep {
+		mux {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			function = "gpio";
+		};
+
+		config {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			drive-strength = <2>;
+			bias-disable;
+		};
+	};
 };
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
index 59b29ddf..6167af9 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
@@ -14,6 +14,28 @@
 		};
 	};
 
+	bt_en_gpios: bt_en_gpios {
+		pinconf {
+			pins = "gpio19";
+			function = PMIC_GPIO_FUNC_NORMAL;
+			output-low;
+			power-source = <PM8994_GPIO_S4>; // 1.8V
+			qcom,drive-strength = <PMIC_GPIO_STRENGTH_LOW>;
+			bias-pull-down;
+		};
+	};
+
+	wlan_en_gpios: wlan_en_gpios {
+		pinconf {
+			pins = "gpio8";
+			function = PMIC_GPIO_FUNC_NORMAL;
+			output-low;
+			power-source = <PM8994_GPIO_S4>; // 1.8V
+			qcom,drive-strength = <PMIC_GPIO_STRENGTH_LOW>;
+			bias-pull-down;
+		};
+	};
+
 	volume_up_gpio: pm8996_gpio2 {
 		pinconf {
 			pins = "gpio2";
@@ -26,6 +48,16 @@
 		};
 	};
 
+	divclk4_pin_a: divclk4 {
+		pinconf {
+			pins = "gpio18";
+			function = PMIC_GPIO_FUNC_FUNC2;
+
+			bias-disable;
+			power-source = <PM8994_GPIO_S4>;
+		};
+	};
+
 	usb3_vbus_det_gpio: pm8996_gpio22 {
 		pinconf {
 			pins = "gpio22";
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
index 1c8f1b8..4b8bb02 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
@@ -23,6 +23,7 @@
 	aliases {
 		serial0 = &blsp2_uart1;
 		serial1 = &blsp2_uart2;
+		serial2 = &blsp1_uart1;
 		i2c0	= &blsp1_i2c2;
 		i2c1	= &blsp2_i2c1;
 		i2c2	= &blsp2_i2c0;
@@ -34,7 +35,36 @@
 		stdout-path = "serial0:115200n8";
 	};
 
+	clocks {
+		divclk4: divclk4 {
+			compatible = "fixed-clock";
+			#clock-cells = <0>;
+			clock-frequency = <32768>;
+			clock-output-names = "divclk4";
+
+			pinctrl-names = "default";
+			pinctrl-0 = <&divclk4_pin_a>;
+		};
+	};
+
 	soc {
+		serial@7570000 {
+			label = "BT-UART";
+			status = "okay";
+			pinctrl-names = "default", "sleep";
+			pinctrl-0 = <&blsp1_uart1_default>;
+			pinctrl-1 = <&blsp1_uart1_sleep>;
+
+			bluetooth {
+				compatible = "qcom,qca6174-bt";
+
+				/* bt_disable_n gpio */
+				enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
+
+				clocks = <&divclk4>;
+			};
+		};
+
 		serial@75b0000 {
 			label = "LS-UART1";
 			status = "okay";
@@ -139,9 +169,40 @@
 			pinctrl-0 = <&usb2_vbus_det_gpio>;
 		};
 
+		bt_en: bt-en-1-8v {
+			pinctrl-names = "default";
+			pinctrl-0 = <&bt_en_gpios>;
+			compatible = "regulator-fixed";
+			regulator-name = "bt-en-regulator";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+
+			/* WLAN card specific delay */
+			startup-delay-us = <70000>;
+			enable-active-high;
+		};
+
+		wlan_en: wlan-en-1-8v {
+			pinctrl-names = "default";
+			pinctrl-0 = <&wlan_en_gpios>;
+			compatible = "regulator-fixed";
+			regulator-name = "wlan-en-regulator";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+
+			gpio = <&pm8994_gpios 8 0>;
+
+			/* WLAN card specific delay */
+			startup-delay-us = <70000>;
+			enable-active-high;
+		};
+
 		agnoc@0 {
 			qcom,pcie@600000 {
+				status = "okay";
 				perst-gpio = <&msmgpio 35 GPIO_ACTIVE_LOW>;
+				vddpe-supply = <&wlan_en>;
+				vddpe1-supply = <&bt_en>;
 			};
 
 			qcom,pcie@608000 {
diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 410ae78..f8e49d0 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -419,6 +419,16 @@
 			#clock-cells = <1>;
 		};
 
+		blsp1_uart1: serial@7570000 {
+			compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
+			reg = <0x07570000 0x1000>;
+			interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&gcc GCC_BLSP1_UART2_APPS_CLK>,
+				 <&gcc GCC_BLSP1_AHB_CLK>;
+			clock-names = "core", "iface";
+			status = "disabled";
+		};
+
 		blsp1_spi0: spi@7575000 {
 			compatible = "qcom,spi-qup-v2.2.1";
 			reg = <0x07575000 0x600>;
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
index e62bda1..c32dd34 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
@@ -414,7 +414,7 @@
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
index 2c1a92f..440c2e6 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
@@ -67,3 +67,11 @@
 		reg = <0>;
 	};
 };
+
+&pinctrl_ether_rgmii {
+	tx {
+		pins = "RGMII_TXCLK", "RGMII_TXD0", "RGMII_TXD1",
+		       "RGMII_TXD2", "RGMII_TXD3", "RGMII_TXCTL";
+		drive-strength = <9>;
+	};
+};
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
index 9efe20d..3a5ed78 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
@@ -519,7 +519,7 @@
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;
diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
index 7c8f710..e85d6dd 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
@@ -334,7 +334,7 @@
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index b209f95..3cfa8ca 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -80,7 +80,8 @@ CONFIG_PCIE_HISI_STB=y
 CONFIG_PCI_AARDVARK=y
 CONFIG_PCI_TEGRA=y
 CONFIG_PCIE_RCAR=y
-CONFIG_PCIE_ROCKCHIP=m
+CONFIG_PCIE_ROCKCHIP=y
+CONFIG_PCIE_ROCKCHIP_HOST=m
 CONFIG_PCI_HOST_GENERIC=y
 CONFIG_PCI_XGENE=y
 CONFIG_PCI_HOST_THUNDER_PEM=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index cb5a243..e3fdb0f 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -47,6 +47,12 @@ config CRYPTO_SM3_ARM64_CE
 	select CRYPTO_HASH
 	select CRYPTO_SM3
 
+config CRYPTO_SM4_ARM64_CE
+	tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_ALGAPI
+	select CRYPTO_SM4
+
 config CRYPTO_GHASH_ARM64_CE
 	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index f35ac68..bcafd01 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -23,6 +23,9 @@ sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
 obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
 sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
 
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
+sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
+
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index e3a375c..88f5aef 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -19,24 +19,33 @@
 	 *			     u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	ldr	w8, [x3]			/* leftover from prev round? */
+	frame_push	7
+
+	mov	x19, x0
+	mov	x20, x1
+	mov	x21, x2
+	mov	x22, x3
+	mov	x23, x4
+	mov	x24, x5
+
+	ldr	w25, [x22]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w8, 1f
-	sub	w8, w8, #16
+	cbz	w25, 1f
+	sub	w25, w25, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
-	subs	w2, w2, #1
-	add	w8, w8, #1
+0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
+	subs	w21, w21, #1
+	add	w25, w25, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w8, 0b
+	cbnz	w25, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x4]			/* load first round key */
-	prfm	pldl1strm, [x1]
-	cmp	w5, #12				/* which key size? */
-	add	x6, x4, #16
-	sub	w7, w5, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x23]			/* load first round key */
+	prfm	pldl1strm, [x20]
+	cmp	w24, #12			/* which key size? */
+	add	x6, x23, #16
+	sub	w7, w24, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -55,33 +64,43 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w2, w2, #16			/* last data? */
+	subs	w21, w21, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x1], #16		/* load next input block */
+	ld1	{v1.16b}, [x20], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	bne	1b
-6:	st1	{v0.16b}, [x0]			/* store mac */
+	beq	6f
+
+	if_will_cond_yield_neon
+	st1	{v0.16b}, [x19]			/* store mac */
+	do_cond_yield_neon
+	ld1	{v0.16b}, [x19]			/* reload mac */
+	endif_yield_neon
+
+	b	1b
+6:	st1	{v0.16b}, [x19]			/* store mac */
 	beq	10f
-	adds	w2, w2, #16
+	adds	w21, w21, #16
 	beq	10f
-	mov	w8, w2
-7:	ldrb	w7, [x1], #1
+	mov	w25, w21
+7:	ldrb	w7, [x20], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x0], #1
-	subs	w2, w2, #1
+	strb	w6, [x19], #1
+	subs	w21, w21, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w8
-	add	w8, w8, #16
+8:	mov	w7, w25
+	add	w25, w25, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x0]
-10:	str	w8, [x3]
+	st1	{v0.16b}, [x19]
+10:	str	w25, [x22]
+
+	frame_pop
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -126,19 +145,29 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	ldr	x8, [x6, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x5]			/* load mac */
-CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
+	frame_push	8
+
+	mov	x19, x0
+	mov	x20, x1
+	mov	x21, x2
+	mov	x22, x3
+	mov	x23, x4
+	mov	x24, x5
+	mov	x25, x6
+
+	ldr	x26, [x25, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x24]			/* load mac */
+CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x6]			/* load upper ctr */
-	prfm	pldl1strm, [x1]
-	add	x8, x8, #1
-	rev	x9, x8
-	cmp	w4, #12				/* which key size? */
-	sub	w7, w4, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x25]			/* load upper ctr */
+	prfm	pldl1strm, [x20]
+	add	x26, x26, #1
+	rev	x9, x26
+	cmp	w23, #12			/* which key size? */
+	sub	w7, w23, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x3]			/* load first round key */
-	add	x10, x3, #16
+	ld1	{v3.4s}, [x22]			/* load first round key */
+	add	x10, x22, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -165,9 +194,9 @@ CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w2, w2, #16
-	bmi	6f				/* partial block? */
-	ld1	{v2.16b}, [x1], #16		/* load next input block */
+	subs	w21, w21, #16
+	bmi	7f				/* partial block? */
+	ld1	{v2.16b}, [x20], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -176,18 +205,29 @@ CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x0], #16		/* write output block */
-	bne	0b
-CPU_LE(	rev	x8, x8			)
-	st1	{v0.16b}, [x5]			/* store mac */
-	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
-5:	ret
-
-6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x19], #16		/* write output block */
+	beq	5f
+
+	if_will_cond_yield_neon
+	st1	{v0.16b}, [x24]			/* store mac */
+	do_cond_yield_neon
+	ld1	{v0.16b}, [x24]			/* reload mac */
+	endif_yield_neon
+
+	b	0b
+5:
+CPU_LE(	rev	x26, x26			)
+	st1	{v0.16b}, [x24]			/* store mac */
+	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
+
+6:	frame_pop
+	ret
+
+7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x5]			/* store mac */
-	add	w2, w2, #16			/* process partial tail block */
-7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x24]			/* store mac */
+	add	w21, w21, #16			/* process partial tail block */
+8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -197,13 +237,13 @@ CPU_LE(	rev	x8, x8			)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x0], #1			/* store out byte */
-	strb	w7, [x5], #1			/* store mac byte */
-	subs	w2, w2, #1
-	beq	5b
+	strb	w9, [x19], #1			/* store out byte */
+	strb	w7, [x24], #1			/* store mac byte */
+	subs	w21, w21, #1
+	beq	6b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	7b
+	b	8b
 	.endm
 
 	/*
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 50330f5..623e74e 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -30,18 +30,21 @@
 	.endm
 
 	/* prepare for encryption with key in rk[] */
-	.macro		enc_prepare, rounds, rk, ignore
-	load_round_keys	\rounds, \rk
+	.macro		enc_prepare, rounds, rk, temp
+	mov		\temp, \rk
+	load_round_keys	\rounds, \temp
 	.endm
 
 	/* prepare for encryption (again) but with new key in rk[] */
-	.macro		enc_switch_key, rounds, rk, ignore
-	load_round_keys	\rounds, \rk
+	.macro		enc_switch_key, rounds, rk, temp
+	mov		\temp, \rk
+	load_round_keys	\rounds, \temp
 	.endm
 
 	/* prepare for decryption with key in rk[] */
-	.macro		dec_prepare, rounds, rk, ignore
-	load_round_keys	\rounds, \rk
+	.macro		dec_prepare, rounds, rk, temp
+	mov		\temp, \rk
+	load_round_keys	\rounds, \temp
 	.endm
 
 	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index a68412e..483a713 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -14,12 +14,12 @@
 	.align		4
 
 aes_encrypt_block4x:
-	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
+	encrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
 	ret
 ENDPROC(aes_encrypt_block4x)
 
 aes_decrypt_block4x:
-	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
+	decrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
 	ret
 ENDPROC(aes_decrypt_block4x)
 
@@ -31,57 +31,71 @@ ENDPROC(aes_decrypt_block4x)
 	 */
 
 AES_ENTRY(aes_ecb_encrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	5
 
-	enc_prepare	w3, x2, x5
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+
+.Lecbencrestart:
+	enc_prepare	w22, x21, x5
 
 .LecbencloopNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lecbenc1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
 	bl		aes_encrypt_block4x
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
+	cond_yield_neon	.Lecbencrestart
 	b		.LecbencloopNx
 .Lecbenc1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lecbencout
 .Lecbencloop:
-	ld1		{v0.16b}, [x1], #16		/* get next pt block */
-	encrypt_block	v0, w3, x2, x5, w6
-	st1		{v0.16b}, [x0], #16
-	subs		w4, w4, #1
+	ld1		{v0.16b}, [x20], #16		/* get next pt block */
+	encrypt_block	v0, w22, x21, x5, w6
+	st1		{v0.16b}, [x19], #16
+	subs		w23, w23, #1
 	bne		.Lecbencloop
 .Lecbencout:
-	ldp		x29, x30, [sp], #16
+	frame_pop
 	ret
 AES_ENDPROC(aes_ecb_encrypt)
 
 
 AES_ENTRY(aes_ecb_decrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	5
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
 
-	dec_prepare	w3, x2, x5
+.Lecbdecrestart:
+	dec_prepare	w22, x21, x5
 
 .LecbdecloopNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lecbdec1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
 	bl		aes_decrypt_block4x
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
+	cond_yield_neon	.Lecbdecrestart
 	b		.LecbdecloopNx
 .Lecbdec1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lecbdecout
 .Lecbdecloop:
-	ld1		{v0.16b}, [x1], #16		/* get next ct block */
-	decrypt_block	v0, w3, x2, x5, w6
-	st1		{v0.16b}, [x0], #16
-	subs		w4, w4, #1
+	ld1		{v0.16b}, [x20], #16		/* get next ct block */
+	decrypt_block	v0, w22, x21, x5, w6
+	st1		{v0.16b}, [x19], #16
+	subs		w23, w23, #1
 	bne		.Lecbdecloop
 .Lecbdecout:
-	ldp		x29, x30, [sp], #16
+	frame_pop
 	ret
 AES_ENDPROC(aes_ecb_decrypt)
 
@@ -94,78 +108,100 @@ AES_ENDPROC(aes_ecb_decrypt)
 	 */
 
 AES_ENTRY(aes_cbc_encrypt)
-	ld1		{v4.16b}, [x5]			/* get iv */
-	enc_prepare	w3, x2, x6
+	frame_push	6
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
+
+.Lcbcencrestart:
+	ld1		{v4.16b}, [x24]			/* get iv */
+	enc_prepare	w22, x21, x6
 
 .Lcbcencloop4x:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lcbcenc1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
 	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
-	encrypt_block	v0, w3, x2, x6, w7
+	encrypt_block	v0, w22, x21, x6, w7
 	eor		v1.16b, v1.16b, v0.16b
-	encrypt_block	v1, w3, x2, x6, w7
+	encrypt_block	v1, w22, x21, x6, w7
 	eor		v2.16b, v2.16b, v1.16b
-	encrypt_block	v2, w3, x2, x6, w7
+	encrypt_block	v2, w22, x21, x6, w7
 	eor		v3.16b, v3.16b, v2.16b
-	encrypt_block	v3, w3, x2, x6, w7
-	st1		{v0.16b-v3.16b}, [x0], #64
+	encrypt_block	v3, w22, x21, x6, w7
+	st1		{v0.16b-v3.16b}, [x19], #64
 	mov		v4.16b, v3.16b
+	st1		{v4.16b}, [x24]			/* return iv */
+	cond_yield_neon	.Lcbcencrestart
 	b		.Lcbcencloop4x
 .Lcbcenc1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lcbcencout
 .Lcbcencloop:
-	ld1		{v0.16b}, [x1], #16		/* get next pt block */
+	ld1		{v0.16b}, [x20], #16		/* get next pt block */
 	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
-	encrypt_block	v4, w3, x2, x6, w7
-	st1		{v4.16b}, [x0], #16
-	subs		w4, w4, #1
+	encrypt_block	v4, w22, x21, x6, w7
+	st1		{v4.16b}, [x19], #16
+	subs		w23, w23, #1
 	bne		.Lcbcencloop
 .Lcbcencout:
-	st1		{v4.16b}, [x5]			/* return iv */
+	st1		{v4.16b}, [x24]			/* return iv */
+	frame_pop
 	ret
 AES_ENDPROC(aes_cbc_encrypt)
 
 
 AES_ENTRY(aes_cbc_decrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	6
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
 
-	ld1		{v7.16b}, [x5]			/* get iv */
-	dec_prepare	w3, x2, x6
+.Lcbcdecrestart:
+	ld1		{v7.16b}, [x24]			/* get iv */
+	dec_prepare	w22, x21, x6
 
 .LcbcdecloopNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lcbcdec1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
 	mov		v4.16b, v0.16b
 	mov		v5.16b, v1.16b
 	mov		v6.16b, v2.16b
 	bl		aes_decrypt_block4x
-	sub		x1, x1, #16
+	sub		x20, x20, #16
 	eor		v0.16b, v0.16b, v7.16b
 	eor		v1.16b, v1.16b, v4.16b
-	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{v7.16b}, [x20], #16		/* reload 1 ct block */
 	eor		v2.16b, v2.16b, v5.16b
 	eor		v3.16b, v3.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
+	st1		{v7.16b}, [x24]			/* return iv */
+	cond_yield_neon	.Lcbcdecrestart
 	b		.LcbcdecloopNx
 .Lcbcdec1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lcbcdecout
 .Lcbcdecloop:
-	ld1		{v1.16b}, [x1], #16		/* get next ct block */
+	ld1		{v1.16b}, [x20], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
-	decrypt_block	v0, w3, x2, x6, w7
+	decrypt_block	v0, w22, x21, x6, w7
 	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
 	mov		v7.16b, v1.16b			/* ct is next iv */
-	st1		{v0.16b}, [x0], #16
-	subs		w4, w4, #1
+	st1		{v0.16b}, [x19], #16
+	subs		w23, w23, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
-	st1		{v7.16b}, [x5]			/* return iv */
-	ldp		x29, x30, [sp], #16
+	st1		{v7.16b}, [x24]			/* return iv */
+	frame_pop
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
 
@@ -176,19 +212,26 @@ AES_ENDPROC(aes_cbc_decrypt)
 	 */
 
 AES_ENTRY(aes_ctr_encrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	6
 
-	enc_prepare	w3, x2, x6
-	ld1		{v4.16b}, [x5]
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
+
+.Lctrrestart:
+	enc_prepare	w22, x21, x6
+	ld1		{v4.16b}, [x24]
 
 	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
 	rev		x6, x6
-	cmn		w6, w4			/* 32 bit overflow? */
-	bcs		.Lctrloop
 .LctrloopNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lctr1x
+	cmn		w6, #4			/* 32 bit overflow? */
+	bcs		.Lctr1x
 	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
 	dup		v7.4s, w6
 	mov		v0.16b, v4.16b
@@ -200,25 +243,27 @@ AES_ENTRY(aes_ctr_encrypt)
 	mov		v1.s[3], v8.s[0]
 	mov		v2.s[3], v8.s[1]
 	mov		v3.s[3], v8.s[2]
-	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
+	ld1		{v5.16b-v7.16b}, [x20], #48	/* get 3 input blocks */
 	bl		aes_encrypt_block4x
 	eor		v0.16b, v5.16b, v0.16b
-	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
+	ld1		{v5.16b}, [x20], #16		/* get 1 input block  */
 	eor		v1.16b, v6.16b, v1.16b
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
 	add		x6, x6, #4
 	rev		x7, x6
 	ins		v4.d[1], x7
-	cbz		w4, .Lctrout
+	cbz		w23, .Lctrout
+	st1		{v4.16b}, [x24]		/* return next CTR value */
+	cond_yield_neon	.Lctrrestart
 	b		.LctrloopNx
 .Lctr1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lctrout
 .Lctrloop:
 	mov		v0.16b, v4.16b
-	encrypt_block	v0, w3, x2, x8, w7
+	encrypt_block	v0, w22, x21, x8, w7
 
 	adds		x6, x6, #1		/* increment BE ctr */
 	rev		x7, x6
@@ -226,22 +271,22 @@ AES_ENTRY(aes_ctr_encrypt)
 	bcs		.Lctrcarry		/* overflow? */
 
 .Lctrcarrydone:
-	subs		w4, w4, #1
+	subs		w23, w23, #1
 	bmi		.Lctrtailblock		/* blocks <0 means tail block */
-	ld1		{v3.16b}, [x1], #16
+	ld1		{v3.16b}, [x20], #16
 	eor		v3.16b, v0.16b, v3.16b
-	st1		{v3.16b}, [x0], #16
+	st1		{v3.16b}, [x19], #16
 	bne		.Lctrloop
 
 .Lctrout:
-	st1		{v4.16b}, [x5]		/* return next CTR value */
-	ldp		x29, x30, [sp], #16
+	st1		{v4.16b}, [x24]		/* return next CTR value */
+.Lctrret:
+	frame_pop
 	ret
 
 .Lctrtailblock:
-	st1		{v0.16b}, [x0]
-	ldp		x29, x30, [sp], #16
-	ret
+	st1		{v0.16b}, [x19]
+	b		.Lctrret
 
 .Lctrcarry:
 	umov		x7, v4.d[0]		/* load upper word of ctr  */
@@ -274,10 +319,16 @@ CPU_LE(	.quad		1, 0x87		)
 CPU_BE(	.quad		0x87, 1		)
 
 AES_ENTRY(aes_xts_encrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	6
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x6
 
-	ld1		{v4.16b}, [x6]
+	ld1		{v4.16b}, [x24]
 	cbz		w7, .Lxtsencnotfirst
 
 	enc_prepare	w3, x5, x8
@@ -286,15 +337,17 @@ AES_ENTRY(aes_xts_encrypt)
 	ldr		q7, .Lxts_mul_x
 	b		.LxtsencNx
 
+.Lxtsencrestart:
+	ld1		{v4.16b}, [x24]
 .Lxtsencnotfirst:
-	enc_prepare	w3, x2, x8
+	enc_prepare	w22, x21, x8
 .LxtsencloopNx:
 	ldr		q7, .Lxts_mul_x
 	next_tweak	v4, v4, v7, v8
 .LxtsencNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lxtsenc1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
 	next_tweak	v5, v4, v7, v8
 	eor		v0.16b, v0.16b, v4.16b
 	next_tweak	v6, v5, v7, v8
@@ -307,35 +360,43 @@ AES_ENTRY(aes_xts_encrypt)
 	eor		v0.16b, v0.16b, v4.16b
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
 	mov		v4.16b, v7.16b
-	cbz		w4, .Lxtsencout
+	cbz		w23, .Lxtsencout
+	st1		{v4.16b}, [x24]
+	cond_yield_neon	.Lxtsencrestart
 	b		.LxtsencloopNx
 .Lxtsenc1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lxtsencout
 .Lxtsencloop:
-	ld1		{v1.16b}, [x1], #16
+	ld1		{v1.16b}, [x20], #16
 	eor		v0.16b, v1.16b, v4.16b
-	encrypt_block	v0, w3, x2, x8, w7
+	encrypt_block	v0, w22, x21, x8, w7
 	eor		v0.16b, v0.16b, v4.16b
-	st1		{v0.16b}, [x0], #16
-	subs		w4, w4, #1
+	st1		{v0.16b}, [x19], #16
+	subs		w23, w23, #1
 	beq		.Lxtsencout
 	next_tweak	v4, v4, v7, v8
 	b		.Lxtsencloop
 .Lxtsencout:
-	st1		{v4.16b}, [x6]
-	ldp		x29, x30, [sp], #16
+	st1		{v4.16b}, [x24]
+	frame_pop
 	ret
 AES_ENDPROC(aes_xts_encrypt)
 
 
 AES_ENTRY(aes_xts_decrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	6
 
-	ld1		{v4.16b}, [x6]
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x6
+
+	ld1		{v4.16b}, [x24]
 	cbz		w7, .Lxtsdecnotfirst
 
 	enc_prepare	w3, x5, x8
@@ -344,15 +405,17 @@ AES_ENTRY(aes_xts_decrypt)
 	ldr		q7, .Lxts_mul_x
 	b		.LxtsdecNx
 
+.Lxtsdecrestart:
+	ld1		{v4.16b}, [x24]
 .Lxtsdecnotfirst:
-	dec_prepare	w3, x2, x8
+	dec_prepare	w22, x21, x8
 .LxtsdecloopNx:
 	ldr		q7, .Lxts_mul_x
 	next_tweak	v4, v4, v7, v8
 .LxtsdecNx:
-	subs		w4, w4, #4
+	subs		w23, w23, #4
 	bmi		.Lxtsdec1x
-	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
 	next_tweak	v5, v4, v7, v8
 	eor		v0.16b, v0.16b, v4.16b
 	next_tweak	v6, v5, v7, v8
@@ -365,26 +428,28 @@ AES_ENTRY(aes_xts_decrypt)
 	eor		v0.16b, v0.16b, v4.16b
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v0.16b-v3.16b}, [x19], #64
 	mov		v4.16b, v7.16b
-	cbz		w4, .Lxtsdecout
+	cbz		w23, .Lxtsdecout
+	st1		{v4.16b}, [x24]
+	cond_yield_neon	.Lxtsdecrestart
 	b		.LxtsdecloopNx
 .Lxtsdec1x:
-	adds		w4, w4, #4
+	adds		w23, w23, #4
 	beq		.Lxtsdecout
 .Lxtsdecloop:
-	ld1		{v1.16b}, [x1], #16
+	ld1		{v1.16b}, [x20], #16
 	eor		v0.16b, v1.16b, v4.16b
-	decrypt_block	v0, w3, x2, x8, w7
+	decrypt_block	v0, w22, x21, x8, w7
 	eor		v0.16b, v0.16b, v4.16b
-	st1		{v0.16b}, [x0], #16
-	subs		w4, w4, #1
+	st1		{v0.16b}, [x19], #16
+	subs		w23, w23, #1
 	beq		.Lxtsdecout
 	next_tweak	v4, v4, v7, v8
 	b		.Lxtsdecloop
 .Lxtsdecout:
-	st1		{v4.16b}, [x6]
-	ldp		x29, x30, [sp], #16
+	st1		{v4.16b}, [x24]
+	frame_pop
 	ret
 AES_ENDPROC(aes_xts_decrypt)
 
@@ -393,43 +458,61 @@ AES_ENDPROC(aes_xts_decrypt)
 	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 	 */
 AES_ENTRY(aes_mac_update)
-	ld1		{v0.16b}, [x4]			/* get dg */
+	frame_push	6
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x6
+
+	ld1		{v0.16b}, [x23]			/* get dg */
 	enc_prepare	w2, x1, x7
 	cbz		w5, .Lmacloop4x
 
 	encrypt_block	v0, w2, x1, x7, w8
 
 .Lmacloop4x:
-	subs		w3, w3, #4
+	subs		w22, w22, #4
 	bmi		.Lmac1x
-	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
+	ld1		{v1.16b-v4.16b}, [x19], #64	/* get next pt block */
 	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
-	encrypt_block	v0, w2, x1, x7, w8
+	encrypt_block	v0, w21, x20, x7, w8
 	eor		v0.16b, v0.16b, v2.16b
-	encrypt_block	v0, w2, x1, x7, w8
+	encrypt_block	v0, w21, x20, x7, w8
 	eor		v0.16b, v0.16b, v3.16b
-	encrypt_block	v0, w2, x1, x7, w8
+	encrypt_block	v0, w21, x20, x7, w8
 	eor		v0.16b, v0.16b, v4.16b
-	cmp		w3, wzr
-	csinv		x5, x6, xzr, eq
+	cmp		w22, wzr
+	csinv		x5, x24, xzr, eq
 	cbz		w5, .Lmacout
-	encrypt_block	v0, w2, x1, x7, w8
+	encrypt_block	v0, w21, x20, x7, w8
+	st1		{v0.16b}, [x23]			/* return dg */
+	cond_yield_neon	.Lmacrestart
 	b		.Lmacloop4x
 .Lmac1x:
-	add		w3, w3, #4
+	add		w22, w22, #4
 .Lmacloop:
-	cbz		w3, .Lmacout
-	ld1		{v1.16b}, [x0], #16		/* get next pt block */
+	cbz		w22, .Lmacout
+	ld1		{v1.16b}, [x19], #16		/* get next pt block */
 	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
 
-	subs		w3, w3, #1
-	csinv		x5, x6, xzr, eq
+	subs		w22, w22, #1
+	csinv		x5, x24, xzr, eq
 	cbz		w5, .Lmacout
 
-	encrypt_block	v0, w2, x1, x7, w8
+.Lmacenc:
+	encrypt_block	v0, w21, x20, x7, w8
 	b		.Lmacloop
 
 .Lmacout:
-	st1		{v0.16b}, [x4]			/* return dg */
+	st1		{v0.16b}, [x23]			/* return dg */
+	frame_pop
 	ret
+
+.Lmacrestart:
+	ld1		{v0.16b}, [x23]			/* get dg */
+	enc_prepare	w21, x20, x0
+	b		.Lmacloop4x
 AES_ENDPROC(aes_mac_update)
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index ca04725..e613a87 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -565,54 +565,61 @@ ENDPROC(aesbs_decrypt8)
 	 *		     int blocks)
 	 */
 	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	5
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
 
 99:	mov		x5, #1
-	lsl		x5, x5, x4
-	subs		w4, w4, #8
-	csel		x4, x4, xzr, pl
+	lsl		x5, x5, x23
+	subs		w23, w23, #8
+	csel		x23, x23, xzr, pl
 	csel		x5, x5, xzr, mi
 
-	ld1		{v0.16b}, [x1], #16
+	ld1		{v0.16b}, [x20], #16
 	tbnz		x5, #1, 0f
-	ld1		{v1.16b}, [x1], #16
+	ld1		{v1.16b}, [x20], #16
 	tbnz		x5, #2, 0f
-	ld1		{v2.16b}, [x1], #16
+	ld1		{v2.16b}, [x20], #16
 	tbnz		x5, #3, 0f
-	ld1		{v3.16b}, [x1], #16
+	ld1		{v3.16b}, [x20], #16
 	tbnz		x5, #4, 0f
-	ld1		{v4.16b}, [x1], #16
+	ld1		{v4.16b}, [x20], #16
 	tbnz		x5, #5, 0f
-	ld1		{v5.16b}, [x1], #16
+	ld1		{v5.16b}, [x20], #16
 	tbnz		x5, #6, 0f
-	ld1		{v6.16b}, [x1], #16
+	ld1		{v6.16b}, [x20], #16
 	tbnz		x5, #7, 0f
-	ld1		{v7.16b}, [x1], #16
+	ld1		{v7.16b}, [x20], #16
 
-0:	mov		bskey, x2
-	mov		rounds, x3
+0:	mov		bskey, x21
+	mov		rounds, x22
 	bl		\do8
 
-	st1		{\o0\().16b}, [x0], #16
+	st1		{\o0\().16b}, [x19], #16
 	tbnz		x5, #1, 1f
-	st1		{\o1\().16b}, [x0], #16
+	st1		{\o1\().16b}, [x19], #16
 	tbnz		x5, #2, 1f
-	st1		{\o2\().16b}, [x0], #16
+	st1		{\o2\().16b}, [x19], #16
 	tbnz		x5, #3, 1f
-	st1		{\o3\().16b}, [x0], #16
+	st1		{\o3\().16b}, [x19], #16
 	tbnz		x5, #4, 1f
-	st1		{\o4\().16b}, [x0], #16
+	st1		{\o4\().16b}, [x19], #16
 	tbnz		x5, #5, 1f
-	st1		{\o5\().16b}, [x0], #16
+	st1		{\o5\().16b}, [x19], #16
 	tbnz		x5, #6, 1f
-	st1		{\o6\().16b}, [x0], #16
+	st1		{\o6\().16b}, [x19], #16
 	tbnz		x5, #7, 1f
-	st1		{\o7\().16b}, [x0], #16
+	st1		{\o7\().16b}, [x19], #16
 
-	cbnz		x4, 99b
+	cbz		x23, 1f
+	cond_yield_neon
+	b		99b
 
-1:	ldp		x29, x30, [sp], #16
+1:	frame_pop
 	ret
 	.endm
 
@@ -632,43 +639,49 @@ ENDPROC(aesbs_ecb_decrypt)
 	 */
 	.align		4
 ENTRY(aesbs_cbc_decrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
+	frame_push	6
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
 
 99:	mov		x6, #1
-	lsl		x6, x6, x4
-	subs		w4, w4, #8
-	csel		x4, x4, xzr, pl
+	lsl		x6, x6, x23
+	subs		w23, w23, #8
+	csel		x23, x23, xzr, pl
 	csel		x6, x6, xzr, mi
 
-	ld1		{v0.16b}, [x1], #16
+	ld1		{v0.16b}, [x20], #16
 	mov		v25.16b, v0.16b
 	tbnz		x6, #1, 0f
-	ld1		{v1.16b}, [x1], #16
+	ld1		{v1.16b}, [x20], #16
 	mov		v26.16b, v1.16b
 	tbnz		x6, #2, 0f
-	ld1		{v2.16b}, [x1], #16
+	ld1		{v2.16b}, [x20], #16
 	mov		v27.16b, v2.16b
 	tbnz		x6, #3, 0f
-	ld1		{v3.16b}, [x1], #16
+	ld1		{v3.16b}, [x20], #16
 	mov		v28.16b, v3.16b
 	tbnz		x6, #4, 0f
-	ld1		{v4.16b}, [x1], #16
+	ld1		{v4.16b}, [x20], #16
 	mov		v29.16b, v4.16b
 	tbnz		x6, #5, 0f
-	ld1		{v5.16b}, [x1], #16
+	ld1		{v5.16b}, [x20], #16
 	mov		v30.16b, v5.16b
 	tbnz		x6, #6, 0f
-	ld1		{v6.16b}, [x1], #16
+	ld1		{v6.16b}, [x20], #16
 	mov		v31.16b, v6.16b
 	tbnz		x6, #7, 0f
-	ld1		{v7.16b}, [x1]
+	ld1		{v7.16b}, [x20]
 
-0:	mov		bskey, x2
-	mov		rounds, x3
+0:	mov		bskey, x21
+	mov		rounds, x22
 	bl		aesbs_decrypt8
 
-	ld1		{v24.16b}, [x5]			// load IV
+	ld1		{v24.16b}, [x24]		// load IV
 
 	eor		v1.16b, v1.16b, v25.16b
 	eor		v6.16b, v6.16b, v26.16b
@@ -679,34 +692,36 @@ ENTRY(aesbs_cbc_decrypt)
 	eor		v3.16b, v3.16b, v30.16b
 	eor		v5.16b, v5.16b, v31.16b
 
-	st1		{v0.16b}, [x0], #16
+	st1		{v0.16b}, [x19], #16
 	mov		v24.16b, v25.16b
 	tbnz		x6, #1, 1f
-	st1		{v1.16b}, [x0], #16
+	st1		{v1.16b}, [x19], #16
 	mov		v24.16b, v26.16b
 	tbnz		x6, #2, 1f
-	st1		{v6.16b}, [x0], #16
+	st1		{v6.16b}, [x19], #16
 	mov		v24.16b, v27.16b
 	tbnz		x6, #3, 1f
-	st1		{v4.16b}, [x0], #16
+	st1		{v4.16b}, [x19], #16
 	mov		v24.16b, v28.16b
 	tbnz		x6, #4, 1f
-	st1		{v2.16b}, [x0], #16
+	st1		{v2.16b}, [x19], #16
 	mov		v24.16b, v29.16b
 	tbnz		x6, #5, 1f
-	st1		{v7.16b}, [x0], #16
+	st1		{v7.16b}, [x19], #16
 	mov		v24.16b, v30.16b
 	tbnz		x6, #6, 1f
-	st1		{v3.16b}, [x0], #16
+	st1		{v3.16b}, [x19], #16
 	mov		v24.16b, v31.16b
 	tbnz		x6, #7, 1f
-	ld1		{v24.16b}, [x1], #16
-	st1		{v5.16b}, [x0], #16
-1:	st1		{v24.16b}, [x5]			// store IV
+	ld1		{v24.16b}, [x20], #16
+	st1		{v5.16b}, [x19], #16
+1:	st1		{v24.16b}, [x24]		// store IV
 
-	cbnz		x4, 99b
+	cbz		x23, 2f
+	cond_yield_neon
+	b		99b
 
-	ldp		x29, x30, [sp], #16
+2:	frame_pop
 	ret
 ENDPROC(aesbs_cbc_decrypt)
 
@@ -731,87 +746,93 @@ CPU_BE(	.quad		0x87, 1		)
 	 */
 __xts_crypt8:
 	mov		x6, #1
-	lsl		x6, x6, x4
-	subs		w4, w4, #8
-	csel		x4, x4, xzr, pl
+	lsl		x6, x6, x23
+	subs		w23, w23, #8
+	csel		x23, x23, xzr, pl
 	csel		x6, x6, xzr, mi
 
-	ld1		{v0.16b}, [x1], #16
+	ld1		{v0.16b}, [x20], #16
 	next_tweak	v26, v25, v30, v31
 	eor		v0.16b, v0.16b, v25.16b
 	tbnz		x6, #1, 0f
 
-	ld1		{v1.16b}, [x1], #16
+	ld1		{v1.16b}, [x20], #16
 	next_tweak	v27, v26, v30, v31
 	eor		v1.16b, v1.16b, v26.16b
 	tbnz		x6, #2, 0f
 
-	ld1		{v2.16b}, [x1], #16
+	ld1		{v2.16b}, [x20], #16
 	next_tweak	v28, v27, v30, v31
 	eor		v2.16b, v2.16b, v27.16b
 	tbnz		x6, #3, 0f
 
-	ld1		{v3.16b}, [x1], #16
+	ld1		{v3.16b}, [x20], #16
 	next_tweak	v29, v28, v30, v31
 	eor		v3.16b, v3.16b, v28.16b
 	tbnz		x6, #4, 0f
 
-	ld1		{v4.16b}, [x1], #16
-	str		q29, [sp, #16]
+	ld1		{v4.16b}, [x20], #16
+	str		q29, [sp, #.Lframe_local_offset]
 	eor		v4.16b, v4.16b, v29.16b
 	next_tweak	v29, v29, v30, v31
 	tbnz		x6, #5, 0f
 
-	ld1		{v5.16b}, [x1], #16
-	str		q29, [sp, #32]
+	ld1		{v5.16b}, [x20], #16
+	str		q29, [sp, #.Lframe_local_offset + 16]
 	eor		v5.16b, v5.16b, v29.16b
 	next_tweak	v29, v29, v30, v31
 	tbnz		x6, #6, 0f
 
-	ld1		{v6.16b}, [x1], #16
-	str		q29, [sp, #48]
+	ld1		{v6.16b}, [x20], #16
+	str		q29, [sp, #.Lframe_local_offset + 32]
 	eor		v6.16b, v6.16b, v29.16b
 	next_tweak	v29, v29, v30, v31
 	tbnz		x6, #7, 0f
 
-	ld1		{v7.16b}, [x1], #16
-	str		q29, [sp, #64]
+	ld1		{v7.16b}, [x20], #16
+	str		q29, [sp, #.Lframe_local_offset + 48]
 	eor		v7.16b, v7.16b, v29.16b
 	next_tweak	v29, v29, v30, v31
 
-0:	mov		bskey, x2
-	mov		rounds, x3
+0:	mov		bskey, x21
+	mov		rounds, x22
 	br		x7
 ENDPROC(__xts_crypt8)
 
 	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
-	stp		x29, x30, [sp, #-80]!
-	mov		x29, sp
+	frame_push	6, 64
 
-	ldr		q30, .Lxts_mul_x
-	ld1		{v25.16b}, [x5]
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
+
+0:	ldr		q30, .Lxts_mul_x
+	ld1		{v25.16b}, [x24]
 
 99:	adr		x7, \do8
 	bl		__xts_crypt8
 
-	ldp		q16, q17, [sp, #16]
-	ldp		q18, q19, [sp, #48]
+	ldp		q16, q17, [sp, #.Lframe_local_offset]
+	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]
 
 	eor		\o0\().16b, \o0\().16b, v25.16b
 	eor		\o1\().16b, \o1\().16b, v26.16b
 	eor		\o2\().16b, \o2\().16b, v27.16b
 	eor		\o3\().16b, \o3\().16b, v28.16b
 
-	st1		{\o0\().16b}, [x0], #16
+	st1		{\o0\().16b}, [x19], #16
 	mov		v25.16b, v26.16b
 	tbnz		x6, #1, 1f
-	st1		{\o1\().16b}, [x0], #16
+	st1		{\o1\().16b}, [x19], #16
 	mov		v25.16b, v27.16b
 	tbnz		x6, #2, 1f
-	st1		{\o2\().16b}, [x0], #16
+	st1		{\o2\().16b}, [x19], #16
 	mov		v25.16b, v28.16b
 	tbnz		x6, #3, 1f
-	st1		{\o3\().16b}, [x0], #16
+	st1		{\o3\().16b}, [x19], #16
 	mov		v25.16b, v29.16b
 	tbnz		x6, #4, 1f
 
@@ -820,18 +841,22 @@ ENDPROC(__xts_crypt8)
 	eor		\o6\().16b, \o6\().16b, v18.16b
 	eor		\o7\().16b, \o7\().16b, v19.16b
 
-	st1		{\o4\().16b}, [x0], #16
+	st1		{\o4\().16b}, [x19], #16
 	tbnz		x6, #5, 1f
-	st1		{\o5\().16b}, [x0], #16
+	st1		{\o5\().16b}, [x19], #16
 	tbnz		x6, #6, 1f
-	st1		{\o6\().16b}, [x0], #16
+	st1		{\o6\().16b}, [x19], #16
 	tbnz		x6, #7, 1f
-	st1		{\o7\().16b}, [x0], #16
+	st1		{\o7\().16b}, [x19], #16
+
+	cbz		x23, 1f
+	st1		{v25.16b}, [x24]
 
-	cbnz		x4, 99b
+	cond_yield_neon	0b
+	b		99b
 
-1:	st1		{v25.16b}, [x5]
-	ldp		x29, x30, [sp], #80
+1:	st1		{v25.16b}, [x24]
+	frame_pop
 	ret
 	.endm
 
@@ -856,24 +881,31 @@ ENDPROC(aesbs_xts_decrypt)
 	 *		     int rounds, int blocks, u8 iv[], u8 final[])
 	 */
 ENTRY(aesbs_ctr_encrypt)
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
-
-	cmp		x6, #0
-	cset		x10, ne
-	add		x4, x4, x10		// do one extra block if final
-
-	ldp		x7, x8, [x5]
-	ld1		{v0.16b}, [x5]
+	frame_push	8
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
+	mov		x25, x6
+
+	cmp		x25, #0
+	cset		x26, ne
+	add		x23, x23, x26		// do one extra block if final
+
+98:	ldp		x7, x8, [x24]
+	ld1		{v0.16b}, [x24]
 CPU_LE(	rev		x7, x7		)
 CPU_LE(	rev		x8, x8		)
 	adds		x8, x8, #1
 	adc		x7, x7, xzr
 
 99:	mov		x9, #1
-	lsl		x9, x9, x4
-	subs		w4, w4, #8
-	csel		x4, x4, xzr, pl
+	lsl		x9, x9, x23
+	subs		w23, w23, #8
+	csel		x23, x23, xzr, pl
 	csel		x9, x9, xzr, le
 
 	tbnz		x9, #1, 0f
@@ -891,82 +923,85 @@ CPU_LE(	rev		x8, x8		)
 	tbnz		x9, #7, 0f
 	next_ctr	v7
 
-0:	mov		bskey, x2
-	mov		rounds, x3
+0:	mov		bskey, x21
+	mov		rounds, x22
 	bl		aesbs_encrypt8
 
-	lsr		x9, x9, x10		// disregard the extra block
+	lsr		x9, x9, x26		// disregard the extra block
 	tbnz		x9, #0, 0f
 
-	ld1		{v8.16b}, [x1], #16
+	ld1		{v8.16b}, [x20], #16
 	eor		v0.16b, v0.16b, v8.16b
-	st1		{v0.16b}, [x0], #16
+	st1		{v0.16b}, [x19], #16
 	tbnz		x9, #1, 1f
 
-	ld1		{v9.16b}, [x1], #16
+	ld1		{v9.16b}, [x20], #16
 	eor		v1.16b, v1.16b, v9.16b
-	st1		{v1.16b}, [x0], #16
+	st1		{v1.16b}, [x19], #16
 	tbnz		x9, #2, 2f
 
-	ld1		{v10.16b}, [x1], #16
+	ld1		{v10.16b}, [x20], #16
 	eor		v4.16b, v4.16b, v10.16b
-	st1		{v4.16b}, [x0], #16
+	st1		{v4.16b}, [x19], #16
 	tbnz		x9, #3, 3f
 
-	ld1		{v11.16b}, [x1], #16
+	ld1		{v11.16b}, [x20], #16
 	eor		v6.16b, v6.16b, v11.16b
-	st1		{v6.16b}, [x0], #16
+	st1		{v6.16b}, [x19], #16
 	tbnz		x9, #4, 4f
 
-	ld1		{v12.16b}, [x1], #16
+	ld1		{v12.16b}, [x20], #16
 	eor		v3.16b, v3.16b, v12.16b
-	st1		{v3.16b}, [x0], #16
+	st1		{v3.16b}, [x19], #16
 	tbnz		x9, #5, 5f
 
-	ld1		{v13.16b}, [x1], #16
+	ld1		{v13.16b}, [x20], #16
 	eor		v7.16b, v7.16b, v13.16b
-	st1		{v7.16b}, [x0], #16
+	st1		{v7.16b}, [x19], #16
 	tbnz		x9, #6, 6f
 
-	ld1		{v14.16b}, [x1], #16
+	ld1		{v14.16b}, [x20], #16
 	eor		v2.16b, v2.16b, v14.16b
-	st1		{v2.16b}, [x0], #16
+	st1		{v2.16b}, [x19], #16
 	tbnz		x9, #7, 7f
 
-	ld1		{v15.16b}, [x1], #16
+	ld1		{v15.16b}, [x20], #16
 	eor		v5.16b, v5.16b, v15.16b
-	st1		{v5.16b}, [x0], #16
+	st1		{v5.16b}, [x19], #16
 
 8:	next_ctr	v0
-	cbnz		x4, 99b
+	st1		{v0.16b}, [x24]
+	cbz		x23, 0f
+
+	cond_yield_neon	98b
+	b		99b
 
-0:	st1		{v0.16b}, [x5]
-	ldp		x29, x30, [sp], #16
+0:	frame_pop
 	ret
 
 	/*
 	 * If we are handling the tail of the input (x6 != NULL), return the
 	 * final keystream block back to the caller.
 	 */
-1:	cbz		x6, 8b
-	st1		{v1.16b}, [x6]
+1:	cbz		x25, 8b
+	st1		{v1.16b}, [x25]
 	b		8b
-2:	cbz		x6, 8b
-	st1		{v4.16b}, [x6]
+2:	cbz		x25, 8b
+	st1		{v4.16b}, [x25]
 	b		8b
-3:	cbz		x6, 8b
-	st1		{v6.16b}, [x6]
+3:	cbz		x25, 8b
+	st1		{v6.16b}, [x25]
 	b		8b
-4:	cbz		x6, 8b
-	st1		{v3.16b}, [x6]
+4:	cbz		x25, 8b
+	st1		{v3.16b}, [x25]
 	b		8b
-5:	cbz		x6, 8b
-	st1		{v7.16b}, [x6]
+5:	cbz		x25, 8b
+	st1		{v7.16b}, [x25]
 	b		8b
-6:	cbz		x6, 8b
-	st1		{v2.16b}, [x6]
+6:	cbz		x25, 8b
+	st1		{v2.16b}, [x25]
 	b		8b
-7:	cbz		x6, 8b
-	st1		{v5.16b}, [x6]
+7:	cbz		x25, 8b
+	st1		{v5.16b}, [x25]
 	b		8b
 ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
index 16ed3c7..8061bf0 100644
--- a/arch/arm64/crypto/crc32-ce-core.S
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -100,9 +100,10 @@
 	dCONSTANT	.req	d0
 	qCONSTANT	.req	q0
 
-	BUF		.req	x0
-	LEN		.req	x1
-	CRC		.req	x2
+	BUF		.req	x19
+	LEN		.req	x20
+	CRC		.req	x21
+	CONST		.req	x22
 
 	vzr		.req	v9
 
@@ -123,7 +124,14 @@ ENTRY(crc32_pmull_le)
 ENTRY(crc32c_pmull_le)
 	adr_l		x3, .Lcrc32c_constants
 
-0:	bic		LEN, LEN, #15
+0:	frame_push	4, 64
+
+	mov		BUF, x0
+	mov		LEN, x1
+	mov		CRC, x2
+	mov		CONST, x3
+
+	bic		LEN, LEN, #15
 	ld1		{v1.16b-v4.16b}, [BUF], #0x40
 	movi		vzr.16b, #0
 	fmov		dCONSTANT, CRC
@@ -132,7 +140,7 @@ ENTRY(crc32c_pmull_le)
 	cmp		LEN, #0x40
 	b.lt		less_64
 
-	ldr		qCONSTANT, [x3]
+	ldr		qCONSTANT, [CONST]
 
 loop_64:		/* 64 bytes Full cache line folding */
 	sub		LEN, LEN, #0x40
@@ -162,10 +170,21 @@ loop_64:		/* 64 bytes Full cache line folding */
 	eor		v4.16b, v4.16b, v8.16b
 
 	cmp		LEN, #0x40
-	b.ge		loop_64
+	b.lt		less_64
+
+	if_will_cond_yield_neon
+	stp		q1, q2, [sp, #.Lframe_local_offset]
+	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	do_cond_yield_neon
+	ldp		q1, q2, [sp, #.Lframe_local_offset]
+	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	ldr		qCONSTANT, [CONST]
+	movi		vzr.16b, #0
+	endif_yield_neon
+	b		loop_64
 
 less_64:		/* Folding cache line into 128bit */
-	ldr		qCONSTANT, [x3, #16]
+	ldr		qCONSTANT, [CONST, #16]
 
 	pmull2		v5.1q, v1.2d, vCONSTANT.2d
 	pmull		v1.1q, v1.1d, vCONSTANT.1d
@@ -204,8 +223,8 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* final 32-bit fold */
-	ldr		dCONSTANT, [x3, #32]
-	ldr		d3, [x3, #40]
+	ldr		dCONSTANT, [CONST, #32]
+	ldr		d3, [CONST, #40]
 
 	ext		v2.16b, v1.16b, vzr.16b, #4
 	and		v1.16b, v1.16b, v3.16b
@@ -213,7 +232,7 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-	ldr		qCONSTANT, [x3, #48]
+	ldr		qCONSTANT, [CONST, #48]
 
 	and		v2.16b, v1.16b, v3.16b
 	ext		v2.16b, vzr.16b, v2.16b, #8
@@ -223,6 +242,7 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 	mov		w0, v1.s[1]
 
+	frame_pop
 	ret
 ENDPROC(crc32_pmull_le)
 ENDPROC(crc32c_pmull_le)
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index f179c01..663ea71 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -74,13 +74,19 @@
 	.text
 	.cpu		generic+crypto
 
-	arg1_low32	.req	w0
-	arg2		.req	x1
-	arg3		.req	x2
+	arg1_low32	.req	w19
+	arg2		.req	x20
+	arg3		.req	x21
 
 	vzr		.req	v13
 
 ENTRY(crc_t10dif_pmull)
+	frame_push	3, 128
+
+	mov		arg1_low32, w0
+	mov		arg2, x1
+	mov		arg3, x2
+
 	movi		vzr.16b, #0		// init zero register
 
 	// adjust the 16-bit initial_crc value, scale it to 32 bits
@@ -175,8 +181,25 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.ge		_fold_64_B_loop
+	b.lt		_fold_64_B_end
+
+	if_will_cond_yield_neon
+	stp		q0, q1, [sp, #.Lframe_local_offset]
+	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
+	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
+	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
+	do_cond_yield_neon
+	ldp		q0, q1, [sp, #.Lframe_local_offset]
+	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
+	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
+	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
+	ldr_l		q10, rk3, x8
+	movi		vzr.16b, #0		// init zero register
+	endif_yield_neon
+
+	b		_fold_64_B_loop
 
+_fold_64_B_end:
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
 	// registers: v0, v1, v2, v3
@@ -304,6 +327,7 @@ _barrett:
 _cleanup:
 	// scale the result back to 16 bits
 	lsr		x0, x0, #16
+	frame_pop
 	ret
 
 _less_than_128:
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 11ebf1a..dcffb9e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -213,22 +213,31 @@
 	.endm
 
 	.macro		__pmull_ghash, pn
-	ld1		{SHASH.2d}, [x3]
-	ld1		{XL.2d}, [x1]
+	frame_push	5
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+
+0:	ld1		{SHASH.2d}, [x22]
+	ld1		{XL.2d}, [x20]
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	__pmull_pre_\pn
 
 	/* do the head block first, if supplied */
-	cbz		x4, 0f
-	ld1		{T1.2d}, [x4]
-	b		1f
+	cbz		x23, 1f
+	ld1		{T1.2d}, [x23]
+	mov		x23, xzr
+	b		2f
 
-0:	ld1		{T1.2d}, [x2], #16
-	sub		w0, w0, #1
+1:	ld1		{T1.2d}, [x21], #16
+	sub		w19, w19, #1
 
-1:	/* multiply XL by SHASH in GF(2^128) */
+2:	/* multiply XL by SHASH in GF(2^128) */
 CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	ext		T2.16b, XL.16b, XL.16b, #8
@@ -250,9 +259,18 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	eor		T2.16b, T2.16b, XH.16b
 	eor		XL.16b, XL.16b, T2.16b
 
-	cbnz		w0, 0b
+	cbz		w19, 3f
+
+	if_will_cond_yield_neon
+	st1		{XL.2d}, [x20]
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
+
+	b		1b
 
-	st1		{XL.2d}, [x1]
+3:	st1		{XL.2d}, [x20]
+	frame_pop
 	ret
 	.endm
 
@@ -304,38 +322,55 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	ld1		{SHASH.2d}, [x4]
-	ld1		{XL.2d}, [x1]
-	ldr		x8, [x5, #8]			// load lower counter
+	frame_push	10
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+	mov		x22, x3
+	mov		x23, x4
+	mov		x24, x5
+	mov		x25, x6
+	mov		x26, x7
+	.if		\enc == 1
+	ldr		x27, [sp, #96]			// first stacked arg
+	.endif
+
+	ldr		x28, [x24, #8]			// load lower counter
+CPU_LE(	rev		x28, x28	)
+
+0:	mov		x0, x25
+	load_round_keys	w26, x0
+	ld1		{SHASH.2d}, [x23]
+	ld1		{XL.2d}, [x20]
 
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
-CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if		\enc == 1
-	ld1		{KS.16b}, [x7]
+	ld1		{KS.16b}, [x27]
 	.endif
 
-0:	ld1		{CTR.8b}, [x5]			// load upper counter
-	ld1		{INP.16b}, [x3], #16
-	rev		x9, x8
-	add		x8, x8, #1
-	sub		w0, w0, #1
+1:	ld1		{CTR.8b}, [x24]			// load upper counter
+	ld1		{INP.16b}, [x22], #16
+	rev		x9, x28
+	add		x28, x28, #1
+	sub		w19, w19, #1
 	ins		CTR.d[1], x9			// set lower counter
 
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x2], #16
+	st1		{INP.16b}, [x21], #16
 	.endif
 
 	rev64		T1.16b, INP.16b
 
-	cmp		w6, #12
-	b.ge		2f				// AES-192/256?
+	cmp		w26, #12
+	b.ge		4f				// AES-192/256?
 
-1:	enc_round	CTR, v21
+2:	enc_round	CTR, v21
 
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -390,27 +425,39 @@ CPU_LE(	rev		x8, x8		)
 
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x2], #16
+	st1		{INP.16b}, [x21], #16
 	.endif
 
-	cbnz		w0, 0b
+	cbz		w19, 3f
 
-CPU_LE(	rev		x8, x8		)
-	st1		{XL.2d}, [x1]
-	str		x8, [x5, #8]			// store lower counter
+	if_will_cond_yield_neon
+	st1		{XL.2d}, [x20]
+	.if		\enc == 1
+	st1		{KS.16b}, [x27]
+	.endif
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
 
+	b		1b
+
+3:	st1		{XL.2d}, [x20]
 	.if		\enc == 1
-	st1		{KS.16b}, [x7]
+	st1		{KS.16b}, [x27]
 	.endif
 
+CPU_LE(	rev		x28, x28	)
+	str		x28, [x24, #8]			// store lower counter
+
+	frame_pop
 	ret
 
-2:	b.eq		3f				// AES-192?
+4:	b.eq		5f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-3:	enc_round	CTR, v19
+5:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		1b
+	b		2b
 	.endm
 
 	/*
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index cfc9c92..7cf0b1a 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -63,11 +63,12 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
 
 asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
 				  const u8 src[], struct ghash_key const *k,
-				  u8 ctr[], int rounds, u8 ks[]);
+				  u8 ctr[], u32 const rk[], int rounds,
+				  u8 ks[]);
 
 asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
 				  const u8 src[], struct ghash_key const *k,
-				  u8 ctr[], int rounds);
+				  u8 ctr[], u32 const rk[], int rounds);
 
 asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
 					u32 const rk[], int rounds);
@@ -368,26 +369,29 @@ static int gcm_encrypt(struct aead_request *req)
 		pmull_gcm_encrypt_block(ks, iv, NULL,
 					num_rounds(&ctx->aes_key));
 		put_unaligned_be32(3, iv + GCM_IV_SIZE);
+		kernel_neon_end();
 
-		err = skcipher_walk_aead_encrypt(&walk, req, true);
+		err = skcipher_walk_aead_encrypt(&walk, req, false);
 
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
 
+			kernel_neon_begin();
 			pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
 					  walk.src.virt.addr, &ctx->ghash_key,
-					  iv, num_rounds(&ctx->aes_key), ks);
+					  iv, ctx->aes_key.key_enc,
+					  num_rounds(&ctx->aes_key), ks);
+			kernel_neon_end();
 
 			err = skcipher_walk_done(&walk,
 						 walk.nbytes % AES_BLOCK_SIZE);
 		}
-		kernel_neon_end();
 	} else {
 		__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
 				    num_rounds(&ctx->aes_key));
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
-		err = skcipher_walk_aead_encrypt(&walk, req, true);
+		err = skcipher_walk_aead_encrypt(&walk, req, false);
 
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -467,15 +471,19 @@ static int gcm_decrypt(struct aead_request *req)
 		pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
 					num_rounds(&ctx->aes_key));
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
+		kernel_neon_end();
 
-		err = skcipher_walk_aead_decrypt(&walk, req, true);
+		err = skcipher_walk_aead_decrypt(&walk, req, false);
 
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
 
+			kernel_neon_begin();
 			pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
 					  walk.src.virt.addr, &ctx->ghash_key,
-					  iv, num_rounds(&ctx->aes_key));
+					  iv, ctx->aes_key.key_enc,
+					  num_rounds(&ctx->aes_key));
+			kernel_neon_end();
 
 			err = skcipher_walk_done(&walk,
 						 walk.nbytes % AES_BLOCK_SIZE);
@@ -483,14 +491,12 @@ static int gcm_decrypt(struct aead_request *req)
 		if (walk.nbytes)
 			pmull_gcm_encrypt_block(iv, iv, NULL,
 						num_rounds(&ctx->aes_key));
-
-		kernel_neon_end();
 	} else {
 		__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
 				    num_rounds(&ctx->aes_key));
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
-		err = skcipher_walk_aead_decrypt(&walk, req, true);
+		err = skcipher_walk_aead_decrypt(&walk, req, false);
 
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 4604985..78eb35f 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -69,30 +69,36 @@
 	 *			  int blocks)
 	 */
 ENTRY(sha1_ce_transform)
+	frame_push	3
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+
 	/* load round constants */
-	loadrc		k0.4s, 0x5a827999, w6
+0:	loadrc		k0.4s, 0x5a827999, w6
 	loadrc		k1.4s, 0x6ed9eba1, w6
 	loadrc		k2.4s, 0x8f1bbcdc, w6
 	loadrc		k3.4s, 0xca62c1d6, w6
 
 	/* load state */
-	ld1		{dgav.4s}, [x0]
-	ldr		dgb, [x0, #16]
+	ld1		{dgav.4s}, [x19]
+	ldr		dgb, [x19, #16]
 
 	/* load sha1_ce_state::finalize */
 	ldr_l		w4, sha1_ce_offsetof_finalize, x4
-	ldr		w4, [x0, x4]
+	ldr		w4, [x19, x4]
 
 	/* load input */
-0:	ld1		{v8.4s-v11.4s}, [x1], #64
-	sub		w2, w2, #1
+1:	ld1		{v8.4s-v11.4s}, [x20], #64
+	sub		w21, w21, #1
 
 CPU_LE(	rev32		v8.16b, v8.16b		)
 CPU_LE(	rev32		v9.16b, v9.16b		)
 CPU_LE(	rev32		v10.16b, v10.16b	)
 CPU_LE(	rev32		v11.16b, v11.16b	)
 
-1:	add		t0.4s, v8.4s, k0.4s
+2:	add		t0.4s, v8.4s, k0.4s
 	mov		dg0v.16b, dgav.16b
 
 	add_update	c, ev, k0,  8,  9, 10, 11, dgb
@@ -123,16 +129,25 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	add		dgbv.2s, dgbv.2s, dg1v.2s
 	add		dgav.4s, dgav.4s, dg0v.4s
 
-	cbnz		w2, 0b
+	cbz		w21, 3f
+
+	if_will_cond_yield_neon
+	st1		{dgav.4s}, [x19]
+	str		dgb, [x19, #16]
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
+
+	b		1b
 
 	/*
 	 * Final block: add padding and total bit count.
 	 * Skip if the input size was not a round multiple of the block size,
 	 * the padding is handled by the C code in that case.
 	 */
-	cbz		x4, 3f
+3:	cbz		x4, 4f
 	ldr_l		w4, sha1_ce_offsetof_count, x4
-	ldr		x4, [x0, x4]
+	ldr		x4, [x19, x4]
 	movi		v9.2d, #0
 	mov		x8, #0x80000000
 	movi		v10.2d, #0
@@ -141,10 +156,11 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	mov		x4, #0
 	mov		v11.d[0], xzr
 	mov		v11.d[1], x7
-	b		1b
+	b		2b
 
 	/* store new state */
-3:	st1		{dgav.4s}, [x0]
-	str		dgb, [x0, #16]
+4:	st1		{dgav.4s}, [x19]
+	str		dgb, [x19, #16]
+	frame_pop
 	ret
 ENDPROC(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 4c3c89b..cd8b364 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -79,30 +79,36 @@
 	 */
 	.text
 ENTRY(sha2_ce_transform)
+	frame_push	3
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+
 	/* load round constants */
-	adr_l		x8, .Lsha2_rcon
+0:	adr_l		x8, .Lsha2_rcon
 	ld1		{ v0.4s- v3.4s}, [x8], #64
 	ld1		{ v4.4s- v7.4s}, [x8], #64
 	ld1		{ v8.4s-v11.4s}, [x8], #64
 	ld1		{v12.4s-v15.4s}, [x8]
 
 	/* load state */
-	ld1		{dgav.4s, dgbv.4s}, [x0]
+	ld1		{dgav.4s, dgbv.4s}, [x19]
 
 	/* load sha256_ce_state::finalize */
 	ldr_l		w4, sha256_ce_offsetof_finalize, x4
-	ldr		w4, [x0, x4]
+	ldr		w4, [x19, x4]
 
 	/* load input */
-0:	ld1		{v16.4s-v19.4s}, [x1], #64
-	sub		w2, w2, #1
+1:	ld1		{v16.4s-v19.4s}, [x20], #64
+	sub		w21, w21, #1
 
 CPU_LE(	rev32		v16.16b, v16.16b	)
 CPU_LE(	rev32		v17.16b, v17.16b	)
 CPU_LE(	rev32		v18.16b, v18.16b	)
 CPU_LE(	rev32		v19.16b, v19.16b	)
 
-1:	add		t0.4s, v16.4s, v0.4s
+2:	add		t0.4s, v16.4s, v0.4s
 	mov		dg0v.16b, dgav.16b
 	mov		dg1v.16b, dgbv.16b
 
@@ -131,16 +137,24 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
 	/* handled all input blocks? */
-	cbnz		w2, 0b
+	cbz		w21, 3f
+
+	if_will_cond_yield_neon
+	st1		{dgav.4s, dgbv.4s}, [x19]
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
+
+	b		1b
 
 	/*
 	 * Final block: add padding and total bit count.
 	 * Skip if the input size was not a round multiple of the block size,
 	 * the padding is handled by the C code in that case.
 	 */
-	cbz		x4, 3f
+3:	cbz		x4, 4f
 	ldr_l		w4, sha256_ce_offsetof_count, x4
-	ldr		x4, [x0, x4]
+	ldr		x4, [x19, x4]
 	movi		v17.2d, #0
 	mov		x8, #0x80000000
 	movi		v18.2d, #0
@@ -149,9 +163,10 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	mov		x4, #0
 	mov		v19.d[0], xzr
 	mov		v19.d[1], x7
-	b		1b
+	b		2b
 
 	/* store new state */
-3:	st1		{dgav.4s, dgbv.4s}, [x0]
+4:	st1		{dgav.4s, dgbv.4s}, [x19]
+	frame_pop
 	ret
 ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped
index 3ce82cc..7c7ce2e 100644
--- a/arch/arm64/crypto/sha256-core.S_shipped
+++ b/arch/arm64/crypto/sha256-core.S_shipped
@@ -1,3 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// This code is taken from the OpenSSL project but the author (Andy Polyakov)
+// has relicensed it under the GPLv2. Therefore this program is free software;
+// you can redistribute it and/or modify it under the terms of the GNU General
+// Public License version 2 as published by the Free Software Foundation.
+//
+// The original headers, including the original license headers, are
+// included below for completeness.
+
 // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
 //
 // Licensed under the OpenSSL license (the "License").  You may not use
@@ -10,8 +20,6 @@
 // project. The module is, however, dual licensed under OpenSSL and
 // CRYPTOGAMS licenses depending on where you obtain it. For further
 // details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
 // ====================================================================
 //
 // SHA256/512 for ARMv8.
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
index 332ad75..a7d587f 100644
--- a/arch/arm64/crypto/sha3-ce-core.S
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -41,9 +41,16 @@
 	 */
 	.text
 ENTRY(sha3_ce_transform)
-	/* load state */
-	add	x8, x0, #32
-	ld1	{ v0.1d- v3.1d}, [x0]
+	frame_push	4
+
+	mov	x19, x0
+	mov	x20, x1
+	mov	x21, x2
+	mov	x22, x3
+
+0:	/* load state */
+	add	x8, x19, #32
+	ld1	{ v0.1d- v3.1d}, [x19]
 	ld1	{ v4.1d- v7.1d}, [x8], #32
 	ld1	{ v8.1d-v11.1d}, [x8], #32
 	ld1	{v12.1d-v15.1d}, [x8], #32
@@ -51,13 +58,13 @@ ENTRY(sha3_ce_transform)
 	ld1	{v20.1d-v23.1d}, [x8], #32
 	ld1	{v24.1d}, [x8]
 
-0:	sub	w2, w2, #1
+1:	sub	w21, w21, #1
 	mov	w8, #24
 	adr_l	x9, .Lsha3_rcon
 
 	/* load input */
-	ld1	{v25.8b-v28.8b}, [x1], #32
-	ld1	{v29.8b-v31.8b}, [x1], #24
+	ld1	{v25.8b-v28.8b}, [x20], #32
+	ld1	{v29.8b-v31.8b}, [x20], #24
 	eor	v0.8b, v0.8b, v25.8b
 	eor	v1.8b, v1.8b, v26.8b
 	eor	v2.8b, v2.8b, v27.8b
@@ -66,10 +73,10 @@ ENTRY(sha3_ce_transform)
 	eor	v5.8b, v5.8b, v30.8b
 	eor	v6.8b, v6.8b, v31.8b
 
-	tbnz	x3, #6, 2f		// SHA3-512
+	tbnz	x22, #6, 3f		// SHA3-512
 
-	ld1	{v25.8b-v28.8b}, [x1], #32
-	ld1	{v29.8b-v30.8b}, [x1], #16
+	ld1	{v25.8b-v28.8b}, [x20], #32
+	ld1	{v29.8b-v30.8b}, [x20], #16
 	eor	 v7.8b,  v7.8b, v25.8b
 	eor	 v8.8b,  v8.8b, v26.8b
 	eor	 v9.8b,  v9.8b, v27.8b
@@ -77,34 +84,34 @@ ENTRY(sha3_ce_transform)
 	eor	v11.8b, v11.8b, v29.8b
 	eor	v12.8b, v12.8b, v30.8b
 
-	tbnz	x3, #4, 1f		// SHA3-384 or SHA3-224
+	tbnz	x22, #4, 2f		// SHA3-384 or SHA3-224
 
 	// SHA3-256
-	ld1	{v25.8b-v28.8b}, [x1], #32
+	ld1	{v25.8b-v28.8b}, [x20], #32
 	eor	v13.8b, v13.8b, v25.8b
 	eor	v14.8b, v14.8b, v26.8b
 	eor	v15.8b, v15.8b, v27.8b
 	eor	v16.8b, v16.8b, v28.8b
-	b	3f
+	b	4f
 
-1:	tbz	x3, #2, 3f		// bit 2 cleared? SHA-384
+2:	tbz	x22, #2, 4f		// bit 2 cleared? SHA-384
 
 	// SHA3-224
-	ld1	{v25.8b-v28.8b}, [x1], #32
-	ld1	{v29.8b}, [x1], #8
+	ld1	{v25.8b-v28.8b}, [x20], #32
+	ld1	{v29.8b}, [x20], #8
 	eor	v13.8b, v13.8b, v25.8b
 	eor	v14.8b, v14.8b, v26.8b
 	eor	v15.8b, v15.8b, v27.8b
 	eor	v16.8b, v16.8b, v28.8b
 	eor	v17.8b, v17.8b, v29.8b
-	b	3f
+	b	4f
 
 	// SHA3-512
-2:	ld1	{v25.8b-v26.8b}, [x1], #16
+3:	ld1	{v25.8b-v26.8b}, [x20], #16
 	eor	 v7.8b,  v7.8b, v25.8b
 	eor	 v8.8b,  v8.8b, v26.8b
 
-3:	sub	w8, w8, #1
+4:	sub	w8, w8, #1
 
 	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
 	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
@@ -183,17 +190,33 @@ ENTRY(sha3_ce_transform)
 
 	eor	 v0.16b,  v0.16b, v31.16b
 
-	cbnz	w8, 3b
-	cbnz	w2, 0b
+	cbnz	w8, 4b
+	cbz	w21, 5f
+
+	if_will_cond_yield_neon
+	add	x8, x19, #32
+	st1	{ v0.1d- v3.1d}, [x19]
+	st1	{ v4.1d- v7.1d}, [x8], #32
+	st1	{ v8.1d-v11.1d}, [x8], #32
+	st1	{v12.1d-v15.1d}, [x8], #32
+	st1	{v16.1d-v19.1d}, [x8], #32
+	st1	{v20.1d-v23.1d}, [x8], #32
+	st1	{v24.1d}, [x8]
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
+
+	b	1b
 
 	/* save state */
-	st1	{ v0.1d- v3.1d}, [x0], #32
-	st1	{ v4.1d- v7.1d}, [x0], #32
-	st1	{ v8.1d-v11.1d}, [x0], #32
-	st1	{v12.1d-v15.1d}, [x0], #32
-	st1	{v16.1d-v19.1d}, [x0], #32
-	st1	{v20.1d-v23.1d}, [x0], #32
-	st1	{v24.1d}, [x0]
+5:	st1	{ v0.1d- v3.1d}, [x19], #32
+	st1	{ v4.1d- v7.1d}, [x19], #32
+	st1	{ v8.1d-v11.1d}, [x19], #32
+	st1	{v12.1d-v15.1d}, [x19], #32
+	st1	{v16.1d-v19.1d}, [x19], #32
+	st1	{v20.1d-v23.1d}, [x19], #32
+	st1	{v24.1d}, [x19]
+	frame_pop
 	ret
 ENDPROC(sha3_ce_transform)
 
diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl
index c55efb3..2d8655d 100644
--- a/arch/arm64/crypto/sha512-armv8.pl
+++ b/arch/arm64/crypto/sha512-armv8.pl
@@ -1,4 +1,14 @@
 #! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
@@ -11,8 +21,6 @@
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
-#
-# Permission to use under GPLv2 terms is granted.
 # ====================================================================
 #
 # SHA256/512 for ARMv8.
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
index 7f3bca5..ce65e3a 100644
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ b/arch/arm64/crypto/sha512-ce-core.S
@@ -107,17 +107,23 @@
 	 */
 	.text
 ENTRY(sha512_ce_transform)
+	frame_push	3
+
+	mov		x19, x0
+	mov		x20, x1
+	mov		x21, x2
+
 	/* load state */
-	ld1		{v8.2d-v11.2d}, [x0]
+0:	ld1		{v8.2d-v11.2d}, [x19]
 
 	/* load first 4 round constants */
 	adr_l		x3, .Lsha512_rcon
 	ld1		{v20.2d-v23.2d}, [x3], #64
 
 	/* load input */
-0:	ld1		{v12.2d-v15.2d}, [x1], #64
-	ld1		{v16.2d-v19.2d}, [x1], #64
-	sub		w2, w2, #1
+1:	ld1		{v12.2d-v15.2d}, [x20], #64
+	ld1		{v16.2d-v19.2d}, [x20], #64
+	sub		w21, w21, #1
 
 CPU_LE(	rev64		v12.16b, v12.16b	)
 CPU_LE(	rev64		v13.16b, v13.16b	)
@@ -196,9 +202,18 @@ CPU_LE(	rev64		v19.16b, v19.16b	)
 	add		v11.2d, v11.2d, v3.2d
 
 	/* handled all input blocks? */
-	cbnz		w2, 0b
+	cbz		w21, 3f
+
+	if_will_cond_yield_neon
+	st1		{v8.2d-v11.2d}, [x19]
+	do_cond_yield_neon
+	b		0b
+	endif_yield_neon
+
+	b		1b
 
 	/* store new state */
-3:	st1		{v8.2d-v11.2d}, [x0]
+3:	st1		{v8.2d-v11.2d}, [x19]
+	frame_pop
 	ret
 ENDPROC(sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped
index bd0f59f..e063a61 100644
--- a/arch/arm64/crypto/sha512-core.S_shipped
+++ b/arch/arm64/crypto/sha512-core.S_shipped
@@ -1,3 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// This code is taken from the OpenSSL project but the author (Andy Polyakov)
+// has relicensed it under the GPLv2. Therefore this program is free software;
+// you can redistribute it and/or modify it under the terms of the GNU General
+// Public License version 2 as published by the Free Software Foundation.
+//
+// The original headers, including the original license headers, are
+// included below for completeness.
+
 // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
 //
 // Licensed under the OpenSSL license (the "License").  You may not use
@@ -10,8 +20,6 @@
 // project. The module is, however, dual licensed under OpenSSL and
 // CRYPTOGAMS licenses depending on where you obtain it. For further
 // details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
 // ====================================================================
 //
 // SHA256/512 for ARMv8.
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
new file mode 100644
index 0000000..af3bfbc
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.irp		b, 0, 1, 2, 3, 4, 5, 6, 7, 8
+	.set		.Lv\b\().4s, \b
+	.endr
+
+	.macro		sm4e, rd, rn
+	.inst		0xcec08400 | .L\rd | (.L\rn << 5)
+	.endm
+
+	/*
+	 * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
+	 */
+	.text
+ENTRY(sm4_ce_do_crypt)
+	ld1		{v8.4s}, [x2]
+	ld1		{v0.4s-v3.4s}, [x0], #64
+CPU_LE(	rev32		v8.16b, v8.16b		)
+	ld1		{v4.4s-v7.4s}, [x0]
+	sm4e		v8.4s, v0.4s
+	sm4e		v8.4s, v1.4s
+	sm4e		v8.4s, v2.4s
+	sm4e		v8.4s, v3.4s
+	sm4e		v8.4s, v4.4s
+	sm4e		v8.4s, v5.4s
+	sm4e		v8.4s, v6.4s
+	sm4e		v8.4s, v7.4s
+	rev64		v8.4s, v8.4s
+	ext		v8.16b, v8.16b, v8.16b, #8
+CPU_LE(	rev32		v8.16b, v8.16b		)
+	st1		{v8.4s}, [x1]
+	ret
+ENDPROC(sm4_ce_do_crypt)
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
new file mode 100644
index 0000000..b7fb527
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/sm4.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/types.h>
+
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+
+static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (!may_use_simd()) {
+		crypto_sm4_encrypt(tfm, out, in);
+	} else {
+		kernel_neon_begin();
+		sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+		kernel_neon_end();
+	}
+}
+
+static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (!may_use_simd()) {
+		crypto_sm4_decrypt(tfm, out, in);
+	} else {
+		kernel_neon_begin();
+		sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+		kernel_neon_end();
+	}
+}
+
+static struct crypto_alg sm4_ce_alg = {
+	.cra_name			= "sm4",
+	.cra_driver_name		= "sm4-ce",
+	.cra_priority			= 200,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= SM4_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_sm4_ctx),
+	.cra_module			= THIS_MODULE,
+	.cra_u.cipher = {
+		.cia_min_keysize	= SM4_KEY_SIZE,
+		.cia_max_keysize	= SM4_KEY_SIZE,
+		.cia_setkey		= crypto_sm4_set_key,
+		.cia_encrypt		= sm4_ce_encrypt,
+		.cia_decrypt		= sm4_ce_decrypt
+	}
+};
+
+static int __init sm4_ce_mod_init(void)
+{
+	return crypto_register_alg(&sm4_ce_alg);
+}
+
+static void __exit sm4_ce_mod_fini(void)
+{
+	crypto_unregister_alg(&sm4_ce_alg);
+}
+
+module_cpu_feature_match(SM3, sm4_ce_mod_init);
+module_exit(sm4_ce_mod_fini);
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 32f465a..0db62a4 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -86,6 +86,10 @@ static inline bool acpi_has_cpu_in_madt(void)
 }
 
 struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu);
+static inline u32 get_acpi_id_for_cpu(unsigned int cpu)
+{
+	return	acpi_cpu_get_madt_gicc(cpu)->uid;
+}
 
 static inline void arch_fix_phys_package_id(int num, u32 slot) { }
 void __init acpi_init_cpus(void);
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 9ef0797..f9b0b09 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -117,7 +117,7 @@ static inline void atomic_and(int i, atomic_t *v)
 	/* LSE atomics */
 	"	mvn	%w[i], %w[i]\n"
 	"	stclr	%w[i], %[v]")
-	: [i] "+r" (w0), [v] "+Q" (v->counter)
+	: [i] "+&r" (w0), [v] "+Q" (v->counter)
 	: "r" (x1)
 	: __LL_SC_CLOBBERS);
 }
@@ -135,7 +135,7 @@ static inline int atomic_fetch_and##name(int i, atomic_t *v)		\
 	/* LSE atomics */						\
 	"	mvn	%w[i], %w[i]\n"					\
 	"	ldclr" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS, ##cl);					\
 									\
@@ -161,7 +161,7 @@ static inline void atomic_sub(int i, atomic_t *v)
 	/* LSE atomics */
 	"	neg	%w[i], %w[i]\n"
 	"	stadd	%w[i], %[v]")
-	: [i] "+r" (w0), [v] "+Q" (v->counter)
+	: [i] "+&r" (w0), [v] "+Q" (v->counter)
 	: "r" (x1)
 	: __LL_SC_CLOBBERS);
 }
@@ -180,7 +180,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v)		\
 	"	neg	%w[i], %w[i]\n"					\
 	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
 	"	add	%w[i], %w[i], w30")				\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS , ##cl);					\
 									\
@@ -207,7 +207,7 @@ static inline int atomic_fetch_sub##name(int i, atomic_t *v)		\
 	/* LSE atomics */						\
 	"	neg	%w[i], %w[i]\n"					\
 	"	ldadd" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS, ##cl);					\
 									\
@@ -314,7 +314,7 @@ static inline void atomic64_and(long i, atomic64_t *v)
 	/* LSE atomics */
 	"	mvn	%[i], %[i]\n"
 	"	stclr	%[i], %[v]")
-	: [i] "+r" (x0), [v] "+Q" (v->counter)
+	: [i] "+&r" (x0), [v] "+Q" (v->counter)
 	: "r" (x1)
 	: __LL_SC_CLOBBERS);
 }
@@ -332,7 +332,7 @@ static inline long atomic64_fetch_and##name(long i, atomic64_t *v)	\
 	/* LSE atomics */						\
 	"	mvn	%[i], %[i]\n"					\
 	"	ldclr" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS, ##cl);					\
 									\
@@ -358,7 +358,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
 	/* LSE atomics */
 	"	neg	%[i], %[i]\n"
 	"	stadd	%[i], %[v]")
-	: [i] "+r" (x0), [v] "+Q" (v->counter)
+	: [i] "+&r" (x0), [v] "+Q" (v->counter)
 	: "r" (x1)
 	: __LL_SC_CLOBBERS);
 }
@@ -377,7 +377,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v)	\
 	"	neg	%[i], %[i]\n"					\
 	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
 	"	add	%[i], %[i], x30")				\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS, ##cl);					\
 									\
@@ -404,7 +404,7 @@ static inline long atomic64_fetch_sub##name(long i, atomic64_t *v)	\
 	/* LSE atomics */						\
 	"	neg	%[i], %[i]\n"					\
 	"	ldadd" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
+	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
 	: "r" (x1)							\
 	: __LL_SC_CLOBBERS, ##cl);					\
 									\
@@ -435,7 +435,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	"	sub	x30, x30, %[ret]\n"
 	"	cbnz	x30, 1b\n"
 	"2:")
-	: [ret] "+r" (x0), [v] "+Q" (v->counter)
+	: [ret] "+&r" (x0), [v] "+Q" (v->counter)
 	:
 	: __LL_SC_CLOBBERS, "cc", "memory");
 
@@ -516,7 +516,7 @@ static inline long __cmpxchg_double##name(unsigned long old1,		\
 	"	eor	%[old1], %[old1], %[oldval1]\n"			\
 	"	eor	%[old2], %[old2], %[oldval2]\n"			\
 	"	orr	%[old1], %[old1], %[old2]")			\
-	: [old1] "+r" (x0), [old2] "+r" (x1),				\
+	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
 	  [v] "+Q" (*(unsigned long *)ptr)				\
 	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
 	  [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)		\
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 9bbffc7..5df5cfe 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -33,7 +33,7 @@
 #define ICACHE_POLICY_VIPT	2
 #define ICACHE_POLICY_PIPT	3
 
-#define L1_CACHE_SHIFT		7
+#define L1_CACHE_SHIFT		(6)
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
 /*
@@ -43,7 +43,7 @@
  * cache before the transfer is done, causing old data to be seen by
  * the CPU.
  */
-#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	(128)
 
 #ifndef __ASSEMBLY__
 
@@ -77,7 +77,7 @@ static inline u32 cache_type_cwg(void)
 static inline int cache_line_size(void)
 {
 	u32 cwg = cache_type_cwg();
-	return cwg ? 4 << cwg : L1_CACHE_BYTES;
+	return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
 }
 
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 4f5fd2a..3b09382 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -204,7 +204,9 @@ static inline void __cmpwait_case_##name(volatile void *ptr,		\
 	unsigned long tmp;						\
 									\
 	asm volatile(							\
-	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"		\
+	"	sevl\n"							\
+	"	wfe\n"							\
+	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
 	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
 	"	cbnz	%" #w "[tmp], 1f\n"				\
 	"	wfe\n"							\
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index c00c62e..1a037b9 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -34,7 +34,6 @@
 
 typedef u32		compat_size_t;
 typedef s32		compat_ssize_t;
-typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
 typedef u16		__compat_uid_t;
@@ -66,16 +65,6 @@ typedef u32		compat_ulong_t;
 typedef u64		compat_u64;
 typedef u32		compat_uptr_t;
 
-struct compat_timespec {
-	compat_time_t	tv_sec;
-	s32		tv_nsec;
-};
-
-struct compat_timeval {
-	compat_time_t	tv_sec;
-	s32		tv_usec;
-};
-
 struct compat_stat {
 #ifdef __AARCH64EB__
 	short		st_dev;
@@ -192,10 +181,10 @@ struct compat_ipc64_perm {
 
 struct compat_semid64_ds {
 	struct compat_ipc64_perm sem_perm;
-	compat_time_t  sem_otime;
-	compat_ulong_t __unused1;
-	compat_time_t  sem_ctime;
-	compat_ulong_t __unused2;
+	compat_ulong_t sem_otime;
+	compat_ulong_t sem_otime_high;
+	compat_ulong_t sem_ctime;
+	compat_ulong_t sem_ctime_high;
 	compat_ulong_t sem_nsems;
 	compat_ulong_t __unused3;
 	compat_ulong_t __unused4;
@@ -203,12 +192,12 @@ struct compat_semid64_ds {
 
 struct compat_msqid64_ds {
 	struct compat_ipc64_perm msg_perm;
-	compat_time_t  msg_stime;
-	compat_ulong_t __unused1;
-	compat_time_t  msg_rtime;
-	compat_ulong_t __unused2;
-	compat_time_t  msg_ctime;
-	compat_ulong_t __unused3;
+	compat_ulong_t msg_stime;
+	compat_ulong_t msg_stime_high;
+	compat_ulong_t msg_rtime;
+	compat_ulong_t msg_rtime_high;
+	compat_ulong_t msg_ctime;
+	compat_ulong_t msg_ctime_high;
 	compat_ulong_t msg_cbytes;
 	compat_ulong_t msg_qnum;
 	compat_ulong_t msg_qbytes;
@@ -221,12 +210,12 @@ struct compat_msqid64_ds {
 struct compat_shmid64_ds {
 	struct compat_ipc64_perm shm_perm;
 	compat_size_t  shm_segsz;
-	compat_time_t  shm_atime;
-	compat_ulong_t __unused1;
-	compat_time_t  shm_dtime;
-	compat_ulong_t __unused2;
-	compat_time_t  shm_ctime;
-	compat_ulong_t __unused3;
+	compat_ulong_t shm_atime;
+	compat_ulong_t shm_atime_high;
+	compat_ulong_t shm_dtime;
+	compat_ulong_t shm_dtime_high;
+	compat_ulong_t shm_ctime;
+	compat_ulong_t shm_ctime_high;
 	compat_pid_t   shm_cpid;
 	compat_pid_t   shm_lpid;
 	compat_ulong_t shm_nattch;
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index bc51b72..8a699c7 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -48,7 +48,8 @@
 #define ARM64_HAS_CACHE_IDC			27
 #define ARM64_HAS_CACHE_DIC			28
 #define ARM64_HW_DBM				29
+#define ARM64_SSBD				30
 
-#define ARM64_NCAPS				30
+#define ARM64_NCAPS				31
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 09b0f2a..55bc1f0 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -537,6 +537,28 @@ static inline u64 read_zcr_features(void)
 	return zcr;
 }
 
+#define ARM64_SSBD_UNKNOWN		-1
+#define ARM64_SSBD_FORCE_DISABLE	0
+#define ARM64_SSBD_KERNEL		1
+#define ARM64_SSBD_FORCE_ENABLE		2
+#define ARM64_SSBD_MITIGATED		3
+
+static inline int arm64_get_ssbd_state(void)
+{
+#ifdef CONFIG_ARM64_SSBD
+	extern int ssbd_state;
+	return ssbd_state;
+#else
+	return ARM64_SSBD_UNKNOWN;
+#endif
+}
+
+#ifdef CONFIG_ARM64_SSBD
+void arm64_set_ssbd_mitigation(bool state);
+#else
+static inline void arm64_set_ssbd_mitigation(bool state) {}
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index fac1c4d..433b955 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -121,6 +121,9 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/bug.h>
+#include <asm/processor.h> /* for signal_minsigstksz, used by ARCH_DLINFO */
+
 typedef unsigned long elf_greg_t;
 
 #define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
@@ -148,6 +151,16 @@ typedef struct user_fpsimd_state elf_fpregset_t;
 do {									\
 	NEW_AUX_ENT(AT_SYSINFO_EHDR,					\
 		    (elf_addr_t)current->mm->context.vdso);		\
+									\
+	/*								\
+	 * Should always be nonzero unless there's a kernel bug.	\
+	 * If we haven't determined a sensible value to give to		\
+	 * userspace, omit the entry:					\
+	 */								\
+	if (likely(signal_minsigstksz))					\
+		NEW_AUX_ENT(AT_MINSIGSTKSZ, signal_minsigstksz);	\
+	else								\
+		NEW_AUX_ENT(AT_IGNORE, 0);				\
 } while (0)
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index e050d76..4684351 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -207,12 +207,14 @@
 		str		w\nxtmp, [\xpfpsr, #4]
 .endm
 
-.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp
+.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
 		mrs_s		x\nxtmp, SYS_ZCR_EL1
-		bic		x\nxtmp, x\nxtmp, ZCR_ELx_LEN_MASK
-		orr		x\nxtmp, x\nxtmp, \xvqminus1
-		msr_s		SYS_ZCR_EL1, x\nxtmp	// self-synchronising
-
+		bic		\xtmp2, x\nxtmp, ZCR_ELx_LEN_MASK
+		orr		\xtmp2, \xtmp2, \xvqminus1
+		cmp		\xtmp2, x\nxtmp
+		b.eq		921f
+		msr_s		SYS_ZCR_EL1, \xtmp2	// self-synchronising
+921:
  _for n, 0, 31,	_sve_ldr_v	\n, \nxbase, \n - 34
 		_sve_ldr_p	0, \nxbase
 		_sve_wrffr	0
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index f6648a3..951b207 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -20,6 +20,9 @@
 
 #include <asm/virt.h>
 
+#define	VCPU_WORKAROUND_2_FLAG_SHIFT	0
+#define	VCPU_WORKAROUND_2_FLAG		(_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT)
+
 #define ARM_EXIT_WITH_SERROR_BIT  31
 #define ARM_EXCEPTION_CODE(x)	  ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
 #define ARM_SERROR_PENDING(x)	  !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT))
@@ -71,14 +74,37 @@ extern u32 __kvm_get_mdcr_el2(void);
 
 extern u32 __init_stage2_translation(void);
 
+/* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
+#define __hyp_this_cpu_ptr(sym)						\
+	({								\
+		void *__ptr = hyp_symbol_addr(sym);			\
+		__ptr += read_sysreg(tpidr_el2);			\
+		(typeof(&sym))__ptr;					\
+	 })
+
+#define __hyp_this_cpu_read(sym)					\
+	({								\
+		*__hyp_this_cpu_ptr(sym);				\
+	 })
+
 #else /* __ASSEMBLY__ */
 
-.macro get_host_ctxt reg, tmp
-	adr_l	\reg, kvm_host_cpu_state
+.macro hyp_adr_this_cpu reg, sym, tmp
+	adr_l	\reg, \sym
 	mrs	\tmp, tpidr_el2
 	add	\reg, \reg, \tmp
 .endm
 
+.macro hyp_ldr_this_cpu reg, sym, tmp
+	adr_l	\reg, \sym
+	mrs	\tmp, tpidr_el2
+	ldr	\reg,  [\reg, \tmp]
+.endm
+
+.macro get_host_ctxt reg, tmp
+	hyp_adr_this_cpu \reg, kvm_host_cpu_state, \tmp
+.endm
+
 .macro get_vcpu_ptr vcpu, ctxt
 	get_host_ctxt \ctxt, \vcpu
 	ldr	\vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 469de8a..95d8a0e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -216,6 +216,9 @@ struct kvm_vcpu_arch {
 	/* Exception Information */
 	struct kvm_vcpu_fault_info fault;
 
+	/* State of various workarounds, see kvm_asm.h for bit assignment */
+	u64 workaround_flags;
+
 	/* Guest debug state */
 	u64 debug_flags;
 
@@ -452,6 +455,29 @@ static inline bool kvm_arm_harden_branch_predictor(void)
 	return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
 }
 
+#define KVM_SSBD_UNKNOWN		-1
+#define KVM_SSBD_FORCE_DISABLE		0
+#define KVM_SSBD_KERNEL		1
+#define KVM_SSBD_FORCE_ENABLE		2
+#define KVM_SSBD_MITIGATED		3
+
+static inline int kvm_arm_have_ssbd(void)
+{
+	switch (arm64_get_ssbd_state()) {
+	case ARM64_SSBD_FORCE_DISABLE:
+		return KVM_SSBD_FORCE_DISABLE;
+	case ARM64_SSBD_KERNEL:
+		return KVM_SSBD_KERNEL;
+	case ARM64_SSBD_FORCE_ENABLE:
+		return KVM_SSBD_FORCE_ENABLE;
+	case ARM64_SSBD_MITIGATED:
+		return KVM_SSBD_MITIGATED;
+	case ARM64_SSBD_UNKNOWN:
+	default:
+		return KVM_SSBD_UNKNOWN;
+	}
+}
+
 void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0821109..fb9a712 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -72,7 +72,6 @@
 #ifdef __ASSEMBLY__
 
 #include <asm/alternative.h>
-#include <asm/cpufeature.h>
 
 /*
  * Convert a kernel VA into a HYP VA.
@@ -360,6 +359,22 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
 }
 
+/*
+ * We are not in the kvm->srcu critical section most of the time, so we take
+ * the SRCU read lock here. Since we copy the data from the user page, we
+ * can immediately drop the lock again.
+ */
+static inline int kvm_read_guest_lock(struct kvm *kvm,
+				      gpa_t gpa, void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_read_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
@@ -457,6 +472,30 @@ static inline int kvm_map_vectors(void)
 }
 #endif
 
+#ifdef CONFIG_ARM64_SSBD
+DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
+static inline int hyp_map_aux_data(void)
+{
+	int cpu, err;
+
+	for_each_possible_cpu(cpu) {
+		u64 *ptr;
+
+		ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
+		err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+#else
+static inline int hyp_map_aux_data(void)
+{
+	return 0;
+}
+#endif
+
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 8747f7c..9e69068 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -18,11 +18,6 @@
 #define pcibios_assign_all_busses() \
 	(pci_has_flag(PCI_REASSIGN_ALL_BUS))
 
-/*
- * PCI address space differs from physical memory address space
- */
-#define PCI_DMA_BUS_IS_PHYS	(0)
-
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 
 extern int isa_dma_bridge_buggy;
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7c4c8f3..9f82d6b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
 #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-#define __HAVE_ARCH_PTE_SPECIAL
-
 static inline pte_t pgd_pte(pgd_t pgd)
 {
 	return __pte(pgd_val(pgd));
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 7675989..65ab83e 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -35,6 +35,8 @@
 #ifdef __KERNEL__
 
 #include <linux/build_bug.h>
+#include <linux/cache.h>
+#include <linux/init.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
 
@@ -244,6 +246,9 @@ void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused);
 void cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused);
 void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused);
 
+extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
+extern void __init minsigstksz_setup(void);
+
 /* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */
 #define SVE_SET_VL(arg)	sve_set_current_vl(arg)
 #define SVE_GET_VL()	sve_get_current_vl()
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index ebdae15..26c5bd7 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -122,11 +122,6 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	/*
-	 * Ensure prior spin_lock operations to other locks have completed
-	 * on this CPU before we test whether "lock" is locked.
-	 */
-	smp_mb(); /* ^^^ */
 	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h
index 15e3559..eab7380 100644
--- a/arch/arm64/include/asm/stat.h
+++ b/arch/arm64/include/asm/stat.h
@@ -20,6 +20,7 @@
 
 #ifdef CONFIG_COMPAT
 
+#include <linux/compat_time.h>
 #include <asm/compat.h>
 
 /*
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 740aa03c..cbcf11b 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -94,6 +94,7 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define TIF_32BIT		22	/* 32bit process */
 #define TIF_SVE			23	/* Scalable Vector Extension in use */
 #define TIF_SVE_VL_INHERIT	24	/* Inherit sve_vl_onexec across exec */
+#define TIF_SSBD		25	/* Wants SSB mitigation */
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index c4f2d50..df48212 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -7,14 +7,16 @@
 struct cpu_topology {
 	int thread_id;
 	int core_id;
-	int cluster_id;
+	int package_id;
+	int llc_id;
 	cpumask_t thread_sibling;
 	cpumask_t core_sibling;
+	cpumask_t llc_siblings;
 };
 
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
-#define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
+#define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
 #define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
diff --git a/arch/arm64/include/uapi/asm/auxvec.h b/arch/arm64/include/uapi/asm/auxvec.h
index ec0a86d..743c0b8 100644
--- a/arch/arm64/include/uapi/asm/auxvec.h
+++ b/arch/arm64/include/uapi/asm/auxvec.h
@@ -19,7 +19,8 @@
 
 /* vDSO location */
 #define AT_SYSINFO_EHDR	33
+#define AT_MINSIGSTKSZ	51	/* stack needed for signal delivery */
 
-#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */
+#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
 
 #endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index bf825f3..0025f869 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -54,6 +54,7 @@ arm64-obj-$(CONFIG_ARM64_RELOC_TEST)	+= arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
+arm64-obj-$(CONFIG_ARM64_SSBD)		+= ssbd.o
 
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 66be504..d894a20 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -75,3 +75,11 @@ NOKPROBE_SYMBOL(_mcount);
 	/* arm-smccc */
 EXPORT_SYMBOL(__arm_smccc_smc);
 EXPORT_SYMBOL(__arm_smccc_hvc);
+
+	/* tishift.S */
+extern long long __ashlti3(long long a, int b);
+EXPORT_SYMBOL(__ashlti3);
+extern long long __ashrti3(long long a, int b);
+EXPORT_SYMBOL(__ashrti3);
+extern long long __lshrti3(long long a, int b);
+EXPORT_SYMBOL(__lshrti3);
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 6e47fc3..97d45d5 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
+#include <linux/uaccess.h>
 
 #include <asm/cpufeature.h>
 #include <asm/insn.h>
@@ -20,8 +21,6 @@
 #include <asm/system_misc.h>
 #include <asm/traps.h>
 #include <asm/kprobes.h>
-#include <linux/uaccess.h>
-#include <asm/cpufeature.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace-events-emulation.h"
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 5bdda65..323aeb5 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -136,6 +136,7 @@ int main(void)
 #ifdef CONFIG_KVM_ARM_HOST
   DEFINE(VCPU_CONTEXT,		offsetof(struct kvm_vcpu, arch.ctxt));
   DEFINE(VCPU_FAULT_DISR,	offsetof(struct kvm_vcpu, arch.fault.disr_el1));
+  DEFINE(VCPU_WORKAROUND_FLAGS,	offsetof(struct kvm_vcpu, arch.workaround_flags));
   DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
   DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
   DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 380f2e2..0bf0a83 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -17,6 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
 #include <linux/cacheinfo.h>
 #include <linux/of.h>
 
@@ -46,7 +47,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf,
 
 static int __init_cache_level(unsigned int cpu)
 {
-	unsigned int ctype, level, leaves, of_level;
+	unsigned int ctype, level, leaves, fw_level;
 	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
 
 	for (level = 1, leaves = 0; level <= MAX_CACHE_LEVEL; level++) {
@@ -59,15 +60,19 @@ static int __init_cache_level(unsigned int cpu)
 		leaves += (ctype == CACHE_TYPE_SEPARATE) ? 2 : 1;
 	}
 
-	of_level = of_find_last_cache_level(cpu);
-	if (level < of_level) {
+	if (acpi_disabled)
+		fw_level = of_find_last_cache_level(cpu);
+	else
+		fw_level = acpi_find_last_cache_level(cpu);
+
+	if (level < fw_level) {
 		/*
 		 * some external caches not specified in CLIDR_EL1
 		 * the information may be available in the device tree
 		 * only unified external caches are considered here
 		 */
-		leaves += (of_level - level);
-		level = of_level;
+		leaves += (fw_level - level);
+		level = fw_level;
 	}
 
 	this_cpu_ci->num_levels = level;
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index e4a1182..1d2b6d7 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -16,6 +16,8 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/arm-smccc.h>
+#include <linux/psci.h>
 #include <linux/types.h>
 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -232,6 +234,178 @@ enable_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry)
 }
 #endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
 
+#ifdef CONFIG_ARM64_SSBD
+DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
+int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
+
+static const struct ssbd_options {
+	const char	*str;
+	int		state;
+} ssbd_options[] = {
+	{ "force-on",	ARM64_SSBD_FORCE_ENABLE, },
+	{ "force-off",	ARM64_SSBD_FORCE_DISABLE, },
+	{ "kernel",	ARM64_SSBD_KERNEL, },
+};
+
+static int __init ssbd_cfg(char *buf)
+{
+	int i;
+
+	if (!buf || !buf[0])
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) {
+		int len = strlen(ssbd_options[i].str);
+
+		if (strncmp(buf, ssbd_options[i].str, len))
+			continue;
+
+		ssbd_state = ssbd_options[i].state;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("ssbd", ssbd_cfg);
+
+void __init arm64_update_smccc_conduit(struct alt_instr *alt,
+				       __le32 *origptr, __le32 *updptr,
+				       int nr_inst)
+{
+	u32 insn;
+
+	BUG_ON(nr_inst != 1);
+
+	switch (psci_ops.conduit) {
+	case PSCI_CONDUIT_HVC:
+		insn = aarch64_insn_get_hvc_value();
+		break;
+	case PSCI_CONDUIT_SMC:
+		insn = aarch64_insn_get_smc_value();
+		break;
+	default:
+		return;
+	}
+
+	*updptr = cpu_to_le32(insn);
+}
+
+void __init arm64_enable_wa2_handling(struct alt_instr *alt,
+				      __le32 *origptr, __le32 *updptr,
+				      int nr_inst)
+{
+	BUG_ON(nr_inst != 1);
+	/*
+	 * Only allow mitigation on EL1 entry/exit and guest
+	 * ARCH_WORKAROUND_2 handling if the SSBD state allows it to
+	 * be flipped.
+	 */
+	if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL)
+		*updptr = cpu_to_le32(aarch64_insn_gen_nop());
+}
+
+void arm64_set_ssbd_mitigation(bool state)
+{
+	switch (psci_ops.conduit) {
+	case PSCI_CONDUIT_HVC:
+		arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
+		break;
+
+	case PSCI_CONDUIT_SMC:
+		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
+		break;
+
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+}
+
+static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
+				    int scope)
+{
+	struct arm_smccc_res res;
+	bool required = true;
+	s32 val;
+
+	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+	if (psci_ops.smccc_version == SMCCC_VERSION_1_0) {
+		ssbd_state = ARM64_SSBD_UNKNOWN;
+		return false;
+	}
+
+	switch (psci_ops.conduit) {
+	case PSCI_CONDUIT_HVC:
+		arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+				  ARM_SMCCC_ARCH_WORKAROUND_2, &res);
+		break;
+
+	case PSCI_CONDUIT_SMC:
+		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+				  ARM_SMCCC_ARCH_WORKAROUND_2, &res);
+		break;
+
+	default:
+		ssbd_state = ARM64_SSBD_UNKNOWN;
+		return false;
+	}
+
+	val = (s32)res.a0;
+
+	switch (val) {
+	case SMCCC_RET_NOT_SUPPORTED:
+		ssbd_state = ARM64_SSBD_UNKNOWN;
+		return false;
+
+	case SMCCC_RET_NOT_REQUIRED:
+		pr_info_once("%s mitigation not required\n", entry->desc);
+		ssbd_state = ARM64_SSBD_MITIGATED;
+		return false;
+
+	case SMCCC_RET_SUCCESS:
+		required = true;
+		break;
+
+	case 1:	/* Mitigation not required on this CPU */
+		required = false;
+		break;
+
+	default:
+		WARN_ON(1);
+		return false;
+	}
+
+	switch (ssbd_state) {
+	case ARM64_SSBD_FORCE_DISABLE:
+		pr_info_once("%s disabled from command-line\n", entry->desc);
+		arm64_set_ssbd_mitigation(false);
+		required = false;
+		break;
+
+	case ARM64_SSBD_KERNEL:
+		if (required) {
+			__this_cpu_write(arm64_ssbd_callback_required, 1);
+			arm64_set_ssbd_mitigation(true);
+		}
+		break;
+
+	case ARM64_SSBD_FORCE_ENABLE:
+		pr_info_once("%s forced from command-line\n", entry->desc);
+		arm64_set_ssbd_mitigation(true);
+		required = true;
+		break;
+
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	return required;
+}
+#endif	/* CONFIG_ARM64_SSBD */
+
 #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
 	.matches = is_affected_midr_range,			\
 	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
@@ -488,6 +662,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		ERRATA_MIDR_RANGE_LIST(arm64_harden_el2_vectors),
 	},
 #endif
+#ifdef CONFIG_ARM64_SSBD
+	{
+		.desc = "Speculative Store Bypass Disable",
+		.capability = ARM64_SSBD,
+		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
+		.matches = has_ssbd_mitigation,
+	},
+#endif
 	{
 	}
 };
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9d1b06d..d2856b1 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1606,7 +1606,6 @@ static void __init setup_system_capabilities(void)
 void __init setup_cpu_features(void)
 {
 	u32 cwg;
-	int cls;
 
 	setup_system_capabilities();
 	mark_const_caps_ready();
@@ -1619,6 +1618,7 @@ void __init setup_cpu_features(void)
 		pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
 
 	sve_setup();
+	minsigstksz_setup();
 
 	/* Advertise that we have computed the system capabilities */
 	set_sys_caps_initialised();
@@ -1627,13 +1627,9 @@ void __init setup_cpu_features(void)
 	 * Check for sane CTR_EL0.CWG value.
 	 */
 	cwg = cache_type_cwg();
-	cls = cache_line_size();
 	if (!cwg)
-		pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
-			cls);
-	if (L1_CACHE_BYTES < cls)
-		pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
-			L1_CACHE_BYTES, cls);
+		pr_warn("No Cache Writeback Granule information, assuming %d\n",
+			ARCH_DMA_MINALIGN);
 }
 
 static bool __maybe_unused
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 73f17bf..12d4958 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -49,7 +49,7 @@ ENTRY(sve_save_state)
 ENDPROC(sve_save_state)
 
 ENTRY(sve_load_state)
-	sve_load 0, x1, x2, 3
+	sve_load 0, x1, x2, 3, x4
 	ret
 ENDPROC(sve_load_state)
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ec2ee72..28ad879 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -18,6 +18,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/arm-smccc.h>
 #include <linux/init.h>
 #include <linux/linkage.h>
 
@@ -137,6 +138,25 @@ alternative_else_nop_endif
 	add	\dst, \dst, #(\sym - .entry.tramp.text)
 	.endm
 
+	// This macro corrupts x0-x3. It is the caller's duty
+	// to save/restore them if required.
+	.macro	apply_ssbd, state, targ, tmp1, tmp2
+#ifdef CONFIG_ARM64_SSBD
+alternative_cb	arm64_enable_wa2_handling
+	b	\targ
+alternative_cb_end
+	ldr_this_cpu	\tmp2, arm64_ssbd_callback_required, \tmp1
+	cbz	\tmp2, \targ
+	ldr	\tmp2, [tsk, #TSK_TI_FLAGS]
+	tbnz	\tmp2, #TIF_SSBD, \targ
+	mov	w0, #ARM_SMCCC_ARCH_WORKAROUND_2
+	mov	w1, #\state
+alternative_cb	arm64_update_smccc_conduit
+	nop					// Patched to SMC/HVC #0
+alternative_cb_end
+#endif
+	.endm
+
 	.macro	kernel_entry, el, regsize = 64
 	.if	\regsize == 32
 	mov	w0, w0				// zero upper 32 bits of x0
@@ -163,6 +183,14 @@ alternative_else_nop_endif
 	ldr	x19, [tsk, #TSK_TI_FLAGS]	// since we can unmask debug
 	disable_step_tsk x19, x20		// exceptions when scheduling.
 
+	apply_ssbd 1, 1f, x22, x23
+
+#ifdef CONFIG_ARM64_SSBD
+	ldp	x0, x1, [sp, #16 * 0]
+	ldp	x2, x3, [sp, #16 * 1]
+#endif
+1:
+
 	mov	x29, xzr			// fp pointed to user-space
 	.else
 	add	x21, sp, #S_FRAME_SIZE
@@ -303,6 +331,8 @@ alternative_if ARM64_WORKAROUND_845719
 alternative_else_nop_endif
 #endif
 3:
+	apply_ssbd 0, 5f, x0, x1
+5:
 	.endif
 
 	msr	elr_el1, x21			// set up the return data
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 87a3536..3b527ae 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -31,7 +31,6 @@
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/preempt.h>
-#include <linux/prctl.h>
 #include <linux/ptrace.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
@@ -129,7 +128,7 @@ static int sve_default_vl = -1;
 #ifdef CONFIG_ARM64_SVE
 
 /* Maximum supported vector length across all CPUs (initially poisoned) */
-int __ro_after_init sve_max_vl = -1;
+int __ro_after_init sve_max_vl = SVE_VL_MIN;
 /* Set of available vector lengths, as vq_to_bit(vq): */
 static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
 static void __percpu *efi_sve_state;
@@ -360,22 +359,13 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write,
 		return ret;
 
 	/* Writing -1 has the special meaning "set to max": */
-	if (vl == -1) {
-		/* Fail safe if sve_max_vl wasn't initialised */
-		if (WARN_ON(!sve_vl_valid(sve_max_vl)))
-			vl = SVE_VL_MIN;
-		else
-			vl = sve_max_vl;
-
-		goto chosen;
-	}
+	if (vl == -1)
+		vl = sve_max_vl;
 
 	if (!sve_vl_valid(vl))
 		return -EINVAL;
 
-	vl = find_supported_vector_length(vl);
-chosen:
-	sve_default_vl = vl;
+	sve_default_vl = find_supported_vector_length(vl);
 	return 0;
 }
 
@@ -882,7 +872,7 @@ asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 			si_code = FPE_FLTRES;
 	}
 
-	memset(&info, 0, sizeof(info));
+	clear_siginfo(&info);
 	info.si_signo = SIGFPE;
 	info.si_code = si_code;
 	info.si_addr = (void __user *)instruction_pointer(regs);
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 1ec5f28..6b2686d 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -313,6 +313,17 @@ int swsusp_arch_suspend(void)
 
 		sleep_cpu = -EINVAL;
 		__cpu_suspend_exit();
+
+		/*
+		 * Just in case the boot kernel did turn the SSBD
+		 * mitigation off behind our back, let's set the state
+		 * to what we expect it to be.
+		 */
+		switch (arm64_get_ssbd_state()) {
+		case ARM64_SSBD_FORCE_ENABLE:
+		case ARM64_SSBD_KERNEL:
+			arm64_set_ssbd_mitigation(true);
+		}
 	}
 
 	local_daif_restore(flags);
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 74bb56f..413dbe5 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -30,7 +30,6 @@
 #include <linux/smp.h>
 #include <linux/uaccess.h>
 
-#include <asm/compat.h>
 #include <asm/current.h>
 #include <asm/debug-monitors.h>
 #include <asm/hw_breakpoint.h>
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 85a251b..33147aa 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -670,11 +670,10 @@ static void armv8pmu_disable_event(struct perf_event *event)
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev)
+static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 {
 	u32 pmovsr;
 	struct perf_sample_data data;
-	struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
 	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
 	struct pt_regs *regs;
 	int idx;
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 1d091d0..0bbac61 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
 #include <linux/sched/task_stack.h>
 
-#include <asm/compat.h>
 #include <asm/perf_regs.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 7ff81fe..bd73264 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -766,9 +766,6 @@ static void sve_init_header_from_task(struct user_sve_header *header,
 	vq = sve_vq_from_vl(header->vl);
 
 	header->max_vl = sve_max_vl;
-	if (WARN_ON(!sve_vl_valid(sve_max_vl)))
-		header->max_vl = header->vl;
-
 	header->size = SVE_PT_SIZE(vq, header->flags);
 	header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
 				      SVE_PT_REGS_SVE);
@@ -1046,8 +1043,6 @@ static const struct user_regset_view user_aarch64_view = {
 };
 
 #ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-
 enum compat_regset {
 	REGSET_COMPAT_GPR,
 	REGSET_COMPAT_VFP,
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 154b7d3..511af13 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -17,6 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/cache.h>
 #include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -570,8 +571,15 @@ badframe:
 	return 0;
 }
 
-/* Determine the layout of optional records in the signal frame */
-static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
+/*
+ * Determine the layout of optional records in the signal frame
+ *
+ * add_all: if true, lays out the biggest possible signal frame for
+ *	this task; otherwise, generates a layout for the current state
+ *	of the task.
+ */
+static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
+				 bool add_all)
 {
 	int err;
 
@@ -581,7 +589,7 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
 		return err;
 
 	/* fault information, if valid */
-	if (current->thread.fault_code) {
+	if (add_all || current->thread.fault_code) {
 		err = sigframe_alloc(user, &user->esr_offset,
 				     sizeof(struct esr_context));
 		if (err)
@@ -591,8 +599,14 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
 	if (system_supports_sve()) {
 		unsigned int vq = 0;
 
-		if (test_thread_flag(TIF_SVE))
-			vq = sve_vq_from_vl(current->thread.sve_vl);
+		if (add_all || test_thread_flag(TIF_SVE)) {
+			int vl = sve_max_vl;
+
+			if (!add_all)
+				vl = current->thread.sve_vl;
+
+			vq = sve_vq_from_vl(vl);
+		}
 
 		err = sigframe_alloc(user, &user->sve_offset,
 				     SVE_SIG_CONTEXT_SIZE(vq));
@@ -603,7 +617,6 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
 	return sigframe_alloc_end(user);
 }
 
-
 static int setup_sigframe(struct rt_sigframe_user_layout *user,
 			  struct pt_regs *regs, sigset_t *set)
 {
@@ -701,7 +714,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user,
 	int err;
 
 	init_user_layout(user);
-	err = setup_sigframe_layout(user);
+	err = setup_sigframe_layout(user, false);
 	if (err)
 		return err;
 
@@ -830,11 +843,12 @@ static void do_signal(struct pt_regs *regs)
 	unsigned long continue_addr = 0, restart_addr = 0;
 	int retval = 0;
 	struct ksignal ksig;
+	bool syscall = in_syscall(regs);
 
 	/*
 	 * If we were from a system call, check for system call restarting...
 	 */
-	if (in_syscall(regs)) {
+	if (syscall) {
 		continue_addr = regs->pc;
 		restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4);
 		retval = regs->regs[0];
@@ -886,7 +900,7 @@ static void do_signal(struct pt_regs *regs)
 	 * Handle restarting a different system call. As above, if a debugger
 	 * has chosen to restart at a different PC, ignore the restart.
 	 */
-	if (in_syscall(regs) && regs->pc == restart_addr) {
+	if (syscall && regs->pc == restart_addr) {
 		if (retval == -ERESTART_RESTARTBLOCK)
 			setup_restart_syscall(regs);
 		user_rewind_single_step(current);
@@ -936,3 +950,28 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 		thread_flags = READ_ONCE(current_thread_info()->flags);
 	} while (thread_flags & _TIF_WORK_MASK);
 }
+
+unsigned long __ro_after_init signal_minsigstksz;
+
+/*
+ * Determine the stack space required for guaranteed signal devliery.
+ * This function is used to populate AT_MINSIGSTKSZ at process startup.
+ * cpufeatures setup is assumed to be complete.
+ */
+void __init minsigstksz_setup(void)
+{
+	struct rt_sigframe_user_layout user;
+
+	init_user_layout(&user);
+
+	/*
+	 * If this fails, SIGFRAME_MAXSZ needs to be enlarged.  It won't
+	 * be big enough, but it's our best guess:
+	 */
+	if (WARN_ON(setup_sigframe_layout(&user, true)))
+		return;
+
+	signal_minsigstksz = sigframe_size(&user) +
+		round_up(sizeof(struct frame_record), 16) +
+		16; /* max alignment padding */
+}
diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c
new file mode 100644
index 0000000..3432e5e
--- /dev/null
+++ b/arch/arm64/kernel/ssbd.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/thread_info.h>
+
+#include <asm/cpufeature.h>
+
+/*
+ * prctl interface for SSBD
+ * FIXME: Drop the below ifdefery once merged in 4.18.
+ */
+#ifdef PR_SPEC_STORE_BYPASS
+static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+	int state = arm64_get_ssbd_state();
+
+	/* Unsupported */
+	if (state == ARM64_SSBD_UNKNOWN)
+		return -EINVAL;
+
+	/* Treat the unaffected/mitigated state separately */
+	if (state == ARM64_SSBD_MITIGATED) {
+		switch (ctrl) {
+		case PR_SPEC_ENABLE:
+			return -EPERM;
+		case PR_SPEC_DISABLE:
+		case PR_SPEC_FORCE_DISABLE:
+			return 0;
+		}
+	}
+
+	/*
+	 * Things are a bit backward here: the arm64 internal API
+	 * *enables the mitigation* when the userspace API *disables
+	 * speculation*. So much fun.
+	 */
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		/* If speculation is force disabled, enable is not allowed */
+		if (state == ARM64_SSBD_FORCE_ENABLE ||
+		    task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ssb_disable(task);
+		clear_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	case PR_SPEC_DISABLE:
+		if (state == ARM64_SSBD_FORCE_DISABLE)
+			return -EPERM;
+		task_set_spec_ssb_disable(task);
+		set_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	case PR_SPEC_FORCE_DISABLE:
+		if (state == ARM64_SSBD_FORCE_DISABLE)
+			return -EPERM;
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_force_disable(task);
+		set_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssbd_prctl_set(task, ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+static int ssbd_prctl_get(struct task_struct *task)
+{
+	switch (arm64_get_ssbd_state()) {
+	case ARM64_SSBD_UNKNOWN:
+		return -EINVAL;
+	case ARM64_SSBD_FORCE_ENABLE:
+		return PR_SPEC_DISABLE;
+	case ARM64_SSBD_KERNEL:
+		if (task_spec_ssb_force_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	case ARM64_SSBD_FORCE_DISABLE:
+		return PR_SPEC_ENABLE;
+	default:
+		return PR_SPEC_NOT_AFFECTED;
+	}
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssbd_prctl_get(task);
+	default:
+		return -ENODEV;
+	}
+}
+#endif	/* PR_SPEC_STORE_BYPASS */
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index a307b9e..70c2833 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -62,6 +62,14 @@ void notrace __cpu_suspend_exit(void)
 	 */
 	if (hw_breakpoint_restore)
 		hw_breakpoint_restore(cpu);
+
+	/*
+	 * On resume, firmware implementing dynamic mitigation will
+	 * have turned the mitigation on. If the user has forcefully
+	 * disabled it, make sure their wishes are obeyed.
+	 */
+	if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
+		arm64_set_ssbd_mitigation(false);
 }
 
 /*
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 93ab57d..a610982 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -112,6 +112,7 @@ long compat_arm_syscall(struct pt_regs *regs)
 		break;
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code  = ILL_ILLTRP;
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 2186853..f845a86 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -11,7 +11,9 @@
  * for more details.
  */
 
+#include <linux/acpi.h>
 #include <linux/arch_topology.h>
+#include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/init.h>
@@ -22,6 +24,7 @@
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
 #include <linux/string.h>
 
 #include <asm/cpu.h>
@@ -47,7 +50,7 @@ static int __init get_cpu_for_node(struct device_node *node)
 	return cpu;
 }
 
-static int __init parse_core(struct device_node *core, int cluster_id,
+static int __init parse_core(struct device_node *core, int package_id,
 			     int core_id)
 {
 	char name[10];
@@ -63,7 +66,7 @@ static int __init parse_core(struct device_node *core, int cluster_id,
 			leaf = false;
 			cpu = get_cpu_for_node(t);
 			if (cpu >= 0) {
-				cpu_topology[cpu].cluster_id = cluster_id;
+				cpu_topology[cpu].package_id = package_id;
 				cpu_topology[cpu].core_id = core_id;
 				cpu_topology[cpu].thread_id = i;
 			} else {
@@ -85,7 +88,7 @@ static int __init parse_core(struct device_node *core, int cluster_id,
 			return -EINVAL;
 		}
 
-		cpu_topology[cpu].cluster_id = cluster_id;
+		cpu_topology[cpu].package_id = package_id;
 		cpu_topology[cpu].core_id = core_id;
 	} else if (leaf) {
 		pr_err("%pOF: Can't get CPU for leaf core\n", core);
@@ -101,7 +104,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 	bool leaf = true;
 	bool has_cores = false;
 	struct device_node *c;
-	static int cluster_id __initdata;
+	static int package_id __initdata;
 	int core_id = 0;
 	int i, ret;
 
@@ -140,7 +143,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 			}
 
 			if (leaf) {
-				ret = parse_core(c, cluster_id, core_id++);
+				ret = parse_core(c, package_id, core_id++);
 			} else {
 				pr_err("%pOF: Non-leaf cluster with core %s\n",
 				       cluster, name);
@@ -158,7 +161,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 		pr_warn("%pOF: empty cluster\n", cluster);
 
 	if (leaf)
-		cluster_id++;
+		package_id++;
 
 	return 0;
 }
@@ -194,7 +197,7 @@ static int __init parse_dt_topology(void)
 	 * only mark cores described in the DT as possible.
 	 */
 	for_each_possible_cpu(cpu)
-		if (cpu_topology[cpu].cluster_id == -1)
+		if (cpu_topology[cpu].package_id == -1)
 			ret = -EINVAL;
 
 out_map:
@@ -212,7 +215,14 @@ EXPORT_SYMBOL_GPL(cpu_topology);
 
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
-	return &cpu_topology[cpu].core_sibling;
+	const cpumask_t *core_mask = &cpu_topology[cpu].core_sibling;
+
+	if (cpu_topology[cpu].llc_id != -1) {
+		if (cpumask_subset(&cpu_topology[cpu].llc_siblings, core_mask))
+			core_mask = &cpu_topology[cpu].llc_siblings;
+	}
+
+	return core_mask;
 }
 
 static void update_siblings_masks(unsigned int cpuid)
@@ -224,7 +234,12 @@ static void update_siblings_masks(unsigned int cpuid)
 	for_each_possible_cpu(cpu) {
 		cpu_topo = &cpu_topology[cpu];
 
-		if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
+		if (cpuid_topo->llc_id == cpu_topo->llc_id) {
+			cpumask_set_cpu(cpu, &cpuid_topo->llc_siblings);
+			cpumask_set_cpu(cpuid, &cpu_topo->llc_siblings);
+		}
+
+		if (cpuid_topo->package_id != cpu_topo->package_id)
 			continue;
 
 		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
@@ -245,7 +260,7 @@ void store_cpu_topology(unsigned int cpuid)
 	struct cpu_topology *cpuid_topo = &cpu_topology[cpuid];
 	u64 mpidr;
 
-	if (cpuid_topo->cluster_id != -1)
+	if (cpuid_topo->package_id != -1)
 		goto topology_populated;
 
 	mpidr = read_cpuid_mpidr();
@@ -259,19 +274,19 @@ void store_cpu_topology(unsigned int cpuid)
 		/* Multiprocessor system : Multi-threads per core */
 		cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
-		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
+		cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
 					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
 	} else {
 		/* Multiprocessor system : Single-thread per core */
 		cpuid_topo->thread_id  = -1;
 		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
+		cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
 					 MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
 					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
 	}
 
 	pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
-		 cpuid, cpuid_topo->cluster_id, cpuid_topo->core_id,
+		 cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
 		 cpuid_topo->thread_id, mpidr);
 
 topology_populated:
@@ -287,7 +302,11 @@ static void __init reset_cpu_topology(void)
 
 		cpu_topo->thread_id = -1;
 		cpu_topo->core_id = 0;
-		cpu_topo->cluster_id = -1;
+		cpu_topo->package_id = -1;
+
+		cpu_topo->llc_id = -1;
+		cpumask_clear(&cpu_topo->llc_siblings);
+		cpumask_set_cpu(cpu, &cpu_topo->llc_siblings);
 
 		cpumask_clear(&cpu_topo->core_sibling);
 		cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
@@ -296,6 +315,59 @@ static void __init reset_cpu_topology(void)
 	}
 }
 
+#ifdef CONFIG_ACPI
+/*
+ * Propagate the topology information of the processor_topology_node tree to the
+ * cpu_topology array.
+ */
+static int __init parse_acpi_topology(void)
+{
+	bool is_threaded;
+	int cpu, topology_id;
+
+	is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK;
+
+	for_each_possible_cpu(cpu) {
+		int i, cache_id;
+
+		topology_id = find_acpi_cpu_topology(cpu, 0);
+		if (topology_id < 0)
+			return topology_id;
+
+		if (is_threaded) {
+			cpu_topology[cpu].thread_id = topology_id;
+			topology_id = find_acpi_cpu_topology(cpu, 1);
+			cpu_topology[cpu].core_id   = topology_id;
+		} else {
+			cpu_topology[cpu].thread_id  = -1;
+			cpu_topology[cpu].core_id    = topology_id;
+		}
+		topology_id = find_acpi_cpu_topology_package(cpu);
+		cpu_topology[cpu].package_id = topology_id;
+
+		i = acpi_find_last_cache_level(cpu);
+
+		if (i > 0) {
+			/*
+			 * this is the only part of cpu_topology that has
+			 * a direct relationship with the cache topology
+			 */
+			cache_id = find_acpi_cpu_cache_topology(cpu, i);
+			if (cache_id > 0)
+				cpu_topology[cpu].llc_id = cache_id;
+		}
+	}
+
+	return 0;
+}
+
+#else
+static inline int __init parse_acpi_topology(void)
+{
+	return -EINVAL;
+}
+#endif
+
 void __init init_cpu_topology(void)
 {
 	reset_cpu_topology();
@@ -304,6 +376,8 @@ void __init init_cpu_topology(void)
 	 * Discard anything that was parsed if we hit an error so we
 	 * don't use partial information.
 	 */
-	if (of_have_populated_dt() && parse_dt_topology())
+	if (!acpi_disabled && parse_acpi_topology())
+		reset_cpu_topology();
+	else if (of_have_populated_dt() && parse_dt_topology())
 		reset_cpu_topology();
 }
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 8bbdc17..d399d45 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -635,6 +635,7 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 	siginfo_t info;
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code  = ILL_ILLOPC;
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 0221aca..605d1b6 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -34,25 +34,25 @@ jiffies = jiffies_64;
 	 *    4 KB (see related ASSERT() below)		\
 	 */						\
 	. = ALIGN(SZ_4K);				\
-	VMLINUX_SYMBOL(__hyp_idmap_text_start) = .;	\
+	__hyp_idmap_text_start = .;			\
 	*(.hyp.idmap.text)				\
-	VMLINUX_SYMBOL(__hyp_idmap_text_end) = .;	\
-	VMLINUX_SYMBOL(__hyp_text_start) = .;		\
+	__hyp_idmap_text_end = .;			\
+	__hyp_text_start = .;				\
 	*(.hyp.text)					\
-	VMLINUX_SYMBOL(__hyp_text_end) = .;
+	__hyp_text_end = .;
 
 #define IDMAP_TEXT					\
 	. = ALIGN(SZ_4K);				\
-	VMLINUX_SYMBOL(__idmap_text_start) = .;		\
+	__idmap_text_start = .;				\
 	*(.idmap.text)					\
-	VMLINUX_SYMBOL(__idmap_text_end) = .;
+	__idmap_text_end = .;
 
 #ifdef CONFIG_HIBERNATION
 #define HIBERNATE_TEXT					\
 	. = ALIGN(SZ_4K);				\
-	VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\
+	__hibernate_exit_text_start = .;		\
 	*(.hibernate_exit.text)				\
-	VMLINUX_SYMBOL(__hibernate_exit_text_end) = .;
+	__hibernate_exit_text_end = .;
 #else
 #define HIBERNATE_TEXT
 #endif
@@ -60,10 +60,10 @@ jiffies = jiffies_64;
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 #define TRAMP_TEXT					\
 	. = ALIGN(PAGE_SIZE);				\
-	VMLINUX_SYMBOL(__entry_tramp_text_start) = .;	\
+	__entry_tramp_text_start = .;			\
 	*(.entry.tramp.text)				\
 	. = ALIGN(PAGE_SIZE);				\
-	VMLINUX_SYMBOL(__entry_tramp_text_end) = .;
+	__entry_tramp_text_end = .;
 #else
 #define TRAMP_TEXT
 #endif
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index bffece2..05d8369 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -106,8 +106,44 @@ el1_hvc_guest:
 	 */
 	ldr	x1, [sp]				// Guest's x0
 	eor	w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1
+	cbz	w1, wa_epilogue
+
+	/* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
+	eor	w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \
+			  ARM_SMCCC_ARCH_WORKAROUND_2)
 	cbnz	w1, el1_trap
-	mov	x0, x1
+
+#ifdef CONFIG_ARM64_SSBD
+alternative_cb	arm64_enable_wa2_handling
+	b	wa2_end
+alternative_cb_end
+	get_vcpu_ptr	x2, x0
+	ldr	x0, [x2, #VCPU_WORKAROUND_FLAGS]
+
+	// Sanitize the argument and update the guest flags
+	ldr	x1, [sp, #8]			// Guest's x1
+	clz	w1, w1				// Murphy's device:
+	lsr	w1, w1, #5			// w1 = !!w1 without using
+	eor	w1, w1, #1			// the flags...
+	bfi	x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
+	str	x0, [x2, #VCPU_WORKAROUND_FLAGS]
+
+	/* Check that we actually need to perform the call */
+	hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
+	cbz	x0, wa2_end
+
+	mov	w0, #ARM_SMCCC_ARCH_WORKAROUND_2
+	smc	#0
+
+	/* Don't leak data from the SMC call */
+	mov	x3, xzr
+wa2_end:
+	mov	x2, xzr
+	mov	x1, xzr
+#endif
+
+wa_epilogue:
+	mov	x0, xzr
 	add	sp, sp, #16
 	eret
 
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index d964523..c50cedc 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -15,6 +15,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/arm-smccc.h>
 #include <linux/types.h>
 #include <linux/jump_label.h>
 #include <uapi/linux/psci.h>
@@ -389,6 +390,39 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
 	return false;
 }
 
+static inline bool __hyp_text __needs_ssbd_off(struct kvm_vcpu *vcpu)
+{
+	if (!cpus_have_const_cap(ARM64_SSBD))
+		return false;
+
+	return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
+}
+
+static void __hyp_text __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ARM64_SSBD
+	/*
+	 * The host runs with the workaround always present. If the
+	 * guest wants it disabled, so be it...
+	 */
+	if (__needs_ssbd_off(vcpu) &&
+	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
+		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
+#endif
+}
+
+static void __hyp_text __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ARM64_SSBD
+	/*
+	 * If the guest has disabled the workaround, bring it back on.
+	 */
+	if (__needs_ssbd_off(vcpu) &&
+	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
+		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
+#endif
+}
+
 /* Switch to the guest for VHE systems running in EL2 */
 int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 {
@@ -409,6 +443,8 @@ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 	sysreg_restore_guest_state_vhe(guest_ctxt);
 	__debug_switch_to_guest(vcpu);
 
+	__set_guest_arch_workaround_state(vcpu);
+
 	do {
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu, host_ctxt);
@@ -416,6 +452,8 @@ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
+	__set_host_arch_workaround_state(vcpu);
+
 	fp_enabled = fpsimd_enabled_vhe();
 
 	sysreg_save_guest_state_vhe(guest_ctxt);
@@ -465,6 +503,8 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
 	__sysreg_restore_state_nvhe(guest_ctxt);
 	__debug_switch_to_guest(vcpu);
 
+	__set_guest_arch_workaround_state(vcpu);
+
 	do {
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu, host_ctxt);
@@ -472,6 +512,8 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
+	__set_host_arch_workaround_state(vcpu);
+
 	fp_enabled = __fpsimd_enabled_nvhe();
 
 	__sysreg_save_state_nvhe(guest_ctxt);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 3256b92..a74311b 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -122,6 +122,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset PMU */
 	kvm_pmu_vcpu_reset(vcpu);
 
+	/* Default workaround setup is enabled (if supported) */
+	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
+		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
+
 	/* Reset timer */
 	return kvm_timer_vcpu_reset(vcpu);
 }
diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S
index d3db9b2..0fdff97 100644
--- a/arch/arm64/lib/tishift.S
+++ b/arch/arm64/lib/tishift.S
@@ -1,17 +1,6 @@
-/*
- * Copyright (C) 2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * Copyright (C) 2017-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
 #include <linux/linkage.h>
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a96ec01..49e217a 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -504,20 +504,15 @@ static int __init arm64_dma_init(void)
 	    max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
 		swiotlb = 1;
 
+	WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
+		   TAINT_CPU_OUT_OF_SPEC,
+		   "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
+		   ARCH_DMA_MINALIGN, cache_line_size());
+
 	return atomic_pool_init();
 }
 arch_initcall(arm64_dma_init);
 
-#define PREALLOC_DMA_DEBUG_ENTRIES	4096
-
-static int __init dma_debug_do_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(dma_debug_do_init);
-
-
 #ifdef CONFIG_IOMMU_DMA
 #include <linux/dma-iommu.h>
 #include <linux/platform_device.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4165485..b8eecc7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -235,8 +235,9 @@ static bool is_el1_instruction_abort(unsigned int esr)
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }
 
-static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs,
-				       unsigned long addr)
+static inline bool is_el1_permission_fault(unsigned int esr,
+					   struct pt_regs *regs,
+					   unsigned long addr)
 {
 	unsigned int ec       = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
@@ -254,6 +255,22 @@ static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs,
 	return false;
 }
 
+static void die_kernel_fault(const char *msg, unsigned long addr,
+			     unsigned int esr, struct pt_regs *regs)
+{
+	bust_spinlocks(1);
+
+	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
+		 addr);
+
+	mem_abort_decode(esr);
+
+	show_pte(addr);
+	die("Oops", regs, esr);
+	bust_spinlocks(0);
+	do_exit(SIGKILL);
+}
+
 static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 			      struct pt_regs *regs)
 {
@@ -266,9 +283,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
-	bust_spinlocks(1);
-
-	if (is_permission_fault(esr, regs, addr)) {
+	if (is_el1_permission_fault(esr, regs, addr)) {
 		if (esr & ESR_ELx_WNR)
 			msg = "write to read-only memory";
 		else
@@ -279,20 +294,63 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 		msg = "paging request";
 	}
 
-	pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg,
-		 addr);
-
-	mem_abort_decode(esr);
-
-	show_pte(addr);
-	die("Oops", regs, esr);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
+	die_kernel_fault(msg, addr, esr, regs);
 }
 
 static void __do_user_fault(struct siginfo *info, unsigned int esr)
 {
 	current->thread.fault_address = (unsigned long)info->si_addr;
+
+	/*
+	 * If the faulting address is in the kernel, we must sanitize the ESR.
+	 * From userspace's point of view, kernel-only mappings don't exist
+	 * at all, so we report them as level 0 translation faults.
+	 * (This is not quite the way that "no mapping there at all" behaves:
+	 * an alignment fault not caused by the memory type would take
+	 * precedence over translation fault for a real access to empty
+	 * space. Unfortunately we can't easily distinguish "alignment fault
+	 * not caused by memory type" from "alignment fault caused by memory
+	 * type", so we ignore this wrinkle and just return the translation
+	 * fault.)
+	 */
+	if (current->thread.fault_address >= TASK_SIZE) {
+		switch (ESR_ELx_EC(esr)) {
+		case ESR_ELx_EC_DABT_LOW:
+			/*
+			 * These bits provide only information about the
+			 * faulting instruction, which userspace knows already.
+			 * We explicitly clear bits which are architecturally
+			 * RES0 in case they are given meanings in future.
+			 * We always report the ESR as if the fault was taken
+			 * to EL1 and so ISV and the bits in ISS[23:14] are
+			 * clear. (In fact it always will be a fault to EL1.)
+			 */
+			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
+				ESR_ELx_CM | ESR_ELx_WNR;
+			esr |= ESR_ELx_FSC_FAULT;
+			break;
+		case ESR_ELx_EC_IABT_LOW:
+			/*
+			 * Claim a level 0 translation fault.
+			 * All other bits are architecturally RES0 for faults
+			 * reported with that DFSC value, so we clear them.
+			 */
+			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
+			esr |= ESR_ELx_FSC_FAULT;
+			break;
+		default:
+			/*
+			 * This should never happen (entry.S only brings us
+			 * into this code for insn and data aborts from a lower
+			 * exception level). Fail safe by not providing an ESR
+			 * context record at all.
+			 */
+			WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
+			esr = 0;
+			break;
+		}
+	}
+
 	current->thread.fault_code = esr;
 	arm64_force_sig_info(info, esr_to_fault_info(esr)->name, current);
 }
@@ -305,11 +363,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-		struct siginfo si = {
-			.si_signo	= inf->sig,
-			.si_code	= inf->code,
-			.si_addr	= (void __user *)addr,
-		};
+		struct siginfo si;
+
+		clear_siginfo(&si);
+		si.si_signo	= inf->sig;
+		si.si_code	= inf->code;
+		si.si_addr	= (void __user *)addr;
 
 		__do_user_fault(&si, esr);
 	} else {
@@ -395,16 +454,19 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		mm_flags |= FAULT_FLAG_WRITE;
 	}
 
-	if (addr < TASK_SIZE && is_permission_fault(esr, regs, addr)) {
+	if (addr < TASK_SIZE && is_el1_permission_fault(esr, regs, addr)) {
 		/* regs->orig_addr_limit may be 0 if we entered from EL0 */
 		if (regs->orig_addr_limit == KERNEL_DS)
-			die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
+			die_kernel_fault("access to user memory with fs=KERNEL_DS",
+					 addr, esr, regs);
 
 		if (is_el1_instruction_abort(esr))
-			die("Attempting to execute userspace memory", regs, esr);
+			die_kernel_fault("execution of user memory",
+					 addr, esr, regs);
 
 		if (!search_exception_tables(regs->pc))
-			die("Accessing user space memory outside uaccess.h routines", regs, esr);
+			die_kernel_fault("access to user memory outside uaccess routines",
+					 addr, esr, regs);
 	}
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
@@ -583,6 +645,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 			nmi_exit();
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
@@ -687,6 +750,7 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
 		show_pte(addr);
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
@@ -729,6 +793,7 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
 		local_irq_enable();
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code  = BUS_ADRALN;
@@ -772,7 +837,6 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 					      struct pt_regs *regs)
 {
 	const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
-	struct siginfo info;
 	int rv;
 
 	/*
@@ -788,6 +852,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 	if (!inf->fn(addr, esr, regs)) {
 		rv = 1;
 	} else {
+		struct siginfo info;
+
+		clear_siginfo(&info);
 		info.si_signo = inf->sig;
 		info.si_errno = 0;
 		info.si_code  = inf->code;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 2dbb2c9..493ff75 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -933,13 +933,15 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
 	pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
 					pgprot_val(mk_sect_prot(prot)));
+	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), sect_prot);
 
-	/* ioremap_page_range doesn't honour BBM */
-	if (pud_present(READ_ONCE(*pudp)))
+	/* Only allow permission changes for now */
+	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
+				   pud_val(new_pud)))
 		return 0;
 
 	BUG_ON(phys & ~PUD_MASK);
-	set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
+	set_pud(pudp, new_pud);
 	return 1;
 }
 
@@ -947,13 +949,15 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
 	pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
 					pgprot_val(mk_sect_prot(prot)));
+	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), sect_prot);
 
-	/* ioremap_page_range doesn't honour BBM */
-	if (pmd_present(READ_ONCE(*pmdp)))
+	/* Only allow permission changes for now */
+	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
+				   pmd_val(new_pmd)))
 		return 0;
 
 	BUG_ON(phys & ~PMD_MASK);
-	set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));
+	set_pmd(pmdp, new_pmd);
 	return 1;
 }
 
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a933504..a6fdaea 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -21,7 +21,6 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/printk.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
 #include <asm/byteorder.h>
@@ -80,23 +79,66 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
 	ctx->idx++;
 }
 
+static inline void emit_a64_mov_i(const int is64, const int reg,
+				  const s32 val, struct jit_ctx *ctx)
+{
+	u16 hi = val >> 16;
+	u16 lo = val & 0xffff;
+
+	if (hi & 0x8000) {
+		if (hi == 0xffff) {
+			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
+		} else {
+			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
+			if (lo != 0xffff)
+				emit(A64_MOVK(is64, reg, lo, 0), ctx);
+		}
+	} else {
+		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
+		if (hi)
+			emit(A64_MOVK(is64, reg, hi, 16), ctx);
+	}
+}
+
+static int i64_i16_blocks(const u64 val, bool inverse)
+{
+	return (((val >>  0) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000));
+}
+
 static inline void emit_a64_mov_i64(const int reg, const u64 val,
 				    struct jit_ctx *ctx)
 {
-	u64 tmp = val;
-	int shift = 0;
-
-	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
-	tmp >>= 16;
-	shift += 16;
-	while (tmp) {
-		if (tmp & 0xffff)
-			emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
-		tmp >>= 16;
-		shift += 16;
+	u64 nrm_tmp = val, rev_tmp = ~val;
+	bool inverse;
+	int shift;
+
+	if (!(nrm_tmp >> 32))
+		return emit_a64_mov_i(0, reg, (u32)val, ctx);
+
+	inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false);
+	shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) :
+					  (fls64(nrm_tmp) - 1)), 16), 0);
+	if (inverse)
+		emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx);
+	else
+		emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+	shift -= 16;
+	while (shift >= 0) {
+		if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
+			emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+		shift -= 16;
 	}
 }
 
+/*
+ * This is an unoptimized 64 immediate emission used for BPF to BPF call
+ * addresses. It will always do a full 64 bit decomposition as otherwise
+ * more complexity in the last extra pass is required since we previously
+ * reserved 4 instructions for the address.
+ */
 static inline void emit_addr_mov_i64(const int reg, const u64 val,
 				     struct jit_ctx *ctx)
 {
@@ -111,26 +153,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_a64_mov_i(const int is64, const int reg,
-				  const s32 val, struct jit_ctx *ctx)
-{
-	u16 hi = val >> 16;
-	u16 lo = val & 0xffff;
-
-	if (hi & 0x8000) {
-		if (hi == 0xffff) {
-			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
-		} else {
-			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
-			emit(A64_MOVK(is64, reg, lo, 0), ctx);
-		}
-	} else {
-		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
-		if (hi)
-			emit(A64_MOVK(is64, reg, hi, 16), ctx);
-	}
-}
-
 static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 				 const struct jit_ctx *ctx)
 {
@@ -163,7 +185,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx)
 /* Tail call offset to jump into */
 #define PROLOGUE_OFFSET 7
 
-static int build_prologue(struct jit_ctx *ctx)
+static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 {
 	const struct bpf_prog *prog = ctx->prog;
 	const u8 r6 = bpf2a64[BPF_REG_6];
@@ -188,7 +210,7 @@ static int build_prologue(struct jit_ctx *ctx)
 	 *                        | ... | BPF prog stack
 	 *                        |     |
 	 *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
-	 *                        |RSVD | JIT scratchpad
+	 *                        |RSVD | padding
 	 * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
 	 *                        |     |
 	 *                        | ... | Function call stack
@@ -210,19 +232,19 @@ static int build_prologue(struct jit_ctx *ctx)
 	/* Set up BPF prog stack base register */
 	emit(A64_MOV(1, fp, A64_SP), ctx);
 
-	/* Initialize tail_call_cnt */
-	emit(A64_MOVZ(1, tcc, 0, 0), ctx);
+	if (!ebpf_from_cbpf) {
+		/* Initialize tail_call_cnt */
+		emit(A64_MOVZ(1, tcc, 0, 0), ctx);
 
-	cur_offset = ctx->idx - idx0;
-	if (cur_offset != PROLOGUE_OFFSET) {
-		pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n",
-			    cur_offset, PROLOGUE_OFFSET);
-		return -1;
+		cur_offset = ctx->idx - idx0;
+		if (cur_offset != PROLOGUE_OFFSET) {
+			pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n",
+				    cur_offset, PROLOGUE_OFFSET);
+			return -1;
+		}
 	}
 
-	/* 4 byte extra for skb_copy_bits buffer */
-	ctx->stack_size = prog->aux->stack_depth + 4;
-	ctx->stack_size = STACK_ALIGN(ctx->stack_size);
+	ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
 
 	/* Set up function call stack */
 	emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
@@ -723,71 +745,6 @@ emit_cond_jmp:
 		emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
 		break;
 
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
-	case BPF_LD | BPF_ABS | BPF_W:
-	case BPF_LD | BPF_ABS | BPF_H:
-	case BPF_LD | BPF_ABS | BPF_B:
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
-	case BPF_LD | BPF_IND | BPF_W:
-	case BPF_LD | BPF_IND | BPF_H:
-	case BPF_LD | BPF_IND | BPF_B:
-	{
-		const u8 r0 = bpf2a64[BPF_REG_0]; /* r0 = return value */
-		const u8 r6 = bpf2a64[BPF_REG_6]; /* r6 = pointer to sk_buff */
-		const u8 fp = bpf2a64[BPF_REG_FP];
-		const u8 r1 = bpf2a64[BPF_REG_1]; /* r1: struct sk_buff *skb */
-		const u8 r2 = bpf2a64[BPF_REG_2]; /* r2: int k */
-		const u8 r3 = bpf2a64[BPF_REG_3]; /* r3: unsigned int size */
-		const u8 r4 = bpf2a64[BPF_REG_4]; /* r4: void *buffer */
-		const u8 r5 = bpf2a64[BPF_REG_5]; /* r5: void *(*func)(...) */
-		int size;
-
-		emit(A64_MOV(1, r1, r6), ctx);
-		emit_a64_mov_i(0, r2, imm, ctx);
-		if (BPF_MODE(code) == BPF_IND)
-			emit(A64_ADD(0, r2, r2, src), ctx);
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			size = 4;
-			break;
-		case BPF_H:
-			size = 2;
-			break;
-		case BPF_B:
-			size = 1;
-			break;
-		default:
-			return -EINVAL;
-		}
-		emit_a64_mov_i64(r3, size, ctx);
-		emit(A64_SUB_I(1, r4, fp, ctx->stack_size), ctx);
-		emit_a64_mov_i64(r5, (unsigned long)bpf_load_pointer, ctx);
-		emit(A64_BLR(r5), ctx);
-		emit(A64_MOV(1, r0, A64_R(0)), ctx);
-
-		jmp_offset = epilogue_offset(ctx);
-		check_imm19(jmp_offset);
-		emit(A64_CBZ(1, r0, jmp_offset), ctx);
-		emit(A64_MOV(1, r5, r0), ctx);
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			emit(A64_LDR32(r0, r5, A64_ZR), ctx);
-#ifndef CONFIG_CPU_BIG_ENDIAN
-			emit(A64_REV32(0, r0, r0), ctx);
-#endif
-			break;
-		case BPF_H:
-			emit(A64_LDRH(r0, r5, A64_ZR), ctx);
-#ifndef CONFIG_CPU_BIG_ENDIAN
-			emit(A64_REV16(0, r0, r0), ctx);
-#endif
-			break;
-		case BPF_B:
-			emit(A64_LDRB(r0, r5, A64_ZR), ctx);
-			break;
-		}
-		break;
-	}
 	default:
 		pr_err_once("unknown opcode %02x\n", code);
 		return -EINVAL;
@@ -851,6 +808,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct bpf_prog *tmp, *orig_prog = prog;
 	struct bpf_binary_header *header;
 	struct arm64_jit_data *jit_data;
+	bool was_classic = bpf_prog_was_classic(prog);
 	bool tmp_blinded = false;
 	bool extra_pass = false;
 	struct jit_ctx ctx;
@@ -905,7 +863,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
-	if (build_prologue(&ctx)) {
+	if (build_prologue(&ctx, was_classic)) {
 		prog = orig_prog;
 		goto out_off;
 	}
@@ -928,7 +886,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 skip_init_ctx:
 	ctx.idx = 0;
 
-	build_prologue(&ctx);
+	build_prologue(&ctx, was_classic);
 
 	if (build_body(&ctx)) {
 		bpf_jit_binary_free(header);