[ARM] 3611/4: optimize do_div() when divisor is constant

On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
author: Nicolas Pitre <nico@cam.org> 2006-12-06 04:13:18 +0100
committer: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-07 16:06:09 +0000
commit: fa4adc614922c24601320e55bc5a1f837abad6e9 (patch)
tree: f93979fc2d9e2e2dad6edd3fa92b084a2645976a /include
parent: 0215ffb08ce99e2bb59eca114a99499a4d06e704 (diff)
download: op-kernel-dev-fa4adc614922c24601320e55bc5a1f837abad6e9.zip
op-kernel-dev-fa4adc614922c24601320e55bc5a1f837abad6e9.tar.gz
1 files changed, 179 insertions, 1 deletions
diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h
index 3682616..37e0a96 100644
--- a/include/asm-arm/div64.h
+++ b/include/asm-arm/div64.h
@@ -27,7 +27,7 @@
 #define __xh "r1"
 #endif
 
-#define do_div(n,base)						\
+#define __do_div_asm(n, base)					\
 ({								\
 	register unsigned int __base      asm("r4") = base;	\
 	register unsigned long long __n   asm("r0") = n;	\
@@ -45,4 +45,182 @@
 	__rem;							\
 })
 
+#if __GNUC__ < 4
+
+/*
+ * gcc versions earlier than 4.0 are simply too problematic for the
+ * optimized implementation below. First there is gcc PR 15089 that
+ * tend to trig on more complex constructs, spurious .global __udivsi3
+ * are inserted even if none of those symbols are referenced in the
+ * generated code, and those gcc versions are not able to do constant
+ * propagation on long long values anyway.
+ */
+#define do_div(n, base) __do_div_asm(n, base)
+
+#elif __GNUC__ >= 4
+
+#include <asm/bug.h>
+
+/*
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications instead which is much faster. And yet only if compiling
+ * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
+ * sufficiently recent to perform proper long long constant propagation.
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ */
+#define do_div(n, base)							\
+({									\
+	unsigned int __r, __b = (base);					\
+	if (!__builtin_constant_p(__b) || __b == 0 ||			\
+	    (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) {	\
+		/* non-constant divisor (or zero): slow path */		\
+		__r = __do_div_asm(n, __b);				\
+	} else if ((__b & (__b - 1)) == 0) {				\
+		/* Trivial: __b is constant and a power of 2 */		\
+		/* gcc does the right thing with this code.  */		\
+		__r = n;						\
+		__r &= (__b - 1);					\
+		n /= __b;						\
+	} else {							\
+		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
+		/* We rely on the fact that most of this code gets   */	\
+		/* optimized away at compile time due to constant    */	\
+		/* propagation and only a couple inline assembly     */	\
+		/* instructions should remain. Better avoid any      */	\
+		/* code construct that might prevent that.           */	\
+		unsigned long long __res, __x, __t, __m, __n = n;	\
+		unsigned int __c, __p, __z = 0;				\
+		/* preserve low part of n for reminder computation */	\
+		__r = __n;						\
+		/* determine number of bits to represent __b */		\
+		__p = 1 << __div64_fls(__b);				\
+		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
+		__m = (~0ULL / __b) * __p;				\
+		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
+		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
+		__x = ~0ULL / __b * __b - 1;				\
+		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
+		__res >>= 32;						\
+		__res += (__m & 0xffffffff) * (__x >> 32);		\
+		__t = __res;						\
+		__res += (__x & 0xffffffff) * (__m >> 32);		\
+		__t = (__res < __t) ? (1ULL << 32) : 0;			\
+		__res = (__res >> 32) + __t;				\
+		__res += (__m >> 32) * (__x >> 32);			\
+		__res /= __p;						\
+		/* Now sanitize and optimize what we've got. */		\
+		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
+			/* those cases can be simplified with: */	\
+			__n /= (__b & -__b);				\
+			__m = ~0ULL / (__b / (__b & -__b));		\
+			__p = 1;					\
+			__c = 1;					\
+		} else if (__res != __x / __b) {			\
+			/* We can't get away without a correction    */	\
+			/* to compensate for bit truncation errors.  */	\
+			/* To avoid it we'd need an additional bit   */	\
+			/* to represent __m which would overflow it. */	\
+			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
+			__c = 1;					\
+			/* Compute __m = (__p << 64) / __b */		\
+			__m = (~0ULL / __b) * __p;			\
+			__m += ((~0ULL % __b + 1) * __p) / __b;		\
+		} else {						\
+			/* Reduce __m/__p, and try to clear bit 31   */	\
+			/* of __m when possible otherwise that'll    */	\
+			/* need extra overflow handling later.       */	\
+			unsigned int __bits = -(__m & -__m);		\
+			__bits |= __m >> 32;				\
+			__bits = (~__bits) << 1;			\
+			/* If __bits == 0 then setting bit 31 is     */	\
+			/* unavoidable.  Simply apply the maximum    */	\
+			/* possible reduction in that case.          */	\
+			/* Otherwise the MSB of __bits indicates the */	\
+			/* best reduction we should apply.           */	\
+			if (!__bits) {					\
+				__p /= (__m & -__m);			\
+				__m /= (__m & -__m);			\
+			} else {					\
+				__p >>= __div64_fls(__bits);		\
+				__m >>= __div64_fls(__bits);		\
+			}						\
+			/* No correction needed. */			\
+			__c = 0;					\
+		}							\
+		/* Now we have a combination of 2 conditions:        */	\
+		/* 1) whether or not we need a correction (__c), and */	\
+		/* 2) whether or not there might be an overflow in   */	\
+		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
+		/* Select the best insn combination to perform the   */	\
+		/* actual __m * __n / (__p << 64) operation.         */	\
+		if (!__c) {						\
+			asm (	"umull	%Q0, %R0, %1, %Q2\n\t"		\
+				"mov	%Q0, #0"			\
+				: "=&r" (__res)				\
+				: "r" (__m), "r" (__n)			\
+				: "cc" );				\
+		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
+			__res = __m;					\
+			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"		\
+				"mov	%Q0, #0"			\
+				: "+r" (__res)				\
+				: "r" (__m), "r" (__n)			\
+				: "cc" );				\
+		} else {						\
+			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
+				"cmn	%Q0, %Q1\n\t"			\
+				"adcs	%R0, %R0, %R1\n\t"		\
+				"adc	%Q0, %3, #0"			\
+				: "=&r" (__res)				\
+				: "r" (__m), "r" (__n), "r" (__z)	\
+				: "cc" );				\
+		}							\
+		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
+			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"		\
+				"umlal	%R0, %Q0, %Q1, %R2\n\t"		\
+				"mov	%R0, #0\n\t"			\
+				"umlal	%Q0, %R0, %R1, %R2"		\
+				: "+r" (__res)				\
+				: "r" (__m), "r" (__n)			\
+				: "cc" );				\
+		} else {						\
+			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"		\
+				"umlal	%R0, %1, %Q2, %R3\n\t"		\
+				"mov	%R0, #0\n\t"			\
+				"adds	%Q0, %1, %Q0\n\t"		\
+				"adc	%R0, %R0, #0\n\t"		\
+				"umlal	%Q0, %R0, %R2, %R3"		\
+				: "+r" (__res), "+r" (__z)		\
+				: "r" (__m), "r" (__n)			\
+				: "cc" );				\
+		}							\
+		__res /= __p;						\
+		/* The reminder can be computed with 32-bit regs     */	\
+		/* only, and gcc is good at that.                    */	\
+		{							\
+			unsigned int __res0 = __res;			\
+			unsigned int __b0 = __b;			\
+			__r -= __res0 * __b0;				\
+		}							\
+		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
+		n = __res;						\
+	}								\
+	__r;								\
+})
+
+/* our own fls implementation to make sure constant propagation is fine */
+#define __div64_fls(bits)						\
+({									\
+	unsigned int __left = (bits), __nr = 0;				\
+	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
+	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
+	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
+	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
+	if (__left & 0x00000002) __nr +=  1;				\
+	__nr;								\
+})
+
+#endif
+
 #endif
author	Nicolas Pitre <nico@cam.org>	2006-12-06 04:13:18 +0100
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2006-12-07 16:06:09 +0000
commit	fa4adc614922c24601320e55bc5a1f837abad6e9 (patch)
tree	f93979fc2d9e2e2dad6edd3fa92b084a2645976a /include
parent	0215ffb08ce99e2bb59eca114a99499a4d06e704 (diff)
download	op-kernel-dev-fa4adc614922c24601320e55bc5a1f837abad6e9.zip op-kernel-dev-fa4adc614922c24601320e55bc5a1f837abad6e9.tar.gz