From ac4c2a3bbe5db5fc570b1d0ee1e474db7cb22585 Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Fri, 8 Jan 2010 14:42:40 -0800 Subject: zlib: optimize inffast when copying direct from output JFFS2 uses lesser compression ratio and inflate always ends up in "copy direct from output" case. This patch tries to optimize the direct copy procedure. Uses get_unaligned() but only in one place. The copy loop just above this one can also use this optimization, but I havn't done so as I have not tested if it is a win there too. On my MPC8321 this is about 17% faster on my JFFS2 root FS than the original. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Joakim Tjernlund Cc: Roel Kluin Cc: Richard Purdie Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/zlib_inflate/inffast.c | 55 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 11 deletions(-) (limited to 'lib/zlib_inflate') diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 8550b0c..05e1559 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -4,6 +4,8 @@ */ #include +#include +#include #include "inftrees.h" #include "inflate.h" #include "inffast.h" @@ -24,9 +26,11 @@ #ifdef POSTINC # define OFF 0 # define PUP(a) *(a)++ +# define UP_UNALIGNED(a) get_unaligned((a)++) #else # define OFF 1 # define PUP(a) *++(a) +# define UP_UNALIGNED(a) get_unaligned(++(a)) #endif /* @@ -239,18 +243,47 @@ void inflate_fast(z_streamp strm, unsigned start) } } else { + unsigned short *sout; + unsigned long loops; + from = out - dist; /* copy direct from output */ - do { /* minimum length is three */ - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); - len -= 3; - } while (len > 2); - if (len) { - PUP(out) = PUP(from); - if (len > 1) - PUP(out) = PUP(from); - } + /* minimum length is three */ + /* Align out addr */ + if (!((long)(out - 1 + OFF) & 1)) { + PUP(out) = PUP(from); + len--; + } + sout = (unsigned short *)(out - OFF); + if (dist > 2) { + unsigned short *sfrom; + + sfrom = (unsigned short *)(from - OFF); + loops = len >> 1; + do + PUP(sout) = UP_UNALIGNED(sfrom); + while (--loops); + out = (unsigned char *)sout + OFF; + from = (unsigned char *)sfrom + OFF; + } else { /* dist == 1 or dist == 2 */ + unsigned short pat16; + + pat16 = *(sout-2+2*OFF); + if (dist == 1) +#if defined(__BIG_ENDIAN) + pat16 = (pat16 & 0xff) | ((pat16 & 0xff) << 8); +#elif defined(__LITTLE_ENDIAN) + pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00) >> 8); +#else +#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined +#endif + loops = len >> 1; + do + PUP(sout) = pat16; + while (--loops); + out = (unsigned char *)sout + OFF; + } + if (len & 1) + PUP(out) = PUP(from); } } else if ((op & 64) == 0) { /* 2nd level distance code */ -- cgit v1.1 From 6846ee5ca68d81e6baccf0d56221d7a00c1be18b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 13 Jan 2010 16:19:34 +1100 Subject: zlib: Fix build of powerpc boot wrapper Commit ac4c2a3bbe5db5fc570b1d0ee1e474db7cb22585 broke the build of all powerpc boot wrappers. It attempts to add an include of autoconf.h but used the wrong path for it. It also adds -D__KERNEL__ to our boot wrapper, both things that we pretty much didn't do on purpose so far. We want our boot wrapper to remain independent enough of the kernel for various reasons, one of them being that you can "wrap" an existing kernel at distro install time which allows to ship one kernel image and a set of boot wrappers for different platforms, the wrappers don't have to be built out of the same kernel build tree. It's also incorrect to do what the patch does in our boot environment since we may not have a proper alignment exception handler which means we may not be able to fixup the few cases where an unaligned access will need SW emulation (depends on the core variant, could be when crossing page or segment boundaries for example). This patch fixes it by putting the old code back in and using the new "fancy" variant only when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set, which happens not to be set on powerpc since we don't include autoconf.h. It also reverts the changes to our boot wrapper Makefile. This means that x86 should, afaik, keep the optimisations since its boot wrapper does include autoconf.h and define __KERNEL__ (though I doubt they make that much different outside of slow embedded processors). Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- lib/zlib_inflate/inffast.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'lib/zlib_inflate') diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 05e1559..215447c 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -4,12 +4,25 @@ */ #include -#include -#include #include "inftrees.h" #include "inflate.h" #include "inffast.h" +/* Only do the unaligned "Faster" variant when + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set + * + * On powerpc, it won't be as we don't include autoconf.h + * automatically for the boot wrapper, which is intended as + * we run in an environment where we may not be able to deal + * with (even rare) alignment faults. In addition, we do not + * define __KERNEL__ for arch/powerpc/boot unlike x86 + */ + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#include +#include +#endif + #ifndef ASMINF /* Allow machine dependent optimization for post-increment or pre-increment. @@ -243,6 +256,7 @@ void inflate_fast(z_streamp strm, unsigned start) } } else { +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS unsigned short *sout; unsigned long loops; @@ -284,6 +298,20 @@ void inflate_fast(z_streamp strm, unsigned start) } if (len & 1) PUP(out) = PUP(from); +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + from = out - dist; /* copy direct from output */ + do { /* minimum length is three */ + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } while (len > 2); + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } +#endif /* !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ } } else if ((op & 64) == 0) { /* 2nd level distance code */ -- cgit v1.1