From 6a2a923f2a6b4af0047d9c3ccc384246c67483fc Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Tue, 29 Dec 2015 09:40:04 -0600 Subject: Add OpenPOWER (POWER8/ppc64el) support --- Makefile | 7 + defs.h | 4 + main.c | 246 ++++++++- routines-ppc64el.asm | 1434 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1665 insertions(+), 26 deletions(-) create mode 100755 routines-ppc64el.asm diff --git a/Makefile b/Makefile index 913d023..c89d707 100755 --- a/Makefile +++ b/Makefile @@ -32,6 +32,7 @@ message: @echo "" @echo "To compile for x86 Linux: make bandwidth32" @echo "To compile for x86_64 Linux: make bandwidth64" + @echo "To compile for ppc64el Linux: make bandwidth-ppc64el" @echo "To compile for x86 Mac OS/X: make bandwidth-mac32" @echo "To compile for x86_64 Mac OS/X: make bandwidth-mac64" @echo "To compile for x86 Win32/Cygwin: make bandwidth-win32" @@ -43,6 +44,11 @@ bandwidth64: main.c routines64.asm BMP64.a BMPGraphing64.a ${CC} ${CFLAGS} -m64 -c ${SRC} ${LD} -m64 routines64.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth64 +bandwidth-ppc64el: main.c routines-ppc64el.asm BMP64.a BMPGraphing64.a + as -mregnames -mpower8 -o routines-ppc64el.o routines-ppc64el.asm + ${CC} ${CFLAGS} -m64 -c ${SRC} + ${LD} -m64 routines-ppc64el.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth-ppc64el + bandwidth32: main.c routines32.asm BMP32.a BMPGraphing32.a ${AS} -f elf routines32.asm -o routines32.o ${CC} ${CFLAGS} -m32 -c ${SRC} @@ -81,6 +87,7 @@ BMP32.a: BMP.c clean: rm -f main.o bandwidth bandwidth32 bandwidth64 routines32.o routines64.o + rm -f bandwidth-ppc64el routines-ppc64el.o rm -f bandwidth-win32.exe bandwidth.bmp bandwidth-mac32 bandwidth-mac64 rm -f BMP.o BMP32.a BMP64.a BMPGraphing.o BMPGraphing32.a BMPGraphing64.a rm -f font.o minifont.o network_bandwidth.bmp diff --git a/defs.h b/defs.h index 176dbd1..54597ef 100755 --- a/defs.h +++ b/defs.h @@ -98,13 +98,17 @@ extern int CopyAVX (void*, void*, unsigned long, unsigned long); extern int CopySSE_128bytes (void*, void*, unsigned long, unsigned long); extern int ReaderAVX (void *ptr, unsigned long, unsigned long); +extern int ReaderVSX (void *ptr, unsigned long, unsigned long); extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long); extern int ReaderSSE2_bypass (void *ptr, unsigned long, unsigned long); +extern int RandomReaderVSX (unsigned long **ptr, unsigned long, unsigned long); extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long); extern int RandomReaderSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long); extern int WriterAVX (void *ptr, unsigned long, unsigned long, unsigned long); +extern int WriterVSX (void *ptr, unsigned long, unsigned long, unsigned long); extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long); +extern int RandomWriterVSX(unsigned long **ptr, unsigned long, unsigned long, unsigned long); extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long); extern int ReaderSSE2_128bytes(void *ptr, unsigned long, unsigned long); diff --git a/main.c b/main.c index 2d293a8..ebcbbd0 100755 --- a/main.c +++ b/main.c @@ -1,6 +1,7 @@ /*============================================================================ bandwidth 1.1, a benchmark to estimate memory transfer bandwidth. Copyright (C) 2005-2014 by Zack T Smith. + Copyright (c) 2015 Raptor Engineering This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +17,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - The author may be reached at veritas@comcast.net. + Zack Smith may be reached at veritas@comcast.net. *===========================================================================*/ #include @@ -44,8 +45,13 @@ #include "BMP.h" #include "BMPGraphing.h" +#if defined(__x86_64__) || defined(__i386__) #define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" #define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" +#else +#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering" +#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering" +#endif #ifdef __WIN32__ #include @@ -64,6 +70,7 @@ enum { SSE2_BYPASS, AVX, AVX_BYPASS, + VSX, LODSQ, LODSD, LODSW, @@ -88,6 +95,7 @@ static uint32_t cpu_has_sse42 = 0; static uint32_t cpu_has_aes = 0; static uint32_t cpu_has_avx = 0; static uint32_t cpu_has_avx2 = 0; +static uint32_t cpu_has_vsx = 0; static uint32_t cpu_has_64bit = 0; static uint32_t cpu_has_xd = 0; @@ -160,6 +168,11 @@ static int chunk_sizes[] = { 72 * 1024 * 1024, 96 * 1024 * 1024, 128 * 1024 * 1024, +#if defined(__PPC64__) + 256 * 1024 * 1024, + 512 * 1024 * 1024, + 1024 * 1024 * 1024, +#endif 0 }; @@ -330,7 +343,7 @@ do_write (unsigned long size, int mode, bool random) unsigned char *chunk0; unsigned long loops; unsigned long long total_count=0; -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) unsigned long value = 0x1234567689abcdef; #else unsigned long value = 0x12345678; @@ -343,17 +356,31 @@ do_write (unsigned long size, int mode, bool random) error ("do_write(): chunk size is not multiple of 128."); //------------------------------------------------- +#if defined(__PPC64__) + // Align to 128-bit boundaries + chunk0 = malloc (size+256); + chunk = chunk0; + if (!chunk) + error ("Out of memory"); + tmp = (unsigned long) chunk; + if (tmp & 127) { + tmp -= (tmp & 127); + tmp += 128; + chunk = (unsigned char*) tmp; + } +#else + // Align to 32-bit boundaries chunk0 = malloc (size+64); chunk = chunk0; if (!chunk) error ("Out of memory"); - tmp = (unsigned long) chunk; if (tmp & 31) { tmp -= (tmp & 31); tmp += 32; chunk = (unsigned char*) tmp; } +#endif //---------------------------------------- // Set up random pointers to chunks. @@ -401,6 +428,9 @@ do_write (unsigned long size, int mode, bool random) case AVX: print (L"(256-bit), size = "); break; + case VSX: + print (L"(128-bit), size = "); + break; case AVX_BYPASS: print (L"bypassing cache (256-bit), size = "); break; @@ -408,7 +438,7 @@ do_write (unsigned long size, int mode, bool random) print (L"bypassing cache (128-bit), size = "); break; default: -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); @@ -428,6 +458,7 @@ do_write (unsigned long size, int mode, bool random) total_count += loops; switch (mode) { +#if defined(__x86_64__) || defined(__i386__) case SSE2: if (random) RandomWriterSSE2 (chunk_ptrs, size/256, loops, value); @@ -461,7 +492,17 @@ do_write (unsigned long size, int mode, bool random) WriterAVX_bypass (chunk, size, loops, value); } break; - +#endif + +#if defined(__PPC64__) + case VSX: + if (random) + RandomWriterVSX (chunk_ptrs, size/256, loops, value); + else + WriterVSX (chunk, size, loops, value); + break; +#endif + default: if (random) RandomWriter (chunk_ptrs, size/256, loops, value); @@ -515,6 +556,22 @@ do_read (unsigned long size, int mode, bool random) error ("do_read(): chunk size is not multiple of 128."); //------------------------------------------------- +#if defined(__PPC64__) + // Align to 128-bit boundaries + chunk0 = chunk = malloc (size+256); + if (!chunk) + error ("Out of memory"); + + memset (chunk, 0, size); + + tmp = (unsigned long) chunk; + if (tmp & 127) { + tmp -= (tmp & 127); + tmp += 128; + chunk = (unsigned char*) tmp; + } +#else + // Align to 32-bit boundaries chunk0 = chunk = malloc (size+64); if (!chunk) error ("Out of memory"); @@ -527,6 +584,7 @@ do_read (unsigned long size, int mode, bool random) tmp += 32; chunk = (unsigned char*) tmp; } +#endif //---------------------------------------- // Set up random pointers to chunks. @@ -586,6 +644,9 @@ do_read (unsigned long size, int mode, bool random) case AVX: print (L"(256-bit), size = "); break; + case VSX: + print (L"(128-bit), size = "); + break; case AVX_BYPASS: print (L"bypassing cache (256-bit), size = "); break; @@ -593,7 +654,7 @@ do_read (unsigned long size, int mode, bool random) print (L"bypassing cache (128-bit), size = "); break; default: -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); @@ -615,6 +676,7 @@ do_read (unsigned long size, int mode, bool random) total_count += loops; switch (mode) { +#if defined(__x86_64__) || defined(__i386__) case SSE2: if (random) RandomReaderSSE2 (chunk_ptrs, size/256, loops); @@ -644,7 +706,17 @@ do_read (unsigned long size, int mode, bool random) ReaderAVX (chunk, size, loops); } break; - +#endif + +#if defined(__PPC64__) + case VSX: + if (random) + RandomReaderVSX (chunk_ptrs, size/256, loops); + else + ReaderVSX (chunk, size, loops); + break; +#endif + case LODSB: if (!random) { ReaderLODSB (chunk, size, loops); @@ -701,7 +773,7 @@ do_read (unsigned long size, int mode, bool random) } - +#if defined(__x86_64__) || defined(__i386__) //---------------------------------------------------------------------------- // Name: do_copy // Purpose: Performs sequential memory copy. @@ -722,6 +794,33 @@ do_copy (unsigned long size, int mode) error ("do_copy(): chunk size is not multiple of 128."); //------------------------------------------------- + +#if defined(__PPC64__) + // Align to 128-bit boundaries + chunk_src0 = chunk_src = malloc (size+256); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+256); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 127) { + tmp -= (tmp & 127); + tmp += 128; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 127) { + tmp -= (tmp & 127); + tmp += 128; + chunk_dest = (unsigned char*) tmp; + } +#else + // Align to 32-bit boundaries chunk_src0 = chunk_src = malloc (size+64); if (!chunk_src) error ("Out of memory"); @@ -731,7 +830,7 @@ do_copy (unsigned long size, int mode) memset (chunk_src, 100, size); memset (chunk_dest, 200, size); - + tmp = (unsigned long) chunk_src; if (tmp & 31) { tmp -= (tmp & 31); @@ -744,6 +843,7 @@ do_copy (unsigned long size, int mode) tmp += 32; chunk_dest = (unsigned char*) tmp; } +#endif //------------------------------------------------- print (L"Sequential copy "); @@ -755,7 +855,7 @@ do_copy (unsigned long size, int mode) print (L"(256-bit), size = "); } else { -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); @@ -770,14 +870,14 @@ do_copy (unsigned long size, int mode) loops = (1 << 26) / size; // XX need to adjust for CPU MHz if (loops < 1) loops = 1; - + t0 = mytime (); while (diff < usec_per_test) { total_count += loops; if (mode == SSE2) { -#ifdef __x86_64__ +#if defined(__x86_64__) if (size & 128) CopySSE_128bytes (chunk_dest, chunk_src, size, loops); else @@ -808,6 +908,7 @@ do_copy (unsigned long size, int mode) return result; } +#endif //---------------------------------------------------------------------------- @@ -827,7 +928,7 @@ fb_readwrite (bool use_sse2) unsigned long datum; int fd; register unsigned long foo; -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) unsigned long value = 0x1234567689abcdef; #else unsigned long value = 0x12345678; @@ -901,9 +1002,11 @@ fb_readwrite (bool use_sse2) total_count = FBLOOPS_R; +#if defined(__x86_64__) || defined(__i386__) if (use_sse2) ReaderSSE2 (fb, length, FBLOOPS_R); else +#endif Reader (fb, length, FBLOOPS_R); diff = mytime () - t0; @@ -921,9 +1024,11 @@ fb_readwrite (bool use_sse2) total_count = FBLOOPS_W; +#if defined(__x86_64__) || defined(__i386__) if (use_sse2) WriterSSE2_bypass (fb, length, FBLOOPS_W, value); else +#endif Writer (fb, length, FBLOOPS_W, value); diff = mytime () - t0; @@ -937,6 +1042,8 @@ fb_readwrite (bool use_sse2) // Name: register_test // Purpose: Determines bandwidth of register-to-register transfers. //---------------------------------------------------------------------------- +#define REGISTER_COUNT 10000 +#define VREGISTER_COUNT 3333 void register_test () { @@ -945,13 +1052,12 @@ register_test () unsigned long diff = 0; //-------------------------------------- -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"Main register to main register transfers (64-bit) "); #else print (L"Main register to main register transfers (32-bit) "); #endif flush (); -#define REGISTER_COUNT 10000 t0 = mytime (); while (diff < usec_per_test) @@ -967,13 +1073,12 @@ register_test () flush (); //-------------------------------------- -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"Main register to vector register transfers (64-bit) "); #else print (L"Main register to vector register transfers (32-bit) "); #endif flush (); -#define VREGISTER_COUNT 3333 t0 = mytime (); diff = 0; @@ -991,7 +1096,7 @@ register_test () flush (); //-------------------------------------- -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"Vector register to main register transfers (64-bit) "); #else print (L"Vector register to main register transfers (32-bit) "); @@ -1032,6 +1137,8 @@ register_test () newline (); flush (); +#if defined(__x86_64__) || defined(__i386__) + //-------------------------------------- if (cpu_has_avx) { print (L"Vector register to vector register transfers (256-bit) "); @@ -1216,6 +1323,7 @@ register_test () newline (); flush (); } +#endif } //---------------------------------------------------------------------------- @@ -1229,7 +1337,7 @@ stack_test () unsigned long t0; unsigned long diff = 0; -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"Stack-to-register transfers (64-bit) "); #else print (L"Stack-to-register transfers (32-bit) "); @@ -1252,7 +1360,7 @@ stack_test () newline (); flush (); -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) print (L"Register-to-stack transfers (64-bit) "); #else print (L"Register-to-stack transfers (32-bit) "); @@ -1873,7 +1981,8 @@ main (int argc, char **argv) } printf ("This is bandwidth version %s.\n", RELEASE); - printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n"); + printf ("Copyright (C) 2005-2014 by Zack T Smith.\n"); + printf ("Copyright (C) 2015 Raptor Engineering.\n\n"); printf ("This software is covered by the GNU Public License.\n"); printf ("It is provided AS-IS, use at your own risk.\n"); printf ("See the file COPYING for more information.\n\n"); @@ -1909,6 +2018,11 @@ main (int argc, char **argv) return 0; } +#if defined(__PPC64__) + cpu_has_vsx = 1; +#endif + +#if defined(__x86_64__) || defined(__i386__) uint32_t ecx = get_cpuid1_ecx (); uint32_t edx = get_cpuid1_edx (); cpu_has_mmx = edx & CPUID_EDX_MMX; @@ -2019,6 +2133,8 @@ main (int argc, char **argv) if (!cpu_has_sse2) use_sse2 = false; +#endif + println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B."); flush (); @@ -2054,6 +2170,7 @@ main (int argc, char **argv) strcpy (graph_title, TITLE_MEMORY_GRAPH); BMPGraphing_set_title (graph, graph_title); +#if defined(__x86_64__) || defined(__i386__) //------------------------------------------------------------ // SSE2 sequential reads. // @@ -2239,11 +2356,85 @@ main (int argc, char **argv) } } +#endif + +#if defined(__PPC64__) + //------------------------------------------------------------ + // VSX sequential reads. + // + if (cpu_has_vsx) { + BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_TURQUOISE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, VSX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // VSX random reads. + // + if (cpu_has_vsx) { + BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, VSX, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // VSX sequential writes that do not bypass the caches. + // + if (cpu_has_vsx) { + BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PINK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, VSX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // VSX random writes that do not bypass the caches. + // + if (cpu_has_vsx) { + BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, VSX, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } +#endif + //------------------------------------------------------------ // Sequential non-SSE2 reads. // newline (); -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE); #else BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE); @@ -2259,7 +2450,7 @@ main (int argc, char **argv) // Random non-SSE2 reads. // newline (); -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN); #else BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN); @@ -2277,7 +2468,7 @@ main (int argc, char **argv) //------------------------------------------------------------ // Sequential non-SSE2 writes. // -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN); #else BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN); @@ -2294,7 +2485,7 @@ main (int argc, char **argv) //------------------------------------------------------------ // Random non-SSE2 writes. // -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN); #else BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN); @@ -2311,6 +2502,8 @@ main (int argc, char **argv) } } +#if defined(__x86_64__) || defined(__i386__) + //------------------------------------------------------------ // SSE2 sequential copy. // @@ -2342,9 +2535,10 @@ main (int argc, char **argv) } } } +#endif #ifdef DOING_LODS -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__PPC64__) //------------------------------------------------------------ // LODSQ 64-bit sequential reads. // diff --git a/routines-ppc64el.asm b/routines-ppc64el.asm new file mode 100755 index 0000000..ce9e2cf --- /dev/null +++ b/routines-ppc64el.asm @@ -0,0 +1,1434 @@ +#============================================================================ +# ppc64el routines for bandwidth, a benchmark to estimate memory transfer bandwidth. +# Copyright (c) 2015 Raptor Engineering +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either ver4on 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The author of this file may be reached at tpearson@raptorengineeringinc.com. +#============================================================================= + +.global ReaderLODSQ +.global _ReaderLODSQ + +.global ReaderLODSD +.global _ReaderLODSD + +.global ReaderLODSW +.global _ReaderLODSW + +.global ReaderLODSB +.global _ReaderLODSB + +.global RandomReader +.global RandomWriter +.global Reader +.global Reader_128bytes +.global RandomReaderVSX +.global ReaderVSX +.global RegisterToRegister +.global RegisterToVector +.global StackReader +.global StackWriter +.global VectorToRegister +.global VectorToVector +.global Writer +.global Writer_128bytes +.global WriterVSX +.global RandomWriterVSX +.global _RandomReader +.global _RandomWriter +.global _Reader +.global _Reader_128bytes +.global _RandomReaderVSX +.global _ReaderVSX +.global _RegisterToRegister +.global _RegisterToVector +.global _StackReader +.global _StackWriter +.global _VectorToRegister +.global _VectorToVector +.global _Writer +.global _Writer_128bytes +.global _WriterVSX +.global _RandomWriterVSX + +.data +.align 3 # align to 8 byte boundary +stack_test_1: + .quad 1000 +stack_test_2: + .quad 2000 +stack_test_3: + .quad 3000 +stack_test_4: + .quad 4000 +stack_test_5: + .quad 5000 +stack_test_6: + .quad 6000 +stack_test_7: + .quad 7000 + +# Note: +# Unix ABI for 64-bit PPC64EL v2 says integer parameters are put into these registers in this order: +# r3, r4, r5, r6, r7, r7 + +.text + +#------------------------------------------------------------------------------ +# Name: ReaderLODSQ +# Purpose: Reads 64-bit values sequentially from an area of memory +# using LD instruction. +# Params: r3 = ptr to memory area +# r4 = length in bytes (inner loop count) +# r5 = loops (outer loop count) +#------------------------------------------------------------------------------ +.align 2 # align to 4 byte boundary +ReaderLODSQ: +_ReaderLODSQ: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 3 # length in quadwords rounded down + + mr r10, r5 # load outer loop counter +.Louter_loop_0: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_0: + ld r8, 0(r9) + addi r9, r9, 8 # increment pointer by 1 quadword + bdnz .Linner_loop_0 + + cmpwi r10, 0 + bne .Louter_loop_0 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: ReaderLODSD +# Purpose: Reads 32-bit values sequentially from an area of memory +# using LWZ instruction. +# Params: r3 = ptr to memory area +# r4 = length in bytes (inner loop count) +# r5 = loops (outer loop count) +#------------------------------------------------------------------------------ +.align 2 # align to 4 byte boundary +ReaderLODSD: +_ReaderLODSD: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 2 # length in doublewords rounded down + + mr r10, r5 # load outer loop counter +.Louter_loop_1: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_1: + lwz r8, 0(r9) + addi r9, r9, 4 # increment pointer by 1 double word + bdnz .Linner_loop_1 + + cmpwi r10, 0 + bne .Louter_loop_1 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: ReaderLODSW +# Purpose: Reads 16-bit values sequentially from an area of memory +# using LHZ instruction. +# Params: r3 = ptr to memory area +# r4 = length in bytes (inner loop count) +# r5 = loops (outer loop count) +#------------------------------------------------------------------------------ +.align 2 # align to 4 byte boundary +ReaderLODSW: +_ReaderLODSW: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 1 # length in words rounded down + + mr r10, r5 # load outer loop counter +.Louter_loop_2: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_2: + lhz r8, 0(r9) + addi r9, r9, 2 # increment pointer by 1 word + bdnz .Linner_loop_2 + + cmpwi r10, 0 + bne .Louter_loop_2 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: ReaderLODSB +# Purpose: Reads 16-bit values sequentially from an area of memory +# using LDBZ instruction. +# Params: r3 = ptr to memory area +# r4 = length in bytes (inner loop count) +# r5 = loops (outer loop count) +#------------------------------------------------------------------------------ +.align 2 # align to 4 byte boundary +ReaderLODSB: +_ReaderLODSB: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r10, r5 # load outer loop counter +.Louter_loop_3: + addi r10, r10, -1 + mtctr r4 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_3: + lbz r8, 0(r9) + addi r9, r9, 1 # increment pointer by 1 byte + bdnz .Linner_loop_3 + + cmpwi r10, 0 + bne .Louter_loop_3 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + + +#------------------------------------------------------------------------------ +# Name: Reader +# Purpose: Reads 64-bit values sequentially from an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +Reader: +_Reader: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 8 # length in bytes / 256 + + mr r10, r5 # load outer loop counter +.Louter_loop_4: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_4: + ld r8, 0(r9) + ld r8, 8(r9) + ld r8, 16(r9) + ld r8, 24(r9) + ld r8, 32(r9) + ld r8, 40(r9) + ld r8, 48(r9) + ld r8, 56(r9) + ld r8, 64(r9) + ld r8, 72(r9) + ld r8, 80(r9) + ld r8, 88(r9) + ld r8, 96(r9) + ld r8, 104(r9) + ld r8, 112(r9) + ld r8, 120(r9) + ld r8, 128(r9) + ld r8, 136(r9) + ld r8, 144(r9) + ld r8, 152(r9) + ld r8, 160(r9) + ld r8, 168(r9) + ld r8, 176(r9) + ld r8, 184(r9) + ld r8, 192(r9) + ld r8, 200(r9) + ld r8, 208(r9) + ld r8, 216(r9) + ld r8, 224(r9) + ld r8, 232(r9) + ld r8, 240(r9) + ld r8, 248(r9) + addi r9, r9, 256 # increment pointer by 256 bytes + bdnz .Linner_loop_4 + + cmpwi r10, 0 + bne .Louter_loop_4 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: Reader_128bytes +# Purpose: Reads 64-bit values sequentially from an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +Reader_128bytes: +_Reader_128bytes: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 7 # length in bytes / 128 + + mr r10, r5 # load outer loop counter +.Louter_loop_5: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_5: + ld r8, 0(r9) + ld r8, 8(r9) + ld r8, 16(r9) + ld r8, 24(r9) + ld r8, 32(r9) + ld r8, 40(r9) + ld r8, 48(r9) + ld r8, 56(r9) + ld r8, 64(r9) + ld r8, 72(r9) + ld r8, 80(r9) + ld r8, 88(r9) + ld r8, 96(r9) + ld r8, 104(r9) + ld r8, 112(r9) + ld r8, 120(r9) + addi r9, r9, 128 # increment pointer by 128 bytes + bdnz .Linner_loop_5 + + cmpwi r10, 0 + bne .Louter_loop_5 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RandomReaderVSX +# Purpose: Reads 128-bit values randomly from an area of memory. +# Params: r3 = ptr to array of chunk pointers +# r4 = # of chunks +# r5 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RandomReaderVSX: +_RandomReaderVSX: + stdu r1, -152(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + std r14, 24(r1) # save r14 + std r15, 32(r1) # save r15 + std r16, 40(r1) # save r16 + std r17, 48(r1) # save r17 + std r18, 56(r1) # save r18 + std r19, 64(r1) # save r19 + std r20, 72(r1) # save r20 + std r21, 80(r1) # save r21 + std r22, 88(r1) # save r22 + std r23, 96(r1) # save r23 + std r24, 104(r1) # save r24 + std r25, 112(r1) # save r25 + std r26, 120(r1) # save r26 + std r27, 128(r1) # save r27 + std r28, 136(r1) # save r28 + + li r12, 240 # initialize read offset registers + li r14, 128 + li r15, 64 + li r16, 208 + li r17, 112 + li r18, 176 + li r19, 144 + li r20, 0 + + li r21, 96 + li r22, 16 + li r23, 192 + li r24, 160 + li r25, 32 + li r26, 48 + li r27, 224 + li r28, 80 + + mr r10, r5 # load outer loop counter +.Louter_vsx_loop_0: + addi r10, r10, -1 + mr r11, r4 # copy inner loop count to inner loop count register +.Linner_vsx_loop_0: + addi r11, r11, -1 + mr r7, r11 # (re)calculate inner loop data offset + sldi r7, r7, 3 + ldx r9, r3, r7 # (re)compute inner loop start pointer + + lxvd2x v0, r12, r9 + lxvd2x v0, r14, r9 + lxvd2x v0, r15, r9 + lxvd2x v0, r16, r9 + lxvd2x v0, r17, r9 + lxvd2x v0, r18, r9 + lxvd2x v0, r19, r9 + lxvd2x v0, r20, r9 + + lxvd2x v0, r21, r9 + lxvd2x v0, r22, r9 + lxvd2x v0, r23, r9 + lxvd2x v0, r24, r9 + lxvd2x v0, r25, r9 + lxvd2x v0, r26, r9 + lxvd2x v0, r27, r9 + lxvd2x v0, r28, r9 + + cmpwi r11, 0 + bne .Linner_vsx_loop_0 + + cmpwi r10, 0 + bne .Louter_vsx_loop_0 + + ld r28, 136(r1) # restore r28 + ld r27, 128(r1) # restore r27 + ld r26, 120(r1) # restore r26 + ld r25, 112(r1) # restore r25 + ld r24, 104(r1) # restore r24 + ld r23, 96(r1) # restore r23 + ld r22, 88(r1) # restore r22 + ld r21, 80(r1) # restore r21 + ld r20, 72(r1) # restore r20 + ld r19, 64(r1) # restore r19 + ld r18, 56(r1) # restore r18 + ld r17, 48(r1) # restore r17 + ld r16, 40(r1) # restore r16 + ld r15, 32(r1) # restore r15 + ld r14, 24(r1) # restore r14 + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 152 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: ReaderVSX +# Purpose: Reads 128-bit values sequentially from an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +ReaderVSX: +_ReaderVSX: + stdu r1, -144(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + std r14, 24(r1) # save r14 + std r15, 32(r1) # save r15 + std r16, 40(r1) # save r16 + std r17, 48(r1) # save r17 + std r18, 56(r1) # save r18 + std r19, 64(r1) # save r19 + std r20, 72(r1) # save r20 + std r21, 80(r1) # save r21 + std r22, 88(r1) # save r22 + std r23, 96(r1) # save r23 + std r24, 104(r1) # save r24 + std r25, 112(r1) # save r25 + std r26, 120(r1) # save r26 + std r27, 128(r1) # save r27 + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 8 # length in bytes / 256 + + li r11, 0 # initialize read offset registers + li r12, 16 + li r14, 32 + li r15, 48 + li r16, 64 + li r17, 80 + li r18, 96 + li r19, 112 + + li r20, 128 + li r21, 144 + li r22, 160 + li r23, 176 + li r24, 192 + li r25, 208 + li r26, 224 + li r27, 240 + + mr r10, r5 # load outer loop counter +.Louter_vsx_loop_1: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_vsx_loop_1: + lxvd2x v0, r11, r9 + lxvd2x v0, r12, r9 + lxvd2x v0, r14, r9 + lxvd2x v0, r15, r9 + lxvd2x v0, r16, r9 + lxvd2x v0, r17, r9 + lxvd2x v0, r18, r9 + lxvd2x v0, r19, r9 + + lxvd2x v0, r20, r9 + lxvd2x v0, r21, r9 + lxvd2x v0, r22, r9 + lxvd2x v0, r23, r9 + lxvd2x v0, r24, r9 + lxvd2x v0, r25, r9 + lxvd2x v0, r26, r9 + lxvd2x v0, r27, r9 + + addi r9, r9, 256 # increment pointer by 256 bytes + bdnz .Linner_vsx_loop_1 + + cmpwi r10, 0 + bne .Louter_vsx_loop_1 + + ld r27, 128(r1) # restore r27 + ld r26, 120(r1) # restore r26 + ld r25, 112(r1) # restore r25 + ld r24, 104(r1) # restore r24 + ld r23, 96(r1) # restore r23 + ld r22, 88(r1) # restore r22 + ld r21, 80(r1) # restore r21 + ld r20, 72(r1) # restore r20 + ld r19, 64(r1) # restore r19 + ld r18, 56(r1) # restore r18 + ld r17, 48(r1) # restore r17 + ld r16, 40(r1) # restore r16 + ld r15, 32(r1) # restore r15 + ld r14, 24(r1) # restore r14 + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 144 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RandomReader +# Purpose: Reads 64-bit values randomly from an area of memory. +# Params: r3 = ptr to array of chunk pointers +# r4 = # of chunks +# r5 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RandomReader: +_RandomReader: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r10, r5 # load outer loop counter +.Louter_loop_6: + addi r10, r10, -1 + mr r11, r4 # copy inner loop count to inner loop count register +.Linner_loop_6: + addi r11, r11, -1 + mr r7, r11 # (re)calculate inner loop data offset + sldi r7, r7, 3 + ldx r9, r3, r7 # (re)compute inner loop start pointer + ld r8, 96(r9) + ld r8, 0(r9) + ld r8, 120(r9) + ld r8, 184(r9) + ld r8, 160(r9) + ld r8, 176(r9) + ld r8, 112(r9) + ld r8, 80(r9) + ld r8, 32(r9) + ld r8, 128(r9) + ld r8, 88(r9) + ld r8, 40(r9) + ld r8, 48(r9) + ld r8, 72(r9) + ld r8, 200(r9) + ld r8, 24(r9) + ld r8, 152(r9) + ld r8, 16(r9) + ld r8, 248(r9) + ld r8, 56(r9) + ld r8, 240(r9) + ld r8, 208(r9) + ld r8, 104(r9) + ld r8, 216(r9) + ld r8, 136(r9) + ld r8, 232(r9) + ld r8, 64(r9) + ld r8, 224(r9) + ld r8, 144(r9) + ld r8, 192(r9) + ld r8, 8(r9) + ld r8, 168(r9) + cmpwi r11, 0 + bne .Linner_loop_6 + + cmpwi r10, 0 + bne .Louter_loop_6 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RandomWriter +# Purpose: Writes 64-bit values randomly to an area of memory. +# Params: r3 = ptr to array of chunk pointers +# r4 = # of chunks +# r5 = loops +# r6 = datum to write +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RandomWriter: +_RandomWriter: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r10, r5 # load outer loop counter +.Louter_loop_7: + addi r10, r10, -1 + mr r11, r4 # copy inner loop count to inner loop count register +.Linner_loop_7: + addi r11, r11, -1 + mr r7, r11 # (re)calculate inner loop data offset + sldi r7, r7, 3 + ldx r9, r3, r7 # (re)compute inner loop start pointer + std r8, 96(r9) + std r8, 0(r9) + std r8, 120(r9) + std r8, 184(r9) + std r8, 160(r9) + std r8, 176(r9) + std r8, 112(r9) + std r8, 80(r9) + std r8, 32(r9) + std r8, 128(r9) + std r8, 88(r9) + std r8, 40(r9) + std r8, 48(r9) + std r8, 72(r9) + std r8, 200(r9) + std r8, 24(r9) + std r8, 152(r9) + std r8, 16(r9) + std r8, 248(r9) + std r8, 56(r9) + std r8, 240(r9) + std r8, 208(r9) + std r8, 104(r9) + std r8, 216(r9) + std r8, 136(r9) + std r8, 232(r9) + std r8, 64(r9) + std r8, 224(r9) + std r8, 144(r9) + std r8, 192(r9) + std r8, 8(r9) + std r8, 168(r9) + cmpwi r11, 0 + bne .Linner_loop_7 + + cmpwi r10, 0 + bne .Louter_loop_7 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: WriterVSX +# Purpose: Writes 128-bit value sequentially to an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +# r6 = quad to write +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +WriterVSX: +_WriterVSX: + stdu r1, -144(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + std r14, 24(r1) # save r14 + std r15, 32(r1) # save r15 + std r16, 40(r1) # save r16 + std r17, 48(r1) # save r17 + std r18, 56(r1) # save r18 + std r19, 64(r1) # save r19 + std r20, 72(r1) # save r20 + std r21, 80(r1) # save r21 + std r22, 88(r1) # save r22 + std r23, 96(r1) # save r23 + std r24, 104(r1) # save r24 + std r25, 112(r1) # save r25 + std r26, 120(r1) # save r26 + std r27, 128(r1) # save r27 + + mtvsrd vs1, r6 # load value to write into vector register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 8 # length in bytes / 256 + + li r11, 0 # initialize write offset registers + li r12, 16 + li r14, 32 + li r15, 48 + li r16, 64 + li r17, 80 + li r18, 96 + li r19, 112 + + li r20, 128 + li r21, 144 + li r22, 160 + li r23, 176 + li r24, 192 + li r25, 208 + li r26, 224 + li r27, 240 + + mr r10, r5 # load outer loop counter +.Louter_vsx_loop_2: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_vsx_loop_2: + stxvd2x v0, r11, r9 + stxvd2x v0, r12, r9 + stxvd2x v0, r14, r9 + stxvd2x v0, r15, r9 + stxvd2x v0, r16, r9 + stxvd2x v0, r17, r9 + stxvd2x v0, r18, r9 + stxvd2x v0, r19, r9 + + stxvd2x v0, r20, r9 + stxvd2x v0, r21, r9 + stxvd2x v0, r22, r9 + stxvd2x v0, r23, r9 + stxvd2x v0, r24, r9 + stxvd2x v0, r25, r9 + stxvd2x v0, r26, r9 + stxvd2x v0, r27, r9 + + addi r9, r9, 256 # increment pointer by 256 bytes + bdnz .Linner_vsx_loop_2 + + cmpwi r10, 0 + bne .Louter_vsx_loop_2 + + ld r27, 128(r1) # restore r27 + ld r26, 120(r1) # restore r26 + ld r25, 112(r1) # restore r25 + ld r24, 104(r1) # restore r24 + ld r23, 96(r1) # restore r23 + ld r22, 88(r1) # restore r22 + ld r21, 80(r1) # restore r21 + ld r20, 72(r1) # restore r20 + ld r19, 64(r1) # restore r19 + ld r18, 56(r1) # restore r18 + ld r17, 48(r1) # restore r17 + ld r16, 40(r1) # restore r16 + ld r15, 32(r1) # restore r15 + ld r14, 24(r1) # restore r14 + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 144 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RandomWriterVSX +# Purpose: Writes 128-bit values randomly to an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +# r6 = quad to write +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RandomWriterVSX: +_RandomWriterVSX: + stdu r1, -152(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + std r14, 24(r1) # save r14 + std r15, 32(r1) # save r15 + std r16, 40(r1) # save r16 + std r17, 48(r1) # save r17 + std r18, 56(r1) # save r18 + std r19, 64(r1) # save r19 + std r20, 72(r1) # save r20 + std r21, 80(r1) # save r21 + std r22, 88(r1) # save r22 + std r23, 96(r1) # save r23 + std r24, 104(r1) # save r24 + std r25, 112(r1) # save r25 + std r26, 120(r1) # save r26 + std r27, 128(r1) # save r27 + std r28, 136(r1) # save r28 + + mtvsrd vs1, r6 # load value to write into vector register + + li r12, 240 # initialize write offset registers + li r14, 128 + li r15, 64 + li r16, 208 + li r17, 112 + li r18, 176 + li r19, 144 + li r20, 0 + + li r21, 96 + li r22, 16 + li r23, 192 + li r24, 160 + li r25, 32 + li r26, 48 + li r27, 224 + li r28, 80 + + mr r10, r5 # load outer loop counter +.Louter_vsx_loop_3: + addi r10, r10, -1 + mr r11, r4 # copy inner loop count to inner loop count register +.Linner_vsx_loop_3: + addi r11, r11, -1 + mr r7, r11 # (re)calculate inner loop data offset + sldi r7, r7, 3 + ldx r9, r3, r7 # (re)compute inner loop start pointer + + stxvd2x v0, r12, r9 + stxvd2x v0, r14, r9 + stxvd2x v0, r15, r9 + stxvd2x v0, r16, r9 + stxvd2x v0, r17, r9 + stxvd2x v0, r18, r9 + stxvd2x v0, r19, r9 + stxvd2x v0, r20, r9 + + stxvd2x v0, r21, r9 + stxvd2x v0, r22, r9 + stxvd2x v0, r23, r9 + stxvd2x v0, r24, r9 + stxvd2x v0, r25, r9 + stxvd2x v0, r26, r9 + stxvd2x v0, r27, r9 + stxvd2x v0, r28, r9 + + cmpwi r11, 0 + bne .Linner_vsx_loop_3 + + cmpwi r10, 0 + bne .Louter_vsx_loop_3 + + ld r28, 136(r1) # restore r28 + ld r27, 128(r1) # restore r27 + ld r26, 120(r1) # restore r26 + ld r25, 112(r1) # restore r25 + ld r24, 104(r1) # restore r24 + ld r23, 96(r1) # restore r23 + ld r22, 88(r1) # restore r22 + ld r21, 80(r1) # restore r21 + ld r20, 72(r1) # restore r20 + ld r19, 64(r1) # restore r19 + ld r18, 56(r1) # restore r18 + ld r17, 48(r1) # restore r17 + ld r16, 40(r1) # restore r16 + ld r15, 32(r1) # restore r15 + ld r14, 24(r1) # restore r14 + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 152 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: Writer +# Purpose: Writes 64-bit value sequentially to an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +# r6 = quad to write +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +Writer: +_Writer: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 8 # length in bytes / 256 + + mr r10, r5 # load outer loop counter +.Louter_loop_8: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_8: + std r6, 0(r9) + std r6, 8(r9) + std r6, 16(r9) + std r6, 24(r9) + std r6, 32(r9) + std r6, 40(r9) + std r6, 48(r9) + std r6, 56(r9) + std r6, 64(r9) + std r6, 72(r9) + std r6, 80(r9) + std r6, 88(r9) + std r6, 96(r9) + std r6, 104(r9) + std r6, 112(r9) + std r6, 120(r9) + std r6, 128(r9) + std r6, 136(r9) + std r6, 144(r9) + std r6, 152(r9) + std r6, 160(r9) + std r6, 168(r9) + std r6, 176(r9) + std r6, 184(r9) + std r6, 192(r9) + std r6, 200(r9) + std r6, 208(r9) + std r6, 216(r9) + std r6, 224(r9) + std r6, 232(r9) + std r6, 240(r9) + std r6, 248(r9) + addi r9, r9, 256 # increment pointer by 256 bytes + bdnz .Linner_loop_8 + + cmpwi r10, 0 + bne .Louter_loop_8 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: Writer_128bytes +# Purpose: Writes 64-bit value sequentially to an area of memory. +# Params: r3 = ptr to memory area +# r4 = length in bytes +# r5 = loops +# r6 = quad to write +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +Writer_128bytes: +_Writer_128bytes: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mr r7, r4 # calculate inner loop count + srdi r7, r7, 7 # length in bytes / 128 + + mr r10, r5 # load outer loop counter +.Louter_loop_9: + addi r10, r10, -1 + mtctr r7 # copy inner loop count to count register + mr r9, r3 # (re)load inner loop pointer address +.Linner_loop_9: + std r6, 0(r9) + std r6, 8(r9) + std r6, 16(r9) + std r6, 24(r9) + std r6, 32(r9) + std r6, 40(r9) + std r6, 48(r9) + std r6, 56(r9) + std r6, 64(r9) + std r6, 72(r9) + std r6, 80(r9) + std r6, 88(r9) + std r6, 96(r9) + std r6, 104(r9) + std r6, 112(r9) + std r6, 120(r9) + addi r9, r9, 128 # increment pointer by 128 bytes + bdnz .Linner_loop_9 + + cmpwi r10, 0 + bne .Louter_loop_9 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: StackReader +# Purpose: Reads 64-bit values off the stack into registers of +# the main register set, effectively testing L1 cache access +# *and* effective-address calculation speed. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +StackReader: +_StackReader: + stdu r1, -128(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + mr r5, r1 # save stack pointer for testing + addi r5, r5, 32 # increment saved testing stack pointer + + # push qword 7000 + lis r4, stack_test_7@highest # load stack_test_7 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_7@higher # load stack_test_7 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_7@h # load stack_test_7 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_7@l # load stack_test_7 bits 0-15 into r4 bits 0-15 + std r4, 48(r5) + + # push qword 6000 + lis r4, stack_test_6@highest # load stack_test_6 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_6@higher # load stack_test_6 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_6@h # load stack_test_6 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_6@l # load stack_test_6 bits 0-15 into r4 bits 0-15 + std r4, 40(r5) + + # push qword 5000 + lis r4, stack_test_5@highest # load stack_test_5 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_5@higher # load stack_test_5 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_5@h # load stack_test_5 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_5@l # load stack_test_5 bits 0-15 into r4 bits 0-15 + std r4, 32(r5) + + # push qword 4000 + lis r4, stack_test_4@highest # load stack_test_4 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_4@higher # load stack_test_4 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_4@h # load stack_test_4 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_4@l # load stack_test_4 bits 0-15 into r4 bits 0-15 + std r4, 24(r5) + + # push qword 3000 + lis r4, stack_test_3@highest # load stack_test_3 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_3@higher # load stack_test_3 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_3@h # load stack_test_3 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_3@l # load stack_test_3 bits 0-15 into r4 bits 0-15 + std r4, 16(r5) + + # push qword 2000 + lis r4, stack_test_2@highest # load stack_test_2 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_2@higher # load stack_test_2 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_2@h # load stack_test_2 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_2@l # load stack_test_2 bits 0-15 into r4 bits 0-15 + std r4, 8(r5) + + # push qword 1000 + lis r4, stack_test_1@highest # load stack_test_1 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_1@higher # load stack_test_1 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_1@h # load stack_test_1 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_1@l # load stack_test_1 bits 0-15 into r4 bits 0-15 + std r4, 0(r5) + + mtctr r3 # copy loop count to count register +.Lstack_loop_0: + ld r8, 0(r5) + ld r8, 16(r5) + ld r8, 24(r5) + ld r8, 32(r5) + ld r8, 40(r5) + ld r8, 8(r5) + ld r8, 48(r5) + ld r8, 0(r5) + ld r8, 0(r5) + ld r8, 16(r5) + ld r8, 24(r5) + ld r8, 32(r5) + ld r8, 40(r5) + ld r8, 8(r5) + ld r8, 48(r5) + ld r8, 0(r5) + ld r8, 0(r5) + ld r8, 16(r5) + ld r8, 24(r5) + ld r8, 32(r5) + ld r8, 40(r5) + ld r8, 8(r5) + ld r8, 48(r5) + ld r8, 8(r5) + ld r8, 8(r5) + ld r8, 16(r5) + ld r8, 24(r5) + ld r8, 32(r5) + ld r8, 40(r5) + ld r8, 8(r5) + ld r8, 48(r5) + ld r8, 8(r5) + bdnz .Lstack_loop_0 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 128 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: StackWriter +# Purpose: Writes 64-bit values into the stack from registers of +# the main register set, effectively testing L1 cache access +# *and* effective-address calculation speed. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +StackWriter: +_StackWriter: + stdu r1, -128(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + mr r5, r1 # save stack pointer for testing + addi r5, r5, 32 # increment saved testing stack pointer + + # push qword 7000 + lis r4, stack_test_7@highest # load stack_test_7 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_7@higher # load stack_test_7 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_7@h # load stack_test_7 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_7@l # load stack_test_7 bits 0-15 into r4 bits 0-15 + std r4, 48(r5) + + # push qword 6000 + lis r4, stack_test_6@highest # load stack_test_6 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_6@higher # load stack_test_6 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_6@h # load stack_test_6 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_6@l # load stack_test_6 bits 0-15 into r4 bits 0-15 + std r4, 40(r5) + + # push qword 5000 + lis r4, stack_test_5@highest # load stack_test_5 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_5@higher # load stack_test_5 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_5@h # load stack_test_5 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_5@l # load stack_test_5 bits 0-15 into r4 bits 0-15 + std r4, 32(r5) + + # push qword 4000 + lis r4, stack_test_4@highest # load stack_test_4 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_4@higher # load stack_test_4 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_4@h # load stack_test_4 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_4@l # load stack_test_4 bits 0-15 into r4 bits 0-15 + std r4, 24(r5) + + # push qword 3000 + lis r4, stack_test_3@highest # load stack_test_3 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_3@higher # load stack_test_3 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_3@h # load stack_test_3 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_3@l # load stack_test_3 bits 0-15 into r4 bits 0-15 + std r4, 16(r5) + + # push qword 2000 + lis r4, stack_test_2@highest # load stack_test_2 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_2@higher # load stack_test_2 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_2@h # load stack_test_2 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_2@l # load stack_test_2 bits 0-15 into r4 bits 0-15 + std r4, 8(r5) + + # push qword 1000 + lis r4, stack_test_1@highest # load stack_test_1 bits 48-63 into r4 bits 16-31 + ori r4, r4, stack_test_1@higher # load stack_test_1 bits 32-47 into r4 bits 0-15 + rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word + oris r4, r4, stack_test_1@h # load stack_test_1 bits 16-31 into r4 bits 16-31 + ori r4, r4, stack_test_1@l # load stack_test_1 bits 0-15 into r4 bits 0-15 + std r4, 0(r5) + + mtctr r3 # copy loop count to count register +.Lstack_loop_1: + std r8, 0(r5) + std r8, 16(r5) + std r8, 24(r5) + std r8, 32(r5) + std r8, 40(r5) + std r8, 8(r5) + std r8, 48(r5) + std r8, 0(r5) + std r8, 0(r5) + std r8, 16(r5) + std r8, 24(r5) + std r8, 32(r5) + std r8, 40(r5) + std r8, 8(r5) + std r8, 48(r5) + std r8, 0(r5) + std r8, 0(r5) + std r8, 16(r5) + std r8, 24(r5) + std r8, 32(r5) + std r8, 40(r5) + std r8, 8(r5) + std r8, 48(r5) + std r8, 8(r5) + std r8, 8(r5) + std r8, 16(r5) + std r8, 24(r5) + std r8, 32(r5) + std r8, 40(r5) + std r8, 8(r5) + std r8, 48(r5) + std r8, 8(r5) + bdnz .Lstack_loop_1 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 128 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RegisterToRegister +# Purpose: Reads/writes 64-bit values between registers of +# the main register set. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RegisterToRegister: +_RegisterToRegister: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mtctr r3 # copy loop count to count register +.Lreg_loop_0: + mr r8, r9 + mr r8, r7 + mr r8, r6 + mr r8, r5 + mr r8, r4 + mr r8, r10 + mr r8, r11 + mr r8, r9 + mr r8, r9 + mr r8, r7 + mr r8, r6 + mr r8, r5 + mr r8, r4 + mr r8, r10 + mr r8, r11 + mr r8, r9 + mr r8, r9 + mr r8, r7 + mr r8, r6 + mr r8, r5 + mr r8, r4 + mr r8, r10 + mr r8, r11 + mr r8, r9 + mr r8, r9 + mr r8, r7 + mr r8, r6 + mr r8, r5 + mr r8, r4 + mr r8, r10 + mr r8, r11 + mr r8, r9 + bdnz .Lreg_loop_0 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: VectorToVector +# Purpose: Reads/writes 128-bit values between registers of +# the vector register set, in this case AltiVec. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +VectorToVector: +_VectorToVector: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mtctr r3 # copy loop count to count register +.Lreg_loop_1: + vmr vs0, vs1 # Each move moves 16 bytes, so we need 16 + vmr vs0, vs2 # moves to transfer a 256 byte chunk. + vmr vs0, vs3 + vmr vs2, vs0 + vmr vs1, vs2 + vmr vs2, vs1 + vmr vs0, vs3 + vmr vs3, vs1 + + vmr vs3, vs2 + vmr vs1, vs3 + vmr vs2, vs1 + vmr vs0, vs1 + vmr vs1, vs2 + vmr vs0, vs1 + vmr vs0, vs3 + vmr vs3, vs0 + bdnz .Lreg_loop_1 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: RegisterToVector +# Purpose: Writes 64-bit main register values into 128-bit vector register +# clearing the upper unused bits. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +RegisterToVector: +_RegisterToVector: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mtctr r3 # copy loop count to count register +.Lreg_loop_2: + mtvsrd vs1, r8 # Each movq transfers 8 bytes, so we need + mtvsrd vs2, r5 # 32 transfers to move a 256-byte chunk. + mtvsrd vs3, r9 + mtvsrd vs1, r7 + mtvsrd vs2, r5 + mtvsrd vs3, r10 + mtvsrd vs0, r4 + mtvsrd vs0, r6 + + mtvsrd vs0, r8 + mtvsrd vs1, r5 + mtvsrd vs2, r9 + mtvsrd vs3, r7 + mtvsrd vs0, r5 + mtvsrd vs3, r10 + mtvsrd vs2, r4 + mtvsrd vs1, r6 + + mtvsrd vs0, r8 + mtvsrd vs1, r5 + mtvsrd vs2, r9 + mtvsrd vs3, r7 + mtvsrd vs0, r5 + mtvsrd vs3, r10 + mtvsrd vs2, r4 + mtvsrd vs1, r6 + + mtvsrd vs0, r8 + mtvsrd vs1, r5 + mtvsrd vs2, r9 + mtvsrd vs3, r7 + mtvsrd vs0, r5 + mtvsrd vs3, r10 + mtvsrd vs2, r4 + mtvsrd vs1, r6 + bdnz .Lreg_loop_2 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr + +#------------------------------------------------------------------------------ +# Name: VectorToRegister +# Purpose: Writes lower 64 bits of vector register into 64-bit main +# register. +# Params: r3 = loops +#------------------------------------------------------------------------------ +.align 3 # align to 8 byte boundary +VectorToRegister: +_VectorToRegister: + stdu r1, -32(r1) # update and store stack pointer + mflr r0 # set up the stack frame + std r0, 16(r1) # save the link register + + mtctr r3 # copy loop count to count register +.Lreg_loop_3: + mfvsrd r6, vs1 + mfvsrd r6, vs2 + mfvsrd r6, vs3 + mfvsrd r6, vs1 + mfvsrd r6, vs2 + mfvsrd r6, vs3 + mfvsrd r6, vs0 + mfvsrd r6, vs0 + + mfvsrd r6, vs0 + mfvsrd r6, vs1 + mfvsrd r6, vs2 + mfvsrd r6, vs3 + mfvsrd r6, vs0 + mfvsrd r6, vs3 + mfvsrd r6, vs2 + mfvsrd r6, vs1 + + mfvsrd r6, vs0 + mfvsrd r6, vs1 + mfvsrd r6, vs2 + mfvsrd r6, vs3 + mfvsrd r6, vs0 + mfvsrd r6, vs3 + mfvsrd r6, vs2 + mfvsrd r6, vs1 + + mfvsrd r6, vs0 + mfvsrd r6, vs1 + mfvsrd r6, vs2 + mfvsrd r6, vs3 + mfvsrd r6, vs0 + mfvsrd r6, vs3 + mfvsrd r6, vs2 + mfvsrd r6, vs1 + bdnz .Lreg_loop_3 + + ld r0, 16(r1) # restore saved link register + mtlr r0 + addi r1, r1, 32 # destroy the stack frame + blr -- cgit v1.1