summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xMakefile7
-rwxr-xr-xdefs.h4
-rwxr-xr-xmain.c246
-rwxr-xr-xroutines-ppc64el.asm1434
4 files changed, 1665 insertions, 26 deletions
diff --git a/Makefile b/Makefile
index 913d023..c89d707 100755
--- a/Makefile
+++ b/Makefile
@@ -32,6 +32,7 @@ message:
@echo ""
@echo "To compile for x86 Linux: make bandwidth32"
@echo "To compile for x86_64 Linux: make bandwidth64"
+ @echo "To compile for ppc64el Linux: make bandwidth-ppc64el"
@echo "To compile for x86 Mac OS/X: make bandwidth-mac32"
@echo "To compile for x86_64 Mac OS/X: make bandwidth-mac64"
@echo "To compile for x86 Win32/Cygwin: make bandwidth-win32"
@@ -43,6 +44,11 @@ bandwidth64: main.c routines64.asm BMP64.a BMPGraphing64.a
${CC} ${CFLAGS} -m64 -c ${SRC}
${LD} -m64 routines64.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth64
+bandwidth-ppc64el: main.c routines-ppc64el.asm BMP64.a BMPGraphing64.a
+ as -mregnames -mpower8 -o routines-ppc64el.o routines-ppc64el.asm
+ ${CC} ${CFLAGS} -m64 -c ${SRC}
+ ${LD} -m64 routines-ppc64el.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth-ppc64el
+
bandwidth32: main.c routines32.asm BMP32.a BMPGraphing32.a
${AS} -f elf routines32.asm -o routines32.o
${CC} ${CFLAGS} -m32 -c ${SRC}
@@ -81,6 +87,7 @@ BMP32.a: BMP.c
clean:
rm -f main.o bandwidth bandwidth32 bandwidth64 routines32.o routines64.o
+ rm -f bandwidth-ppc64el routines-ppc64el.o
rm -f bandwidth-win32.exe bandwidth.bmp bandwidth-mac32 bandwidth-mac64
rm -f BMP.o BMP32.a BMP64.a BMPGraphing.o BMPGraphing32.a BMPGraphing64.a
rm -f font.o minifont.o network_bandwidth.bmp
diff --git a/defs.h b/defs.h
index 176dbd1..54597ef 100755
--- a/defs.h
+++ b/defs.h
@@ -98,13 +98,17 @@ extern int CopyAVX (void*, void*, unsigned long, unsigned long);
extern int CopySSE_128bytes (void*, void*, unsigned long, unsigned long);
extern int ReaderAVX (void *ptr, unsigned long, unsigned long);
+extern int ReaderVSX (void *ptr, unsigned long, unsigned long);
extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long);
extern int ReaderSSE2_bypass (void *ptr, unsigned long, unsigned long);
+extern int RandomReaderVSX (unsigned long **ptr, unsigned long, unsigned long);
extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long);
extern int RandomReaderSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long);
extern int WriterAVX (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int WriterVSX (void *ptr, unsigned long, unsigned long, unsigned long);
extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int RandomWriterVSX(unsigned long **ptr, unsigned long, unsigned long, unsigned long);
extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long);
extern int ReaderSSE2_128bytes(void *ptr, unsigned long, unsigned long);
diff --git a/main.c b/main.c
index 2d293a8..ebcbbd0 100755
--- a/main.c
+++ b/main.c
@@ -1,6 +1,7 @@
/*============================================================================
bandwidth 1.1, a benchmark to estimate memory transfer bandwidth.
Copyright (C) 2005-2014 by Zack T Smith.
+ Copyright (c) 2015 Raptor Engineering
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,7 +17,7 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- The author may be reached at veritas@comcast.net.
+ Zack Smith may be reached at veritas@comcast.net.
*===========================================================================*/
#include <stdio.h>
@@ -44,8 +45,13 @@
#include "BMP.h"
#include "BMPGraphing.h"
+#if defined(__x86_64__) || defined(__i386__)
#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
+#else
+#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering"
+#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering"
+#endif
#ifdef __WIN32__
#include <windows.h>
@@ -64,6 +70,7 @@ enum {
SSE2_BYPASS,
AVX,
AVX_BYPASS,
+ VSX,
LODSQ,
LODSD,
LODSW,
@@ -88,6 +95,7 @@ static uint32_t cpu_has_sse42 = 0;
static uint32_t cpu_has_aes = 0;
static uint32_t cpu_has_avx = 0;
static uint32_t cpu_has_avx2 = 0;
+static uint32_t cpu_has_vsx = 0;
static uint32_t cpu_has_64bit = 0;
static uint32_t cpu_has_xd = 0;
@@ -160,6 +168,11 @@ static int chunk_sizes[] = {
72 * 1024 * 1024,
96 * 1024 * 1024,
128 * 1024 * 1024,
+#if defined(__PPC64__)
+ 256 * 1024 * 1024,
+ 512 * 1024 * 1024,
+ 1024 * 1024 * 1024,
+#endif
0
};
@@ -330,7 +343,7 @@ do_write (unsigned long size, int mode, bool random)
unsigned char *chunk0;
unsigned long loops;
unsigned long long total_count=0;
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
unsigned long value = 0x1234567689abcdef;
#else
unsigned long value = 0x12345678;
@@ -343,17 +356,31 @@ do_write (unsigned long size, int mode, bool random)
error ("do_write(): chunk size is not multiple of 128.");
//-------------------------------------------------
+#if defined(__PPC64__)
+ // Align to 128-byte boundaries
+ chunk0 = malloc (size+256);
+ chunk = chunk0;
+ if (!chunk)
+ error ("Out of memory");
+ tmp = (unsigned long) chunk;
+ if (tmp & 127) {
+ tmp -= (tmp & 127);
+ tmp += 128;
+ chunk = (unsigned char*) tmp;
+ }
+#else
+ // Align to 32-byte boundaries
chunk0 = malloc (size+64);
chunk = chunk0;
if (!chunk)
error ("Out of memory");
-
tmp = (unsigned long) chunk;
if (tmp & 31) {
tmp -= (tmp & 31);
tmp += 32;
chunk = (unsigned char*) tmp;
}
+#endif
//----------------------------------------
// Set up random pointers to chunks.
@@ -401,6 +428,9 @@ do_write (unsigned long size, int mode, bool random)
case AVX:
print (L"(256-bit), size = ");
break;
+ case VSX:
+ print (L"(128-bit), size = ");
+ break;
case AVX_BYPASS:
print (L"bypassing cache (256-bit), size = ");
break;
@@ -408,7 +438,7 @@ do_write (unsigned long size, int mode, bool random)
print (L"bypassing cache (128-bit), size = ");
break;
default:
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"(64-bit), size = ");
#else
print (L"(32-bit), size = ");
@@ -428,6 +458,7 @@ do_write (unsigned long size, int mode, bool random)
total_count += loops;
switch (mode) {
+#if defined(__x86_64__) || defined(__i386__)
case SSE2:
if (random)
RandomWriterSSE2 (chunk_ptrs, size/256, loops, value);
@@ -461,7 +492,17 @@ do_write (unsigned long size, int mode, bool random)
WriterAVX_bypass (chunk, size, loops, value);
}
break;
-
+#endif
+
+#if defined(__PPC64__)
+ case VSX:
+ if (random)
+ RandomWriterVSX (chunk_ptrs, size/256, loops, value);
+ else
+ WriterVSX (chunk, size, loops, value);
+ break;
+#endif
+
default:
if (random)
RandomWriter (chunk_ptrs, size/256, loops, value);
@@ -515,6 +556,22 @@ do_read (unsigned long size, int mode, bool random)
error ("do_read(): chunk size is not multiple of 128.");
//-------------------------------------------------
+#if defined(__PPC64__)
+ // Align to 128-byte boundaries
+ chunk0 = chunk = malloc (size+256);
+ if (!chunk)
+ error ("Out of memory");
+
+ memset (chunk, 0, size);
+
+ tmp = (unsigned long) chunk;
+ if (tmp & 127) {
+ tmp -= (tmp & 127);
+ tmp += 128;
+ chunk = (unsigned char*) tmp;
+ }
+#else
+ // Align to 32-byte boundaries
chunk0 = chunk = malloc (size+64);
if (!chunk)
error ("Out of memory");
@@ -527,6 +584,7 @@ do_read (unsigned long size, int mode, bool random)
tmp += 32;
chunk = (unsigned char*) tmp;
}
+#endif
//----------------------------------------
// Set up random pointers to chunks.
@@ -586,6 +644,9 @@ do_read (unsigned long size, int mode, bool random)
case AVX:
print (L"(256-bit), size = ");
break;
+ case VSX:
+ print (L"(128-bit), size = ");
+ break;
case AVX_BYPASS:
print (L"bypassing cache (256-bit), size = ");
break;
@@ -593,7 +654,7 @@ do_read (unsigned long size, int mode, bool random)
print (L"bypassing cache (128-bit), size = ");
break;
default:
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"(64-bit), size = ");
#else
print (L"(32-bit), size = ");
@@ -615,6 +676,7 @@ do_read (unsigned long size, int mode, bool random)
total_count += loops;
switch (mode) {
+#if defined(__x86_64__) || defined(__i386__)
case SSE2:
if (random)
RandomReaderSSE2 (chunk_ptrs, size/256, loops);
@@ -644,7 +706,17 @@ do_read (unsigned long size, int mode, bool random)
ReaderAVX (chunk, size, loops);
}
break;
-
+#endif
+
+#if defined(__PPC64__)
+ case VSX:
+ if (random)
+ RandomReaderVSX (chunk_ptrs, size/256, loops);
+ else
+ ReaderVSX (chunk, size, loops);
+ break;
+#endif
+
case LODSB:
if (!random) {
ReaderLODSB (chunk, size, loops);
@@ -701,7 +773,7 @@ do_read (unsigned long size, int mode, bool random)
}
-
+#if defined(__x86_64__) || defined(__i386__)
//----------------------------------------------------------------------------
// Name: do_copy
// Purpose: Performs sequential memory copy.
@@ -722,6 +794,33 @@ do_copy (unsigned long size, int mode)
error ("do_copy(): chunk size is not multiple of 128.");
//-------------------------------------------------
+
+#if defined(__PPC64__)
+ // Align to 128-byte boundaries
+ chunk_src0 = chunk_src = malloc (size+256);
+ if (!chunk_src)
+ error ("Out of memory");
+ chunk_dest0 = chunk_dest = malloc (size+256);
+ if (!chunk_dest)
+ error ("Out of memory");
+
+ memset (chunk_src, 100, size);
+ memset (chunk_dest, 200, size);
+
+ tmp = (unsigned long) chunk_src;
+ if (tmp & 127) {
+ tmp -= (tmp & 127);
+ tmp += 128;
+ chunk_src = (unsigned char*) tmp;
+ }
+ tmp = (unsigned long) chunk_dest;
+ if (tmp & 127) {
+ tmp -= (tmp & 127);
+ tmp += 128;
+ chunk_dest = (unsigned char*) tmp;
+ }
+#else
+ // Align to 32-byte boundaries
chunk_src0 = chunk_src = malloc (size+64);
if (!chunk_src)
error ("Out of memory");
@@ -731,7 +830,7 @@ do_copy (unsigned long size, int mode)
memset (chunk_src, 100, size);
memset (chunk_dest, 200, size);
-
+
tmp = (unsigned long) chunk_src;
if (tmp & 31) {
tmp -= (tmp & 31);
@@ -744,6 +843,7 @@ do_copy (unsigned long size, int mode)
tmp += 32;
chunk_dest = (unsigned char*) tmp;
}
+#endif
//-------------------------------------------------
print (L"Sequential copy ");
@@ -755,7 +855,7 @@ do_copy (unsigned long size, int mode)
print (L"(256-bit), size = ");
}
else {
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"(64-bit), size = ");
#else
print (L"(32-bit), size = ");
@@ -770,14 +870,14 @@ do_copy (unsigned long size, int mode)
loops = (1 << 26) / size; // XX need to adjust for CPU MHz
if (loops < 1)
loops = 1;
-
+
t0 = mytime ();
while (diff < usec_per_test) {
total_count += loops;
if (mode == SSE2) {
-#ifdef __x86_64__
+#if defined(__x86_64__)
if (size & 128)
CopySSE_128bytes (chunk_dest, chunk_src, size, loops);
else
@@ -808,6 +908,7 @@ do_copy (unsigned long size, int mode)
return result;
}
+#endif
//----------------------------------------------------------------------------
@@ -827,7 +928,7 @@ fb_readwrite (bool use_sse2)
unsigned long datum;
int fd;
register unsigned long foo;
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
unsigned long value = 0x1234567689abcdef;
#else
unsigned long value = 0x12345678;
@@ -901,9 +1002,11 @@ fb_readwrite (bool use_sse2)
total_count = FBLOOPS_R;
+#if defined(__x86_64__) || defined(__i386__)
if (use_sse2)
ReaderSSE2 (fb, length, FBLOOPS_R);
else
+#endif
Reader (fb, length, FBLOOPS_R);
diff = mytime () - t0;
@@ -921,9 +1024,11 @@ fb_readwrite (bool use_sse2)
total_count = FBLOOPS_W;
+#if defined(__x86_64__) || defined(__i386__)
if (use_sse2)
WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
else
+#endif
Writer (fb, length, FBLOOPS_W, value);
diff = mytime () - t0;
@@ -937,6 +1042,8 @@ fb_readwrite (bool use_sse2)
// Name: register_test
// Purpose: Determines bandwidth of register-to-register transfers.
//----------------------------------------------------------------------------
+#define REGISTER_COUNT 10000
+#define VREGISTER_COUNT 3333
void
register_test ()
{
@@ -945,13 +1052,12 @@ register_test ()
unsigned long diff = 0;
//--------------------------------------
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"Main register to main register transfers (64-bit) ");
#else
print (L"Main register to main register transfers (32-bit) ");
#endif
flush ();
-#define REGISTER_COUNT 10000
t0 = mytime ();
while (diff < usec_per_test)
@@ -967,13 +1073,12 @@ register_test ()
flush ();
//--------------------------------------
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"Main register to vector register transfers (64-bit) ");
#else
print (L"Main register to vector register transfers (32-bit) ");
#endif
flush ();
-#define VREGISTER_COUNT 3333
t0 = mytime ();
diff = 0;
@@ -991,7 +1096,7 @@ register_test ()
flush ();
//--------------------------------------
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"Vector register to main register transfers (64-bit) ");
#else
print (L"Vector register to main register transfers (32-bit) ");
@@ -1032,6 +1137,8 @@ register_test ()
newline ();
flush ();
+#if defined(__x86_64__) || defined(__i386__)
+
//--------------------------------------
if (cpu_has_avx) {
print (L"Vector register to vector register transfers (256-bit) ");
@@ -1216,6 +1323,7 @@ register_test ()
newline ();
flush ();
}
+#endif
}
//----------------------------------------------------------------------------
@@ -1229,7 +1337,7 @@ stack_test ()
unsigned long t0;
unsigned long diff = 0;
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"Stack-to-register transfers (64-bit) ");
#else
print (L"Stack-to-register transfers (32-bit) ");
@@ -1252,7 +1360,7 @@ stack_test ()
newline ();
flush ();
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
print (L"Register-to-stack transfers (64-bit) ");
#else
print (L"Register-to-stack transfers (32-bit) ");
@@ -1873,7 +1981,8 @@ main (int argc, char **argv)
}
printf ("This is bandwidth version %s.\n", RELEASE);
- printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n");
+ printf ("Copyright (C) 2005-2014 by Zack T Smith.\n");
+ printf ("Copyright (C) 2015 Raptor Engineering.\n\n");
printf ("This software is covered by the GNU Public License.\n");
printf ("It is provided AS-IS, use at your own risk.\n");
printf ("See the file COPYING for more information.\n\n");
@@ -1909,6 +2018,11 @@ main (int argc, char **argv)
return 0;
}
+#if defined(__PPC64__)
+ cpu_has_vsx = 1;
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
uint32_t ecx = get_cpuid1_ecx ();
uint32_t edx = get_cpuid1_edx ();
cpu_has_mmx = edx & CPUID_EDX_MMX;
@@ -2019,6 +2133,8 @@ main (int argc, char **argv)
if (!cpu_has_sse2)
use_sse2 = false;
+#endif
+
println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B.");
flush ();
@@ -2054,6 +2170,7 @@ main (int argc, char **argv)
strcpy (graph_title, TITLE_MEMORY_GRAPH);
BMPGraphing_set_title (graph, graph_title);
+#if defined(__x86_64__) || defined(__i386__)
//------------------------------------------------------------
// SSE2 sequential reads.
//
@@ -2239,11 +2356,85 @@ main (int argc, char **argv)
}
}
+#endif
+
+#if defined(__PPC64__)
+ //------------------------------------------------------------
+ // VSX sequential reads.
+ //
+ if (cpu_has_vsx) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_TURQUOISE);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, VSX, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // VSX random reads.
+ //
+ if (cpu_has_vsx) {
+ BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON);
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, VSX, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // VSX sequential writes that do not bypass the caches.
+ //
+ if (cpu_has_vsx) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PINK);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, VSX, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // VSX random writes that do not bypass the caches.
+ //
+ if (cpu_has_vsx) {
+ BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE);
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, VSX, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+#endif
+
//------------------------------------------------------------
// Sequential non-SSE2 reads.
//
newline ();
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE);
#else
BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE);
@@ -2259,7 +2450,7 @@ main (int argc, char **argv)
// Random non-SSE2 reads.
//
newline ();
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN);
#else
BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN);
@@ -2277,7 +2468,7 @@ main (int argc, char **argv)
//------------------------------------------------------------
// Sequential non-SSE2 writes.
//
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN);
#else
BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN);
@@ -2294,7 +2485,7 @@ main (int argc, char **argv)
//------------------------------------------------------------
// Random non-SSE2 writes.
//
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN);
#else
BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN);
@@ -2311,6 +2502,8 @@ main (int argc, char **argv)
}
}
+#if defined(__x86_64__) || defined(__i386__)
+
//------------------------------------------------------------
// SSE2 sequential copy.
//
@@ -2342,9 +2535,10 @@ main (int argc, char **argv)
}
}
}
+#endif
#ifdef DOING_LODS
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__PPC64__)
//------------------------------------------------------------
// LODSQ 64-bit sequential reads.
//
diff --git a/routines-ppc64el.asm b/routines-ppc64el.asm
new file mode 100755
index 0000000..ce9e2cf
--- /dev/null
+++ b/routines-ppc64el.asm
@@ -0,0 +1,1434 @@
+#============================================================================
+# ppc64el routines for bandwidth, a benchmark to estimate memory transfer bandwidth.
+# Copyright (c) 2015 Raptor Engineering
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The author of this file may be reached at tpearson@raptorengineeringinc.com.
+#=============================================================================
+
+.global ReaderLODSQ # 64-bit sequential reader (LD)
+.global _ReaderLODSQ # underscore-prefixed alias of the same entry point
+
+.global ReaderLODSD # 32-bit sequential reader (LWZ)
+.global _ReaderLODSD # underscore-prefixed alias of the same entry point
+
+.global ReaderLODSW # 16-bit sequential reader (LHZ)
+.global _ReaderLODSW # underscore-prefixed alias of the same entry point
+
+.global ReaderLODSB # 8-bit sequential reader (LBZ)
+.global _ReaderLODSB # underscore-prefixed alias of the same entry point
+
+.global RandomReader
+.global RandomWriter
+.global Reader # 64-bit sequential reader, 256 bytes per inner pass
+.global Reader_128bytes # 64-bit sequential reader, 128 bytes per inner pass
+.global RandomReaderVSX # 128-bit reader driven by an array of chunk pointers
+.global ReaderVSX # 128-bit sequential reader, 256 bytes per inner pass
+.global RegisterToRegister
+.global RegisterToVector
+.global StackReader
+.global StackWriter
+.global VectorToRegister
+.global VectorToVector
+.global Writer
+.global Writer_128bytes
+.global WriterVSX
+.global RandomWriterVSX
+.global _RandomReader
+.global _RandomWriter
+.global _Reader # alias: label shares the address of Reader
+.global _Reader_128bytes # alias: label shares the address of Reader_128bytes
+.global _RandomReaderVSX # alias: label shares the address of RandomReaderVSX
+.global _ReaderVSX # alias: label shares the address of ReaderVSX
+.global _RegisterToRegister
+.global _RegisterToVector
+.global _StackReader
+.global _StackWriter
+.global _VectorToRegister
+.global _VectorToVector
+.global _Writer
+.global _Writer_128bytes
+.global _WriterVSX
+.global _RandomWriterVSX
+
+.data
+.align 3 # align to 8 byte boundary
+stack_test_1: # NOTE(review): presumably consumed by the stack tests - confirm
+ .quad 1000 # 64-bit constant
+stack_test_2:
+ .quad 2000 # 64-bit constant
+stack_test_3:
+ .quad 3000 # 64-bit constant
+stack_test_4:
+ .quad 4000 # 64-bit constant
+stack_test_5:
+ .quad 5000 # 64-bit constant
+stack_test_6:
+ .quad 6000 # 64-bit constant
+stack_test_7:
+ .quad 7000 # 64-bit constant
+
+# Note:
+# The 64-bit ELF v2 ABI (ppc64el) passes integer parameters in these registers, in this order:
+# r3, r4, r5, r6, r7, r8, r9, r10
+
+.text
+
+#------------------------------------------------------------------------------
+# Name: ReaderLODSQ
+# Purpose: Reads 64-bit values sequentially from an area of memory
+# using the LD instruction, one doubleword per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 8; inner count is r4 / 8)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 2 # align to 4 byte boundary
+ReaderLODSQ:
+_ReaderLODSQ:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 3 # length in 64-bit doublewords, rounded down
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_0:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_0:
+ ld r8, 0(r9) # loaded value is discarded; only the access matters
+ addi r9, r9, 8 # increment pointer by 1 doubleword
+ bdnz .Linner_loop_0 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_0
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: ReaderLODSD
+# Purpose: Reads 32-bit values sequentially from an area of memory
+# using the LWZ instruction, one word per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 4; inner count is r4 / 4)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 2 # align to 4 byte boundary
+ReaderLODSD:
+_ReaderLODSD:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 2 # length in 32-bit words, rounded down
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_1:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_1:
+ lwz r8, 0(r9) # loaded value is discarded; only the access matters
+ addi r9, r9, 4 # increment pointer by 1 word
+ bdnz .Linner_loop_1 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_1
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: ReaderLODSW
+# Purpose: Reads 16-bit values sequentially from an area of memory
+# using the LHZ instruction, one halfword per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 2; inner count is r4 / 2)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 2 # align to 4 byte boundary
+ReaderLODSW:
+_ReaderLODSW:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 1 # length in 16-bit halfwords, rounded down
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_2:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_2:
+ lhz r8, 0(r9) # loaded value is discarded; only the access matters
+ addi r9, r9, 2 # increment pointer by 1 halfword
+ bdnz .Linner_loop_2 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_2
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: ReaderLODSB
+# Purpose: Reads 8-bit values sequentially from an area of memory
+# using the LBZ instruction, one byte per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (used directly as inner count, must be >= 1)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 2 # align to 4 byte boundary
+ReaderLODSB:
+_ReaderLODSB:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_3:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r4 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_3:
+ lbz r8, 0(r9) # loaded value is discarded; only the access matters
+ addi r9, r9, 1 # increment pointer by 1 byte
+ bdnz .Linner_loop_3 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_3
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+
+#------------------------------------------------------------------------------
+# Name: Reader
+# Purpose: Reads 64-bit values sequentially, 256 bytes per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 256; inner count is r4 / 256)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+Reader:
+_Reader:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 8 # length in bytes / 256, rounded down
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_4:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_4:
+ ld r8, 0(r9) # 32 unrolled doubleword loads; values are discarded
+ ld r8, 8(r9)
+ ld r8, 16(r9)
+ ld r8, 24(r9)
+ ld r8, 32(r9)
+ ld r8, 40(r9)
+ ld r8, 48(r9)
+ ld r8, 56(r9)
+ ld r8, 64(r9)
+ ld r8, 72(r9)
+ ld r8, 80(r9)
+ ld r8, 88(r9)
+ ld r8, 96(r9)
+ ld r8, 104(r9)
+ ld r8, 112(r9)
+ ld r8, 120(r9)
+ ld r8, 128(r9)
+ ld r8, 136(r9)
+ ld r8, 144(r9)
+ ld r8, 152(r9)
+ ld r8, 160(r9)
+ ld r8, 168(r9)
+ ld r8, 176(r9)
+ ld r8, 184(r9)
+ ld r8, 192(r9)
+ ld r8, 200(r9)
+ ld r8, 208(r9)
+ ld r8, 216(r9)
+ ld r8, 224(r9)
+ ld r8, 232(r9)
+ ld r8, 240(r9)
+ ld r8, 248(r9)
+ addi r9, r9, 256 # increment pointer by 256 bytes
+ bdnz .Linner_loop_4 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_4
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: Reader_128bytes
+# Purpose: Reads 64-bit values sequentially, 128 bytes per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 128; inner count is r4 / 128)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+Reader_128bytes:
+_Reader_128bytes:
+ stdu r1, -32(r1) # allocate a 32 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 7 # length in bytes / 128, rounded down
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_5:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_loop_5:
+ ld r8, 0(r9) # 16 unrolled doubleword loads; values are discarded
+ ld r8, 8(r9)
+ ld r8, 16(r9)
+ ld r8, 24(r9)
+ ld r8, 32(r9)
+ ld r8, 40(r9)
+ ld r8, 48(r9)
+ ld r8, 56(r9)
+ ld r8, 64(r9)
+ ld r8, 72(r9)
+ ld r8, 80(r9)
+ ld r8, 88(r9)
+ ld r8, 96(r9)
+ ld r8, 104(r9)
+ ld r8, 112(r9)
+ ld r8, 120(r9)
+ addi r9, r9, 128 # increment pointer by 128 bytes
+ bdnz .Linner_loop_5 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_loop_5
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RandomReaderVSX
+# Purpose: Reads 128-bit values from 256-byte chunks in a scattered order.
+# Params: r3 = ptr to array of chunk pointers (8 bytes per entry)
+# r4 = # of chunks (must be >= 1; visited from last entry to first)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RandomReaderVSX:
+_RandomReaderVSX:
+ stdu r1, -160(r1) # allocate frame; size kept a multiple of 16 per the ABI
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+ std r14, 24(r1) # save r14
+ std r15, 32(r1) # save r15
+ std r16, 40(r1) # save r16
+ std r17, 48(r1) # save r17
+ std r18, 56(r1) # save r18
+ std r19, 64(r1) # save r19
+ std r20, 72(r1) # save r20
+ std r21, 80(r1) # save r21
+ std r22, 88(r1) # save r22
+ std r23, 96(r1) # save r23
+ std r24, 104(r1) # save r24
+ std r25, 112(r1) # save r25
+ std r26, 120(r1) # save r26
+ std r27, 128(r1) # save r27
+ std r28, 136(r1) # save r28
+
+ li r12, 240 # initialize read offset registers (shuffled 16 byte steps)
+ li r14, 128
+ li r15, 64
+ li r16, 208
+ li r17, 112
+ li r18, 176
+ li r19, 144
+ li r20, 0
+
+ li r21, 96
+ li r22, 16
+ li r23, 192
+ li r24, 160
+ li r25, 32
+ li r26, 48
+ li r27, 224
+ li r28, 80
+
+ mr r10, r5 # load outer loop counter
+.Louter_vsx_loop_0:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mr r11, r4 # copy inner loop count to inner loop count register
+.Linner_vsx_loop_0:
+ addi r11, r11, -1 # index of the chunk pointer to use on this pass
+ mr r7, r11 # (re)calculate inner loop data offset
+ sldi r7, r7, 3 # index * 8 = byte offset into the pointer array
+ ldx r9, r3, r7 # (re)compute inner loop start pointer
+
+ lxvd2x v0, r12, r9 # 16 loads of 16 bytes cover the 256 byte chunk once
+ lxvd2x v0, r14, r9
+ lxvd2x v0, r15, r9
+ lxvd2x v0, r16, r9
+ lxvd2x v0, r17, r9
+ lxvd2x v0, r18, r9
+ lxvd2x v0, r19, r9
+ lxvd2x v0, r20, r9
+
+ lxvd2x v0, r21, r9
+ lxvd2x v0, r22, r9
+ lxvd2x v0, r23, r9
+ lxvd2x v0, r24, r9
+ lxvd2x v0, r25, r9
+ lxvd2x v0, r26, r9
+ lxvd2x v0, r27, r9
+ lxvd2x v0, r28, r9
+
+ cmpdi r11, 0 # 64-bit compare: the chunk index is a full doubleword
+ bne .Linner_vsx_loop_0
+
+ cmpdi r10, 0 # 64-bit compare: the loop counter is a full doubleword
+ bne .Louter_vsx_loop_0
+
+ ld r28, 136(r1) # restore r28
+ ld r27, 128(r1) # restore r27
+ ld r26, 120(r1) # restore r26
+ ld r25, 112(r1) # restore r25
+ ld r24, 104(r1) # restore r24
+ ld r23, 96(r1) # restore r23
+ ld r22, 88(r1) # restore r22
+ ld r21, 80(r1) # restore r21
+ ld r20, 72(r1) # restore r20
+ ld r19, 64(r1) # restore r19
+ ld r18, 56(r1) # restore r18
+ ld r17, 48(r1) # restore r17
+ ld r16, 40(r1) # restore r16
+ ld r15, 32(r1) # restore r15
+ ld r14, 24(r1) # restore r14
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 160 # destroy the stack frame (must match the stdu above)
+ blr
+
+#------------------------------------------------------------------------------
+# Name: ReaderVSX
+# Purpose: Reads 128-bit values sequentially, 256 bytes per inner iteration.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (must be >= 256; inner count is r4 / 256)
+# r5 = loops (outer loop count, must be >= 1)
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+ReaderVSX:
+_ReaderVSX:
+ stdu r1, -144(r1) # allocate a 144 byte frame and store the back chain
+ mflr r0 # fetch the return address
+ std r0, 16(r1) # save the link register
+ std r14, 24(r1) # save r14
+ std r15, 32(r1) # save r15
+ std r16, 40(r1) # save r16
+ std r17, 48(r1) # save r17
+ std r18, 56(r1) # save r18
+ std r19, 64(r1) # save r19
+ std r20, 72(r1) # save r20
+ std r21, 80(r1) # save r21
+ std r22, 88(r1) # save r22
+ std r23, 96(r1) # save r23
+ std r24, 104(r1) # save r24
+ std r25, 112(r1) # save r25
+ std r26, 120(r1) # save r26
+ std r27, 128(r1) # save r27
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 8 # length in bytes / 256, rounded down
+
+ li r11, 0 # initialize read offset registers (ascending 16 byte steps)
+ li r12, 16
+ li r14, 32
+ li r15, 48
+ li r16, 64
+ li r17, 80
+ li r18, 96
+ li r19, 112
+
+ li r20, 128
+ li r21, 144
+ li r22, 160
+ li r23, 176
+ li r24, 192
+ li r25, 208
+ li r26, 224
+ li r27, 240
+
+ mr r10, r5 # load outer loop counter
+.Louter_vsx_loop_1:
+ addi r10, r10, -1 # one outer pass consumed (decrement precedes the test)
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_vsx_loop_1:
+ lxvd2x v0, r11, r9 # 16 loads of 16 bytes each; values are discarded
+ lxvd2x v0, r12, r9
+ lxvd2x v0, r14, r9
+ lxvd2x v0, r15, r9
+ lxvd2x v0, r16, r9
+ lxvd2x v0, r17, r9
+ lxvd2x v0, r18, r9
+ lxvd2x v0, r19, r9
+
+ lxvd2x v0, r20, r9
+ lxvd2x v0, r21, r9
+ lxvd2x v0, r22, r9
+ lxvd2x v0, r23, r9
+ lxvd2x v0, r24, r9
+ lxvd2x v0, r25, r9
+ lxvd2x v0, r26, r9
+ lxvd2x v0, r27, r9
+
+ addi r9, r9, 256 # increment pointer by 256 bytes
+ bdnz .Linner_vsx_loop_1 # decrement CTR and loop while it is non-zero
+
+ cmpdi r10, 0 # 64-bit compare: the counter is a full doubleword
+ bne .Louter_vsx_loop_1
+
+ ld r27, 128(r1) # restore r27
+ ld r26, 120(r1) # restore r26
+ ld r25, 112(r1) # restore r25
+ ld r24, 104(r1) # restore r24
+ ld r23, 96(r1) # restore r23
+ ld r22, 88(r1) # restore r22
+ ld r21, 80(r1) # restore r21
+ ld r20, 72(r1) # restore r20
+ ld r19, 64(r1) # restore r19
+ ld r18, 56(r1) # restore r18
+ ld r17, 48(r1) # restore r17
+ ld r16, 40(r1) # restore r16
+ ld r15, 32(r1) # restore r15
+ ld r14, 24(r1) # restore r14
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 144 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RandomReader
+# Purpose: Reads 64-bit values from memory in a scattered order: every
+# doubleword of each 256-byte chunk is loaded once, following a
+# fixed shuffled list of offsets.
+# Params: r3 = ptr to array of chunk pointers
+# r4 = # of chunks
+# r5 = loops
+# Notes: Chunks are visited from index r4 - 1 down to 0 on every pass.
+# All loads target r8; the loaded values are discarded.
+# NOTE(review): both counters are decremented before being tested
+# and cmpwi looks at their low 32 bits only - callers presumably
+# pass non-zero counts below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RandomReader:
+_RandomReader:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_6:
+ addi r10, r10, -1
+ mr r11, r4 # copy inner loop count to inner loop count register
+.Linner_loop_6:
+ addi r11, r11, -1 # r11 = index of the chunk to read next
+ mr r7, r11 # (re)calculate inner loop data offset
+ sldi r7, r7, 3 # index * 8 = byte offset into the pointer array
+ ldx r9, r3, r7 # r9 = start address of the current chunk
+ ld r8, 96(r9) # 32 loads cover offsets 0..248, each exactly once
+ ld r8, 0(r9)
+ ld r8, 120(r9)
+ ld r8, 184(r9)
+ ld r8, 160(r9)
+ ld r8, 176(r9)
+ ld r8, 112(r9)
+ ld r8, 80(r9)
+ ld r8, 32(r9)
+ ld r8, 128(r9)
+ ld r8, 88(r9)
+ ld r8, 40(r9)
+ ld r8, 48(r9)
+ ld r8, 72(r9)
+ ld r8, 200(r9)
+ ld r8, 24(r9)
+ ld r8, 152(r9)
+ ld r8, 16(r9)
+ ld r8, 248(r9)
+ ld r8, 56(r9)
+ ld r8, 240(r9)
+ ld r8, 208(r9)
+ ld r8, 104(r9)
+ ld r8, 216(r9)
+ ld r8, 136(r9)
+ ld r8, 232(r9)
+ ld r8, 64(r9)
+ ld r8, 224(r9)
+ ld r8, 144(r9)
+ ld r8, 192(r9)
+ ld r8, 8(r9)
+ ld r8, 168(r9)
+ cmpwi r11, 0 # index 0 was just read: this pass is complete
+ bne .Linner_loop_6
+
+ cmpwi r10, 0 # any passes left?
+ bne .Louter_loop_6
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RandomWriter
+# Purpose: Writes a 64-bit datum to memory in a scattered order: every
+# doubleword of each 256-byte chunk is stored once, following a
+# fixed shuffled list of offsets.
+# Params: r3 = ptr to array of chunk pointers
+# r4 = # of chunks
+# r5 = loops
+# r6 = datum to write
+# Notes: Chunks are visited from index r4 - 1 down to 0 on every pass.
+# The stores use r6, the register the datum arrives in; r8 is
+# never loaded by this routine and must not be used as the source.
+# NOTE(review): both counters are decremented before being tested
+# and cmpwi looks at their low 32 bits only - callers presumably
+# pass non-zero counts below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RandomWriter:
+_RandomWriter:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mr r10, r5 # load outer loop counter
+.Louter_loop_7:
+ addi r10, r10, -1
+ mr r11, r4 # copy inner loop count to inner loop count register
+.Linner_loop_7:
+ addi r11, r11, -1 # r11 = index of the chunk to write next
+ mr r7, r11 # (re)calculate inner loop data offset
+ sldi r7, r7, 3 # index * 8 = byte offset into the pointer array
+ ldx r9, r3, r7 # r9 = start address of the current chunk
+ std r6, 96(r9) # 32 stores cover offsets 0..248, each exactly once
+ std r6, 0(r9)
+ std r6, 120(r9)
+ std r6, 184(r9)
+ std r6, 160(r9)
+ std r6, 176(r9)
+ std r6, 112(r9)
+ std r6, 80(r9)
+ std r6, 32(r9)
+ std r6, 128(r9)
+ std r6, 88(r9)
+ std r6, 40(r9)
+ std r6, 48(r9)
+ std r6, 72(r9)
+ std r6, 200(r9)
+ std r6, 24(r9)
+ std r6, 152(r9)
+ std r6, 16(r9)
+ std r6, 248(r9)
+ std r6, 56(r9)
+ std r6, 240(r9)
+ std r6, 208(r9)
+ std r6, 104(r9)
+ std r6, 216(r9)
+ std r6, 136(r9)
+ std r6, 232(r9)
+ std r6, 64(r9)
+ std r6, 224(r9)
+ std r6, 144(r9)
+ std r6, 192(r9)
+ std r6, 8(r9)
+ std r6, 168(r9)
+ cmpwi r11, 0 # index 0 was just written: this pass is complete
+ bne .Linner_loop_7
+
+ cmpwi r10, 0 # any passes left?
+ bne .Louter_loop_7
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: WriterVSX
+# Purpose: Writes a 128-bit value sequentially to an area of memory, using
+# sixteen 16-byte VSX stores (stxvd2x) per 256-byte block.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (only whole 256-byte blocks are written)
+# r5 = loops
+# r6 = 64-bit datum; it is replicated into both halves of the
+# 128-bit value that gets stored
+# Notes: The datum is placed in vs0 and every store names vs0, so the
+# value written to memory is the one the caller supplied.
+# Both loop counters are decremented before they are tested, so a
+# zero count does not skip its loop.
+# NOTE(review): cmpwi looks at the low 32 bits of the pass counter
+# only - callers presumably keep r5 below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+WriterVSX:
+_WriterVSX:
+ stdu r1, -144(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+ std r14, 24(r1) # save r14
+ std r15, 32(r1) # save r15
+ std r16, 40(r1) # save r16
+ std r17, 48(r1) # save r17
+ std r18, 56(r1) # save r18
+ std r19, 64(r1) # save r19
+ std r20, 72(r1) # save r20
+ std r21, 80(r1) # save r21
+ std r22, 88(r1) # save r22
+ std r23, 96(r1) # save r23
+ std r24, 104(r1) # save r24
+ std r25, 112(r1) # save r25
+ std r26, 120(r1) # save r26
+ std r27, 128(r1) # save r27
+
+ mtvsrd vs0, r6 # datum -> doubleword 0 of vs0, the register stored below
+ xxpermdi vs0, vs0, vs0, 0 # replicate doubleword 0 into doubleword 1
+
+ mr r7, r4 # calculate inner loop count
+ srdi r7, r7, 8 # length in bytes / 256
+
+ li r11, 0 # initialize write offset registers
+ li r12, 16
+ li r14, 32
+ li r15, 48
+ li r16, 64
+ li r17, 80
+ li r18, 96
+ li r19, 112
+
+ li r20, 128
+ li r21, 144
+ li r22, 160
+ li r23, 176
+ li r24, 192
+ li r25, 208
+ li r26, 224
+ li r27, 240
+
+ mr r10, r5 # load outer loop counter
+.Louter_vsx_loop_2:
+ addi r10, r10, -1
+ mtctr r7 # copy inner loop count to count register
+ mr r9, r3 # (re)load inner loop pointer address
+.Linner_vsx_loop_2:
+ stxvd2x vs0, r11, r9 # 16 bytes to r9 + offset register
+ stxvd2x vs0, r12, r9
+ stxvd2x vs0, r14, r9
+ stxvd2x vs0, r15, r9
+ stxvd2x vs0, r16, r9
+ stxvd2x vs0, r17, r9
+ stxvd2x vs0, r18, r9
+ stxvd2x vs0, r19, r9
+
+ stxvd2x vs0, r20, r9
+ stxvd2x vs0, r21, r9
+ stxvd2x vs0, r22, r9
+ stxvd2x vs0, r23, r9
+ stxvd2x vs0, r24, r9
+ stxvd2x vs0, r25, r9
+ stxvd2x vs0, r26, r9
+ stxvd2x vs0, r27, r9
+
+ addi r9, r9, 256 # increment pointer by 256 bytes
+ bdnz .Linner_vsx_loop_2
+
+ cmpwi r10, 0
+ bne .Louter_vsx_loop_2
+
+ ld r27, 128(r1) # restore r27
+ ld r26, 120(r1) # restore r26
+ ld r25, 112(r1) # restore r25
+ ld r24, 104(r1) # restore r24
+ ld r23, 96(r1) # restore r23
+ ld r22, 88(r1) # restore r22
+ ld r21, 80(r1) # restore r21
+ ld r20, 72(r1) # restore r20
+ ld r19, 64(r1) # restore r19
+ ld r18, 56(r1) # restore r18
+ ld r17, 48(r1) # restore r17
+ ld r16, 40(r1) # restore r16
+ ld r15, 32(r1) # restore r15
+ ld r14, 24(r1) # restore r14
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 144 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RandomWriterVSX
+# Purpose: Writes 128-bit values to memory in a scattered order: every
+# 16-byte slot of each 256-byte chunk is stored once, following a
+# fixed shuffled list of offsets.
+# Params: r3 = ptr to array of chunk pointers
+# r4 = # of chunks
+# r5 = loops
+# r6 = 64-bit datum; it is replicated into both halves of the
+# 128-bit value that gets stored
+# Notes: Chunks are visited from index r4 - 1 down to 0 on every pass.
+# The datum is placed in vs0 and every store names vs0, so the
+# value written to memory is the one the caller supplied.
+# The frame is 160 bytes (not 152) so that r1 stays a multiple of
+# 16, as the 64-bit ELF V2 ABI requires of the stack pointer.
+# NOTE(review): both counters are decremented before being tested
+# and cmpwi looks at their low 32 bits only - callers presumably
+# pass non-zero counts below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RandomWriterVSX:
+_RandomWriterVSX:
+ stdu r1, -160(r1) # update and store stack pointer (16-byte aligned frame)
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+ std r14, 24(r1) # save r14
+ std r15, 32(r1) # save r15
+ std r16, 40(r1) # save r16
+ std r17, 48(r1) # save r17
+ std r18, 56(r1) # save r18
+ std r19, 64(r1) # save r19
+ std r20, 72(r1) # save r20
+ std r21, 80(r1) # save r21
+ std r22, 88(r1) # save r22
+ std r23, 96(r1) # save r23
+ std r24, 104(r1) # save r24
+ std r25, 112(r1) # save r25
+ std r26, 120(r1) # save r26
+ std r27, 128(r1) # save r27
+ std r28, 136(r1) # save r28
+
+ mtvsrd vs0, r6 # datum -> doubleword 0 of vs0, the register stored below
+ xxpermdi vs0, vs0, vs0, 0 # replicate doubleword 0 into doubleword 1
+
+ li r12, 240 # initialize write offset registers
+ li r14, 128 # (the sixteen multiples of 16 in 0..240, shuffled)
+ li r15, 64
+ li r16, 208
+ li r17, 112
+ li r18, 176
+ li r19, 144
+ li r20, 0
+
+ li r21, 96
+ li r22, 16
+ li r23, 192
+ li r24, 160
+ li r25, 32
+ li r26, 48
+ li r27, 224
+ li r28, 80
+
+ mr r10, r5 # load outer loop counter
+.Louter_vsx_loop_3:
+ addi r10, r10, -1
+ mr r11, r4 # copy inner loop count to inner loop count register
+.Linner_vsx_loop_3:
+ addi r11, r11, -1 # r11 = index of the chunk to write next
+ mr r7, r11 # (re)calculate inner loop data offset
+ sldi r7, r7, 3 # index * 8 = byte offset into the pointer array
+ ldx r9, r3, r7 # r9 = start address of the current chunk
+
+ stxvd2x vs0, r12, r9 # 16 bytes to r9 + offset register
+ stxvd2x vs0, r14, r9
+ stxvd2x vs0, r15, r9
+ stxvd2x vs0, r16, r9
+ stxvd2x vs0, r17, r9
+ stxvd2x vs0, r18, r9
+ stxvd2x vs0, r19, r9
+ stxvd2x vs0, r20, r9
+
+ stxvd2x vs0, r21, r9
+ stxvd2x vs0, r22, r9
+ stxvd2x vs0, r23, r9
+ stxvd2x vs0, r24, r9
+ stxvd2x vs0, r25, r9
+ stxvd2x vs0, r26, r9
+ stxvd2x vs0, r27, r9
+ stxvd2x vs0, r28, r9
+
+ cmpwi r11, 0 # index 0 was just written: this pass is complete
+ bne .Linner_vsx_loop_3
+
+ cmpwi r10, 0 # any passes left?
+ bne .Louter_vsx_loop_3
+
+ ld r28, 136(r1) # restore r28
+ ld r27, 128(r1) # restore r27
+ ld r26, 120(r1) # restore r26
+ ld r25, 112(r1) # restore r25
+ ld r24, 104(r1) # restore r24
+ ld r23, 96(r1) # restore r23
+ ld r22, 88(r1) # restore r22
+ ld r21, 80(r1) # restore r21
+ ld r20, 72(r1) # restore r20
+ ld r19, 64(r1) # restore r19
+ ld r18, 56(r1) # restore r18
+ ld r17, 48(r1) # restore r17
+ ld r16, 40(r1) # restore r16
+ ld r15, 32(r1) # restore r15
+ ld r14, 24(r1) # restore r14
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 160 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: Writer
+# Purpose: Sequential write benchmark. Fills a memory area with one 64-bit
+# value, thirty-two doubleword stores per 256-byte block.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (only whole 256-byte blocks are written)
+# r5 = loops (passes over the area)
+# r6 = 64-bit value to write
+# Notes: Both loop counters are decremented before they are tested, so a
+# zero count does not skip its loop.
+# NOTE(review): cmpwi looks at the low 32 bits of the pass counter
+# only - callers presumably keep r5 below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # 2^3 = 8 byte alignment
+Writer:
+_Writer:
+ stdu r1, -32(r1) # open a 32-byte frame and store the back chain
+ mflr r0
+ std r0, 16(r1) # park the return address in the frame
+
+ srdi r7, r4, 8 # blocks per pass = length in bytes / 256
+
+ mr r10, r5 # r10 = passes remaining
+.Louter_loop_8:
+ subi r10, r10, 1
+ mtctr r7 # CTR = blocks remaining in this pass
+ mr r9, r3 # rewind the block pointer to the start of the area
+.Linner_loop_8:
+ std r6, 0(r9) # fill the block front to back, 8 bytes at a time
+ std r6, 8(r9)
+ std r6, 16(r9)
+ std r6, 24(r9)
+ std r6, 32(r9)
+ std r6, 40(r9)
+ std r6, 48(r9)
+ std r6, 56(r9)
+ std r6, 64(r9)
+ std r6, 72(r9)
+ std r6, 80(r9)
+ std r6, 88(r9)
+ std r6, 96(r9)
+ std r6, 104(r9)
+ std r6, 112(r9)
+ std r6, 120(r9)
+ std r6, 128(r9)
+ std r6, 136(r9)
+ std r6, 144(r9)
+ std r6, 152(r9)
+ std r6, 160(r9)
+ std r6, 168(r9)
+ std r6, 176(r9)
+ std r6, 184(r9)
+ std r6, 192(r9)
+ std r6, 200(r9)
+ std r6, 208(r9)
+ std r6, 216(r9)
+ std r6, 224(r9)
+ std r6, 232(r9)
+ std r6, 240(r9)
+ std r6, 248(r9)
+ addi r9, r9, 256 # step to the next 256-byte block
+ bdnz .Linner_loop_8
+
+ cmpwi r10, 0 # any passes left?
+ bne .Louter_loop_8
+
+ ld r0, 16(r1) # recover the return address
+ mtlr r0
+ addi r1, r1, 32 # release the frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: Writer_128bytes
+# Purpose: Sequential write benchmark. Fills a memory area with one 64-bit
+# value, sixteen doubleword stores per 128-byte block.
+# Params: r3 = ptr to memory area
+# r4 = length in bytes (only whole 128-byte blocks are written)
+# r5 = loops (passes over the area)
+# r6 = 64-bit value to write
+# Notes: Both loop counters are decremented before they are tested, so a
+# zero count does not skip its loop.
+# NOTE(review): cmpwi looks at the low 32 bits of the pass counter
+# only - callers presumably keep r5 below 2^32; confirm.
+#------------------------------------------------------------------------------
+.align 3 # 2^3 = 8 byte alignment
+Writer_128bytes:
+_Writer_128bytes:
+ stdu r1, -32(r1) # open a 32-byte frame and store the back chain
+ mflr r0
+ std r0, 16(r1) # park the return address in the frame
+
+ srdi r7, r4, 7 # blocks per pass = length in bytes / 128
+
+ mr r10, r5 # r10 = passes remaining
+.Louter_loop_9:
+ subi r10, r10, 1
+ mtctr r7 # CTR = blocks remaining in this pass
+ mr r9, r3 # rewind the block pointer to the start of the area
+.Linner_loop_9:
+ std r6, 0(r9) # fill the block front to back, 8 bytes at a time
+ std r6, 8(r9)
+ std r6, 16(r9)
+ std r6, 24(r9)
+ std r6, 32(r9)
+ std r6, 40(r9)
+ std r6, 48(r9)
+ std r6, 56(r9)
+ std r6, 64(r9)
+ std r6, 72(r9)
+ std r6, 80(r9)
+ std r6, 88(r9)
+ std r6, 96(r9)
+ std r6, 104(r9)
+ std r6, 112(r9)
+ std r6, 120(r9)
+ addi r9, r9, 128 # step to the next 128-byte block
+ bdnz .Linner_loop_9
+
+ cmpwi r10, 0 # any passes left?
+ bne .Louter_loop_9
+
+ ld r0, 16(r1) # recover the return address
+ mtlr r0
+ addi r1, r1, 32 # release the frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: StackReader
+# Purpose: Reads 64-bit values off the stack into registers of
+# the main register set, effectively testing L1 cache access
+# *and* effective-address calculation speed.
+# Params: r3 = loops
+# Notes: Seven doubleword slots at r5 + 0 .. r5 + 48 (r5 = r1 + 32, inside
+# this routine's own 128-byte frame) are filled with the constants
+# stack_test_1 .. stack_test_7, then each loop iteration performs
+# 32 loads from those slots into r8. The loaded values are
+# discarded. r3 goes straight into CTR, which bdnz decrements
+# before testing, so a zero count does not skip the loop.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+StackReader:
+_StackReader:
+ stdu r1, -128(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+ mr r5, r1 # save stack pointer for testing
+ addi r5, r5, 32 # increment saved testing stack pointer
+
+ # fill the slot at 48(r5) with stack_test_7 (presumably 7000 - TODO confirm)
+ lis r4, stack_test_7@highest # load stack_test_7 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_7@higher # load stack_test_7 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_7@h # load stack_test_7 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_7@l # load stack_test_7 bits 0-15 into r4 bits 0-15
+ std r4, 48(r5)
+
+ # fill the slot at 40(r5) with stack_test_6 (presumably 6000 - TODO confirm)
+ lis r4, stack_test_6@highest # load stack_test_6 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_6@higher # load stack_test_6 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_6@h # load stack_test_6 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_6@l # load stack_test_6 bits 0-15 into r4 bits 0-15
+ std r4, 40(r5)
+
+ # fill the slot at 32(r5) with stack_test_5 (presumably 5000 - TODO confirm)
+ lis r4, stack_test_5@highest # load stack_test_5 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_5@higher # load stack_test_5 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_5@h # load stack_test_5 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_5@l # load stack_test_5 bits 0-15 into r4 bits 0-15
+ std r4, 32(r5)
+
+ # fill the slot at 24(r5) with stack_test_4 (presumably 4000 - TODO confirm)
+ lis r4, stack_test_4@highest # load stack_test_4 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_4@higher # load stack_test_4 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_4@h # load stack_test_4 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_4@l # load stack_test_4 bits 0-15 into r4 bits 0-15
+ std r4, 24(r5)
+
+ # fill the slot at 16(r5) with stack_test_3 (presumably 3000 - TODO confirm)
+ lis r4, stack_test_3@highest # load stack_test_3 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_3@higher # load stack_test_3 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_3@h # load stack_test_3 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_3@l # load stack_test_3 bits 0-15 into r4 bits 0-15
+ std r4, 16(r5)
+
+ # fill the slot at 8(r5) with stack_test_2 (presumably 2000 - TODO confirm)
+ lis r4, stack_test_2@highest # load stack_test_2 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_2@higher # load stack_test_2 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_2@h # load stack_test_2 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_2@l # load stack_test_2 bits 0-15 into r4 bits 0-15
+ std r4, 8(r5)
+
+ # fill the slot at 0(r5) with stack_test_1 (presumably 1000 - TODO confirm)
+ lis r4, stack_test_1@highest # load stack_test_1 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_1@higher # load stack_test_1 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_1@h # load stack_test_1 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_1@l # load stack_test_1 bits 0-15 into r4 bits 0-15
+ std r4, 0(r5)
+
+ mtctr r3 # copy loop count to count register
+.Lstack_loop_0:
+ ld r8, 0(r5) # 32 loads per iteration = 256 bytes read
+ ld r8, 16(r5)
+ ld r8, 24(r5)
+ ld r8, 32(r5)
+ ld r8, 40(r5)
+ ld r8, 8(r5)
+ ld r8, 48(r5)
+ ld r8, 0(r5)
+ ld r8, 0(r5)
+ ld r8, 16(r5)
+ ld r8, 24(r5)
+ ld r8, 32(r5)
+ ld r8, 40(r5)
+ ld r8, 8(r5)
+ ld r8, 48(r5)
+ ld r8, 0(r5)
+ ld r8, 0(r5)
+ ld r8, 16(r5)
+ ld r8, 24(r5)
+ ld r8, 32(r5)
+ ld r8, 40(r5)
+ ld r8, 8(r5)
+ ld r8, 48(r5)
+ ld r8, 8(r5)
+ ld r8, 8(r5)
+ ld r8, 16(r5)
+ ld r8, 24(r5)
+ ld r8, 32(r5)
+ ld r8, 40(r5)
+ ld r8, 8(r5)
+ ld r8, 48(r5)
+ ld r8, 8(r5)
+ bdnz .Lstack_loop_0
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 128 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: StackWriter
+# Purpose: Writes 64-bit values into the stack from registers of
+# the main register set, effectively testing L1 cache access
+# *and* effective-address calculation speed.
+# Params: r3 = loops
+# Notes: Seven doubleword slots at r5 + 0 .. r5 + 48 (r5 = r1 + 32, inside
+# this routine's own 128-byte frame) are first filled with the
+# constants stack_test_1 .. stack_test_7; each loop iteration then
+# performs 32 stores of r8 into those slots, overwriting them.
+# r8 is not set anywhere in this routine, so the stored value is
+# whatever the register held on entry. r3 goes straight into CTR,
+# which bdnz decrements before testing, so a zero count does not
+# skip the loop.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+StackWriter:
+_StackWriter:
+ stdu r1, -128(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+ mr r5, r1 # save stack pointer for testing
+ addi r5, r5, 32 # increment saved testing stack pointer
+
+ # fill the slot at 48(r5) with stack_test_7 (presumably 7000 - TODO confirm)
+ lis r4, stack_test_7@highest # load stack_test_7 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_7@higher # load stack_test_7 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_7@h # load stack_test_7 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_7@l # load stack_test_7 bits 0-15 into r4 bits 0-15
+ std r4, 48(r5)
+
+ # fill the slot at 40(r5) with stack_test_6 (presumably 6000 - TODO confirm)
+ lis r4, stack_test_6@highest # load stack_test_6 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_6@higher # load stack_test_6 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_6@h # load stack_test_6 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_6@l # load stack_test_6 bits 0-15 into r4 bits 0-15
+ std r4, 40(r5)
+
+ # fill the slot at 32(r5) with stack_test_5 (presumably 5000 - TODO confirm)
+ lis r4, stack_test_5@highest # load stack_test_5 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_5@higher # load stack_test_5 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_5@h # load stack_test_5 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_5@l # load stack_test_5 bits 0-15 into r4 bits 0-15
+ std r4, 32(r5)
+
+ # fill the slot at 24(r5) with stack_test_4 (presumably 4000 - TODO confirm)
+ lis r4, stack_test_4@highest # load stack_test_4 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_4@higher # load stack_test_4 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_4@h # load stack_test_4 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_4@l # load stack_test_4 bits 0-15 into r4 bits 0-15
+ std r4, 24(r5)
+
+ # fill the slot at 16(r5) with stack_test_3 (presumably 3000 - TODO confirm)
+ lis r4, stack_test_3@highest # load stack_test_3 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_3@higher # load stack_test_3 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_3@h # load stack_test_3 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_3@l # load stack_test_3 bits 0-15 into r4 bits 0-15
+ std r4, 16(r5)
+
+ # fill the slot at 8(r5) with stack_test_2 (presumably 2000 - TODO confirm)
+ lis r4, stack_test_2@highest # load stack_test_2 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_2@higher # load stack_test_2 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_2@h # load stack_test_2 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_2@l # load stack_test_2 bits 0-15 into r4 bits 0-15
+ std r4, 8(r5)
+
+ # fill the slot at 0(r5) with stack_test_1 (presumably 1000 - TODO confirm)
+ lis r4, stack_test_1@highest # load stack_test_1 bits 48-63 into r4 bits 16-31
+ ori r4, r4, stack_test_1@higher # load stack_test_1 bits 32-47 into r4 bits 0-15
+ rldicr r4, r4, 32, 31 # rotate r4's low word into r4's high word
+ oris r4, r4, stack_test_1@h # load stack_test_1 bits 16-31 into r4 bits 16-31
+ ori r4, r4, stack_test_1@l # load stack_test_1 bits 0-15 into r4 bits 0-15
+ std r4, 0(r5)
+
+ mtctr r3 # copy loop count to count register
+.Lstack_loop_1:
+ std r8, 0(r5) # 32 stores per iteration = 256 bytes written
+ std r8, 16(r5)
+ std r8, 24(r5)
+ std r8, 32(r5)
+ std r8, 40(r5)
+ std r8, 8(r5)
+ std r8, 48(r5)
+ std r8, 0(r5)
+ std r8, 0(r5)
+ std r8, 16(r5)
+ std r8, 24(r5)
+ std r8, 32(r5)
+ std r8, 40(r5)
+ std r8, 8(r5)
+ std r8, 48(r5)
+ std r8, 0(r5)
+ std r8, 0(r5)
+ std r8, 16(r5)
+ std r8, 24(r5)
+ std r8, 32(r5)
+ std r8, 40(r5)
+ std r8, 8(r5)
+ std r8, 48(r5)
+ std r8, 8(r5)
+ std r8, 8(r5)
+ std r8, 16(r5)
+ std r8, 24(r5)
+ std r8, 32(r5)
+ std r8, 40(r5)
+ std r8, 8(r5)
+ std r8, 48(r5)
+ std r8, 8(r5)
+ bdnz .Lstack_loop_1
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 128 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RegisterToRegister
+# Purpose: Reads/writes 64-bit values between registers of
+# the main register set.
+# Params: r3 = loops
+# Notes: Each loop iteration performs 32 register moves (32 x 8 bytes =
+# 256 bytes), all into r8. The source registers are not
+# initialised here, so the values moved are whatever they held on
+# entry. r3 goes straight into CTR, which bdnz decrements before
+# testing, so a zero count does not skip the loop.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RegisterToRegister:
+_RegisterToRegister:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mtctr r3 # copy loop count to count register
+.Lreg_loop_0:
+ mr r8, r9 # four identical groups of eight moves follow
+ mr r8, r7
+ mr r8, r6
+ mr r8, r5
+ mr r8, r4
+ mr r8, r10
+ mr r8, r11
+ mr r8, r9
+ mr r8, r9
+ mr r8, r7
+ mr r8, r6
+ mr r8, r5
+ mr r8, r4
+ mr r8, r10
+ mr r8, r11
+ mr r8, r9
+ mr r8, r9
+ mr r8, r7
+ mr r8, r6
+ mr r8, r5
+ mr r8, r4
+ mr r8, r10
+ mr r8, r11
+ mr r8, r9
+ mr r8, r9
+ mr r8, r7
+ mr r8, r6
+ mr r8, r5
+ mr r8, r4
+ mr r8, r10
+ mr r8, r11
+ mr r8, r9
+ bdnz .Lreg_loop_0
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: VectorToVector
+# Purpose: Reads/writes 128-bit values between registers of
+# the vector register set, in this case AltiVec.
+# Params: r3 = loops
+# Notes: Each loop iteration performs 16 vector moves among registers
+# numbered 0-3. r3 goes straight into CTR, which bdnz decrements
+# before testing, so a zero count does not skip the loop.
+# NOTE(review): vmr is an AltiVec (VMX) instruction, but the
+# operands are spelled with the VSX names vs0-vs3. Presumably the
+# assembler reduces them to plain register numbers, i.e. v0-v3 -
+# confirm against the assembler in use.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+VectorToVector:
+_VectorToVector:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mtctr r3 # copy loop count to count register
+.Lreg_loop_1:
+ vmr vs0, vs1 # Each move moves 16 bytes, so we need 16
+ vmr vs0, vs2 # moves to transfer a 256 byte chunk.
+ vmr vs0, vs3
+ vmr vs2, vs0
+ vmr vs1, vs2
+ vmr vs2, vs1
+ vmr vs0, vs3
+ vmr vs3, vs1
+
+ vmr vs3, vs2
+ vmr vs1, vs3
+ vmr vs2, vs1
+ vmr vs0, vs1
+ vmr vs1, vs2
+ vmr vs0, vs1
+ vmr vs0, vs3
+ vmr vs3, vs0
+ bdnz .Lreg_loop_1
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: RegisterToVector
+# Purpose: Moves 64-bit main register values into 128-bit vector-scalar
+# registers (vs0-vs3) with mtvsrd.
+# Params: r3 = loops
+# Notes: Each loop iteration performs 32 moves. The source registers are
+# not initialised here, so the values moved are whatever they held
+# on entry. r3 goes straight into CTR, which bdnz decrements before
+# testing, so a zero count does not skip the loop.
+# NOTE(review): mtvsrd fills one doubleword of the target; the
+# Power ISA appears to leave the other doubleword undefined rather
+# than cleared - confirm before relying on it being zero.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+RegisterToVector:
+_RegisterToVector:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mtctr r3 # copy loop count to count register
+.Lreg_loop_2:
+ mtvsrd vs1, r8 # Each mtvsrd transfers 8 bytes, so we need
+ mtvsrd vs2, r5 # 32 transfers to move a 256-byte chunk.
+ mtvsrd vs3, r9
+ mtvsrd vs1, r7
+ mtvsrd vs2, r5
+ mtvsrd vs3, r10
+ mtvsrd vs0, r4
+ mtvsrd vs0, r6
+
+ mtvsrd vs0, r8
+ mtvsrd vs1, r5
+ mtvsrd vs2, r9
+ mtvsrd vs3, r7
+ mtvsrd vs0, r5
+ mtvsrd vs3, r10
+ mtvsrd vs2, r4
+ mtvsrd vs1, r6
+
+ mtvsrd vs0, r8
+ mtvsrd vs1, r5
+ mtvsrd vs2, r9
+ mtvsrd vs3, r7
+ mtvsrd vs0, r5
+ mtvsrd vs3, r10
+ mtvsrd vs2, r4
+ mtvsrd vs1, r6
+
+ mtvsrd vs0, r8
+ mtvsrd vs1, r5
+ mtvsrd vs2, r9
+ mtvsrd vs3, r7
+ mtvsrd vs0, r5
+ mtvsrd vs3, r10
+ mtvsrd vs2, r4
+ mtvsrd vs1, r6
+ bdnz .Lreg_loop_2
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
+
+#------------------------------------------------------------------------------
+# Name: VectorToRegister
+# Purpose: Moves 64 bits of a vector-scalar register (vs0-vs3) into a
+# 64-bit main register with mfvsrd.
+# Params: r3 = loops
+# Notes: Each loop iteration performs 32 moves (32 x 8 bytes = 256 bytes),
+# all into r6. The vector-scalar registers are not initialised
+# here, so the values moved are whatever they held on entry. r3
+# goes straight into CTR, which bdnz decrements before testing, so
+# a zero count does not skip the loop.
+# NOTE(review): the Power ISA describes mfvsrd as reading
+# doubleword element 0 of the source; whether that is the "lower"
+# 64 bits depends on the numbering convention - confirm.
+#------------------------------------------------------------------------------
+.align 3 # align to 8 byte boundary
+VectorToRegister:
+_VectorToRegister:
+ stdu r1, -32(r1) # update and store stack pointer
+ mflr r0 # set up the stack frame
+ std r0, 16(r1) # save the link register
+
+ mtctr r3 # copy loop count to count register
+.Lreg_loop_3:
+ mfvsrd r6, vs1 # each mfvsrd transfers 8 bytes into r6
+ mfvsrd r6, vs2
+ mfvsrd r6, vs3
+ mfvsrd r6, vs1
+ mfvsrd r6, vs2
+ mfvsrd r6, vs3
+ mfvsrd r6, vs0
+ mfvsrd r6, vs0
+
+ mfvsrd r6, vs0
+ mfvsrd r6, vs1
+ mfvsrd r6, vs2
+ mfvsrd r6, vs3
+ mfvsrd r6, vs0
+ mfvsrd r6, vs3
+ mfvsrd r6, vs2
+ mfvsrd r6, vs1
+
+ mfvsrd r6, vs0
+ mfvsrd r6, vs1
+ mfvsrd r6, vs2
+ mfvsrd r6, vs3
+ mfvsrd r6, vs0
+ mfvsrd r6, vs3
+ mfvsrd r6, vs2
+ mfvsrd r6, vs1
+
+ mfvsrd r6, vs0
+ mfvsrd r6, vs1
+ mfvsrd r6, vs2
+ mfvsrd r6, vs3
+ mfvsrd r6, vs0
+ mfvsrd r6, vs3
+ mfvsrd r6, vs2
+ mfvsrd r6, vs1
+ bdnz .Lreg_loop_3
+
+ ld r0, 16(r1) # restore saved link register
+ mtlr r0
+ addi r1, r1, 32 # destroy the stack frame
+ blr
OpenPOWER on IntegriCloud