summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZack Smith <1@zsmith.co>2015-12-23 04:52:54 +0000
committerTimothy Pearson <tpearson@raptorengineeringinc.com>2015-12-23 04:54:03 +0000
commit45d37750fc6f3f1f43d23732816ffccc3820e215 (patch)
tree84843c43c2e303f3755f0d68b4f1166490d53a62
downloadbandwidth-benchmark-45d37750fc6f3f1f43d23732816ffccc3820e215.zip
bandwidth-benchmark-45d37750fc6f3f1f43d23732816ffccc3820e215.tar.gz
Initial import of GPLed bandwidthd 1.1b source from author's site
-rwxr-xr-xBMP.c796
-rwxr-xr-xBMP.h100
-rwxr-xr-xBMPGraphing.c486
-rwxr-xr-xBMPGraphing.h88
-rwxr-xr-xCOPYING.txt340
-rwxr-xr-xMakefile87
-rwxr-xr-xREADME.txt167
-rwxr-xr-xdefs.h147
-rwxr-xr-xfont.c1655
-rwxr-xr-xfont.h28
-rwxr-xr-xloopback.sh5
-rwxr-xr-xmain.c2442
-rwxr-xr-xminifont.c845
-rwxr-xr-xminifont.h28
-rwxr-xr-xoutput/._Celeron-2.8GHz-slow.gifbin0 -> 489 bytes
-rwxr-xr-xoutput/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.pngbin0 -> 177 bytes
-rwxr-xr-xoutput/._Corei5-520M-MacOSXLion-32bit-slow.gifbin0 -> 489 bytes
-rwxr-xr-xoutput/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gifbin0 -> 489 bytes
-rwxr-xr-xoutput/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gifbin0 -> 177 bytes
-rwxr-xr-xoutput/Celeron-2.8GHz-slow.gifbin0 -> 14589 bytes
-rwxr-xr-xoutput/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.pngbin0 -> 66393 bytes
-rwxr-xr-xoutput/Corei5-520M-MacOSXLion-32bit-slow.gifbin0 -> 16173 bytes
-rwxr-xr-xoutput/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gifbin0 -> 18693 bytes
-rwxr-xr-xoutput/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gifbin0 -> 17048 bytes
-rwxr-xr-xroutines32.asm2960
-rwxr-xr-xroutines64.asm2590
26 files changed, 12764 insertions, 0 deletions
diff --git a/BMP.c b/BMP.c
new file mode 100755
index 0000000..9327bb7
--- /dev/null
+++ b/BMP.c
@@ -0,0 +1,796 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+//--------------------------------------------------
+// Change Log
+// 0.8 ZS Added larger font of my own design.
+// 0.9 ZS Removed attempt at anti-aliasing.
+//--------------------------------------------------
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "BMP.h"
+#include "font.h"
+#include "minifont.h"
+
+// Narrowest possible numbers: 11 glyphs ('0'-'9' followed by '.'), 7 rows each; '#' marks a set pixel.
+static char* narrow_nums [] =
+{
+ " # ",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " # ",
+
+ " #",
+ "##",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+
+ " # ",
+ "# #",
+ " #",
+ " ##",
+ "# ",
+ "# ",
+ "###",
+
+ "###",
+ " #",
+ " # ",
+ "## ",
+ " #",
+ "# #",
+ " # ",
+
+ "# #",
+ "# #",
+ "# #",
+ "###",
+ " #",
+ " #",
+ " #",
+
+ "###",
+ "# ",
+ "## ",
+ " #",
+ " #",
+ "# #",
+ " # ",
+
+
+ " # ",
+ "# ",
+ "# ",
+ "## ",
+ "# #",
+ "# #",
+ " # ",
+
+ "###",
+ " #",
+ " #",
+ " # ",
+ " # ",
+ " # ",
+ " # ",
+
+ " # ",
+ "# #",
+ "# #",
+ " # ",
+ "# #",
+ "# #",
+ " # ",
+
+ " # ",
+ "# #",
+ "# #",
+ " ##",
+ " #",
+ " # ",
+ "# ",
+
+ " ",
+ "",
+ "",
+ " ",
+ "",
+ "",
+ "#",
+};
+
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_new
+ * Purpose: Creates a new image with every pixel set to zero.
+ * Params:  w, h - requested size in pixels; each is rounded up to the next
+ *                 multiple of 4 before the image is allocated.
+ * Returns: A heap-allocated image (release it with BMP_destroy), or NULL
+ *          when a dimension is < 1, the size cannot be represented, or
+ *          memory allocation fails.
+ *-------------------------------------------------------------------------*/
+BMP*
+BMP_new (int w, int h)
+{
+    BMP *nu;
+
+    if (w < 1 || h < 1)
+        return NULL;
+    //----------
+
+    // Round both dimensions up to a multiple of 4, refusing values for
+    // which the rounding itself would overflow an int.
+    if (w > INT_MAX - 3 || h > INT_MAX - 3)
+        return NULL;
+    w = (w + 3) & ~3;
+    h = (h + 3) & ~3;
+
+    // BMP_point and BMP_getpixel compute y*width + x in int arithmetic,
+    // so the total pixel count has to fit in an int as well.
+    if (w > INT_MAX / h)
+        return NULL;
+
+    nu = calloc (1, sizeof *nu);
+    if (!nu)
+        return NULL;
+
+    // calloc zero-fills the buffer and rejects a count*size overflow.
+    // The element type is RGB, so size by the pointee rather than long.
+    nu->pixels = calloc ((size_t) w * (size_t) h, sizeof *nu->pixels);
+    if (!nu->pixels) {
+        free (nu);
+        return NULL;
+    }
+
+    nu->width = w;
+    nu->height = h;
+    return nu;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_destroy
+ * Purpose: Releases an image together with its pixel buffer.
+ *          Passing NULL is allowed and does nothing.
+ *-------------------------------------------------------------------------*/
+void
+BMP_destroy (BMP* bmp)
+{
+    if (bmp) {
+        free (bmp->pixels);     // free (NULL) is a no-op
+        free (bmp);
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_point
+ * Purpose: Stores one pixel value at (x, y).
+ *          The call is silently ignored when the image or its pixel buffer
+ *          is NULL, or when (x, y) lies outside the image.
+ *-------------------------------------------------------------------------*/
+void
+BMP_point (BMP *bmp, int x, int y, RGB rgb)
+{
+    if (bmp && x >= 0 && y >= 0
+        && x < bmp->width && y < bmp->height
+        && bmp->pixels)
+    {
+        RGB *row = bmp->pixels + y * bmp->width;
+        row[x] = rgb;
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * Name: BMP_line_core
+ * Purpose: Draws a line in a BMP image.
+ * Params: bmp - target image, handed on to BMP_point/BMP_hline/BMP_vline.
+ *         x0,y0 - first endpoint; x1,y1 - second endpoint.
+ *         rgb - pixel value; nothing is drawn when its top byte is 0xff.
+ *         dashed - nonzero: 4 steps are drawn, the next 4 skipped, repeating.
+ * Notes: Undashed single points and axis-aligned lines are delegated to
+ *        BMP_point, BMP_vline and BMP_hline; everything else is stepped
+ *        one pixel at a time along the longer axis.
+ *-------------------------------------------------------------------------*/
+void
+BMP_line_core (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb,
+ int dashed)
+{
+ if ((rgb >> 24) == 0xff)
+ return;
+
+ int dot_counter = 0; // steps taken so far; bit 2 selects the gaps of a dashed line
+
+ if (!dashed && x0 == x1 && y0 == y1)
+ BMP_point (bmp, x0, y0, rgb);
+ else if (!dashed && x0 == x1)
+ BMP_vline (bmp, x0, y0, y1, rgb);
+ else if (!dashed && y0 == y1)
+ BMP_hline (bmp, x0, x1, y0, rgb);
+ else {
+ int j, x, y, dx, dy, e, xchange, s1, s2;
+
+ // DDA, copied from my FramebufferUI project.
+
+ x = x0;
+ y = y0;
+ s1 = 1; // step direction along x
+ s2 = 1; // step direction along y
+
+ dx = x1 - x0;
+ if (dx < 0) {
+ dx = -dx;
+ s1 = -1;
+ }
+
+ dy = y1 - y0;
+ if (dy < 0) {
+ dy = -dy;
+ s2 = -1;
+ }
+
+ xchange = 0;
+
+ if (dy > dx) { // steep line: swap so that dx holds the longer extent
+ int tmp = dx;
+ dx = dy;
+ dy = tmp;
+ xchange = 1;
+ }
+
+ e = (dy<<1) - dx; // error term deciding when the shorter axis advances
+ j = 0;
+
+ while (j <= dx) { // dx+1 iterations, starting at (x0,y0)
+ j++;
+
+ int draw = 1;
+ if (dashed && (1 & (dot_counter >> 2)))
+ draw = 0;
+
+ if (draw)
+ BMP_point (bmp, x, y, rgb);
+
+ dot_counter++;
+
+ if (e >= 0) { // advance along the shorter axis
+ if (xchange)
+ x += s1;
+ else
+ y += s2;
+ e -= (dx << 1);
+ }
+ if (xchange) // always advance along the longer axis
+ y += s2;
+ else
+ x += s1;
+ e += (dy << 1);
+ }
+ }
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_line
+ * Purpose: Draws a solid line from (x0,y0) to (x1,y1) by calling
+ *          BMP_line_core with dashing switched off.
+ *-------------------------------------------------------------------------*/
+void
+BMP_line (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb)
+{
+    const int solid = 0;    // value of the "dashed" flag
+
+    BMP_line_core (bmp, x0, y0, x1, y1, rgb, solid);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_line_dashed
+ * Purpose: Draws a dashed line from (x0,y0) to (x1,y1) by calling
+ *          BMP_line_core with dashing switched on.
+ *-------------------------------------------------------------------------*/
+void
+BMP_line_dashed (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb)
+{
+    const int dashed = 1;   // value of the "dashed" flag
+
+    BMP_line_core (bmp, x0, y0, x1, y1, rgb, dashed);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_rect
+ * Purpose: Draws the one-pixel outline of a rectangle whose top-left corner
+ *          is (x, y) and whose size is w by h pixels. The interior is not
+ *          touched; use BMP_fillrect for a filled rectangle.
+ *-------------------------------------------------------------------------*/
+void
+BMP_rect (BMP *bmp, int x, int y, int w, int h, RGB rgb)
+{
+    const int right = x + w - 1;
+    const int bottom = y + h - 1;
+
+    BMP_hline (bmp, x, right, y, rgb);          // top edge
+    BMP_hline (bmp, x, right, bottom, rgb);     // bottom edge
+    BMP_vline (bmp, x, y, bottom, rgb);         // left edge
+    BMP_vline (bmp, right, y, bottom, rgb);     // right edge
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_fillrect
+ * Purpose: Fills a rectangle with a color.
+ * Params:  x, y - top-left corner; w, h - size in pixels; rgb - pixel value.
+ * Notes:   An empty rectangle (w < 1 or h < 1) draws nothing. Parts that
+ *          fall outside the image are dropped by BMP_point.
+ *-------------------------------------------------------------------------*/
+void
+BMP_fillrect (BMP *bmp, int x, int y, int w, int h, RGB rgb)
+{
+    int row;
+
+    // BMP_hline swaps reversed endpoints, so without this guard a width
+    // of 0 would paint the two columns x-1 and x.
+    if (w < 1 || h < 1)
+        return;
+
+    for (row = 0; row < h; row++)
+        BMP_hline (bmp, x, x + w - 1, y + row, rgb);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_clear
+ * Purpose: Sets all pixels to specified color.
+ * Notes:   A NULL image is ignored, matching the other drawing routines.
+ *-------------------------------------------------------------------------*/
+void
+BMP_clear (BMP *bmp, RGB rgb)
+{
+    // The width/height reads below would otherwise dereference NULL.
+    if (!bmp)
+        return;
+
+    BMP_fillrect (bmp, 0, 0, bmp->width, bmp->height, rgb);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_hline
+ * Purpose: Draws a horizontal run of pixels on row y covering x0..x1
+ *          inclusive. The endpoints may be given in either order.
+ *-------------------------------------------------------------------------*/
+void
+BMP_hline (BMP *bmp, int x0, int x1, int y, RGB rgb)
+{
+    const int left = (x0 > x1) ? x1 : x0;
+    const int right = (x0 > x1) ? x0 : x1;
+    int x;
+
+    for (x = left; x <= right; x++)
+        BMP_point (bmp, x, y, rgb);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_vline
+ * Purpose: Draws a vertical run of pixels in column x covering y0..y1
+ *          inclusive. The endpoints may be given in either order.
+ *-------------------------------------------------------------------------*/
+void
+BMP_vline (BMP *bmp, int x, int y0, int y1, RGB rgb)
+{
+    const int top = (y0 > y1) ? y1 : y0;
+    const int bottom = (y0 > y1) ? y0 : y1;
+    int y;
+
+    for (y = top; y <= bottom; y++)
+        BMP_point (bmp, x, y, rgb);
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_draw_string
+ * Purpose: Draws text into the image using the glyph rows returned by
+ *          get_font_chars () (FONT_HEIGHT rows per glyph).
+ * Params:  bmp, string - target image and NUL-terminated text.
+ *          x, y  - top-left corner of the first glyph.
+ *          color - pixel value written for every '#' cell of a glyph.
+ * Returns: The x coordinate following the last glyph, or 0 when bmp or
+ *          string is NULL, the string is empty, or the start position is
+ *          to the right of or below the image.
+ * Notes:   A space advances x by 10 pixels. Characters outside the range
+ *          '!' (33) to 'z' are skipped without advancing. Pixels that fall
+ *          outside the image are dropped by BMP_point.
+ *-------------------------------------------------------------------------*/
+int
+BMP_draw_string (BMP *bmp, const char *string, int x, int y, RGB color)
+{
+    const char **chars;
+    const char *s;
+    char ch;
+
+    if (!bmp || !string)
+        return 0;
+    if (x >= bmp->width || y >= bmp->height || !*string)
+        return 0;
+    //----------
+
+    chars = get_font_chars ();
+
+    for (s = string; (ch = *s) != '\0'; s++) {
+        int ix, i;
+        int width = 0;
+
+        if (ch == ' ') {
+            x += 10;
+            continue;
+        }
+        if (ch < ' ' || ch > 'z')
+            continue;
+
+        // The glyph table starts at '!' (33).
+        ix = FONT_HEIGHT * (ch - 33);
+
+        for (i = 0; i < FONT_HEIGHT; i++) {
+            const char *row = chars[ix + i];
+            int j;
+
+            // A missing row contributes neither pixels nor width.
+            if (!row)
+                continue;
+
+            for (j = 0; row[j] != '\0'; j++) {
+                if (row[j] == '#')
+                    BMP_point (bmp, x + j, y + i, color);
+            }
+            if (width < j)
+                width = j;      // j is now the length of this row
+        }
+
+        x += width + 2/* kerning */;
+    }
+
+    return x;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_string_width
+ * Purpose: Measures a string in pixels for the font returned by
+ *          get_font_chars (): 10 per space, and for each character from
+ *          '!' to 'z' the longest of its FONT_HEIGHT rows plus 2 pixels
+ *          of kerning. Other characters count as 0.
+ * Returns: The total width, or 0 for a NULL string.
+ *-------------------------------------------------------------------------*/
+int
+BMP_string_width (const char *string)
+{
+    const char **glyph_rows;
+    const char *p;
+    int total = 0;
+
+    if (string == NULL)
+        return 0;
+    //----------
+
+    glyph_rows = get_font_chars ();
+
+    for (p = string; *p != '\0'; p++) {
+        const char c = *p;
+        int first_row, row, widest;
+
+        if (c == ' ') {
+            total += 10;
+            continue;
+        }
+        if (c < ' ' || c > 'z')
+            continue;
+
+        first_row = FONT_HEIGHT * (c - 33);
+        widest = 0;
+        for (row = 0; row < FONT_HEIGHT; row++) {
+            const char *cells = glyph_rows [first_row + row];
+            int len = cells ? strlen (cells) : 0;
+            if (len > widest)
+                widest = len;
+        }
+
+        total += widest + 2/* kerning */;
+    }
+
+    return total;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_draw_mini_string
+ * Purpose: Draws miniature characters into the image using the glyph rows
+ *          returned by get_minifont_chars () (MINI_HEIGHT rows per glyph).
+ * Params:  bmp, string - target image and NUL-terminated text.
+ *          x, y  - top-left corner of the first glyph.
+ *          color - pixel value written for every '#' cell of a glyph.
+ * Returns: The x coordinate following the last glyph, or 0 when bmp or
+ *          string is NULL, the string is empty, or the start position is
+ *          to the right of or below the image.
+ * Notes:   A space advances x by 5 pixels. Characters outside the range
+ *          '!' (33) to 'z' are skipped without advancing. Pixels that fall
+ *          outside the image are dropped by BMP_point.
+ *-------------------------------------------------------------------------*/
+int
+BMP_draw_mini_string (BMP *bmp, const char *string, int x, int y, RGB color)
+{
+    const char **mini_chars;
+    const char *s;
+    char ch;
+
+    if (!bmp || !string)
+        return 0;
+    if (x >= bmp->width || y >= bmp->height || !*string)
+        return 0;
+    //----------
+
+    mini_chars = get_minifont_chars ();
+
+// Also used by BMP_mini_string_width further down in this file.
+#define MINI_HEIGHT (8)
+    for (s = string; (ch = *s) != '\0'; s++) {
+        int ix, i;
+        int width = 0;
+
+        if (ch == ' ') {
+            x += 5;
+            continue;
+        }
+        if (ch < ' ' || ch > 'z')
+            continue;
+
+        // The glyph table starts at '!' (33).
+        ix = MINI_HEIGHT * (ch - 33);
+
+        for (i = 0; i < MINI_HEIGHT; i++) {
+            const char *row = mini_chars[ix + i];
+            int j;
+
+            // A missing row contributes neither pixels nor width.
+            if (!row)
+                continue;
+
+            for (j = 0; row[j] != '\0'; j++) {
+                if (row[j] == '#')
+                    BMP_point (bmp, x + j, y + i, color);
+            }
+            if (width < j)
+                width = j;      // j is now the length of this row
+        }
+
+        x += width + 1/* kerning */;
+    }
+
+    return x;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_mini_string_width
+ * Purpose: Measures a string in pixels for the font returned by
+ *          get_minifont_chars (): 5 per space, and for each character from
+ *          '!' to 'z' the longest of its MINI_HEIGHT rows plus 1 pixel of
+ *          kerning. Other characters count as 0.
+ * Returns: The total width, or 0 for a NULL string.
+ *-------------------------------------------------------------------------*/
+int
+BMP_mini_string_width (const char *string)
+{
+    const char **glyph_rows;
+    const char *p;
+    int total = 0;
+
+    if (string == NULL)
+        return 0;
+    //----------
+
+    glyph_rows = get_minifont_chars ();
+
+    for (p = string; *p != '\0'; p++) {
+        const char c = *p;
+        int first_row, row, widest;
+
+        if (c == ' ') {
+            total += 5;
+            continue;
+        }
+        if (c < ' ' || c > 'z')
+            continue;
+
+        first_row = MINI_HEIGHT * (c - 33);
+        widest = 0;
+        for (row = 0; row < MINI_HEIGHT; row++) {
+            const char *cells = glyph_rows [first_row + row];
+            int len = cells ? strlen (cells) : 0;
+            if (len > widest)
+                widest = len;
+        }
+
+        total += widest + 1/*kerning*/;
+    }
+
+    return total;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_draw_narrow_numbers
+ * Purpose: Draws digits and decimal points from the narrow_nums table
+ *          (NARROW_HEIGHT rows per glyph) into the image.
+ * Returns: The x coordinate following the last glyph, or 0 when bmp or
+ *          string is NULL, the string is empty, or the start position is
+ *          to the right of or below the image.
+ * Notes:   A space advances x by 3. Any character other than '0'-'9',
+ *          '.' and space is skipped. The advance of a glyph is the length
+ *          of its first row plus 1.
+ *-------------------------------------------------------------------------*/
+int
+BMP_draw_narrow_numbers (BMP *bmp, const char *string, int x, int y, RGB color)
+{
+    const char *p;
+
+    if (!bmp || !string)
+        return 0;
+    if (x >= bmp->width || y >= bmp->height || !*string)
+        return 0;
+    //----------
+
+#define NARROW_HEIGHT (7)
+    for (p = string; *p != '\0'; p++) {
+        const char c = *p;
+        int glyph, first_row, row, advance;
+
+        if (c == ' ') {
+            x += 3;
+            continue;
+        }
+
+        if (c >= '0' && c <= '9')
+            glyph = c - '0';
+        else if (c == '.')
+            glyph = 10;         // the dot follows the ten digits
+        else
+            continue;
+
+        first_row = glyph * NARROW_HEIGHT;
+        advance = strlen (narrow_nums [first_row]);
+
+        for (row = 0; row < NARROW_HEIGHT; row++) {
+            const char *cells = narrow_nums [first_row + row];
+            int col;
+
+            for (col = 0; cells[col] != '\0'; col++) {
+                if (cells[col] == '#')
+                    BMP_point (bmp, x + col, y + row, color);
+            }
+        }
+
+        x += advance + 1;
+    }
+
+    return x;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_getpixel
+ * Purpose: Returns the pixel value stored at (x, y).
+ *          Returns 0 when the image or its pixel buffer is NULL, or when
+ *          (x, y) lies outside the image.
+ *-------------------------------------------------------------------------*/
+RGB
+BMP_getpixel (BMP *bmp, int x, int y)
+{
+    RGB value = 0;
+
+    if (bmp && x >= 0 && y >= 0
+        && x < bmp->width && y < bmp->height
+        && bmp->pixels)
+    {
+        value = bmp->pixels[y*bmp->width + x];
+    }
+    return value;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_put_le32
+ * Purpose: Stores the low 32 bits of a value as 4 little-endian bytes.
+ *-------------------------------------------------------------------------*/
+static void
+BMP_put_le32 (unsigned char *dst, unsigned long value)
+{
+    dst[0] = value & 0xff;
+    dst[1] = (value >> 8) & 0xff;
+    dst[2] = (value >> 16) & 0xff;
+    dst[3] = (value >> 24) & 0xff;
+}
+
+/*---------------------------------------------------------------------------
+ * Name:    BMP_write
+ * Purpose: Writes image to BMP file (uncompressed, 24 bits per pixel).
+ * Params:  bmp - image to write; path - file to create or overwrite.
+ * Returns: 1 on success, 0 when the file cannot be created or written,
+ *          -1 when bmp or path is NULL or the image has no pixels.
+ * Notes:   Each pixel is written as its low three bytes, lowest byte
+ *          first. Rows are padded to a multiple of 4 bytes as the format
+ *          requires; images made by BMP_new need no padding.
+ *-------------------------------------------------------------------------*/
+int
+BMP_write (const BMP* bmp, const char *path)
+{
+#define HDRLEN (54)
+    unsigned char h[HDRLEN];
+    unsigned char *row;
+    unsigned long row_bytes, image_bytes;
+    FILE *f;
+    int i, j;
+    int ok = 1;
+
+    if (!bmp || !path)
+        return -1;
+    if (!bmp->pixels || bmp->width < 1 || bmp->height < 1)
+        return -1;
+    //----------
+
+    // Sizes are computed in unsigned long so that 3*width*height is not
+    // evaluated in int.
+    row_bytes = (3UL * (unsigned long) bmp->width + 3UL) & ~3UL;
+    image_bytes = row_bytes * (unsigned long) bmp->height;
+
+    // One zero-filled row buffer; the padding bytes stay zero.
+    row = calloc (row_bytes, 1);
+    if (!row)
+        return 0;
+
+    //--------------------
+    // Create the file.
+    //
+    f = fopen (path, "wb");
+    if (!f) {
+        free (row);
+        return 0;
+    }
+
+    //--------------------
+    // Prepare header: 14-byte file header followed by the 40-byte
+    // BITMAPINFOHEADER. Fields not set here remain zero.
+    //
+    memset (h, 0, HDRLEN);
+    h[0] = 'B';
+    h[1] = 'M';
+    BMP_put_le32 (h + 2, HDRLEN + image_bytes);         // file size
+    BMP_put_le32 (h + 10, HDRLEN);                      // offset of pixel data
+    BMP_put_le32 (h + 14, 40);                          // info header size
+    BMP_put_le32 (h + 18, (unsigned long) bmp->width);
+    BMP_put_le32 (h + 22, (unsigned long) bmp->height);
+    h[26] = 1;                                          // planes
+    h[28] = 24;                                         // bits per pixel
+    BMP_put_le32 (h + 34, image_bytes);                 // size of pixel data
+    BMP_put_le32 (h + 38, 2835);                        // x pixels/meter
+    BMP_put_le32 (h + 42, 2835);                        // y pixels/meter
+
+    //--------------------
+    // Write header.
+    //
+    if (HDRLEN != fwrite (h, 1, HDRLEN, f))
+        ok = 0;
+
+    //----------------------------------------
+    // Write pixels.
+    // Note that BMP has lower rows first.
+    //
+    for (j = bmp->height - 1; ok && j >= 0; j--) {
+        const RGB *src = bmp->pixels + (size_t) j * (size_t) bmp->width;
+        unsigned char *dst = row;
+
+        for (i = 0; i < bmp->width; i++) {
+            RGB pixel = src[i];
+            *dst++ = pixel & 0xff;
+            *dst++ = (pixel >> 8) & 0xff;
+            *dst++ = (pixel >> 16) & 0xff;
+        }
+        if (row_bytes != fwrite (row, 1, row_bytes, f))
+            ok = 0;
+    }
+
+    // fclose flushes buffered data, so its failure is a write failure.
+    if (fclose (f) != 0)
+        ok = 0;
+    free (row);
+    return ok;
+}
+
+
diff --git a/BMP.h b/BMP.h
new file mode 100755
index 0000000..c3430d6
--- /dev/null
+++ b/BMP.h
@@ -0,0 +1,100 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+#ifndef _BMP_H
+#define _BMP_H
+
+#include <stdint.h>
+
+#define BMPLIB_RELEASE "0.9"
+#define BMPLIB_RELEASE_MAJOR 0
+#define BMPLIB_RELEASE_MINOR 9
+
+typedef uint32_t RGB;
+typedef uint32_t RGBA;
+
+typedef struct {
+ int width, height; // image size in pixels
+ RGB *pixels; // pixel (x,y) is stored at pixels[y*width + x]
+} BMP;
+
+#define FONT_HEIGHT (17)
+#define MINIFONT_HEIGHT (8)
+
+extern BMP* BMP_new (int, int);
+extern void BMP_destroy (BMP*);
+extern void BMP_clear (BMP*, RGB);
+extern int BMP_write (const BMP*, const char *path);
+extern void BMP_point (BMP*, int, int, RGB);
+extern void BMP_line (BMP *, int x0, int y0, int x1, int y1, RGB);
+extern void BMP_line_dashed (BMP *, int x0, int y0, int x1, int y1, RGB);
+extern void BMP_hline (BMP *, int x0, int x1, int y, RGB);
+extern void BMP_vline (BMP *, int x, int y0, int y1, RGB);
+extern void BMP_rect (BMP *, int x, int y, int w, int h, RGB);
+extern void BMP_fillrect (BMP *, int x, int y, int w, int h, RGB);
+extern RGB BMP_getpixel (BMP*, int, int);
+
+extern int BMP_draw_string (BMP *, const char *, int x, int y, RGB);
+extern int BMP_string_width (const char *);
+
+extern int BMP_draw_mini_string (BMP *, const char *, int x, int y, RGB);
+extern int BMP_mini_string_width (const char *);
+
+#define RGB_BLACK (0)
+#define RGB_BLUE (0xff)
+#define RGB_BRASS (0xc3a368)
+#define RGB_BROWN (0x8b4513)
+#define RGB_CADETBLUE (0x5f9ea0)
+#define RGB_CHARTREUSE (0x7fff00)
+#define RGB_CORAL (0xff7f50)
+#define RGB_CYAN (0xffff)
+#define RGB_DARKGREEN (0x6400)
+#define RGB_DARKKHAKI (0xbdb76b)
+#define RGB_DARKOLIVEGREEN (0x556b2f)
+#define RGB_DARKORANGE (0xff8c00)
+#define RGB_DODGERBLUE (0x1e90ff)
+#define RGB_GOLDENROD (0xdaa520)
+#define RGB_GRAY (0xc0c0c0)
+#define RGB_GREEN (0xff00)
+#define RGB_KHAKI (0xf0e68c)
+#define RGB_LEMONYELLOW (0xfde910)
+#define RGB_MAGENTA (0xff00ff)
+#define RGB_MAROON (0x800000)
+#define RGB_NAVYBLUE (0x80)
+#define RGB_ORANGE (0xffa500)
+#define RGB_PINK (0xf77fbe)
+#define RGB_PURPLE (0xa020f0)
+#define RGB_RED (0xff0000)
+#define RGB_ROYALBLUE (0x4169e1)
+#define RGB_SALMON (0xfa8072)
+#define RGB_TURQUOISE (0x40e0d0)
+#define RGB_VIOLET (0xee82ee)
+#define RGB_WHITE (0xffffff)
+#define RGB_YELLOW (0xffff00)
+
+#define RGB_GRAY6 (0x606060)
+#define RGB_GRAY8 (0x808080)
+#define RGB_GRAY10 (0xa0a0a0)
+#define RGB_GRAY12 (0xc0c0c0)
+#define RGB_GRAY14 (0xe0e0e0)
+
+#endif
+
diff --git a/BMPGraphing.c b/BMPGraphing.c
new file mode 100755
index 0000000..61ae0d7
--- /dev/null
+++ b/BMPGraphing.c
@@ -0,0 +1,486 @@
+/*============================================================================
+ BMPGraphing, a library for graphing.
+ Copyright (C) 2005-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *===========================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "BMP.h"
+#include "BMPGraphing.h"
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_draw_labels_log2
+// Purpose: Draw the labels and ticks.
+//          Scans the recorded data for its x and y extents, stores them in
+//          the graph (min_x/max_x as base-2 exponents, min_y/max_y as raw
+//          values) and draws the ticks and labels of both axes.
+// Notes:   X ticks are drawn one per power of two and labelled in B, kB
+//          or MB. Y ticks are labelled in whole GB/s, one per 10000 units
+//          of max_y, thinned out when they would be less than 10 pixels
+//          apart. Without positive x data the x axis is left unlabelled.
+//----------------------------------------------------------------------------
+void
+BMPGraphing_draw_labels_log2 (BMPGraph* graph)
+{
+    if (!graph || !graph->image)
+        return;
+
+    int i;
+    char str [200];
+
+    //----------------------------------------
+    // Horizontal
+    //
+    // Establish min & max x values.
+    //
+    Value min_x = 0x4000000000000000;
+    Value max_x = 0;
+    for (i = 0; i < graph->data_index; i += 2) {
+        if (graph->data[i] == DATUM_X) {
+            Value value = graph->data[i+1];
+            if (value < min_x)
+                min_x = value;
+            if (value > max_x)
+                max_x = value;
+        }
+    }
+
+    // log2 is only defined for positive values, so skip the x axis when
+    // there is no usable x datum instead of converting -inf/NaN.
+    if (min_x >= 1 && max_x >= min_x) {
+        graph->min_x = (long long) log2 (min_x);
+        graph->max_x = (long long) ceil (log2 (max_x));
+
+        // A single power-of-two x value gives an empty range; widen it
+        // so the divisions here and in BMPGraphing_plot_log2 are defined.
+        if (graph->max_x <= graph->min_x)
+            graph->max_x = graph->min_x + 1;
+
+        const Value x_range = graph->max_x - graph->min_x;
+        Value e;
+        for (e = graph->min_x; e <= graph->max_x; e++) {
+            int x = graph->left_margin +
+                ((e - graph->min_x) * graph->x_span) / x_range;
+            int y = graph->height - graph->margin + 10;
+
+            // 64-bit shift: "1 << e" is undefined for e >= 31.
+            unsigned long long size = 1ULL << e;
+            if (size < 1536)
+                snprintf (str, sizeof str, "%llu B", size);
+            else if (size < (1 << 20))
+                snprintf (str, sizeof str, "%llu kB", size >> 10);
+            else {
+                // Bits 18-19 give the quarter-megabyte fraction.
+                static const char *const quarter[] = { "", ".25", ".5", ".75" };
+                snprintf (str, sizeof str, "%llu%s MB",
+                    size >> 20, quarter[(size >> 18) & 3]);
+            }
+
+            BMP_vline (graph->image, x, y, y - 10, RGB_BLACK);
+            BMP_draw_mini_string (graph->image, str, x - 10, y + 8, RGB_BLACK);
+        }
+    }
+
+    //----------------------------------------
+    // Vertical
+    //
+    // Establish min & max y values.
+    //
+    Value min_y = 0x4000000000000000;
+    Value max_y = 0;
+    for (i = 0; i < graph->data_index; i += 2) {
+        if (graph->data[i] == DATUM_Y) {
+            Value value = graph->data[i+1];
+            if (value < min_y)
+                min_y = value;
+            if (value > max_y)
+                max_y = value;
+        }
+    }
+    graph->min_y = min_y;
+    graph->max_y = max_y;
+
+    int font_height = 10;
+    int max_labels = graph->y_span / font_height;
+    int preferred_n_labels = graph->max_y/10000;
+    int actual_n_labels = preferred_n_labels;
+    float multiplier = 1;
+    if (preferred_n_labels >= max_labels) {
+        actual_n_labels = max_labels;
+        if (actual_n_labels > 0)
+            multiplier = preferred_n_labels / (float) actual_n_labels;
+    }
+
+    for (i = 0; i <= actual_n_labels; i++) {
+        int x = graph->left_margin - 10;
+
+        // With zero labels only the baseline tick is drawn; computing
+        // its position must not divide by zero.
+        float rise = 0;
+        if (actual_n_labels > 0)
+            rise = (i * graph->y_span) / (float) actual_n_labels;
+        int y = graph->height - graph->margin - rise;
+
+        BMP_hline (graph->image, x, x+10, y, RGB_BLACK);
+
+        int value = (int) (i * multiplier);
+        snprintf (str, sizeof str, "%d GB/s", value);
+        BMP_draw_mini_string (graph->image, str, x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK);
+    }
+}
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_new
+// Purpose: Creates a graph with a white canvas and its two axes drawn.
+// Params:  w, h - canvas size in pixels; when either is <= 0 the size
+//                 1920x1080 is used instead.
+//          x_axis_mode - MODE_X_AXIS_LINEAR or MODE_X_AXIS_LOG2.
+// Returns: A new graph (release it with BMPGraphing_destroy), or NULL when
+//          the mode is invalid or memory allocation fails.
+//----------------------------------------------------------------------------
+BMPGraph *
+BMPGraphing_new (int w, int h, int x_axis_mode)
+{
+    if (x_axis_mode != MODE_X_AXIS_LINEAR && x_axis_mode != MODE_X_AXIS_LOG2)
+        return NULL;
+
+    // calloc zero-fills every field, including the data array.
+    BMPGraph *graph = calloc (1, sizeof *graph);
+    if (!graph)
+        return NULL;
+
+    graph->x_axis_mode = x_axis_mode;
+
+    if (w <= 0 || h <= 0) {
+        w = 1920;
+        h = 1080;
+    }
+
+    graph->width = w;
+    graph->height = h;
+    graph->margin = 40;
+    graph->left_margin = 80;
+
+    // Without a canvas the graph is unusable, and BMP_clear reads the
+    // image size, so fail here rather than continue with a NULL image.
+    graph->image = BMP_new (w, h);
+    if (!graph->image) {
+        free (graph);
+        return NULL;
+    }
+
+    BMP_clear (graph->image, RGB_WHITE);
+
+    BMP_hline (graph->image, graph->left_margin, graph->width - graph->margin, graph->height - graph->margin, RGB_BLACK);
+    BMP_vline (graph->image, graph->left_margin, graph->margin, graph->height - graph->margin, RGB_BLACK);
+
+    graph->x_span = graph->width - (graph->margin + graph->left_margin);
+    graph->y_span = graph->height - 2 * graph->margin;
+
+    graph->legend_y = graph->margin;
+
+    return graph;
+}
+
+// Replaces the graph's title with a private copy of the given text and
+// draws it above the plot area. Does nothing when graph or title is NULL.
+void BMPGraphing_set_title (BMPGraph* graph, const char *title)
+{
+    if (graph == NULL || title == NULL)
+        return;
+
+    char *previous = graph->title;
+    if (previous)
+        free (previous);
+    graph->title = strdup (title);
+
+    const int title_y = graph->margin/2;
+    BMP_draw_string (graph->image, graph->title, graph->left_margin, title_y, RGB_BLACK);
+}
+
+// Starts a new data line: draws its legend entry in the line's color,
+// moves the legend position down by 17 pixels, resets the drawing state
+// and records the color in the data list (unless the list is nearly full).
+void
+BMPGraphing_new_line (BMPGraph *graph, char *str, RGB color)
+{
+    if (graph == NULL || graph->image == NULL)
+        return;
+
+    const int legend_x = graph->width - graph->margin - 320;
+    BMP_draw_string (graph->image, str, legend_x, graph->legend_y, 0xffffff & color);
+    graph->legend_y += 17;
+
+    // No previous point yet for this line.
+    graph->last_x = -1;
+    graph->last_y = -1;
+    graph->fg = 0;
+
+    if (graph->data_index < MAX_GRAPH_DATA-2) {
+        graph->data [graph->data_index] = DATUM_COLOR;
+        graph->data [graph->data_index + 1] = color;
+        graph->data_index += 2;
+    }
+    // else: error ("Too many graph data.");
+}
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_add_point
+// Purpose: Appends an (x, y) pair to the graph's data list as two tagged
+//          entries, DATUM_X then DATUM_Y. The point is dropped silently
+//          when the graph is invalid or the list is nearly full.
+//----------------------------------------------------------------------------
+void
+BMPGraphing_add_point (BMPGraph *graph, Value x, Value y)
+{
+    if (graph == NULL || graph->image == NULL)
+        return;
+
+    if (graph->data_index < MAX_GRAPH_DATA-4) {
+        Value *slot = &graph->data [graph->data_index];
+
+        slot[0] = DATUM_X;
+        slot[1] = x;
+        slot[2] = DATUM_Y;
+        slot[3] = y;
+        graph->data_index += 4;
+    }
+    // else: error ("Too many graph data.");
+}
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_plot_log2
+// Purpose: Plots a point on the current graph.
+//          The point is joined to the previously plotted point of the same
+//          line (if any) and then becomes the new previous point.
+// Params:  x - horizontal value, placed on a base-2 logarithmic axis
+//              between graph->min_x and graph->max_x (exponents).
+//          y - vertical value, scaled linearly against graph->max_y.
+// Notes:   Points that cannot be placed (x < 1, empty x range, or
+//          max_y == 0) are ignored.
+//----------------------------------------------------------------------------
+
+void
+BMPGraphing_plot_log2 (BMPGraph *graph, Value x, Value y)
+{
+    if (!graph || !graph->image)
+        return;
+
+    // Guard every divisor and the log2 argument: an integer division by
+    // zero traps, and converting inf/NaN to int is undefined.
+    const Value x_range = graph->max_x - graph->min_x;
+    if (x < 1 || x_range == 0 || graph->max_y == 0)
+        return;
+
+    //----------------------------------------
+    // Plot the point. The x axis is
+    // logarithmic, base 2.
+    //
+    double tmp = log2 (x);
+    tmp -= (double) graph->min_x;
+    tmp *= (double) graph->x_span;
+    tmp /= (double) x_range;
+
+    int x2 = graph->left_margin + (int) tmp;
+    int y2 = graph->height - graph->margin - (y * graph->y_span) / graph->max_y;
+
+    if (graph->last_x != -1 && graph->last_y != -1) {
+        if (graph->fg & DASHED)
+            BMP_line_dashed (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg & 0xffffff);
+        else
+            BMP_line (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg);
+    }
+
+    graph->last_x = x2;
+    graph->last_y = y2;
+}
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_plot_linear
+// Purpose: Plots one point for a graph in linear mode and joins it to the
+//          previously plotted point of the same line, if there is one.
+// Params:  x - horizontal value; it is placed at 10 + log2 (x) on an axis
+//              running from XVALUE_MIN to XVALUE_MAX.
+//          y - vertical value, scaled linearly against max_y.
+//          max_y - value that corresponds to the top of the plot area.
+//----------------------------------------------------------------------------
+
+void
+BMPGraphing_plot_linear (BMPGraph *graph, Value x, Value y, Value max_y)
+{
+    if (graph == NULL || graph->image == NULL)
+        return;
+
+    // Horizontal position along the axis, in pixels.
+    const double axis_range = (double) (XVALUE_MAX - XVALUE_MIN);
+    double offset = 10. + log2 (x);
+    offset = (offset - (double) XVALUE_MIN) * (double) graph->x_span / axis_range;
+
+    const int px = graph->left_margin + (int) offset;
+    const int py = graph->height - graph->margin - (y * graph->y_span) / max_y;
+
+    const int have_previous = (graph->last_x != -1 && graph->last_y != -1);
+    if (have_previous) {
+        if (graph->fg & DASHED)
+            BMP_line_dashed (graph->image, graph->last_x, graph->last_y, px, py, graph->fg & 0xffffff);
+        else
+            BMP_line (graph->image, graph->last_x, graph->last_y, px, py, graph->fg);
+    }
+
+    graph->last_x = px;
+    graph->last_y = py;
+}
+
+//----------------------------------------------------------------------------
+// Name:    BMPGraphing_make_log2
+// Purpose: Plots all lines.
+//          Draws the axis labels, then replays the recorded data list:
+//          a DATUM_COLOR entry starts a new line, and every completed
+//          (x, y) pair is handed to BMPGraphing_plot_log2.
+//----------------------------------------------------------------------------
+
+static void
+BMPGraphing_make_log2 (BMPGraph *graph)
+{
+    if (!graph || !graph->image)
+        return;
+
+    BMPGraphing_draw_labels_log2 (graph);
+
+    //----------------------------------------
+    // OK, now draw the lines.
+    //
+    // x and y keep the full Value width of the stored data; holding them
+    // in an int would truncate large values. -1 means "not seen yet".
+    int i;
+    Value x = -1, y = -1;
+    for (i = 0; i < graph->data_index; i += 2)
+    {
+        Value type = graph->data[i];
+        Value value = graph->data[i+1];
+
+        switch (type) {
+        case DATUM_Y: y = value; break;
+        case DATUM_X: x = value; break;
+        case DATUM_COLOR:
+            graph->fg = (unsigned long) value;
+            graph->last_x = -1;
+            graph->last_y = -1;
+            break;
+        }
+
+        if (x != -1 && y != -1) {
+            BMPGraphing_plot_log2 (graph, x, y);
+            x = y = -1;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------
+// Name: BMPGraphing_make_linear
+// Purpose: Plots all lines for the network test graph: scales the y axis to the largest y datum, draws both axes' ticks and labels, then replays the data list.
+//----------------------------------------------------------------------------
+
+static void
+BMPGraphing_make_linear (BMPGraph *graph)
+{
+ if (!graph || !graph->image)
+ return;
+
+ int i;
+
+ // No data
+ if (!graph->data_index)
+ return;
+
+ //----------------------------------------
+ // Get the maximum bandwidth in order to
+ // properly scale the graph vertically.
+ //
+ int max_y = 0;
+ for (i = 0; i < graph->data_index; i += 2) {
+ if (graph->data[i] == DATUM_Y) {
+ int y = graph->data [i+1];
+ if (y > max_y)
+ max_y = y;
+ }
+ }
+
+ int range = max_y > 10000 ? 2 : (max_y > 1000 ? 1 : 0); // magnitude class of max_y
+ int y_spacing = 1;
+ switch (range) {
+ case 2:
+ // Round up to the next 100.00 MB/sec. (=10000).
+ y_spacing = 10000;
+ break;
+ case 1:
+ // Round up to the next 10.00 MB/sec.
+ y_spacing = 1000;
+ break;
+ case 0:
+ // Round up to the next 1.00 MB/sec.
+ y_spacing = 100;
+ break;
+ }
+ max_y /= y_spacing; // these three steps move max_y to the next multiple of y_spacing above it
+ max_y *= y_spacing;
+ max_y += y_spacing;
+
+ //----------------------------------------
+ // Draw the axes, ticks & labels.
+ //
+ // X axis:
+ if (XVALUE_MIN < 10)
+ return; // error ("Minimum y is too small.");
+
+ for (i = XVALUE_MIN; i <= XVALUE_MAX; i++) { // one tick per power of two
+ char str[200];
+ unsigned long y2 = 1 << (i-10); // XX XVALUE_MIN>=10
+ if (y2 < 1024)
+ snprintf (str, 199, "%u kB", (unsigned int) y2);
+ else
+ snprintf (str, 199, "%lu MB", (unsigned long) (y2 >> 10));
+
+ int x = graph->left_margin + ((i - XVALUE_MIN) * graph->x_span) / (XVALUE_MAX - XVALUE_MIN);
+ int y = graph->height - graph->margin + 10;
+
+ BMP_vline (graph->image, x, y, y-10, RGB_BLACK);
+ BMP_draw_mini_string (graph->image, str, x - 10, y+8, RGB_BLACK);
+ }
+
+ //----------
+ // Y axis:
+ // Decide what the tick spacing will be.
+ for (i = 0; i <= max_y; i += y_spacing) {
+ char str[200];
+ unsigned long whole = i / 100; // label shows the value divided by 100 with two decimals
+ unsigned long frac = i % 100;
+ snprintf (str, 199, "%lu.%02lu MB/s", whole, frac);
+
+ int x = graph->left_margin - 10;
+ int y = graph->height - graph->margin - (i * graph->y_span) / max_y;
+
+ BMP_hline (graph->image, x, x+10, y, RGB_BLACK);
+ BMP_draw_mini_string (graph->image, str, x - 60, y - MINIFONT_HEIGHT/2, RGB_BLACK);
+ }
+
+ //----------------------------------------
+ // Draw the data lines.
+ //
+ int x = -1, y = -1; // -1 means "not seen yet"
+ graph->last_x = -1;
+ graph->last_y = -1;
+ for (i = 0; i < graph->data_index; i += 2)
+ {
+ int type = graph->data[i];
+ long value = graph->data[i+1];
+
+ switch (type) {
+ case DATUM_Y: y = value; break;
+ case DATUM_X: x = value; break;
+ case DATUM_COLOR: // a color entry starts a new line
+ graph->fg = (unsigned long) value;
+ graph->last_x = -1;
+ graph->last_y = -1;
+ break;
+ }
+
+ if (x != -1 && y != -1) { // a complete (x, y) pair has been read
+ BMPGraphing_plot_linear (graph, x, y, max_y);
+ x = y = -1;
+ }
+ }
+}
+
+// Renders the graph by dispatching on its x-axis mode. A NULL graph is
+// ignored; an unknown mode is reported on stderr.
+void
+BMPGraphing_make (BMPGraph *graph)
+{
+    if (graph == NULL)
+        return; // XX silent error
+
+    const int mode = graph->x_axis_mode;
+
+    if (mode == MODE_X_AXIS_LOG2)
+        BMPGraphing_make_log2 (graph);
+    else if (mode == MODE_X_AXIS_LINEAR)
+        BMPGraphing_make_linear (graph);
+    else
+        fprintf (stderr, "Invalid graph mode %d.\n", graph->x_axis_mode);
+}
+
+// Releases a graph together with its title and its image.
+// Passing NULL is allowed and does nothing.
+void
+BMPGraphing_destroy (BMPGraph *graph)
+{
+    if (graph == NULL)
+        return;
+
+    if (graph->title != NULL)
+        free (graph->title);
+    graph->title = NULL;
+
+    if (graph->image != NULL)
+        BMP_destroy (graph->image);
+    graph->image = NULL;
+
+    free (graph);
+}
diff --git a/BMPGraphing.h b/BMPGraphing.h
new file mode 100755
index 0000000..4f13972
--- /dev/null
+++ b/BMPGraphing.h
@@ -0,0 +1,88 @@
+/*============================================================================
+ BMPGraphing, a library for graphing.
+ Copyright (C) 2005-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *===========================================================================*/
+
+#ifndef _SUPERSIMPLEGRAPHING_H
+#define _SUPERSIMPLEGRAPHING_H
+
+#include <stdbool.h>
+
+#define SSG_RELEASE "0.2"
+
+#define XVALUE_MIN (15) // low end of the X tick range; tick X positions are scaled over XVALUE_MAX - XVALUE_MIN
+#define XVALUE_MAX (28) // high end of the X tick range
+
+enum { // Tags for BMPGraph.data entries; each entry is a (tag, value) pair.
+ DATUM_X=0, // value is the X of the next point to plot
+ DATUM_Y=1, // value is the Y of the next point to plot
+ DATUM_COLOR=2, // value becomes the graph's fg color; last_x/last_y are reset to -1
+};
+
+typedef long Coordinate; // type of the layout fields of BMPGraph (width, margins, spans, ...)
+typedef long long Value; // type of the recorded data entries and axis extremes of BMPGraph
+
+enum { // X-axis modes stored in BMPGraph.x_axis_mode; BMPGraphing_make() dispatches on them.
+ MODE_X_AXIS_LINEAR = 0, // rendered by BMPGraphing_make_linear()
+ MODE_X_AXIS_LOG2 = 1, // rendered by BMPGraphing_make_log2()
+};
+
+//---------------
+// Graphing data: the image, layout values and recorded data entries of one graph.
+// NOTE(review): BMP and RGB are not declared in this header; presumably BMP.h must be included first -- confirm.
+typedef struct {
+ BMP *image; // image the graph is drawn into; released with BMP_destroy() in BMPGraphing_destroy()
+ char *title; // free()d in BMPGraphing_destroy()
+
+ unsigned char x_axis_mode; // MODE_X_AXIS_LINEAR or MODE_X_AXIS_LOG2; checked in BMPGraphing_make()
+
+ Coordinate width; // presumably the image width in pixels -- confirm in BMPGraphing_new()
+ Coordinate height; // axis ticks and labels are positioned relative to height - margin
+ Coordinate left_margin; // X where the plot area starts; Y-axis ticks are drawn just left of it
+ Coordinate margin; // subtracted from height to locate the X axis
+ Coordinate last_x; // set to -1 before the data lines are drawn and on each DATUM_COLOR entry
+ Coordinate last_y; // reset together with last_x
+ Coordinate x_span; // pixel extent the X value range is scaled into
+ Coordinate y_span; // pixel extent the Y value range is scaled into
+ Coordinate legend_y; // presumably the Y position of the next legend entry -- confirm
+
+ RGB fg; // current line color; assigned from each DATUM_COLOR entry
+#define MAX_GRAPH_DATA 50000 // capacity of data[], counted in Value slots (two per entry)
+ Value data [MAX_GRAPH_DATA]; // flat sequence of (DATUM_* tag, value) pairs
+ int data_index; // number of Value slots of data[] in use; readers step through it two at a time
+#define DASHED 0x1000000 // dashed line flag
+
+ Value max_y; // presumably the extremes of the recorded points (all four fields) -- confirm
+ Value min_y;
+ Value min_x;
+ Value max_x;
+} BMPGraph;
+
+extern void BMPGraphing_set_title (BMPGraph*, const char *);
+extern void BMPGraphing_draw_labels_log2 (BMPGraph*);
+extern BMPGraph *BMPGraphing_new (int w, int h, int x_axis_mode); // x_axis_mode: presumably one of MODE_X_AXIS_* -- confirm
+extern void BMPGraphing_new_line (BMPGraph *, char *str, RGB color);
+extern void BMPGraphing_add_point (BMPGraph *, Value x, Value y);
+extern void BMPGraphing_plot_log2 (BMPGraph *, Value x, Value y);
+extern void BMPGraphing_plot_linear (BMPGraph *, Value x, Value y, Value max_amt); // the linear renderer passes its Y-axis maximum as max_amt
+extern void BMPGraphing_make (BMPGraph*); // renders per x_axis_mode; NULL is ignored, an invalid mode is reported on stderr
+extern BMP *BMPGraphing_get_graph (BMPGraph*);
+extern void BMPGraphing_destroy (BMPGraph*); // frees the title, the image and the graph itself; NULL is ignored
+
+#endif
diff --git a/COPYING.txt b/COPYING.txt
new file mode 100755
index 0000000..3912109
--- /dev/null
+++ b/COPYING.txt
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
new file mode 100755
index 0000000..913d023
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,87 @@
+#============================================================================
+# bandwidth, a benchmark to estimate memory transfer bandwidth.
+# Copyright (C) 2005-2014 by Zack T Smith.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The author may be reached at veritas@comcast.net.
+#============================================================================
+
+CFLAGS= -O6
+CFLAGS= -g
+CC=gcc
+LD=gcc
+SRC=main.c
+OBJ=main.o
+LIB=
+AS=nasm
+
+message:
+ @echo ""
+ @echo "To compile for x86 Linux: make bandwidth32"
+ @echo "To compile for x86_64 Linux: make bandwidth64"
+ @echo "To compile for x86 Mac OS/X: make bandwidth-mac32"
+ @echo "To compile for x86_64 Mac OS/X: make bandwidth-mac64"
+ @echo "To compile for x86 Win32/Cygwin: make bandwidth-win32"
+ @echo "Note! For the Mac you will need to install the latest NASM; Apple's is insufficient."
+ @echo ""
+
+bandwidth64: main.c routines64.asm BMP64.a BMPGraphing64.a
+ ${AS} -f elf64 routines64.asm -o routines64.o
+ ${CC} ${CFLAGS} -m64 -c ${SRC}
+ ${LD} -m64 routines64.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth64
+
+bandwidth32: main.c routines32.asm BMP32.a BMPGraphing32.a
+ ${AS} -f elf routines32.asm -o routines32.o
+ ${CC} ${CFLAGS} -m32 -c ${SRC}
+ ${LD} -m32 routines32.o ${OBJ} BMP32.a -lm BMPGraphing32.a -o bandwidth32
+
+bandwidth-mac64: main.c routines64.asm BMPGraphing64.a BMP64.a
+ ${AS} -f macho64 routines64.asm -o routines64.o
+ ${CC} ${CFLAGS} -m64 -c ${SRC}
+ ${LD} -m64 -lm BMPGraphing64.a BMP64.a routines64.o ${OBJ} ${LIB} -o bandwidth-mac64
+
+bandwidth-mac32: main.c routines32.asm BMP32.a BMPGraphing32.a
+ ${AS} -f macho routines32.asm -o routines32.o
+ ${CC} ${CFLAGS} -m32 -c ${SRC}
+ ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-mac32
+
+bandwidth-win32: main.c routines32.asm BMP32.a BMPGraphing32.a
+ ${AS} -f win32 routines32.asm -o routines32.o
+ ${CC} ${CFLAGS} -m32 -c ${SRC} -Wall -O6 -D__WIN32__ -DWINVER=0x0600
+ ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-win32
+
+BMPGraphing64.a: BMPGraphing.c
+ ${CC} ${CFLAGS} -m64 -c BMPGraphing.c
+ ar rvs BMPGraphing64.a BMPGraphing.o
+
+BMPGraphing32.a: BMPGraphing.c
+ ${CC} ${CFLAGS} -m32 -c BMPGraphing.c
+ ar rvs BMPGraphing32.a BMPGraphing.o
+
+# The BMP archives bundle BMP.o, font.o and minifont.o, so all three
+# sources must be prerequisites; with only BMP.c listed, edits to the
+# font sources never caused the archive to be rebuilt.
+BMP64.a: BMP.c font.c minifont.c
+	${CC} ${CFLAGS} -m64 -c BMP.c font.c minifont.c
+	ar rvs BMP64.a BMP.o font.o minifont.o
+
+BMP32.a: BMP.c font.c minifont.c
+	${CC} ${CFLAGS} -m32 -c BMP.c font.c minifont.c
+	ar rvs BMP32.a BMP.o font.o minifont.o
+
+clean:
+ rm -f main.o bandwidth bandwidth32 bandwidth64 routines32.o routines64.o
+ rm -f bandwidth-win32.exe bandwidth.bmp bandwidth-mac32 bandwidth-mac64
+ rm -f BMP.o BMP32.a BMP64.a BMPGraphing.o BMPGraphing32.a BMPGraphing64.a
+ rm -f font.o minifont.o network_bandwidth.bmp
+
diff --git a/README.txt b/README.txt
new file mode 100755
index 0000000..7189a27
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,167 @@
+
+This is the README file for my program, "bandwidth".
+
+Bandwidth is a benchmark that attempts to measure
+memory bandwidth. In December 2010 (and as of
+release 0.24), I extended 'bandwidth' to measure
+network bandwidth as well.
+
+Bandwidth is useful because both memory bandwidth
+and network bandwidth need to be measured to
+give you a clear idea of what your computer(s) can do.
+Merely relying on specs does not give a full picture
+and indeed specs can be misleading.
+
+--------------------------------------------------
+MEMORY BANDWIDTH
+
+My program bandwidth performs sequential and random
+reads and writes of varying sizes. This permits
+you to infer from the graph how each type of memory
+is performing. So for instance when bandwidth
+writes a 256-byte chunk, you know that because
+caches are normally write-back, this chunk
+will reside entirely in the L1 cache. Whereas
+a 512 kB chunk will mainly reside in L2.
+
+You could run a non-artificial benchmark and
+observe that a general performance number is lower
+on one machine or higher on another, but that may
+conceal the cause.
+
+So the purpose of this program is to help you
+pinpoint the cause of a performance problem,
+or to affirm a general impression about a memory-
+intensive program.
+
+It also tells you the best-case scenario e.g.
+the maximum bandwidth achieved using sequential,
+128-bit memory accesses.
+
+Release 1.1:
+ - Added larger font.
+Release 1.0:
+ - Moved graphing into BMPGraphing module.
+ - Finally added LODS benchmarking, which
+ proves how badly lodsb/lodsw/lodsd/lodsq
+ perform.
+ - Added switches --faster and --fastest.
+Release 0.32:
+ - Improved AVX support.
+Release 0.31:
+ - Adds cache detection for Intel 32-bit CPUs
+ - Adds a little AVX support.
+ - Fixes vector-to/from-main transfer bugs.
+Release 0.30 adds cache detection for Intel 64-bit CPUs.
+Release 0.29 improved graph granularity with more
+ 128-byte tests and removes ARM support.
+Release 0.28 added a proper test of CPU features e.g. SSE 4.1.
+Release 0.27 added finer-granularity 128-byte tests.
+Release 0.26 fixed an issue with AMD processors.
+Release 0.25 made network bandwidth bidirectional.
+Release 0.24 added network bandwidth testing.
+
+Release 0.23 added:
+ - Mac OS/X 64-bit support.
+ - Vector-to-vector register transfer test.
+ - Main register to/from vector register transfer test.
+ - Main register byte/word/dword/qword to/from
+ vector register test (pinsr*, pextr* instructions).
+ - Memory copy test using SSE2.
+ - Automatic checks under Linux for SSE2 & SSE4.
+
+Release 0.22 added:
+ - Register-to-register transfer test.
+ - Register-to/from-stack transfer tests.
+
+Release 0.21 added:
+ - Standardized memory chunks to always be
+ a multiple of 256-byte mini-chunks.
+ - Random memory accesses, in which each
+ 256-byte mini-chunk accessed is accessed
+ in a random order, but also, inside each
+ mini-chunk the 32/64/128 data are accessed
+ pseudo-randomly as well.
+ - Now 'bandwidth' includes chunk sizes that
+ are not powers of 2, which increases
+ data points around the key chunk sizes
+ corresponding to common L1 and L2 cache
+ sizes.
+ - Command-line options:
+ --fast for 0.25 seconds per test.
+ --slow for 20 seconds per test.
+ --title for adding a graph title.
+
+Release 0.20 added graphing, with the graph
+stored in a BMP image file. It also adds the
+--slow option for more precise runs.
+
+Release 0.19 added a second 128-bit SSE writer
+routine that bypasses the caches, in addition
+to the one that doesn't.
+
+Release 0.18 was my Grand Unified bandwidth
+benchmark that brought together support for
+four operating systems:
+ - Linux
+ - Windows Mobile
+ - 32-bit Windows
+ - Mac OS/X 64-bit
+and two processor architectures:
+ - x86
+ - Intel64
+I've written custom assembly routines for
+each architecture.
+
+Total run time for the default speed, which
+has 5 seconds per test, is about 35 minutes.
+
+--------------------------------------------------
+NETWORK BANDWIDTH (beginning with release 0.24)
+
+In mid-December 2010, I extended bandwidth to measure
+network bandwidth, which is useful for testing
+your home or workplace network setup, and in theory
+could be used to test machines across the Internet.
+
+Release 0.25 adds:
+ - Bidirectional network bandwidth testing.
+ - Specifiable port# (default is 49000).
+
+In the graph:
+ - Sent data appears as a solid line.
+ - Received data appears as a dashed line.
+
+The network test is pretty simple. It sends chunks
+of data of varying sizes to whatever computers
+(nodes) that you specify. Each of those must be
+running 'bandwidth' in transponder mode.
+
+The chunks of data range from 32 kB up to 32 MB.
+These are actually sent as a stream of 1 or more
+32 kB sub-chunks.
+
+Sample output:
+ output/Network-Linux2.6-Celeron-2.8GHz-32bit-loopback.bmp
+ output/Network-MacOSX32-Corei5-2.4GHz-64bit-loopback.bmp
+ output/Network-Mac64-Linux32.bmp
+
+How to start a transponder:
+ ./bandwidth-mac64 --transponder
+
+Example invocation of the test leader:
+ ./bandwidth64 --network 192.168.1.104
+
+I've tested network mode on:
+ Linux 32-bit
+ Mac OS/X 32- and 64-bit
+ Win/Cygwin 32-bit.
+
+--------------------------------------------------
+This program is provided without any warranty
+and AS-IS. See the file COPYING for details.
+
+Zack Smith
+1@zsmith.co
+March 2013
+
diff --git a/defs.h b/defs.h
new file mode 100755
index 0000000..176dbd1
--- /dev/null
+++ b/defs.h
@@ -0,0 +1,147 @@
+/*============================================================================
+ bandwidth, a benchmark to estimate memory transfer bandwidth.
+ Copyright (C) 2005-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at 1@zsmith.co.
+ *===========================================================================*/
+
+//---------------------------------------------------------------------------
+// Change log
+// 0.18 Grand unified version supports x86/intel64/arm, linux/win32/winmo.
+// 0.19 Now have 128-bit writer that goes to cache AND one that bypasses.
+// 0.20 Added my bmplib and graphing of output. Also added --slow option.
+// 0.21 Adds random testing. Min chunk size = 256 B. Allows non-2^n chunks.
+// 0.22 Adds register-to-register and register-to/from-stack transfers.
+// 0.23 Adds vector-to-vector and register-to-vector transfers, & Mac support.
+// 0.24 Adds network bandwidth tests from this PC to specified others.
+// 0.25 Made network tests bidirectional to test asymmetric networks.
+// 0.26 Fixes to prevent certain vector instructions being used w/AMD chips.
+// 0.27 Added 128-byte tests for greater precision.
+// 0.28 Added use of CPUID.
+// 0.29 Added more 128-byte tests.
+// 0.30 Adds cache identification for Intel CPUs in 64-bit mode.
+// 0.31 Adds cache identification for Intel CPUs in 32-bit mode.
+// 0.32 Added AVX support.
+// 1.0 Moved graphing logic into BMPGraphing. Added LODS support.
+// 1.1 Switched to larger font in graphing module.
+//---------------------------------------------------------------------------
+
+#ifndef _DEFS_H
+#define _DEFS_H
+
+#define RELEASE "1.1"
+
+#ifndef bool
+typedef char bool;
+enum { true = 1, false = 0 };
+#endif
+
+#define NETWORK_DEFAULT_PORTNUM (49000)
+#define NETSIZE_MIN (15)
+#define NETSIZE_MAX (28)
+#define NETWORK_CHUNK_SIZE (1<<NETSIZE_MIN)
+
+#define DOING_LODS // lodsq and lodsd
+
+extern int Reader (void *ptr, unsigned long size, unsigned long loops);
+
+extern int ReaderLODSQ (void *ptr, unsigned long size, unsigned long loops);
+extern int ReaderLODSD (void *ptr, unsigned long size, unsigned long loops);
+extern int ReaderLODSW (void *ptr, unsigned long size, unsigned long loops);
+extern int ReaderLODSB (void *ptr, unsigned long size, unsigned long loops);
+
+extern int Reader_128bytes (void *ptr, unsigned long size, unsigned long loops);
+extern int RandomReader (void *ptr, unsigned long n_chunks, unsigned long loops);
+
+extern int Writer (void *ptr, unsigned long size, unsigned long loops, unsigned long value);
+extern int Writer_128bytes (void *ptr, unsigned long size, unsigned long loops, unsigned long value);
+extern int RandomWriter (void *ptr, unsigned long size, unsigned long loops, unsigned long value);
+
+extern int RegisterToRegister (unsigned long);
+
+extern int StackReader (unsigned long);
+extern int StackWriter (unsigned long);
+
+extern int RegisterToVector (unsigned long); // SSE2
+extern int Register8ToVector (unsigned long); // SSE2
+extern int Register16ToVector (unsigned long); // SSE2
+extern int Register32ToVector (unsigned long); // SSE2
+extern int Register64ToVector (unsigned long); // SSE2
+
+extern int VectorToVector (unsigned long); // SSE2
+
+extern int VectorToVectorAVX (unsigned long);
+
+extern int VectorToRegister (unsigned long); // SSE2
+extern int Vector8ToRegister (unsigned long); // SSE2
+extern int Vector16ToRegister (unsigned long); // SSE2
+extern int Vector32ToRegister (unsigned long); // SSE2
+extern int Vector64ToRegister (unsigned long); // SSE2
+
+extern int Copy (void*, void*, unsigned long, unsigned long);
+extern int CopySSE (void*, void*, unsigned long, unsigned long);
+extern int CopyAVX (void*, void*, unsigned long, unsigned long);
+extern int CopySSE_128bytes (void*, void*, unsigned long, unsigned long);
+
+extern int ReaderAVX (void *ptr, unsigned long, unsigned long);
+extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long);
+extern int ReaderSSE2_bypass (void *ptr, unsigned long, unsigned long);
+extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long);
+extern int RandomReaderSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long);
+
+extern int WriterAVX (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long);
+
+extern int ReaderSSE2_128bytes(void *ptr, unsigned long, unsigned long);
+extern int WriterSSE2_128bytes(void *ptr, unsigned long, unsigned long, unsigned long);
+
+extern int ReaderSSE2_128bytes_bypass (void *ptr, unsigned long, unsigned long);
+extern int WriterSSE2_128bytes_bypass (void *ptr, unsigned long, unsigned long, unsigned long);
+
+extern int WriterAVX_bypass (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int WriterSSE2_bypass (void *ptr, unsigned long, unsigned long, unsigned long);
+extern int RandomWriterSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long, unsigned long);
+
+extern void get_cpuid_family (char *family_return);
+extern void get_cpuid_cache_info (uint32_t *array, int index);
+extern unsigned get_cpuid1_ecx (void);	// (void) makes these real prototypes: calls passing arguments are rejected
+extern unsigned get_cpuid1_edx (void);
+extern unsigned get_cpuid7_ebx (void);
+extern unsigned get_cpuid_80000001_ecx (void);
+extern unsigned get_cpuid_80000001_edx (void);
+
+#define CPUID_EDX_MMX (1<<23)
+#define CPUID_EDX_SSE (1<<25)
+#define CPUID_EDX_SSE2 (1<<26)
+#define CPUID_EDX_INTEL64 (1<<29) // "Long Mode" on AMD.
+#define CPUID_EDX_XD (1<<20)
+#define CPUID_ECX_SSE3 (1)
+#define CPUID_ECX_SSSE3 (1<<9)
+#define CPUID_ECX_SSE4A (1<<6)
+#define CPUID_ECX_SSE41 (1<<19)
+#define CPUID_ECX_SSE42 (1<<20)
+#define CPUID_ECX_AES (1<<25) // Encryption.
+#define CPUID_ECX_AVX (1<<28) // 256-bit YMM registers.
+#define CPUID_EBX_AVX2 (0x20)
+
+#define FBLOOPS_R 400
+#define FBLOOPS_W 800
+#define FB_SIZE (640*480*2)
+
+#endif
+
diff --git a/font.c b/font.c
new file mode 100755
index 0000000..483e478
--- /dev/null
+++ b/font.c
@@ -0,0 +1,1655 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+#include <stdio.h>
+
+#include "BMP.h"
+
+// Large characters, 17 rows per glyph.
+static const char *font_chars_ [] =
+{
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ " ",
+ " ",
+ "##",
+ "##",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ "## ##",
+ " # #",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ##########",
+ " ##########",
+ " ## ## ",
+ " ## ## ",
+ " ##########",
+ " ##########",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ "",
+ "",
+ "",
+
+ " ## ",
+ " ## ",
+ " ########",
+ "## ##",
+ "## ## ",
+ "## ## ",
+ " ###### ",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ "########",
+ " ## ",
+ " ## ",
+ "",
+ "",
+ "",
+
+ " ## ##",
+ " # # ##",
+ " ## ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "## ## ",
+ "## # #",
+ "## ## ",
+ "",
+ "",
+ "",
+
+ " #####",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ## ",
+ " ## ## ##",
+ " #### ##",
+ "## ## ##",
+ "## ####",
+ "## ##",
+ "## ##",
+ " ## ## ##",
+ " ##### ##",
+ "",
+ "",
+ "",
+
+ "###",
+ "###",
+ " ##",
+ " #",
+ " #",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+
+ "## ",
+ " ## ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##",
+ "",
+
+ " ",
+ "",
+ "",
+ " ##",
+ "## ## ##",
+ " ## ## ##",
+ " ######",
+ " ####",
+ " ######",
+ " ## ## ##",
+ "## ## ##",
+ " ##",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ "",
+ "",
+ "",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ " ##",
+ " ##",
+ "#",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "#######",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ "",
+ "",
+ "",
+
+ " ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##",
+ "##",
+ "",
+ "",
+ "",
+
+ " ##### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### ",
+ "",
+ "",
+ "",
+
+" ##",
+" ##",
+" ###",
+"#####",
+" ##",
+" ##",
+" ##",
+" ##",
+" ##",
+" ##",
+" ##",
+" ##",
+" ##",
+" ## ",
+ "",
+"",
+"",
+
+ " #### ",
+ " ## ##",
+ "## ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ## ",
+ "## ",
+ "## ",
+ "## ",
+ "########",
+ "",
+ "",
+ "",
+
+ "########",
+ " ##",
+ " ##",
+ " ##",
+ " ## ",
+ " ## ",
+ " #### ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "## ##",
+ " ## ##",
+ " #### ",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ###",
+ " ####",
+ " ## ##",
+ " ## ##",
+ "## ##",
+ "#########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ "########",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "###",
+ " ######",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "## ##",
+ " ## ##",
+ " #### ",
+ "",
+ "",
+ "",
+
+ " ##### ",
+ " ## #",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "#######",
+ "## ## ",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### ",
+ "",
+ "",
+ "",
+
+ "########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ " ##### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### ",
+ "",
+ "",
+ "",
+
+ " ##### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " #######",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ " ",
+ " ",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ "",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ "",
+ "",
+ "",
+ "",
+ "###",
+ "###",
+ " ##",
+ " ##",
+ "#",
+ "",
+
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##",
+ "##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "############",
+ "############",
+ " ",
+ " ",
+ "############",
+ "############",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ "## ",
+ " ## ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "##",
+ "",
+ "",
+ "",
+
+ " ###### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ " ##",
+ " ##",
+ " ### ",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ " ###### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ####",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ####",
+ "## ",
+ "## ",
+ " ## ##",
+ " #######",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ####",
+ " ####",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ########",
+ " ## ##",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ "########",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "######## ",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "########",
+ "",
+ "",
+ "",
+
+ " ###### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ######",
+ "",
+ "",
+ "",
+
+ "########",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ## ",
+ "########",
+ "",
+ "",
+ "",
+
+ "##########",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "########",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##########",
+ "",
+ "",
+ "",
+
+ "##########",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "########",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "",
+ "",
+ "",
+
+ " ###### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ",
+ "## ",
+ "## ####",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ###",
+ " ###### #",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "##########",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ " ## ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ####",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "####",
+ "####",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##########",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "### ###",
+ "#### ####",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ## ## ##",
+ "## ### ##",
+ "## ### ##",
+ "## # ##",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "### ##",
+ "#### ##",
+ "#### ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ####",
+ "## ####",
+ "## ###",
+ "## ##",
+ "",
+ "",
+ "",
+
+ " ###### ",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ######",
+ "",
+ "",
+ "",
+
+ "########",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "########",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "",
+ "",
+ "",
+
+ " ######",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ###",
+ " ## ##",
+ " ##### ##",
+ "",
+ "",
+ "",
+
+ "########",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "########",
+ "####",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ " ######",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "##",
+ " ###",
+ " #####",
+ " ##",
+ " ##",
+ " ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ######",
+ "",
+ "",
+ "",
+
+ "##########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ######",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ## ## ",
+ " ### ",
+ " ### ",
+ " # ",
+ "",
+ "",
+ "",
+
+"## ## ##",
+"## ## ##",
+"## ## ##",
+" ## #### ##",
+" ## #### ##",
+" ## ## ## ##",
+" ## ## ## ##",
+" ## ## ## ##",
+" ## ## ## ##",
+" ## ## ## ##",
+" ## ## ## ##",
+" ### ###",
+" ### ###",
+" # #",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ##",
+ " ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ####",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ "#########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "#########",
+ "",
+ "",
+ "",
+
+ "#####",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "#####",
+ "",
+
+ "##",
+ "##",
+ "##",
+ "##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "",
+ "",
+ "",
+
+ "#####",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "#####",
+ "",
+
+ " ##",
+ " ####",
+ " ## ##",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "########",
+ "",
+ "",
+ "",
+
+ "####",
+ "####",
+ "##",
+ " ##",
+ " #",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " ######",
+ " ## ##",
+ " ##",
+ " ##",
+ " ######",
+ " ## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### ##",
+ "",
+ "",
+ "",
+
+ "##",
+ "##",
+ "##",
+ "##",
+ "## ####",
+ "### ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "### ##",
+ "## ####",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " #####",
+ " ## ##",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ "## ",
+ " ## ##",
+ " #####",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " #### ##",
+ " ## ###",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ###",
+ " #### ##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " #####",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "#########",
+ "##",
+ "##",
+ "##",
+ " ## ##",
+ " ######",
+ "",
+ "",
+ "",
+
+ " ####",
+ " ##",
+ " ## ",
+ " ## ",
+ "#####",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ " ## ",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " ##### #",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " #######",
+ " ##",
+ "## ##",
+ " ######",
+
+ "##",
+ "##",
+ "##",
+ "##",
+ "## ####",
+ "### ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+"##",
+"##",
+"",
+"",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ##",
+ " ",
+ " ",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "###",
+
+ "##",
+ "##",
+ "##",
+ "##",
+ "## ##",
+ "## ##",
+ "## ## ",
+ "## ## ",
+ "#### ",
+ "#### ",
+ "## ## ",
+ "## ## ",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+"##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ### ####",
+ "### ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ####",
+ "### ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " #####",
+ " ## ## ",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ## ",
+ " ##### ",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ####",
+ "### ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "#######",
+ "##",
+ "##",
+ "##",
+
+ " ",
+ "",
+ "",
+ "",
+ " #### ##",
+ " ## ###",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " #######",
+ " ##",
+ " ##",
+ " ##",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ####",
+ "## ##",
+ "####",
+ "###",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ " #######",
+ "## ##",
+ "##",
+ " ##",
+ " ###",
+ " ###",
+ " ##",
+ " ##",
+ "## ##",
+ " #######",
+ "",
+ "",
+ "",
+
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ "######",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ####",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ##### #",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " #####",
+ " ###",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ## ##",
+ "## ## ##",
+ "## ## ##",
+ " ## #### ##",
+ " ## #### ##",
+ " ## #### ##",
+ " ## ## ## ##",
+ " ## ## ## ##",
+ " ##### #####",
+ " ### ###",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ##",
+ " ##",
+ " ## ##",
+ " ## ##",
+ "## ##",
+ "## ##",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "## ##",
+ "## ##",
+ "## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ## ##",
+ " ####",
+ " ##",
+ " ##",
+ "####",
+
+ " ",
+ "",
+ "",
+ "",
+ "#########",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ##",
+ " ## ",
+ " ## ",
+ "## ",
+ "#########",
+ "",
+ "",
+ "",
+
+};
+
+const char **get_font_chars (void)	// parameter list now matches the prototype in font.h
+{
+	return font_chars_;	// hands out the static table of glyph row strings
+}
+
diff --git a/font.h b/font.h
new file mode 100755
index 0000000..5ecb8ed
--- /dev/null
+++ b/font.h
@@ -0,0 +1,28 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+#ifndef _FONT_H
+#define _FONT_H
+
+extern const char **get_font_chars (void);
+
+#endif
+
diff --git a/loopback.sh b/loopback.sh
new file mode 100755
index 0000000..780d50f
--- /dev/null
+++ b/loopback.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+EXE=bandwidth32
+./$EXE --transponder &
+./$EXE --network 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1
+kill %1
diff --git a/main.c b/main.c
new file mode 100755
index 0000000..2d293a8
--- /dev/null
+++ b/main.c
@@ -0,0 +1,2442 @@
+/*============================================================================
+ bandwidth 1.1, a benchmark to estimate memory transfer bandwidth.
+ Copyright (C) 2005-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *===========================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <math.h>
+
+#include <netdb.h> // gethostbyname
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define GRAPH_WIDTH 1440
+#define GRAPH_HEIGHT 900
+
+#include "defs.h"
+#include "BMP.h"
+#include "BMPGraphing.h"
+
+#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
+#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
+
+#ifdef __WIN32__
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/fb.h>
+#include <sys/mman.h>
+#endif
+
+static int network_port = NETWORK_DEFAULT_PORTNUM;
+
+enum {
+ NO_SSE2,
+ SSE2,
+ SSE2_BYPASS,
+ AVX,
+ AVX_BYPASS,
+ LODSQ,
+ LODSD,
+ LODSW,
+ LODSB
+};
+
+static BMPGraph *graph = NULL;
+
+static bool use_sse2 = true;
+static bool use_sse4 = true;
+static bool is_intel = false;
+static bool is_amd = false;
+
+static uint32_t cpu_has_mmx = 0;
+static uint32_t cpu_has_sse = 0;
+static uint32_t cpu_has_sse2 = 0;
+static uint32_t cpu_has_sse3 = 0;
+static uint32_t cpu_has_ssse3 = 0;
+static uint32_t cpu_has_sse4a = 0;
+static uint32_t cpu_has_sse41 = 0;
+static uint32_t cpu_has_sse42 = 0;
+static uint32_t cpu_has_aes = 0;
+static uint32_t cpu_has_avx = 0;
+static uint32_t cpu_has_avx2 = 0;
+static uint32_t cpu_has_64bit = 0;
+static uint32_t cpu_has_xd = 0;
+
+//----------------------------------------
+// Parameters for the tests.
+//
+
+static long usec_per_test = 5000000; // 5 seconds per memory test.
+
+static int chunk_sizes[] = {
+ 128,
+ 256,
+ 384,
+ 512,
+ 640,
+ 768,
+ 896,
+ 1024,
+ 1280,
+ 2048,
+ 3072,
+ 4096,
+ 6144,
+ 8192, // Some processors' L1 data caches are only 8kB.
+ 12288,
+ 16384,
+ 20480,
+ 24576,
+ 28672,
+ 32768, // Common L1 data cache size.
+ 34*1024,
+ 36*1024,
+ 40960,
+ 49152,
+ 65536,
+ 131072, // Old L2 cache size.
+ 192 * 1024,
+ 256 * 1024, // Old L2 cache size.
+ 320 * 1024,
+ 384 * 1024,
+ 512 * 1024, // Old L2 cache size.
+ 768 * 1024,
+ 1 << 20, // 1 MB = common L2 cache size.
+ (1024 + 256) * 1024, // 1.25
+ (1024 + 512) * 1024, // 1.5
+ (1024 + 768) * 1024, // 1.75
+ 1 << 21, // 2 MB = common L2 cache size.
+ (2048 + 256) * 1024, // 2.25
+ (2048 + 512) * 1024, // 2.5
+ (2048 + 768) * 1024, // 2.75
+ 3072 * 1024, // 3 MB = common L2 cache size.
+ 3407872, // 3.25 MB
+ 3 * 1024 * 1024 + 1024 * 512, // 3.5 MB
+ 1 << 22, // 4 MB
+ 5242880, // 5 megs
+ 6291456, // 6 megs (common L2 cache size)
+ 7 * 1024 * 1024,
+ 8 * 1024 * 1024, // Xeon E3's often has 8MB L3
+ 9 * 1024 * 1024,
+ 10 * 1024 * 1024, // Xeon E5-2609 has 10MB L3
+ 12 * 1024 * 1024,
+ 14 * 1024 * 1024,
+ 15 * 1024 * 1024, // Xeon E6-2630 has 15MB L3
+ 16 * 1024 * 1024,
+ 20 * 1024 * 1024, // Xeon E5-2690 has 20MB L3
+ 21 * 1024 * 1024,
+ 32 * 1024 * 1024,
+ 48 * 1024 * 1024,
+ 64 * 1024 * 1024,
+ 72 * 1024 * 1024,
+ 96 * 1024 * 1024,
+ 128 * 1024 * 1024,
+ 0
+};
+
+static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)];
+
+//----------------------------------------------------------------------------
+// Name: error
+// Purpose: Complain and exit.
+//----------------------------------------------------------------------------
+void error (char *s)	// Report the message s to the user and terminate the program.
+{
+#ifndef __WIN32__
+	fprintf (stderr, "Error: %s\n", s);
+	exit (1);
+#else
+	wchar_t tmp [200];	// widened copy of s for MessageBoxW
+	int i;
+	for (i = 0; i < (int) (sizeof tmp / sizeof tmp[0]) - 1 && s[i]; i++)	// truncate so the terminator fits
+		tmp[i] = s[i];
+	tmp[i] = 0;
+	MessageBoxW (0, tmp, L"Error", 0);
+	ExitProcess (0);
+#endif
+}
+
+//============================================================================
+// Output buffer logic.
+// This is somewhat vestigial code, originating with Windows Mobile ARM port.
+//============================================================================
+
+#define MSGLEN 10000
+static wchar_t msg [MSGLEN];
+
+void print (wchar_t *s)	// Append s to the pending-output buffer msg (truncating if it is full).
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));	// limit = room left in msg; wcsncat adds the terminator itself
+}
+
+void newline ()	// Append a line break to the pending-output buffer msg.
+{
+	wcsncat (msg, L"\n", MSGLEN - 1 - wcslen (msg));	// limit = room left in msg, not the whole buffer size
+}
+
+void println (wchar_t *s)	// Append s and then a line break to the pending-output buffer msg.
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));	// limit = room left in msg, not the whole buffer size
+	newline ();
+}
+
+void print_int (int d)	// Append d, formatted as a signed decimal, to msg.
+{
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%d", d);	// size = space remaining past the write position
+}
+
+void print_uint (unsigned int d)	// Append d, formatted as an unsigned decimal, to msg.
+{
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%u", d);	// %u matches unsigned int; size = space remaining
+}
+
+void println_int (int d) // Append d in decimal followed by a line break to msg.
+{
+ print_int (d); // decimal text of d
+ newline (); // then end the line
+}
+
+void print_result (long double result)	// Append result with one decimal place and an " MB/s" suffix to msg.
+{
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%.1Lf MB/s", result);	// size = space remaining
+}
+
+void dump (FILE *f)	// Write the text buffered in msg to f (stdout when f is NULL), then empty the buffer.
+{
+	FILE *out = f ? f : stdout;
+	const wchar_t *cursor;
+
+	// Each wide character is narrowed to a single byte before it is written.
+	for (cursor = msg; *cursor; cursor++) {
+		char narrow = (char) *cursor;
+		fputc (narrow, out);
+	}
+
+	// An empty string marks the buffer as drained for the next print call.
+	msg [0] = 0;
+}
+
+void flush () // Emit everything buffered in msg on stdout and flush the stream.
+{
+ dump (NULL); // NULL makes dump() write to stdout and clear msg
+ fflush (stdout); // push stdio's own buffer out as well
+}
+
+void print_size (unsigned long size)	// Append size to msg in B, kB or MB (MB with quarter-MB fractions).
+{
+	// Fraction text for the MB case, indexed by bits 18-19 of size (units of 256 kB).
+	static wchar_t *quarter_suffix [] = { L"", L".25", L".5", L".75" };
+
+	if (size < 1536) {
+		print_int (size);
+		print (L" B");
+		return;
+	}
+	if (size < (1<<20)) {
+		print_int (size >> 10);
+		print (L" kB");
+		return;
+	}
+	print_int (size >> 20);
+	print (quarter_suffix [(size >> 18) & 3]);
+	print (L" MB");
+}
+
+//============================================================================
+// Timing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name: mytime
+// Purpose: Reports time in microseconds.
+//----------------------------------------------------------------------------
+unsigned long mytime ()	// Current time in microseconds; the value wraps when unsigned long overflows.
+{
+#ifndef __WIN32__
+	struct timeval tv;
+	struct timezone tz;
+	memset (&tz, 0, sizeof(struct timezone));
+	gettimeofday (&tv, &tz);
+	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;	// unsigned math: signed 1000000 * tv_sec overflows a 32-bit long
+#else
+	return 1000 * GetTickCount (); // accurate enough.
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Name: calculate_result
+// Purpose: Calculates and prints a result.
+// Returns: 10 times the number of megabytes per second.
+//----------------------------------------------------------------------------
+int
+calculate_result (unsigned long chunk_size, long long total_loops, long diff)
+{
+ if (!diff) // a zero elapsed time would divide by zero below
+ error ("Zero time difference."); // error() terminates the program
+
+// printf ("\nIn calculate_result, chunk_size=%ld, total_loops=%lld, diff=%ld\n", chunk_size, total_loops, diff);
+ long double result = (long double) chunk_size;
+ result *= (long double) total_loops; // chunk_size * total_loops
+ result *= 1000000.; // Scale by 1e6; presumably diff is in microseconds (cf. mytime), giving a per-second figure.
+ result /= 1048576.; // divide by 2^20
+ result /= (long double) diff;
+
+ print_result (result); // appends "<result> MB/s" to the output buffer
+
+ return (long) (10.0 * result); // ten times the printed value, fraction discarded by the cast
+}
+
+//============================================================================
+// Tests.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:    do_write
+// Purpose: Performs write on chunk of memory of specified size.
+// Params:  size   = bytes per pass; must be a multiple of 128.
+//          mode   = SSE2, SSE2_BYPASS, AVX or AVX_BYPASS; any other value
+//                   selects the main-register writers.
+//          random = true to write 256-byte blocks in shuffled order.
+// Returns: The value of calculate_result(): 10 times the MB/second.
+// Note:    AVX and AVX_BYPASS have no random writer here; with random set
+//          those modes run the timing loop without writing anything.
+//----------------------------------------------------------------------------
+int
+do_write (unsigned long size, int mode, bool random)
+{
+    unsigned char *chunk;
+    unsigned char *chunk0;
+    unsigned long loops;
+    unsigned long long total_count = 0;
+#ifdef __x86_64__
+    unsigned long value = 0x1234567689abcdef;
+#else
+    unsigned long value = 0x12345678;
+#endif
+    unsigned long diff = 0, t0;
+    unsigned long misalign;
+    unsigned long n_blocks = size / 256;
+    unsigned long **chunk_ptrs = NULL;
+
+    if (size & 127)
+        error ("do_write(): chunk size is not multiple of 128.");
+
+    //-------------------------------------------------
+    // Over-allocate so the working pointer can be moved up to a 32-byte
+    // boundary; chunk0 keeps the address that must be freed.
+    //
+    chunk0 = malloc (size+64);
+    chunk = chunk0;
+    if (!chunk)
+        error ("Out of memory");
+
+    // Only the low five address bits matter, so advance the pointer itself
+    // rather than converting an integer back into a pointer (unsigned long
+    // may be narrower than a pointer).
+    misalign = (unsigned long) chunk & 31;
+    if (misalign)
+        chunk += 32 - misalign;
+
+    //----------------------------------------
+    // Set up random pointers to chunks.
+    //
+    if (random) {
+        unsigned long i;
+
+        // malloc(0) may return NULL, which must not be mistaken for an
+        // out-of-memory condition when size is below 256.
+        chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) *
+                                               (n_blocks ? n_blocks : 1));
+        if (!chunk_ptrs)
+            error ("Out of memory.");
+
+        //----------------------------------------
+        // Store pointers to all chunks into array.
+        // The index is unsigned long so 256 * i cannot overflow an int.
+        //
+        for (i = 0; i < n_blocks; i++) {
+            chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+        }
+
+        //----------------------------------------
+        // Randomize the array of chunk pointers.
+        //
+        int k = 100;
+        while (k--) {
+            for (i = 0; i < n_blocks; i++) {
+                unsigned long j = rand() % n_blocks;
+                if (i != j) {
+                    unsigned long *ptr = chunk_ptrs [i];
+                    chunk_ptrs [i] = chunk_ptrs [j];
+                    chunk_ptrs [j] = ptr;
+                }
+            }
+        }
+    }
+
+    //-------------------------------------------------
+    if (random)
+        print (L"Random write ");
+    else
+        print (L"Sequential write ");
+
+    switch (mode) {
+    case SSE2:
+        print (L"(128-bit), size = ");
+        break;
+    case AVX:
+        print (L"(256-bit), size = ");
+        break;
+    case AVX_BYPASS:
+        print (L"bypassing cache (256-bit), size = ");
+        break;
+    case SSE2_BYPASS:
+        print (L"bypassing cache (128-bit), size = ");
+        break;
+    default:
+#ifdef __x86_64__
+        print (L"(64-bit), size = ");
+#else
+        print (L"(32-bit), size = ");
+#endif
+    }
+
+    print_size (size);
+    print (L", ");
+
+    loops = (1 << 26) / size;// XX need to adjust for CPU MHz
+    if (loops < 1)
+        loops = 1;
+
+    t0 = mytime ();
+
+    // Repeat batches of 'loops' passes until the time budget is used up.
+    while (diff < usec_per_test) {
+        total_count += loops;
+
+        switch (mode) {
+        case SSE2:
+            if (random)
+                RandomWriterSSE2 (chunk_ptrs, n_blocks, loops, value);
+            else {
+                if (size & 128)
+                    WriterSSE2_128bytes (chunk, size, loops, value);
+                else
+                    WriterSSE2 (chunk, size, loops, value);
+            }
+            break;
+
+        case SSE2_BYPASS:
+            if (random)
+                RandomWriterSSE2_bypass (chunk_ptrs, n_blocks, loops, value);
+            else {
+                if (size & 128)
+                    WriterSSE2_128bytes_bypass (chunk, size, loops, value);
+                else
+                    WriterSSE2_bypass (chunk, size, loops, value);
+            }
+            break;
+
+        case AVX:
+            if (!random) {
+                WriterAVX (chunk, size, loops, value);
+            }
+            break;
+
+        case AVX_BYPASS:
+            if (!random) {
+                WriterAVX_bypass (chunk, size, loops, value);
+            }
+            break;
+
+        default:
+            if (random)
+                RandomWriter (chunk_ptrs, n_blocks, loops, value);
+            else {
+                if (size & 128)
+                    Writer_128bytes (chunk, size, loops, value);
+                else
+                    Writer (chunk, size, loops, value);
+            }
+        }
+
+        diff = mytime () - t0;
+    }
+
+    print (L"loops = ");
+    print_uint (total_count);
+    print (L", ");
+
+    flush ();
+
+    int result = calculate_result (size, total_count, diff);
+    newline ();
+
+    flush ();
+
+    free (chunk0);
+    free (chunk_ptrs);
+
+    return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name:    do_read
+// Purpose: Performs sequential or random read on chunk of memory of
+//          specified size.
+// Params:  size   = bytes per pass; must be a multiple of 128.
+//          mode   = SSE2, SSE2_BYPASS, AVX, LODSB, LODSW, LODSD or LODSQ;
+//                   any other value selects the main-register readers.
+//          random = true to read 256-byte blocks in shuffled order.
+// Returns: The value of calculate_result(): 10 times the MB/second.
+// Note:    AVX and the LODS modes have no random reader here; with random
+//          set they run the timing loop without reading. AVX_BYPASS has a
+//          label but no case of its own in the timing switch, so it takes
+//          the default readers.
+//----------------------------------------------------------------------------
+int
+do_read (unsigned long size, int mode, bool random)
+{
+    unsigned long loops;
+    unsigned long long total_count = 0;
+    unsigned long t0, diff=0;
+    unsigned char *chunk;
+    unsigned char *chunk0;
+    unsigned long misalign;
+    unsigned long n_blocks = size / 256;
+    unsigned long **chunk_ptrs = NULL;
+
+    if (size & 127)
+        error ("do_read(): chunk size is not multiple of 128.");
+
+    //-------------------------------------------------
+    // Over-allocate so the working pointer can be moved up to a 32-byte
+    // boundary; chunk0 keeps the address that must be freed.
+    //
+    chunk0 = chunk = malloc (size+64);
+    if (!chunk)
+        error ("Out of memory");
+
+    // Only the low five address bits matter, so advance the pointer itself
+    // rather than converting an integer back into a pointer (unsigned long
+    // may be narrower than a pointer).
+    misalign = (unsigned long) chunk & 31;
+    if (misalign)
+        chunk += 32 - misalign;
+
+    // Clear exactly the range that will be read (i.e. after aligning).
+    memset (chunk, 0, size);
+
+    //----------------------------------------
+    // Set up random pointers to chunks.
+    //
+    if (random) {
+        unsigned long i;
+
+        // malloc(0) may return NULL, which must not be mistaken for an
+        // out-of-memory condition when size is below 256.
+        chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) *
+                                               (n_blocks ? n_blocks : 1));
+        if (!chunk_ptrs)
+            error ("Out of memory.");
+
+        //----------------------------------------
+        // Store pointers to all chunks into array.
+        // The index is unsigned long so 256 * i cannot overflow an int.
+        //
+        for (i = 0; i < n_blocks; i++) {
+            chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+        }
+
+        //----------------------------------------
+        // Randomize the array of chunk pointers.
+        //
+        int k = 100;
+        while (k--) {
+            for (i = 0; i < n_blocks; i++) {
+                unsigned long j = rand() % n_blocks;
+                if (i != j) {
+                    unsigned long *ptr = chunk_ptrs [i];
+                    chunk_ptrs [i] = chunk_ptrs [j];
+                    chunk_ptrs [j] = ptr;
+                }
+            }
+        }
+    }
+
+    //-------------------------------------------------
+    if (random)
+        print (L"Random read ");
+    else
+        print (L"Sequential read ");
+
+    switch (mode) {
+    case SSE2:
+        print (L"(128-bit), size = ");
+        break;
+    case LODSB:
+        print (L"(8-bit LODSB), size = ");
+        break;
+    case LODSW:
+        print (L"(16-bit LODSW), size = ");
+        break;
+    case LODSD:
+        print (L"(32-bit LODSD), size = ");
+        break;
+    case LODSQ:
+        print (L"(64-bit LODSQ), size = ");
+        break;
+    case AVX:
+        print (L"(256-bit), size = ");
+        break;
+    case AVX_BYPASS:
+        print (L"bypassing cache (256-bit), size = ");
+        break;
+    case SSE2_BYPASS:
+        print (L"bypassing cache (128-bit), size = ");
+        break;
+    default:
+#ifdef __x86_64__
+        print (L"(64-bit), size = ");
+#else
+        print (L"(32-bit), size = ");
+#endif
+    }
+
+    print_size (size);
+    print (L", ");
+
+    flush ();
+
+    loops = (1 << 26) / size; // XX need to adjust for CPU MHz
+    if (loops < 1)
+        loops = 1;
+
+    t0 = mytime ();
+
+    // Repeat batches of 'loops' passes until the time budget is used up.
+    while (diff < usec_per_test) {
+        total_count += loops;
+
+        switch (mode) {
+        case SSE2:
+            if (random)
+                RandomReaderSSE2 (chunk_ptrs, n_blocks, loops);
+            else {
+                if (size & 128)
+                    ReaderSSE2_128bytes (chunk, size, loops);
+                else
+                    ReaderSSE2 (chunk, size, loops);
+            }
+            break;
+
+        case SSE2_BYPASS:
+            if (random)
+                RandomReaderSSE2_bypass (chunk_ptrs, n_blocks, loops);
+            else {
+                if (size & 128)
+                    ReaderSSE2_128bytes_bypass (chunk, size, loops);
+                else
+                    ReaderSSE2_bypass (chunk, size, loops);
+            }
+            break;
+
+        case AVX:
+            if (!random) {
+                ReaderAVX (chunk, size, loops);
+            }
+            break;
+
+        case LODSB:
+            if (!random) {
+                ReaderLODSB (chunk, size, loops);
+            }
+            break;
+
+        case LODSW:
+            if (!random) {
+                ReaderLODSW (chunk, size, loops);
+            }
+            break;
+
+        case LODSD:
+            if (!random) {
+                ReaderLODSD (chunk, size, loops);
+            }
+            break;
+
+        case LODSQ:
+            if (!random) {
+                ReaderLODSQ (chunk, size, loops);
+            }
+            break;
+
+        default:
+            if (random) {
+                RandomReader (chunk_ptrs, n_blocks, loops);
+            } else {
+                if (size & 128)
+                    Reader_128bytes (chunk, size, loops);
+                else
+                    Reader (chunk, size, loops);
+            }
+        }
+
+        diff = mytime () - t0;
+    }
+
+    print (L"loops = ");
+    print_uint (total_count);
+    print (L", ");
+
+    int result = calculate_result (size, total_count, diff);
+    newline ();
+
+    flush ();
+
+    free (chunk0);
+    free (chunk_ptrs);
+
+    return result;
+}
+
+
+
+//----------------------------------------------------------------------------
+// Name:    do_copy
+// Purpose: Performs sequential memory copy.
+// Params:  size = bytes per pass; must be a multiple of 128.
+//          mode = SSE2 or AVX.
+// Returns: The value of calculate_result(): 10 times the MB/second.
+// Note:    Only SSE2 and AVX have copy routines here, and the AVX routine
+//          is skipped when size has the 128 bit set. In every other case
+//          the timing loop runs without copying.
+//----------------------------------------------------------------------------
+int
+do_copy (unsigned long size, int mode)
+{
+    unsigned long loops;
+    unsigned long long total_count = 0;
+    unsigned long t0, diff=0;
+    unsigned char *chunk_src;
+    unsigned char *chunk_dest;
+    unsigned char *chunk_src0;
+    unsigned char *chunk_dest0;
+    unsigned long misalign;
+
+    if (size & 127)
+        error ("do_copy(): chunk size is not multiple of 128.");
+
+    //-------------------------------------------------
+    // Over-allocate both buffers so the working pointers can be moved up
+    // to a 32-byte boundary; the *0 pointers keep the addresses to free.
+    //
+    chunk_src0 = chunk_src = malloc (size+64);
+    if (!chunk_src)
+        error ("Out of memory");
+    chunk_dest0 = chunk_dest = malloc (size+64);
+    if (!chunk_dest)
+        error ("Out of memory");
+
+    // Only the low five address bits matter, so advance the pointers
+    // themselves rather than converting integers back into pointers
+    // (unsigned long may be narrower than a pointer).
+    misalign = (unsigned long) chunk_src & 31;
+    if (misalign)
+        chunk_src += 32 - misalign;
+    misalign = (unsigned long) chunk_dest & 31;
+    if (misalign)
+        chunk_dest += 32 - misalign;
+
+    // Fill exactly the ranges that will be copied (i.e. after aligning).
+    memset (chunk_src, 100, size);
+    memset (chunk_dest, 200, size);
+
+    //-------------------------------------------------
+    print (L"Sequential copy ");
+
+    if (mode == SSE2) {
+        print (L"(128-bit), size = ");
+    }
+    else if (mode == AVX) {
+        print (L"(256-bit), size = ");
+    }
+    else {
+#ifdef __x86_64__
+        print (L"(64-bit), size = ");
+#else
+        print (L"(32-bit), size = ");
+#endif
+    }
+
+    print_size (size);
+    print (L", ");
+
+    flush ();
+
+    loops = (1 << 26) / size; // XX need to adjust for CPU MHz
+    if (loops < 1)
+        loops = 1;
+
+    t0 = mytime ();
+
+    // Repeat batches of 'loops' passes until the time budget is used up.
+    while (diff < usec_per_test) {
+        total_count += loops;
+
+        if (mode == SSE2) {
+#ifdef __x86_64__
+            if (size & 128)
+                CopySSE_128bytes (chunk_dest, chunk_src, size, loops);
+            else
+                CopySSE (chunk_dest, chunk_src, size, loops);
+#else
+            CopySSE (chunk_dest, chunk_src, size, loops);
+#endif
+        }
+        else if (mode == AVX) {
+            if (!(size & 128))
+                CopyAVX (chunk_dest, chunk_src, size, loops);
+        }
+
+        diff = mytime () - t0;
+    }
+
+    print (L"loops = ");
+    print_uint (total_count);
+    print (L", ");
+
+    int result = calculate_result (size, total_count, diff);
+    newline ();
+
+    flush ();
+
+    free (chunk_src0);
+    free (chunk_dest0);
+
+    return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name:    fb_readwrite
+// Purpose: Performs sequential read & write tests on framebuffer memory.
+// Params:  use_sse2 = true to use ReaderSSE2 / WriterSSE2_bypass, false to
+//                     use the main-register Reader / Writer.
+// Note:    Prints a message and returns early if the framebuffer cannot be
+//          opened, queried or mapped, is not direct/truecolor, or is
+//          smaller than FB_SIZE. The mapping and the descriptor are
+//          released before returning.
+//----------------------------------------------------------------------------
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+void
+fb_readwrite (bool use_sse2)
+{
+    unsigned long total_count;
+    unsigned long length;
+    unsigned long fblen;
+    unsigned long diff, t0;
+    static struct fb_fix_screeninfo fi;
+    static struct fb_var_screeninfo vi;
+    unsigned long *fb = NULL;
+    int fd;
+#ifdef __x86_64__
+    unsigned long value = 0x1234567689abcdef;
+#else
+    unsigned long value = 0x12345678;
+#endif
+
+    //-------------------------------------------------
+
+    fd = open ("/dev/fb0", O_RDWR);
+    if (fd < 0)
+        fd = open ("/dev/fb/0", O_RDWR);
+    if (fd < 0) {
+        println (L"Cannot open framebuffer device.");
+        return;
+    }
+
+    if (ioctl (fd, FBIOGET_FSCREENINFO, &fi) ||
+        ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
+        close (fd);
+        println (L"Cannot get framebuffer info");
+        return;
+    }
+
+    if (fi.visual != FB_VISUAL_TRUECOLOR &&
+        fi.visual != FB_VISUAL_DIRECTCOLOR ) {
+        close (fd);
+        println (L"Need direct/truecolor framebuffer device.");
+        return;
+    }
+
+    print (L"Framebuffer resolution: ");
+    print_int (vi.xres);
+    print (L"x");
+    print_int (vi.yres);
+    print (L", ");
+    print_int (vi.bits_per_pixel);
+    println (L" bpp\n");
+
+    //-------------------
+    // Use only the upper half of the display.
+    //
+    length = FB_SIZE;
+
+    // The routines below touch 'length' bytes; never run past the mapping.
+    fblen = fi.smem_len;
+    if (fblen < length) {
+        close (fd);
+        println (L"Framebuffer memory is too small.");
+        return;
+    }
+
+    // Let the kernel choose the address: smem_start is not a usable
+    // placement hint for this process.
+    fb = mmap (NULL, fblen,
+        PROT_WRITE | PROT_READ,
+        MAP_SHARED, fd, 0);
+    if (fb == MAP_FAILED) {
+        close (fd);
+        println (L"Cannot access framebuffer memory.");
+        return;
+    }
+
+    //-------------------
+    // READ
+    //
+    print (L"Framebuffer memory sequential read ");
+    flush ();
+
+    t0 = mytime ();
+
+    total_count = FBLOOPS_R;
+
+    if (use_sse2)
+        ReaderSSE2 (fb, length, FBLOOPS_R);
+    else
+        Reader (fb, length, FBLOOPS_R);
+
+    diff = mytime () - t0;
+
+    calculate_result (length, total_count, diff);
+    newline ();
+
+    //-------------------
+    // WRITE
+    //
+    print (L"Framebuffer memory sequential write ");
+    flush ();
+
+    t0 = mytime ();
+
+    total_count = FBLOOPS_W;
+
+    if (use_sse2)
+        WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
+    else
+        Writer (fb, length, FBLOOPS_W, value);
+
+    diff = mytime () - t0;
+
+    calculate_result (length, total_count, diff);
+    newline ();
+
+    // Release the mapping and the device; previously both were leaked.
+    munmap (fb, fblen);
+    close (fd);
+}
+#endif
+
+//----------------------------------------------------------------------------
+// Name:    register_test
+// Purpose: Determines bandwidth of register-to-register transfers.
+// Note:    Each routine is run in batches until usec_per_test microseconds
+//          have elapsed, then its rate is printed. The 256-bit test runs
+//          only if cpu_has_avx is set; the 8-, 32- and 64-bit datum tests
+//          run only if use_sse4 is set.
+//          REGISTER_COUNT and VREGISTER_COUNT are defined here and remain
+//          defined afterwards.
+//----------------------------------------------------------------------------
+void
+register_test ()
+{
+    long long transfers = 0;
+    unsigned long start;
+    unsigned long elapsed = 0;
+
+    //--------------------------------------
+    // One timing loop shared by every test below. It is a macro rather
+    // than a function taking a pointer so that each transfer routine is
+    // still called directly by name.
+    //   routine = transfer routine to call with 'count'.
+    //   count   = batch size passed to the routine per call.
+    //   bytes   = first argument handed to calculate_result().
+    //
+#define TIME_REGISTER_ROUTINE(routine, count, bytes) \
+    do { \
+        start = mytime (); \
+        elapsed = 0; \
+        transfers = 0; \
+        while (elapsed < usec_per_test) { \
+            routine (count); \
+            transfers += (count); \
+            elapsed = mytime () - start; \
+        } \
+        calculate_result ((bytes), transfers, elapsed); \
+        newline (); \
+        flush (); \
+    } while (0)
+
+    //--------------------------------------
+#ifdef __x86_64__
+    print (L"Main register to main register transfers (64-bit) ");
+#else
+    print (L"Main register to main register transfers (32-bit) ");
+#endif
+    flush ();
+#define REGISTER_COUNT 10000
+    TIME_REGISTER_ROUTINE (RegisterToRegister, REGISTER_COUNT, 256);
+
+    //--------------------------------------
+#ifdef __x86_64__
+    print (L"Main register to vector register transfers (64-bit) ");
+#else
+    print (L"Main register to vector register transfers (32-bit) ");
+#endif
+    flush ();
+#define VREGISTER_COUNT 3333
+    TIME_REGISTER_ROUTINE (RegisterToVector, VREGISTER_COUNT, 256);
+
+    //--------------------------------------
+#ifdef __x86_64__
+    print (L"Vector register to main register transfers (64-bit) ");
+#else
+    print (L"Vector register to main register transfers (32-bit) ");
+#endif
+    flush ();
+    TIME_REGISTER_ROUTINE (VectorToRegister, VREGISTER_COUNT, 256);
+
+    //--------------------------------------
+    print (L"Vector register to vector register transfers (128-bit) ");
+    flush ();
+    TIME_REGISTER_ROUTINE (VectorToVector, VREGISTER_COUNT, 256);
+
+    //--------------------------------------
+    if (cpu_has_avx) {
+        print (L"Vector register to vector register transfers (256-bit) ");
+        flush ();
+        TIME_REGISTER_ROUTINE (VectorToVectorAVX, VREGISTER_COUNT, 256);
+    }
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Vector 8-bit datum to main register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Vector8ToRegister, VREGISTER_COUNT, 64);
+    }
+
+    //--------------------------------------
+    print (L"Vector 16-bit datum to main register transfers ");
+    flush ();
+    TIME_REGISTER_ROUTINE (Vector16ToRegister, VREGISTER_COUNT, 128);
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Vector 32-bit datum to main register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Vector32ToRegister, VREGISTER_COUNT, 256);
+    }
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Vector 64-bit datum to main register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Vector64ToRegister, VREGISTER_COUNT, 256);
+    }
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Main register 8-bit datum to vector register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Register8ToVector, VREGISTER_COUNT, 64);
+    }
+
+    //--------------------------------------
+    print (L"Main register 16-bit datum to vector register transfers ");
+    flush ();
+    TIME_REGISTER_ROUTINE (Register16ToVector, VREGISTER_COUNT, 128);
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Main register 32-bit datum to vector register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Register32ToVector, VREGISTER_COUNT, 256);
+    }
+
+    //--------------------------------------
+    if (use_sse4) {
+        print (L"Main register 64-bit datum to vector register transfers ");
+        flush ();
+        TIME_REGISTER_ROUTINE (Register64ToVector, VREGISTER_COUNT, 256);
+    }
+
+#undef TIME_REGISTER_ROUTINE
+}
+
+//----------------------------------------------------------------------------
+// Name:    stack_test
+// Purpose: Determines bandwidth of stack-to/from-register transfers.
+// Note:    Runs StackReader, then StackWriter, each in batches of
+//          REGISTER_COUNT until usec_per_test microseconds have elapsed.
+//----------------------------------------------------------------------------
+void
+stack_test ()
+{
+    long long transfers;
+    unsigned long start;
+    unsigned long elapsed;
+
+    //--------------------------------------
+    // Stack to register.
+    //
+#ifdef __x86_64__
+    print (L"Stack-to-register transfers (64-bit) ");
+#else
+    print (L"Stack-to-register transfers (32-bit) ");
+#endif
+    flush ();
+
+    elapsed = 0;
+    transfers = 0;
+    for (start = mytime (); elapsed < usec_per_test; elapsed = mytime () - start) {
+        StackReader (REGISTER_COUNT);
+        transfers += REGISTER_COUNT;
+    }
+
+    calculate_result (256, transfers, elapsed);
+    newline ();
+    flush ();
+
+    //--------------------------------------
+    // Register to stack.
+    //
+#ifdef __x86_64__
+    print (L"Register-to-stack transfers (64-bit) ");
+#else
+    print (L"Register-to-stack transfers (32-bit) ");
+#endif
+    flush ();
+
+    elapsed = 0;
+    transfers = 0;
+    for (start = mytime (); elapsed < usec_per_test; elapsed = mytime () - start) {
+        StackWriter (REGISTER_COUNT);
+        transfers += REGISTER_COUNT;
+    }
+
+    calculate_result (256, transfers, elapsed);
+    newline ();
+    flush ();
+}
+
+//----------------------------------------------------------------------------
+// Name:    library_test
+// Purpose: Performs C library tests (memset, memcpy).
+// Note:    Each call is timed over NT_SIZE2 passes across an NT_SIZE-byte
+//          buffer and the rate is printed via calculate_result().
+//----------------------------------------------------------------------------
+void
+library_test ()
+{
+    #define NT_SIZE (64*1024*1024)
+    #define NT_SIZE2 (100)
+
+    unsigned long started, finished;
+    char *first;
+    char *second;
+    int pass;
+
+    first = malloc (NT_SIZE);
+    if (first == NULL)
+        error ("Out of memory");
+
+    second = malloc (NT_SIZE);
+    if (second == NULL)
+        error ("Out of memory");
+
+    //--------------------------------------
+    // memset: the fill byte changes on every pass.
+    //
+    started = mytime ();
+    pass = 0;
+    while (pass < NT_SIZE2) {
+        memset (first, pass, NT_SIZE);
+        pass++;
+    }
+    finished = mytime ();
+
+    print (L"Library: memset ");
+    calculate_result (NT_SIZE, NT_SIZE2, finished - started);
+    newline ();
+
+    flush ();
+
+    //--------------------------------------
+    // memcpy: first buffer into second buffer.
+    //
+    started = mytime ();
+    pass = 0;
+    while (pass < NT_SIZE2) {
+        memcpy (second, first, NT_SIZE);
+        pass++;
+    }
+    finished = mytime ();
+
+    print (L"Library: memcpy ");
+    calculate_result (NT_SIZE, NT_SIZE2, finished - started);
+    newline ();
+
+    flush ();
+
+    free (first);
+    free (second);
+}
+
+//----------------------------------------------------------------------------
+// Name:    network_test_core
+// Purpose: Performs the network test, talking to and receiving data
+//          back from a transponder node.
+// Params:  hostname   = transponder to contact (port taken from the
+//                       network_port global).
+//          chunk      = caller's buffer of chunk_size bytes; its contents
+//                       are overwritten.
+//          chunk_size = bytes per chunk.
+//          n_chunks   = number of chunks to send and to expect back.
+//          duration_send_return = receives the microseconds from just
+//                       before the first send until the first part of the
+//                       reply has arrived.
+//          duration_recv_return = receives the microseconds from the end
+//                       of sending until the whole reply has arrived.
+// Returns: false on any error (bad argument, lookup, connect, send or
+//          receive failure, malformed reply), else true with both
+//          durations stored.
+//----------------------------------------------------------------------------
+bool
+network_test_core (const char *hostname, char *chunk,
+    unsigned long chunk_size,
+    unsigned long n_chunks,
+    long *duration_send_return,
+    long *duration_recv_return)
+{
+    if (!hostname || !chunk || !n_chunks || !chunk_size ||
+        !duration_send_return ||
+        !duration_recv_return)
+        return false;
+
+    struct hostent* host = gethostbyname (hostname);
+    if (!host || host->h_addrtype != AF_INET || !host->h_addr_list[0])
+        return false;
+
+    // Copy the binary address directly; no text round trip is needed.
+    struct sockaddr_in addr;
+    memset (&addr, 0, sizeof (addr));
+    addr.sin_family = AF_INET;
+    memcpy (&addr.sin_addr, host->h_addr_list[0], sizeof (addr.sin_addr));
+    addr.sin_port = htons(network_port);
+
+    int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+    if (sock < 0)
+        return false;
+
+    if (connect (sock, (struct sockaddr*) &addr, sizeof (addr)))
+    {
+        close (sock);
+        return false;
+    }
+
+    //------------------------------------
+    // Start stopwatch just before the send.
+    // It will be stopped on receipt of
+    // the response.
+    //
+    unsigned long t0 = mytime ();
+
+    //------------------------------------
+    // Put # of chunks in the chunk.
+    // Send all of our data.
+    //
+    int header_len = snprintf (chunk, chunk_size, "%lu\n", n_chunks);
+    if (header_len < 0 || (unsigned long) header_len >= chunk_size) {
+        close (sock);
+        return false;
+    }
+
+    unsigned long i;
+    for (i = 0; i < n_chunks; i++) {
+        // send() may accept fewer bytes than requested; keep going until
+        // the whole chunk is out, and stop on the first error.
+        unsigned long sent = 0;
+        while (sent < chunk_size) {
+            long n = send (sock, chunk + sent, chunk_size - sent, 0);
+            if (n <= 0) {
+                close (sock);
+                return false;
+            }
+            sent += n;
+        }
+    }
+
+    unsigned long t1 = mytime ();
+
+    //------------------------------------
+    // Read the response. One byte is held back so that the text can be
+    // terminated before it is parsed.
+    //
+    long amount = recv (sock, chunk, chunk_size - 1, 0);
+    if (amount < 16) {
+        close (sock);
+        return false;
+    }
+    chunk [amount] = 0;
+
+    unsigned long duration_send = mytime() - t0;
+
+    //------------------------------------
+    // Validate the response, which
+    // contains the transponder's
+    // perceived read duration. This value
+    // may be as little as half our number.
+    // It is parsed for validation only.
+    //
+    unsigned long duration2 = -1;
+    if (strncmp ("OK: ", chunk, 4)) {
+        close (sock);
+        return false;
+    }
+    if (1 != sscanf (4+chunk, "%lu", &duration2)) {
+        close (sock);
+        return false;
+    }
+
+    //------------------------------------
+    // Drain the rest of the reply, never asking for more than is still
+    // outstanding so the counter cannot wrap below zero.
+    //
+    unsigned long remaining = chunk_size * n_chunks - amount;
+    while (remaining > 0) {
+        unsigned long want = remaining < chunk_size ? remaining : chunk_size;
+        long got = recv (sock, chunk, want, 0);
+        if (got <= 0) {
+            perror ("recv");
+            close (sock);
+            return false;
+        }
+        remaining -= got;
+    }
+
+    unsigned long duration_recv = mytime () - t1;
+
+    *duration_send_return = duration_send;
+    *duration_recv_return = duration_recv;
+
+    close (sock);
+    return true;
+}
+
+//----------------------------------------------------------------------------
+// Name:    ip_to_str
+// Purpose: Formats the low 32 bits of addr as dotted-decimal text, least
+//          significant byte first.
+// Params:  addr = address value.
+//          str  = destination; must have room for 16 bytes
+//                 ("255.255.255.255" plus the terminator). A null pointer
+//                 is ignored.
+//----------------------------------------------------------------------------
+void
+ip_to_str (unsigned long addr, char *str)
+{
+    if (!str)
+        return;
+
+    // unsigned int matches the %u conversions exactly.
+    unsigned int a = 0xff & addr;
+    unsigned int b = 0xff & (addr >> 8);
+    unsigned int c = 0xff & (addr >> 16);
+    unsigned int d = 0xff & (addr >> 24);
+    sprintf (str, "%u.%u.%u.%u", a,b,c,d);
+}
+
+//----------------------------------------------------------------------------
+// Name:    network_transponder
+// Purpose: Act as a transponder, receiving chunks of data and sending
+//          back an acknowledgement once the entire chunk is read.
+// Returns: False if a problem occurs setting up the network socket,
+//          accepting a connection or reading a request header.
+//          True once a client sends the chunk count -99, which asks the
+//          transponder to exit.
+// Note:    A request whose chunk count is zero or negative (other than
+//          -99) is dropped and the next connection is awaited.
+//----------------------------------------------------------------------------
+bool
+network_transponder ()
+{
+    struct sockaddr_in sin, from;
+
+    //------------------------------
+    // Get listening socket for port.
+    // Then listen on given port#.
+    //
+    memset (&sin, 0, sizeof (sin));
+    sin.sin_family = AF_INET;
+    sin.sin_addr.s_addr = htonl(INADDR_ANY);
+    sin.sin_port = htons(network_port);
+    int listensock;
+    if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror ("socket");
+        return false;
+    }
+    if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) {
+        perror ("bind");
+        close (listensock);
+        return false;
+    }
+    if (listen (listensock, 500) < 0) {
+        perror ("listen");
+        close (listensock);
+        return false;
+    }
+
+    // The loop is left only through the return statements below.
+    for (;;) {
+        //----------------------------------------
+        // Wait for a client to contact us.
+        //
+        socklen_t len = sizeof (struct sockaddr);
+        int sock = accept (listensock, (struct sockaddr*) &from, &len);
+        if (sock < 0) {
+            perror ("accept");
+            close (listensock);
+            return false;
+        }
+
+        //----------------------------------------
+        // Clockwatch starts when we accept the
+        // connection.
+        //
+        unsigned long t0 = mytime ();
+
+        if (len != sizeof (struct sockaddr_in)) {
+            close (sock);
+            close (listensock);
+            return false;
+        }
+
+        //----------------------------------------
+        // Read the first chunk only, in order to
+        // get the # of bytes that will be sent.
+        // A failed read must not be used as an index into the buffer.
+        //
+        char chunk [NETWORK_CHUNK_SIZE+1];
+        long n_chunks = 0;
+        int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+        if (amount_read < 0) {
+            perror ("read");
+            close (sock);
+            close (listensock);
+            return false;
+        }
+        chunk [amount_read] = 0;
+        if (1 != sscanf (chunk, "%ld", &n_chunks)) {
+            close (sock);
+            close (listensock);
+            return false;
+        }
+
+        //----------------------------------------
+        // If the leader sends us a chunk count of
+        // -99, this indicates that we should exit.
+        //
+        if (n_chunks == -99) {
+            close (sock);
+            close (listensock);
+            return true;
+        }
+
+        //----------------------------------------
+        // Any other non-positive count would make the byte counter below
+        // wrap around, so drop the request.
+        //
+        if (n_chunks <= 0) {
+            close (sock);
+            continue;
+        }
+
+        unsigned long long remaining = n_chunks;
+        remaining *= NETWORK_CHUNK_SIZE;
+        remaining -= amount_read;
+
+        //----------------------------------------
+        // Read the rest, never asking for more than is still outstanding,
+        // and only count bytes that were actually received.
+        //
+        while (remaining > 0) {
+            size_t want = NETWORK_CHUNK_SIZE;
+            if (remaining < want)
+                want = (size_t) remaining;
+
+            amount_read = read (sock, chunk, want);
+            if (amount_read < 0) {
+                perror ("read");
+                break;
+            }
+            if (!amount_read)
+                break;
+
+            remaining -= amount_read;
+        }
+
+        unsigned long duration = mytime() - t0;
+
+        //------------------------------------
+        // Send response of same size.
+        // Byte 14 is forced to a newline exactly as before; the reason is
+        // not evident from this function.
+        //
+        sprintf (chunk, "OK: %lu\n", duration);
+        chunk[14] = '\n';
+
+        //------------------------------------
+        // Send all of our data. send() may accept fewer bytes than
+        // requested, so each chunk is pushed until complete; on an error
+        // the reply is abandoned.
+        //
+        bool send_failed = false;
+        long i;
+        for (i = 0; i < n_chunks && !send_failed; i++) {
+            size_t sent = 0;
+            while (sent < NETWORK_CHUNK_SIZE) {
+                long n = send (sock, chunk + sent, NETWORK_CHUNK_SIZE - sent, 0);
+                if (n <= 0) {
+                    send_failed = true;
+                    break;
+                }
+                sent += n;
+            }
+        }
+
+        close (sock);
+    }
+
+    return true;
+}
+
+//----------------------------------------------------------------------------
+// Name:    network_test
+// Purpose: Measures send and receive bandwidth to each destination for a
+//          series of doubling transfer sizes, prints the rates and adds
+//          one solid (send) and one dashed (receive) line per destination
+//          to the global graph.
+// Params:  destinations   = host names of the transponders.
+//          n_destinations = number of entries in destinations.
+// Returns: Always true; a destination that fails is reported on stderr
+//          and its remaining sizes are skipped.
+//----------------------------------------------------------------------------
+bool
+network_test (char **destinations, int n_destinations)
+{
+    int i;
+
+    //----------------------------------------
+    // The memory chunk starts with a 12-byte
+    // length of the overall send size.
+    // The memory chunk will have a list of
+    // the destinations in it.
+    // In future, there will be a mechanism
+    // for testing bandwidth between all nodes,
+    // not just the leader & each of the
+    // transponders.
+    //
+    char chunk [NETWORK_CHUNK_SIZE];
+    memset (chunk, 0, NETWORK_CHUNK_SIZE);
+    snprintf (chunk, sizeof (chunk), "000000000000\n%d\n", n_destinations);
+    for (i = 0; i < n_destinations; i++) {
+        char *s = destinations [i];
+        size_t chunk_len = strlen (chunk);
+        size_t room = sizeof (chunk) - chunk_len;
+
+        // An entry needs the name, " transp\n" (8 characters) and the
+        // terminator; entries that do not fit are left out.
+        if (strlen (s) + 8 < room) {
+            //----------------------------------------
+            // "transp" indicates that the given node
+            // has not yet been a leader.
+            // In future, "done" will indicate it has.
+            //
+            snprintf (chunk + chunk_len, room, "%s %s\n", s, "transp");
+        }
+    }
+
+    static unsigned long colors [] = {
+        RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE,
+        RGB_BLACK, RGB_CORAL,
+        RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE,
+        RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW,
+        RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE,
+        RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET,
+        RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD
+    };
+#define NCOLORS (sizeof(colors)/sizeof(unsigned long))
+
+    //----------------------------------------
+    // For each destination, run the test.
+    //
+    for (i = 0; i < n_destinations; i++) {
+        bool problem = false;
+
+        char *hostname = destinations[i];
+        printf ("Bandwidth sending to %s:\n", hostname);
+
+        // Host names come from the command line; never let one overrun
+        // the title buffer.
+        char title [PATH_MAX];
+        snprintf (title, sizeof (title), "%s send (solid)", hostname);
+        BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY);
+
+        //----------------------------------------
+        // Cache the receive rates for later: one slot per size step,
+        // recv_ix counts the slots actually filled.
+        //
+        unsigned long recv_rates [NETSIZE_MAX - NETSIZE_MIN + 1];
+        int recv_ix = 0;
+
+        //----------------------------------------
+        // Send data of increasing sizes.
+        //
+        int j = NETSIZE_MIN;
+        int n_runs = 64;
+        while (!problem && j <= NETSIZE_MAX) {
+            unsigned long chunk_count = 1UL << (j-NETSIZE_MIN);
+            unsigned long long amt_to_send = chunk_count;
+            amt_to_send *= NETWORK_CHUNK_SIZE;
+
+            if (!amt_to_send) // unlikely
+                break;
+
+            //----------------------------------------
+            // Send the data; do this n_runs times.
+            //
+            unsigned long long total_duration_send = 0;
+            unsigned long long total_duration_recv = 0;
+
+            int k = n_runs;
+            while (k--) {
+                long duration_send, duration_recv;
+
+                if (! network_test_core (hostname,
+                    chunk, NETWORK_CHUNK_SIZE, chunk_count,
+                    &duration_send, &duration_recv))
+                {
+                    problem = true;
+                    fprintf (stderr, "\nCan't connect to %s\n", hostname);
+                    break;
+                }
+
+                total_duration_send += duration_send;
+                total_duration_recv += duration_recv;
+            }
+
+            if (problem)
+                break;
+
+            total_duration_send += n_runs/2; // Round up
+            total_duration_send /= n_runs; // Get average
+
+            total_duration_recv += n_runs/2; // Round up
+            total_duration_recv /= n_runs; // Get average
+
+            // An average below one microsecond would divide by zero below.
+            if (!total_duration_send)
+                total_duration_send = 1;
+            if (!total_duration_recv)
+                total_duration_recv = 1;
+
+            unsigned long amt_in_kb = amt_to_send / 1024;
+            unsigned long amt_in_mb = amt_to_send / 1048576;
+            if (!amt_in_mb) {
+                printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb,
+                    n_runs);
+            } else {
+                printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb,
+                    n_runs);
+            }
+
+            //------------------------------
+            // Calculate send rate in MB/sec.
+            //
+            // Get total # bytes.
+            unsigned long long tmp = NETWORK_CHUNK_SIZE;
+            tmp *= chunk_count;
+
+            // Get total bytes per second.
+            tmp *= 1000000;
+            tmp /= total_duration_send;
+
+            // Bytes to hundredths of a megabyte.
+            tmp /= 1000;
+            tmp /= 10;
+            unsigned long whole = tmp / 100;
+            unsigned long frac = tmp % 100;
+            printf ("%lu.%02lu MB/s (sent)\t", whole, frac);
+            fflush (stdout);
+
+            BMPGraphing_add_point (graph, amt_in_kb, tmp);
+
+            //------------------------------
+            // Calculate recv rate in MB/sec.
+            //
+            // Get total # bytes.
+            tmp = NETWORK_CHUNK_SIZE;
+            tmp *= chunk_count;
+
+            // Get total bytes per second.
+            tmp *= 1000000;
+            tmp /= total_duration_recv;
+
+            // Bytes to hundredths of a megabyte.
+            tmp /= 1000;
+            tmp /= 10;
+            whole = tmp / 100;
+            frac = tmp % 100;
+            printf ("%lu.%02lu MB/s (received)\n", whole, frac);
+
+            recv_rates [recv_ix++] = tmp;
+
+            j++;
+            n_runs >>= 1;
+            if (!n_runs)
+                n_runs = 1;
+        }
+
+        //----------------------------------------
+        // Now add the line for the receive rates. Only the sizes that
+        // were actually measured are plotted; after a failure the rest of
+        // recv_rates holds no data.
+        //
+        snprintf (title, sizeof (title), "%s receive (dashed)", hostname);
+        BMPGraphing_new_line (graph, title, DASHED |
+            (i < NCOLORS? colors[i] : RGB_GRAY));
+        for (j = 0; j < recv_ix; j++) {
+            unsigned long chunk_count = 1UL << j;
+            unsigned long long amt_to_send = chunk_count;
+            amt_to_send *= NETWORK_CHUNK_SIZE;
+            unsigned long amt_in_kb = amt_to_send / 1024;
+
+            BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j]);
+        }
+
+        puts ("");
+    }
+
+    return true;
+}
+
+//----------------------------------------------------------------------------
+// Name: usage
+// Purpose: Prints the command-line synopsis to stdout, then exits (status 0).
+//----------------------------------------------------------------------------
+void
+usage (void)
+{
+	printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n");
+	printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2>...] [--port <port#>]\n");
+	printf ("Usage for receiving network tests: bandwidth --transponder [--port <port#>]\n");
+	exit (0);	// Does not return to the caller.
+}
+
+//----------------------------------------------------------------------------
+// Name: main
+// Purpose: Parses the command line, then runs either network mode
+//          (--network as leader, --transponder otherwise) or the memory
+//          bandwidth tests. Returns 0 once the selected tests finish.
+//----------------------------------------------------------------------------
+int
+main (int argc, char **argv)
+{
+	int i, chunk_size;
+
+	--argc;
+	++argv;
+
+	bool network_mode = false;
+	bool network_leader = false; // false => transponder
+	enum { MAX_NETWORK_DESTINATIONS = 20 };
+	int n_network_destinations = 0;
+	char *network_destinations [MAX_NETWORK_DESTINATIONS];
+
+	char graph_title [512] = {0};
+
+	// Defaults; --nosse2 and --nosse4 below may clear these.
+	use_sse2 = true;
+	use_sse4 = true;
+
+	i = 0;
+	while (i < argc) {
+		char *s = argv [i++];
+
+		if (!strcmp ("--network", s)) {
+			network_mode = true;
+			network_leader = true;
+		}
+		else
+		if (!strcmp ("--transponder", s)) {
+			network_mode = true;
+		}
+		else
+		if (!strcmp ("--port", s)) {
+			if (i != argc)
+				network_port = atoi (argv[i++]);
+		}
+		else
+		if (!strcmp ("--slow", s)) {
+			usec_per_test = 20000000; // 20 seconds per test.
+		}
+		else
+		if (!strcmp ("--fast", s)) {
+			usec_per_test = 500000; // 0.5 seconds per test.
+		}
+		else
+		if (!strcmp ("--faster", s)) {
+			usec_per_test = 50000; // 0.05 seconds per test.
+		}
+		else
+		if (!strcmp ("--fastest", s)) {
+			usec_per_test = 5000; // 0.005 seconds per test.
+		}
+		else
+		if (!strcmp ("--nosse2", s)) {
+			use_sse2 = false;
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--nosse4", s)) {
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--help", s)) {
+			usage ();
+		}
+		else
+		if (!strcmp ("--title", s) && i != argc) {
+			snprintf (graph_title, sizeof graph_title, "%s", argv[i++]);
+		}
+		else {
+			if ('-' == *s)
+				usage ();
+			else if (n_network_destinations < MAX_NETWORK_DESTINATIONS)
+				network_destinations [n_network_destinations++] = s; // argv outlives main.
+			else
+				fprintf (stderr, "Ignoring extra destination %s\n", s);
+		}
+	}
+
+	msg[0] = 0;
+
+	for (i = 0; i < (int) (sizeof (chunk_sizes) / sizeof (chunk_sizes[0])) && chunk_sizes[i]; i++) {
+		chunk_sizes_log2[i] = log2 (chunk_sizes[i]);
+	}
+
+	printf ("This is bandwidth version %s.\n", RELEASE);
+	printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n");
+	printf ("This software is covered by the GNU Public License.\n");
+	printf ("It is provided AS-IS, use at your own risk.\n");
+	printf ("See the file COPYING for more information.\n\n");
+	fflush (stdout);
+
+	//----------------------------------------
+	// If network mode selected, enter it now.
+	// Currently cannot combine memory tests
+	// & network tests.
+	//
+	if (network_mode) {
+		if (network_leader) {
+			graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR);
+			if (!graph_title[0])	// Keep a title given with --title.
+				snprintf (graph_title, sizeof graph_title, "%s", TITLE_MEMORY_NET);
+			BMPGraphing_set_title (graph, graph_title);
+
+			network_test (network_destinations, n_network_destinations);
+
+			BMPGraphing_make (graph);
+
+			BMP_write (graph->image, "network_bandwidth.bmp");
+
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__)
+			puts ("Wrote graph to network_bandwidth.bmp.");
+			puts ("");
+			puts ("Done.");
+#endif
+			BMPGraphing_destroy (graph);
+		} else {
+			network_transponder ();
+		}
+
+		return 0;
+	}
+
+	uint32_t ecx = get_cpuid1_ecx ();
+	uint32_t edx = get_cpuid1_edx ();
+	cpu_has_mmx = edx & CPUID_EDX_MMX;
+	cpu_has_sse = edx & CPUID_EDX_SSE;
+	cpu_has_sse2 = edx & CPUID_EDX_SSE2;
+	cpu_has_sse3 = ecx & CPUID_ECX_SSE3;
+	cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3;
+	cpu_has_sse41 = ecx & CPUID_ECX_SSE41;
+	cpu_has_sse42 = ecx & CPUID_ECX_SSE42;
+	cpu_has_aes = ecx & CPUID_ECX_AES;
+	cpu_has_avx = ecx & CPUID_ECX_AVX;
+	cpu_has_avx2 = 0;
+
+	if (cpu_has_avx) {
+		cpu_has_avx2 = get_cpuid7_ebx ();
+		cpu_has_avx2 &= CPUID_EBX_AVX2;
+	}
+
+	cpu_has_sse4a = 0;
+	cpu_has_64bit = 0;
+	cpu_has_xd = 0;
+
+	static char family [17];
+	get_cpuid_family (family);
+	family [16] = 0;
+	printf ("CPU family: %s\n", family);
+
+	uint32_t ecx2 = get_cpuid_80000001_ecx ();
+	uint32_t edx2 = get_cpuid_80000001_edx ();
+
+	if (!strcmp ("AuthenticAMD", family)) {
+		is_amd = true;
+		cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A;
+	}
+	else
+	if (!strcmp ("GenuineIntel", family)) {
+		is_intel = true;
+	}
+
+	cpu_has_xd = edx2 & CPUID_EDX_XD;
+	cpu_has_64bit = edx2 & CPUID_EDX_INTEL64;
+
+	printf ("CPU features: ");
+	if (cpu_has_mmx) printf ("MMX ");
+	if (cpu_has_sse) printf ("SSE ");
+	if (cpu_has_sse2) printf ("SSE2 ");
+	if (cpu_has_sse3) printf ("SSE3 ");
+	if (cpu_has_ssse3) printf ("SSSE3 ");
+	if (cpu_has_sse4a) printf ("SSE4A ");
+	if (cpu_has_sse41) printf ("SSE4.1 ");
+	if (cpu_has_sse42) printf ("SSE4.2 ");
+	if (cpu_has_aes) printf ("AES ");
+	if (cpu_has_avx) printf ("AVX ");
+	if (cpu_has_avx2) printf ("AVX2 ");
+	if (cpu_has_xd) printf ("XD ");
+	if (cpu_has_64bit) {
+		if (!is_amd)
+			printf ("Intel64 ");
+		else
+			printf ("LongMode ");
+	}
+	puts ("\n");
+
+	if (is_intel) {
+		uint32_t cache_info[4];
+		i = 0;
+		while (1) {
+			get_cpuid_cache_info (cache_info, i);
+			if (!(cache_info[0] & 31))	// Low 5 bits zero: stop enumerating.
+				break;
+
+			printf ("Cache %d: ", i);
+			switch ((cache_info[0] >> 5) & 7) {
+			case 1: printf ("L1 "); break;
+			case 2: printf ("L2 "); break;
+			case 3: printf ("L3 "); break;
+			}
+			switch (cache_info[0] & 31) {
+			case 1: printf ("data cache, "); break;
+			case 2: printf ("instruction cache, "); break;
+			case 3: printf ("unified cache, "); break;
+			}
+			uint32_t n_ways = 1 + (cache_info[1] >> 22);
+			uint32_t line_size = 1 + (cache_info[1] & 2047);
+			uint32_t n_sets = 1 + cache_info[2];
+			printf ("line size %u, ", line_size);
+			printf ("%2u-way%s, ", n_ways, n_ways>1 ? "s" : "");
+			printf ("%5u sets, ", n_sets);
+			unsigned size = (n_ways * line_size * n_sets) >> 10;
+			printf ("size %uk ", size);
+			puts ("");
+			i++;
+		}
+	}
+
+	if (!cpu_has_sse41)
+		use_sse4 = false;
+	if (!cpu_has_sse2)
+		use_sse2 = false;
+
+	println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B.");
+
+	flush ();
+
+	//------------------------------------------------------------
+	// Attempt to obtain information about the CPU.
+	//
+#ifdef __linux__
+	struct stat st;
+	if (!stat ("/proc/cpuinfo", &st)) {
+#define TMPFILE "/tmp/bandw_tmp"
+		unlink (TMPFILE);	// NOTE(review): fixed, predictable path in /tmp.
+		if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE))
+			perror ("system");
+
+		FILE *f = fopen (TMPFILE, "r");
+		if (f) {
+			float cpu_speed = 0.0;
+
+			if (1 == fscanf (f, "%g", &cpu_speed)) {
+				puts ("");
+				printf ("CPU speed is %g MHz.\n", cpu_speed);
+			}
+			fclose (f);
+		}
+	} else {
+		printf ("CPU information is not available (/proc/cpuinfo).\n");
+	}
+	fflush (stdout);
+#endif
+
+	graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2);
+	if (!graph_title[0])	// Keep a title given with --title.
+		snprintf (graph_title, sizeof graph_title, "%s", TITLE_MEMORY_GRAPH);
+	BMPGraphing_set_title (graph, graph_title);
+
+	//------------------------------------------------------------
+	// SSE2 sequential reads.
+	//
+	if (use_sse2) {
+		BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, SSE2, false);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// AVX sequential reads.
+	//
+	if (cpu_has_avx) {
+		BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_read (chunk_size, AVX, false);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random reads.
+	//
+	if (use_sse2) {
+		BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_read (chunk_size, SSE2, true);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2, false);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// AVX sequential writes that do not bypass the caches.
+	//
+	if (cpu_has_avx) {
+		BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_write (chunk_size, AVX, false);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_write (chunk_size, SSE2, true);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE4 sequential reads that do bypass the caches.
+	//
+	if (use_sse4) {
+		BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, SSE2_BYPASS, false);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE4 random reads that do bypass the caches.
+	//
+	if (use_sse4) {
+		BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_read (chunk_size, SSE2_BYPASS, true);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE4 sequential writes that do bypass the caches.
+	//
+	if (use_sse4) {
+		BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2_BYPASS, false);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// AVX sequential writes that do bypass the caches.
+	// Currently on Intel CPUs (including Xeon) there is a
+	// microcode bug that leads to a severe drop in performance
+	// in this part of the test.
+	//
+	if (cpu_has_avx) {
+		BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_write (chunk_size, AVX_BYPASS, false);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE4 random writes that bypass the caches.
+	//
+	if (use_sse4) {
+		BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_write (chunk_size, SSE2_BYPASS, true);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 reads.
+	//
+	newline ();
+#ifdef __x86_64__
+	BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE);
+#else
+	BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE);
+#endif
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, NO_SSE2, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 reads.
+	//
+	newline ();
+#ifdef __x86_64__
+	BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN);
+#else
+	BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN);
+#endif
+	srand (time (NULL));
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		if (!(chunk_size & 128)) {
+			int amount = do_read (chunk_size, NO_SSE2, true);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 writes.
+	//
+#ifdef __x86_64__
+	BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN);
+#else
+	BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN);
+#endif
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_write (chunk_size, NO_SSE2, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 writes.
+	//
+#ifdef __x86_64__
+	BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN);
+#else
+	BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN);
+#endif
+
+	newline ();
+	srand (time (NULL));
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		if (!(chunk_size & 128)) {
+			int amount = do_write (chunk_size, NO_SSE2, true);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential copy.
+	//
+	if (use_sse2) {
+		BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_copy (chunk_size, SSE2);
+			BMPGraphing_add_point (graph, chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// AVX sequential copy.
+	//
+	if (cpu_has_avx) {
+		BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			if (!(chunk_size & 128)) {
+				int amount = do_copy (chunk_size, AVX);
+				BMPGraphing_add_point (graph, chunk_size, amount);
+			}
+		}
+	}
+
+#ifdef DOING_LODS
+#ifdef __x86_64__
+	//------------------------------------------------------------
+	// LODSQ 64-bit sequential reads.
+	//
+	BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6);
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, LODSQ, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+#endif
+
+	//------------------------------------------------------------
+	// LODSD 32-bit sequential reads.
+	//
+	BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8);
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, LODSD, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// LODSW 16-bit sequential reads.
+	//
+	BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10);
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, LODSW, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// LODSB 8-bit sequential reads.
+	//
+	BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12);
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, LODSB, false);
+		BMPGraphing_add_point (graph, chunk_size, amount);
+	}
+#endif
+
+	//------------------------------------------------------------
+	// Register to register.
+	//
+	newline ();
+	register_test ();
+
+	//------------------------------------------------------------
+	// Stack to/from register.
+	//
+	newline ();
+	stack_test ();
+
+	//------------------------------------------------------------
+	// C library performance.
+	//
+	newline ();
+	library_test ();
+
+	//------------------------------------------------------------
+	// Framebuffer read & write.
+	//
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+	newline ();
+	fb_readwrite (true);
+#endif
+
+	flush ();
+
+	BMPGraphing_make (graph);
+
+	BMP_write (graph->image, "bandwidth.bmp");
+
+	puts ("\nWrote graph to bandwidth.bmp.");
+	puts ("");
+	puts ("Done.");
+
+	BMPGraphing_destroy (graph);
+
+	return 0;
+}
diff --git a/minifont.c b/minifont.c
new file mode 100755
index 0000000..8aa939c
--- /dev/null
+++ b/minifont.c
@@ -0,0 +1,845 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+#include <stdio.h>
+
+#include "BMP.h"
+
+// Mini-font glyph rows: 8 strings per glyph, pixels drawn with '#'; 90 glyphs, apparently ASCII '!' to 'z'.
+static const char *mini_chars_ [] =
+{
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ " ",
+ "#",
+ "",
+
+ "## ##",
+ " # #",
+ "# #",
+ " ",
+ " ",
+ " ",
+ " ",
+ "",
+
+ " # # ",
+ " # # ",
+ "#####",
+ " # # ",
+ "#####",
+ " # # ",
+ " # # ",
+ "",
+
+ " # ",
+ " ####",
+ "# # ",
+ " ### ",
+ " # #",
+ "####",
+ " # ",
+ "",
+
+ "## #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "#",
+ "# ##",
+ "",
+
+ " # ",
+ "# # ",
+ "## ",
+ " ## #",
+ "# ## ",
+ "# # ",
+ " ## #",
+ "",
+
+ "##",
+ " #",
+ "#",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " #",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ " #",
+
+ "# ",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "#",
+
+ " ",
+ "# # #",
+ " ###",
+ " #",
+ " ###",
+ "# # #",
+ "",
+ "",
+
+ " ",
+ " #",
+ " #",
+ "#####",
+ " #",
+ " #",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "##",
+ " #",
+ "#",
+
+ " ",
+ "",
+ "",
+ "#####",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "#",
+ "",
+
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "#",
+ "#",
+ "",
+
+ " ## ",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " ## ",
+ "",
+
+ " #",
+ "##",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "",
+
+ " ## ",
+ "# #",
+ " #",
+ " ###",
+ "# ",
+ "# ",
+ "####",
+ "",
+
+ "####",
+ " #",
+ " # ",
+ " ## ",
+ " #",
+ "# #",
+ " ## ",
+ "",
+
+ "# # ",
+ "# #",
+ "# #",
+ "####",
+ " #",
+ " #",
+ " #",
+ "",
+
+ "####",
+ "# ",
+ "### ",
+ " #",
+ " #",
+ "# #",
+ " ## ",
+ "",
+
+ " ## ",
+ "# ",
+ "# ",
+ "### ",
+ "# #",
+ "# #",
+ " ## ",
+ "",
+
+ "####",
+ " #",
+ " #",
+ " # ",
+ " # ",
+ " # ",
+ " # ",
+ "",
+
+ " ## ",
+ "# #",
+ "# #",
+ " ## ",
+ "# #",
+ "# #",
+ " ## ",
+ "",
+
+ " ## ",
+ "# #",
+ "# #",
+ " ###",
+ " #",
+ " # ",
+ " # ",
+ "",
+
+ " ",
+ "",
+ "",
+ "#",
+ "",
+ "#",
+ "",
+ "",
+
+ " ",
+ "",
+ " ",
+ "##",
+ " ",
+ "##",
+ " #",
+ "#",
+
+ " #",
+ " #",
+ " #",
+ "#",
+ " #",
+ " #",
+ " #",
+ "",
+
+ " ",
+ "",
+ "",
+ "#####",
+ " ",
+ "#####",
+ "",
+ "",
+
+ "# ",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "#",
+ "",
+
+ " ### ",
+ "# #",
+ " #",
+ " ## ",
+ " #",
+ "",
+ " #",
+ "",
+
+ " ### ",
+ "# #",
+ "# ##",
+ "# # #",
+ "# ##",
+ "# ",
+ " ###",
+ "",
+
+ " # ",
+ " # # ",
+ "# #",
+ "# #",
+ "#####",
+ "# #",
+ "# #",
+ "",
+
+ "#### ",
+ "# #",
+ "# #",
+ "#### ",
+ "# #",
+ "# #",
+ "####",
+ "",
+
+ " ### ",
+ "# #",
+ "# ",
+ "# ",
+ "# ",
+ "# #",
+ " ###",
+ "",
+
+ "#### ",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "####",
+ "",
+
+ "#####",
+ "#",
+ "#",
+ "###",
+ "#",
+ "#",
+ "#####",
+ "",
+
+ "#####",
+ "# ",
+ "# ",
+ "###",
+ "# ",
+ "# ",
+ "#",
+ "",
+
+ " ### ",
+ "# #",
+ "# ",
+ "# ##",
+ "# #",
+ "# #",
+ " ####",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ "#####",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ "###",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "###",
+ "",
+
+ " ###",
+ " #",
+ " #",
+ " #",
+ " #",
+ "# #",
+ " ##",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ "##",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ "# ",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#####",
+ "",
+
+ "# #",
+ "## ##",
+ "# # #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ "# #",
+ "## #",
+ "# # #",
+ "# ##",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ " ### ",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " ###",
+ "",
+
+ "#### ",
+ "# #",
+ "# #",
+ "#### ",
+ "# ",
+ "# ",
+ "# ",
+ "",
+
+ " ### ",
+ "# #",
+ "# #",
+ "# #",
+ "# # #",
+ "# # ",
+ " ## #",
+ "",
+
+ "#### ",
+ "# #",
+ "# #",
+ "#### ",
+ "# # ",
+ "# # ",
+ "# #",
+ "",
+
+ " ### ",
+ "# #",
+ "# ",
+ " ### ",
+ " #",
+ "# #",
+ " ###",
+ "",
+
+ "#####",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " ###",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " # # ",
+ " #",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ "# # #",
+ "# # #",
+ "## ##",
+ "# #",
+ "",
+
+ "# #",
+ "# #",
+ " # #",
+ " #",
+ " # #",
+ "# #",
+ "# #",
+ "",
+
+ "# #",
+ "# #",
+ "# #",
+ " # #",
+ " #",
+ " #",
+ " #",
+ "",
+
+ "#####",
+ " #",
+ " #",
+ " #",
+ " #",
+ "#",
+ "#####",
+ "",
+
+ "##",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ "#",
+ "##",
+
+ "# ",
+ "#",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "",
+
+ "##",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ " #",
+ "##",
+
+ " # ",
+ " # #",
+ "# #",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "####",
+
+ "##",
+ "#",
+ " #",
+ "",
+ "",
+ "",
+ "",
+ "",
+
+ " ",
+ " ",
+ " ## ",
+ " #",
+ " ###",
+ "# #",
+ " ###",
+ "",
+
+ "# ",
+ "# ",
+ "### ",
+ "# #",
+ "# #",
+ "# #",
+ "### ",
+ "",
+
+ " ",
+ " ",
+ " ###",
+ "# ",
+ "# ",
+ "# ",
+ " ###",
+ "",
+
+ " #",
+ " #",
+ " ###",
+ "# #",
+ "# #",
+ "# #",
+ " ###",
+ "",
+
+ " ",
+ " ",
+ " ## ",
+ "# #",
+ "####",
+ "# ",
+ " ###",
+ "",
+
+ " ##",
+ " # ",
+ "### ",
+ " # ",
+ " # ",
+ " # ",
+ "### ",
+ "",
+
+ " ",
+ " ",
+ " ###",
+ "# #",
+ "# #",
+ " ###",
+ " #",
+ "### ",
+
+ "# ",
+ "# ",
+ "### ",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ " # ",
+ " ",
+ "## ",
+ " # ",
+ " # ",
+ " # ",
+ "###",
+ "",
+
+ " #",
+ " ",
+ " ##",
+ " #",
+ " #",
+ " #",
+ " #",
+ "## ",
+
+ "# ",
+ "# ",
+ "# #",
+ "# # ",
+ "## ",
+ "# # ",
+ "# #",
+ "",
+
+ "## ",
+ " # ",
+ " # ",
+ " # ",
+ " # ",
+ " # ",
+ "###",
+ "",
+
+ " ",
+ "",
+ "####",
+ "# # #",
+ "# # #",
+ "# # #",
+ "# # #",
+ "",
+
+ " ",
+ " ",
+ "###",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ "",
+
+ " ",
+ " ",
+ " ## ",
+ "# #",
+ "# #",
+ "# #",
+ " ## ",
+ "",
+
+ " ",
+ "",
+ "###",
+ "# #",
+ "# #",
+ "###",
+ "#",
+ "#",
+
+ " ",
+ "",
+ " ###",
+ "# #",
+ "# #",
+ " ###",
+ " #",
+ " # ",
+
+ " ",
+ " ",
+ "# ##",
+ "## ",
+ "# ",
+ "# ",
+ "# ",
+ "",
+
+ " ",
+ " ",
+ " ###",
+ "# ",
+ " ##",
+ " #",
+ "### ",
+ "",
+
+ " # ",
+ " #",
+ "###",
+ " #",
+ " #",
+ " #",
+ " ##",
+ "",
+
+ " ",
+ "",
+ "# #",
+ "# #",
+ "# #",
+ "# #",
+ " ###",
+ "",
+
+ " ",
+ "",
+ "# #",
+ "# #",
+ "# #",
+ " # #",
+ " #",
+ "",
+
+ " ",
+ "",
+ "# # #",
+ "# # #",
+ "# # #",
+ "# # #",
+ " # #",
+ "",
+
+ " ",
+ "",
+ "# #",
+ " # #",
+ " #",
+ " # #",
+ "# #",
+ "",
+
+ " ",
+ " ",
+ "# #",
+ "# #",
+ "# #",
+ " ###",
+ " #",
+ "### ",
+
+ " ",
+ "",
+ "#####",
+ " #",
+ " #",
+ " # ",
+ "#####",
+ "",
+
+};
+
+const char **get_minifont_chars (void)
+{	// Returns the file-local glyph row table mini_chars_.
+	return mini_chars_;
+}
+
diff --git a/minifont.h b/minifont.h
new file mode 100755
index 0000000..a26edb9
--- /dev/null
+++ b/minifont.h
@@ -0,0 +1,28 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *============================================================================*/
+
+#ifndef MINIFONT_H
+#define MINIFONT_H
+// Returns the mini-font glyph table: an array of C strings, one per pixel row.
+extern const char **get_minifont_chars (void);
+
+#endif // MINIFONT_H
+
diff --git a/output/._Celeron-2.8GHz-slow.gif b/output/._Celeron-2.8GHz-slow.gif
new file mode 100755
index 0000000..bdd6833
--- /dev/null
+++ b/output/._Celeron-2.8GHz-slow.gif
Binary files differ
diff --git a/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png
new file mode 100755
index 0000000..826edc9
--- /dev/null
+++ b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png
Binary files differ
diff --git a/output/._Corei5-520M-MacOSXLion-32bit-slow.gif b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif
new file mode 100755
index 0000000..6fdd000
--- /dev/null
+++ b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif
Binary files differ
diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif
new file mode 100755
index 0000000..83d30f5
--- /dev/null
+++ b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif
Binary files differ
diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif
new file mode 100755
index 0000000..5ffb9aa
--- /dev/null
+++ b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif
Binary files differ
diff --git a/output/Celeron-2.8GHz-slow.gif b/output/Celeron-2.8GHz-slow.gif
new file mode 100755
index 0000000..6d89c32
--- /dev/null
+++ b/output/Celeron-2.8GHz-slow.gif
Binary files differ
diff --git a/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png
new file mode 100755
index 0000000..d8d268e
--- /dev/null
+++ b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png
Binary files differ
diff --git a/output/Corei5-520M-MacOSXLion-32bit-slow.gif b/output/Corei5-520M-MacOSXLion-32bit-slow.gif
new file mode 100755
index 0000000..364adf7
--- /dev/null
+++ b/output/Corei5-520M-MacOSXLion-32bit-slow.gif
Binary files differ
diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif
new file mode 100755
index 0000000..4ce6d5d
--- /dev/null
+++ b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif
Binary files differ
diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif
new file mode 100755
index 0000000..d38a120
--- /dev/null
+++ b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif
Binary files differ
diff --git a/routines32.asm b/routines32.asm
new file mode 100755
index 0000000..44015d9
--- /dev/null
+++ b/routines32.asm
@@ -0,0 +1,2960 @@
+;============================================================================
+; bandwidth 0.32, a benchmark to estimate memory transfer bandwidth.
+; Copyright (C) 2005-2014 by Zack T Smith.
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+;
+; The author may be reached at veritas@comcast.net.
+;=============================================================================
+
+; Generate 32-bit code; the routines below read their arguments from the
+; stack ([esp+N]).
+bits 32
+; NOTE(review): "cpu ia64" presumably selects NASM's least restrictive
+; instruction set so the SSE/AVX mnemonics below assemble -- confirm against
+; the NASM manual.
+cpu ia64
+
+global ReaderLODSQ
+global _ReaderLODSQ
+
+global ReaderLODSD
+global _ReaderLODSD
+
+global ReaderLODSW
+global _ReaderLODSW
+
+global ReaderLODSB
+global _ReaderLODSB
+
+; Cygwin requires the underbar-prefixed symbols.
+global _WriterSSE2
+global WriterSSE2
+
+global _WriterAVX
+global WriterAVX
+
+global _WriterSSE2_128bytes
+global WriterSSE2_128bytes
+
+global _ReaderAVX
+global ReaderAVX
+
+global _ReaderSSE2
+global ReaderSSE2
+
+global ReaderSSE2_bypass
+global _ReaderSSE2_bypass
+
+global _ReaderSSE2_128bytes
+global ReaderSSE2_128bytes
+
+global ReaderSSE2_128bytes_bypass
+global _ReaderSSE2_128bytes_bypass
+
+global _RandomReaderSSE2
+global RandomReaderSSE2
+
+global _RandomReaderSSE2_bypass
+global RandomReaderSSE2_bypass
+
+global WriterAVX_bypass
+global _WriterAVX_bypass
+
+global _WriterSSE2_bypass
+global WriterSSE2_bypass
+
+global _WriterSSE2_128bytes_bypass
+global WriterSSE2_128bytes_bypass
+
+global _RandomWriterSSE2_bypass
+global RandomWriterSSE2_bypass
+
+global Reader
+global _Reader
+
+global Writer
+global _Writer
+
+global Reader_128bytes
+global _Reader_128bytes
+
+global Writer_128bytes
+global _Writer_128bytes
+
+global RandomReader
+global _RandomReader
+
+global RandomWriter
+global _RandomWriter
+
+global RandomWriterSSE2
+global _RandomWriterSSE2
+
+global get_cpuid_family
+global _get_cpuid_family
+
+global get_cpuid_cache_info
+global _get_cpuid_cache_info
+
+global get_cpuid1_ecx
+global _get_cpuid1_ecx
+
+global get_cpuid1_edx
+global _get_cpuid1_edx
+
+global get_cpuid7_ebx
+global _get_cpuid7_ebx
+
+global get_cpuid_80000001_ecx
+global _get_cpuid_80000001_ecx
+
+global get_cpuid_80000001_edx
+global _get_cpuid_80000001_edx
+
+global CopySSE
+global _CopySSE
+
+global CopyAVX
+global _CopyAVX
+
+global CopySSE_128bytes
+global _CopySSE_128bytes
+
+global RegisterToRegister
+global _RegisterToRegister
+
+global VectorToVector
+global _VectorToVector
+
+global VectorToVectorAVX
+global _VectorToVectorAVX
+
+global RegisterToVector
+global _RegisterToVector
+
+global VectorToRegister
+global _VectorToRegister
+
+global Register8ToVector
+global Register16ToVector
+global Register32ToVector
+global Register64ToVector
+global Vector8ToRegister
+global Vector16ToRegister
+global Vector32ToRegister
+global Vector64ToRegister
+
+global _Register8ToVector
+global _Register16ToVector
+global _Register32ToVector
+global _Register64ToVector
+global _Vector8ToRegister
+global _Vector16ToRegister
+global _Vector32ToRegister
+global _Vector64ToRegister
+
+global StackReader
+global _StackReader
+
+global StackWriter
+global _StackWriter
+
+ section .text
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSQ
+; Purpose: Stub. The symbol is exported, but in this 32-bit file the routine
+; returns immediately without reading memory or touching registers
+; (LODSQ is a 64-bit-only instruction).
+; Params: None are read here.
+; NOTE(review): the rdi/rsi/rdx parameter list previously
+; documented here presumably describes the 64-bit counterpart.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSQ:
+_ReaderLODSQ:
+ ; N/A
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSD
+; Purpose: Reads 32-bit values sequentially from an area of memory
+; using the REP LODSD instruction.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes (converted in place to a dword count)
+; [esp+12] = loops (tested after the decrement, so pass at least 1)
+; Clobbers: eax (loaded by LODSD). esi, ecx and edx are preserved.
+; Notes: LODSD walks upward only when the direction flag is clear; the
+; routine relies on the caller's ABI for that.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSD:
+_ReaderLODSD:
+ shr dword [esp+8], 2 ; length in double words rounded down.
+
+ push esi ; LODSD source pointer; callee-saved, so it must be preserved.
+ push ecx ; REP counter
+ push edx ; outer loop counter
+
+ mov edx, [esp+12+12] ; the three pushes moved the arguments up by 12 bytes.
+.L1:
+ mov esi, [esp+4+12] ; restart at the beginning of the area.
+ mov ecx, [esp+8+12] ; number of dwords to load.
+
+ rep lodsd
+
+ dec edx
+ jnz .L1
+
+ pop edx
+ pop ecx
+ pop esi
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSW
+; Purpose: Reads 16-bit values sequentially from an area of memory
+; using the REP LODSW instruction.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes (converted in place to a word count)
+; [esp+12] = loops (tested after the decrement, so pass at least 1)
+; Clobbers: eax (loaded by LODSW). esi, ecx and edx are preserved.
+; Notes: LODSW walks upward only when the direction flag is clear; the
+; routine relies on the caller's ABI for that.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSW:
+_ReaderLODSW:
+ shr dword [esp+8], 1 ; length in words rounded down.
+
+ push esi ; LODSW source pointer; callee-saved, so it must be preserved.
+ push ecx ; REP counter
+ push edx ; outer loop counter
+
+ mov edx, [esp+12+12] ; the three pushes moved the arguments up by 12 bytes.
+.L1:
+ mov esi, [esp+4+12] ; restart at the beginning of the area.
+ mov ecx, [esp+8+12] ; number of words to load.
+
+ rep lodsw
+
+ dec edx
+ jnz .L1
+
+ pop edx
+ pop ecx
+ pop esi
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSB
+; Purpose: Reads 8-bit values sequentially from an area of memory
+; using the REP LODSB instruction.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops (tested after the decrement, so pass at least 1)
+; Clobbers: eax (loaded by LODSB). esi, ecx and edx are preserved.
+; Notes: LODSB walks upward only when the direction flag is clear; the
+; routine relies on the caller's ABI for that.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSB:
+_ReaderLODSB:
+ push esi ; LODSB source pointer; callee-saved, so it must be preserved.
+ push ecx ; REP counter
+ push edx ; outer loop counter
+
+ mov edx, [esp+12+12] ; the three pushes moved the arguments up by 12 bytes.
+.L1:
+ mov esi, [esp+4+12] ; restart at the beginning of the area.
+ mov ecx, [esp+8+12] ; number of bytes to load.
+
+ rep lodsb
+
+ dec edx
+ jnz .L1
+
+ pop edx
+ pop ecx
+ pop esi
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Reader
+; Purpose: Sequential 32-bit loads over an area of memory. Every inner
+; iteration issues 64 dword loads covering 256 consecutive bytes.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: Both loops test their condition after the body has run.
+; The unrolled loads at offsets 4, 8, ..., 252 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+Reader:
+_Reader:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; ecx = passes remaining.
+
+ mov edx, [esp+4+12] ; edx = start of area.
+ mov ebx, edx
+ add ebx, [esp+8+12] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov edx, [esp+4+12] ; rewind the read cursor.
+
+.next_block:
+ mov eax, [edx] ; offset 0 is written out so it stays displacement-free.
+%assign reader_disp 4
+%rep 63
+ mov eax, [edx+reader_disp]
+%assign reader_disp reader_disp+4
+%endrep
+
+ add edx, 256
+ cmp edx, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Writer
+; Purpose: Sequential 32-bit stores over an area of memory. Every inner
+; iteration issues 64 dword stores covering 256 consecutive bytes.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = long to write
+; Notes: Both loops test their condition after the body has run.
+; The unrolled stores at offsets 4, 8, ..., 252 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+Writer:
+_Writer:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; ecx = passes remaining.
+ mov eax, [esp+16+12] ; eax = value stored everywhere.
+
+ mov edx, [esp+4+12] ; edx = start of area.
+ mov ebx, edx
+ add ebx, [esp+8+12] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov edx, [esp+4+12] ; rewind the write cursor.
+
+.next_block:
+ mov [edx], eax ; offset 0 is written out so it stays displacement-free.
+%assign writer_disp 4
+%rep 63
+ mov [edx+writer_disp], eax
+%assign writer_disp writer_disp+4
+%endrep
+
+ add edx, 256
+ cmp edx, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Reader_128bytes
+; Purpose: Sequential 32-bit loads over an area of memory. Every inner
+; iteration issues 32 dword loads covering 128 consecutive bytes.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: Both loops test their condition after the body has run.
+; The unrolled loads at offsets 4, 8, ..., 124 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+Reader_128bytes:
+_Reader_128bytes:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; ecx = passes remaining.
+
+ mov edx, [esp+4+12] ; edx = start of area.
+ mov ebx, edx
+ add ebx, [esp+8+12] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov edx, [esp+4+12] ; rewind the read cursor.
+
+.next_block:
+ mov eax, [edx] ; offset 0 is written out so it stays displacement-free.
+%assign reader128_disp 4
+%rep 31
+ mov eax, [edx+reader128_disp]
+%assign reader128_disp reader128_disp+4
+%endrep
+
+ add edx, 128
+ cmp edx, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Writer_128bytes
+; Purpose: Sequential 32-bit stores over an area of memory. Every inner
+; iteration issues 32 dword stores covering 128 consecutive bytes.
+; Params:
+; [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = long to write
+; Notes: Both loops test their condition after the body has run.
+; The unrolled stores at offsets 4, 8, ..., 124 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+Writer_128bytes:
+_Writer_128bytes:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; ecx = passes remaining.
+ mov eax, [esp+16+12] ; eax = value stored everywhere.
+
+ mov edx, [esp+4+12] ; edx = start of area.
+ mov ebx, edx
+ add ebx, [esp+8+12] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov edx, [esp+4+12] ; rewind the write cursor.
+
+.next_block:
+ mov [edx], eax ; offset 0 is written out so it stays displacement-free.
+%assign writer128_disp 4
+%rep 31
+ mov [edx+writer128_disp], eax
+%assign writer128_disp writer128_disp+4
+%endrep
+
+ add edx, 128
+ cmp edx, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_cache_info
+; Purpose: Executes CPUID with eax=4 and ecx taken from the second argument,
+; then stores the four result registers into the caller's array.
+; Params:
+; [esp+4] = ptr to four dwords receiving eax, ebx, ecx, edx (in that order)
+; [esp+8] = value loaded into ecx before CPUID (the sub-leaf index)
+; Returns: eax is left holding the destination pointer.
+; Notes: ebp, ebx, ecx and edx are saved and restored.
+;
+get_cpuid_cache_info:
+_get_cpuid_cache_info:
+ push ebp
+ push ebx
+ push ecx
+ push edx
+ mov eax, 4
+ mov ecx, [esp + 16 + 4 + 4] ; second argument; 16 bytes of saved registers sit below the return address.
+ cpuid
+ mov ebp, eax ; park the eax result, since eax is needed for the pointer.
+ mov eax, [esp + 16 + 4] ; first argument: destination array.
+ mov [eax], ebp
+ mov [eax+4], ebx
+ mov [eax+8], ecx
+ mov [eax+12], edx
+ pop edx
+ pop ecx
+ pop ebx
+ pop ebp
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_family
+; Purpose: Executes CPUID with eax=0 and copies ebx, edx, ecx (in that order)
+; into the caller's buffer, followed by a zero byte. For leaf 0
+; those three registers hold the 12-character vendor string, so
+; despite its name the routine yields a NUL-terminated vendor ID.
+; Params:
+; [esp+4] = ptr to a buffer of at least 13 bytes
+; Returns: eax is left holding the buffer pointer.
+; Notes: ebx, ecx and edx are saved and restored.
+;
+get_cpuid_family:
+_get_cpuid_family:
+ push ebx
+ push ecx
+ push edx
+ xor eax, eax ; leaf 0
+ cpuid
+ mov eax, [esp + 12 + 4] ; first argument, past 12 bytes of saved registers.
+ mov [eax], ebx
+ mov [eax+4], edx
+ mov [eax+8], ecx
+ mov byte [eax+12], 0 ; terminate the string.
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_ecx
+; Purpose: Executes CPUID with eax=1 and returns the ECX result.
+; Params: none
+; Returns: eax = ECX output of CPUID leaf 1
+; Notes: ebx, ecx and edx are saved and restored around CPUID.
+;
+get_cpuid1_ecx:
+_get_cpuid1_ecx:
+ push ebx
+ push ecx
+ push edx
+ mov eax, 1
+ cpuid
+ mov eax, ecx ; return value
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid7_ebx
+; Purpose: Returns the EBX result of CPUID leaf 7, sub-leaf 0.
+; Params: none
+; Returns: eax = EBX output of CPUID(eax=7, ecx=0), or 0 when the processor
+; reports a highest basic leaf below 7.
+; Notes: Leaf 0 is queried first because asking for a leaf above the
+; supported maximum can return data belonging to a different leaf.
+; ebx, ecx and edx are saved and restored.
+;
+get_cpuid7_ebx:
+_get_cpuid7_ebx:
+ push ebx
+ push ecx
+ push edx
+ xor eax, eax
+ cpuid ; eax = highest supported basic leaf.
+ cmp eax, 7
+ jb .unsupported
+ mov eax, 7
+ xor ecx, ecx ; sub-leaf 0
+ cpuid
+ mov eax, ebx ; return value
+ jmp .done
+.unsupported:
+ xor eax, eax ; report no leaf-7 features.
+.done:
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_ecx
+; Purpose: Returns the ECX result of CPUID extended leaf 0x80000001.
+; Params: none
+; Returns: eax = ECX output of CPUID(eax=0x80000001), or 0 when the processor
+; reports a highest extended leaf below 0x80000001.
+; Notes: Leaf 0x80000000 is queried first because asking for a leaf above
+; the supported maximum can return data belonging to a different
+; leaf. ebx, ecx and edx are saved and restored.
+;
+get_cpuid_80000001_ecx:
+_get_cpuid_80000001_ecx:
+ push ebx
+ push ecx
+ push edx
+ mov eax, 0x80000000
+ cpuid ; eax = highest supported extended leaf.
+ cmp eax, 0x80000001 ; unsigned compare also rejects small basic-leaf values.
+ jb .unsupported
+ mov eax, 0x80000001
+ cpuid
+ mov eax, ecx ; return value
+ jmp .done
+.unsupported:
+ xor eax, eax ; report no extended features.
+.done:
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_edx
+; Purpose: Returns the EDX result of CPUID extended leaf 0x80000001.
+; Params: none
+; Returns: eax = EDX output of CPUID(eax=0x80000001), or 0 when the processor
+; reports a highest extended leaf below 0x80000001.
+; Notes: Leaf 0x80000000 is queried first because asking for a leaf above
+; the supported maximum can return data belonging to a different
+; leaf. ebx, ecx and edx are saved and restored.
+;
+get_cpuid_80000001_edx:
+_get_cpuid_80000001_edx:
+ push ebx
+ push ecx
+ push edx
+ mov eax, 0x80000000
+ cpuid ; eax = highest supported extended leaf.
+ cmp eax, 0x80000001 ; unsigned compare also rejects small basic-leaf values.
+ jb .unsupported
+ mov eax, 0x80000001
+ cpuid
+ mov eax, edx ; return value
+ jmp .done
+.unsupported:
+ xor eax, eax ; report no extended features.
+.done:
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_edx
+; Purpose: Executes CPUID with eax=1 and returns the EDX result.
+; Params: none
+; Returns: eax = EDX output of CPUID leaf 1
+; Notes: ebx, ecx and edx are saved and restored around CPUID.
+;
+get_cpuid1_edx:
+_get_cpuid1_edx:
+ push ebx
+ push ecx
+ push edx
+ mov eax, 1
+ cpuid
+ mov eax, edx ; return value
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderAVX
+; Purpose: Reads 128-bit values from an area of memory using VEX-encoded
+; loads into xmm0, one load per 32-byte slot, advancing 256 bytes
+; per inner iteration.
+; Params: [esp+4] = ptr to memory area
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; NOTE(review): because the loads target xmm0 at 32-byte strides, only the
+; low 16 bytes of each 32-byte slot are read (128 of every 256 bytes).
+; A 256-bit ymm0 load was probably intended; that form would require a
+; 32-byte-aligned buffer -- confirm with the caller before changing.
+;------------------------------------------------------------------------------
+ align 64
+ReaderAVX:
+_ReaderAVX:
+ vzeroupper
+
+ push ebx
+ push ecx
+
+ mov ecx, [esp+12+8] ; loops; two pushes moved the arguments up by 8 bytes.
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ vmovdqa xmm0, [eax] ; Read aligned @ 16-byte boundary.
+ vmovdqa xmm0, [32+eax]
+ vmovdqa xmm0, [64+eax]
+ vmovdqa xmm0, [96+eax]
+ vmovdqa xmm0, [128+eax]
+ vmovdqa xmm0, [160+eax]
+ vmovdqa xmm0, [192+eax]
+ vmovdqa xmm0, [224+eax]
+
+ add eax, 256
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2
+; Purpose: Sequential aligned 128-bit loads (movdqa) over an area of memory,
+; 16 loads covering 256 consecutive bytes per inner iteration.
+; Params: [esp+4] = ptr to memory area (movdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: Both loops test their condition after the body has run.
+; The unrolled loads at offsets 16, 32, ..., 240 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2:
+_ReaderSSE2:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+12+8] ; ecx = passes remaining.
+
+ mov eax, [esp+4+8] ; eax = start of area.
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov eax, [esp+4+8] ; rewind the read cursor.
+
+.next_block:
+ movdqa xmm0, [eax] ; offset 0 is written out so it stays displacement-free.
+%assign rdsse2_disp 16
+%rep 15
+ movdqa xmm0, [eax+rdsse2_disp]
+%assign rdsse2_disp rdsse2_disp+16
+%endrep
+
+ add eax, 256
+ cmp eax, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_bypass
+; Purpose: Sequential 128-bit non-temporal loads (movntdqa) over an area of
+; memory, 16 loads covering 256 consecutive bytes per inner
+; iteration.
+; Params: [esp+4] = ptr to memory area (movntdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: movntdqa is an SSE4.1 instruction, despite the routine's name.
+; Both loops test their condition after the body has run.
+; The unrolled loads at offsets 16, 32, ..., 240 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_bypass:
+_ReaderSSE2_bypass:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+12+8] ; ecx = passes remaining.
+
+ mov eax, [esp+4+8] ; eax = start of area.
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov eax, [esp+4+8] ; rewind the read cursor.
+
+.next_block:
+ movntdqa xmm0, [eax] ; offset 0 is written out so it stays displacement-free.
+%assign rdnt_disp 16
+%rep 15
+ movntdqa xmm0, [eax+rdnt_disp]
+%assign rdnt_disp rdnt_disp+16
+%endrep
+
+ add eax, 256
+ cmp eax, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_128bytes_bypass
+; Purpose: Sequential 128-bit non-temporal loads (movntdqa) over an area of
+; memory, 8 loads covering 128 consecutive bytes per inner
+; iteration.
+; Params: [esp+4] = ptr to memory area (movntdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: movntdqa is an SSE4.1 instruction, despite the routine's name.
+; Both loops test their condition after the body has run.
+; The unrolled loads at offsets 16, 32, ..., 112 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_128bytes_bypass:
+_ReaderSSE2_128bytes_bypass:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+12+8] ; ecx = passes remaining.
+
+ mov eax, [esp+4+8] ; eax = start of area.
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov eax, [esp+4+8] ; rewind the read cursor.
+
+.next_block:
+ movntdqa xmm0, [eax] ; offset 0 is written out so it stays displacement-free.
+%assign rdnt128_disp 16
+%rep 7
+ movntdqa xmm0, [eax+rdnt128_disp]
+%assign rdnt128_disp rdnt128_disp+16
+%endrep
+
+ add eax, 128
+ cmp eax, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_128bytes
+; Purpose: Sequential aligned 128-bit loads (movdqa) over an area of memory,
+; 8 loads covering 128 consecutive bytes per inner iteration.
+; Params: [esp+4] = ptr to memory area (movdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; Notes: Both loops test their condition after the body has run.
+; The unrolled loads at offsets 16, 32, ..., 112 are generated by %rep.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_128bytes:
+_ReaderSSE2_128bytes:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+12+8] ; ecx = passes remaining.
+
+ mov eax, [esp+4+8] ; eax = start of area.
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx = start + length, the end limit.
+
+.next_pass:
+ mov eax, [esp+4+8] ; rewind the read cursor.
+
+.next_block:
+ movdqa xmm0, [eax] ; offset 0 is written out so it stays displacement-free.
+%assign rdsse128_disp 16
+%rep 7
+ movdqa xmm0, [eax+rdsse128_disp]
+%assign rdsse128_disp rdsse128_disp+16
+%endrep
+
+ add eax, 128
+ cmp eax, ebx
+ jb .next_block
+
+ sub ecx, 1
+ jnz .next_pass
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterAVX
+; Purpose: Writes 128-bit values to an area of memory using VEX-encoded
+; stores from xmm0, one store per 32-byte slot, advancing 256 bytes
+; per inner iteration.
+; Params: [esp+4] = ptr to memory area (vmovdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+; NOTE(review): because the stores use xmm0 at 32-byte strides, only the low
+; 16 bytes of each 32-byte slot are written. A 256-bit ymm0 store was
+; probably intended; that form would require a 32-byte-aligned buffer --
+; confirm with the caller before changing.
+;------------------------------------------------------------------------------
+ align 64
+WriterAVX:
+_WriterAVX:
+ vzeroupper
+
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ vmovdqa [eax], xmm0
+ vmovdqa [32+eax], xmm0
+ vmovdqa [64+eax], xmm0
+ vmovdqa [96+eax], xmm0
+ vmovdqa [128+eax], xmm0
+ vmovdqa [160+eax], xmm0
+ vmovdqa [192+eax], xmm0
+ vmovdqa [224+eax], xmm0
+
+ add eax, 256
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2
+; Purpose: Writes 128-bit values sequentially to an area of memory,
+; 16 aligned stores covering 256 consecutive bytes per inner
+; iteration.
+; Params: [esp+4] = ptr to memory area (movdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2:
+_WriterSSE2:
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ movdqa [eax], xmm0
+ movdqa [16+eax], xmm0
+ movdqa [32+eax], xmm0
+ movdqa [48+eax], xmm0
+ movdqa [64+eax], xmm0
+ movdqa [80+eax], xmm0
+ movdqa [96+eax], xmm0
+ movdqa [112+eax], xmm0
+
+ movdqa [128+eax], xmm0
+ movdqa [144+eax], xmm0
+ movdqa [160+eax], xmm0
+ movdqa [176+eax], xmm0
+ movdqa [192+eax], xmm0
+ movdqa [208+eax], xmm0
+ movdqa [224+eax], xmm0
+ movdqa [240+eax], xmm0
+
+ add eax, 256
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_128bytes
+; Purpose: Writes 128-bit values sequentially to an area of memory,
+; 8 aligned stores covering 128 consecutive bytes per inner
+; iteration.
+; Params: [esp+4] = ptr to memory area (movdqa requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_128bytes:
+_WriterSSE2_128bytes:
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ movdqa [eax], xmm0
+ movdqa [16+eax], xmm0
+ movdqa [32+eax], xmm0
+ movdqa [48+eax], xmm0
+ movdqa [64+eax], xmm0
+ movdqa [80+eax], xmm0
+ movdqa [96+eax], xmm0
+ movdqa [112+eax], xmm0
+
+ add eax, 128
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterAVX_bypass
+; Purpose: Writes 128-bit values to an area of memory using VEX-encoded
+; non-temporal stores (vmovntdq) from xmm0, one store per 32-byte
+; slot, advancing 256 bytes per inner iteration.
+; Params: [esp+4] = ptr to memory area (vmovntdq requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+; NOTE(review): because the stores use xmm0 at 32-byte strides, only the low
+; 16 bytes of each 32-byte slot are written. A 256-bit ymm0 store was
+; probably intended; that form would require a 32-byte-aligned buffer --
+; confirm with the caller before changing.
+;------------------------------------------------------------------------------
+
+ align 64
+WriterAVX_bypass:
+_WriterAVX_bypass:
+ vzeroupper
+
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ vmovntdq [eax], xmm0 ; Write bypassing cache.
+ vmovntdq [32+eax], xmm0
+ vmovntdq [64+eax], xmm0
+ vmovntdq [96+eax], xmm0
+ vmovntdq [128+eax], xmm0
+ vmovntdq [160+eax], xmm0
+ vmovntdq [192+eax], xmm0
+ vmovntdq [224+eax], xmm0
+
+ add eax, 256
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_bypass
+; Purpose: Writes 128-bit values sequentially to an area of memory with
+; non-temporal stores (movntdq), 16 stores covering 256 consecutive
+; bytes per inner iteration.
+; Params: [esp+4] = ptr to memory area (movntdq requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ movntdq [eax], xmm0 ; Write bypassing cache.
+ movntdq [16+eax], xmm0
+ movntdq [32+eax], xmm0
+ movntdq [48+eax], xmm0
+ movntdq [64+eax], xmm0
+ movntdq [80+eax], xmm0
+ movntdq [96+eax], xmm0
+ movntdq [112+eax], xmm0
+
+ movntdq [128+eax], xmm0
+ movntdq [144+eax], xmm0
+ movntdq [160+eax], xmm0
+ movntdq [176+eax], xmm0
+ movntdq [192+eax], xmm0
+ movntdq [208+eax], xmm0
+ movntdq [224+eax], xmm0
+ movntdq [240+eax], xmm0
+
+ add eax, 256
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_128bytes_bypass
+; Purpose: Writes 128-bit values sequentially to an area of memory with
+; non-temporal stores (movntdq), 8 stores covering 128 consecutive
+; bytes per inner iteration.
+; Params: [esp+4] = ptr to memory area (movntdq requires 16-byte alignment)
+; [esp+8] = length in bytes
+; [esp+12] = loops
+; [esp+16] = 32-bit value, replicated into all four dwords of xmm0
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_128bytes_bypass:
+_WriterSSE2_128bytes_bypass:
+ push ebx
+ push ecx
+
+ mov eax, [esp+16+8]
+ movd xmm0, eax ; low dword = value, upper dwords zero.
+ pshufd xmm0, xmm0, 0 ; broadcast dword 0 into all four dwords.
+ ; (PSLLDQ counts bytes, and a count above 15 clears
+ ; the register, so it cannot be used with bit counts.)
+
+ mov ecx, [esp+12+8]
+
+ mov eax, [esp+4+8]
+ mov ebx, eax
+ add ebx, [esp+8+8] ; ebx points to end.
+
+.L1:
+ mov eax, [esp+4+8] ; restart at the beginning of the area.
+
+.L2:
+ movntdq [eax], xmm0 ; Write bypassing cache.
+ movntdq [16+eax], xmm0
+ movntdq [32+eax], xmm0
+ movntdq [48+eax], xmm0
+ movntdq [64+eax], xmm0
+ movntdq [80+eax], xmm0
+ movntdq [96+eax], xmm0
+ movntdq [112+eax], xmm0
+
+ add eax, 128
+ cmp eax, ebx
+ jb .L2
+
+ sub ecx, 1
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReader
+; Purpose: Reads 32-bit values in a fixed, scrambled order from each chunk
+; named in an array of chunk pointers.
+; Params:
+; [esp+4] = ptr to array of chunk pointers (4 bytes per entry)
+; [esp+8] = # of chunks in the array
+; [esp+12] = loops
+; Notes: Each chunk is accessed at offsets from 0 up to 252, so every
+; chunk pointer must address at least 256 readable bytes.
+; Chunks are visited from the last array entry down to entry 0.
+;------------------------------------------------------------------------------
+ align 64
+RandomReader:
+_RandomReader:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1 ; step to the next lower chunk index...
+ jc .L2 ; ...and leave the pass once it drops below zero.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+ mov eax, [edx+160]
+ mov eax, [edx+232]
+ mov eax, [edx+224]
+ mov eax, [96+edx]
+ mov eax, [edx+164]
+ mov eax, [76+edx]
+ mov eax, [100+edx]
+ mov eax, [edx+220]
+ mov eax, [edx+248]
+ mov eax, [104+edx]
+ mov eax, [4+edx]
+ mov eax, [edx+136]
+ mov eax, [112+edx]
+ mov eax, [edx+200]
+ mov eax, [12+edx]
+ mov eax, [edx+128]
+ mov eax, [edx+148]
+ mov eax, [edx+196]
+ mov eax, [edx+216]
+ mov eax, [edx]
+ mov eax, [84+edx]
+ mov eax, [edx+140]
+ mov eax, [edx+204]
+ mov eax, [edx+184]
+ mov eax, [124+edx]
+ mov eax, [48+edx]
+ mov eax, [64+edx]
+ mov eax, [edx+212]
+ mov eax, [edx+240]
+ mov eax, [edx+236]
+ mov eax, [24+edx]
+ mov eax, [edx+252]
+ mov eax, [68+edx]
+ mov eax, [20+edx]
+ mov eax, [72+edx]
+ mov eax, [32+edx]
+ mov eax, [28+edx]
+ mov eax, [52+edx]
+ mov eax, [edx+244]
+ mov eax, [edx+180]
+ mov eax, [80+edx]
+ mov eax, [60+edx]
+ mov eax, [8+edx]
+ mov eax, [56+edx]
+ mov eax, [edx+208]
+ mov eax, [edx+228]
+ mov eax, [40+edx]
+ mov eax, [edx+172]
+ mov eax, [120+edx]
+ mov eax, [edx+176]
+ mov eax, [108+edx]
+ mov eax, [edx+132]
+ mov eax, [16+edx]
+ mov eax, [44+edx]
+ mov eax, [92+edx]
+ mov eax, [edx+168]
+ mov eax, [edx+152]
+ mov eax, [edx+156]
+ mov eax, [edx+188]
+ mov eax, [36+edx]
+ mov eax, [88+edx]
+ mov eax, [116+edx]
+ mov eax, [edx+192]
+ mov eax, [edx+144]
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2
+; Purpose: Reads 128-bit values in a fixed, scrambled order from each chunk
+; named in an array of chunk pointers.
+; Params:
+; [esp+4] = ptr to array of chunk pointers (4 bytes per entry)
+; [esp+8] = # of chunks in the array
+; [esp+12] = loops
+; Notes: Each chunk is accessed with 16-byte loads at offsets from 0 up to
+; 240, so every chunk pointer must be 16-byte aligned (movdqa) and
+; address at least 256 readable bytes.
+; Chunks are visited from the last array entry down to entry 0.
+;------------------------------------------------------------------------------
+ align 64
+RandomReaderSSE2:
+_RandomReaderSSE2:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1 ; step to the next lower chunk index...
+ jc .L2 ; ...and leave the pass once it drops below zero.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+ movdqa xmm0, [240+edx]
+ movdqa xmm0, [128+edx]
+ movdqa xmm0, [64+edx]
+ movdqa xmm0, [208+edx]
+ movdqa xmm0, [112+edx]
+ movdqa xmm0, [176+edx]
+ movdqa xmm0, [144+edx]
+ movdqa xmm0, [edx]
+ movdqa xmm0, [96+edx]
+ movdqa xmm0, [16+edx]
+ movdqa xmm0, [192+edx]
+ movdqa xmm0, [160+edx]
+ movdqa xmm0, [32+edx]
+ movdqa xmm0, [48+edx]
+ movdqa xmm0, [224+edx]
+ movdqa xmm0, [80+edx]
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2_bypass
+; Purpose: Reads 128-bit values with non-temporal loads (movntdqa) in a
+; fixed, scrambled order from each chunk named in an array of
+; chunk pointers.
+; Params:
+; [esp+4] = ptr to array of chunk pointers (4 bytes per entry)
+; [esp+8] = # of chunks in the array
+; [esp+12] = loops
+; Notes: Each chunk is accessed with 16-byte loads at offsets from 0 up to
+; 240, so every chunk pointer must be 16-byte aligned and address
+; at least 256 readable bytes.
+; movntdqa is an SSE4.1 instruction, despite the routine's name.
+; Chunks are visited from the last array entry down to entry 0.
+;------------------------------------------------------------------------------
+ align 64
+RandomReaderSSE2_bypass:
+_RandomReaderSSE2_bypass:
+ push ebx
+ push ecx
+ push edx
+
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1 ; step to the next lower chunk index...
+ jc .L2 ; ...and leave the pass once it drops below zero.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+ movntdqa xmm0, [240+edx]
+ movntdqa xmm0, [edx]
+ movntdqa xmm0, [128+edx]
+ movntdqa xmm0, [64+edx]
+ movntdqa xmm0, [208+edx]
+ movntdqa xmm0, [112+edx]
+ movntdqa xmm0, [32+edx]
+ movntdqa xmm0, [176+edx]
+ movntdqa xmm0, [144+edx]
+ movntdqa xmm0, [96+edx]
+ movntdqa xmm0, [16+edx]
+ movntdqa xmm0, [160+edx]
+ movntdqa xmm0, [192+edx]
+ movntdqa xmm0, [48+edx]
+ movntdqa xmm0, [224+edx]
+ movntdqa xmm0, [80+edx]
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriter
+; Purpose: Writes a 32-bit value in a fixed, scrambled order into each chunk
+; named in an array of chunk pointers.
+; Params:
+; [esp+4] = ptr to array of chunk pointers (4 bytes per entry)
+; [esp+8] = # of chunks in the array
+; [esp+12] = loops
+; [esp+16] = long to write
+; Notes: Each chunk is written at offsets from 0 up to 252, so every
+; chunk pointer must address at least 256 writable bytes.
+; Chunks are visited from the last array entry down to entry 0.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriter:
+_RandomWriter:
+ push ebx
+ push ecx
+ push edx
+
+ mov eax, [esp+16+12] ; get datum.
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1 ; step to the next lower chunk index...
+ jc .L2 ; ...and leave the pass once it drops below zero.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+ mov [edx+212], eax
+ mov [edx+156], eax
+ mov [edx+132], eax
+ mov [20+edx], eax
+ mov [edx+172], eax
+ mov [edx+196], eax
+ mov [edx+248], eax
+ mov [edx], eax
+ mov [edx+136], eax
+ mov [edx+228], eax
+ mov [edx+160], eax
+ mov [80+edx], eax
+ mov [76+edx], eax
+ mov [32+edx], eax
+ mov [64+edx], eax
+ mov [68+edx], eax
+ mov [120+edx], eax
+ mov [edx+216], eax
+ mov [124+edx], eax
+ mov [28+edx], eax
+ mov [edx+152], eax
+ mov [36+edx], eax
+ mov [edx+220], eax
+ mov [edx+188], eax
+ mov [48+edx], eax
+ mov [104+edx], eax
+ mov [72+edx], eax
+ mov [96+edx], eax
+ mov [edx+184], eax
+ mov [112+edx], eax
+ mov [edx+236], eax
+ mov [edx+224], eax
+ mov [edx+252], eax
+ mov [88+edx], eax
+ mov [edx+180], eax
+ mov [60+edx], eax
+ mov [24+edx], eax
+ mov [edx+192], eax
+ mov [edx+164], eax
+ mov [edx+204], eax
+ mov [44+edx], eax
+ mov [edx+168], eax
+ mov [92+edx], eax
+ mov [edx+208], eax
+ mov [8+edx], eax
+ mov [edx+144], eax
+ mov [edx+148], eax
+ mov [edx+128], eax
+ mov [52+edx], eax
+ mov [4+edx], eax
+ mov [108+edx], eax
+ mov [12+edx], eax
+ mov [56+edx], eax
+ mov [edx+200], eax
+ mov [edx+232], eax
+ mov [16+edx], eax
+ mov [edx+244], eax
+ mov [40+edx], eax
+ mov [edx+140], eax
+ mov [84+edx], eax
+ mov [100+edx], eax
+ mov [116+edx], eax
+ mov [edx+176], eax
+ mov [edx+240], eax
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2
+; Purpose: Writes 128-bit value randomly to an area of memory.
+; Params:
+; [esp+4] = ptr to array of chunk pointers
+; [esp+8] = # of chunks; the 16 stores at offsets 0..240 cover 256 bytes
+; of each chunk
+; [esp+12] = loops
+; [esp+16] = long to write
+; Notes: Chunks are visited from the last array entry down to the first.
+; Stores are aligned (movdqa), so chunk pointers must be 16-byte aligned.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriterSSE2:
+_RandomWriterSSE2:
+ push ebx
+ push ecx
+ push edx
+
+ ; Three registers were pushed, so every argument lies 12 bytes further
+ ; from esp than at entry. (Using +8 here would fetch the loops argument.)
+ mov eax, [esp+16+12] ; get datum.
+ movd xmm0, eax ; xmm0 = 0 : 0 : 0 : datum
+ pshufd xmm0, xmm0, 0 ; Create a 128-bit replication of the 32-bit
+ ; value: copy lane 0 into all four lanes.
+
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1
+ jc .L2 ; Borrow: every chunk was visited in this pass.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+ ; Each of the 16 offsets 0, 16, ..., 240 is written exactly once.
+ movdqa [64+edx], xmm0
+ movdqa [208+edx], xmm0
+ movdqa [128+edx], xmm0
+ movdqa [112+edx], xmm0
+ movdqa [176+edx], xmm0
+ movdqa [144+edx], xmm0
+ movdqa [edx], xmm0
+ movdqa [96+edx], xmm0
+ movdqa [48+edx], xmm0
+ movdqa [16+edx], xmm0
+ movdqa [192+edx], xmm0
+ movdqa [160+edx], xmm0
+ movdqa [32+edx], xmm0
+ movdqa [240+edx], xmm0
+ movdqa [224+edx], xmm0
+ movdqa [80+edx], xmm0
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2_bypass
+; Purpose: Writes 128-bit value randomly into memory, bypassing caches.
+; Params:
+; [esp+4] = ptr to array of chunk pointers
+; [esp+8] = # of chunks; the 16 stores at offsets 0..240 cover 256 bytes
+; of each chunk
+; [esp+12] = loops
+; [esp+16] = long to write
+; Notes: Chunks are visited from the last array entry down to the first.
+; movntdq requires 16-byte-aligned destinations, so chunk pointers must
+; be 16-byte aligned.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+ push ebx
+ push ecx
+ push edx
+
+ ; Three registers were pushed, so every argument lies 12 bytes further
+ ; from esp than at entry. (Using +8 here would fetch the loops argument.)
+ mov eax, [esp+16+12] ; get datum.
+ movd xmm0, eax ; xmm0 = 0 : 0 : 0 : datum
+ pshufd xmm0, xmm0, 0 ; Create a 128-bit replication of the 32-bit
+ ; value: copy lane 0 into all four lanes.
+
+ mov ecx, [esp+12+12] ; loops to do.
+
+.L0:
+ mov ebx, [esp+8+12] ; # chunks to do
+
+.L1:
+ sub ebx, 1
+ jc .L2 ; Borrow: every chunk was visited in this pass.
+
+ mov edx, [esp+4+12] ; get ptr to memory chunk.
+ mov edx, [edx + 4*ebx]
+
+ ; Each of the 16 offsets 0, 16, ..., 240 is written exactly once.
+ movntdq [128+edx], xmm0
+ movntdq [240+edx], xmm0
+ movntdq [112+edx], xmm0
+ movntdq [64+edx], xmm0
+ movntdq [176+edx], xmm0
+ movntdq [144+edx], xmm0
+ movntdq [edx], xmm0
+ movntdq [208+edx], xmm0
+ movntdq [80+edx], xmm0
+ movntdq [96+edx], xmm0
+ movntdq [48+edx], xmm0
+ movntdq [16+edx], xmm0
+ movntdq [192+edx], xmm0
+ movntdq [160+edx], xmm0
+ movntdq [224+edx], xmm0
+ movntdq [32+edx], xmm0
+
+ jmp .L1
+
+.L2:
+ sub ecx, 1
+ jnz .L0
+
+ pop edx
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RegisterToRegister
+; Purpose: Reads/writes 32-bit values between registers of
+; the main register set.
+; Params:
+; dword [esp+4] = loops
+; Notes: Only eax and ebx are ever written; the other registers are sources
+; only. ebx and ecx are saved on entry and restored on exit.
+;------------------------------------------------------------------------------
+ align 64
+RegisterToRegister:
+_RegisterToRegister:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+4+8] ; loops to do.
+
+.L1:
+; First half: 32 moves into eax.
+ mov eax, ebx ; 64 transfers by 4 bytes = 256 bytes
+ mov eax, ecx
+ mov eax, edx
+ mov eax, esi
+ mov eax, edi
+ mov eax, ebp
+ mov eax, esp
+ mov eax, ebx
+ mov eax, ebx
+ mov eax, ecx
+ mov eax, edx
+ mov eax, esi
+ mov eax, edi
+ mov eax, ebp
+ mov eax, esp
+ mov eax, ebx
+ mov eax, ebx
+ mov eax, ecx
+ mov eax, edx
+ mov eax, esi
+ mov eax, edi
+ mov eax, ebp
+ mov eax, esp
+ mov eax, ebx
+ mov eax, ebx
+ mov eax, ecx
+ mov eax, edx
+ mov eax, esi
+ mov eax, edi
+ mov eax, ebp
+ mov eax, esp
+ mov eax, ebx
+
+; Second half: 32 moves into ebx.
+ mov ebx, eax
+ mov ebx, ecx
+ mov ebx, edx
+ mov ebx, esi
+ mov ebx, edi
+ mov ebx, ebp
+ mov ebx, esp
+ mov ebx, eax
+ mov ebx, eax
+ mov ebx, ecx
+ mov ebx, edx
+ mov ebx, esi
+ mov ebx, edi
+ mov ebx, ebp
+ mov ebx, esp
+ mov ebx, eax
+ mov ebx, eax
+ mov ebx, ecx
+ mov ebx, edx
+ mov ebx, esi
+ mov ebx, edi
+ mov ebx, ebp
+ mov ebx, esp
+ mov ebx, eax
+ mov ebx, eax
+ mov ebx, ecx
+ mov ebx, edx
+ mov ebx, esi
+ mov ebx, edi
+ mov ebx, ebp
+ mov ebx, esp
+ mov ebx, eax
+
+ dec ecx
+ jnz .L1
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToVectorAVX
+; Purpose: Reads/writes 256-bit values between registers of
+; the vector register set, in this case YMM.
+; Params: dword [esp + 4] = loops
+; Notes: ymm0-ymm3 are overwritten; eax is the loop counter.
+;------------------------------------------------------------------------------
+ align 64
+VectorToVectorAVX:
+_VectorToVectorAVX:
+ vzeroupper
+
+ mov eax, [esp + 4]
+.L1:
+ vmovdqa ymm0, ymm1 ; Each move moves 32 bytes, so we need 8
+ vmovdqa ymm0, ymm2 ; moves to transfer a 256 byte chunk.
+ vmovdqa ymm0, ymm3
+ vmovdqa ymm2, ymm0
+ vmovdqa ymm1, ymm2
+ vmovdqa ymm2, ymm1
+ vmovdqa ymm0, ymm3
+ vmovdqa ymm3, ymm1
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToVector
+; Purpose: Reads/writes 128-bit values between registers of
+; the vector register set, in this case XMM.
+; Params: dword [esp + 4] = count.
+; Notes: Each pass performs 16 moves of 16 bytes = 256 bytes.
+; xmm0-xmm3 are overwritten; eax is the loop counter.
+;------------------------------------------------------------------------------
+ align 64
+VectorToVector:
+_VectorToVector:
+ mov eax, [esp + 4]
+.L1:
+ movdqa xmm0, xmm1
+ movdqa xmm0, xmm2
+ movdqa xmm0, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+ movdqa xmm3, xmm1
+
+ movdqa xmm3, xmm2
+ movdqa xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm0, xmm1
+ movdqa xmm0, xmm3
+ movdqa xmm3, xmm0
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RegisterToVector
+; Purpose: Writes 32-bit main register values into 128-bit vector register
+; clearing the upper unused bits.
+; Params: dword [esp + 4] = count.
+; Notes: eax is both the loop counter and the value being transferred.
+; The count is doubled, so each requested loop moves 2 x 128 bytes.
+;------------------------------------------------------------------------------
+ align 64
+RegisterToVector:
+_RegisterToVector:
+ mov eax, [esp + 4]
+ add eax, eax ; Double # of loops.
+.L1:
+ movd xmm1, eax ; 32 transfers of 4 bytes = 128 bytes
+ movd xmm2, eax
+ movd xmm3, eax
+ movd xmm0, eax
+ movd xmm1, eax
+ movd xmm2, eax
+ movd xmm3, eax
+ movd xmm0, eax
+
+ movd xmm1, eax
+ movd xmm3, eax
+ movd xmm2, eax
+ movd xmm0, eax
+ movd xmm1, eax
+ movd xmm2, eax
+ movd xmm3, eax
+ movd xmm0, eax
+
+ movd xmm0, eax
+ movd xmm2, eax
+ movd xmm0, eax
+ movd xmm3, eax
+ movd xmm1, eax
+ movd xmm3, eax
+ movd xmm2, eax
+ movd xmm0, eax
+
+ movd xmm0, eax
+ movd xmm3, eax
+ movd xmm1, eax
+ movd xmm2, eax
+ movd xmm0, eax
+ movd xmm2, eax
+ movd xmm3, eax
+ movd xmm0, eax
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToRegister
+; Purpose: Writes lowest 32 bits of vector registers into 32-bit main
+; register.
+; Params: dword [esp + 4] = count.
+; Notes: ebx is the destination of every transfer and is saved/restored.
+; The count is read before ebx is pushed, so [esp + 4] is still the
+; first argument at that point.
+;------------------------------------------------------------------------------
+ align 64
+VectorToRegister:
+_VectorToRegister:
+ mov eax, [esp + 4]
+ add eax, eax ; Double # of loops.
+ push ebx
+.L1:
+ movd ebx, xmm1 ; 32 transfers of 4 bytes = 128 bytes per pass;
+ movd ebx, xmm2 ; the doubled count gives 256 bytes per loop.
+ movd ebx, xmm3
+ movd ebx, xmm0
+ movd ebx, xmm1
+ movd ebx, xmm2
+ movd ebx, xmm3
+ movd ebx, xmm0
+
+ movd ebx, xmm1
+ movd ebx, xmm3
+ movd ebx, xmm2
+ movd ebx, xmm0
+ movd ebx, xmm1
+ movd ebx, xmm2
+ movd ebx, xmm3
+ movd ebx, xmm0
+
+ movd ebx, xmm0
+ movd ebx, xmm2
+ movd ebx, xmm0
+ movd ebx, xmm3
+ movd ebx, xmm1
+ movd ebx, xmm3
+ movd ebx, xmm2
+ movd ebx, xmm0
+
+ movd ebx, xmm0
+ movd ebx, xmm3
+ movd ebx, xmm1
+ movd ebx, xmm2
+ movd ebx, xmm0
+ movd ebx, xmm2
+ movd ebx, xmm3
+ movd ebx, xmm0
+
+ dec eax
+ jnz .L1
+
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: StackReader
+; Purpose: Reads 32-bit values off the stack into registers of
+; the main register set, effectively testing L1 cache access
+; *and* effective-address calculation speed.
+; Params:
+; dword [esp+4] = loops
+; Notes: Seven dwords are pushed as a scratch area and read back repeatedly.
+; Each pass performs 64 loads of 4 bytes = 256 bytes (32 into eax,
+; 32 into ebx).
+;------------------------------------------------------------------------------
+ align 64
+StackReader:
+_StackReader:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+4+8] ; loops to do.
+
+; Build the 28-byte scratch area that the loop reads from.
+ push dword 7000 ; [esp+24]
+ push dword 6000 ; [esp+20]
+ push dword 5000 ; [esp+16]
+ push dword 4000 ; [esp+12]
+ push dword 3000 ; [esp+8]
+ push dword 2000 ; [esp+4]
+ push dword 1000 ; [esp]
+
+.L1:
+ mov eax, [esp]
+ mov eax, [esp+8]
+ mov eax, [esp+12]
+ mov eax, [esp+16]
+ mov eax, [esp+20]
+ mov eax, [esp+4]
+ mov eax, [esp+24]
+ mov eax, [esp]
+ mov eax, [esp]
+ mov eax, [esp+8]
+ mov eax, [esp+12]
+ mov eax, [esp+16]
+ mov eax, [esp+20]
+ mov eax, [esp+4]
+ mov eax, [esp+24]
+ mov eax, [esp]
+ mov eax, [esp]
+ mov eax, [esp+8]
+ mov eax, [esp+12]
+ mov eax, [esp+16]
+ mov eax, [esp+20]
+ mov eax, [esp+4]
+ mov eax, [esp+24]
+ mov eax, [esp+4]
+ mov eax, [esp+4]
+ mov eax, [esp+8]
+ mov eax, [esp+12]
+ mov eax, [esp+16]
+ mov eax, [esp+20]
+ mov eax, [esp+4]
+ mov eax, [esp+24]
+ mov eax, [esp+4]
+
+ mov ebx, [esp]
+ mov ebx, [esp+8]
+ mov ebx, [esp+12]
+ mov ebx, [esp+16]
+ mov ebx, [esp+20]
+ mov ebx, [esp+4]
+ mov ebx, [esp+24]
+ mov ebx, [esp]
+ mov ebx, [esp]
+ mov ebx, [esp+8]
+ mov ebx, [esp+12]
+ mov ebx, [esp+16]
+ mov ebx, [esp+20]
+ mov ebx, [esp+4]
+ mov ebx, [esp+24]
+ mov ebx, [esp]
+ mov ebx, [esp]
+ mov ebx, [esp+8]
+ mov ebx, [esp+12]
+ mov ebx, [esp+16]
+ mov ebx, [esp+20]
+ mov ebx, [esp+4]
+ mov ebx, [esp+24]
+ mov ebx, [esp+4]
+ mov ebx, [esp+4]
+ mov ebx, [esp+8]
+ mov ebx, [esp+12]
+ mov ebx, [esp+16]
+ mov ebx, [esp+20]
+ mov ebx, [esp+4]
+ mov ebx, [esp+24]
+ mov ebx, [esp+4]
+
+ dec ecx
+ jnz .L1
+
+ add esp, 28 ; Discard the seven scratch dwords.
+
+ pop ecx
+ pop ebx
+ ret
+;------------------------------------------------------------------------------
+; Name: StackWriter
+; Purpose: Writes 32-bit values into the stack from registers of
+; the main register set, effectively testing L1 cache access
+; *and* effective-address calculation speed.
+; Params:
+; dword [esp+4] = loops
+; Notes: Seven dwords are pushed as a scratch area and overwritten repeatedly.
+; Each pass performs 64 stores of 4 bytes = 256 bytes (32 of eax = 0,
+; 32 of ebx = 0xffffffff).
+;------------------------------------------------------------------------------
+ align 64
+StackWriter:
+_StackWriter:
+ push ebx
+ push ecx
+
+ mov ecx, [esp+4+8] ; loops to do.
+
+; Build the 28-byte scratch area that the loop writes to.
+ push dword 7000 ; [esp+24]
+ push dword 6000 ; [esp+20]
+ push dword 5000 ; [esp+16]
+ push dword 4000 ; [esp+12]
+ push dword 3000 ; [esp+8]
+ push dword 2000 ; [esp+4]
+ push dword 1000 ; [esp]
+
+; The two values stored alternately: all zeroes and all ones.
+ xor eax, eax
+ mov ebx, 0xffffffff
+
+.L1:
+ mov [esp], eax
+ mov [esp+8], eax
+ mov [esp+12], eax
+ mov [esp+16], eax
+ mov [esp+20], eax
+ mov [esp+4], eax
+ mov [esp+24], eax
+ mov [esp], eax
+ mov [esp], eax
+ mov [esp+8], eax
+ mov [esp+12], eax
+ mov [esp+16], eax
+ mov [esp+20], eax
+ mov [esp+4], eax
+ mov [esp+24], eax
+ mov [esp], eax
+ mov [esp], eax
+ mov [esp+8], eax
+ mov [esp+12], eax
+ mov [esp+16], eax
+ mov [esp+20], eax
+ mov [esp+4], eax
+ mov [esp+24], eax
+ mov [esp+4], eax
+ mov [esp+4], eax
+ mov [esp+8], eax
+ mov [esp+12], eax
+ mov [esp+16], eax
+ mov [esp+20], eax
+ mov [esp+4], eax
+ mov [esp+24], eax
+ mov [esp+4], eax
+
+ mov [esp], ebx
+ mov [esp+8], ebx
+ mov [esp+12], ebx
+ mov [esp+16], ebx
+ mov [esp+20], ebx
+ mov [esp+4], ebx
+ mov [esp+24], ebx
+ mov [esp], ebx
+ mov [esp], ebx
+ mov [esp+8], ebx
+ mov [esp+12], ebx
+ mov [esp+16], ebx
+ mov [esp+20], ebx
+ mov [esp+4], ebx
+ mov [esp+24], ebx
+ mov [esp], ebx
+ mov [esp], ebx
+ mov [esp+8], ebx
+ mov [esp+12], ebx
+ mov [esp+16], ebx
+ mov [esp+20], ebx
+ mov [esp+4], ebx
+ mov [esp+24], ebx
+ mov [esp+4], ebx
+ mov [esp+4], ebx
+ mov [esp+8], ebx
+ mov [esp+12], ebx
+ mov [esp+16], ebx
+ mov [esp+20], ebx
+ mov [esp+4], ebx
+ mov [esp+24], ebx
+ mov [esp+4], ebx
+
+ sub ecx, 1
+ jnz .L1
+
+ add esp, 28 ; Discard the seven scratch dwords.
+
+ pop ecx
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register8ToVector
+; Purpose: Writes 8-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 16. eax is the loop counter, so al
+; changes on every pass; the other source registers hold whatever the
+; caller left in them.
+; NOTE(review): sil, dil, bpl and spl are byte registers that are only
+; encodable in 64-bit mode (they need a REX prefix) -- confirm how the
+; assembler treats them in this 32-bit code.
+;------------------------------------------------------------------------------
+ align 64
+Register8ToVector:
+_Register8ToVector:
+ mov eax, [esp + 4]
+ sal eax, 4 ; Force some repetition.
+.L1:
+ pinsrb xmm1, al, 0 ; 64 transfers x 1 byte = 64 bytes
+ pinsrb xmm2, bl, 1
+ pinsrb xmm3, cl, 2
+ pinsrb xmm1, dl, 3
+ pinsrb xmm2, sil, 4
+ pinsrb xmm3, dil, 5
+ pinsrb xmm0, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm0, al, 0
+ pinsrb xmm1, bl, 1
+ pinsrb xmm2, cl, 2
+ pinsrb xmm3, dl, 3
+ pinsrb xmm3, al, 4
+ pinsrb xmm2, bl, 5
+ pinsrb xmm1, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm1, al, 0
+ pinsrb xmm2, al, 1
+ pinsrb xmm3, al, 2
+ pinsrb xmm1, al, 3
+ pinsrb xmm2, al, 4
+ pinsrb xmm3, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm0, al, 0
+ pinsrb xmm0, al, 1
+ pinsrb xmm0, al, 2
+ pinsrb xmm0, al, 3
+ pinsrb xmm0, al, 4
+ pinsrb xmm0, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm1, al, 0
+ pinsrb xmm2, bl, 1
+ pinsrb xmm3, cl, 2
+ pinsrb xmm1, dl, 3
+ pinsrb xmm2, sil, 4
+ pinsrb xmm3, dil, 5
+ pinsrb xmm0, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm0, al, 10
+ pinsrb xmm1, bl, 11
+ pinsrb xmm2, cl, 12
+ pinsrb xmm3, dl, 13
+ pinsrb xmm3, dil, 14
+ pinsrb xmm2, cl, 15
+ pinsrb xmm1, al, 6
+ pinsrb xmm0, bpl, 7
+
+ pinsrb xmm1, al, 10
+ pinsrb xmm2, al, 11
+ pinsrb xmm3, al, 12
+ pinsrb xmm1, al, 13
+ pinsrb xmm2, al, 14
+ pinsrb xmm3, al, 15
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm0, al, 9
+ pinsrb xmm0, al, 8
+ pinsrb xmm0, al, 11
+ pinsrb xmm0, al, 3
+ pinsrb xmm0, al, 4
+ pinsrb xmm0, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register16ToVector
+; Purpose: Writes 16-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 8. eax is the loop counter, so ax
+; changes on every pass; the other source registers hold whatever the
+; caller left in them.
+;------------------------------------------------------------------------------
+ align 64
+Register16ToVector:
+_Register16ToVector:
+ mov eax, [esp + 4]
+ sal eax, 3 ; Force some repetition.
+.L1:
+ pinsrw xmm1, ax, 0 ; 64 transfers x 2 bytes = 128 bytes
+ pinsrw xmm2, bx, 1
+ pinsrw xmm3, cx, 2
+ pinsrw xmm1, dx, 3
+ pinsrw xmm2, si, 4
+ pinsrw xmm3, di, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm1, bx, 1
+ pinsrw xmm2, cx, 2
+ pinsrw xmm3, dx, 3
+ pinsrw xmm3, si, 4
+ pinsrw xmm2, di, 5
+ pinsrw xmm1, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm1, ax, 0
+ pinsrw xmm2, ax, 1
+ pinsrw xmm3, ax, 2
+ pinsrw xmm1, ax, 3
+ pinsrw xmm2, ax, 4
+ pinsrw xmm3, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm0, ax, 1
+ pinsrw xmm0, ax, 2
+ pinsrw xmm0, ax, 3
+ pinsrw xmm0, ax, 4
+ pinsrw xmm0, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm1, ax, 0
+ pinsrw xmm2, bx, 1
+ pinsrw xmm3, cx, 2
+ pinsrw xmm1, dx, 3
+ pinsrw xmm2, si, 4
+ pinsrw xmm3, di, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm1, bx, 1
+ pinsrw xmm2, cx, 2
+ pinsrw xmm3, dx, 3
+ pinsrw xmm3, si, 4
+ pinsrw xmm2, di, 5
+ pinsrw xmm1, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm1, ax, 0
+ pinsrw xmm2, ax, 1
+ pinsrw xmm3, ax, 2
+ pinsrw xmm1, ax, 3
+ pinsrw xmm2, ax, 4
+ pinsrw xmm3, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm0, ax, 1
+ pinsrw xmm0, ax, 2
+ pinsrw xmm0, ax, 3
+ pinsrw xmm0, ax, 4
+ pinsrw xmm0, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register32ToVector
+; Purpose: Writes 32-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 4. eax is the loop counter, so its
+; value changes on every pass; the other source registers hold whatever
+; the caller left in them. Each pass performs exactly 64 transfers
+; (8 groups of 8).
+;------------------------------------------------------------------------------
+ align 64
+Register32ToVector:
+_Register32ToVector:
+ mov eax, [esp + 4]
+ sal eax, 2 ; Force some repetition.
+.L1:
+ pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes
+ pinsrd xmm2, ebx, 1 ; we need 64 transfers.
+ pinsrd xmm3, ecx, 2
+ pinsrd xmm1, edx, 3
+ pinsrd xmm2, esi, 0
+ pinsrd xmm3, edi, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, esp, 3
+
+ pinsrd xmm0, eax, 0
+ pinsrd xmm1, ebx, 1
+ pinsrd xmm2, ecx, 2
+ pinsrd xmm3, edx, 3
+ pinsrd xmm3, esi, 3
+ pinsrd xmm2, edi, 2
+ pinsrd xmm1, ebp, 1
+ pinsrd xmm0, esp, 0
+
+ pinsrd xmm1, eax, 0
+ pinsrd xmm2, eax, 1
+ pinsrd xmm3, eax, 2
+ pinsrd xmm1, eax, 3
+ pinsrd xmm2, eax, 0
+ pinsrd xmm3, eax, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, ebx, 3
+
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, eax, 1
+ pinsrd xmm0, eax, 2
+ pinsrd xmm0, eax, 3
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, ebp, 0
+ pinsrd xmm0, ebx, 0
+
+ pinsrd xmm1, eax, 0
+ pinsrd xmm2, ebx, 1
+ pinsrd xmm3, ecx, 2
+ pinsrd xmm1, edx, 3
+ pinsrd xmm2, esi, 0
+ pinsrd xmm3, edi, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, esp, 3
+
+ pinsrd xmm0, eax, 0
+ pinsrd xmm1, ebx, 1
+ pinsrd xmm2, ecx, 2
+ pinsrd xmm3, edx, 3
+ pinsrd xmm3, esi, 3
+ pinsrd xmm2, edi, 2
+ pinsrd xmm1, ebp, 1
+ pinsrd xmm0, esp, 0
+
+ pinsrd xmm1, eax, 0
+ pinsrd xmm2, eax, 1
+ pinsrd xmm3, eax, 2
+ pinsrd xmm1, eax, 3
+ pinsrd xmm2, eax, 0
+ pinsrd xmm3, eax, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, ebx, 3
+
+ ; Final group: 8 transfers, matching the fourth group above, so that
+ ; the pass totals 64 transfers = 256 bytes as documented.
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, eax, 1
+ pinsrd xmm0, eax, 2
+ pinsrd xmm0, eax, 3
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, ebp, 0
+ pinsrd xmm0, ebx, 0
+
+ dec eax
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register64ToVector
+; Purpose: Writes 64-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: none are read.
+; Notes: In this file the routine is an empty stub that returns immediately.
+;------------------------------------------------------------------------------
+Register64ToVector:
+_Register64ToVector:
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Vector8ToRegister
+; Purpose: Writes 8-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 16. Each pass performs 64 one-byte
+; extractions into ebx, which is saved and restored. The count is read
+; before ebx is pushed, so [esp + 4] is still the first argument.
+;------------------------------------------------------------------------------
+ align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+ mov eax, [esp + 4]
+ sal eax, 4 ; Force some repetition.
+ push ebx
+.L1:
+ pextrb ebx, xmm1, 0
+ pextrb ebx, xmm2, 1
+ pextrb ebx, xmm3, 2
+ pextrb ebx, xmm1, 3
+ pextrb ebx, xmm2, 4
+ pextrb ebx, xmm3, 5
+ pextrb ebx, xmm0, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm0, 0
+ pextrb ebx, xmm1, 1
+ pextrb ebx, xmm2, 2
+ pextrb ebx, xmm3, 3
+ pextrb ebx, xmm3, 4
+ pextrb ebx, xmm2, 15
+ pextrb ebx, xmm1, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm1, 0
+ pextrb ebx, xmm2, 1
+ pextrb ebx, xmm3, 2
+ pextrb ebx, xmm1, 3
+ pextrb ebx, xmm2, 4
+ pextrb ebx, xmm3, 5
+ pextrb ebx, xmm0, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm0, 0
+ pextrb ebx, xmm1, 1
+ pextrb ebx, xmm2, 2
+ pextrb ebx, xmm3, 3
+ pextrb ebx, xmm3, 4
+ pextrb ebx, xmm2, 5
+ pextrb ebx, xmm1, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm1, 0
+ pextrb ebx, xmm2, 1
+ pextrb ebx, xmm3, 2
+ pextrb ebx, xmm1, 13
+ pextrb ebx, xmm2, 14
+ pextrb ebx, xmm3, 15
+ pextrb ebx, xmm0, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm0, 10
+ pextrb ebx, xmm1, 11
+ pextrb ebx, xmm2, 12
+ pextrb ebx, xmm3, 13
+ pextrb ebx, xmm3, 14
+ pextrb ebx, xmm2, 15
+ pextrb ebx, xmm1, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm1, 0
+ pextrb ebx, xmm2, 1
+ pextrb ebx, xmm3, 2
+ pextrb ebx, xmm1, 3
+ pextrb ebx, xmm2, 4
+ pextrb ebx, xmm3, 5
+ pextrb ebx, xmm0, 6
+ pextrb ebx, xmm0, 7
+
+ pextrb ebx, xmm0, 0
+ pextrb ebx, xmm1, 1
+ pextrb ebx, xmm2, 2
+ pextrb ebx, xmm3, 3
+ pextrb ebx, xmm3, 4
+ pextrb ebx, xmm2, 5
+ pextrb ebx, xmm1, 6
+ pextrb ebx, xmm0, 7
+
+ dec eax
+ jnz .L1
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector16ToRegister
+; Purpose: Writes 16-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 8. Each pass performs 64 two-byte
+; extractions into ebx, which is saved and restored. The count is read
+; before ebx is pushed, so [esp + 4] is still the first argument.
+;------------------------------------------------------------------------------
+ align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+ mov eax, [esp + 4]
+ sal eax, 3 ; Force some repetition.
+ push ebx
+.L1:
+ pextrw ebx, xmm1, 0
+ pextrw ebx, xmm2, 1
+ pextrw ebx, xmm3, 2
+ pextrw ebx, xmm1, 3
+ pextrw ebx, xmm2, 4
+ pextrw ebx, xmm3, 5
+ pextrw ebx, xmm0, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm0, 0
+ pextrw ebx, xmm1, 1
+ pextrw ebx, xmm2, 2
+ pextrw ebx, xmm3, 3
+ pextrw ebx, xmm3, 4
+ pextrw ebx, xmm2, 5
+ pextrw ebx, xmm1, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm1, 0
+ pextrw ebx, xmm2, 1
+ pextrw ebx, xmm3, 2
+ pextrw ebx, xmm1, 3
+ pextrw ebx, xmm2, 4
+ pextrw ebx, xmm3, 5
+ pextrw ebx, xmm0, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm0, 0
+ pextrw ebx, xmm1, 1
+ pextrw ebx, xmm2, 2
+ pextrw ebx, xmm3, 3
+ pextrw ebx, xmm3, 4
+ pextrw ebx, xmm2, 5
+ pextrw ebx, xmm1, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm1, 0
+ pextrw ebx, xmm2, 1
+ pextrw ebx, xmm3, 2
+ pextrw ebx, xmm1, 3
+ pextrw ebx, xmm2, 4
+ pextrw ebx, xmm3, 5
+ pextrw ebx, xmm0, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm0, 0
+ pextrw ebx, xmm1, 1
+ pextrw ebx, xmm2, 2
+ pextrw ebx, xmm3, 3
+ pextrw ebx, xmm3, 4
+ pextrw ebx, xmm2, 5
+ pextrw ebx, xmm1, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm1, 0
+ pextrw ebx, xmm2, 1
+ pextrw ebx, xmm3, 2
+ pextrw ebx, xmm1, 3
+ pextrw ebx, xmm2, 4
+ pextrw ebx, xmm3, 5
+ pextrw ebx, xmm0, 6
+ pextrw ebx, xmm0, 7
+
+ pextrw ebx, xmm0, 0
+ pextrw ebx, xmm1, 1
+ pextrw ebx, xmm2, 2
+ pextrw ebx, xmm3, 3
+ pextrw ebx, xmm3, 4
+ pextrw ebx, xmm2, 5
+ pextrw ebx, xmm1, 6
+ pextrw ebx, xmm0, 7
+
+ dec eax
+ jnz .L1
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector32ToRegister
+; Purpose: Writes 32-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+; Notes: The loop count is multiplied by 4. Each pass performs 32 four-byte
+; extractions (128 bytes) into ebx, which is saved and restored.
+; NOTE(review): Register32ToVector performs 64 transfers per pass with
+; the same multiplier -- confirm the byte accounting in the caller.
+;------------------------------------------------------------------------------
+ align 64
+Vector32ToRegister:
+_Vector32ToRegister:
+ mov eax, [esp + 4]
+ sal eax, 2 ; Force some repetition.
+ push ebx
+.L1:
+ pextrd ebx, xmm1, 0
+ pextrd ebx, xmm2, 1
+ pextrd ebx, xmm3, 2
+ pextrd ebx, xmm1, 3
+ pextrd ebx, xmm2, 0
+ pextrd ebx, xmm3, 1
+ pextrd ebx, xmm0, 2
+ pextrd ebx, xmm0, 3
+
+ pextrd ebx, xmm0, 0
+ pextrd ebx, xmm1, 1
+ pextrd ebx, xmm2, 2
+ pextrd ebx, xmm3, 3
+ pextrd ebx, xmm3, 3
+ pextrd ebx, xmm2, 2
+ pextrd ebx, xmm1, 1
+ pextrd ebx, xmm0, 0
+
+ pextrd ebx, xmm1, 0
+ pextrd ebx, xmm2, 1
+ pextrd ebx, xmm3, 2
+ pextrd ebx, xmm1, 3
+ pextrd ebx, xmm2, 0
+ pextrd ebx, xmm3, 1
+ pextrd ebx, xmm0, 2
+ pextrd ebx, xmm0, 3
+
+ pextrd ebx, xmm0, 0
+ pextrd ebx, xmm1, 1
+ pextrd ebx, xmm2, 2
+ pextrd ebx, xmm3, 3
+ pextrd ebx, xmm3, 3
+ pextrd ebx, xmm2, 2
+ pextrd ebx, xmm1, 1
+ pextrd ebx, xmm0, 0
+
+ dec eax
+ jnz .L1
+ pop ebx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector64ToRegister
+; Purpose: Writes 64-bit vector register values into main register.
+; Params: none are read.
+; Notes: In this file the routine is an empty stub that returns immediately.
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+ ret
+
+;------------------------------------------------------------------------------
+; Name: CopyAVX
+; Purpose: Copies memory chunks that are 32-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+; [esp + 8] = ptr to source memory area
+; [esp + 12] = length in bytes
+; [esp + 16] = loops
+; Notes: The length is rounded down to a multiple of 256. If the rounded
+; length or the loop count is zero, nothing is copied.
+;------------------------------------------------------------------------------
+ align 64
+CopyAVX:
+_CopyAVX:
+ vzeroupper
+ ; Register usage:
+ ; esi = source
+ ; edi = dest
+ ; ecx = loops
+ ; edx = length
+ push esi
+ push edi
+ push ecx
+ push edx
+
+ ; Four registers were pushed, hence the extra +16 on argument offsets.
+ mov edi, [esp + 4 + 16]
+ mov esi, [esp + 8 + 16]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 16 + 16]
+
+ shr edx, 8 ; Ensure length is multiple of 256.
+ shl edx, 8
+
+ ; A zero length or loop count would make the counters below wrap around
+ ; and run far past the buffers, so bail out early.
+ test edx, edx
+ jz .L3
+ test ecx, ecx
+ jz .L3
+
+.L1:
+ mov eax, edx
+ prefetchnta [esi]
+
+.L2:
+ vmovdqa ymm0, [esi]
+ vmovdqa ymm1, [32+esi]
+ vmovdqa ymm2, [64+esi]
+ vmovdqa ymm3, [96+esi]
+
+ vmovdqa [edi], ymm0
+ vmovdqa [32+edi], ymm1
+ vmovdqa [64+edi], ymm2
+ vmovdqa [96+edi], ymm3
+
+ vmovdqa ymm0, [128+esi]
+ vmovdqa ymm1, [128+32+esi]
+ vmovdqa ymm2, [128+64+esi]
+ vmovdqa ymm3, [128+96+esi]
+
+ vmovdqa [128+edi], ymm0
+ vmovdqa [128+32+edi], ymm1
+ vmovdqa [128+64+edi], ymm2
+ vmovdqa [128+96+edi], ymm3
+
+ add esi, 256
+ add edi, 256
+
+ sub eax, 256
+ jnz .L2
+
+ sub esi, edx ; esi now points to start.
+ sub edi, edx ; edi now points to start.
+
+ dec ecx
+ jnz .L1
+
+.L3:
+ vzeroupper ; Leave the upper YMM halves clean for SSE code.
+
+ pop edx
+ pop ecx
+ pop edi
+ pop esi
+ ret
+
+;------------------------------------------------------------------------------
+; Name: CopySSE
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+; [esp + 8] = ptr to source memory area
+; [esp + 12] = length in bytes
+; [esp + 16] = loops
+; Notes: The length is rounded down to a multiple of 128. If the rounded
+; length or the loop count is zero, nothing is copied.
+; xmm4-xmm7 are preserved; xmm0-xmm3 are overwritten.
+;------------------------------------------------------------------------------
+ align 64
+CopySSE:
+_CopySSE:
+ ; Register usage:
+ ; esi = source
+ ; edi = dest
+ ; ecx = loops
+ ; edx = length
+ push esi
+ push edi
+ push ecx
+ push edx
+
+ ; Four registers were pushed, hence the extra +16 on argument offsets.
+ mov edi, [esp + 4 + 16]
+ mov esi, [esp + 8 + 16]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 16 + 16]
+
+ shr edx, 7 ; Ensure length is multiple of 128.
+ shl edx, 7
+
+ ; A zero length or loop count would make the counters below wrap around
+ ; and run far past the buffers, so bail out before touching anything.
+ test edx, edx
+ jz .L3
+ test ecx, ecx
+ jz .L3
+
+ ; Save our non-parameter XMM registers.
+ sub esp, 64
+ movdqu [esp], xmm4
+ movdqu [16+esp], xmm5
+ movdqu [32+esp], xmm6
+ movdqu [48+esp], xmm7
+
+.L1:
+ mov eax, edx
+
+.L2:
+ prefetchnta [esi]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [16+esi]
+ movdqa xmm2, [32+esi]
+ movdqa xmm3, [48+esi]
+ movdqa xmm4, [64+esi]
+ movdqa xmm5, [80+esi]
+ movdqa xmm6, [96+esi]
+ movdqa xmm7, [112+esi]
+
+ ; 32-bit lacks xmm8 - xmm15.
+
+ movdqa [edi], xmm0
+ movdqa [16+edi], xmm1
+ movdqa [32+edi], xmm2
+ movdqa [48+edi], xmm3
+ movdqa [64+edi], xmm4
+ movdqa [80+edi], xmm5
+ movdqa [96+edi], xmm6
+ movdqa [112+edi], xmm7
+
+ add esi, 128
+ add edi, 128
+
+ sub eax, 128
+ jnz .L2
+
+ sub esi, edx ; esi now points to start.
+ sub edi, edx ; edi now points to start.
+
+ dec ecx
+ jnz .L1
+
+ ; Restore the XMM registers saved above.
+ movdqu xmm4, [0+esp]
+ movdqu xmm5, [16+esp]
+ movdqu xmm6, [32+esp]
+ movdqu xmm7, [48+esp]
+ add esp, 64
+
+.L3:
+ pop edx
+ pop ecx
+ pop edi
+ pop esi
+ ret
+
+;------------------------------------------------------------------------------
+; Name: CopySSE_128bytes
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+; [esp + 8] = ptr to source memory area
+; [esp + 12] = length in bytes
+; [esp + 16] = loops
+; Notes: Alias for CopySSE. The tail jump leaves the stack untouched, so
+; CopySSE sees the same arguments and returns directly to our caller.
+;------------------------------------------------------------------------------
+ align 64
+CopySSE_128bytes:
+_CopySSE_128bytes:
+ jmp CopySSE
+
diff --git a/routines64.asm b/routines64.asm
new file mode 100755
index 0000000..e49b75a
--- /dev/null
+++ b/routines64.asm
@@ -0,0 +1,2590 @@
+;============================================================================
+; bandwidth, a benchmark to estimate memory transfer bandwidth.
+; Copyright (C) 2005-2014 by Zack T Smith.
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+;
+; The author may be reached at veritas@comcast.net.
+;=============================================================================
+
+bits 64
+cpu ia64
+
+global CopySSE
+global CopySSE_128bytes
+
+global CopyAVX
+global _CopyAVX
+
+global ReaderLODSQ
+global _ReaderLODSQ
+
+global ReaderLODSD
+global _ReaderLODSD
+
+global ReaderLODSW
+global _ReaderLODSW
+
+global ReaderLODSB
+global _ReaderLODSB
+
+global RandomReader
+global RandomReaderSSE2
+global RandomReaderSSE2_bypass
+global RandomWriter
+global RandomWriterSSE2
+global RandomWriterSSE2_bypass
+global Reader
+global Reader_128bytes
+global ReaderAVX
+global ReaderSSE2
+global ReaderSSE2_128bytes
+global ReaderSSE2_bypass
+global ReaderSSE2_128bytes_bypass
+global Register16ToVector
+global Register32ToVector
+global Register64ToVector
+global Register8ToVector
+global RegisterToRegister
+global RegisterToVector
+global StackReader
+global StackWriter
+global Vector16ToRegister
+global Vector32ToRegister
+global Vector64ToRegister
+global Vector8ToRegister
+global VectorToRegister
+global VectorToVector
+global VectorToVectorAVX
+global Writer
+global Writer_128bytes
+global WriterAVX
+global WriterSSE2
+global WriterSSE2_128bytes
+global WriterSSE2_bypass
+global WriterSSE2_128bytes_bypass
+global WriterAVX_bypass
+global _WriterAVX_bypass
+global _CopySSE
+global _CopySSE_128bytes
+global _RandomReader
+global _RandomReaderSSE2
+global _RandomReaderSSE2_bypass
+global _RandomWriter
+global _RandomWriterSSE2
+global _RandomWriterSSE2_bypass
+global _Reader
+global _ReaderAVX
+global _Reader_128bytes
+global _ReaderSSE2
+global _ReaderSSE2_bypass
+global _ReaderSSE2_128bytes
+global _ReaderSSE2_128bytes_bypass
+global _Register16ToVector
+global _Register32ToVector
+global _Register64ToVector
+global _Register8ToVector
+global _RegisterToRegister
+global _RegisterToVector
+global _StackReader
+global _StackWriter
+global _Vector16ToRegister
+global _Vector32ToRegister
+global _Vector64ToRegister
+global _Vector8ToRegister
+global _VectorToRegister
+global _VectorToVector
+global _VectorToVectorAVX
+global _Writer
+global _Writer_128bytes
+global _WriterSSE2
+global _WriterAVX
+global _WriterSSE2_128bytes
+global _WriterSSE2_bypass
+global _WriterSSE2_128bytes_bypass
+
+global get_cpuid_cache_info
+global _get_cpuid_cache_info
+
+global get_cpuid_family
+global _get_cpuid_family
+
+global get_cpuid1_ecx
+global _get_cpuid1_ecx
+
+global get_cpuid1_edx
+global _get_cpuid1_edx
+
+global get_cpuid7_ebx
+global _get_cpuid7_ebx
+
+global get_cpuid_80000001_ecx
+global _get_cpuid_80000001_ecx
+
+global get_cpuid_80000001_edx
+global _get_cpuid_80000001_edx
+
+; Note:
+; The System V AMD64 (Unix) ABI passes integer parameters in these
+; registers, in this order:
+; rdi, rsi, rdx, rcx, r8, r9
+
+ section .text
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_cache_info
+; Purpose: Executes CPUID leaf 4 with the given subleaf index and stores
+; the four result registers.
+; Params: rdi = ptr to four 32-bit words that receive eax, ebx, ecx, edx
+; rsi = subleaf index (only the low 32 bits are used by CPUID)
+; Notes: rbx, rcx and rdx are preserved. rax is left holding CPUID's eax.
+;
+get_cpuid_cache_info:
+_get_cpuid_cache_info:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ mov ecx, esi ; subleaf selector
+ mov eax, 4 ; leaf number
+ cpuid
+ ; Store the results; the order of these stores does not matter.
+ mov [rdi+12], edx
+ mov [rdi+8], ecx
+ mov [rdi+4], ebx
+ mov [rdi], eax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_family
+; Purpose: Executes CPUID leaf 0 and stores ebx, edx, ecx (in that order)
+; as 12 bytes, followed by a terminating zero byte.
+; Params: rdi = ptr to a buffer of at least 13 bytes
+; Notes: rbx, rcx and rdx are preserved. rax is left holding CPUID's eax.
+;
+get_cpuid_family:
+_get_cpuid_family:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ xor eax, eax ; leaf 0 (writing eax also clears the top of rax)
+ cpuid
+ mov byte [rdi+12], 0 ; terminator after the 12 data bytes
+ mov [rdi+8], ecx
+ mov [rdi+4], edx
+ mov [rdi], ebx
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_ecx
+; Purpose: Returns (in rax) the ecx value produced by CPUID leaf 1,
+; zero-extended to 64 bits.
+; Notes: rbx, rcx and rdx are preserved.
+;
+get_cpuid1_ecx:
+_get_cpuid1_ecx:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ mov eax, 1 ; leaf 1
+ cpuid
+ mov eax, ecx ; 32-bit move zero-extends into rax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid7_ebx
+; Purpose: Returns (in rax) the ebx value produced by CPUID leaf 7,
+; subleaf 0, zero-extended to 64 bits.
+; Notes: rbx, rcx and rdx are preserved.
+;
+get_cpuid7_ebx:
+_get_cpuid7_ebx:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ xor ecx, ecx ; subleaf 0
+ mov eax, 7 ; leaf 7
+ cpuid
+ mov eax, ebx ; 32-bit move zero-extends into rax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_edx
+; Purpose: Returns (in rax) the edx value produced by CPUID leaf 1,
+; zero-extended to 64 bits.
+; Notes: rbx, rcx and rdx are preserved.
+;
+get_cpuid1_edx:
+_get_cpuid1_edx:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ mov eax, 1 ; leaf 1
+ cpuid
+ mov eax, edx ; 32-bit move zero-extends into rax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_ecx
+; Purpose: Returns (in rax) the ecx value produced by CPUID extended
+; leaf 0x80000001, zero-extended to 64 bits.
+; Notes: rbx, rcx and rdx are preserved.
+;
+get_cpuid_80000001_ecx:
+_get_cpuid_80000001_ecx:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ mov eax, 0x80000001 ; extended leaf
+ cpuid
+ mov eax, ecx ; 32-bit move zero-extends into rax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_edx
+; Purpose: Returns (in rax) the edx value produced by CPUID extended
+; leaf 0x80000001, zero-extended to 64 bits.
+; Notes: rbx, rcx and rdx are preserved.
+;
+get_cpuid_80000001_edx:
+_get_cpuid_80000001_edx:
+ ; CPUID overwrites ebx, ecx and edx, so keep the caller's copies.
+ push rbx
+ push rcx
+ push rdx
+ mov eax, 0x80000001 ; extended leaf
+ cpuid
+ mov eax, edx ; 32-bit move zero-extends into rax
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSQ
+; Purpose: Reads 64-bit values sequentially from an area of memory
+; using LODSQ instruction.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: The length is converted to a quadword count, rounded down.
+; The loop counter is decremented before it is tested, so a
+; loop count of 0 wraps around instead of skipping the work.
+; rcx, r10 and r11 are preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSQ:
+_ReaderLODSQ:
+ push rcx ; REP counter
+ push r10
+ push r11
+ mov r10, rdi ; r10 = buffer start, kept across passes
+ mov r11, rsi
+ shr r11, 3 ; length in quadwords rounded down.
+
+.L1:
+ ; REP LODSQ advances rsi and counts rcx down, so reload both per pass.
+ mov rsi, r10 ; buffer start
+ mov rcx, r11 ; # of quadwords
+
+ rep lodsq
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ pop rcx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSD
+; Purpose: Reads 32-bit values sequentially from an area of memory
+; using LODSD instruction.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: The length is converted to a doubleword count, rounded down.
+; The loop counter is decremented before it is tested, so a
+; loop count of 0 wraps around instead of skipping the work.
+; rcx, r10 and r11 are preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSD:
+_ReaderLODSD:
+ push rcx ; REP counter
+ push r10
+ push r11
+ mov r10, rdi ; r10 = buffer start, kept across passes
+ mov r11, rsi
+ shr r11, 2 ; length in double words rounded down.
+
+.L1:
+ ; REP LODSD advances rsi and counts rcx down, so reload both per pass.
+ mov rsi, r10 ; buffer start
+ mov rcx, r11 ; # of double words
+
+ rep lodsd
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ pop rcx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSW
+; Purpose: Reads 16-bit values sequentially from an area of memory
+; using LODSW instruction.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: The length is converted to a word count, rounded down.
+; The loop counter is decremented before it is tested, so a
+; loop count of 0 wraps around instead of skipping the work.
+; rcx, r10 and r11 are preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSW:
+_ReaderLODSW:
+ push rcx ; REP counter
+ push r10
+ push r11
+ mov r10, rdi ; r10 = buffer start, kept across passes
+ mov r11, rsi
+ shr r11, 1 ; length in words rounded down.
+
+.L1:
+ ; REP LODSW advances rsi and counts rcx down, so reload both per pass.
+ mov rsi, r10 ; buffer start
+ mov rcx, r11 ; # of words
+
+ rep lodsw
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ pop rcx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSB
+; Purpose: Reads 8-bit values sequentially from an area of memory
+; using LODSB instruction.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: The byte length is used directly as the REP count.
+; The loop counter is decremented before it is tested, so a
+; loop count of 0 wraps around instead of skipping the work.
+; rcx, r10 and r11 are preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 32
+ReaderLODSB:
+_ReaderLODSB:
+ push rcx ; REP counter
+ push r10
+ push r11
+ mov r10, rdi ; r10 = buffer start, kept across passes
+ mov r11, rsi ; r11 = byte count, kept across passes
+
+.L1:
+ ; REP LODSB advances rsi and counts rcx down, so reload both per pass.
+ mov rsi, r10 ; buffer start
+ mov rcx, r11 ; # of bytes
+
+ rep lodsb
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ pop rcx
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Reader
+; Purpose: Reads 64-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Each inner iteration reads one 256-byte chunk as 32 quadword
+; loads. The end-of-buffer test comes after the reads, so at
+; least one chunk is always read, and a length that is not a
+; multiple of 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+Reader:
+_Reader:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ ; The loaded values are discarded; only the memory reads matter.
+ mov rax, [r10]
+ mov rax, [8+r10]
+ mov rax, [16+r10]
+ mov rax, [24+r10]
+ mov rax, [32+r10]
+ mov rax, [40+r10]
+ mov rax, [48+r10]
+ mov rax, [56+r10]
+ mov rax, [64+r10]
+ mov rax, [72+r10]
+ mov rax, [80+r10]
+ mov rax, [88+r10]
+ mov rax, [96+r10]
+ mov rax, [104+r10]
+ mov rax, [112+r10]
+ mov rax, [120+r10]
+ mov rax, [128+r10]
+ mov rax, [136+r10]
+ mov rax, [144+r10]
+ mov rax, [152+r10]
+ mov rax, [160+r10]
+ mov rax, [168+r10]
+ mov rax, [176+r10]
+ mov rax, [184+r10]
+ mov rax, [192+r10]
+ mov rax, [200+r10]
+ mov rax, [208+r10]
+ mov rax, [216+r10]
+ mov rax, [224+r10]
+ mov rax, [232+r10]
+ mov rax, [240+r10]
+ mov rax, [248+r10]
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Reader_128bytes
+; Purpose: Reads 64-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Same as Reader, but each inner iteration reads one 128-byte
+; chunk (16 quadword loads). The end-of-buffer test comes after
+; the reads, so at least one chunk is always read, and a length
+; that is not a multiple of 128 makes the last chunk run past
+; the end. A loop count of 0 wraps around.
+; r10 is preserved; rax, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+Reader_128bytes:
+_Reader_128bytes:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ ; The loaded values are discarded; only the memory reads matter.
+ mov rax, [r10]
+ mov rax, [8+r10]
+ mov rax, [16+r10]
+ mov rax, [24+r10]
+ mov rax, [32+r10]
+ mov rax, [40+r10]
+ mov rax, [48+r10]
+ mov rax, [56+r10]
+ mov rax, [64+r10]
+ mov rax, [72+r10]
+ mov rax, [80+r10]
+ mov rax, [88+r10]
+ mov rax, [96+r10]
+ mov rax, [104+r10]
+ mov rax, [112+r10]
+ mov rax, [120+r10]
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReader
+; Purpose: Reads 64-bit values randomly from an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; Notes: The chunk pointers are visited in array order; any randomness
+; in chunk order comes from how the caller filled the array.
+; Within a chunk, all 32 quadwords of a 256-byte block are read
+; once each, in a fixed scrambled order.
+; The chunk-count test comes after the reads, so the first chunk
+; is read even if rsi is 0. A loop count of 0 wraps around.
+; r10 and r11 are preserved; rax and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomReader:
+_RandomReader:
+ push r10
+ push r11
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+ mov rax, [96+r10]
+ mov rax, [r10]
+ mov rax, [120+r10]
+ mov rax, [184+r10]
+ mov rax, [160+r10]
+ mov rax, [176+r10]
+ mov rax, [112+r10]
+ mov rax, [80+r10]
+ mov rax, [32+r10]
+ mov rax, [128+r10]
+ mov rax, [88+r10]
+ mov rax, [40+r10]
+ mov rax, [48+r10]
+ mov rax, [72+r10]
+ mov rax, [200+r10]
+ mov rax, [24+r10]
+ mov rax, [152+r10]
+ mov rax, [16+r10]
+ mov rax, [248+r10]
+ mov rax, [56+r10]
+ mov rax, [240+r10]
+ mov rax, [208+r10]
+ mov rax, [104+r10]
+ mov rax, [216+r10]
+ mov rax, [136+r10]
+ mov rax, [232+r10]
+ mov rax, [64+r10]
+ mov rax, [224+r10]
+ mov rax, [144+r10]
+ mov rax, [192+r10]
+ mov rax, [8+r10]
+ mov rax, [168+r10]
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2
+; Purpose: Reads 128-bit values randomly from an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; Notes: Each chunk pointer must be 16-byte aligned (MOVDQA is used).
+; The chunk pointers are visited in array order; within a chunk
+; all sixteen 16-byte slots of a 256-byte block are read once
+; each, in a fixed scrambled order.
+; The chunk-count test comes after the reads, so the first chunk
+; is read even if rsi is 0. A loop count of 0 wraps around.
+; r10 and r11 are preserved; xmm0 and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomReaderSSE2:
+_RandomReaderSSE2:
+ push r10
+ push r11
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; r10 = address of the current chunk
+
+ movdqa xmm0, [240+r10]
+ movdqa xmm0, [128+r10]
+ movdqa xmm0, [64+r10]
+ movdqa xmm0, [208+r10]
+ movdqa xmm0, [112+r10]
+ movdqa xmm0, [176+r10]
+ movdqa xmm0, [144+r10]
+ movdqa xmm0, [r10]
+ movdqa xmm0, [96+r10]
+ movdqa xmm0, [16+r10]
+ movdqa xmm0, [192+r10]
+ movdqa xmm0, [160+r10]
+ movdqa xmm0, [32+r10]
+ movdqa xmm0, [48+r10]
+ movdqa xmm0, [224+r10]
+ movdqa xmm0, [80+r10]
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2_bypass
+; Purpose: Reads 128-bit values randomly from an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; Notes: Same access pattern as RandomReaderSSE2 but uses MOVNTDQA, the
+; non-temporal load, which needs SSE4.1 and 16-byte-aligned
+; addresses.
+; The chunk-count test comes after the reads, so the first chunk
+; is read even if rsi is 0. A loop count of 0 wraps around.
+; r10 and r11 are preserved; xmm0 and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomReaderSSE2_bypass:
+_RandomReaderSSE2_bypass:
+ push r10
+ push r11
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; r10 = address of the current chunk
+
+ ; SSE 4.1 required
+ movntdqa xmm0, [240+r10]
+ movntdqa xmm0, [r10]
+ movntdqa xmm0, [128+r10]
+ movntdqa xmm0, [64+r10]
+ movntdqa xmm0, [208+r10]
+ movntdqa xmm0, [112+r10]
+ movntdqa xmm0, [48+r10]
+ movntdqa xmm0, [176+r10]
+ movntdqa xmm0, [144+r10]
+ movntdqa xmm0, [96+r10]
+ movntdqa xmm0, [16+r10]
+ movntdqa xmm0, [160+r10]
+ movntdqa xmm0, [32+r10]
+ movntdqa xmm0, [224+r10]
+ movntdqa xmm0, [80+r10]
+ movntdqa xmm0, [192+r10]
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriter
+; Purpose: Writes 64-bit values randomly to an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; rcx = datum to write
+; Notes: The chunk pointers are visited in array order; within a chunk
+; all 32 quadwords of a 256-byte block are written once each,
+; in a fixed scrambled order.
+; The chunk-count test comes after the writes, so the first
+; chunk is written even if rsi is 0. A loop count of 0 wraps.
+; r10 and r11 are preserved; rdx is modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriter:
+_RandomWriter:
+ push r10
+ push r11
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+ mov [96+r10], rcx
+ mov [r10], rcx
+ mov [120+r10], rcx
+ mov [184+r10], rcx
+ mov [160+r10], rcx
+ mov [176+r10], rcx
+ mov [112+r10], rcx
+ mov [80+r10], rcx
+ mov [32+r10], rcx
+ mov [128+r10], rcx
+ mov [88+r10], rcx
+ mov [40+r10], rcx
+ mov [48+r10], rcx
+ mov [72+r10], rcx
+ mov [200+r10], rcx
+ mov [24+r10], rcx
+ mov [152+r10], rcx
+ mov [16+r10], rcx
+ mov [248+r10], rcx
+ mov [56+r10], rcx
+ mov [240+r10], rcx
+ mov [208+r10], rcx
+ mov [104+r10], rcx
+ mov [216+r10], rcx
+ mov [136+r10], rcx
+ mov [232+r10], rcx
+ mov [64+r10], rcx
+ mov [224+r10], rcx
+ mov [144+r10], rcx
+ mov [192+r10], rcx
+ mov [8+r10], rcx
+ mov [168+r10], rcx
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2
+; Purpose: Writes 128-bit values randomly to an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; rcx = datum to write
+; Notes: The 64-bit datum is duplicated into both halves of xmm0, so
+; every quadword of each 256-byte chunk receives the datum.
+; Each chunk pointer must be 16-byte aligned (MOVDQA is used).
+; The chunk pointers are visited in array order; within a chunk
+; all sixteen 16-byte slots are written once each, in a fixed
+; scrambled order.
+; The chunk-count test comes after the writes, so the first
+; chunk is written even if rsi is 0. A loop count of 0 wraps.
+; r10 and r11 are preserved; xmm0, xmm1 and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriterSSE2:
+_RandomWriterSSE2:
+ push r10
+ push r11
+
+ movq xmm0, rcx ; Create duplicated 128-bit datum
+ movq xmm1, rcx
+ ; PSLLDQ shifts by BYTES, not bits: 8 bytes moves the datum into the
+ ; upper quadword. (A count above 15 would clear the whole register.)
+ pslldq xmm1, 8
+ por xmm0, xmm1
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+ movdqa [240+r10], xmm0
+ movdqa [128+r10], xmm0
+ movdqa [208+r10], xmm0
+ movdqa [112+r10], xmm0
+ movdqa [64+r10], xmm0
+ movdqa [176+r10], xmm0
+ movdqa [144+r10], xmm0
+ movdqa [r10], xmm0
+ movdqa [96+r10], xmm0
+ movdqa [16+r10], xmm0
+ movdqa [192+r10], xmm0
+ movdqa [160+r10], xmm0
+ movdqa [32+r10], xmm0
+ movdqa [48+r10], xmm0
+ movdqa [224+r10], xmm0
+ movdqa [80+r10], xmm0
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2_bypass
+; Purpose: Writes 128-bit values randomly into memory, bypassing caches.
+; Params: rdi = ptr to array of chunk pointers
+; rsi = # of chunks
+; rdx = loops
+; rcx = datum to write
+; Notes: The 64-bit datum is duplicated into both halves of xmm0, so
+; every quadword of each 256-byte chunk receives the datum.
+; Uses MOVNTDQ (non-temporal store); each chunk pointer must be
+; 16-byte aligned.
+; The chunk pointers are visited in array order; within a chunk
+; all sixteen 16-byte slots are written once each, in a fixed
+; scrambled order.
+; The chunk-count test comes after the writes, so the first
+; chunk is written even if rsi is 0. A loop count of 0 wraps.
+; r10 and r11 are preserved; xmm0, xmm1 and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+ push r10
+ push r11
+
+ movq xmm0, rcx ; Create duplicated 128-bit datum
+ movq xmm1, rcx
+ ; PSLLDQ shifts by BYTES, not bits: 8 bytes moves the datum into the
+ ; upper quadword. (A count above 15 would clear the whole register.)
+ pslldq xmm1, 8
+ por xmm0, xmm1
+
+.L1:
+ xor r11, r11 ; r11 = index of the current chunk
+
+.L2:
+ mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+ movntdq [240+r10], xmm0
+ movntdq [128+r10], xmm0
+ movntdq [208+r10], xmm0
+ movntdq [112+r10], xmm0
+ movntdq [64+r10], xmm0
+ movntdq [176+r10], xmm0
+ movntdq [144+r10], xmm0
+ movntdq [r10], xmm0
+ movntdq [96+r10], xmm0
+ movntdq [16+r10], xmm0
+ movntdq [192+r10], xmm0
+ movntdq [160+r10], xmm0
+ movntdq [32+r10], xmm0
+ movntdq [48+r10], xmm0
+ movntdq [224+r10], xmm0
+ movntdq [80+r10], xmm0
+
+ inc r11
+ cmp r11, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r11
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_128bytes
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Each inner iteration reads one 128-byte chunk as eight aligned
+; 16-byte loads, so the buffer must be 16-byte aligned.
+; The end-of-buffer test comes after the reads, so at least one
+; chunk is always read, and a length that is not a multiple of
+; 128 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_128bytes:
+_ReaderSSE2_128bytes:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movdqa xmm0, [r10] ; Read aligned to 16-byte boundary.
+ movdqa xmm0, [16+r10]
+ movdqa xmm0, [32+r10]
+ movdqa xmm0, [48+r10]
+ movdqa xmm0, [64+r10]
+ movdqa xmm0, [80+r10]
+ movdqa xmm0, [96+r10]
+ movdqa xmm0, [112+r10]
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Each inner iteration reads one 256-byte chunk as sixteen
+; aligned 16-byte loads, so the buffer must be 16-byte aligned.
+; The end-of-buffer test comes after the reads, so at least one
+; chunk is always read, and a length that is not a multiple of
+; 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2:
+_ReaderSSE2:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movdqa xmm0, [r10] ; Read aligned to 16-byte boundary.
+ movdqa xmm0, [16+r10]
+ movdqa xmm0, [32+r10]
+ movdqa xmm0, [48+r10]
+ movdqa xmm0, [64+r10]
+ movdqa xmm0, [80+r10]
+ movdqa xmm0, [96+r10]
+ movdqa xmm0, [112+r10]
+
+ movdqa xmm0, [128+r10]
+ movdqa xmm0, [144+r10]
+ movdqa xmm0, [160+r10]
+ movdqa xmm0, [176+r10]
+ movdqa xmm0, [192+r10]
+ movdqa xmm0, [208+r10]
+ movdqa xmm0, [224+r10]
+ movdqa xmm0, [240+r10]
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: ReaderAVX
+; Purpose: Reads 256-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Each inner iteration reads one 256-byte chunk as eight aligned
+; 32-byte loads, so the buffer must be 32-byte aligned.
+; The end-of-buffer test comes after the reads, so at least one
+; chunk is always read, and a length that is not a multiple of
+; 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; ymm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+ReaderAVX:
+_ReaderAVX:
+ vzeroupper ; Clear the upper halves of all YMM registers.
+
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ vmovdqa ymm0, [r10] ; Read aligned to 32-byte boundary.
+ vmovdqa ymm0, [32+r10]
+ vmovdqa ymm0, [64+r10]
+ vmovdqa ymm0, [96+r10]
+ vmovdqa ymm0, [128+r10]
+ vmovdqa ymm0, [160+r10]
+ vmovdqa ymm0, [192+r10]
+ vmovdqa ymm0, [224+r10]
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_bypass
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Same as ReaderSSE2 but uses MOVNTDQA, the non-temporal load,
+; which needs SSE4.1 and a 16-byte-aligned buffer.
+; Each inner iteration reads one 256-byte chunk. The
+; end-of-buffer test comes after the reads, so at least one
+; chunk is always read, and a length that is not a multiple of
+; 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_bypass:
+_ReaderSSE2_bypass:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movntdqa xmm0, [r10] ; Read aligned to 16-byte boundary.
+ movntdqa xmm0, [16+r10]
+ movntdqa xmm0, [32+r10]
+ movntdqa xmm0, [48+r10]
+ movntdqa xmm0, [64+r10]
+ movntdqa xmm0, [80+r10]
+ movntdqa xmm0, [96+r10]
+ movntdqa xmm0, [112+r10]
+
+ movntdqa xmm0, [128+r10]
+ movntdqa xmm0, [144+r10]
+ movntdqa xmm0, [160+r10]
+ movntdqa xmm0, [176+r10]
+ movntdqa xmm0, [192+r10]
+ movntdqa xmm0, [208+r10]
+ movntdqa xmm0, [224+r10]
+ movntdqa xmm0, [240+r10]
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_128bytes_bypass
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; Notes: Same as ReaderSSE2_128bytes but uses MOVNTDQA, the
+; non-temporal load, which needs SSE4.1 and a 16-byte-aligned
+; buffer. Each inner iteration reads one 128-byte chunk. The
+; end-of-buffer test comes after the reads, so at least one
+; chunk is always read, and a length that is not a multiple of
+; 128 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+ReaderSSE2_128bytes_bypass:
+_ReaderSSE2_128bytes_bypass:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movntdqa xmm0, [r10] ; Read aligned to 16-byte boundary.
+ movntdqa xmm0, [16+r10]
+ movntdqa xmm0, [32+r10]
+ movntdqa xmm0, [48+r10]
+ movntdqa xmm0, [64+r10]
+ movntdqa xmm0, [80+r10]
+ movntdqa xmm0, [96+r10]
+ movntdqa xmm0, [112+r10]
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Writer
+; Purpose: Writes 64-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: Each inner iteration writes one 256-byte chunk as 32 quadword
+; stores. The end-of-buffer test comes after the writes, so at
+; least one chunk is always written, and a length that is not a
+; multiple of 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+Writer:
+_Writer:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ mov [r10], rcx
+ mov [8+r10], rcx
+ mov [16+r10], rcx
+ mov [24+r10], rcx
+ mov [32+r10], rcx
+ mov [40+r10], rcx
+ mov [48+r10], rcx
+ mov [56+r10], rcx
+ mov [64+r10], rcx
+ mov [72+r10], rcx
+ mov [80+r10], rcx
+ mov [88+r10], rcx
+ mov [96+r10], rcx
+ mov [104+r10], rcx
+ mov [112+r10], rcx
+ mov [120+r10], rcx
+ mov [128+r10], rcx
+ mov [136+r10], rcx
+ mov [144+r10], rcx
+ mov [152+r10], rcx
+ mov [160+r10], rcx
+ mov [168+r10], rcx
+ mov [176+r10], rcx
+ mov [184+r10], rcx
+ mov [192+r10], rcx
+ mov [200+r10], rcx
+ mov [208+r10], rcx
+ mov [216+r10], rcx
+ mov [224+r10], rcx
+ mov [232+r10], rcx
+ mov [240+r10], rcx
+ mov [248+r10], rcx
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Writer_128bytes
+; Purpose: Writes 64-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: Same as Writer, but each inner iteration writes one 128-byte
+; chunk (16 quadword stores). The end-of-buffer test comes
+; after the writes, so at least one chunk is always written,
+; and a length that is not a multiple of 128 makes the last
+; chunk run past the end. A loop count of 0 wraps around.
+; r10 is preserved; rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+Writer_128bytes:
+_Writer_128bytes:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ mov [r10], rcx
+ mov [8+r10], rcx
+ mov [16+r10], rcx
+ mov [24+r10], rcx
+ mov [32+r10], rcx
+ mov [40+r10], rcx
+ mov [48+r10], rcx
+ mov [56+r10], rcx
+ mov [64+r10], rcx
+ mov [72+r10], rcx
+ mov [80+r10], rcx
+ mov [88+r10], rcx
+ mov [96+r10], rcx
+ mov [104+r10], rcx
+ mov [112+r10], rcx
+ mov [120+r10], rcx
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2
+; Purpose: Writes 128-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: MOVQ places the datum in the low 64 bits of xmm0 and zeroes
+; the high 64 bits, so each 16-byte store writes the datum
+; followed by a zero quadword.
+; Each inner iteration writes one 256-byte chunk with aligned
+; stores, so the buffer must be 16-byte aligned. The
+; end-of-buffer test comes after the writes, so at least one
+; chunk is always written, and a length that is not a multiple
+; of 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2:
+_WriterSSE2:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ movq xmm0, rcx ; xmm0 = datum in the low quadword, zero above.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movdqa [r10], xmm0
+ movdqa [16+r10], xmm0
+ movdqa [32+r10], xmm0
+ movdqa [48+r10], xmm0
+ movdqa [64+r10], xmm0
+ movdqa [80+r10], xmm0
+ movdqa [96+r10], xmm0
+ movdqa [112+r10], xmm0
+
+ movdqa [128+r10], xmm0
+ movdqa [144+r10], xmm0
+ movdqa [160+r10], xmm0
+ movdqa [176+r10], xmm0
+ movdqa [192+r10], xmm0
+ movdqa [208+r10], xmm0
+ movdqa [224+r10], xmm0
+ movdqa [240+r10], xmm0
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterAVX
+; Purpose: Writes 256-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: The datum is inserted into both quadwords of xmm0 (the low
+; half of ymm0). The upper half of ymm0 was cleared by
+; VZEROUPPER and is not written afterwards, so each 32-byte
+; store writes datum, datum, 0, 0.
+; Each inner iteration writes one 256-byte chunk with aligned
+; 32-byte stores, so the buffer must be 32-byte aligned. The
+; end-of-buffer test comes after the writes, so at least one
+; chunk is always written, and a length that is not a multiple
+; of 256 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; ymm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterAVX:
+_WriterAVX:
+ vzeroupper ; Clear the upper halves of all YMM registers.
+
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ pinsrq xmm0, rcx, 0 ; Low quadword of xmm0 = datum.
+ pinsrq xmm0, rcx, 1 ; High quadword of xmm0 = datum.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ vmovdqa [r10], ymm0
+ vmovdqa [32+r10], ymm0
+ vmovdqa [64+r10], ymm0
+ vmovdqa [96+r10], ymm0
+ vmovdqa [128+r10], ymm0
+ vmovdqa [160+r10], ymm0
+ vmovdqa [192+r10], ymm0
+ vmovdqa [224+r10], ymm0
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_128bytes
+; Purpose: Writes 128-bit value sequentially to an area of memory,
+; chunks are 128 bytes rather than 256.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: MOVQ places the datum in the low 64 bits of xmm0 and zeroes
+; the high 64 bits, so each 16-byte store writes the datum
+; followed by a zero quadword.
+; The buffer must be 16-byte aligned (MOVDQA is used). The
+; end-of-buffer test comes after the writes, so at least one
+; chunk is always written, and a length that is not a multiple
+; of 128 makes the last chunk run past the end.
+; A loop count of 0 wraps around instead of skipping the work.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_128bytes:
+_WriterSSE2_128bytes:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ movq xmm0, rcx ; xmm0 = datum in the low quadword, zero above.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movdqa [r10], xmm0
+ movdqa [16+r10], xmm0
+ movdqa [32+r10], xmm0
+ movdqa [48+r10], xmm0
+ movdqa [64+r10], xmm0
+ movdqa [80+r10], xmm0
+ movdqa [96+r10], xmm0
+ movdqa [112+r10], xmm0
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_bypass
+; Purpose: Writes 128-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: Same as WriterSSE2 but uses MOVNTDQ (non-temporal store).
+; MOVQ places the datum in the low 64 bits of xmm0 and zeroes
+; the high 64 bits, so each 16-byte store writes the datum
+; followed by a zero quadword.
+; The buffer must be 16-byte aligned. Each inner iteration
+; writes one 256-byte chunk. The end-of-buffer test comes after
+; the writes, so at least one chunk is always written, and a
+; length that is not a multiple of 256 makes the last chunk run
+; past the end. A loop count of 0 wraps around.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ movq xmm0, rcx ; xmm0 = datum in the low quadword, zero above.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movntdq [r10], xmm0 ; Write bypassing cache.
+ movntdq [16+r10], xmm0
+ movntdq [32+r10], xmm0
+ movntdq [48+r10], xmm0
+ movntdq [64+r10], xmm0
+ movntdq [80+r10], xmm0
+ movntdq [96+r10], xmm0
+ movntdq [112+r10], xmm0
+
+ movntdq [128+r10], xmm0
+ movntdq [144+r10], xmm0
+ movntdq [160+r10], xmm0
+ movntdq [176+r10], xmm0
+ movntdq [192+r10], xmm0
+ movntdq [208+r10], xmm0
+ movntdq [224+r10], xmm0
+ movntdq [240+r10], xmm0
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterAVX_bypass
+; Purpose: Writes 256-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: Uses VMOVNTDQ (non-temporal store) on the full 256-bit ymm0,
+; so all 32 bytes at each 32-byte step are written and one
+; inner iteration covers a whole 256-byte chunk.
+; VMOVQ places the datum in the low 64 bits of ymm0 and zeroes
+; the rest, so each store writes datum, 0, 0, 0.
+; The buffer must be 32-byte aligned. The end-of-buffer test
+; comes after the writes, so at least one chunk is always
+; written, and a length that is not a multiple of 256 makes the
+; last chunk run past the end. A loop count of 0 wraps around.
+; r10 is preserved; ymm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterAVX_bypass:
+_WriterAVX_bypass:
+ vzeroupper
+
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ vmovq xmm0, rcx ; ymm0 = datum in the low quadword, zero above.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ ; Store the full 32-byte register: storing only xmm0 here would leave
+ ; the upper 16 bytes of every 32-byte slot unwritten.
+ vmovntdq [r10], ymm0 ; Write bypassing cache.
+ vmovntdq [32+r10], ymm0
+ vmovntdq [64+r10], ymm0
+ vmovntdq [96+r10], ymm0
+ vmovntdq [128+r10], ymm0
+ vmovntdq [160+r10], ymm0
+ vmovntdq [192+r10], ymm0
+ vmovntdq [224+r10], ymm0
+
+ add r10, 256
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: WriterSSE2_128bytes_bypass
+; Purpose: Writes 128-bit value sequentially to an area of memory.
+; Params: rdi = ptr to memory area
+; rsi = length in bytes
+; rdx = loops
+; rcx = quad to write
+; Notes: Same as WriterSSE2_128bytes but uses MOVNTDQ (non-temporal
+; store). MOVQ places the datum in the low 64 bits of xmm0 and
+; zeroes the high 64 bits, so each 16-byte store writes the
+; datum followed by a zero quadword.
+; The buffer must be 16-byte aligned. Each inner iteration
+; writes one 128-byte chunk. The end-of-buffer test comes after
+; the writes, so at least one chunk is always written, and a
+; length that is not a multiple of 128 makes the last chunk run
+; past the end. A loop count of 0 wraps around.
+; r10 is preserved; xmm0, rsi and rdx are modified.
+;------------------------------------------------------------------------------
+ align 64
+WriterSSE2_128bytes_bypass:
+_WriterSSE2_128bytes_bypass:
+ push r10
+
+ add rsi, rdi ; rsi now points to end.
+
+ movq xmm0, rcx ; xmm0 = datum in the low quadword, zero above.
+
+.L1:
+ mov r10, rdi ; Restart at the beginning of the buffer.
+
+.L2:
+ movntdq [r10], xmm0 ; Write bypassing cache.
+ movntdq [16+r10], xmm0
+ movntdq [32+r10], xmm0
+ movntdq [48+r10], xmm0
+ movntdq [64+r10], xmm0
+ movntdq [80+r10], xmm0
+ movntdq [96+r10], xmm0
+ movntdq [112+r10], xmm0
+
+ add r10, 128
+ cmp r10, rsi
+ jb .L2
+
+ dec rdx
+ jnz .L1
+
+ pop r10
+ ret
+
+;------------------------------------------------------------------------------
+; Name: StackReader
+; Purpose: Reads 64-bit values off the stack into registers of
+; the main register set, effectively testing L1 cache access
+; *and* effective-address calculation speed.
+; Params: rdi = loops
+; Notes: Seven quadwords (56 bytes) are pushed as scratch data and
+; released before returning. Each loop iteration performs
+; 32 quadword loads (256 bytes) from those seven slots.
+; A loop count of 0 wraps around instead of skipping the work.
+; rax and rdi are modified.
+;------------------------------------------------------------------------------
+ align 64
+StackReader:
+_StackReader:
+ push qword 7000 ; [rsp+48]
+ push qword 6000 ; [rsp+40]
+ push qword 5000 ; [rsp+32]
+ push qword 4000 ; [rsp+24]
+ push qword 3000 ; [rsp+16]
+ push qword 2000 ; [rsp+8]
+ push qword 1000 ; [rsp]
+
+.L1:
+ ; The loaded values are discarded; only the memory reads matter.
+ mov rax, [rsp]
+ mov rax, [rsp+16]
+ mov rax, [rsp+24]
+ mov rax, [rsp+32]
+ mov rax, [rsp+40]
+ mov rax, [rsp+8]
+ mov rax, [rsp+48]
+ mov rax, [rsp]
+ mov rax, [rsp]
+ mov rax, [rsp+16]
+ mov rax, [rsp+24]
+ mov rax, [rsp+32]
+ mov rax, [rsp+40]
+ mov rax, [rsp+8]
+ mov rax, [rsp+48]
+ mov rax, [rsp]
+ mov rax, [rsp]
+ mov rax, [rsp+16]
+ mov rax, [rsp+24]
+ mov rax, [rsp+32]
+ mov rax, [rsp+40]
+ mov rax, [rsp+8]
+ mov rax, [rsp+48]
+ mov rax, [rsp+8]
+ mov rax, [rsp+8]
+ mov rax, [rsp+16]
+ mov rax, [rsp+24]
+ mov rax, [rsp+32]
+ mov rax, [rsp+40]
+ mov rax, [rsp+8]
+ mov rax, [rsp+48]
+ mov rax, [rsp+8]
+
+ sub rdi, 1
+ jnz .L1
+
+ add rsp, 56 ; Release the seven scratch quadwords.
+ ret
+
+;------------------------------------------------------------------------------
+; Name: StackWriter
+; Purpose: Writes 64-bit values into the stack from registers of
+; the main register set, effectively testing L1 cache access
+; *and* effective-address calculation speed.
+; Params: rdi = loops
+; Notes: Seven quadwords (56 bytes) are pushed as scratch space and
+; released before returning. Each loop iteration performs
+; 32 quadword stores (256 bytes) of zero into those seven slots.
+; A loop count of 0 wraps around instead of skipping the work.
+; rax and rdi are modified.
+;------------------------------------------------------------------------------
+ align 64
+StackWriter:
+_StackWriter:
+ push qword 7000 ; [rsp+48]
+ push qword 6000 ; [rsp+40]
+ push qword 5000 ; [rsp+32]
+ push qword 4000 ; [rsp+24]
+ push qword 3000 ; [rsp+16]
+ push qword 2000 ; [rsp+8]
+ push qword 1000 ; [rsp]
+
+ xor rax, rax ; Value stored by every write below.
+
+.L1:
+ mov [rsp], rax
+ mov [rsp+16], rax
+ mov [rsp+24], rax
+ mov [rsp+32], rax
+ mov [rsp+40], rax
+ mov [rsp+8], rax
+ mov [rsp+48], rax
+ mov [rsp], rax
+ mov [rsp], rax
+ mov [rsp+16], rax
+ mov [rsp+24], rax
+ mov [rsp+32], rax
+ mov [rsp+40], rax
+ mov [rsp+8], rax
+ mov [rsp+48], rax
+ mov [rsp], rax
+ mov [rsp], rax
+ mov [rsp+16], rax
+ mov [rsp+24], rax
+ mov [rsp+32], rax
+ mov [rsp+40], rax
+ mov [rsp+8], rax
+ mov [rsp+48], rax
+ mov [rsp+8], rax
+ mov [rsp+8], rax
+ mov [rsp+16], rax
+ mov [rsp+24], rax
+ mov [rsp+32], rax
+ mov [rsp+40], rax
+ mov [rsp+8], rax
+ mov [rsp+48], rax
+ mov [rsp+8], rax
+
+ sub rdi, 1
+ jnz .L1
+
+ add rsp, 56 ; Release the seven scratch quadwords.
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RegisterToRegister
+; Purpose: Reads/writes 64-bit values between registers of
+; the main register set.
+; Params: rdi = loops
+; Notes: Each loop iteration performs 32 register-to-register moves of
+; 8 bytes each (256 bytes). Only rax is ever written, so the
+; source registers keep their values.
+; A loop count of 0 wraps around instead of skipping the work.
+; rax and rdi are modified.
+;------------------------------------------------------------------------------
+ align 64
+RegisterToRegister:
+_RegisterToRegister:
+.L1:
+ mov rax, rbx
+ mov rax, rcx
+ mov rax, rdx
+ mov rax, rsi
+ mov rax, rdi
+ mov rax, rbp
+ mov rax, rsp
+ mov rax, rbx
+ mov rax, rbx
+ mov rax, rcx
+ mov rax, rdx
+ mov rax, rsi
+ mov rax, rdi
+ mov rax, rbp
+ mov rax, rsp
+ mov rax, rbx
+ mov rax, rbx
+ mov rax, rcx
+ mov rax, rdx
+ mov rax, rsi
+ mov rax, rdi
+ mov rax, rbp
+ mov rax, rsp
+ mov rax, rbx
+ mov rax, rbx
+ mov rax, rcx
+ mov rax, rdx
+ mov rax, rsi
+ mov rax, rdi
+ mov rax, rbp
+ mov rax, rsp
+ mov rax, rbx
+
+ sub rdi, 1
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToVector
+; Purpose: Reads/writes 128-bit values between registers of
+; the vector register set, in this case XMM.
+; Params: rdi = loops
+; Notes: Uses movdqa so that every move really transfers the full 16 bytes
+; of the source register. (movq between XMM registers copies only
+; the low 8 bytes and zeroes the rest, which would not match the
+; 256-bytes-per-pass accounting below.) xmm0-xmm3 and rdi are
+; overwritten. loops must be non-zero: the counter is only tested
+; after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+VectorToVector:
+_VectorToVector:
+.L1:
+ movdqa xmm0, xmm1 ; Each move moves 16 bytes, so we need 16
+ movdqa xmm0, xmm2 ; moves to transfer a 256 byte chunk.
+ movdqa xmm0, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+ movdqa xmm3, xmm1
+
+ movdqa xmm3, xmm2
+ movdqa xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm0, xmm1
+ movdqa xmm0, xmm3
+ movdqa xmm3, xmm0
+
+ sub rdi, 1 ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToVectorAVX
+; Purpose: Reads/writes 256-bit values between registers of
+; the vector register set, in this case YMM.
+; Params: rdi = loops
+; Notes: ymm0-ymm3 and rdi are overwritten. vzeroupper is executed on
+; entry and again before returning, so the caller's legacy SSE code
+; does not run with dirty upper YMM halves (which causes AVX/SSE
+; transition penalties). loops must be non-zero: the counter is only
+; tested after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+VectorToVectorAVX:
+_VectorToVectorAVX:
+ vzeroupper
+
+.L1:
+ vmovdqa ymm0, ymm1 ; Each move moves 32 bytes, so we need 8
+ vmovdqa ymm0, ymm2 ; moves to transfer a 256 byte chunk.
+ vmovdqa ymm0, ymm3
+ vmovdqa ymm2, ymm0
+ vmovdqa ymm1, ymm2
+ vmovdqa ymm2, ymm1
+ vmovdqa ymm0, ymm3
+ vmovdqa ymm3, ymm1
+
+ sub rdi, 1 ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+
+ vzeroupper ; Leave the upper YMM state clean for SSE code.
+ ret
+
+;------------------------------------------------------------------------------
+; Name: RegisterToVector
+; Purpose: Writes 64-bit main register values into 128-bit vector register
+; clearing the upper unused bits.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 32 movq transfers of 8 bytes (256 bytes).
+; xmm0-xmm3 and rdi are overwritten; the general-purpose source
+; registers are only read. loops must be non-zero: the counter is
+; only tested after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+RegisterToVector:
+_RegisterToVector:
+.L1:
+ movq xmm1, rax ; Each movq transfers 8 bytes, so we need
+ movq xmm2, rsi ; 32 transfers to move a 256-byte chunk.
+ movq xmm3, rbx
+ movq xmm1, rcx
+ movq xmm2, rsi
+ movq xmm3, rsp
+ movq xmm0, rdi
+ movq xmm0, rdx
+
+ movq xmm0, rax
+ movq xmm1, rsi
+ movq xmm2, rbx
+ movq xmm3, rcx
+ movq xmm0, rsi
+ movq xmm3, rsp
+ movq xmm2, rdi
+ movq xmm1, rdx
+
+ movq xmm0, rax
+ movq xmm1, rsi
+ movq xmm2, rbx
+ movq xmm3, rcx
+ movq xmm0, rsi
+ movq xmm3, rsp
+ movq xmm2, rdi
+ movq xmm1, rdx
+
+ movq xmm0, rax
+ movq xmm1, rsi
+ movq xmm2, rbx
+ movq xmm3, rcx
+ movq xmm0, rsi
+ movq xmm3, rsp
+ movq xmm2, rdi
+ movq xmm1, rdx
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: VectorToRegister
+; Purpose: Writes lower 64 bits of vector register into 64-bit main
+; register.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 32 movq transfers of 8 bytes (256 bytes),
+; all targeting rax; xmm0-xmm3 are only read. rax and rdi are
+; overwritten. loops must be non-zero: the counter is only tested
+; after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+VectorToRegister:
+_VectorToRegister:
+.L1:
+ movq rax, xmm1
+ movq rax, xmm2
+ movq rax, xmm3
+ movq rax, xmm1
+ movq rax, xmm2
+ movq rax, xmm3
+ movq rax, xmm0
+ movq rax, xmm0
+
+ movq rax, xmm0
+ movq rax, xmm1
+ movq rax, xmm2
+ movq rax, xmm3
+ movq rax, xmm0
+ movq rax, xmm3
+ movq rax, xmm2
+ movq rax, xmm1
+
+ movq rax, xmm0
+ movq rax, xmm1
+ movq rax, xmm2
+ movq rax, xmm3
+ movq rax, xmm0
+ movq rax, xmm3
+ movq rax, xmm2
+ movq rax, xmm1
+
+ movq rax, xmm0
+ movq rax, xmm1
+ movq rax, xmm2
+ movq rax, xmm3
+ movq rax, xmm0
+ movq rax, xmm3
+ movq rax, xmm2
+ movq rax, xmm1
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register8ToVector
+; Purpose: Writes 8-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 one-byte pinsrb inserts (64 bytes).
+; The loop count is multiplied by 4 on entry, so one requested loop
+; corresponds to 256 bytes. pinsrb is an SSE4.1 instruction.
+; xmm0-xmm3 and rdi are overwritten; the general-purpose source
+; registers are only read. loops must be non-zero: the counter is
+; only tested after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Register8ToVector:
+_Register8ToVector:
+ sal rdi, 2 ; loops *= 4, making up for the 64-byte pass.
+.L1:
+ pinsrb xmm1, al, 0 ; 64 transfers x 1 byte = 64 bytes
+ pinsrb xmm2, bl, 1
+ pinsrb xmm3, cl, 2
+ pinsrb xmm1, dl, 3
+ pinsrb xmm2, sil, 4
+ pinsrb xmm3, dil, 5
+ pinsrb xmm0, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm0, al, 0
+ pinsrb xmm1, bl, 1
+ pinsrb xmm2, cl, 2
+ pinsrb xmm3, dl, 3
+ pinsrb xmm3, al, 4
+ pinsrb xmm2, bl, 5
+ pinsrb xmm1, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm1, r8b, 0
+ pinsrb xmm2, r9b, 1
+ pinsrb xmm3, r10b, 2
+ pinsrb xmm1, r11b, 3
+ pinsrb xmm2, r12b, 4
+ pinsrb xmm3, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm0, r8b, 0
+ pinsrb xmm0, r9b, 1
+ pinsrb xmm0, r10b, 2
+ pinsrb xmm0, r11b, 3
+ pinsrb xmm0, r12b, 4
+ pinsrb xmm0, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm1, al, 0
+ pinsrb xmm2, bl, 1
+ pinsrb xmm3, cl, 2
+ pinsrb xmm1, dl, 3
+ pinsrb xmm2, sil, 4
+ pinsrb xmm3, dil, 5
+ pinsrb xmm0, bpl, 6
+ pinsrb xmm0, spl, 7
+
+ pinsrb xmm0, al, 10
+ pinsrb xmm1, bl, 11
+ pinsrb xmm2, cl, 12
+ pinsrb xmm3, dl, 13
+ pinsrb xmm3, dil, 14
+ pinsrb xmm2, cl, 15
+ pinsrb xmm1, al, 6
+ pinsrb xmm0, bpl, 7
+
+ pinsrb xmm1, r8b, 10
+ pinsrb xmm2, r9b, 11
+ pinsrb xmm3, r10b, 12
+ pinsrb xmm1, r11b, 13
+ pinsrb xmm2, r12b, 14
+ pinsrb xmm3, al, 15
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ pinsrb xmm0, r8b, 9
+ pinsrb xmm0, r9b, 8
+ pinsrb xmm0, r10b, 11
+ pinsrb xmm0, r11b, 3
+ pinsrb xmm0, r12b, 4
+ pinsrb xmm0, al, 5
+ pinsrb xmm0, cl, 6
+ pinsrb xmm0, bl, 7
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register16ToVector
+; Purpose: Writes 16-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 two-byte pinsrw inserts (128 bytes).
+; The loop count is doubled on entry, so one requested loop
+; corresponds to 256 bytes. xmm0-xmm3 and rdi are overwritten; the
+; general-purpose source registers are only read. loops must be
+; non-zero: the counter is only tested after a full pass, so 0 wraps
+; around.
+;------------------------------------------------------------------------------
+ align 64
+Register16ToVector:
+_Register16ToVector:
+ sal rdi, 1 ; loops *= 2, making up for the 128-byte pass.
+.L1:
+ pinsrw xmm1, ax, 0 ; 64 transfers x 2 bytes = 128 bytes
+ pinsrw xmm2, bx, 1
+ pinsrw xmm3, cx, 2
+ pinsrw xmm1, dx, 3
+ pinsrw xmm2, si, 4
+ pinsrw xmm3, di, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm1, bx, 1
+ pinsrw xmm2, cx, 2
+ pinsrw xmm3, dx, 3
+ pinsrw xmm3, si, 4
+ pinsrw xmm2, di, 5
+ pinsrw xmm1, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm1, r8w, 0
+ pinsrw xmm2, r9w, 1
+ pinsrw xmm3, r10w, 2
+ pinsrw xmm1, r11w, 3
+ pinsrw xmm2, r12w, 4
+ pinsrw xmm3, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm0, r8w, 0
+ pinsrw xmm0, r9w, 1
+ pinsrw xmm0, r10w, 2
+ pinsrw xmm0, r11w, 3
+ pinsrw xmm0, r12w, 4
+ pinsrw xmm0, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm1, ax, 0
+ pinsrw xmm2, bx, 1
+ pinsrw xmm3, cx, 2
+ pinsrw xmm1, dx, 3
+ pinsrw xmm2, si, 4
+ pinsrw xmm3, di, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm0, ax, 0
+ pinsrw xmm1, bx, 1
+ pinsrw xmm2, cx, 2
+ pinsrw xmm3, dx, 3
+ pinsrw xmm3, si, 4
+ pinsrw xmm2, di, 5
+ pinsrw xmm1, bp, 6
+ pinsrw xmm0, sp, 7
+
+ pinsrw xmm1, r8w, 0
+ pinsrw xmm2, r9w, 1
+ pinsrw xmm3, r10w, 2
+ pinsrw xmm1, r11w, 3
+ pinsrw xmm2, r12w, 4
+ pinsrw xmm3, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ pinsrw xmm0, r8w, 0
+ pinsrw xmm0, r9w, 1
+ pinsrw xmm0, r10w, 2
+ pinsrw xmm0, r11w, 3
+ pinsrw xmm0, r12w, 4
+ pinsrw xmm0, ax, 5
+ pinsrw xmm0, bp, 6
+ pinsrw xmm0, bx, 7
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register32ToVector
+; Purpose: Writes 32-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 four-byte pinsrd inserts (256 bytes);
+; the loop count is used unscaled. pinsrd is an SSE4.1 instruction.
+; xmm0-xmm3 and rdi are overwritten; the general-purpose source
+; registers are only read. loops must be non-zero: the counter is
+; only tested after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Register32ToVector:
+_Register32ToVector:
+.L1:
+ pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes
+ pinsrd xmm2, ebx, 1 ; we need 64 transfers.
+ pinsrd xmm3, ecx, 2
+ pinsrd xmm1, edx, 3
+ pinsrd xmm2, esi, 0
+ pinsrd xmm3, edi, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, esp, 3
+
+ pinsrd xmm0, eax, 0
+ pinsrd xmm1, ebx, 1
+ pinsrd xmm2, ecx, 2
+ pinsrd xmm3, edx, 3
+ pinsrd xmm3, esi, 3
+ pinsrd xmm2, edi, 2
+ pinsrd xmm1, ebp, 1
+ pinsrd xmm0, esp, 0
+
+ pinsrd xmm1, r8d, 0
+ pinsrd xmm2, r9d, 1
+ pinsrd xmm3, r10d, 2
+ pinsrd xmm1, r11d, 3
+ pinsrd xmm2, r12d, 0
+ pinsrd xmm3, eax, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, ebx, 3
+
+ pinsrd xmm0, r8d, 0
+ pinsrd xmm0, r9d, 1
+ pinsrd xmm0, r10d, 2
+ pinsrd xmm0, r11d, 3
+ pinsrd xmm0, r12d, 0
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, ebp, 0
+ pinsrd xmm0, ebx, 0
+
+ pinsrd xmm1, eax, 0
+ pinsrd xmm2, ebx, 1
+ pinsrd xmm3, ecx, 2
+ pinsrd xmm1, edx, 3
+ pinsrd xmm2, esi, 0
+ pinsrd xmm3, edi, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, esp, 3
+
+ pinsrd xmm0, eax, 0
+ pinsrd xmm1, ebx, 1
+ pinsrd xmm2, ecx, 2
+ pinsrd xmm3, edx, 3
+ pinsrd xmm3, esi, 3
+ pinsrd xmm2, edi, 2
+ pinsrd xmm1, ebp, 1
+ pinsrd xmm0, esp, 0
+
+ pinsrd xmm1, r8d, 0
+ pinsrd xmm2, r9d, 1
+ pinsrd xmm3, r10d, 2
+ pinsrd xmm1, r11d, 3
+ pinsrd xmm2, r12d, 0
+ pinsrd xmm3, eax, 1
+ pinsrd xmm0, ebp, 2
+ pinsrd xmm0, ebx, 3
+
+ pinsrd xmm0, r8d, 0
+ pinsrd xmm0, r9d, 1
+ pinsrd xmm0, r10d, 2
+ pinsrd xmm0, r11d, 3
+ pinsrd xmm0, r12d, 0
+ pinsrd xmm0, eax, 0
+ pinsrd xmm0, ebp, 0
+ pinsrd xmm0, ebx, 0
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Register64ToVector
+; Purpose: Writes 64-bit main register values into 128-bit vector register
+; without clearing the unused bits.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 32 eight-byte pinsrq inserts (256 bytes).
+; The loop count is doubled on entry, so one requested loop
+; corresponds to 512 bytes.
+; NOTE(review): Register8ToVector, Register16ToVector and
+; Register32ToVector all work out to 256 bytes per requested loop;
+; confirm the caller accounts for the doubling here.
+; pinsrq is an SSE4.1 instruction. xmm0-xmm3 and rdi are
+; overwritten; the general-purpose source registers are only read.
+; loops must be non-zero: the counter is only tested after a full
+; pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Register64ToVector:
+_Register64ToVector:
+ add rdi, rdi ; loops *= 2
+.L1:
+ pinsrq xmm1, r8, 0 ; Each xfer moves 8 bytes, therefore to do
+ pinsrq xmm2, r9, 1 ; 256 bytes we need 32 transfers.
+ pinsrq xmm3, r10, 0
+ pinsrq xmm1, r11, 1
+ pinsrq xmm2, r12, 0
+ pinsrq xmm3, rax, 1
+ pinsrq xmm0, rbp, 0
+ pinsrq xmm0, rbx, 1
+
+ pinsrq xmm0, r8, 0
+ pinsrq xmm0, r9, 1
+ pinsrq xmm0, r10, 1
+ pinsrq xmm0, r11, 1
+ pinsrq xmm0, r12, 0
+ pinsrq xmm0, rax, 0
+ pinsrq xmm0, rbp, 0
+ pinsrq xmm0, rbx, 0
+
+ pinsrq xmm0, r8, 0
+ pinsrq xmm0, r9, 1
+ pinsrq xmm0, r10, 1
+ pinsrq xmm0, r11, 1
+ pinsrq xmm0, r12, 0
+ pinsrq xmm0, rax, 0
+ pinsrq xmm0, rbp, 0
+ pinsrq xmm0, rbx, 0
+
+ pinsrq xmm0, r8, 0
+ pinsrq xmm0, r9, 1
+ pinsrq xmm0, r10, 1
+ pinsrq xmm0, r11, 1
+ pinsrq xmm0, r12, 0
+ pinsrq xmm0, rax, 0
+ pinsrq xmm0, rbp, 0
+ pinsrq xmm0, rbx, 0
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: Vector8ToRegister
+; Purpose: Writes 8-bit vector register values into main register.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 one-byte pextrb extracts (64 bytes),
+; all targeting eax; xmm0-xmm3 are only read. The loop count is
+; multiplied by 8 on entry, so one requested loop corresponds to
+; 512 bytes.
+; NOTE(review): the mirror routine Register8ToVector scales by 4
+; (256 bytes per requested loop); confirm the caller accounts for
+; the difference.
+; pextrb is an SSE4.1 instruction. rax and rdi are overwritten.
+; loops must be non-zero: the counter is only tested after a full
+; pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+ sal rdi, 3 ; loops *= 8
+.L1:
+ pextrb eax, xmm1, 0 ; 64 transfers x 1 bytes = 64 bytes
+ pextrb eax, xmm2, 1
+ pextrb eax, xmm3, 2
+ pextrb eax, xmm1, 3
+ pextrb eax, xmm2, 4
+ pextrb eax, xmm3, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm0, 0
+ pextrb eax, xmm1, 1
+ pextrb eax, xmm2, 2
+ pextrb eax, xmm3, 3
+ pextrb eax, xmm3, 4
+ pextrb eax, xmm2, 5
+ pextrb eax, xmm1, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm1, 0
+ pextrb eax, xmm2, 1
+ pextrb eax, xmm3, 2
+ pextrb eax, xmm1, 3
+ pextrb eax, xmm2, 4
+ pextrb eax, xmm3, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm0, 0
+ pextrb eax, xmm0, 1
+ pextrb eax, xmm0, 2
+ pextrb eax, xmm0, 3
+ pextrb eax, xmm0, 4
+ pextrb eax, xmm0, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm1, 0
+ pextrb eax, xmm2, 1
+ pextrb eax, xmm3, 2
+ pextrb eax, xmm1, 3
+ pextrb eax, xmm2, 4
+ pextrb eax, xmm3, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm0, 0
+ pextrb eax, xmm1, 1
+ pextrb eax, xmm2, 2
+ pextrb eax, xmm3, 3
+ pextrb eax, xmm3, 4
+ pextrb eax, xmm2, 5
+ pextrb eax, xmm1, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm1, 0
+ pextrb eax, xmm2, 1
+ pextrb eax, xmm3, 2
+ pextrb eax, xmm1, 3
+ pextrb eax, xmm2, 4
+ pextrb eax, xmm3, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ pextrb eax, xmm0, 0
+ pextrb eax, xmm0, 1
+ pextrb eax, xmm0, 2
+ pextrb eax, xmm0, 3
+ pextrb eax, xmm0, 4
+ pextrb eax, xmm0, 5
+ pextrb eax, xmm0, 6
+ pextrb eax, xmm0, 7
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector16ToRegister
+; Purpose: Writes 16-bit vector register values into main register.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 two-byte pextrw extracts (128 bytes),
+; all targeting eax; xmm0-xmm3 are only read. The loop count is
+; multiplied by 4 on entry, so one requested loop corresponds to
+; 512 bytes.
+; NOTE(review): the mirror routine Register16ToVector scales by 2
+; (256 bytes per requested loop); confirm the caller accounts for
+; the difference.
+; rax and rdi are overwritten. loops must be non-zero: the counter
+; is only tested after a full pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+ sal rdi, 2 ; loops *= 4
+.L1:
+ pextrw eax, xmm1, 0 ; 64 transfers x 2 bytes = 128 bytes
+ pextrw eax, xmm2, 1
+ pextrw eax, xmm3, 2
+ pextrw eax, xmm1, 3
+ pextrw eax, xmm2, 4
+ pextrw eax, xmm3, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm0, 0
+ pextrw eax, xmm1, 1
+ pextrw eax, xmm2, 2
+ pextrw eax, xmm3, 3
+ pextrw eax, xmm3, 4
+ pextrw eax, xmm2, 5
+ pextrw eax, xmm1, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm1, 0
+ pextrw eax, xmm2, 1
+ pextrw eax, xmm3, 2
+ pextrw eax, xmm1, 3
+ pextrw eax, xmm2, 4
+ pextrw eax, xmm3, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm0, 0
+ pextrw eax, xmm0, 1
+ pextrw eax, xmm0, 2
+ pextrw eax, xmm0, 3
+ pextrw eax, xmm0, 4
+ pextrw eax, xmm0, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm1, 0
+ pextrw eax, xmm2, 1
+ pextrw eax, xmm3, 2
+ pextrw eax, xmm1, 3
+ pextrw eax, xmm2, 4
+ pextrw eax, xmm3, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm0, 0
+ pextrw eax, xmm1, 1
+ pextrw eax, xmm2, 2
+ pextrw eax, xmm3, 3
+ pextrw eax, xmm3, 4
+ pextrw eax, xmm2, 5
+ pextrw eax, xmm1, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm1, 0
+ pextrw eax, xmm2, 1
+ pextrw eax, xmm3, 2
+ pextrw eax, xmm1, 3
+ pextrw eax, xmm2, 4
+ pextrw eax, xmm3, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ pextrw eax, xmm0, 0
+ pextrw eax, xmm0, 1
+ pextrw eax, xmm0, 2
+ pextrw eax, xmm0, 3
+ pextrw eax, xmm0, 4
+ pextrw eax, xmm0, 5
+ pextrw eax, xmm0, 6
+ pextrw eax, xmm0, 7
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector32ToRegister
+; Purpose: Writes 32-bit vector register values into main register.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 64 four-byte pextrd extracts (256 bytes),
+; all targeting eax; xmm0-xmm3 are only read. The loop count is
+; doubled on entry, so one requested loop corresponds to 512 bytes.
+; NOTE(review): the mirror routine Register32ToVector does not scale
+; the count (256 bytes per requested loop); confirm the caller
+; accounts for the difference.
+; pextrd is an SSE4.1 instruction. rax and rdi are overwritten.
+; loops must be non-zero: the counter is only tested after a full
+; pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Vector32ToRegister:
+_Vector32ToRegister:
+ add rdi, rdi ; loops *= 2
+.L1:
+ pextrd eax, xmm1, 0 ; 64 xfers x 4 bytes = 256 bytes
+ pextrd eax, xmm2, 1
+ pextrd eax, xmm3, 2
+ pextrd eax, xmm1, 3
+ pextrd eax, xmm2, 0
+ pextrd eax, xmm3, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm1, 1
+ pextrd eax, xmm2, 2
+ pextrd eax, xmm3, 3
+ pextrd eax, xmm3, 3
+ pextrd eax, xmm2, 2
+ pextrd eax, xmm1, 1
+ pextrd eax, xmm0, 0
+
+ pextrd eax, xmm1, 0
+ pextrd eax, xmm2, 1
+ pextrd eax, xmm3, 2
+ pextrd eax, xmm1, 3
+ pextrd eax, xmm2, 0
+ pextrd eax, xmm3, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 0
+
+ pextrd eax, xmm1, 0
+ pextrd eax, xmm2, 1
+ pextrd eax, xmm3, 2
+ pextrd eax, xmm1, 3
+ pextrd eax, xmm2, 0
+ pextrd eax, xmm3, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm1, 1
+ pextrd eax, xmm2, 2
+ pextrd eax, xmm3, 3
+ pextrd eax, xmm3, 3
+ pextrd eax, xmm2, 2
+ pextrd eax, xmm1, 1
+ pextrd eax, xmm0, 0
+
+ pextrd eax, xmm1, 0
+ pextrd eax, xmm2, 1
+ pextrd eax, xmm3, 2
+ pextrd eax, xmm1, 3
+ pextrd eax, xmm2, 0
+ pextrd eax, xmm3, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+ pextrd eax, xmm0, 0
+ pextrd eax, xmm0, 1
+ pextrd eax, xmm0, 2
+ pextrd eax, xmm0, 3
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: Vector64ToRegister
+; Purpose: Writes 64-bit vector register values into main register.
+; Params: rdi = loops
+; Notes: Each pass of .L1 performs 32 eight-byte pextrq extracts (256
+; bytes), all targeting rax; xmm0-xmm3 are only read. The loop count
+; is doubled on entry, so one requested loop corresponds to 512
+; bytes.
+; NOTE(review): VectorToRegister (movq) performs the same 256 bytes
+; per pass without scaling the count; confirm the caller accounts
+; for the doubling here.
+; pextrq is an SSE4.1 instruction. rax and rdi are overwritten.
+; loops must be non-zero: the counter is only tested after a full
+; pass, so 0 wraps around.
+;------------------------------------------------------------------------------
+ align 64
+Vector64ToRegister:
+_Vector64ToRegister:
+ add rdi, rdi ; loops *= 2
+.L1:
+ pextrq rax, xmm1, 0 ; 32 transfers by 8 bytes = 256 bytes
+ pextrq rax, xmm2, 1
+ pextrq rax, xmm3, 0
+ pextrq rax, xmm1, 1
+ pextrq rax, xmm2, 0
+ pextrq rax, xmm3, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+
+ pextrq rax, xmm1, 0
+ pextrq rax, xmm2, 1
+ pextrq rax, xmm3, 0
+ pextrq rax, xmm1, 1
+ pextrq rax, xmm2, 0
+ pextrq rax, xmm3, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+ pextrq rax, xmm0, 0
+ pextrq rax, xmm0, 1
+
+ dec rdi ; One pass done; repeat until the counter hits zero.
+ jnz .L1
+ ret
+
+;------------------------------------------------------------------------------
+; Name: CopyAVX
+; Purpose: Copies memory chunks that are 32-byte aligned.
+; Params: rdi = ptr to destination memory area
+; rsi = ptr to source memory area
+; rdx = length in bytes
+; rcx = loops
+; Notes: The length is rounded down to a multiple of 256 and the whole
+; region is copied rcx times. Both pointers must be 32-byte aligned
+; because vmovdqa faults on unaligned addresses. If the rounded
+; length or the loop count is zero the routine returns without
+; touching memory (previously the counters wrapped around and the
+; copy ran far out of bounds). ymm0-ymm3, rcx and rdx are
+; overwritten; rsi and rdi are back at their start values on
+; return. vzeroupper is executed before returning so the caller's
+; SSE code does not pay AVX/SSE transition penalties.
+;------------------------------------------------------------------------------
+ align 64
+CopyAVX:
+_CopyAVX:
+ vzeroupper
+
+ push r10
+
+ shr rdx, 8 ; Ensure length is multiple of 256.
+ shl rdx, 8
+
+ test rdx, rdx ; Nothing to copy if length < 256.
+ jz .done
+ test rcx, rcx ; Nothing to do if no loops were requested.
+ jz .done
+
+ prefetcht0 [rsi]
+
+.L1:
+ mov r10, rdx ; r10 = bytes left in this pass.
+
+.L2:
+ vmovdqa ymm0, [rsi]
+ vmovdqa ymm1, [32+rsi]
+ vmovdqa ymm2, [64+rsi]
+ vmovdqa ymm3, [96+rsi]
+
+ vmovdqa [rdi], ymm0
+ vmovdqa [32+rdi], ymm1
+ vmovdqa [64+rdi], ymm2
+ vmovdqa [96+rdi], ymm3
+
+ vmovdqa ymm0, [128+rsi]
+ vmovdqa ymm1, [128+32+rsi]
+ vmovdqa ymm2, [128+64+rsi]
+ vmovdqa ymm3, [128+96+rsi]
+
+ vmovdqa [128+rdi], ymm0
+ vmovdqa [128+32+rdi], ymm1
+ vmovdqa [128+64+rdi], ymm2
+ vmovdqa [128+96+rdi], ymm3
+
+ add rsi, 256
+ add rdi, 256
+
+ sub r10, 256
+ jnz .L2
+
+ sub rsi, rdx ; rsi now points to start.
+ sub rdi, rdx ; rdi now points to start.
+
+ dec rcx
+ jnz .L1
+
+.done:
+ pop r10
+
+ vzeroupper ; Leave the upper YMM state clean for SSE code.
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: CopySSE
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: rdi = ptr to destination memory area
+; rsi = ptr to source memory area
+; rdx = length in bytes
+; rcx = loops
+; Notes: The length is rounded down to a multiple of 256 and the whole
+; region is copied rcx times. Both pointers must be 16-byte aligned
+; because movdqa faults on unaligned addresses. If the rounded
+; length or the loop count is zero the routine returns without
+; touching memory (previously the counters wrapped around and the
+; copy ran far out of bounds). xmm4-xmm15 are saved on the stack
+; and restored; xmm0-xmm3, rcx and rdx are overwritten; rsi and rdi
+; are back at their start values on return.
+;------------------------------------------------------------------------------
+ align 64
+CopySSE:
+_CopySSE:
+ push r10
+
+ shr rdx, 8 ; Ensure length is multiple of 256.
+ shl rdx, 8
+
+ test rdx, rdx ; Nothing to copy if length < 256.
+ jz .done
+ test rcx, rcx ; Nothing to do if no loops were requested.
+ jz .done
+
+ prefetcht0 [rsi]
+
+ ; Save our non-parameter XMM registers.
+ sub rsp, 192
+ movdqu [rsp], xmm4
+ movdqu [16+rsp], xmm5
+ movdqu [32+rsp], xmm6
+ movdqu [48+rsp], xmm7
+ movdqu [64+rsp], xmm8
+ movdqu [80+rsp], xmm9
+ movdqu [96+rsp], xmm10
+ movdqu [112+rsp], xmm11
+ movdqu [128+rsp], xmm12
+ movdqu [144+rsp], xmm13
+ movdqu [160+rsp], xmm14
+ movdqu [176+rsp], xmm15
+
+.L1:
+ mov r10, rdx ; r10 = bytes left in this pass.
+
+.L2:
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [16+rsi]
+ movdqa xmm2, [32+rsi]
+ movdqa xmm3, [48+rsi]
+
+ movdqa [rdi], xmm0
+ movdqa [16+rdi], xmm1
+ movdqa [32+rdi], xmm2
+ movdqa [48+rdi], xmm3
+
+ movdqa xmm4, [64+rsi]
+ movdqa xmm5, [80+rsi]
+ movdqa xmm6, [96+rsi]
+ movdqa xmm7, [112+rsi]
+
+ movdqa [64+rdi], xmm4
+ movdqa [80+rdi], xmm5
+ movdqa [96+rdi], xmm6
+ movdqa [112+rdi], xmm7
+
+ movdqa xmm8, [128+rsi]
+ movdqa xmm9, [144+rsi]
+ movdqa xmm10, [160+rsi]
+ movdqa xmm11, [176+rsi]
+
+ movdqa [128+rdi], xmm8
+ movdqa [144+rdi], xmm9
+ movdqa [160+rdi], xmm10
+ movdqa [176+rdi], xmm11
+
+ movdqa xmm12, [192+rsi]
+ movdqa xmm13, [208+rsi]
+ movdqa xmm14, [224+rsi]
+ movdqa xmm15, [240+rsi]
+
+ movdqa [192+rdi], xmm12
+ movdqa [208+rdi], xmm13
+ movdqa [224+rdi], xmm14
+ movdqa [240+rdi], xmm15
+
+ add rsi, 256
+ add rdi, 256
+
+ sub r10, 256
+ jnz .L2
+
+ sub rsi, rdx ; rsi now points to start.
+ sub rdi, rdx ; rdi now points to start.
+
+ dec rcx
+ jnz .L1
+
+ ; Restore the XMM registers saved on entry.
+ movdqu xmm4, [rsp]
+ movdqu xmm5, [16+rsp]
+ movdqu xmm6, [32+rsp]
+ movdqu xmm7, [48+rsp]
+ movdqu xmm8, [64+rsp]
+ movdqu xmm9, [80+rsp]
+ movdqu xmm10, [96+rsp]
+ movdqu xmm11, [112+rsp]
+ movdqu xmm12, [128+rsp]
+ movdqu xmm13, [144+rsp]
+ movdqu xmm14, [160+rsp]
+ movdqu xmm15, [176+rsp]
+ add rsp, 192
+
+.done:
+ pop r10
+
+ ret
+
+
+;------------------------------------------------------------------------------
+; Name: CopySSE_128bytes
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: rdi = ptr to destination memory area
+; rsi = ptr to source memory area
+; rdx = length in bytes
+; rcx = loops
+; Notes: Same as CopySSE but works in 128-byte steps: the length is
+; rounded down to a multiple of 128 and the whole region is copied
+; rcx times. Both pointers must be 16-byte aligned because movdqa
+; faults on unaligned addresses. If the rounded length or the loop
+; count is zero the routine returns without touching memory
+; (previously the counters wrapped around and the copy ran far out
+; of bounds). xmm4-xmm7 are saved on the stack and restored;
+; xmm0-xmm3, rcx and rdx are overwritten; rsi and rdi are back at
+; their start values on return.
+;------------------------------------------------------------------------------
+ align 64
+CopySSE_128bytes:
+_CopySSE_128bytes:
+ push r10
+
+ shr rdx, 7 ; Ensure length is multiple of 128.
+ shl rdx, 7
+
+ test rdx, rdx ; Nothing to copy if length < 128.
+ jz .done
+ test rcx, rcx ; Nothing to do if no loops were requested.
+ jz .done
+
+ prefetcht0 [rsi]
+
+ ; Save our non-parameter XMM registers.
+ sub rsp, 64
+ movdqu [rsp], xmm4
+ movdqu [16+rsp], xmm5
+ movdqu [32+rsp], xmm6
+ movdqu [48+rsp], xmm7
+
+.L1:
+ mov r10, rdx ; r10 = bytes left in this pass.
+
+.L2:
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [16+rsi]
+ movdqa xmm2, [32+rsi]
+ movdqa xmm3, [48+rsi]
+
+ movdqa [rdi], xmm0
+ movdqa [16+rdi], xmm1
+ movdqa [32+rdi], xmm2
+ movdqa [48+rdi], xmm3
+
+ movdqa xmm4, [64+rsi]
+ movdqa xmm5, [80+rsi]
+ movdqa xmm6, [96+rsi]
+ movdqa xmm7, [112+rsi]
+
+ movdqa [64+rdi], xmm4
+ movdqa [80+rdi], xmm5
+ movdqa [96+rdi], xmm6
+ movdqa [112+rdi], xmm7
+
+ add rsi, 128
+ add rdi, 128
+
+ sub r10, 128
+ jnz .L2
+
+ sub rsi, rdx ; rsi now points to start.
+ sub rdi, rdx ; rdi now points to start.
+
+ dec rcx
+ jnz .L1
+
+ ; Restore the XMM registers saved on entry.
+ movdqu xmm4, [rsp]
+ movdqu xmm5, [16+rsp]
+ movdqu xmm6, [32+rsp]
+ movdqu xmm7, [48+rsp]
+ add rsp, 64
+
+.done:
+ pop r10
+
+ ret
+
+
OpenPOWER on IntegriCloud