From 45d37750fc6f3f1f43d23732816ffccc3820e215 Mon Sep 17 00:00:00 2001 From: Zack Smith <1@zsmith.co> Date: Wed, 23 Dec 2015 04:52:54 +0000 Subject: Initial import of GPLed bandwidthd 1.1b source from author's site --- BMP.c | 796 ++++++ BMP.h | 100 + BMPGraphing.c | 486 ++++ BMPGraphing.h | 88 + COPYING.txt | 340 +++ Makefile | 87 + README.txt | 167 ++ defs.h | 147 + font.c | 1655 +++++++++++ font.h | 28 + loopback.sh | 5 + main.c | 2442 ++++++++++++++++ minifont.c | 845 ++++++ minifont.h | 28 + output/._Celeron-2.8GHz-slow.gif | Bin 0 -> 489 bytes ...._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png | Bin 0 -> 177 bytes output/._Corei5-520M-MacOSXLion-32bit-slow.gif | Bin 0 -> 489 bytes ...i5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif | Bin 0 -> 489 bytes ...i5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif | Bin 0 -> 177 bytes output/Celeron-2.8GHz-slow.gif | Bin 0 -> 14589 bytes .../Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png | Bin 0 -> 66393 bytes output/Corei5-520M-MacOSXLion-32bit-slow.gif | Bin 0 -> 16173 bytes ...i5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif | Bin 0 -> 18693 bytes ...i5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif | Bin 0 -> 17048 bytes routines32.asm | 2960 ++++++++++++++++++++ routines64.asm | 2590 +++++++++++++++++ 26 files changed, 12764 insertions(+) create mode 100755 BMP.c create mode 100755 BMP.h create mode 100755 BMPGraphing.c create mode 100755 BMPGraphing.h create mode 100755 COPYING.txt create mode 100755 Makefile create mode 100755 README.txt create mode 100755 defs.h create mode 100755 font.c create mode 100755 font.h create mode 100755 loopback.sh create mode 100755 main.c create mode 100755 minifont.c create mode 100755 minifont.h create mode 100755 output/._Celeron-2.8GHz-slow.gif create mode 100755 output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png create mode 100755 output/._Corei5-520M-MacOSXLion-32bit-slow.gif create mode 100755 output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif create mode 100755 
output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif create mode 100755 output/Celeron-2.8GHz-slow.gif create mode 100755 output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png create mode 100755 output/Corei5-520M-MacOSXLion-32bit-slow.gif create mode 100755 output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif create mode 100755 output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif create mode 100755 routines32.asm create mode 100755 routines64.asm diff --git a/BMP.c b/BMP.c new file mode 100755 index 0000000..9327bb7 --- /dev/null +++ b/BMP.c @@ -0,0 +1,796 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +//-------------------------------------------------- +// Change Log +// 0.8 ZS Added larger font of my own design. +// 0.9 ZS Removed attempt at anti-aliasing. +//-------------------------------------------------- + +#include +#include +#include + +#include "BMP.h" +#include "font.h" +#include "minifont.h" + +// Narrowest possible numbers. 
+static char* narrow_nums [] = +{ + " # ", + "# #", + "# #", + "# #", + "# #", + "# #", + " # ", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + + " # ", + "# #", + " #", + " ##", + "# ", + "# ", + "###", + + "###", + " #", + " # ", + "## ", + " #", + "# #", + " # ", + + "# #", + "# #", + "# #", + "###", + " #", + " #", + " #", + + "###", + "# ", + "## ", + " #", + " #", + "# #", + " # ", + + + " # ", + "# ", + "# ", + "## ", + "# #", + "# #", + " # ", + + "###", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + + " # ", + "# #", + "# #", + " # ", + "# #", + "# #", + " # ", + + " # ", + "# #", + "# #", + " ##", + " #", + " # ", + "# ", + + " ", + "", + "", + " ", + "", + "", + "#", +}; + + +/*--------------------------------------------------------------------------- + * Name: BMP_new + * Purpose: Creates new image. + *-------------------------------------------------------------------------*/ +BMP* +BMP_new (int w, int h) +{ + unsigned long size; + BMP* nu; + if (w<1 || h<1) + return NULL; + //---------- + + if (w & 3) + w += 4 - (w & 3); + if (h & 3) + h += 4 - (h & 3); + + nu = (BMP*) malloc (sizeof (BMP)); + if (!nu) + return NULL; + memset (nu, 0, sizeof (BMP)); + nu->width = w; + nu->height = h; + size = w * h * sizeof (long); + nu->pixels = (RGB*) malloc (size); + if (!nu->pixels) { + free (nu); + return NULL; + } + memset (nu->pixels, 0, size); + return nu; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_destroy + * Purpose: Deallocates image. + *-------------------------------------------------------------------------*/ +void +BMP_destroy (BMP* bmp) +{ + if (!bmp) + return; + //---------- + + if (bmp->pixels) + free (bmp->pixels); + free (bmp); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_point + * Purpose: Writes pixel into image. 
+ *-------------------------------------------------------------------------*/ +void +BMP_point (BMP *bmp, int x, int y, RGB rgb) +{ + if (!bmp || x<0 || y<0) + return; + if (x >= bmp->width || y >= bmp->height) + return; + if (!bmp->pixels) + return; + //---------- + + bmp->pixels[y*bmp->width + x] = rgb; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line_core + * Purpose: Draws a line in a BMP image. + *-------------------------------------------------------------------------*/ +void +BMP_line_core (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb, + int dashed) +{ + if ((rgb >> 24) == 0xff) + return; + + int dot_counter = 0; + + if (!dashed && x0 == x1 && y0 == y1) + BMP_point (bmp, x0, y0, rgb); + else if (!dashed && x0 == x1) + BMP_vline (bmp, x0, y0, y1, rgb); + else if (!dashed && y0 == y1) + BMP_hline (bmp, x0, x1, y0, rgb); + else { + int j, x, y, dx, dy, e, xchange, s1, s2; + + // DDA, copied from my FramebufferUI project. + + x = x0; + y = y0; + s1 = 1; + s2 = 1; + + dx = x1 - x0; + if (dx < 0) { + dx = -dx; + s1 = -1; + } + + dy = y1 - y0; + if (dy < 0) { + dy = -dy; + s2 = -1; + } + + xchange = 0; + + if (dy > dx) { + int tmp = dx; + dx = dy; + dy = tmp; + xchange = 1; + } + + e = (dy<<1) - dx; + j = 0; + + while (j <= dx) { + j++; + + int draw = 1; + if (dashed && (1 & (dot_counter >> 2))) + draw = 0; + + if (draw) + BMP_point (bmp, x, y, rgb); + + dot_counter++; + + if (e >= 0) { + if (xchange) + x += s1; + else + y += s2; + e -= (dx << 1); + } + if (xchange) + y += s2; + else + x += s1; + e += (dy << 1); + } + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line + * Purpose: Draws a line in a BMP image. 
+ *-------------------------------------------------------------------------*/ +void +BMP_line (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb) +{ + BMP_line_core (bmp, x0, y0, x1, y1, rgb, 0); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line_dashed + * Purpose: Draws a dashed line in a BMP image. + *-------------------------------------------------------------------------*/ +void +BMP_line_dashed (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb) +{ + BMP_line_core (bmp, x0, y0, x1, y1, rgb, 1); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_rect + * Purpose: Fills a rectangle with a color. + *-------------------------------------------------------------------------*/ +void +BMP_rect (BMP *bmp, int x, int y, int w, int h, RGB rgb) +{ + BMP_hline (bmp, x, x+w-1, y, rgb); + BMP_hline (bmp, x, x+w-1, y+h-1, rgb); + BMP_vline (bmp, x, y, y+h-1, rgb); + BMP_vline (bmp, x+w-1, y, y+h-1, rgb); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_fillrect + * Purpose: Fills a rectangle with a color. + *-------------------------------------------------------------------------*/ +void +BMP_fillrect (BMP *bmp, int x, int y, int w, int h, RGB rgb) +{ + while (h > 0) { + BMP_hline (bmp, x, x+w-1, y, rgb); + h--; + y++; + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_clear + * Purpose: Sets all pixels to specified color. + *-------------------------------------------------------------------------*/ +void +BMP_clear (BMP *bmp, RGB rgb) +{ + BMP_fillrect (bmp, 0, 0, bmp->width, bmp->height, rgb); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_hline + * Purpose: Draws horizontal line. 
+ *-------------------------------------------------------------------------*/ +void +BMP_hline (BMP *bmp, int x0, int x1, int y, RGB rgb) +{ + if (x0 > x1) { + int tmp=x1; + x1=x0; + x0=tmp; + } + + while (x0 <= x1) { + BMP_point (bmp, x0++, y, rgb); + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_vline + * Purpose: Draws vertical line. + *-------------------------------------------------------------------------*/ +void +BMP_vline (BMP *bmp, int x, int y0, int y1, RGB rgb) +{ + if (y0 > y1) { + int tmp=y1; + y1=y0; + y0=tmp; + } + + while (y0 <= y1) { + BMP_point (bmp, x, y0++, rgb); + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_draw_string + * Purpose: Draws ature 5x8 characters into the image. + *-------------------------------------------------------------------------*/ +int +BMP_draw_string (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + RGB r,g,b; + RGB light, dark; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 3*0xff; + b += 3*0xff; + g += 3*0xff; + r /= 4; + g /= 4; + b /= 4; + light = b | (g << 8) | (r << 16); + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 0xff; + b += 0xff; + g += 0xff; + r /= 2; + g /= 2; + b /= 2; + dark = b | (g << 8) | (r << 16); + + const char **chars = get_font_chars (); + + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 10; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = FONT_HEIGHT * (ch - 33); + + if (ix >= 0) { + int i; + int width = 0; + + for (i=0; i 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = FONT_HEIGHT * (ch - 33); + + if (ix >= 0) { + int j; + int max_w = 0; + for (j = 0; j < FONT_HEIGHT; j++) { + const char *ptr = 
_chars [j+ix]; + int w = ptr ? strlen (ptr) : 0; + if (max_w < w) max_w = w; + } + + width += max_w + 2/* kerning */; + } + } + + return width; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_draw_mini_string + * Purpose: Draws miniature 5x8 characters into the image. + *-------------------------------------------------------------------------*/ +int +BMP_draw_mini_string (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + unsigned long r,g,b; + unsigned long light, dark; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 3*0xff; + b += 3*0xff; + g += 3*0xff; + r /= 4; + g /= 4; + b /= 4; + light = b | (g << 8) | (r << 16); + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 0xff; + b += 0xff; + g += 0xff; + r /= 2; + g /= 2; + b /= 2; + dark = b | (g << 8) | (r << 16); + + const char **mini_chars = get_minifont_chars (); + +#define MINI_HEIGHT (8) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 5; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int i; + + int width = 0; + for (i=0; i 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int max_w = 0; + int j; + for (j = 0; j < MINI_HEIGHT; j++) { + const char *ptr = mini_chars [j+ix]; + int w = ptr ? strlen (ptr) : 0; + if (max_w < w) max_w = w; + } + + width += max_w + 1/*kerning*/; + } + } + + return width; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_narrow_numbers + * Purpose: Draws miniature 4x7 characters into the image. 
+ *-------------------------------------------------------------------------*/ +int +BMP_draw_narrow_numbers (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + +#define NARROW_HEIGHT (7) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 3; + continue; + } + if (ch >= '0' && ch <= '9') + ix = ch - '0'; + else + if (ch == '.') + ix = 10; + + ix *= NARROW_HEIGHT; + + if (ix >= 0) { + int i; + int width = strlen (narrow_nums [ix]); + + for (i=0; i= bmp->width || y >= bmp->height) + return 0; + if (!bmp->pixels) + return 0; + //---------- + + return bmp->pixels[y*bmp->width + x]; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_write + * Purpose: Writes image to BMP file. + *-------------------------------------------------------------------------*/ +int +BMP_write (const BMP* bmp, const char *path) +{ + FILE *f; +#define HDRLEN (54) + unsigned char h[HDRLEN]; + unsigned long len; + int i, j; + + if (!bmp || !path) + return -1; + //---------- + + memset (h, 0, HDRLEN); + + //-------------------- + // Create the file. + // + f = fopen (path, "wb"); + if (!f) + return 0; + + //-------------------- + // Prepare header + // + len = HDRLEN + 3 * bmp->width * bmp->height; + h[0] = 'B'; + h[1] = 'M'; + h[2] = len & 0xff; + h[3] = (len >> 8) & 0xff; + h[4] = (len >> 16) & 0xff; + h[5] = (len >> 24) & 0xff; + h[10] = HDRLEN; + h[14] = 40; + h[18] = bmp->width & 0xff; + h[19] = (bmp->width >> 8) & 0xff; + h[20] = (bmp->width >> 16) & 0xff; + h[22] = bmp->height & 0xff; + h[23] = (bmp->height >> 8) & 0xff; + h[24] = (bmp->height >> 16) & 0xff; + h[26] = 1; + h[28] = 24; + h[34] = 16; + h[36] = 0x13; // 2835 pixels/meter + h[37] = 0x0b; + h[42] = 0x13; // 2835 pixels/meter + h[43] = 0x0b; + + //-------------------- + // Write header. 
+ // + if (HDRLEN != fwrite (h, 1, HDRLEN, f)) { + fclose (f); + return 0; + } + + //---------------------------------------- + // Write pixels. + // Note that BMP has lower rows first. + // + for (j=bmp->height-1; j >= 0; j--) { + for (i=0; i < bmp->width; i++) { + unsigned char rgb[3]; + int ix = i + j * bmp->width; + unsigned long pixel = bmp->pixels[ix]; + rgb[0] = pixel & 0xff; + rgb[1] = (pixel >> 8) & 0xff; + rgb[2] = (pixel >> 16) & 0xff; + if (3 != fwrite (rgb, 1, 3, f)) { + fclose (f); + return 0; + } + } + } + + fclose (f); + return 1; +} + + diff --git a/BMP.h b/BMP.h new file mode 100755 index 0000000..c3430d6 --- /dev/null +++ b/BMP.h @@ -0,0 +1,100 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
+ *============================================================================*/ + +#ifndef _BMP_H +#define _BMP_H + +#include + +#define BMPLIB_RELEASE "0.9" +#define BMPLIB_RELEASE_MAJOR 0 +#define BMPLIB_RELEASE_MINOR 9 + +typedef uint32_t RGB; +typedef uint32_t RGBA; + +typedef struct { + int width, height; + RGB *pixels; +} BMP; + +#define FONT_HEIGHT (17) +#define MINIFONT_HEIGHT (8) + +extern BMP* BMP_new (int, int); +extern void BMP_destroy (BMP*); +extern void BMP_clear (BMP*, RGB); +extern int BMP_write (const BMP*, const char *path); +extern void BMP_point (BMP*, int, int, RGB); +extern void BMP_line (BMP *, int x0, int y0, int x1, int y1, RGB); +extern void BMP_line_dashed (BMP *, int x0, int y0, int x1, int y1, RGB); +extern void BMP_hline (BMP *, int x0, int x1, int y, RGB); +extern void BMP_vline (BMP *, int x, int y0, int y1, RGB); +extern void BMP_rect (BMP *, int x, int y, int w, int h, RGB); +extern void BMP_fillrect (BMP *, int x, int y, int w, int h, RGB); +extern RGB BMP_getpixel (BMP*, int, int); + +extern int BMP_draw_string (BMP *, const char *, int x, int y, RGB); +extern int BMP_string_width (const char *); + +extern int BMP_draw_mini_string (BMP *, const char *, int x, int y, RGB); +extern int BMP_mini_string_width (const char *); + +#define RGB_BLACK (0) +#define RGB_BLUE (0xff) +#define RGB_BRASS (0xc3a368) +#define RGB_BROWN (0x8b4513) +#define RGB_CADETBLUE (0x5f9ea0) +#define RGB_CHARTREUSE (0x7fff00) +#define RGB_CORAL (0xff7f50) +#define RGB_CYAN (0xffff) +#define RGB_DARKGREEN (0x6400) +#define RGB_DARKKHAKI (0xbdb76b) +#define RGB_DARKOLIVEGREEN (0x556b2f) +#define RGB_DARKORANGE (0xff8c00) +#define RGB_DODGERBLUE (0x1e90ff) +#define RGB_GOLDENROD (0xdaa520) +#define RGB_GRAY (0xc0c0c0) +#define RGB_GREEN (0xff00) +#define RGB_KHAKI (0xf0e68c) +#define RGB_LEMONYELLOW (0xfde910) +#define RGB_MAGENTA (0xff00ff) +#define RGB_MAROON (0x800000) +#define RGB_NAVYBLUE (0x80) +#define RGB_ORANGE (0xffa500) +#define RGB_PINK (0xf77fbe) 
+#define RGB_PURPLE (0xa020f0) +#define RGB_RED (0xff0000) +#define RGB_ROYALBLUE (0x4169e1) +#define RGB_SALMON (0xfa8072) +#define RGB_TURQUOISE (0x40e0d0) +#define RGB_VIOLET (0xee82ee) +#define RGB_WHITE (0xffffff) +#define RGB_YELLOW (0xffff00) + +#define RGB_GRAY6 (0x606060) +#define RGB_GRAY8 (0x808080) +#define RGB_GRAY10 (0xa0a0a0) +#define RGB_GRAY12 (0xc0c0c0) +#define RGB_GRAY14 (0xe0e0e0) + +#endif + diff --git a/BMPGraphing.c b/BMPGraphing.c new file mode 100755 index 0000000..61ae0d7 --- /dev/null +++ b/BMPGraphing.c @@ -0,0 +1,486 @@ +/*============================================================================ + BMPGraphing, a library for graphing. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#include +#include +#include +#include + +#include "BMP.h" +#include "BMPGraphing.h" + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_draw_labels_log2 +// Purpose: Draw the labels and ticks. 
+//---------------------------------------------------------------------------- +void +BMPGraphing_draw_labels_log2 (BMPGraph* graph) +{ + if (!graph || !graph->image) + return; + + //---------------------------------------- + // Horizontal + // + // Establish min & max x values. + // + int i = 0; + Value min_x = 0x4000000000000000; + Value max_x = 0; + for (i = 0; i < graph->data_index; i += 2) { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + if (type == DATUM_X) { + if (value < min_x) + min_x = value; + if (value > max_x) + max_x = value; + } + } + graph->min_x = (long long) log2 (min_x); + graph->max_x = (long long) ceil (log2 (max_x)); + + for (i = graph->min_x; i <= graph->max_x; i++) { + char str [200]; + int x = graph->left_margin + + ((i-graph->min_x) * graph->x_span) / + (graph->max_x - graph->min_x); + int y = graph->height - graph->margin + 10; + + unsigned long y2 = 1 << i; + if (y2 < 1536) + snprintf (str, 199, "%ld B", y2); + else if (y2 < (1<<20)) { + snprintf (str, 199, "%ld kB", y2 >> 10); + } + else { + Value j = y2 >> 20; + switch ((y2 >> 18) & 3) { + case 0: snprintf (str, 199, "%lld MB", j); break; + case 1: snprintf (str, 199, "%lld.25 MB", j); break; + case 2: snprintf (str, 199, "%lld.5 MB", j); break; + case 3: snprintf (str, 199, "%lld.75 MB", j); break; + } + } + + BMP_vline (graph->image, x, y, y - 10, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 10, y + 8, RGB_BLACK); + } + + //---------------------------------------- + // Vertical + // + // Establish min & max y values. 
+ // + Value min_y = 0x4000000000000000; + Value max_y = 0; + for (i = 0; i < graph->data_index; i += 2) { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + if (type == DATUM_Y) { + if (value < min_y) + min_y = value; + if (value > max_y) + max_y = value; + } + } + graph->min_y = min_y; + graph->max_y = max_y; + + int font_height = 10; + int available_height = graph->y_span; + int max_labels = available_height / font_height; + int preferred_n_labels = graph->max_y/10000; + int actual_n_labels; + float multiplier = 1; + if (preferred_n_labels < max_labels) { + actual_n_labels = preferred_n_labels; + } else { + actual_n_labels = max_labels; + multiplier = preferred_n_labels / (float) actual_n_labels; + } + + for (i = 0; i <= actual_n_labels; i++) { + char str [200]; + int x = graph->left_margin - 10; + int y = graph->height - graph->margin - (i * graph->y_span) / (float)actual_n_labels; + + BMP_hline (graph->image, x, x+10, y, RGB_BLACK); + + int value = (int) (i * multiplier); + snprintf (str, 199, "%d GB/s", value); + BMP_draw_mini_string (graph->image, str, x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } +} + +BMPGraph * +BMPGraphing_new (int w, int h, int x_axis_mode) +{ + if (x_axis_mode != MODE_X_AXIS_LINEAR && x_axis_mode != MODE_X_AXIS_LOG2) + return NULL; + + BMPGraph *graph = (BMPGraph*) malloc (sizeof(BMPGraph)); + if (!graph) + return NULL; + + bzero (graph, sizeof(BMPGraph)); + + graph->x_axis_mode = x_axis_mode; + + if (w <= 0 || h <= 0) { + w = 1920; + h = 1080; + } + + graph->width = w; + graph->height = h; + graph->image = BMP_new (w, h); + graph->margin = 40; + graph->left_margin = 80; + + BMP_clear (graph->image, RGB_WHITE); + + BMP_hline (graph->image, graph->left_margin, graph->width - graph->margin, graph->height - graph->margin, RGB_BLACK); + BMP_vline (graph->image, graph->left_margin, graph->margin, graph->height - graph->margin, RGB_BLACK); + + graph->x_span = graph->width - (graph->margin + graph->left_margin); + 
graph->y_span = graph->height - 2 * graph->margin; + + graph->legend_y = graph->margin; + + return graph; +} + +void BMPGraphing_set_title (BMPGraph* graph, const char *title) +{ + if (!graph || !title) + return; + + if (graph->title) + free (graph->title); + graph->title = strdup (title); + + BMP_draw_string (graph->image, graph->title, graph->left_margin, graph->margin/2, RGB_BLACK); +} + +void +BMPGraphing_new_line (BMPGraph *graph, char *str, RGB color) +{ + if (!graph || !graph->image) + return; + + BMP_draw_string (graph->image, str, graph->width - graph->margin - 320, graph->legend_y, 0xffffff & color); + + graph->legend_y += 17; + + graph->fg = 0; + graph->last_x = graph->last_y = -1; + + if (graph->data_index >= MAX_GRAPH_DATA-2) + return; // error ("Too many graph data."); + + graph->data [graph->data_index++] = DATUM_COLOR; + graph->data [graph->data_index++] = color; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_add_point +// Purpose: Adds a point to this list to be drawn. +//---------------------------------------------------------------------------- +void +BMPGraphing_add_point (BMPGraph *graph, Value x, Value y) +{ + if (!graph || !graph->image) + return; + + if (graph->data_index >= MAX_GRAPH_DATA-4) + return; // error ("Too many graph data."); + + graph->data [graph->data_index++] = DATUM_X; + graph->data [graph->data_index++] = x; + graph->data [graph->data_index++] = DATUM_Y; + graph->data [graph->data_index++] = y; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_plot_log2 +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- + +void +BMPGraphing_plot_log2 (BMPGraph *graph, Value x, Value y) +{ + if (!graph || !graph->image) + return; + + int i = 0; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. 
+ // + double tmp = log2 (x); + tmp -= (double) graph->min_x; + tmp *= (double) graph->x_span; + tmp /= (double) (graph->max_x - graph->min_x); + + int x2 = graph->left_margin + (int) tmp; + int y2 = graph->height - graph->margin - (y * graph->y_span) / graph->max_y; + + if (graph->last_x != -1 && graph->last_y != -1) { + if (graph->fg & DASHED) + BMP_line_dashed (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg & 0xffffff); + else + BMP_line (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg); + } + + graph->last_x = x2; + graph->last_y = y2; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_plot_linear +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- + +void +BMPGraphing_plot_linear (BMPGraph *graph, Value x, Value y, Value max_y) +{ + if (!graph || !graph->image) + return; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. The units of the + // y value is kB. + // + double tmp = 10. + log2 (x); + tmp -= (double) XVALUE_MIN; + tmp *= (double) graph->x_span; + tmp /= (double) (XVALUE_MAX - XVALUE_MIN); + int x2 = graph->left_margin + (int) tmp; + int y2 = graph->height - graph->margin - (y * graph->y_span) / max_y; + +//printf ("\tx=%d, y=%d\n",x,y); fflush(stdout); + + if (graph->last_x != -1 && graph->last_y != -1) { + if (graph->fg & DASHED) + BMP_line_dashed (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg & 0xffffff); + else + BMP_line (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg); + } + + graph->last_x = x2; + graph->last_y = y2; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_make_log2 +// Purpose: Plots all lines. 
+//---------------------------------------------------------------------------- + +static void +BMPGraphing_make_log2 (BMPGraph *graph) +{ + if (!graph || !graph->image) + return; + + BMPGraphing_draw_labels_log2 (graph); + + //---------------------------------------- + // OK, now draw the lines. + // + int i; + int x = -1, y = -1; + for (i = 0; i < graph->data_index; i += 2) + { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + + switch (type) { + case DATUM_Y: y = value; break; + case DATUM_X: x = value; break; + case DATUM_COLOR: + graph->fg = (unsigned long) value; + graph->last_x = -1; + graph->last_y = -1; + break; + } + + if (x != -1 && y != -1) { + BMPGraphing_plot_log2 (graph, x, y); + x = y = -1; + } + } +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_make_linear +// Purpose: Plots all lines for the network test graph. +//---------------------------------------------------------------------------- + +static void +BMPGraphing_make_linear (BMPGraph *graph) +{ + if (!graph || !graph->image) + return; + + int i; + + // No data + if (!graph->data_index) + return; + + //---------------------------------------- + // Get the maximum bandwidth in order to + // properly scale the graph vertically. + // + int max_y = 0; + for (i = 0; i < graph->data_index; i += 2) { + if (graph->data[i] == DATUM_Y) { + int y = graph->data [i+1]; + if (y > max_y) + max_y = y; + } + } + + int range = max_y > 10000 ? 2 : (max_y > 1000 ? 1 : 0); + int y_spacing = 1; + switch (range) { + case 2: + // Round up to the next 100.00 MB/sec. (=10000). + y_spacing = 10000; + break; + case 1: + // Round up to the next 10.00 MB/sec. + y_spacing = 1000; + break; + case 0: + // Round up to the next 1.00 MB/sec. + y_spacing = 100; + break; + } + max_y /= y_spacing; + max_y *= y_spacing; + max_y += y_spacing; + + //---------------------------------------- + // Draw the axes, ticks & labels. 
+ // + // X axis: + if (XVALUE_MIN < 10) + return; // error ("Minimum y is too small."); + + for (i = XVALUE_MIN; i <= XVALUE_MAX; i++) { + char str[200]; + unsigned long y2 = 1 << (i-10); // XX XVALUE_MIN>=10 + if (y2 < 1024) + snprintf (str, 199, "%u kB", (unsigned int) y2); + else + snprintf (str, 199, "%lu MB", (unsigned long) (y2 >> 10)); + + int x = graph->left_margin + ((i - XVALUE_MIN) * graph->x_span) / (XVALUE_MAX - XVALUE_MIN); + int y = graph->height - graph->margin + 10; + + BMP_vline (graph->image, x, y, y-10, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 10, y+8, RGB_BLACK); + } + + //---------- + // Y axis: + // Decide what the tick spacing will be. + for (i = 0; i <= max_y; i += y_spacing) { + char str[200]; + unsigned long whole = i / 100; + unsigned long frac = i % 100; + snprintf (str, 199, "%lu.%02lu MB/s", whole, frac); + + int x = graph->left_margin - 10; + int y = graph->height - graph->margin - (i * graph->y_span) / max_y; + + BMP_hline (graph->image, x, x+10, y, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 60, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } + + //---------------------------------------- + // Draw the data lines. 
+ // + int x = -1, y = -1; + graph->last_x = -1; + graph->last_y = -1; + for (i = 0; i < graph->data_index; i += 2) + { + int type = graph->data[i]; + long value = graph->data[i+1]; + + switch (type) { + case DATUM_Y: y = value; break; + case DATUM_X: x = value; break; + case DATUM_COLOR: + graph->fg = (unsigned long) value; + graph->last_x = -1; + graph->last_y = -1; + break; + } + + if (x != -1 && y != -1) { + BMPGraphing_plot_linear (graph, x, y, max_y); + x = y = -1; + } + } +} + +void +BMPGraphing_make (BMPGraph *graph) +{ + if (!graph) + return; // XX silent error + + switch (graph->x_axis_mode) { + case MODE_X_AXIS_LOG2: + BMPGraphing_make_log2 (graph); + break; + case MODE_X_AXIS_LINEAR: + BMPGraphing_make_linear (graph); + break; + default: + fprintf (stderr, "Invalid graph mode %d.\n", graph->x_axis_mode); + break; + } +} + +void +BMPGraphing_destroy (BMPGraph *graph) +{ + if (!graph) + return; + + if (graph->title) { + free (graph->title); + graph->title = NULL; + } + if (graph->image) { + BMP_destroy (graph->image); + graph->image = NULL; + } + + free (graph); +} diff --git a/BMPGraphing.h b/BMPGraphing.h new file mode 100755 index 0000000..4f13972 --- /dev/null +++ b/BMPGraphing.h @@ -0,0 +1,88 @@ +/*============================================================================ + BMPGraphing, a library for graphing. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#ifndef _SUPERSIMPLEGRAPHING_H +#define _SUPERSIMPLEGRAPHING_H + +#include + +#define SSG_RELEASE "0.2" + +#define XVALUE_MIN (15) +#define XVALUE_MAX (28) + +enum { + DATUM_X=0, + DATUM_Y=1, + DATUM_COLOR=2, +}; + +typedef long Coordinate; +typedef long long Value; + +enum { + MODE_X_AXIS_LINEAR = 0, + MODE_X_AXIS_LOG2 = 1, +}; + +//--------------- +// Graphing data. +// +typedef struct { + BMP *image; + char *title; + + unsigned char x_axis_mode; + + Coordinate width; + Coordinate height; + Coordinate left_margin; + Coordinate margin; + Coordinate last_x; + Coordinate last_y; + Coordinate x_span; + Coordinate y_span; + Coordinate legend_y; + + RGB fg; +#define MAX_GRAPH_DATA 50000 + Value data [MAX_GRAPH_DATA]; + int data_index; +#define DASHED 0x1000000 // dashed line flag + + Value max_y; + Value min_y; + Value min_x; + Value max_x; +} BMPGraph; + +extern void BMPGraphing_set_title (BMPGraph*, const char *); +extern void BMPGraphing_draw_labels_log2 (BMPGraph*); +extern BMPGraph *BMPGraphing_new (int w, int h, int x_axis_mode); +extern void BMPGraphing_new_line (BMPGraph *, char *str, RGB color); +extern void BMPGraphing_add_point (BMPGraph *, Value x, Value y); +extern void BMPGraphing_plot_log2 (BMPGraph *, Value x, Value y); +extern void BMPGraphing_plot_linear (BMPGraph *, Value x, Value y, Value max_amt); +extern void BMPGraphing_make (BMPGraph*); +extern BMP *BMPGraphing_get_graph (BMPGraph*); +extern void BMPGraphing_destroy (BMPGraph*); + +#endif diff --git a/COPYING.txt b/COPYING.txt new file mode 100755 index 0000000..3912109 --- /dev/null +++ b/COPYING.txt @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 
2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100755 index 0000000..913d023 --- /dev/null +++ b/Makefile @@ -0,0 +1,87 @@ +#============================================================================ +# bandwidth, a benchmark to estimate memory transfer bandwidth. +# Copyright (C) 2005-2014 by Zack T Smith. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The author may be reached at veritas@comcast.net.
+#============================================================================ + +CFLAGS= -O6 +CFLAGS= -g +CC=gcc +LD=gcc +SRC=main.c +OBJ=main.o +LIB= +AS=nasm + +message: + @echo "" + @echo "To compile for x86 Linux: make bandwidth32" + @echo "To compile for x86_64 Linux: make bandwidth64" + @echo "To compile for x86 Mac OS/X: make bandwidth-mac32" + @echo "To compile for x86_64 Mac OS/X: make bandwidth-mac64" + @echo "To compile for x86 Win32/Cygwin: make bandwidth-win32" + @echo "Note! For the Mac you will need to install the latest NASM; Apple's is insufficient." + @echo "" + +bandwidth64: main.c routines64.asm BMP64.a BMPGraphing64.a + ${AS} -f elf64 routines64.asm -o routines64.o + ${CC} ${CFLAGS} -m64 -c ${SRC} + ${LD} -m64 routines64.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth64 + +bandwidth32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f elf routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} + ${LD} -m32 routines32.o ${OBJ} BMP32.a -lm BMPGraphing32.a -o bandwidth32 + +bandwidth-mac64: main.c routines64.asm BMPGraphing64.a BMP64.a + ${AS} -f macho64 routines64.asm -o routines64.o + ${CC} ${CFLAGS} -m64 -c ${SRC} + ${LD} -m64 -lm BMPGraphing64.a BMP64.a routines64.o ${OBJ} ${LIB} -o bandwidth-mac64 + +bandwidth-mac32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f macho routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} + ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-mac32 + +bandwidth-win32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f win32 routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} -Wall -O6 -D__WIN32__ -DWINVER=0x0600 + ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-win32 + +BMPGraphing64.a: BMPGraphing.c + ${CC} ${CFLAGS} -m64 -c BMPGraphing.c + ar rvs BMPGraphing64.a BMPGraphing.o + +BMPGraphing32.a: BMPGraphing.c + ${CC} ${CFLAGS} -m32 -c BMPGraphing.c + ar rvs BMPGraphing32.a BMPGraphing.o + 
+BMP64.a: BMP.c + ${CC} ${CFLAGS} -m64 -c BMP.c font.c minifont.c + ar rvs BMP64.a BMP.o font.o minifont.o + +BMP32.a: BMP.c + ${CC} ${CFLAGS} -m32 -c BMP.c font.c minifont.c + ar rvs BMP32.a BMP.o font.o minifont.o + +clean: + rm -f main.o bandwidth bandwidth32 bandwidth64 routines32.o routines64.o + rm -f bandwidth-win32.exe bandwidth.bmp bandwidth-mac32 bandwidth-mac64 + rm -f BMP.o BMP32.a BMP64.a BMPGraphing.o BMPGraphing32.a BMPGraphing64.a + rm -f font.o minifont.o network_bandwidth.bmp + diff --git a/README.txt b/README.txt new file mode 100755 index 0000000..7189a27 --- /dev/null +++ b/README.txt @@ -0,0 +1,167 @@ + +This is the README file for my program, "bandwidth". + +Bandwidth is a benchmark that attempts to measure +memory bandwidth. In December 2010 (and as of +release 0.24), I extended 'bandwidth' to measure +network bandwidth as well. + +Bandwidth is useful because both memory bandwidth +and network bandwidth need to be measured to +give you a clear idea of what your computer(s) can do. +Merely relying on specs does not give a full picture +and indeed specs can be misleading. + +-------------------------------------------------- +MEMORY BANDWIDTH + +My program bandwidth performs sequential and random +reads and writes of varying sizes. This permits +you to infer from the graph how each type of memory +is performing. So for instance when bandwidth +writes a 256-byte chunk, you know that because +caches are normally write-back, this chunk +will reside entirely in the L1 cache. Whereas +a 512 kB chunk will mainly reside in L2. + +You could run a non-artificial benchmark and +observe that a general performance number is lower +on one machine or higher on another, but that may +conceal the cause. + +So the purpose of this program is to help you +pinpoint the cause of a performance problem, +or to affirm a general impression about a memory- +intensive program. + +It also tells you the best-case scenario e.g.
+the maximum bandwidth achieved using sequential, +128-bit memory accesses. + +Release 1.1: + - Added larger font. +Release 1.0: + - Moved graphing into BMPGraphing module. + - Finally added LODS benchmarking, which + proves how badly lodsb/lodsw/lodsd/lodsq + perform. + - Added switches --faster and --fastest. +Release 0.32: + - Improved AVX support. +Release 0.31: + - Adds cache detection for Intel 32-bit CPUs + - Adds a little AVX support. + - Fixes vector-to/from-main transfer bugs. +Release 0.30 adds cache detection for Intel 64-bit CPUs. +Release 0.29 improved graph granularity with more + 128-byte tests and removes ARM support. +Release 0.28 added a proper test of CPU features e.g. SSE 4.1. +Release 0.27 added finer-granularity 128-byte tests. +Release 0.26 fixed an issue with AMD processors. +Release 0.25 made network bandwidth bidirectional. +Release 0.24 added network bandwidth testing. + +Release 0.23 added: + - Mac OS/X 64-bit support. + - Vector-to-vector register transfer test. + - Main register to/from vector register transfer test. + - Main register byte/word/dword/qword to/from + vector register test (pinsr*, pextr* instructions). + - Memory copy test using SSE2. + - Automatic checks under Linux for SSE2 & SSE4. + +Release 0.22 added: + - Register-to-register transfer test. + - Register-to/from-stack transfer tests. + +Release 0.21 added: + - Standardized memory chunks to always be + a multiple of 256-byte mini-chunks. + - Random memory accesses, in which each + 256-byte mini-chunk accessed is accessed + in a random order, but also, inside each + mini-chunk the 32/64/128 data are accessed + pseudo-randomly as well. + - Now 'bandwidth' includes chunk sizes that + are not powers of 2, which increases + data points around the key chunk sizes + corresponding to common L1 and L2 cache + sizes. + - Command-line options: + --fast for 0.25 seconds per test. + --slow for 20 seconds per test. + --title for adding a graph title.
+ +Release 0.20 added graphing, with the graph +stored in a BMP image file. It also adds the +--slow option for more precise runs. + +Release 0.19 added a second 128-bit SSE writer +routine that bypasses the caches, in addition +to the one that doesn't. + +Release 0.18 was my Grand Unified bandwidth +benchmark that brought together support for +four operating systems: + - Linux + - Windows Mobile + - 32-bit Windows + - Mac OS/X 64-bit +and two processor architectures: + - x86 + - Intel64 +I've written custom assembly routines for +each architecture. + +Total run time for the default speed, which +has 5 seconds per test, is about 35 minutes. + +-------------------------------------------------- +NETWORK BANDWIDTH (beginning with release 0.24) + +In mid-December 2010, I extended bandwidth to measure +network bandwidth, which is useful for testing +your home or workplace network setup, and in theory +could be used to test machines across the Internet. + +Release 0.25 adds: + - Bidirectional network bandwidth testing. + - Specifiable port# (default is 49000). + +In the graph: + - Sent data appears as a solid line. + - Received data appears as a dashed line. + +The network test is pretty simple. It sends chunks +of data of varying sizes to whatever computers +(nodes) that you specify. Each of those must be +running 'bandwidth' in transponder mode. + +The chunks of data range from 32 kB up to 32 MB. +These are actually sent as a stream of 1 or more +32 kB sub-chunks. + +Sample output: + output/Network-Linux2.6-Celeron-2.8GHz-32bit-loopback.bmp + output/Network-MacOSX32-Corei5-2.4GHz-64bit-loopback.bmp + output/Network-Mac64-Linux32.bmp + +How to start a transponder: + ./bandwidth-mac64 --transponder + +Example invocation of the test leader: + ./bandwidth64 --network 192.168.1.104 + +I've tested network mode on: + Linux 32-bit + Mac OS/X 32- and 64-bit + Win/Cygwin 32-bit.
+ +-------------------------------------------------- +This program is provided without any warranty +and AS-IS. See the file COPYING for details. + +Zack Smith +1@zsmith.co +March 2013 + diff --git a/defs.h b/defs.h new file mode 100755 index 0000000..176dbd1 --- /dev/null +++ b/defs.h @@ -0,0 +1,147 @@ +/*============================================================================ + bandwidth, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at 1@zsmith.co. + *===========================================================================*/ + +//--------------------------------------------------------------------------- +// Change log +// 0.18 Grand unified version supports x86/intel64/arm, linux/win32/winmo. +// 0.19 Now have 128-bit writer that goes to cache AND one that bypasses. +// 0.20 Added my bmplib and graphing of output. Also added --slow option. +// 0.21 Adds random testing. Min chunk size = 256 B. Allows non-2^n chunks. +// 0.22 Adds register-to-register and register-to/from-stack transfers. +// 0.23 Adds vector-to-vector and register-to-vector transfers, & Mac support. +// 0.24 Adds network bandwidth tests from this PC to specified others. 
+// 0.25 Made network tests bidirectional to test asymmetric networks. +// 0.26 Fixes to prevent certain vector instructions being used w/AMD chips. +// 0.27 Added 128-byte tests for greater precision. +// 0.28 Added use of CPUID. +// 0.29 Added more 128-byte tests. +// 0.30 Adds cache identification for Intel CPUs in 64-bit mode. +// 0.31 Adds cache identification for Intel CPUs in 32-bit mode. +// 0.32 Added AVX support. +// 1.0 Moved graphing logic into BMPGraphing. Added LODS support. +// 1.1 Switched to larger font in graphing module. +//--------------------------------------------------------------------------- + +#ifndef _DEFS_H +#define _DEFS_H + +#define RELEASE "1.1" + +#ifndef bool +typedef char bool; +enum { true = 1, false = 0 }; +#endif + +#define NETWORK_DEFAULT_PORTNUM (49000) +#define NETSIZE_MIN (15) +#define NETSIZE_MAX (28) +#define NETWORK_CHUNK_SIZE (1< + +#include "BMP.h" + +// Mini characters, 8 pixels high. +static const char *font_chars_ [] = +{ + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + " ", + " ", + "##", + "##", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + " # #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ##########", + " ##########", + " ## ## ", + " ## ## ", + " ##########", + " ##########", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + "", + "", + "", + + " ## ", + " ## ", + " ########", + "## ##", + "## ## ", + "## ## ", + " ###### ", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + "########", + " ## ", + " ## ", + "", + "", + "", + + " ## ##", + " # # ##", + " ## ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "## ## ", + "## # #", + "## ## ", + "", + "", + "", + + " #####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ", + " ## ## ##", + " #### ##", + "## ## ##", + "## ####", + "## ##", + "## ##", + " ## ## ##", + " ##### ##", + "", + "", + 
"", + + "###", + "###", + " ##", + " #", + " #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + "##", + "##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + + "## ", + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "", + + " ", + "", + "", + " ##", + "## ## ##", + " ## ## ##", + " ######", + " ####", + " ######", + " ## ## ##", + "## ## ##", + " ##", + "", + "", + "", + "", + "", + + "", + "", + "", + " ##", + " ##", + " ##", + " ##", + "##########", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + "", + "", + + " ", + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "###", + "###", + " ##", + " ##", + "#", + + " ", + "", + "", + "", + "", + "", + "", + "#######", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ", + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "###", + "###", + "", + "", + "", + + " ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + +" ##", +" ##", +" ###", +"#####", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ## ", + "", +"", +"", + + " #### ", + " ## ##", + "## ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ## ", + "## ", + "## ", + "## ", + "########", + "", + "", + "", + + "########", + " ##", + " ##", + " ##", + " ## ", + " ## ", + " #### ", + " ##", + " ##", + " ##", + " ##", + "## ##", + " ## ##", + " #### ", + "", + "", + "", + + " ##", + " ###", + " ####", + " ## ##", + " ## ##", + "## ##", + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + 
"", + "", + + "########", + "## ", + "## ", + "## ", + "## ", + "###", + " ######", + " ##", + " ##", + " ##", + " ##", + "## ##", + " ## ##", + " #### ", + "", + "", + "", + + " ##### ", + " ## #", + "## ", + "## ", + "## ", + "## ", + "#######", + "## ## ", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + + "########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ", + " ", + "", + "", + "", + "###", + "###", + "", + "", + "", + "", + "###", + "###", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "###", + "###", + "", + "", + "", + "", + "###", + "###", + " ##", + " ##", + "#", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "############", + "############", + " ", + " ", + "############", + "############", + "", + "", + "", + "", + "", + + "## ", + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + " ##", + " ##", + " ### ", + " ##", + " ##", + " ##", + "", + "", + " ##", + " ##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ####", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ####", + "## ", + "## ", + " ## ##", + " #######", + "", + "", + "", + + " ##", + " ####", + " ####", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ########", + " ## 
##", + " ## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "######## ", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ", + "########", + "", + "", + "", + + "##########", + "##", + "##", + "##", + "##", + "##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##########", + "", + "", + "", + + "##########", + "##", + "##", + "##", + "##", + "##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ", + "## ", + "## ####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ###", + " ###### #", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "##########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "## ##", + "## ##", + " ## ##", + " ####", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "####", + "####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##########", + "", + "", + "", + + "## ##", + "### ###", + "#### ####", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## 
##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ### ##", + "## ### ##", + "## # ##", + "", + "", + "", + + "## ##", + "### ##", + "#### ##", + "#### ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ####", + "## ####", + "## ###", + "## ##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ######", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ###", + " ## ##", + " ##### ##", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ######", + " ## ##", + "## ##", + "## ##", + "##", + " ###", + " #####", + " ##", + " ##", + " ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "##########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ### ", + " ### ", + " # ", + "", + "", + "", + +"## ## ##", +"## ## ##", +"## ## ##", +" ## #### ##", +" ## #### ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ### ###", +" ### ###", +" # #", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " 
## ##", + " ## ##", + " ##", + " ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + "## ##", + "## ##", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ####", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "#########", + "", + "", + "", + + "#####", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "#####", + "", + + "##", + "##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "#####", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "#####", + "", + + " ##", + " ####", + " ## ##", + " ## ##", + "## ##", + "## ##", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "########", + "", + "", + "", + + "####", + "####", + "##", + " ##", + " #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + " ######", + " ## ##", + " ##", + " ##", + " ######", + " ## ##", + "## ##", + "## ##", + " ## ##", + " ##### ##", + "", + "", + "", + + "##", + "##", + "##", + "##", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "### ##", + "## ####", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ##", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ", + " ## ##", + " #####", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " #### ##", + " ## ###", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ###", + " #### ##", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ##", + "## ##", + "## ##", + "#########", + 
"##", + "##", + "##", + " ## ##", + " ######", + "", + "", + "", + + " ####", + " ##", + " ## ", + " ## ", + "#####", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + "", + "", + "", + + " ", + "", + "", + "", + " ##### #", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + "## ##", + " ######", + + "##", + "##", + "##", + "##", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + +"##", +"##", +"", +"", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", + "", + "", + "", + + " ##", + " ##", + " ", + " ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "###", + + "##", + "##", + "##", + "##", + "## ##", + "## ##", + "## ## ", + "## ## ", + "#### ", + "#### ", + "## ## ", + "## ## ", + "## ##", + "## ##", + "", + "", + "", + +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", + "", + "", + "", + + " ", + "", + "", + "", + "## ### ####", + "### ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "", + "", + "", + + " ", + "", + "", + "", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ## ", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ## ", + " ##### ", + "", + "", + "", + + " ", + "", + "", + "", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "#######", + "##", + "##", + "##", + + " ", + "", + "", + "", + " #### ##", + " ## ###", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + " ##", + " ##", + + " ", + "", + "", + "", + "## 
####", + "## ##", + "####", + "###", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ", + "", + "", + "", + " #######", + "## ##", + "##", + " ##", + " ###", + " ###", + " ##", + " ##", + "## ##", + " #######", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + "######", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ####", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### #", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " #####", + " ###", + "", + "", + "", + + " ", + "", + "", + "", + "## ## ##", + "## ## ##", + "## ## ##", + " ## #### ##", + " ## #### ##", + " ## #### ##", + " ## ## ## ##", + " ## ## ## ##", + " ##### #####", + " ### ###", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ##", + " ##", + " ## ##", + " ## ##", + "## ##", + "## ##", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ####", + " ##", + " ##", + "####", + + " ", + "", + "", + "", + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ## ", + " ## ", + "## ", + "#########", + "", + "", + "", + +}; + +const char **get_font_chars () +{ + return font_chars_; +} + diff --git a/font.h b/font.h new file mode 100755 index 0000000..5ecb8ed --- /dev/null +++ b/font.h @@ -0,0 +1,28 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +#ifndef _FONT_H +#define _FONT_H + +extern const char **get_font_chars (void); + +#endif + diff --git a/loopback.sh b/loopback.sh new file mode 100755 index 0000000..780d50f --- /dev/null +++ b/loopback.sh @@ -0,0 +1,5 @@ +#!/bin/bash +EXE=bandwidth32 +./$EXE --transponder & +./$EXE --network 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 +kill %1 diff --git a/main.c b/main.c new file mode 100755 index 0000000..2d293a8 --- /dev/null +++ b/main.c @@ -0,0 +1,2442 @@ +/*============================================================================ + bandwidth 1.1, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // gethostbyname +#include +#include +#include + +#define GRAPH_WIDTH 1440 +#define GRAPH_HEIGHT 900 + +#include "defs.h" +#include "BMP.h" +#include "BMPGraphing.h" + +#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" +#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" + +#ifdef __WIN32__ +#include +#endif + +#ifdef __linux__ +#include +#include +#endif + +static int network_port = NETWORK_DEFAULT_PORTNUM; + +enum { + NO_SSE2, + SSE2, + SSE2_BYPASS, + AVX, + AVX_BYPASS, + LODSQ, + LODSD, + LODSW, + LODSB +}; + +static BMPGraph *graph = NULL; + +static bool use_sse2 = true; +static bool use_sse4 = true; +static bool is_intel = false; +static bool is_amd = false; + +static uint32_t cpu_has_mmx = 0; +static uint32_t cpu_has_sse = 0; +static uint32_t cpu_has_sse2 = 0; +static uint32_t cpu_has_sse3 = 0; +static uint32_t cpu_has_ssse3 = 0; +static uint32_t cpu_has_sse4a = 0; +static uint32_t cpu_has_sse41 = 0; +static uint32_t cpu_has_sse42 = 0; +static uint32_t cpu_has_aes = 0; +static uint32_t cpu_has_avx = 0; +static uint32_t cpu_has_avx2 = 0; +static uint32_t cpu_has_64bit = 0; +static uint32_t cpu_has_xd = 0; + +//---------------------------------------- +// Parameters for the tests. +// + +static long usec_per_test = 5000000; // 5 seconds per memory test. 

// Chunk sizes, in bytes, in ascending order. Every entry is a multiple of
// 128, which do_write(), do_read() and do_copy() require of their size
// argument. The final 0 is presumably the end-of-list sentinel for whatever
// code walks this table -- TODO confirm against the caller.
static int chunk_sizes[] = {
	128,
	256,
	384,
	512,
	640,
	768,
	896,
	1024,
	1280,
	2048,
	3072,
	4096,
	6144,
	8192,	// Some processors' L1 data caches are only 8 kB.
	12288,
	16384,
	20480,
	24576,
	28672,
	32768,	// Common L1 data cache size.
	34*1024,
	36*1024,
	40960,
	49152,
	65536,
	131072,	// Old L2 cache size.
	192 * 1024,
	256 * 1024,	// Old L2 cache size.
	320 * 1024,
	384 * 1024,
	512 * 1024,	// Old L2 cache size.
	768 * 1024,
	1 << 20,	// 1 MB = common L2 cache size.
	(1024 + 256) * 1024,	// 1.25 MB
	(1024 + 512) * 1024,	// 1.5 MB
	(1024 + 768) * 1024,	// 1.75 MB
	1 << 21,	// 2 MB = common L2 cache size.
	(2048 + 256) * 1024,	// 2.25 MB
	(2048 + 512) * 1024,	// 2.5 MB
	(2048 + 768) * 1024,	// 2.75 MB
	3072 * 1024,	// 3 MB = common L2 cache size.
	3407872,	// 3.25 MB
	3 * 1024 * 1024 + 1024 * 512,	// 3.5 MB
	1 << 22,	// 4 MB
	5242880,	// 5 MB
	6291456,	// 6 MB (common L2 cache size)
	7 * 1024 * 1024,
	8 * 1024 * 1024,	// Xeon E3s often have 8 MB of L3.
	9 * 1024 * 1024,
	10 * 1024 * 1024,	// Xeon E5-2609 has 10 MB of L3.
	12 * 1024 * 1024,
	14 * 1024 * 1024,
	15 * 1024 * 1024,	// Xeon E5-2630 has 15 MB of L3.
	16 * 1024 * 1024,
	20 * 1024 * 1024,	// Xeon E5-2690 has 20 MB of L3.
	21 * 1024 * 1024,
	32 * 1024 * 1024,
	48 * 1024 * 1024,
	64 * 1024 * 1024,
	72 * 1024 * 1024,
	96 * 1024 * 1024,
	128 * 1024 * 1024,
	0
};

// One slot per chunk_sizes entry (including the trailing 0); by its name it
// holds the base-2 logarithm of each size -- it is filled in elsewhere.
static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)];

//----------------------------------------------------------------------------
// Name:	error
// Purpose:	Complain and exit.
//----------------------------------------------------------------------------
// Reports the message s and terminates the process; this never returns.
// On non-Windows builds the text goes to stderr and the exit status is 1.
// On Windows it is shown in a message box (truncated to 199 characters so
// that it always fits the local buffer) and the process exits with 0.
void error (char *s)
{
#ifndef __WIN32__
	fprintf (stderr, "Error: %s\n", s);
	exit (1);
#else
	wchar_t tmp [200];
	int i;
	// Stop one short of the buffer size to leave room for the terminator.
	for (i = 0; s[i] && i < 199; i++)
		tmp[i] = s[i];
	tmp[i] = 0;
	MessageBoxW (0, tmp, L"Error", 0);
	ExitProcess (0);
#endif
}

//============================================================================
// Output buffer logic.
// This is somewhat vestigial code, originating with Windows Mobile ARM port.
//
// All print* routines append to the wide-character buffer msg, which holds
// at most MSGLEN-1 characters plus the terminator. Text that does not fit is
// dropped rather than written past the end of the buffer. dump() writes the
// accumulated text out and empties the buffer.
//============================================================================

#define MSGLEN 10000
static wchar_t msg [MSGLEN];

// Appends the string s to the output buffer, truncating it if necessary.
void print (wchar_t *s)
{
	size_t used = wcslen (msg);

	// wcsncat's count is the number of characters to append, not the size
	// of the destination, so pass only the space that is still free.
	if (used < MSGLEN - 1)
		wcsncat (msg, s, MSGLEN - 1 - used);
}

// Appends a line break to the output buffer.
void newline ()
{
	print (L"\n");
}

// Appends the string s followed by a line break.
void println (wchar_t *s)
{
	print (s);
	newline ();
}

// Appends the decimal representation of d.
void print_int (int d)
{
	size_t used = wcslen (msg);

	// swprintf reports failure when the text does not fit; in that case
	// restore the terminator so the buffer stays a valid string.
	if (swprintf (msg + used, MSGLEN - used, L"%d", d) < 0)
		msg [used] = 0;
}

// Appends the decimal representation of the unsigned value d.
void print_uint (unsigned int d)
{
	size_t used = wcslen (msg);

	// %u matches unsigned int (the argument is not an unsigned long).
	if (swprintf (msg + used, MSGLEN - used, L"%u", d) < 0)
		msg [used] = 0;
}

// Appends the decimal representation of d followed by a line break.
void println_int (int d)
{
	print_int (d);
	newline ();
}

// Appends result with one decimal place followed by " MB/s".
void print_result (long double result)
{
	size_t used = wcslen (msg);

	if (swprintf (msg + used, MSGLEN - used, L"%.1Lf MB/s", result) < 0)
		msg [used] = 0;
}

// Writes the buffered text to f (stdout when f is NULL), narrowing each wide
// character to a char, then empties the buffer.
void dump (FILE *f)
{
	if (!f)
		f = stdout;

	int i = 0;
	while (msg[i]) {
		char ch = (char) msg[i];
		fputc (ch, f);
		i++;
	}

	msg [0] = 0;
}

// Writes the buffered text to stdout and flushes stdout.
void flush ()
{
	dump (NULL);
	fflush (stdout);
}

// Appends size in a human-readable unit: bytes below 1536, whole kB below
// 1 MB, otherwise MB with the fraction shown in quarter-megabyte steps.
void print_size (unsigned long size)
{
	if (size < 1536) {
		print_int (size);
		print (L" B");
	}
	else if (size < (1<<20)) {
		print_int (size >> 10);
		print (L" kB");
	} else {
		print_int (size >> 20);
		// Bits 18-19 select the quarter of a megabyte.
		switch ((size >> 18) & 3) {
		case 1: print (L".25"); break;
		case 2: print (L".5"); break;
		case 3: print (L".75"); break;
		}
		print (L" MB");
	}
}

//============================================================================
// Timing logic.
//============================================================================

//----------------------------------------------------------------------------
// Name:	mytime
// Purpose:	Reports time in microseconds.
// Note:	The value is computed in unsigned arithmetic and may wrap
//		around; callers only use the difference of two readings.
//----------------------------------------------------------------------------
unsigned long mytime ()
{
#ifndef __WIN32__
	struct timeval tv;

	// The timezone argument is obsolete; NULL is the recommended value.
	gettimeofday (&tv, NULL);

	// Convert before multiplying so that a 32-bit long wraps (which is
	// well defined for unsigned types) instead of overflowing a signed one.
	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;
#else
	return 1000 * GetTickCount ();	// accurate enough.
#endif
}

//----------------------------------------------------------------------------
// Name:	calculate_result
// Purpose:	Calculates and prints a result.
// Parameters:	chunk_size	bytes transferred per loop
//		total_loops	number of loops performed
//		diff		elapsed time in microseconds; a value of
//				zero is reported through error()
// Returns:	10 times the number of megabytes per second.
//----------------------------------------------------------------------------
int
calculate_result (unsigned long chunk_size, long long total_loops, long diff)
{
	if (!diff)
		error ("Zero time difference.");

	long double result = (long double) chunk_size;
	result *= (long double) total_loops;	// total bytes transferred
	result *= 1000000.;	// diff is in microseconds: scale to per-second
	result /= 1048576.;	// bytes -> megabytes
	result /= (long double) diff;

	print_result (result);

	return (long) (10.0 * result);
}

//============================================================================
// Tests.
//============================================================================

//----------------------------------------------------------------------------
// Name:	do_write
// Purpose:	Performs write on chunk of memory of specified size.
+//---------------------------------------------------------------------------- +int +do_write (unsigned long size, int mode, bool random) +{ + unsigned char *chunk; + unsigned char *chunk0; + unsigned long loops; + unsigned long long total_count=0; +#ifdef __x86_64__ + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + unsigned long diff=0, t0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 127) + error ("do_write(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk0 = malloc (size+64); + chunk = chunk0; + if (!chunk) + error ("Out of memory"); + + tmp = (unsigned long) chunk; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. 
+ // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random write "); + else + print (L"Sequential write "); + + switch (mode) { + case SSE2: + print (L"(128-bit), size = "); + break; + case AVX: + print (L"(256-bit), size = "); + break; + case AVX_BYPASS: + print (L"bypassing cache (256-bit), size = "); + break; + case SSE2_BYPASS: + print (L"bypassing cache (128-bit), size = "); + break; + default: +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + loops = (1 << 26) / size;// XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + switch (mode) { + case SSE2: + if (random) + RandomWriterSSE2 (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + WriterSSE2_128bytes (chunk, size, loops, value); + else + WriterSSE2 (chunk, size, loops, value); + } + break; + + case SSE2_BYPASS: + if (random) + RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + WriterSSE2_128bytes_bypass (chunk, size, loops, value); + else + WriterSSE2_bypass (chunk, size, loops, value); + } + break; + + case AVX: + if (!random) { + WriterAVX (chunk, size, loops, value); + } + break; + + case AVX_BYPASS: + if (!random) { + WriterAVX_bypass (chunk, size, loops, value); + } + break; + + default: + if (random) + RandomWriter (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + Writer_128bytes (chunk, size, loops, value); + else + Writer (chunk, size, loops, value); + } + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + flush (); + + int result = 
calculate_result (size, total_count, diff); + newline (); + + flush (); + + free ((void*)chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: do_read +// Purpose: Performs sequential read on chunk of memory of specified size. +//---------------------------------------------------------------------------- +int +do_read (unsigned long size, int mode, bool random) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk; + unsigned char *chunk0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 127) + error ("do_read(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk0 = chunk = malloc (size+64); + if (!chunk) + error ("Out of memory"); + + memset (chunk, 0, size); + + tmp = (unsigned long) chunk; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + int tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. 
+ // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random read "); + else + print (L"Sequential read "); + + switch (mode) { + case SSE2: + print (L"(128-bit), size = "); + break; + case LODSB: + print (L"(8-bit LODSB), size = "); + break; + case LODSW: + print (L"(16-bit LODSW), size = "); + break; + case LODSD: + print (L"(32-bit LODSD), size = "); + break; + case LODSQ: + print (L"(64-bit LODSQ), size = "); + break; + case AVX: + print (L"(256-bit), size = "); + break; + case AVX_BYPASS: + print (L"bypassing cache (256-bit), size = "); + break; + case SSE2_BYPASS: + print (L"bypassing cache (128-bit), size = "); + break; + default: +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + switch (mode) { + case SSE2: + if (random) + RandomReaderSSE2 (chunk_ptrs, size/256, loops); + else { + if (size & 128) + ReaderSSE2_128bytes (chunk, size, loops); + else + ReaderSSE2 (chunk, size, loops); + } + break; + + case SSE2_BYPASS: + // No random reader for bypass. 
+ // + if (random) + RandomReaderSSE2_bypass (chunk_ptrs, size/256, loops); + else { + if (size & 128) + ReaderSSE2_128bytes_bypass (chunk, size, loops); + else + ReaderSSE2_bypass (chunk, size, loops); + } + break; + + case AVX: + if (!random) { + ReaderAVX (chunk, size, loops); + } + break; + + case LODSB: + if (!random) { + ReaderLODSB (chunk, size, loops); + } + break; + + case LODSW: + if (!random) { + ReaderLODSW (chunk, size, loops); + } + break; + + case LODSD: + if (!random) { + ReaderLODSD (chunk, size, loops); + } + break; + + case LODSQ: + if (!random) { + ReaderLODSQ (chunk, size, loops); + } + break; + + default: + if (random) { + RandomReader (chunk_ptrs, size/256, loops); + } else { + if (size & 128) + Reader_128bytes (chunk, size, loops); + else + Reader (chunk, size, loops); + } + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + + +//---------------------------------------------------------------------------- +// Name: do_copy +// Purpose: Performs sequential memory copy. 
+//---------------------------------------------------------------------------- +int +do_copy (unsigned long size, int mode) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk_src; + unsigned char *chunk_dest; + unsigned char *chunk_src0; + unsigned char *chunk_dest0; + unsigned long tmp; + + if (size & 127) + error ("do_copy(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk_src0 = chunk_src = malloc (size+64); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+64); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_dest = (unsigned char*) tmp; + } + + //------------------------------------------------- + print (L"Sequential copy "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else if (mode == AVX) { + print (L"(256-bit), size = "); + } + else { +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + if (mode == SSE2) { +#ifdef __x86_64__ + if (size & 128) + CopySSE_128bytes (chunk_dest, chunk_src, size, loops); + else + CopySSE (chunk_dest, chunk_src, size, loops); +#else + CopySSE (chunk_dest, chunk_src, size, loops); +#endif + } + else if (mode == AVX) { + if (!(size & 128)) + CopyAVX (chunk_dest, chunk_src, size, loops); + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + int 
result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk_src0); + free (chunk_dest0); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: fb_readwrite +// Purpose: Performs sequential read & write tests on framebuffer memory. +//---------------------------------------------------------------------------- +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) +void +fb_readwrite (bool use_sse2) +{ + unsigned long counter, total_count; + unsigned long length; + unsigned long diff, t0; + static struct fb_fix_screeninfo fi; + static struct fb_var_screeninfo vi; + unsigned long *fb = NULL; + unsigned long datum; + int fd; + register unsigned long foo; +#ifdef __x86_64__ + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + + //------------------------------------------------- + + fd = open ("/dev/fb0", O_RDWR); + if (fd < 0) + fd = open ("/dev/fb/0", O_RDWR); + if (fd < 0) { + println (L"Cannot open framebuffer device."); + return; + } + + if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + { + if (fi.visual != FB_VISUAL_TRUECOLOR && + fi.visual != FB_VISUAL_DIRECTCOLOR ) { + close (fd); + println (L"Need direct/truecolor framebuffer device."); + return; + } else { + unsigned long fblen; + + print (L"Framebuffer resolution: "); + print_int (vi.xres); + print (L"x"); + print_int (vi.yres); + print (L", "); + print_int (vi.bits_per_pixel); + println (L" bpp\n"); + + fb = (unsigned long*) fi.smem_start; + fblen = fi.smem_len; + + fb = mmap (fb, fblen, + PROT_WRITE | PROT_READ, + MAP_SHARED, fd, 0); + if (fb == MAP_FAILED) { + close (fd); + println (L"Cannot access framebuffer memory."); + return; + } + } + } + + //------------------- 
+ // Use only the upper half of the display. + // + length = FB_SIZE; + + //------------------- + // READ + // + print (L"Framebuffer memory sequential read "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_R; + + if (use_sse2) + ReaderSSE2 (fb, length, FBLOOPS_R); + else + Reader (fb, length, FBLOOPS_R); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); + + //------------------- + // WRITE + // + print (L"Framebuffer memory sequential write "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_W; + + if (use_sse2) + WriterSSE2_bypass (fb, length, FBLOOPS_W, value); + else + Writer (fb, length, FBLOOPS_W, value); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); +} +#endif + +//---------------------------------------------------------------------------- +// Name: register_test +// Purpose: Determines bandwidth of register-to-register transfers. +//---------------------------------------------------------------------------- +void +register_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to main register transfers (64-bit) "); +#else + print (L"Main register to main register transfers (32-bit) "); +#endif + flush (); +#define REGISTER_COUNT 10000 + + t0 = mytime (); + while (diff < usec_per_test) + { + RegisterToRegister (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to vector register transfers (64-bit) "); +#else + print (L"Main register to vector register transfers (32-bit) "); +#endif + flush (); +#define VREGISTER_COUNT 3333 + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + RegisterToVector 
(VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Vector register to main register transfers (64-bit) "); +#else + print (L"Vector register to main register transfers (32-bit) "); +#endif + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + print (L"Vector register to vector register transfers (128-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (cpu_has_avx) { + print (L"Vector register to vector register transfers (256-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVectorAVX (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 8-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector8ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Vector 16-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; 
+ total_count = 0; + while (diff < usec_per_test) + { + Vector16ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 32-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector32ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 64-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector64ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 8-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register8ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Main register 16-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register16ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 32-bit datum to vector register transfers "); + flush (); + + t0 = 
mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register32ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 64-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register64ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } +} + +//---------------------------------------------------------------------------- +// Name: stack_test +// Purpose: Determines bandwidth of stack-to/from-register transfers. +//---------------------------------------------------------------------------- +void +stack_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + +#ifdef __x86_64__ + print (L"Stack-to-register transfers (64-bit) "); +#else + print (L"Stack-to-register transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackReader (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#ifdef __x86_64__ + print (L"Register-to-stack transfers (64-bit) "); +#else + print (L"Register-to-stack transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackWriter (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +} + 
+//---------------------------------------------------------------------------- +// Name: library_test +// Purpose: Performs C library tests (memset, memcpy). +//---------------------------------------------------------------------------- +void +library_test () +{ + char *a1, *a2; + unsigned long t, t0; + int i; + + #define NT_SIZE (64*1024*1024) + #define NT_SIZE2 (100) + + a1 = malloc (NT_SIZE); + if (!a1) + error ("Out of memory"); + + a2 = malloc (NT_SIZE); + if (!a2) + error ("Out of memory"); + + //-------------------------------------- + t0 = mytime (); + for (i=0; ih_addr_list); + int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(host_ip); + addr.sin_port = htons(network_port); + + if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) + { + // perror ("connect"); + close (sock); + return false; + } + + //------------------------------------ + // Start stopwatch just before the send. + // It will be stopped on receipt of + // the response. + // + unsigned long t0 = mytime (); + + //------------------------------------ + // Put # of chunks in the chunk. + // Send all of our data. + // + sprintf (chunk, "%lu\n", n_chunks); + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, chunk_size, 0); + +#if 0 + //------------------------------------ + // Set nonblocking mode. + // + int opt = 1; + ioctl (sock, FIONBIO, &opt); +#endif + + unsigned long t1 = mytime (); + + //------------------------------------ + // Read the response. + // + int amount = recv (sock, chunk, chunk_size, 0); + if (amount < 16) { + close (sock); + return false; + } + + unsigned long duration_send = mytime() - t0; + + //------------------------------------ + // Validate the response, which + // contains the transponder's + // perceived read duration. This value + // may be as little as half our number. 
+ // + unsigned long duration2 = -1; + if (strncmp ("OK: ", chunk, 4)) { + close (sock); + return false; + } + if (1 != sscanf (4+chunk, "%lu", &duration2)) { + close (sock); + return false; + } + + unsigned long remaining = chunk_size * n_chunks - amount; + while (remaining > 0) { + int amount = recv (sock, chunk, chunk_size, 0); + if (amount <= 0) { + perror ("recv"); + close (sock); + return false; + } + remaining -= amount; + } + + unsigned long duration_recv = mytime () - t1; + + *duration_send_return = duration_send; + *duration_recv_return = duration_recv; + + close (sock); + return true; +} + +//---------------------------------------------------------------------------- +// Name: ip_to_str +//---------------------------------------------------------------------------- +void +ip_to_str (unsigned long addr, char *str) +{ + if (!str) + return; + + unsigned short a = 0xff & addr; + unsigned short b = 0xff & (addr >> 8); + unsigned short c = 0xff & (addr >> 16); + unsigned short d = 0xff & (addr >> 24); + sprintf (str, "%u.%u.%u.%u", a,b,c,d); +} + +//---------------------------------------------------------------------------- +// Name: network_transponder +// Purpose: Act as a transponder, receiving chunks of data and sending +// back an acknowledgement once the enture chunk is read. +// Returns: False if a problem occurs setting up the network socket. +//---------------------------------------------------------------------------- +bool +network_transponder () +{ + struct sockaddr_in sin, from; + + //------------------------------ + // Get listening socket for port. + // Then listen on given port#. 
+ // + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(network_port); + int listensock; + if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + return false; + } + if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { + perror ("bind"); + close (listensock); + return false; + } + if (listen (listensock, 500) < 0) { + perror ("listen"); + close (listensock); + return false; + } + + bool done = false; + while (!done) { + //---------------------------------------- + // Wait for a client to contact us. + // + socklen_t len = sizeof (struct sockaddr); + int sock = accept (listensock, (struct sockaddr*) &from, &len); + if (sock < 0) { + perror ("accept"); + close (listensock); + return false; + } + + //---------------------------------------- + // Clockwatch starts when we accept the + // connection. + // + unsigned long t0 = mytime (); + + if (len != sizeof (struct sockaddr_in)) { + close (sock); + close (listensock); + return false; + } + +#if 0 + unsigned long ipaddr = from.sin_addr.s_addr; + char ipstring[30]; + ip_to_str (ipaddr, ipstring); + fprintf (stderr, "Incoming connection from %s\n", ipstring); +#endif + + //---------------------------------------- + // Read the first chunk only, in order to + // get the # of bytes that will be sent. + // + char chunk [NETWORK_CHUNK_SIZE+1]; + long n_chunks = 0; + int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + chunk [amount_read] = 0; + if (1 != sscanf (chunk, "%ld", &n_chunks)) { + close (sock); + close (listensock); + return false; + } + + //---------------------------------------- + // If the leader sends us a chunk count of + // -99, this indicates that we should exit. 
+ // + if (n_chunks == -99) { + close (sock); + close (listensock); + return true; + } + +// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); + + unsigned long long remaining = n_chunks; + remaining *= NETWORK_CHUNK_SIZE; + +// printf ("remaining="); dump_hex64(remaining); puts(""); + + remaining -= amount_read; + while (remaining > 0) { + amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + remaining -= amount_read; + + if (amount_read < 0) { + perror ("read"); + break; + } else + if (!amount_read) + break; + } + + unsigned long duration = mytime() - t0; + + //------------------------------------ + // Send response of same size. + // + sprintf (chunk, "OK: %lu\n", duration); + chunk[14] = '\n'; + + //------------------------------------ + // Send all of our data. + // + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, NETWORK_CHUNK_SIZE, 0); + + close (sock); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: network_test +//---------------------------------------------------------------------------- +bool +network_test (char **destinations, int n_destinations) +{ + int i; + + //---------------------------------------- + // The memory chunk starts with a 12-byte + // length of the overall send size. + // The memory chunk will have a list of + // the destinations in it. + // In future, there will be a mechanism + // for testing bandwidth between all nodes, + // not just the leader & each of the + // transponders. + // + char chunk [NETWORK_CHUNK_SIZE]; + memset (chunk, 0, NETWORK_CHUNK_SIZE); + sprintf (chunk, "000000000000\n%d\n", n_destinations); + for (i = 0; i < n_destinations; i++) { + char *s = destinations [i]; + int chunk_len = strlen (chunk); + int len = strlen (s); + if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { + //---------------------------------------- + // "transp" indicates that the given node + // has not yet been a leader. 
+ // In future, "done" will indicate it has. + // + sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); + } + } + + static unsigned long colors [] = { + RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE, + RGB_BLACK, RGB_CORAL, + RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE, + RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW, + RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE, + RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET, + RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD + }; +#define NCOLORS (sizeof(colors)/sizeof(unsigned long)) + + //---------------------------------------- + // For each destination, run the test. + // + for (i = 0; i < n_destinations; i++) { + bool problem = false; + + char *hostname = destinations[i]; + printf ("Bandwidth sending to %s:\n", hostname); + + char title [PATH_MAX]; + sprintf (title, "%s send (solid)", hostname); + BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY); + + //---------------------------------------- + // Cache the receive durations for later. + // + unsigned long recv_rates [NETSIZE_MAX]; + int recv_ix = 0; + + //---------------------------------------- + // Send data of increasing sizes. + // + int j = NETSIZE_MIN; + int n_runs = 64; + while (!problem && j <= NETSIZE_MAX) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + + if (!amt_to_send) // unlikely + break; + + //---------------------------------------- + // Send the data; do this n_runs times. + // + unsigned long long total_duration_send = 0; + unsigned long long total_duration_recv = 0; + + int k = n_runs; + while (k--) { + long duration_send, duration_recv; + + if (! 
network_test_core (hostname, + chunk, NETWORK_CHUNK_SIZE, chunk_count, + &duration_send, &duration_recv)) + { + problem = true; + fprintf (stderr, "\nCan't connect to %s\n", hostname); + break; + } + + total_duration_send += duration_send; + total_duration_recv += duration_recv; + } + + if (problem) + break; + + total_duration_send += n_runs/2; // Round up + total_duration_send /= n_runs; // Get average + long duration = (long) total_duration_send; + + total_duration_recv += n_runs/2; // Round up + total_duration_recv /= n_runs; // Get average + + unsigned long amt_in_kb = amt_to_send / 1024; + unsigned long amt_in_mb = amt_to_send / 1048576; + if (!amt_in_mb) { + printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb, + n_runs); + } else { + printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb, + n_runs); + } + + //------------------------------ + // Calculate send rate in MB/sec. + // + // Get total # bytes. + unsigned long long tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= duration; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + unsigned long whole = tmp / 100; + unsigned long frac = tmp % 100; + printf ("%lu.%02lu MB/s (sent)\t", whole, frac); + fflush (stdout); + + BMPGraphing_add_point (graph, amt_in_kb, tmp); + + //------------------------------ + // Calculate recv rate in MB/sec. + // + // Get total # bytes. + tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= total_duration_recv; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + whole = tmp / 100; + frac = tmp % 100; + printf ("%lu.%02lu MB/s (received)\n", whole, frac); + + recv_rates [recv_ix++] = tmp; + + j++; + n_runs >>= 1; + if (!n_runs) + n_runs = 1; + } + + //---------------------------------------- + // Now add the line for the receive rates. + // + sprintf (title, "%s receive (dashed)", hostname); + BMPGraphing_new_line (graph, title, DASHED | + (i < NCOLORS? 
colors[i] : RGB_GRAY)); + for (j = NETSIZE_MIN; j <= NETSIZE_MAX; j++) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + unsigned long amt_in_kb = amt_to_send / 1024; +// printf ("amt_in_kb=%ld\n",amt_in_kb); + + BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j-NETSIZE_MIN]); + } + + puts (""); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: usage +//---------------------------------------------------------------------------- +void +usage () +{ + printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n"); + printf ("Usage for starting network tests: bandwidth --network []\n"); + printf ("Usage for receiving network tests: bandwidth --transponder [--port ]\n"); + + exit (0); +} + +//---------------------------------------------------------------------------- +// Name: main +//---------------------------------------------------------------------------- +int +main (int argc, char **argv) +{ + int i, chunk_size; + + --argc; + ++argv; + + bool network_mode = false; + bool network_leader = false; // false => transponder + int network_destinations_size = 0; + int n_network_destinations = 0; + char **network_destinations = NULL; + + char graph_title [512] = {0}; + + i = 0; + while (i < argc) { + char *s = argv [i++]; + + if (!strcmp ("--network", s)) { + network_mode = true; + network_leader = true; + network_destinations_size = 20; + network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); + } + else + if (!strcmp ("--transponder", s)) { + network_mode = true; + } + else + if (!strcmp ("--port", s)) { + if (i != argc) + network_port = atoi (argv[i++]); + } + else + if (!strcmp ("--slow", s)) { + usec_per_test=20000000; // 20 seconds per test. + } + else + if (!strcmp ("--fast", s)) { + usec_per_test = 500000; // 0.5 seconds per test. 
+ } + else + if (!strcmp ("--faster", s)) { + usec_per_test = 50000; // 0.05 seconds per test. + } + else + if (!strcmp ("--fastest", s)) { + usec_per_test = 5000; // 0.005 seconds per test. + } + else + if (!strcmp ("--nosse2", s)) { + use_sse2 = false; + use_sse4 = false; + } + else + if (!strcmp ("--nosse4", s)) { + use_sse4 = false; + } + else + if (!strcmp ("--help", s)) { + usage (); + } + else + if (!strcmp ("--title", s) && i != argc) { + snprintf (graph_title, 511, "%s", argv[i++]); + } + else { + if ('-' == *s) + usage (); + } + } + + msg[0] = 0; + + for (i = 0; chunk_sizes[i] && i < sizeof(chunk_sizes)/sizeof(int); i++) { + chunk_sizes_log2[i] = log2 (chunk_sizes[i]); + } + + printf ("This is bandwidth version %s.\n", RELEASE); + printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n"); + printf ("This software is covered by the GNU Public License.\n"); + printf ("It is provided AS-IS, use at your own risk.\n"); + printf ("See the file COPYING for more information.\n\n"); + fflush (stdout); + + //---------------------------------------- + // If network mode selected, enter it now. + // Currently cannot combine memory tests + // & network tests. 
+ // + if (network_mode) { + if (network_leader) { + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR); + strcpy (graph_title, TITLE_MEMORY_NET); + BMPGraphing_set_title (graph, graph_title); + + network_test (network_destinations, n_network_destinations); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "network_bandwidth.bmp"); + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) + puts ("Wrote graph to network_bandwidth.bmp."); + puts (""); + puts ("Done."); +#endif + BMPGraphing_destroy (graph); + } else { + network_transponder (); + } + + return 0; + } + + uint32_t ecx = get_cpuid1_ecx (); + uint32_t edx = get_cpuid1_edx (); + cpu_has_mmx = edx & CPUID_EDX_MMX; + cpu_has_sse = edx & CPUID_EDX_SSE; + cpu_has_sse2 = edx & CPUID_EDX_SSE2; + cpu_has_sse3 = ecx & CPUID_ECX_SSE3; + cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3; + cpu_has_sse41 = ecx & CPUID_ECX_SSE41; + cpu_has_sse42 = ecx & CPUID_ECX_SSE42; + cpu_has_aes = ecx & CPUID_ECX_AES; + cpu_has_avx = ecx & CPUID_ECX_AVX; + cpu_has_avx2 = 0; + + if (cpu_has_avx) { + cpu_has_avx2 = get_cpuid7_ebx (); + cpu_has_avx2 &= CPUID_EBX_AVX2; + } + + use_sse2 = true; + use_sse4 = true; + + cpu_has_sse4a = 0; + cpu_has_64bit = 0; + cpu_has_xd = 0; + + static char family [17]; + get_cpuid_family (family); + family [16] = 0; + printf ("CPU family: %s\n", family); + + uint32_t ecx2 = get_cpuid_80000001_ecx (); + uint32_t edx2 = get_cpuid_80000001_edx (); + + if (!strcmp ("AuthenticAMD", family)) { + is_amd = true; + cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A; + } + else + if (!strcmp ("GenuineIntel", family)) { + is_intel = true; + } + + cpu_has_xd = edx2 & CPUID_EDX_XD; + cpu_has_64bit = edx2 & CPUID_EDX_INTEL64; + + printf ("CPU features: "); + if (cpu_has_mmx) printf ("MMX "); + if (cpu_has_sse) printf ("SSE "); + if (cpu_has_sse2) printf ("SSE2 "); + if (cpu_has_sse3) printf ("SSE3 "); + if (cpu_has_ssse3) printf ("SSSE3 "); + if (cpu_has_sse4a) printf ("SSE4A "); + if 
(cpu_has_sse41) printf ("SSE4.1 "); + if (cpu_has_sse42) printf ("SSE4.2 "); + if (cpu_has_aes) printf ("AES "); + if (cpu_has_avx) printf ("AVX "); + if (cpu_has_avx2) printf ("AVX2 "); + if (cpu_has_xd) printf ("XD "); + if (cpu_has_64bit) { + if (!is_amd) + printf ("Intel64 "); + else + printf ("LongMode "); + } + puts ("\n"); + + if (is_intel) { + uint32_t cache_info[4]; + i = 0; + while (1) { + get_cpuid_cache_info (cache_info, i); + if (!(cache_info[0] & 31)) + break; + +#if 0 + printf ("Cache info %d = 0x%08x, 0x%08x, 0x%08x, 0x%08x\n", i, + cache_info [0], + cache_info [1], + cache_info [2], + cache_info [3]); +#endif + printf ("Cache %d: ", i); + switch ((cache_info[0] >> 5) & 7) { + case 1: printf ("L1 "); break; + case 2: printf ("L2 "); break; + case 3: printf ("L3 "); break; + } + switch (cache_info[0] & 31) { + case 1: printf ("data cache, "); break; + case 2: printf ("instruction cache, "); break; + case 3: printf ("unified cache, "); break; + } + uint32_t n_ways = 1 + (cache_info[1] >> 22); + uint32_t line_size = 1 + (cache_info[1] & 2047); + uint32_t n_sets = 1 + cache_info[2]; + printf ("line size %d, ", line_size); + printf ("%2d-way%s, ", n_ways, n_ways>1 ? "s" : ""); + printf ("%5d sets, ", n_sets); + unsigned size = (n_ways * line_size * n_sets) >> 10; + printf ("size %dk ", size); + puts (""); + i++; + } + } + + if (!cpu_has_sse41) + use_sse4 = false; + if (!cpu_has_sse2) + use_sse2 = false; + + println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B."); + + flush (); + + //------------------------------------------------------------ + // Attempt to obtain information about the CPU. 
+ // +#ifdef __linux__ + struct stat st; + if (!stat ("/proc/cpuinfo", &st)) { +#define TMPFILE "/tmp/bandw_tmp" + unlink (TMPFILE); + if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) + perror ("system"); + + FILE *f = fopen (TMPFILE, "r"); + if (f) { + float cpu_speed = 0.0; + + if (1 == fscanf (f, "%g", &cpu_speed)) { + puts (""); + printf ("CPU speed is %g MHz.\n", cpu_speed); + } + fclose (f); + } + } else { + printf ("CPU information is not available (/proc/cpuinfo).\n"); + } + fflush (stdout); +#endif + + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2); + strcpy (graph_title, TITLE_MEMORY_GRAPH); + BMPGraphing_set_title (graph, graph_title); + + //------------------------------------------------------------ + // SSE2 sequential reads. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential reads. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random reads. 
+ // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do not bypass the caches. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential reads that do bypass the caches. 
+ // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE4 random reads that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential writes that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do bypass the caches. + // Currently on Intel CPUs (including Xeon) there is a + // microcode bug that leads to a severe drop in performance + // in this part of the test. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 random writes that bypass the caches. 
+ // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE); +#endif + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN); +#else + BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN); +#endif + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 writes. + // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN); +#endif + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 writes. 
+ // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN); +#else + BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN); +#endif + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential copy. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_copy (chunk_size, SSE2); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential copy. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_copy (chunk_size, AVX); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + +#ifdef DOING_LODS +#ifdef __x86_64__ + //------------------------------------------------------------ + // LODSQ 64-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSQ, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // LODSD 32-bit sequential reads. 
+ // + BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSD, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSW 16-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSW, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSB 8-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSB, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // Register to register. + // + newline (); + register_test (); + + //------------------------------------------------------------ + // Stack to/from register. + // + newline (); + stack_test (); + + //------------------------------------------------------------ + // C library performance. + // + newline (); + library_test (); + + //------------------------------------------------------------ + // Framebuffer read & write. 
+ // +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) + newline (); + fb_readwrite (true); +#endif + +premature_end_for_testing: + flush (); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "bandwidth.bmp"); + + puts ("\nWrote graph to bandwidth.bmp."); + puts (""); + puts ("Done."); + + BMPGraphing_destroy (graph); + + return 0; +} diff --git a/minifont.c b/minifont.c new file mode 100755 index 0000000..8aa939c --- /dev/null +++ b/minifont.c @@ -0,0 +1,845 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +#include + +#include "BMP.h" + +// Mini characters, 8 pixels high. 
+static const char *mini_chars_ [] = +{ + "#", + "#", + "#", + "#", + "#", + " ", + "#", + "", + + "## ##", + " # #", + "# #", + " ", + " ", + " ", + " ", + "", + + " # # ", + " # # ", + "#####", + " # # ", + "#####", + " # # ", + " # # ", + "", + + " # ", + " ####", + "# # ", + " ### ", + " # #", + "####", + " # ", + "", + + "## #", + " #", + " #", + " #", + " #", + "#", + "# ##", + "", + + " # ", + "# # ", + "## ", + " ## #", + "# ## ", + "# # ", + " ## #", + "", + + "##", + " #", + "#", + "", + "", + "", + "", + "", + + " #", + "#", + "#", + "#", + "#", + "#", + "#", + " #", + + "# ", + " #", + " #", + " #", + " #", + " #", + " #", + "#", + + " ", + "# # #", + " ###", + " #", + " ###", + "# # #", + "", + "", + + " ", + " #", + " #", + "#####", + " #", + " #", + "", + "", + + " ", + "", + "", + "", + "", + "##", + " #", + "#", + + " ", + "", + "", + "#####", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "#", + "", + + " #", + " #", + " #", + " #", + " #", + "#", + "#", + "", + + " ## ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ## ", + "", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + "", + + " ## ", + "# #", + " #", + " ###", + "# ", + "# ", + "####", + "", + + "####", + " #", + " # ", + " ## ", + " #", + "# #", + " ## ", + "", + + "# # ", + "# #", + "# #", + "####", + " #", + " #", + " #", + "", + + "####", + "# ", + "### ", + " #", + " #", + "# #", + " ## ", + "", + + " ## ", + "# ", + "# ", + "### ", + "# #", + "# #", + " ## ", + "", + + "####", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + "", + + " ## ", + "# #", + "# #", + " ## ", + "# #", + "# #", + " ## ", + "", + + " ## ", + "# #", + "# #", + " ###", + " #", + " # ", + " # ", + "", + + " ", + "", + "", + "#", + "", + "#", + "", + "", + + " ", + "", + " ", + "##", + " ", + "##", + " #", + "#", + + " #", + " #", + " #", + "#", + " #", + " #", + " #", + "", + + " ", + "", + "", + "#####", + " ", + "#####", + "", + "", + + "# ", + " #", + " #", + " #", + " #", 
+ " #", + "#", + "", + + " ### ", + "# #", + " #", + " ## ", + " #", + "", + " #", + "", + + " ### ", + "# #", + "# ##", + "# # #", + "# ##", + "# ", + " ###", + "", + + " # ", + " # # ", + "# #", + "# #", + "#####", + "# #", + "# #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# #", + "# #", + "####", + "", + + " ### ", + "# #", + "# ", + "# ", + "# ", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "# #", + "# #", + "# #", + "####", + "", + + "#####", + "#", + "#", + "###", + "#", + "#", + "#####", + "", + + "#####", + "# ", + "# ", + "###", + "# ", + "# ", + "#", + "", + + " ### ", + "# #", + "# ", + "# ##", + "# #", + "# #", + " ####", + "", + + "# #", + "# #", + "# #", + "#####", + "# #", + "# #", + "# #", + "", + + "###", + " #", + " #", + " #", + " #", + " #", + "###", + "", + + " ###", + " #", + " #", + " #", + " #", + "# #", + " ##", + "", + + "# #", + "# #", + "# #", + "##", + "# #", + "# #", + "# #", + "", + + "# ", + "#", + "#", + "#", + "#", + "#", + "#####", + "", + + "# #", + "## ##", + "# # #", + "# #", + "# #", + "# #", + "# #", + "", + + "# #", + "## #", + "# # #", + "# ##", + "# #", + "# #", + "# #", + "", + + " ### ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# ", + "# ", + "# ", + "", + + " ### ", + "# #", + "# #", + "# #", + "# # #", + "# # ", + " ## #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# # ", + "# # ", + "# #", + "", + + " ### ", + "# #", + "# ", + " ### ", + " #", + "# #", + " ###", + "", + + "#####", + " #", + " #", + " #", + " #", + " #", + " #", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + " # # ", + " #", + "", + + "# #", + "# #", + "# #", + "# # #", + "# # #", + "## ##", + "# #", + "", + + "# #", + "# #", + " # #", + " #", + " # #", + "# #", + "# #", + "", + + "# #", + "# #", + "# #", + " # #", + " #", + " #", + " #", + "", + + "#####", + " #", + " #", + " 
#", + " #", + "#", + "#####", + "", + + "##", + "#", + "#", + "#", + "#", + "#", + "#", + "##", + + "# ", + "#", + " #", + " #", + " #", + " #", + " #", + "", + + "##", + " #", + " #", + " #", + " #", + " #", + " #", + "##", + + " # ", + " # #", + "# #", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "", + "####", + + "##", + "#", + " #", + "", + "", + "", + "", + "", + + " ", + " ", + " ## ", + " #", + " ###", + "# #", + " ###", + "", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "### ", + "", + + " ", + " ", + " ###", + "# ", + "# ", + "# ", + " ###", + "", + + " #", + " #", + " ###", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + " ", + " ## ", + "# #", + "####", + "# ", + " ###", + "", + + " ##", + " # ", + "### ", + " # ", + " # ", + " # ", + "### ", + "", + + " ", + " ", + " ###", + "# #", + "# #", + " ###", + " #", + "### ", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "# #", + "", + + " # ", + " ", + "## ", + " # ", + " # ", + " # ", + "###", + "", + + " #", + " ", + " ##", + " #", + " #", + " #", + " #", + "## ", + + "# ", + "# ", + "# #", + "# # ", + "## ", + "# # ", + "# #", + "", + + "## ", + " # ", + " # ", + " # ", + " # ", + " # ", + "###", + "", + + " ", + "", + "####", + "# # #", + "# # #", + "# # #", + "# # #", + "", + + " ", + " ", + "###", + "# #", + "# #", + "# #", + "# #", + "", + + " ", + " ", + " ## ", + "# #", + "# #", + "# #", + " ## ", + "", + + " ", + "", + "###", + "# #", + "# #", + "###", + "#", + "#", + + " ", + "", + " ###", + "# #", + "# #", + " ###", + " #", + " # ", + + " ", + " ", + "# ##", + "## ", + "# ", + "# ", + "# ", + "", + + " ", + " ", + " ###", + "# ", + " ##", + " #", + "### ", + "", + + " # ", + " #", + "###", + " #", + " #", + " #", + " ##", + "", + + " ", + "", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + "", + "# #", + "# #", + "# #", + " # #", + " #", + "", + + " ", + "", + "# # #", + "# # #", + "# # #", + "# # #", + " # #", + "", + + " ", + 
"", + "# #", + " # #", + " #", + " # #", + "# #", + "", + + " ", + " ", + "# #", + "# #", + "# #", + " ###", + " #", + "### ", + + " ", + "", + "#####", + " #", + " #", + " # ", + "#####", + "", + +}; + +const char **get_minifont_chars () +{ + return mini_chars_; +} + diff --git a/minifont.h b/minifont.h new file mode 100755 index 0000000..a26edb9 --- /dev/null +++ b/minifont.h @@ -0,0 +1,28 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
+ *============================================================================*/ + +#ifndef _MINIFONT_H +#define _MINIFONT_H + +extern const char **get_minifont_chars (void); + +#endif + diff --git a/output/._Celeron-2.8GHz-slow.gif b/output/._Celeron-2.8GHz-slow.gif new file mode 100755 index 0000000..bdd6833 Binary files /dev/null and b/output/._Celeron-2.8GHz-slow.gif differ diff --git a/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png new file mode 100755 index 0000000..826edc9 Binary files /dev/null and b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png differ diff --git a/output/._Corei5-520M-MacOSXLion-32bit-slow.gif b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif new file mode 100755 index 0000000..6fdd000 Binary files /dev/null and b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif differ diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif new file mode 100755 index 0000000..83d30f5 Binary files /dev/null and b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif differ diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif new file mode 100755 index 0000000..5ffb9aa Binary files /dev/null and b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif differ diff --git a/output/Celeron-2.8GHz-slow.gif b/output/Celeron-2.8GHz-slow.gif new file mode 100755 index 0000000..6d89c32 Binary files /dev/null and b/output/Celeron-2.8GHz-slow.gif differ diff --git a/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png new file mode 100755 index 0000000..d8d268e Binary files /dev/null and b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png differ diff --git a/output/Corei5-520M-MacOSXLion-32bit-slow.gif b/output/Corei5-520M-MacOSXLion-32bit-slow.gif new file mode 
100755 index 0000000..364adf7 Binary files /dev/null and b/output/Corei5-520M-MacOSXLion-32bit-slow.gif differ diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif new file mode 100755 index 0000000..4ce6d5d Binary files /dev/null and b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif differ diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif new file mode 100755 index 0000000..d38a120 Binary files /dev/null and b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif differ diff --git a/routines32.asm b/routines32.asm new file mode 100755 index 0000000..44015d9 --- /dev/null +++ b/routines32.asm @@ -0,0 +1,2960 @@ +;============================================================================ +; bandwidth 0.32, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2014 by Zack T Smith. +; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at veritas@comcast.net. 
+;============================================================================= + +bits 32 +cpu ia64 + +global ReaderLODSQ +global _ReaderLODSQ + +global ReaderLODSD +global _ReaderLODSD + +global ReaderLODSW +global _ReaderLODSW + +global ReaderLODSB +global _ReaderLODSB + +; Cygwin requires the underbar-prefixed symbols. +global _WriterSSE2 +global WriterSSE2 + +global _WriterAVX +global WriterAVX + +global _WriterSSE2_128bytes +global WriterSSE2_128bytes + +global _ReaderAVX +global ReaderAVX + +global _ReaderSSE2 +global ReaderSSE2 + +global ReaderSSE2_bypass +global _ReaderSSE2_bypass + +global _ReaderSSE2_128bytes +global ReaderSSE2_128bytes + +global ReaderSSE2_128bytes_bypass +global _ReaderSSE2_128bytes_bypass + +global _RandomReaderSSE2 +global RandomReaderSSE2 + +global _RandomReaderSSE2_bypass +global RandomReaderSSE2_bypass + +global WriterAVX_bypass +global _WriterAVX_bypass + +global _WriterSSE2_bypass +global WriterSSE2_bypass + +global _WriterSSE2_128bytes_bypass +global WriterSSE2_128bytes_bypass + +global _RandomWriterSSE2_bypass +global RandomWriterSSE2_bypass + +global Reader +global _Reader + +global Writer +global _Writer + +global Reader_128bytes +global _Reader_128bytes + +global Writer_128bytes +global _Writer_128bytes + +global RandomReader +global _RandomReader + +global RandomWriter +global _RandomWriter + +global RandomWriterSSE2 +global _RandomWriterSSE2 + +global get_cpuid_family +global _get_cpuid_family + +global get_cpuid_cache_info +global _get_cpuid_cache_info + +global get_cpuid1_ecx +global _get_cpuid1_ecx + +global get_cpuid1_edx +global _get_cpuid1_edx + +global get_cpuid7_ebx +global _get_cpuid7_ebx + +global get_cpuid_80000001_ecx +global _get_cpuid_80000001_ecx + +global get_cpuid_80000001_edx +global _get_cpuid_80000001_edx + +global CopySSE +global _CopySSE + +global CopyAVX +global _CopyAVX + +global CopySSE_128bytes +global _CopySSE_128bytes + +global RegisterToRegister +global _RegisterToRegister + +global 
VectorToVector
+global _VectorToVector
+
+global VectorToVectorAVX
+global _VectorToVectorAVX
+
+global RegisterToVector
+global _RegisterToVector
+
+global VectorToRegister
+global _VectorToRegister
+
+global Register8ToVector
+global Register16ToVector
+global Register32ToVector
+global Register64ToVector
+global Vector8ToRegister
+global Vector16ToRegister
+global Vector32ToRegister
+global Vector64ToRegister
+
+global _Register8ToVector
+global _Register16ToVector
+global _Register32ToVector
+global _Register64ToVector
+global _Vector8ToRegister
+global _Vector16ToRegister
+global _Vector32ToRegister
+global _Vector64ToRegister
+
+global StackReader
+global _StackReader
+
+global StackWriter
+global _StackWriter
+
+        section .text
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSQ
+; Purpose: Placeholder for the 64-bit LODSQ reader. LODSQ cannot be encoded
+;          in 32-bit code, so this routine returns immediately.
+; Params: rdi = ptr to memory area (meaningful in the 64-bit build only)
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+        align 32
+ReaderLODSQ:
+_ReaderLODSQ:
+        ; N/A in 32-bit mode
+        ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSD
+; Purpose: Reads 32-bit values sequentially from an area of memory
+;          using LODSD instruction. EAX receives (and discards) the data.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes (overwritten with the dword count)
+;         [esp+12] = loops
+;------------------------------------------------------------------------------
+        align 32
+ReaderLODSD:
+_ReaderLODSD:
+        shr dword [esp+8], 2 ; length in double words rounded down.
+
+        push esi ; callee-saved; LODS uses it as the source pointer
+        push ecx ; REP counter
+        push edx
+
+        mov edx, [esp+12+12] ; edx = outer loop count
+.L1:
+        mov esi, [esp+4+12] ; restart at the beginning of the area
+        mov ecx, [esp+8+12]
+
+        rep lodsd
+
+        dec edx
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop esi
+        ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSW
+; Purpose: Reads 16-bit values sequentially from an area of memory
+;          using LODSW instruction. AX receives (and discards) the data.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes (overwritten with the word count)
+;         [esp+12] = loops
+;------------------------------------------------------------------------------
+        align 32
+ReaderLODSW:
+_ReaderLODSW:
+        shr dword [esp+8], 1 ; length in words rounded down.
+
+        push esi ; callee-saved; LODS uses it as the source pointer
+        push ecx ; REP counter
+        push edx
+
+        mov edx, [esp+12+12] ; edx = outer loop count
+.L1:
+        mov esi, [esp+4+12] ; restart at the beginning of the area
+        mov ecx, [esp+8+12]
+
+        rep lodsw
+
+        dec edx
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop esi
+        ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSB
+; Purpose: Reads 8-bit values sequentially from an area of memory
+;          using LODSB instruction. AL receives (and discards) the data.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes
+;         [esp+12] = loops
+;------------------------------------------------------------------------------
+        align 32
+ReaderLODSB:
+_ReaderLODSB:
+        push esi ; callee-saved; LODS uses it as the source pointer
+        push ecx ; REP counter
+        push edx
+
+        mov edx, [esp+12+12] ; edx = outer loop count
+.L1:
+        mov esi, [esp+4+12] ; restart at the beginning of the area
+        mov ecx, [esp+8+12]
+
+        rep lodsb
+
+        dec edx
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop esi
+        ret
+
+;------------------------------------------------------------------------------
+; Name: Reader
+; Purpose: Reads 32-bit values sequentially from an area of memory.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes
+;         [esp+12] = loops
+;------------------------------------------------------------------------------
+        align 64
+Reader:
+_Reader:
+        push ebx
+        push ecx
+        push edx
+
+        mov ecx, [esp+12+12] ; loops to do.
+
+        mov edx, [esp+4+12] ; ptr to memory chunk.
+        mov ebx, edx ; ebx = limit in memory
+        add ebx, [esp+8+12]
+
+.L1:
+        mov edx, [esp+4+12]
+
+.L2:
+        mov eax, [edx]
+        mov eax, [4+edx]
+        mov eax, [8+edx]
+        mov eax, [12+edx]
+        mov eax, [16+edx]
+        mov eax, [20+edx]
+        mov eax, [24+edx]
+        mov eax, [28+edx]
+        mov eax, [32+edx]
+        mov eax, [36+edx]
+        mov eax, [40+edx]
+        mov eax, [44+edx]
+        mov eax, [48+edx]
+        mov eax, [52+edx]
+        mov eax, [56+edx]
+        mov eax, [60+edx]
+        mov eax, [64+edx]
+        mov eax, [68+edx]
+        mov eax, [72+edx]
+        mov eax, [76+edx]
+        mov eax, [80+edx]
+        mov eax, [84+edx]
+        mov eax, [88+edx]
+        mov eax, [92+edx]
+        mov eax, [96+edx]
+        mov eax, [100+edx]
+        mov eax, [104+edx]
+        mov eax, [108+edx]
+        mov eax, [112+edx]
+        mov eax, [116+edx]
+        mov eax, [120+edx]
+        mov eax, [124+edx]
+
+        mov eax, [edx+128]
+        mov eax, [edx+132]
+        mov eax, [edx+136]
+        mov eax, [edx+140]
+        mov eax, [edx+144]
+        mov eax, [edx+148]
+        mov eax, [edx+152]
+        mov eax, [edx+156]
+        mov eax, [edx+160]
+        mov eax, [edx+164]
+        mov eax, [edx+168]
+        mov eax, [edx+172]
+        mov eax, [edx+176]
+        mov eax, [edx+180]
+        mov eax, [edx+184]
+        mov eax, [edx+188]
+        mov eax, [edx+192]
+        mov eax, [edx+196]
+        mov eax, [edx+200]
+        mov eax, [edx+204]
+        mov eax, [edx+208]
+        mov eax, [edx+212]
+        mov eax, [edx+216]
+        mov eax, [edx+220]
+        mov eax, [edx+224]
+        mov eax, [edx+228]
+        mov eax, [edx+232]
+        mov eax, [edx+236]
+        mov eax, [edx+240]
+        mov eax, [edx+244]
+        mov eax, [edx+248]
+        mov eax, [edx+252]
+
+        add edx, 256
+        cmp edx, ebx
+        jb .L2
+
+        sub ecx, 1
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop ebx
+        ret
+
+
+;------------------------------------------------------------------------------
+; Name: Writer
+; Purpose: Writes 32-bit value sequentially to an area of memory.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes
+;         [esp+12] = loops
+;         [esp+16] = long to write
+;------------------------------------------------------------------------------
+        align 64
+Writer:
+_Writer:
+        push ebx
+        push ecx
+        push edx
+
+        mov ecx, [esp+12+12]
+        mov eax, [esp+16+12]
+
+        mov edx, [esp+4+12] ; edx = ptr to chunk
+        mov ebx, edx
+        add ebx, [esp+8+12] ; ebx = limit in memory
+
+.L1:
+        mov edx, [esp+4+12]
+
+.L2:
+        mov [edx], eax
+        mov [4+edx], eax
+        mov [8+edx], eax
+        mov [12+edx], eax
+        mov [16+edx], eax
+        mov [20+edx], eax
+        mov [24+edx], eax
+        mov [28+edx], eax
+        mov [32+edx], eax
+        mov [36+edx], eax
+        mov [40+edx], eax
+        mov [44+edx], eax
+        mov [48+edx], eax
+        mov [52+edx], eax
+        mov [56+edx], eax
+        mov [60+edx], eax
+        mov [64+edx], eax
+        mov [68+edx], eax
+        mov [72+edx], eax
+        mov [76+edx], eax
+        mov [80+edx], eax
+        mov [84+edx], eax
+        mov [88+edx], eax
+        mov [92+edx], eax
+        mov [96+edx], eax
+        mov [100+edx], eax
+        mov [104+edx], eax
+        mov [108+edx], eax
+        mov [112+edx], eax
+        mov [116+edx], eax
+        mov [120+edx], eax
+        mov [124+edx], eax
+
+        mov [edx+128], eax
+        mov [edx+132], eax
+        mov [edx+136], eax
+        mov [edx+140], eax
+        mov [edx+144], eax
+        mov [edx+148], eax
+        mov [edx+152], eax
+        mov [edx+156], eax
+        mov [edx+160], eax
+        mov [edx+164], eax
+        mov [edx+168], eax
+        mov [edx+172], eax
+        mov [edx+176], eax
+        mov [edx+180], eax
+        mov [edx+184], eax
+        mov [edx+188], eax
+        mov [edx+192], eax
+        mov [edx+196], eax
+        mov [edx+200], eax
+        mov [edx+204], eax
+        mov [edx+208], eax
+        mov [edx+212], eax
+        mov [edx+216], eax
+        mov [edx+220], eax
+        mov [edx+224], eax
+        mov [edx+228], eax
+        mov [edx+232], eax
+        mov [edx+236], eax
+        mov [edx+240], eax
+        mov [edx+244], eax
+        mov [edx+248], eax
+        mov [edx+252], eax
+
+        add edx, 256
+        cmp edx, ebx
+        jb .L2
+
+        sub ecx, 1
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop ebx
+        ret
+
+
+;------------------------------------------------------------------------------
+; Name: Reader_128bytes
+; Purpose: Reads 32-bit values sequentially from an area of memory.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes
+;         [esp+12] = loops
+;------------------------------------------------------------------------------
+        align 64
+Reader_128bytes:
+_Reader_128bytes:
+        push ebx
+        push ecx
+        push edx
+
+        mov ecx, [esp+12+12] ; loops to do.
+
+        mov edx, [esp+4+12] ; ptr to memory chunk.
+        mov ebx, edx ; ebx = limit in memory
+        add ebx, [esp+8+12]
+
+.L1:
+        mov edx, [esp+4+12]
+
+.L2:
+        mov eax, [edx]
+        mov eax, [4+edx]
+        mov eax, [8+edx]
+        mov eax, [12+edx]
+        mov eax, [16+edx]
+        mov eax, [20+edx]
+        mov eax, [24+edx]
+        mov eax, [28+edx]
+        mov eax, [32+edx]
+        mov eax, [36+edx]
+        mov eax, [40+edx]
+        mov eax, [44+edx]
+        mov eax, [48+edx]
+        mov eax, [52+edx]
+        mov eax, [56+edx]
+        mov eax, [60+edx]
+        mov eax, [64+edx]
+        mov eax, [68+edx]
+        mov eax, [72+edx]
+        mov eax, [76+edx]
+        mov eax, [80+edx]
+        mov eax, [84+edx]
+        mov eax, [88+edx]
+        mov eax, [92+edx]
+        mov eax, [96+edx]
+        mov eax, [100+edx]
+        mov eax, [104+edx]
+        mov eax, [108+edx]
+        mov eax, [112+edx]
+        mov eax, [116+edx]
+        mov eax, [120+edx]
+        mov eax, [124+edx]
+
+        add edx, 128
+        cmp edx, ebx
+        jb .L2
+
+        sub ecx, 1
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop ebx
+        ret
+
+
+;------------------------------------------------------------------------------
+; Name: Writer_128bytes
+; Purpose: Writes 32-bit value sequentially to an area of memory.
+; Params:
+;         [esp+4] = ptr to memory area
+;         [esp+8] = length in bytes
+;         [esp+12] = loops
+;         [esp+16] = long to write
+;------------------------------------------------------------------------------
+        align 64
+Writer_128bytes:
+_Writer_128bytes:
+        push ebx
+        push ecx
+        push edx
+
+        mov ecx, [esp+12+12]
+        mov eax, [esp+16+12]
+
+        mov edx, [esp+4+12] ; edx = ptr to chunk
+        mov ebx, edx
+        add ebx, [esp+8+12] ; ebx = limit in memory
+
+.L1:
+        mov edx, [esp+4+12]
+
+.L2:
+        mov [edx], eax
+        mov [4+edx], eax
+        mov [8+edx], eax
+        mov [12+edx], eax
+        mov [16+edx], eax
+        mov [20+edx], eax
+        mov [24+edx], eax
+        mov [28+edx], eax
+        mov [32+edx], eax
+        mov [36+edx], eax
+        mov [40+edx], eax
+        mov [44+edx], eax
+        mov [48+edx], eax
+        mov [52+edx], eax
+        mov [56+edx], eax
+        mov [60+edx], eax
+        mov [64+edx], eax
+        mov [68+edx], eax
+        mov [72+edx], eax
+        mov [76+edx], eax
+        mov [80+edx], eax
+        mov [84+edx], eax
+        mov [88+edx], eax
+        mov [92+edx], eax
+        mov [96+edx], eax
+        mov [100+edx], eax
+        mov [104+edx], eax
+        mov [108+edx], eax
+        mov [112+edx], eax
+        mov [116+edx], eax
+        mov [120+edx], eax
+        mov [124+edx], eax
+
+        add edx, 128
+        cmp edx, ebx
+        jb .L2
+
+        sub ecx, 1
+        jnz .L1
+
+        pop edx
+        pop ecx
+        pop ebx
+        ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_cache_info
+; Runs CPUID leaf 4, subleaf [esp+8]; stores EAX,EBX,ECX,EDX as 4 dwords at [esp+4].
+get_cpuid_cache_info:
+_get_cpuid_cache_info:
+        push ebp
+        push ebx
+        push ecx
+        push edx
+        mov eax, 4
+        mov ecx, [esp + 16 + 4 + 4]
+        cpuid
+        mov ebp, eax
+        mov eax, [esp + 16 + 4]
+        mov [eax], ebp
+        mov [eax+4], ebx
+        mov [eax+8], ecx
+        mov [eax+12], edx
+        pop edx
+        pop ecx
+        pop ebx
+        pop ebp
+        ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_family
+; Runs CPUID leaf 0; stores EBX,EDX,ECX then a zero byte (13 bytes) at [esp+4].
+get_cpuid_family:
+_get_cpuid_family:
+        push ebx
+        push ecx
+        push edx
+        xor eax, eax
+        cpuid
+        mov eax, [esp + 12 + 4]
+        mov [eax], ebx
+        mov [eax+4], edx
+        mov [eax+8], ecx
+        mov byte [eax+12], 0
+        pop edx
+        pop ecx
+        pop ebx
+        ret
+
+;------------------------------------------------------------------------------
+; Name:		get_cpuid1_ecx
+; Purpose:	Executes CPUID with eax=1 and returns the resulting ecx in eax.
+;
+get_cpuid1_ecx:
+_get_cpuid1_ecx:
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax, 1
+	cpuid
+	mov	eax, ecx
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		get_cpuid7_ebx
+; Purpose:	Executes CPUID with eax=7, ecx=0 and returns the resulting ebx
+;		in eax.
+;
+get_cpuid7_ebx:
+_get_cpuid7_ebx:
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax, 7
+	xor	ecx, ecx	; sub-leaf 0
+	cpuid
+	mov	eax, ebx
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		get_cpuid_80000001_ecx
+; Purpose:	Executes CPUID with eax=0x80000001 and returns the resulting
+;		ecx in eax.
+;
+get_cpuid_80000001_ecx:
+_get_cpuid_80000001_ecx:
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax, ecx
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		get_cpuid_80000001_edx
+; Purpose:	Executes CPUID with eax=0x80000001 and returns the resulting
+;		edx in eax.
+;
+get_cpuid_80000001_edx:
+_get_cpuid_80000001_edx:
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax, edx
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		get_cpuid1_edx
+; Purpose:	Executes CPUID with eax=1 and returns the resulting edx in eax.
+;
+get_cpuid1_edx:
+_get_cpuid1_edx:
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax, 1
+	cpuid
+	mov	eax, edx
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderAVX
+; Purpose:	Reads 128-bit values sequentially from an area of memory
+;		using VEX-encoded loads.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; NOTE(review):	the loads are 16 bytes wide (xmm0) but spaced 32 bytes apart,
+;		so only half of each 256-byte step is actually read. The
+;		stride suggests ymm0 was intended; that would require the
+;		buffer to be 32-byte aligned -- confirm with the caller before
+;		changing.
+;------------------------------------------------------------------------------
+	align 64
+ReaderAVX:
+_ReaderAVX:
+	vzeroupper
+
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	vmovdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	vmovdqa	xmm0, [32+eax]
+	vmovdqa	xmm0, [64+eax]
+	vmovdqa	xmm0, [96+eax]
+	vmovdqa	xmm0, [128+eax]
+	vmovdqa	xmm0, [160+eax]
+	vmovdqa	xmm0, [192+eax]
+	vmovdqa	xmm0, [224+eax]
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2
+; Purpose:	Reads 128-bit values sequentially from an area of memory,
+;		256 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2:
+_ReaderSSE2:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [16+eax]
+	movdqa	xmm0, [32+eax]
+	movdqa	xmm0, [48+eax]
+	movdqa	xmm0, [64+eax]
+	movdqa	xmm0, [80+eax]
+	movdqa	xmm0, [96+eax]
+	movdqa	xmm0, [112+eax]
+
+	movdqa	xmm0, [128+eax]
+	movdqa	xmm0, [144+eax]
+	movdqa	xmm0, [160+eax]
+	movdqa	xmm0, [176+eax]
+	movdqa	xmm0, [192+eax]
+	movdqa	xmm0, [208+eax]
+	movdqa	xmm0, [224+eax]
+	movdqa	xmm0, [240+eax]
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_bypass
+; Purpose:	Reads 128-bit values sequentially from an area of memory
+;		using non-temporal loads, 256 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; Note:		despite the routine's name, MOVNTDQA is an SSE4.1 instruction.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_bypass:
+_ReaderSSE2_bypass:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movntdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	movntdqa	xmm0, [16+eax]
+	movntdqa	xmm0, [32+eax]
+	movntdqa	xmm0, [48+eax]
+	movntdqa	xmm0, [64+eax]
+	movntdqa	xmm0, [80+eax]
+	movntdqa	xmm0, [96+eax]
+	movntdqa	xmm0, [112+eax]
+
+	movntdqa	xmm0, [128+eax]
+	movntdqa	xmm0, [144+eax]
+	movntdqa	xmm0, [160+eax]
+	movntdqa	xmm0, [176+eax]
+	movntdqa	xmm0, [192+eax]
+	movntdqa	xmm0, [208+eax]
+	movntdqa	xmm0, [224+eax]
+	movntdqa	xmm0, [240+eax]
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_128bytes_bypass
+; Purpose:	Reads 128-bit values sequentially from an area of memory
+;		using non-temporal loads, 128 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; Note:		despite the routine's name, MOVNTDQA is an SSE4.1 instruction.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_128bytes_bypass:
+_ReaderSSE2_128bytes_bypass:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movntdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	movntdqa	xmm0, [16+eax]
+	movntdqa	xmm0, [32+eax]
+	movntdqa	xmm0, [48+eax]
+	movntdqa	xmm0, [64+eax]
+	movntdqa	xmm0, [80+eax]
+	movntdqa	xmm0, [96+eax]
+	movntdqa	xmm0, [112+eax]
+
+	add	eax, 128
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_128bytes
+; Purpose:	Reads 128-bit values sequentially from an area of memory,
+;		128 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_128bytes:
+_ReaderSSE2_128bytes:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [16+eax]
+	movdqa	xmm0, [32+eax]
+	movdqa	xmm0, [48+eax]
+	movdqa	xmm0, [64+eax]
+	movdqa	xmm0, [80+eax]
+	movdqa	xmm0, [96+eax]
+	movdqa	xmm0, [112+eax]
+
+	add	eax, 128
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX
+; Purpose:	Writes 128-bit values sequentially to an area of memory using
+;		VEX-encoded stores.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+; NOTE(review):	the stores are 16 bytes wide (xmm0) but spaced 32 bytes apart,
+;		so only half of each 256-byte step is actually written. The
+;		stride suggests ymm0 was intended; that would require the
+;		buffer to be 32-byte aligned -- confirm with the caller before
+;		changing.
+;------------------------------------------------------------------------------
+	align 64
+WriterAVX:
+_WriterAVX:
+	vzeroupper
+
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	vmovdqa	[eax], xmm0
+	vmovdqa	[32+eax], xmm0
+	vmovdqa	[64+eax], xmm0
+	vmovdqa	[96+eax], xmm0
+	vmovdqa	[128+eax], xmm0
+	vmovdqa	[160+eax], xmm0
+	vmovdqa	[192+eax], xmm0
+	vmovdqa	[224+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Write 128-bit values sequentially to an area of memory.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2:
+_WriterSSE2:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	[eax], xmm0
+	movdqa	[16+eax], xmm0
+	movdqa	[32+eax], xmm0
+	movdqa	[48+eax], xmm0
+	movdqa	[64+eax], xmm0
+	movdqa	[80+eax], xmm0
+	movdqa	[96+eax], xmm0
+	movdqa	[112+eax], xmm0
+
+	movdqa	[128+eax], xmm0
+	movdqa	[144+eax], xmm0
+	movdqa	[160+eax], xmm0
+	movdqa	[176+eax], xmm0
+	movdqa	[192+eax], xmm0
+	movdqa	[208+eax], xmm0
+	movdqa	[224+eax], xmm0
+	movdqa	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes
+; Purpose:	Write 128-bit values sequentially to an area of memory,
+;		128 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes:
+_WriterSSE2_128bytes:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	[eax], xmm0
+	movdqa	[16+eax], xmm0
+	movdqa	[32+eax], xmm0
+	movdqa	[48+eax], xmm0
+	movdqa	[64+eax], xmm0
+	movdqa	[80+eax], xmm0
+	movdqa	[96+eax], xmm0
+	movdqa	[112+eax], xmm0
+
+	add	eax, 128
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX_bypass
+; Purpose:	Writes 128-bit values sequentially to an area of memory using
+;		VEX-encoded non-temporal stores.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+; NOTE(review):	the stores are 16 bytes wide (xmm0) but spaced 32 bytes apart,
+;		so only half of each 256-byte step is actually written. The
+;		stride suggests ymm0 was intended; that would require the
+;		buffer to be 32-byte aligned -- confirm with the caller before
+;		changing.
+;------------------------------------------------------------------------------
+
+	align 64
+WriterAVX_bypass:
+_WriterAVX_bypass:
+	vzeroupper
+
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	vmovntdq	[eax], xmm0	; Write bypassing cache.
+	vmovntdq	[32+eax], xmm0
+	vmovntdq	[64+eax], xmm0
+	vmovntdq	[96+eax], xmm0
+	vmovntdq	[128+eax], xmm0
+	vmovntdq	[160+eax], xmm0
+	vmovntdq	[192+eax], xmm0
+	vmovntdq	[224+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Write 128-bit values sequentially to an area of memory,
+;		bypassing the cache.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movntdq	[eax], xmm0	; Write bypassing cache.
+	movntdq	[16+eax], xmm0
+	movntdq	[32+eax], xmm0
+	movntdq	[48+eax], xmm0
+	movntdq	[64+eax], xmm0
+	movntdq	[80+eax], xmm0
+	movntdq	[96+eax], xmm0
+	movntdq	[112+eax], xmm0
+
+	movntdq	[128+eax], xmm0
+	movntdq	[144+eax], xmm0
+	movntdq	[160+eax], xmm0
+	movntdq	[176+eax], xmm0
+	movntdq	[192+eax], xmm0
+	movntdq	[208+eax], xmm0
+	movntdq	[224+eax], xmm0
+	movntdq	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes_bypass
+; Purpose:	Write 128-bit values sequentially to an area of memory,
+;		bypassing the cache, 128 bytes per inner-loop iteration.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes_bypass:
+_WriterSSE2_128bytes_bypass:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = outer loop count
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movntdq	[eax], xmm0	; Write bypassing cache.
+	movntdq	[16+eax], xmm0
+	movntdq	[32+eax], xmm0
+	movntdq	[48+eax], xmm0
+	movntdq	[64+eax], xmm0
+	movntdq	[80+eax], xmm0
+	movntdq	[96+eax], xmm0
+	movntdq	[112+eax], xmm0
+
+	add	eax, 128
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReader
+; Purpose:	Reads 32-bit values in a scrambled order from chunks of memory
+;		reached through an array of chunk pointers.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is read at offsets 0..252,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+;------------------------------------------------------------------------------
+	align 64
+RandomReader:
+_RandomReader:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	eax, [edx+160]
+	mov	eax, [edx+232]
+	mov	eax, [edx+224]
+	mov	eax, [96+edx]
+	mov	eax, [edx+164]
+	mov	eax, [76+edx]
+	mov	eax, [100+edx]
+	mov	eax, [edx+220]
+	mov	eax, [edx+248]
+	mov	eax, [104+edx]
+	mov	eax, [4+edx]
+	mov	eax, [edx+136]
+	mov	eax, [112+edx]
+	mov	eax, [edx+200]
+	mov	eax, [12+edx]
+	mov	eax, [edx+128]
+	mov	eax, [edx+148]
+	mov	eax, [edx+196]
+	mov	eax, [edx+216]
+	mov	eax, [edx]
+	mov	eax, [84+edx]
+	mov	eax, [edx+140]
+	mov	eax, [edx+204]
+	mov	eax, [edx+184]
+	mov	eax, [124+edx]
+	mov	eax, [48+edx]
+	mov	eax, [64+edx]
+	mov	eax, [edx+212]
+	mov	eax, [edx+240]
+	mov	eax, [edx+236]
+	mov	eax, [24+edx]
+	mov	eax, [edx+252]
+	mov	eax, [68+edx]
+	mov	eax, [20+edx]
+	mov	eax, [72+edx]
+	mov	eax, [32+edx]
+	mov	eax, [28+edx]
+	mov	eax, [52+edx]
+	mov	eax, [edx+244]
+	mov	eax, [edx+180]
+	mov	eax, [80+edx]
+	mov	eax, [60+edx]
+	mov	eax, [8+edx]
+	mov	eax, [56+edx]
+	mov	eax, [edx+208]
+	mov	eax, [edx+228]
+	mov	eax, [40+edx]
+	mov	eax, [edx+172]
+	mov	eax, [120+edx]
+	mov	eax, [edx+176]
+	mov	eax, [108+edx]
+	mov	eax, [edx+132]
+	mov	eax, [16+edx]
+	mov	eax, [44+edx]
+	mov	eax, [92+edx]
+	mov	eax, [edx+168]
+	mov	eax, [edx+152]
+	mov	eax, [edx+156]
+	mov	eax, [edx+188]
+	mov	eax, [36+edx]
+	mov	eax, [88+edx]
+	mov	eax, [116+edx]
+	mov	eax, [edx+192]
+	mov	eax, [edx+144]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomReaderSSE2
+; Purpose:	Reads 128-bit values in a scrambled order from chunks of
+;		memory reached through an array of chunk pointers.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is read at offsets 0..240,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+;------------------------------------------------------------------------------
+	align 64
+RandomReaderSSE2:
+_RandomReaderSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [240+edx]
+	movdqa	xmm0, [128+edx]
+	movdqa	xmm0, [64+edx]
+	movdqa	xmm0, [208+edx]
+	movdqa	xmm0, [112+edx]
+	movdqa	xmm0, [176+edx]
+	movdqa	xmm0, [144+edx]
+	movdqa	xmm0, [edx]
+	movdqa	xmm0, [96+edx]
+	movdqa	xmm0, [16+edx]
+	movdqa	xmm0, [192+edx]
+	movdqa	xmm0, [160+edx]
+	movdqa	xmm0, [32+edx]
+	movdqa	xmm0, [48+edx]
+	movdqa	xmm0, [224+edx]
+	movdqa	xmm0, [80+edx]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReaderSSE2_bypass
+; Purpose:	Reads 128-bit values in a scrambled order, using non-temporal
+;		loads, from chunks of memory reached through an array of
+;		chunk pointers.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is read at offsets 0..240,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+; Note:		despite the routine's name, MOVNTDQA is an SSE4.1 instruction.
+;------------------------------------------------------------------------------
+	align 64
+RandomReaderSSE2_bypass:
+_RandomReaderSSE2_bypass:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+	movntdqa	xmm0, [240+edx]
+	movntdqa	xmm0, [edx]
+	movntdqa	xmm0, [128+edx]
+	movntdqa	xmm0, [64+edx]
+	movntdqa	xmm0, [208+edx]
+	movntdqa	xmm0, [112+edx]
+	movntdqa	xmm0, [32+edx]
+	movntdqa	xmm0, [176+edx]
+	movntdqa	xmm0, [144+edx]
+	movntdqa	xmm0, [96+edx]
+	movntdqa	xmm0, [16+edx]
+	movntdqa	xmm0, [160+edx]
+	movntdqa	xmm0, [192+edx]
+	movntdqa	xmm0, [48+edx]
+	movntdqa	xmm0, [224+edx]
+	movntdqa	xmm0, [80+edx]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriter
+; Purpose:	Writes a 32-bit value in a scrambled order into chunks of
+;		memory reached through an array of chunk pointers.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is written at offsets 0..252,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+;	[esp+16] = long to write
+;------------------------------------------------------------------------------
+	align 64
+RandomWriter:
+_RandomWriter:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; get datum.
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	[edx+212], eax
+	mov	[edx+156], eax
+	mov	[edx+132], eax
+	mov	[20+edx], eax
+	mov	[edx+172], eax
+	mov	[edx+196], eax
+	mov	[edx+248], eax
+	mov	[edx], eax
+	mov	[edx+136], eax
+	mov	[edx+228], eax
+	mov	[edx+160], eax
+	mov	[80+edx], eax
+	mov	[76+edx], eax
+	mov	[32+edx], eax
+	mov	[64+edx], eax
+	mov	[68+edx], eax
+	mov	[120+edx], eax
+	mov	[edx+216], eax
+	mov	[124+edx], eax
+	mov	[28+edx], eax
+	mov	[edx+152], eax
+	mov	[36+edx], eax
+	mov	[edx+220], eax
+	mov	[edx+188], eax
+	mov	[48+edx], eax
+	mov	[104+edx], eax
+	mov	[72+edx], eax
+	mov	[96+edx], eax
+	mov	[edx+184], eax
+	mov	[112+edx], eax
+	mov	[edx+236], eax
+	mov	[edx+224], eax
+	mov	[edx+252], eax
+	mov	[88+edx], eax
+	mov	[edx+180], eax
+	mov	[60+edx], eax
+	mov	[24+edx], eax
+	mov	[edx+192], eax
+	mov	[edx+164], eax
+	mov	[edx+204], eax
+	mov	[44+edx], eax
+	mov	[edx+168], eax
+	mov	[92+edx], eax
+	mov	[edx+208], eax
+	mov	[8+edx], eax
+	mov	[edx+144], eax
+	mov	[edx+148], eax
+	mov	[edx+128], eax
+	mov	[52+edx], eax
+	mov	[4+edx], eax
+	mov	[108+edx], eax
+	mov	[12+edx], eax
+	mov	[56+edx], eax
+	mov	[edx+200], eax
+	mov	[edx+232], eax
+	mov	[16+edx], eax
+	mov	[edx+244], eax
+	mov	[40+edx], eax
+	mov	[edx+140], eax
+	mov	[84+edx], eax
+	mov	[100+edx], eax
+	mov	[116+edx], eax
+	mov	[edx+176], eax
+	mov	[edx+240], eax
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2
+; Purpose:	Writes 128-bit value randomly to an area of memory.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is written at offsets 0..240,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+;	[esp+16] = long to write, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+RandomWriterSSE2:
+_RandomWriterSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; three pushes above, so the bias is 12
+					; (a bias of 8 would fetch the loop count).
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movdqa	[64+edx], xmm0
+	movdqa	[208+edx], xmm0
+	movdqa	[128+edx], xmm0
+	movdqa	[112+edx], xmm0
+	movdqa	[176+edx], xmm0
+	movdqa	[144+edx], xmm0
+	movdqa	[edx], xmm0
+	movdqa	[96+edx], xmm0
+	movdqa	[48+edx], xmm0
+	movdqa	[16+edx], xmm0
+	movdqa	[192+edx], xmm0
+	movdqa	[160+edx], xmm0
+	movdqa	[32+edx], xmm0
+	movdqa	[240+edx], xmm0
+	movdqa	[224+edx], xmm0
+	movdqa	[80+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2_bypass
+; Purpose:	Writes 128-bit value randomly into memory, bypassing caches.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of chunks (each chunk is written at offsets 0..240,
+;		  i.e. 256 bytes per chunk)
+;	[esp+12] = loops
+;	[esp+16] = long to write, replicated into every dword written
+;------------------------------------------------------------------------------
+	align 64
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; three pushes above, so the bias is 12
+					; (a bias of 8 would fetch the loop count).
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; PSLLDQ shifts by BYTES (counts above 15 clear
+	pslldq	xmm2, 8		; the register), so use 4/8/12 to place the
+	pslldq	xmm3, 12	; copies in dwords 1, 2 and 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; chunks are visited from last to first;
+	jc	.L2			; borrow means index 0 has been done.
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movntdq	[128+edx], xmm0
+	movntdq	[240+edx], xmm0
+	movntdq	[112+edx], xmm0
+	movntdq	[64+edx], xmm0
+	movntdq	[176+edx], xmm0
+	movntdq	[144+edx], xmm0
+	movntdq	[edx], xmm0
+	movntdq	[208+edx], xmm0
+	movntdq	[80+edx], xmm0
+	movntdq	[96+edx], xmm0
+	movntdq	[48+edx], xmm0
+	movntdq	[16+edx], xmm0
+	movntdq	[192+edx], xmm0
+	movntdq	[160+edx], xmm0
+	movntdq	[224+edx], xmm0
+	movntdq	[32+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 32-bit values between registers of
+;		the main register set.
+; Params:
+;	dword [esp+4] = loops
+;------------------------------------------------------------------------------
+	align 64
+RegisterToRegister:
+_RegisterToRegister:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]	; loops to do.
+
+.L1:
+	mov	eax, ebx	; 64 transfers by 4 bytes = 256 bytes
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+
+	dec	ecx
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVectorAVX
+; Purpose:	Reads/writes 256-bit values between registers of
+;		the vector register set, in this case YMM.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+	align 64
+VectorToVectorAVX:
+_VectorToVectorAVX:
+	vzeroupper
+
+	mov	eax, [esp + 4]
+.L1:
+	vmovdqa	ymm0, ymm1	; Each move moves 32 bytes, so we need 8
+	vmovdqa	ymm0, ymm2	; moves to transfer a 256 byte chunk.
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm2, ymm0
+	vmovdqa	ymm1, ymm2
+	vmovdqa	ymm2, ymm1
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm3, ymm1
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of
+;		the vector register set, in this case XMM.
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------ + align 64 +VectorToVector: +_VectorToVector: + mov eax, [esp + 4] +.L1: + movdqa xmm0, xmm1 + movdqa xmm0, xmm2 + movdqa xmm0, xmm3 + movdqa xmm2, xmm0 + movdqa xmm1, xmm2 + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm1 + + movdqa xmm3, xmm2 + movdqa xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm0, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm0 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: RegisterToVector +; Purpose: Writes 32-bit main register values into 128-bit vector register +; clearing the upper unused bits. +; Params: dword [esp + 4] = count. +;------------------------------------------------------------------------------ + align 64 +RegisterToVector: +_RegisterToVector: + mov eax, [esp + 4] + add eax, eax ; Double # of loops. +.L1: + movd xmm1, eax ; 32 transfers of 4 bytes = 128 bytes + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + movd xmm1, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + movd xmm1, eax + movd xmm3, eax + movd xmm2, eax + movd xmm0, eax + movd xmm1, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + movd xmm0, eax + movd xmm2, eax + movd xmm0, eax + movd xmm3, eax + movd xmm1, eax + movd xmm3, eax + movd xmm2, eax + movd xmm0, eax + + movd xmm0, eax + movd xmm3, eax + movd xmm1, eax + movd xmm2, eax + movd xmm0, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: VectorToRegister +; Purpose: Writes lowest 32 bits of vector registers into 32-bit main +; register. +; Params: dword [esp + 4] = count. +;------------------------------------------------------------------------------ + align 64 +VectorToRegister: +_VectorToRegister: + mov eax, [esp + 4] + add eax, eax ; Double # of loops. 
+ push ebx +.L1: + movd ebx, xmm1 ; 4 bytes per transfer therefore need 64 + movd ebx, xmm2 ; to transfer 256 bytes. + movd ebx, xmm3 + movd ebx, xmm0 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + movd ebx, xmm1 + movd ebx, xmm3 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + movd ebx, xmm0 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm3 + movd ebx, xmm1 + movd ebx, xmm3 + movd ebx, xmm2 + movd ebx, xmm0 + + movd ebx, xmm0 + movd ebx, xmm3 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + dec eax + jnz .L1 + + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: StackReader +; Purpose: Reads 32-bit values off the stack into registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +StackReader: +_StackReader: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. 
+ + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + +.L1: + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + mov eax, [esp+4] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + mov ebx, [esp+4] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + + dec ecx + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: StackWriter +; Purpose: Writes 32-bit values into the stack from registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +StackWriter: +_StackWriter: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. 
+ + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + + xor eax, eax + mov ebx, 0xffffffff + +.L1: + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + mov [esp+4], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + mov [esp+4], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + + sub ecx, 1 + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Register8ToVector +; Purpose: Writes 8-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: dword [esp + 4] +;------------------------------------------------------------------------------ + align 64 +Register8ToVector: +_Register8ToVector: + mov eax, [esp + 4] + sal eax, 4 ; Force some repetition. 
+.L1: + pinsrb xmm1, al, 0 ; 64 transfers x 1 byte = 64 bytes + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm1, bl, 1 + pinsrb xmm2, cl, 2 + pinsrb xmm3, dl, 3 + pinsrb xmm3, al, 4 + pinsrb xmm2, bl, 5 + pinsrb xmm1, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, al, 1 + pinsrb xmm3, al, 2 + pinsrb xmm1, al, 3 + pinsrb xmm2, al, 4 + pinsrb xmm3, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm0, al, 1 + pinsrb xmm0, al, 2 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 10 + pinsrb xmm1, bl, 11 + pinsrb xmm2, cl, 12 + pinsrb xmm3, dl, 13 + pinsrb xmm3, dil, 14 + pinsrb xmm2, cl, 15 + pinsrb xmm1, al, 6 + pinsrb xmm0, bpl, 7 + + pinsrb xmm1, al, 10 + pinsrb xmm2, al, 11 + pinsrb xmm3, al, 12 + pinsrb xmm1, al, 13 + pinsrb xmm2, al, 14 + pinsrb xmm3, al, 15 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 9 + pinsrb xmm0, al, 8 + pinsrb xmm0, al, 11 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register16ToVector +; Purpose: Writes 16-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Register16ToVector: +_Register16ToVector: + mov eax, [esp + 4] + sal eax, 3 ; Force some repetition. 
+.L1:
+    ; Unrolled body: 8 groups of 8 pinsrw, i.e. 64 two-byte inserts per pass.
+    ; eax is the pass counter, so ax changes on every pass.
+    pinsrw xmm1, ax, 0 ; 64 transfers x 2 bytes = 128 bytes
+    pinsrw xmm2, bx, 1
+    pinsrw xmm3, cx, 2
+    pinsrw xmm1, dx, 3
+    pinsrw xmm2, si, 4
+    pinsrw xmm3, di, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, sp, 7
+
+    pinsrw xmm0, ax, 0
+    pinsrw xmm1, bx, 1
+    pinsrw xmm2, cx, 2
+    pinsrw xmm3, dx, 3
+    pinsrw xmm3, si, 4
+    pinsrw xmm2, di, 5
+    pinsrw xmm1, bp, 6
+    pinsrw xmm0, sp, 7
+
+    pinsrw xmm1, ax, 0
+    pinsrw xmm2, ax, 1
+    pinsrw xmm3, ax, 2
+    pinsrw xmm1, ax, 3
+    pinsrw xmm2, ax, 4
+    pinsrw xmm3, ax, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, bx, 7
+
+    pinsrw xmm0, ax, 0
+    pinsrw xmm0, ax, 1
+    pinsrw xmm0, ax, 2
+    pinsrw xmm0, ax, 3
+    pinsrw xmm0, ax, 4
+    pinsrw xmm0, ax, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, bx, 7
+
+    pinsrw xmm1, ax, 0
+    pinsrw xmm2, bx, 1
+    pinsrw xmm3, cx, 2
+    pinsrw xmm1, dx, 3
+    pinsrw xmm2, si, 4
+    pinsrw xmm3, di, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, sp, 7
+
+    pinsrw xmm0, ax, 0
+    pinsrw xmm1, bx, 1
+    pinsrw xmm2, cx, 2
+    pinsrw xmm3, dx, 3
+    pinsrw xmm3, si, 4
+    pinsrw xmm2, di, 5
+    pinsrw xmm1, bp, 6
+    pinsrw xmm0, sp, 7
+
+    pinsrw xmm1, ax, 0
+    pinsrw xmm2, ax, 1
+    pinsrw xmm3, ax, 2
+    pinsrw xmm1, ax, 3
+    pinsrw xmm2, ax, 4
+    pinsrw xmm3, ax, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, bx, 7
+
+    pinsrw xmm0, ax, 0
+    pinsrw xmm0, ax, 1
+    pinsrw xmm0, ax, 2
+    pinsrw xmm0, ax, 3
+    pinsrw xmm0, ax, 4
+    pinsrw xmm0, ax, 5
+    pinsrw xmm0, bp, 6
+    pinsrw xmm0, bx, 7
+
+    dec eax ; one pass done
+    jnz .L1
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Register32ToVector
+; Purpose: Writes 32-bit main register values into 128-bit vector register
+;          without clearing the unused bits.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Register32ToVector:
+_Register32ToVector:
+    mov eax, [esp + 4] ; eax = loops (only stack argument)
+    sal eax, 2 ; Force some repetition.
+.L1:
+    pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes
+    pinsrd xmm2, ebx, 1 ; we need 64 transfers.
+    pinsrd xmm3, ecx, 2
+    pinsrd xmm1, edx, 3
+    pinsrd xmm2, esi, 0
+    pinsrd xmm3, edi, 1
+    pinsrd xmm0, ebp, 2
+    pinsrd xmm0, esp, 3
+
+    pinsrd xmm0, eax, 0
+    pinsrd xmm1, ebx, 1
+    pinsrd xmm2, ecx, 2
+    pinsrd xmm3, edx, 3
+    pinsrd xmm3, esi, 3
+    pinsrd xmm2, edi, 2
+    pinsrd xmm1, ebp, 1
+    pinsrd xmm0, esp, 0
+
+    pinsrd xmm1, eax, 0
+    pinsrd xmm2, eax, 1
+    pinsrd xmm3, eax, 2
+    pinsrd xmm1, eax, 3
+    pinsrd xmm2, eax, 0
+    pinsrd xmm3, eax, 1
+    pinsrd xmm0, ebp, 2
+    pinsrd xmm0, ebx, 3
+
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, eax, 1
+    pinsrd xmm0, eax, 2
+    pinsrd xmm0, eax, 3
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, ebp, 0
+    pinsrd xmm0, ebx, 0
+
+    pinsrd xmm1, eax, 0
+    pinsrd xmm2, ebx, 1
+    pinsrd xmm3, ecx, 2
+    pinsrd xmm1, edx, 3
+    pinsrd xmm2, esi, 0
+    pinsrd xmm3, edi, 1
+    pinsrd xmm0, ebp, 2
+    pinsrd xmm0, esp, 3
+
+    pinsrd xmm0, eax, 0
+    pinsrd xmm1, ebx, 1
+    pinsrd xmm2, ecx, 2
+    pinsrd xmm3, edx, 3
+    pinsrd xmm3, esi, 3
+    pinsrd xmm2, edi, 2
+    pinsrd xmm1, ebp, 1
+    pinsrd xmm0, esp, 0
+
+    pinsrd xmm1, eax, 0
+    pinsrd xmm2, eax, 1
+    pinsrd xmm3, eax, 2
+    pinsrd xmm1, eax, 3
+    pinsrd xmm2, eax, 0
+    pinsrd xmm3, eax, 1
+    pinsrd xmm0, ebp, 2
+    pinsrd xmm0, ebx, 3
+
+    ; Eighth and last group. It must hold exactly 8 inserts so that one pass
+    ; performs 64 transfers (64 x 4 bytes = 256 bytes); a ninth insert here
+    ; would make every pass move 260 bytes.
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, eax, 1
+    pinsrd xmm0, eax, 2
+    pinsrd xmm0, eax, 3
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, eax, 0
+    pinsrd xmm0, ebp, 0
+    pinsrd xmm0, ebx, 0
+
+    dec eax ; eax is the pass counter
+    jnz .L1
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Register64ToVector
+; Purpose: Writes 64-bit main register values into 128-bit vector register
+;          without clearing the unused bits.
+; Params: dword [esp + 4] = loops (ignored)
+; Notes: Stub in this 32-bit file; it returns immediately.
+;------------------------------------------------------------------------------
+Register64ToVector:
+_Register64ToVector:
+    ret
+
+
+;------------------------------------------------------------------------------
+; Name: Vector8ToRegister
+; Purpose: Writes 8-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+    mov eax, [esp + 4] ; read the argument before the push moves esp
+    sal eax, 4 ; Force some repetition.
+    push ebx ; ebx is the extraction target; restored before ret
+.L1:
+    ; Unrolled body: 8 groups of 8 pextrb, i.e. 64 one-byte extracts per pass.
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 15
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 5
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 13
+    pextrb ebx, xmm2, 14
+    pextrb ebx, xmm3, 15
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 10
+    pextrb ebx, xmm1, 11
+    pextrb ebx, xmm2, 12
+    pextrb ebx, xmm3, 13
+    pextrb ebx, xmm3, 14
+    pextrb ebx, xmm2, 15
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 5
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    dec eax ; eax is the pass counter
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector16ToRegister
+; Purpose: Writes 16-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+    mov eax, [esp + 4] ; read the argument before the push moves esp
+    sal eax, 3 ; Force some repetition.
+    push ebx ; ebx is the extraction target; restored before ret
+.L1:
+    ; Unrolled body: 8 groups of 8 pextrw, i.e. 64 two-byte extracts per pass.
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    dec eax ; eax is the pass counter
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector32ToRegister
+; Purpose: Writes 32-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector32ToRegister:
+_Vector32ToRegister:
+    mov eax, [esp + 4] ; read the argument before the push moves esp
+    sal eax, 2 ; Force some repetition.
+    push ebx ; ebx is the extraction target; restored before ret
+.L1:
+    ; Unrolled body: 4 groups of 8 pextrd, i.e. 32 four-byte extracts per pass.
+    pextrd ebx, xmm1, 0
+    pextrd ebx, xmm2, 1
+    pextrd ebx, xmm3, 2
+    pextrd ebx, xmm1, 3
+    pextrd ebx, xmm2, 0
+    pextrd ebx, xmm3, 1
+    pextrd ebx, xmm0, 2
+    pextrd ebx, xmm0, 3
+
+    pextrd ebx, xmm0, 0
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm0, 0
+
+    pextrd ebx, xmm1, 0
+    pextrd ebx, xmm2, 1
+    pextrd ebx, xmm3, 2
+    pextrd ebx, xmm1, 3
+    pextrd ebx, xmm2, 0
+    pextrd ebx, xmm3, 1
+    pextrd ebx, xmm0, 2
+    pextrd ebx, xmm0, 3
+
+    pextrd ebx, xmm0, 0
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm0, 0
+
+    dec eax ; eax is the pass counter
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector64ToRegister
+; Purpose: Writes 64-bit vector register values into main register.
+; Params: dword [esp + 4] = loops (ignored)
+; Notes: Stub in this 32-bit file; it returns immediately.
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+    ret
+
+;------------------------------------------------------------------------------
+; Name: CopyAVX
+; Purpose: Copies memory chunks that are 32-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+;         [esp + 8] = ptr to source memory area
+;         [esp + 12] = length in bytes
+;         [esp + 16] = loops
+; Notes: The length is rounded down to a multiple of 256 before copying.
+;        NOTE(review): a length below 256 rounds down to 0, and the inner
+;        loop then starts with eax = 0 and does not stop after one chunk --
+;        callers presumably always pass at least 256; confirm.
+;------------------------------------------------------------------------------
+    align 64
+CopyAVX:
+_CopyAVX:
+    vzeroupper
+    ; Register usage:
+    ; esi = source
+    ; edi = dest
+    ; ecx = loops
+    ; edx = length
+    push esi
+    push edi
+    push ecx
+    push edx
+
+    ; Four pushes moved esp by 16, hence the "+ 16" on every argument offset.
+    mov edi, [esp + 4 + 16]
+    mov esi, [esp + 8 + 16]
+    mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 16 + 16]
+
+    shr edx, 8 ; Ensure length is multiple of 256.
+    shl edx, 8
+
+.L1:
+    mov eax, edx ; eax = bytes left in this pass
+    prefetchnta [esi]
+
+.L2:
+    ; Copy 256 bytes as two 128-byte halves, each 4 loads then 4 stores.
+    vmovdqa ymm0, [esi]
+    vmovdqa ymm1, [32+esi]
+    vmovdqa ymm2, [64+esi]
+    vmovdqa ymm3, [96+esi]
+
+    vmovdqa [edi], ymm0
+    vmovdqa [32+edi], ymm1
+    vmovdqa [64+edi], ymm2
+    vmovdqa [96+edi], ymm3
+
+    vmovdqa ymm0, [128+esi]
+    vmovdqa ymm1, [128+32+esi]
+    vmovdqa ymm2, [128+64+esi]
+    vmovdqa ymm3, [128+96+esi]
+
+    vmovdqa [128+edi], ymm0
+    vmovdqa [128+32+edi], ymm1
+    vmovdqa [128+64+edi], ymm2
+    vmovdqa [128+96+edi], ymm3
+
+    add esi, 256
+    add edi, 256
+
+    sub eax, 256
+    jnz .L2
+
+    sub esi, edx ; esi now points to start.
+    sub edi, edx ; edi now points to start.
+
+    dec ecx
+    jnz .L1
+
+    pop edx
+    pop ecx
+    pop edi
+    pop esi
+    ret
+
+;------------------------------------------------------------------------------
+; Name: CopySSE
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+;         [esp + 8] = ptr to source memory area
+;         [esp + 12] = length in bytes
+;         [esp + 16] = loops
+;------------------------------------------------------------------------------
+    align 64
+CopySSE:
+_CopySSE:
+    ; Register usage:
+    ; esi = source
+    ; edi = dest
+    ; ecx = loops
+    ; edx = length
+    push esi
+    push edi
+    push ecx
+    push edx
+
+    ; Four pushes moved esp by 16, hence the "+ 16" on every argument offset.
+    mov edi, [esp + 4 + 16]
+    mov esi, [esp + 8 + 16]
+    mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 16 + 16]
+
+    shr edx, 7 ; Ensure length is multiple of 128.
+ shl edx, 7 + + ; Save our non-parameter XMM registers. + sub esp, 64 + movdqu [esp], xmm4 + movdqu [16+esp], xmm5 + movdqu [32+esp], xmm6 + movdqu [48+esp], xmm7 + +.L1: + mov eax, edx + +.L2: + prefetchnta [esi] + movdqa xmm0, [esi] + movdqa xmm1, [16+esi] + movdqa xmm2, [32+esi] + movdqa xmm3, [48+esi] + movdqa xmm4, [64+esi] + movdqa xmm5, [80+esi] + movdqa xmm6, [96+esi] + movdqa xmm7, [112+esi] + + ; 32-bit lacks xmm8 - xmm15. + + movdqa [edi], xmm0 + movdqa [16+edi], xmm1 + movdqa [32+edi], xmm2 + movdqa [48+edi], xmm3 + movdqa [64+edi], xmm4 + movdqa [80+edi], xmm5 + movdqa [96+edi], xmm6 + movdqa [112+edi], xmm7 + + add esi, 128 + add edi, 128 + + sub eax, 128 + jnz .L2 + + sub esi, edx ; rsi now points to start. + sub edi, edx ; rdi now points to start. + + dec ecx + jnz .L1 + + movdqu xmm4, [0+esp] + movdqu xmm5, [16+esp] + movdqu xmm6, [32+esp] + movdqu xmm7, [48+esp] + add esp, 64 + + pop edx + pop ecx + pop edi + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: CopySSE_128bytes +; Purpose: Copies memory chunks that are 16-byte aligned. +; Params: [esp + 4] = ptr to destination memory area +; [esp + 8] = ptr to source memory area +; [esp + 12] = length in bytes +; [esp + 16] = loops +;------------------------------------------------------------------------------ + align 64 +CopySSE_128bytes: +_CopySSE_128bytes: + jmp CopySSE + diff --git a/routines64.asm b/routines64.asm new file mode 100755 index 0000000..e49b75a --- /dev/null +++ b/routines64.asm @@ -0,0 +1,2590 @@ +;============================================================================ +; bandwidth, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2014 by Zack T Smith. 
+; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at veritas@comcast.net. +;============================================================================= + +bits 64 +cpu ia64 + +global CopySSE +global CopySSE_128bytes + +global CopyAVX +global _CopyAVX + +global ReaderLODSQ +global _ReaderLODSQ + +global ReaderLODSD +global _ReaderLODSD + +global ReaderLODSW +global _ReaderLODSW + +global ReaderLODSB +global _ReaderLODSB + +global RandomReader +global RandomReaderSSE2 +global RandomReaderSSE2_bypass +global RandomWriter +global RandomWriterSSE2 +global RandomWriterSSE2_bypass +global Reader +global Reader_128bytes +global ReaderAVX +global ReaderSSE2 +global ReaderSSE2_128bytes +global ReaderSSE2_bypass +global ReaderSSE2_128bytes_bypass +global Register16ToVector +global Register32ToVector +global Register64ToVector +global Register8ToVector +global RegisterToRegister +global RegisterToVector +global StackReader +global StackWriter +global Vector16ToRegister +global Vector32ToRegister +global Vector64ToRegister +global Vector8ToRegister +global VectorToRegister +global VectorToVector +global VectorToVectorAVX +global Writer +global Writer_128bytes +global WriterAVX +global WriterSSE2 +global WriterSSE2_128bytes +global WriterSSE2_bypass +global WriterSSE2_128bytes_bypass +global 
WriterAVX_bypass +global _WriterAVX_bypass +global _CopySSE +global _CopySSE_128bytes +global _RandomReader +global _RandomReaderSSE2 +global _RandomReaderSSE2_bypass +global _RandomWriter +global _RandomWriterSSE2 +global _RandomWriterSSE2_bypass +global _Reader +global _ReaderAVX +global _Reader_128bytes +global _ReaderSSE2 +global _ReaderSSE2_bypass +global _ReaderSSE2_128bytes +global _ReaderSSE2_128bytes_bypass +global _Register16ToVector +global _Register32ToVector +global _Register64ToVector +global _Register8ToVector +global _RegisterToRegister +global _RegisterToVector +global _StackReader +global _StackWriter +global _Vector16ToRegister +global _Vector32ToRegister +global _Vector64ToRegister +global _Vector8ToRegister +global _VectorToRegister +global _VectorToVector +global _VectorToVectorAVX +global _Writer +global _Writer_128bytes +global _WriterSSE2 +global _WriterAVX +global _WriterSSE2_128bytes +global _WriterSSE2_bypass +global _WriterSSE2_128bytes_bypass + +global get_cpuid_cache_info +global _get_cpuid_cache_info + +global get_cpuid_family +global _get_cpuid_family + +global get_cpuid1_ecx +global _get_cpuid1_ecx + +global get_cpuid1_edx +global _get_cpuid1_edx + +global get_cpuid7_ebx +global _get_cpuid7_ebx + +global get_cpuid_80000001_ecx +global _get_cpuid_80000001_ecx + +global get_cpuid_80000001_edx +global _get_cpuid_80000001_edx + +; Note: +; Unix ABI says integer param are put in these registers in this order: +; rdi, rsi, rdx, rcx, r8, r9 + + section .text + +;------------------------------------------------------------------------------ +; Name: get_cpuid_cache_info +; +get_cpuid_cache_info: +_get_cpuid_cache_info: + push rbx + push rcx + push rdx + mov rax, 4 + mov rcx, rsi + cpuid + mov [rdi], eax + mov [rdi+4], ebx + mov [rdi+8], ecx + mov [rdi+12], edx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_family +; +get_cpuid_family: 
+_get_cpuid_family: + push rbx + push rcx + push rdx + xor rax, rax + cpuid + mov [rdi], ebx + mov [rdi+4], edx + mov [rdi+8], ecx + mov byte [rdi+12], 0 + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid1_ecx +; +get_cpuid1_ecx: +_get_cpuid1_ecx: + push rbx + push rcx + push rdx + mov rax, 1 + cpuid + mov rax, rcx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid7_ebx +; +get_cpuid7_ebx: +_get_cpuid7_ebx: + push rbx + push rcx + push rdx + mov rax, 7 + xor rcx, rcx + cpuid + mov rax, rbx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid1_edx +; +get_cpuid1_edx: +_get_cpuid1_edx: + push rbx + push rcx + push rdx + mov rax, 1 + cpuid + mov rax, rdx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_80000001_ecx +; +get_cpuid_80000001_ecx: +_get_cpuid_80000001_ecx: + push rbx + push rcx + push rdx + mov rax, 0x80000001 + cpuid + mov rax, rcx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_80000001_edx +; +get_cpuid_80000001_edx: +_get_cpuid_80000001_edx: + push rbx + push rcx + push rdx + mov rax, 0x80000001 + cpuid + mov rax, rdx + pop rdx + pop rcx + pop rbx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSQ +; Purpose: Reads 64-bit values sequentially from an area of memory +; using LODSQ instruction. 
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSQ: +_ReaderLODSQ: + push rcx ; REP counter + push r10 + push r11 + mov r10, rdi + mov r11, rsi + shr r11, 3 ; length in quadwords rounded down. + +.L1: + mov rsi, r10 ; buffer start + mov rcx, r11 ; # of quadwords + + rep lodsq + + dec rdx + jnz .L1 + + pop r11 + pop r10 + pop rcx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSD +; Purpose: Reads 32-bit values sequentially from an area of memory +; using LODSD instruction. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSD: +_ReaderLODSD: + push rcx ; REP counter + push r10 + push r11 + mov r10, rdi + mov r11, rsi + shr r11, 2 ; length in double words rounded down. + +.L1: + mov rsi, r10 ; buffer start + mov rcx, r11 ; # of double words + + rep lodsd + + dec rdx + jnz .L1 + + pop r11 + pop r10 + pop rcx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSW +; Purpose: Reads 16-bit values sequentially from an area of memory +; using LODSW instruction. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSW: +_ReaderLODSW: + push rcx ; REP counter + push r10 + push r11 + mov r10, rdi + mov r11, rsi + shr r11, 1 ; length in words rounded down. + +.L1: + mov rsi, r10 ; buffer start + mov rcx, r11 ; # of words + + rep lodsw + + dec rdx + jnz .L1 + + pop r11 + pop r10 + pop rcx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSB +; Purpose: Reads 8-bit values sequentially from an area of memory +; using LODSB instruction. 
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSB: +_ReaderLODSB: + push rcx ; REP counter + push r10 + push r11 + mov r10, rdi + mov r11, rsi + +.L1: + mov rsi, r10 ; buffer start + mov rcx, r11 ; # of bytes + + rep lodsb + + dec rdx + jnz .L1 + + pop r11 + pop r10 + pop rcx + ret + +;------------------------------------------------------------------------------ +; Name: Reader +; Purpose: Reads 64-bit values sequentially from an area of memory. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +Reader: +_Reader: + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + mov rax, [r10] + mov rax, [8+r10] + mov rax, [16+r10] + mov rax, [24+r10] + mov rax, [32+r10] + mov rax, [40+r10] + mov rax, [48+r10] + mov rax, [56+r10] + mov rax, [64+r10] + mov rax, [72+r10] + mov rax, [80+r10] + mov rax, [88+r10] + mov rax, [96+r10] + mov rax, [104+r10] + mov rax, [112+r10] + mov rax, [120+r10] + mov rax, [128+r10] + mov rax, [136+r10] + mov rax, [144+r10] + mov rax, [152+r10] + mov rax, [160+r10] + mov rax, [168+r10] + mov rax, [176+r10] + mov rax, [184+r10] + mov rax, [192+r10] + mov rax, [200+r10] + mov rax, [208+r10] + mov rax, [216+r10] + mov rax, [224+r10] + mov rax, [232+r10] + mov rax, [240+r10] + mov rax, [248+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: Reader_128bytes +; Purpose: Reads 64-bit values sequentially from an area of memory. 
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+Reader_128bytes:
+_Reader_128bytes:
+    push r10
+
+    add rsi, rdi ; rsi now points to end.
+
+.L1:
+    mov r10, rdi ; restart at the beginning of the area for each loop
+
+.L2:
+    ; 16 quadword loads = 128 bytes per iteration; rax is just a sink.
+    mov rax, [r10]
+    mov rax, [8+r10]
+    mov rax, [16+r10]
+    mov rax, [24+r10]
+    mov rax, [32+r10]
+    mov rax, [40+r10]
+    mov rax, [48+r10]
+    mov rax, [56+r10]
+    mov rax, [64+r10]
+    mov rax, [72+r10]
+    mov rax, [80+r10]
+    mov rax, [88+r10]
+    mov rax, [96+r10]
+    mov rax, [104+r10]
+    mov rax, [112+r10]
+    mov rax, [120+r10]
+
+    add r10, 128
+    cmp r10, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReader
+; Purpose: Reads 64-bit values randomly from an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+RandomReader:
+_RandomReader:
+    push r10
+    push r11
+
+.L1:
+    xor r11, r11 ; r11 = index into the chunk-pointer array
+
+.L2:
+    mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+ + mov rax, [96+r10] + mov rax, [r10] + mov rax, [120+r10] + mov rax, [184+r10] + mov rax, [160+r10] + mov rax, [176+r10] + mov rax, [112+r10] + mov rax, [80+r10] + mov rax, [32+r10] + mov rax, [128+r10] + mov rax, [88+r10] + mov rax, [40+r10] + mov rax, [48+r10] + mov rax, [72+r10] + mov rax, [200+r10] + mov rax, [24+r10] + mov rax, [152+r10] + mov rax, [16+r10] + mov rax, [248+r10] + mov rax, [56+r10] + mov rax, [240+r10] + mov rax, [208+r10] + mov rax, [104+r10] + mov rax, [216+r10] + mov rax, [136+r10] + mov rax, [232+r10] + mov rax, [64+r10] + mov rax, [224+r10] + mov rax, [144+r10] + mov rax, [192+r10] + mov rax, [8+r10] + mov rax, [168+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomReaderSSE2 +; Purpose: Reads 128-bit values randomly from an area of memory. +; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +RandomReaderSSE2: +_RandomReaderSSE2: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] + + movdqa xmm0, [240+r10] + movdqa xmm0, [128+r10] + movdqa xmm0, [64+r10] + movdqa xmm0, [208+r10] + movdqa xmm0, [112+r10] + movdqa xmm0, [176+r10] + movdqa xmm0, [144+r10] + movdqa xmm0, [r10] + movdqa xmm0, [96+r10] + movdqa xmm0, [16+r10] + movdqa xmm0, [192+r10] + movdqa xmm0, [160+r10] + movdqa xmm0, [32+r10] + movdqa xmm0, [48+r10] + movdqa xmm0, [224+r10] + movdqa xmm0, [80+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomReaderSSE2_bypass +; Purpose: Reads 128-bit values randomly from an area of memory. 
+; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +RandomReaderSSE2_bypass: +_RandomReaderSSE2_bypass: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] + + ; SSE 4.1 required + movntdqa xmm0, [240+r10] + movntdqa xmm0, [r10] + movntdqa xmm0, [128+r10] + movntdqa xmm0, [64+r10] + movntdqa xmm0, [208+r10] + movntdqa xmm0, [112+r10] + movntdqa xmm0, [48+r10] + movntdqa xmm0, [176+r10] + movntdqa xmm0, [144+r10] + movntdqa xmm0, [96+r10] + movntdqa xmm0, [16+r10] + movntdqa xmm0, [160+r10] + movntdqa xmm0, [32+r10] + movntdqa xmm0, [224+r10] + movntdqa xmm0, [80+r10] + movntdqa xmm0, [192+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomWriter +; Purpose: Writes 64-bit values randomly to an area of memory. +; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +; rcx = datum to write +;------------------------------------------------------------------------------ + align 64 +RandomWriter: +_RandomWriter: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] ; Note, 64-bit pointers. 
+
+    mov [96+r10], rcx
+    mov [r10], rcx
+    mov [120+r10], rcx
+    mov [184+r10], rcx
+    mov [160+r10], rcx
+    mov [176+r10], rcx
+    mov [112+r10], rcx
+    mov [80+r10], rcx
+    mov [32+r10], rcx
+    mov [128+r10], rcx
+    mov [88+r10], rcx
+    mov [40+r10], rcx
+    mov [48+r10], rcx
+    mov [72+r10], rcx
+    mov [200+r10], rcx
+    mov [24+r10], rcx
+    mov [152+r10], rcx
+    mov [16+r10], rcx
+    mov [248+r10], rcx
+    mov [56+r10], rcx
+    mov [240+r10], rcx
+    mov [208+r10], rcx
+    mov [104+r10], rcx
+    mov [216+r10], rcx
+    mov [136+r10], rcx
+    mov [232+r10], rcx
+    mov [64+r10], rcx
+    mov [224+r10], rcx
+    mov [144+r10], rcx
+    mov [192+r10], rcx
+    mov [8+r10], rcx
+    mov [168+r10], rcx
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2
+; Purpose: Writes 128-bit values randomly to an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;         rcx = datum to write
+;------------------------------------------------------------------------------
+    align 64
+RandomWriterSSE2:
+_RandomWriterSSE2:
+    push r10
+    push r11
+
+    ; Build a 128-bit value holding rcx in both quadwords. pslldq counts
+    ; BYTES, so the low quadword is moved to the high half with a shift of 8;
+    ; any count above 15 would clear the register and leave the high half 0.
+    movq xmm0, rcx ; Create duplicated 128-bit datum
+    movq xmm1, rcx
+    pslldq xmm1, 8
+    por xmm0, xmm1
+
+.L1:
+    xor r11, r11 ; r11 = index into the chunk-pointer array
+
+.L2:
+    mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+    movdqa [240+r10], xmm0
+    movdqa [128+r10], xmm0
+    movdqa [208+r10], xmm0
+    movdqa [112+r10], xmm0
+    movdqa [64+r10], xmm0
+    movdqa [176+r10], xmm0
+    movdqa [144+r10], xmm0
+    movdqa [r10], xmm0
+    movdqa [96+r10], xmm0
+    movdqa [16+r10], xmm0
+    movdqa [192+r10], xmm0
+    movdqa [160+r10], xmm0
+    movdqa [32+r10], xmm0
+    movdqa [48+r10], xmm0
+    movdqa [224+r10], xmm0
+    movdqa [80+r10], xmm0
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2_bypass
+; Purpose: Writes 128-bit values randomly into memory, bypassing caches.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;         rcx = datum to write
+;------------------------------------------------------------------------------
+    align 64
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+    push r10
+    push r11
+
+    ; Build a 128-bit value holding rcx in both quadwords. pslldq counts
+    ; BYTES, so the low quadword is moved to the high half with a shift of 8;
+    ; any count above 15 would clear the register and leave the high half 0.
+    movq xmm0, rcx ; Create duplicated 128-bit datum
+    movq xmm1, rcx
+    pslldq xmm1, 8
+    por xmm0, xmm1
+
+.L1:
+    xor r11, r11 ; r11 = index into the chunk-pointer array
+
+.L2:
+    mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+    ; 16 non-temporal 16-byte stores cover the 256-byte chunk in shuffled order.
+    movntdq [240+r10], xmm0
+    movntdq [128+r10], xmm0
+    movntdq [208+r10], xmm0
+    movntdq [112+r10], xmm0
+    movntdq [64+r10], xmm0
+    movntdq [176+r10], xmm0
+    movntdq [144+r10], xmm0
+    movntdq [r10], xmm0
+    movntdq [96+r10], xmm0
+    movntdq [16+r10], xmm0
+    movntdq [192+r10], xmm0
+    movntdq [160+r10], xmm0
+    movntdq [32+r10], xmm0
+    movntdq [48+r10], xmm0
+    movntdq [224+r10], xmm0
+    movntdq [80+r10], xmm0
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2_128bytes
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2_128bytes: +_ReaderSSE2_128bytes: + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + movdqa xmm0, [r10] ; Read aligned to 16-byte boundary. + movdqa xmm0, [16+r10] + movdqa xmm0, [32+r10] + movdqa xmm0, [48+r10] + movdqa xmm0, [64+r10] + movdqa xmm0, [80+r10] + movdqa xmm0, [96+r10] + movdqa xmm0, [112+r10] + + add r10, 128 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2 +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2: +_ReaderSSE2: + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + movdqa xmm0, [r10] ; Read aligned to 16-byte boundary. + movdqa xmm0, [16+r10] + movdqa xmm0, [32+r10] + movdqa xmm0, [48+r10] + movdqa xmm0, [64+r10] + movdqa xmm0, [80+r10] + movdqa xmm0, [96+r10] + movdqa xmm0, [112+r10] + + movdqa xmm0, [128+r10] + movdqa xmm0, [144+r10] + movdqa xmm0, [160+r10] + movdqa xmm0, [176+r10] + movdqa xmm0, [192+r10] + movdqa xmm0, [208+r10] + movdqa xmm0, [224+r10] + movdqa xmm0, [240+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + + +;------------------------------------------------------------------------------ +; Name: ReaderAVX +; Purpose: Reads 256-bit values sequentially from an area of memory. 
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +ReaderAVX: +_ReaderAVX: + vzeroupper + + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + vmovdqa ymm0, [r10] ; Read aligned to 32-byte boundary. + vmovdqa ymm0, [32+r10] + vmovdqa ymm0, [64+r10] + vmovdqa ymm0, [96+r10] + vmovdqa ymm0, [128+r10] + vmovdqa ymm0, [160+r10] + vmovdqa ymm0, [192+r10] + vmovdqa ymm0, [224+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2_bypass +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2_bypass: +_ReaderSSE2_bypass: + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + movntdqa xmm0, [r10] ; Read aligned to 16-byte boundary. + movntdqa xmm0, [16+r10] + movntdqa xmm0, [32+r10] + movntdqa xmm0, [48+r10] + movntdqa xmm0, [64+r10] + movntdqa xmm0, [80+r10] + movntdqa xmm0, [96+r10] + movntdqa xmm0, [112+r10] + + movntdqa xmm0, [128+r10] + movntdqa xmm0, [144+r10] + movntdqa xmm0, [160+r10] + movntdqa xmm0, [176+r10] + movntdqa xmm0, [192+r10] + movntdqa xmm0, [208+r10] + movntdqa xmm0, [224+r10] + movntdqa xmm0, [240+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2_128bytes_bypass +; Purpose: Reads 128-bit values sequentially from an area of memory. 
+; Params:	rdi = ptr to memory area (16-byte aligned)
+;		rsi = length in bytes (whole 128-byte chunks are read)
+;		rdx = loops (passes over the area, must be >= 1)
+; Notes:	Uses MOVNTDQA (SSE4.1, load with non-temporal hint); every
+;		load lands in xmm0 and is discarded.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_128bytes_bypass:
+_ReaderSSE2_128bytes_bypass:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 8					; 8 x 16 bytes = one 128-byte chunk.
+	movntdqa xmm0, [r10+bw_off]
+%assign bw_off bw_off+16
+%endrep
+%undef bw_off
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Writer
+; Purpose:	Stores one 64-bit value sequentially across an area of
+;		memory, in 256-byte chunks.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes (whole 256-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+;------------------------------------------------------------------------------
+	align 64
+Writer:
+_Writer:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 32					; 32 x 8 bytes = one 256-byte chunk.
+	mov	[r10+bw_off], rcx
+%assign bw_off bw_off+8
+%endrep
+%undef bw_off
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Writer_128bytes
+; Purpose:	Stores one 64-bit value sequentially across an area of
+;		memory, in 128-byte chunks.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes (whole 128-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+;------------------------------------------------------------------------------
+	align 64
+Writer_128bytes:
+_Writer_128bytes:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 16					; 16 x 8 bytes = one 128-byte chunk.
+	mov	[r10+bw_off], rcx
+%assign bw_off bw_off+8
+%endrep
+%undef bw_off
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Writes 128-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area (16-byte aligned, as MOVDQA requires)
+;		rsi = length in bytes (whole 256-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+; Notes:	MOVQ places rcx in the low half of xmm0 and zeroes the high
+;		half, so each 16-byte store is the quad followed by 8 zero
+;		bytes.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2:
+_WriterSSE2:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+	movq	xmm0, rcx		; Fill value for every store.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 16					; 16 x 16 bytes = one 256-byte chunk.
+	movdqa	[r10+bw_off], xmm0
+%assign bw_off bw_off+16
+%endrep
+%undef bw_off
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX
+; Purpose:	Stores a 256-bit value sequentially across an area of
+;		memory, in 256-byte chunks.
+; Params:	rdi = ptr to memory area (32-byte aligned, as VMOVDQA ymm requires)
+;		rsi = length in bytes (whole 256-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+; Notes:	The two PINSRQ fill both quads of xmm0 with rcx. The upper
+;		128 bits of ymm0 stay as VZEROUPPER left them (zero).
+;------------------------------------------------------------------------------
+	align 64
+WriterAVX:
+_WriterAVX:
+	vzeroupper
+
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+	pinsrq	xmm0, rcx, 0
+	pinsrq	xmm0, rcx, 1
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 8					; 8 x 32 bytes = one 256-byte chunk.
+	vmovdqa	[r10+bw_off], ymm0
+%assign bw_off bw_off+32
+%endrep
+%undef bw_off
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes
+; Purpose:	Writes 128-bit value sequentially to an area of memory,
+;		chunks are 128 bytes rather than 256.
+; Params:	rdi = ptr to memory area (16-byte aligned, as MOVDQA requires)
+;		rsi = length in bytes (whole 128-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+; Notes:	MOVQ places rcx in the low half of xmm0 and zeroes the high
+;		half.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes:
+_WriterSSE2_128bytes:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+	movq	xmm0, rcx		; Fill value for every store.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 8					; 8 x 16 bytes = one 128-byte chunk.
+	movdqa	[r10+bw_off], xmm0
+%assign bw_off bw_off+16
+%endrep
+%undef bw_off
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Stores a 128-bit value sequentially across an area of
+;		memory with MOVNTDQ (non-temporal store), in 256-byte chunks.
+; Params:	rdi = ptr to memory area (16-byte aligned)
+;		rsi = length in bytes (whole 256-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	r10
+
+	add	rsi, rdi		; rsi = first byte past the area.
+
+	movq	xmm0, rcx		; Fill value for every store.
+
+.pass:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.chunk:
+%assign bw_off 0
+%rep 16					; 16 x 16 bytes, written bypassing cache.
+	movntdq	[r10+bw_off], xmm0
+%assign bw_off bw_off+16
+%endrep
+%undef bw_off
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.chunk
+
+	dec	rdx
+	jnz	.pass
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX_bypass
+; Purpose:	Writes 256-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area (32-byte aligned, as VMOVNTDQ ymm requires)
+;		rsi = length in bytes (whole 256-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+; Notes:	Each store writes the full 32-byte ymm0 with a non-temporal
+;		hint, so the eight stores at a 32-byte stride cover the whole
+;		256-byte chunk. Storing xmm0 here would write only 16 of
+;		every 32 bytes.
+;------------------------------------------------------------------------------
+	align 64
+WriterAVX_bypass:
+_WriterAVX_bypass:
+	vzeroupper
+
+	push	r10
+
+	add	rsi, rdi		; rsi now points to end.
+
+	; Replicate the quad into all four 64-bit lanes of ymm0, using only
+	; VEX-encoded (AVX) instructions.
+	vmovq	xmm0, rcx
+	vpunpcklqdq xmm0, xmm0, xmm0
+	vinsertf128 ymm0, ymm0, xmm0, 1
+
+.L1:
+	mov	r10, rdi
+
+.L2:
+	vmovntdq [r10], ymm0		; Write bypassing cache.
+	vmovntdq [32+r10], ymm0
+	vmovntdq [64+r10], ymm0
+	vmovntdq [96+r10], ymm0
+	vmovntdq [128+r10], ymm0
+	vmovntdq [160+r10], ymm0
+	vmovntdq [192+r10], ymm0
+	vmovntdq [224+r10], ymm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2
+
+	dec	rdx
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes_bypass
+; Purpose:	Writes 128-bit value sequentially to an area of memory,
+;		bypassing the cache; chunks are 128 bytes rather than 256.
+; Params:	rdi = ptr to memory area (16-byte aligned)
+;		rsi = length in bytes (whole 128-byte chunks are written)
+;		rdx = loops (passes over the area, must be >= 1)
+;		rcx = quad to write
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes_bypass:
+_WriterSSE2_128bytes_bypass:
+	push	r10
+
+	add	rsi, rdi		; rsi now points to end.
+
+	movq	xmm0, rcx
+
+.L1:
+	mov	r10, rdi
+
+.L2:
+	movntdq	[r10], xmm0		; Write bypassing cache.
+	movntdq	[16+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[80+r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[112+r10], xmm0
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2
+
+	dec	rdx
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackReader
+; Purpose:	Reads 64-bit values off the stack into registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	Builds a 56-byte frame of seven quads, performs 32 loads
+;		from it into rax per loop, then releases the frame.
+;------------------------------------------------------------------------------
+	align 64
+StackReader:
+_StackReader:
+	push	qword 7000		; ends up at [rsp+48]
+	push	qword 6000		; ends up at [rsp+40]
+	push	qword 5000		; ends up at [rsp+32]
+	push	qword 4000		; ends up at [rsp+24]
+	push	qword 3000		; ends up at [rsp+16]
+	push	qword 2000		; ends up at [rsp+8]
+	push	qword 1000		; ends up at [rsp]
+
+.again:
+%rep 2					; The first two groups of 8 are identical.
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp]
+%endrep
+
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+
+	sub	rdi, 1
+	jnz	.again
+
+	add	rsp, 56			; Drop the seven pushed quads.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackWriter
+; Purpose:	Writes 64-bit values into the stack from registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	Builds a 56-byte frame of seven quads, performs 32 stores
+;		of rax (zero) into it per loop, then releases the frame.
+;------------------------------------------------------------------------------
+	align 64
+StackWriter:
+_StackWriter:
+	push	qword 7000		; ends up at [rsp+48]
+	push	qword 6000		; ends up at [rsp+40]
+	push	qword 5000		; ends up at [rsp+32]
+	push	qword 4000		; ends up at [rsp+24]
+	push	qword 3000		; ends up at [rsp+16]
+	push	qword 2000		; ends up at [rsp+8]
+	push	qword 1000		; ends up at [rsp]
+
+	xor	rax, rax		; Value stored by every write.
+
+.again:
+%rep 2					; The first two groups of 8 are identical.
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp], rax
+%endrep
+
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+
+	mov	[rsp+8], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+
+	sub	rdi, 1
+	jnz	.again
+
+	add	rsp, 56			; Drop the seven pushed quads.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 64-bit values between registers of
+;		the main register set.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	32 moves x 8 bytes = 256 bytes per loop. The source
+;		registers are only read; rax is the sole destination.
+;------------------------------------------------------------------------------
+	align 64
+RegisterToRegister:
+_RegisterToRegister:
+.again:
+%rep 4					; Four identical groups of 8 moves.
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+%endrep
+
+	sub	rdi, 1
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of
+;		the vector register set, in this case XMM.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	MOVDQA copies the whole 16-byte register. MOVQ between XMM
+;		registers moves only the low 8 bytes, which would make the
+;		256-bytes-per-loop accounting below wrong by a factor of two.
+;------------------------------------------------------------------------------
+	align 64
+VectorToVector:
+_VectorToVector:
+.again:
+	movdqa	xmm0, xmm1		; Each move moves 16 bytes, so we need 16
+	movdqa	xmm0, xmm2		; moves to transfer a 256 byte chunk.
+	movdqa	xmm0, xmm3
+	movdqa	xmm2, xmm0
+	movdqa	xmm1, xmm2
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm1
+
+	movdqa	xmm3, xmm2
+	movdqa	xmm1, xmm3
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm1
+	movdqa	xmm1, xmm2
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm0
+
+	sub	rdi, 1
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVectorAVX
+; Purpose:	Reads/writes 256-bit values between registers of
+;		the vector register set, in this case YMM.
+; Params:	rdi = loops (must be >= 1)
+;------------------------------------------------------------------------------
+	align 64
+VectorToVectorAVX:
+_VectorToVectorAVX:
+	vzeroupper
+
+.again:
+	vmovdqa	ymm0, ymm1		; Each move moves 32 bytes, so we need 8
+	vmovdqa	ymm0, ymm2		; moves to transfer a 256 byte chunk.
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm2, ymm0
+	vmovdqa	ymm1, ymm2
+	vmovdqa	ymm2, ymm1
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm3, ymm1
+
+	sub	rdi, 1
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		clearing the upper unused bits.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	Each movq transfers 8 bytes, so 32 transfers move a
+;		256-byte chunk.
+;------------------------------------------------------------------------------
+	align 64
+RegisterToVector:
+_RegisterToVector:
+.again:
+	movq	xmm1, rax
+	movq	xmm2, rsi
+	movq	xmm3, rbx
+	movq	xmm1, rcx
+	movq	xmm2, rsi
+	movq	xmm3, rsp
+	movq	xmm0, rdi
+	movq	xmm0, rdx
+
+%rep 3					; Three identical groups of 8 follow.
+	movq	xmm0, rax
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToRegister
+; Purpose:	Writes lower 64 bits of vector register into 64-bit main
+;		register.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	32 transfers x 8 bytes = 256 bytes per loop; rax is the
+;		only destination.
+;------------------------------------------------------------------------------
+	align 64
+VectorToRegister:
+_VectorToRegister:
+.again:
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm0
+
+%rep 3					; Three identical groups of 8 follow.
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register8ToVector
+; Purpose:	Inserts 8-bit main register values into 128-bit vector
+;		registers with PINSRB, leaving the other bytes untouched.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 64 transfers x 1 byte = 64 bytes, so the loop
+;		count is multiplied by 4.
+;------------------------------------------------------------------------------
+	align 64
+Register8ToVector:
+_Register8ToVector:
+	sal	rdi, 2			; Force some repetition.
+.again:
+	pinsrb	xmm1, al, 0
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 0
+	pinsrb	xmm1, bl, 1
+	pinsrb	xmm2, cl, 2
+	pinsrb	xmm3, dl, 3
+	pinsrb	xmm3, al, 4
+	pinsrb	xmm2, bl, 5
+	pinsrb	xmm1, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm1, r8b, 0
+	pinsrb	xmm2, r9b, 1
+	pinsrb	xmm3, r10b, 2
+	pinsrb	xmm1, r11b, 3
+	pinsrb	xmm2, r12b, 4
+	pinsrb	xmm3, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 0
+	pinsrb	xmm0, r9b, 1
+	pinsrb	xmm0, r10b, 2
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm1, al, 0
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 10
+	pinsrb	xmm1, bl, 11
+	pinsrb	xmm2, cl, 12
+	pinsrb	xmm3, dl, 13
+	pinsrb	xmm3, dil, 14
+	pinsrb	xmm2, cl, 15
+	pinsrb	xmm1, al, 6
+	pinsrb	xmm0, bpl, 7
+
+	pinsrb	xmm1, r8b, 10
+	pinsrb	xmm2, r9b, 11
+	pinsrb	xmm3, r10b, 12
+	pinsrb	xmm1, r11b, 13
+	pinsrb	xmm2, r12b, 14
+	pinsrb	xmm3, al, 15
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 9
+	pinsrb	xmm0, r9b, 8
+	pinsrb	xmm0, r10b, 11
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register16ToVector
+; Purpose:	Inserts 16-bit main register values into 128-bit vector
+;		registers with PINSRW, leaving the other words untouched.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 64 transfers x 2 bytes = 128 bytes, so the loop
+;		count is doubled.
+;------------------------------------------------------------------------------
+	align 64
+Register16ToVector:
+_Register16ToVector:
+	sal	rdi, 1			; Force some repetition.
+.again:
+%rep 2					; The same 32 transfers, twice.
+	pinsrw	xmm1, ax, 0
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm1, r8w, 0
+	pinsrw	xmm2, r9w, 1
+	pinsrw	xmm3, r10w, 2
+	pinsrw	xmm1, r11w, 3
+	pinsrw	xmm2, r12w, 4
+	pinsrw	xmm3, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm0, r8w, 0
+	pinsrw	xmm0, r9w, 1
+	pinsrw	xmm0, r10w, 2
+	pinsrw	xmm0, r11w, 3
+	pinsrw	xmm0, r12w, 4
+	pinsrw	xmm0, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register32ToVector
+; Purpose:	Inserts 32-bit main register values into 128-bit vector
+;		registers with PINSRD, leaving the other dwords untouched.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	Each transfer moves 4 bytes, so 64 transfers move 256 bytes.
+;------------------------------------------------------------------------------
+	align 64
+Register32ToVector:
+_Register32ToVector:
+.again:
+%rep 2					; The same 32 transfers, twice.
+	pinsrd	xmm1, eax, 0
+	pinsrd	xmm2, ebx, 1
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	pinsrd	xmm1, r8d, 0
+	pinsrd	xmm2, r9d, 1
+	pinsrd	xmm3, r10d, 2
+	pinsrd	xmm1, r11d, 3
+	pinsrd	xmm2, r12d, 0
+	pinsrd	xmm3, eax, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, ebx, 3
+
+	pinsrd	xmm0, r8d, 0
+	pinsrd	xmm0, r9d, 1
+	pinsrd	xmm0, r10d, 2
+	pinsrd	xmm0, r11d, 3
+	pinsrd	xmm0, r12d, 0
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm0, ebp, 0
+	pinsrd	xmm0, ebx, 0
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register64ToVector
+; Purpose:	Inserts 64-bit main register values into 128-bit vector
+;		registers with PINSRQ, leaving the other quad untouched.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	Each transfer moves 8 bytes, so the 32 transfers of one pass
+;		move 256 bytes. The loop count is doubled on entry.
+;------------------------------------------------------------------------------
+	align 64
+Register64ToVector:
+_Register64ToVector:
+	add	rdi, rdi
+.again:
+	pinsrq	xmm1, r8, 0
+	pinsrq	xmm2, r9, 1
+	pinsrq	xmm3, r10, 0
+	pinsrq	xmm1, r11, 1
+	pinsrq	xmm2, r12, 0
+	pinsrq	xmm3, rax, 1
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 1
+
+%rep 3					; Three identical groups of 8 follow.
+	pinsrq	xmm0, r8, 0
+	pinsrq	xmm0, r9, 1
+	pinsrq	xmm0, r10, 1
+	pinsrq	xmm0, r11, 1
+	pinsrq	xmm0, r12, 0
+	pinsrq	xmm0, rax, 0
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 0
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Vector8ToRegister
+; Purpose:	Extracts 8-bit vector register values into a main register
+;		with PEXTRB.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 64 transfers x 1 byte = 64 bytes; the loop count
+;		is multiplied by 8.
+;------------------------------------------------------------------------------
+	align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+	sal	rdi, 3			; Force some repetition.
+.again:
+%rep 2					; The same 32 transfers, twice.
+	pextrb	eax, xmm1, 0
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm1, 1
+	pextrb	eax, xmm2, 2
+	pextrb	eax, xmm3, 3
+	pextrb	eax, xmm3, 4
+	pextrb	eax, xmm2, 5
+	pextrb	eax, xmm1, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm1, 0
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm0, 1
+	pextrb	eax, xmm0, 2
+	pextrb	eax, xmm0, 3
+	pextrb	eax, xmm0, 4
+	pextrb	eax, xmm0, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector16ToRegister
+; Purpose:	Extracts 16-bit vector register values into a main register
+;		with PEXTRW.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 64 transfers x 2 bytes = 128 bytes; the loop
+;		count is multiplied by 4.
+;------------------------------------------------------------------------------
+	align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+	sal	rdi, 2			; Force some repetition.
+.again:
+%rep 2					; The same 32 transfers, twice.
+	pextrw	eax, xmm1, 0
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm1, 1
+	pextrw	eax, xmm2, 2
+	pextrw	eax, xmm3, 3
+	pextrw	eax, xmm3, 4
+	pextrw	eax, xmm2, 5
+	pextrw	eax, xmm1, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm1, 0
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm0, 1
+	pextrw	eax, xmm0, 2
+	pextrw	eax, xmm0, 3
+	pextrw	eax, xmm0, 4
+	pextrw	eax, xmm0, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector32ToRegister
+; Purpose:	Writes 32-bit vector register values into main register.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 64 PEXTRD transfers x 4 bytes = 256 bytes; the
+;		loop count is doubled on entry. eax is the only destination.
+;------------------------------------------------------------------------------
+	align 64
+Vector32ToRegister:
+_Vector32ToRegister:
+	add	rdi, rdi
+.again:
+	pextrd	eax, xmm1, 0
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm0, 0
+
+	pextrd	eax, xmm1, 0
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+
+	pextrd	eax, xmm1, 0
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm0, 0
+
+	pextrd	eax, xmm1, 0
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector64ToRegister
+; Purpose:	Writes 64-bit vector register values into main register.
+; Params:	rdi = loops (must be >= 1)
+; Notes:	One pass is 32 PEXTRQ transfers x 8 bytes = 256 bytes; the
+;		loop count is doubled on entry. rax is the only destination.
+;------------------------------------------------------------------------------
+	align 64
+Vector64ToRegister:
+_Vector64ToRegister:
+	add	rdi, rdi
+.again:
+%rep 2					; The same 16 transfers, twice.
+	pextrq	rax, xmm1, 0
+	pextrq	rax, xmm2, 1
+	pextrq	rax, xmm3, 0
+	pextrq	rax, xmm1, 1
+	pextrq	rax, xmm2, 0
+	pextrq	rax, xmm3, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+%endrep
+
+	dec	rdi
+	jnz	.again
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		CopyAVX
+; Purpose:	Copies memory in 256-byte chunks through ymm0-ymm3; both
+;		areas must be 32-byte aligned.
+; Params:	rdi = ptr to destination memory area
+;		rsi = ptr to source memory area
+;		rdx = length in bytes (rounded down to a multiple of 256)
+;		rcx = loops (must be >= 1)
+; Notes:	A length below 256 rounds to zero, which the inner loop does
+;		not guard against.
+;------------------------------------------------------------------------------
+	align 64
+CopyAVX:
+_CopyAVX:
+	vzeroupper
+
+	push	r10
+
+	shr	rdx, 8			; Ensure length is multiple of 256.
+	shl	rdx, 8
+
+	prefetcht0 [rsi]
+
+.pass:
+	mov	r10, rdx		; r10 = bytes left in this pass.
+
+.chunk:
+%assign bw_base 0
+%rep 2					; Two 128-byte halves per chunk.
+	vmovdqa	ymm0, [rsi+bw_base]
+	vmovdqa	ymm1, [rsi+bw_base+32]
+	vmovdqa	ymm2, [rsi+bw_base+64]
+	vmovdqa	ymm3, [rsi+bw_base+96]
+
+	vmovdqa	[rdi+bw_base], ymm0
+	vmovdqa	[rdi+bw_base+32], ymm1
+	vmovdqa	[rdi+bw_base+64], ymm2
+	vmovdqa	[rdi+bw_base+96], ymm3
+%assign bw_base bw_base+128
+%endrep
+%undef bw_base
+
+	add	rsi, 256
+	add	rdi, 256
+
+	sub	r10, 256
+	jnz	.chunk
+
+	sub	rsi, rdx		; Back to the start of the source.
+	sub	rdi, rdx		; Back to the start of the destination.
+
+	dec	rcx
+	jnz	.pass
+
+	pop	r10
+
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		CopySSE
+; Purpose:	Copies memory in 256-byte chunks through xmm0-xmm15; both
+;		areas must be 16-byte aligned.
+; Params:	rdi = ptr to destination memory area
+;		rsi = ptr to source memory area
+;		rdx = length in bytes (rounded down to a multiple of 256)
+;		rcx = loops (must be >= 1)
+; Notes:	xmm4-xmm15 are saved on the stack on entry and restored
+;		before returning. A length below 256 rounds to zero, which
+;		the inner loop does not guard against.
+;------------------------------------------------------------------------------
+	align 64
+CopySSE:
+_CopySSE:
+	push	r10
+
+	shr	rdx, 8			; Ensure length is multiple of 256.
+	shl	rdx, 8
+
+	prefetcht0 [rsi]
+
+	; Preserve the XMM registers that do not carry parameters.
+	sub	rsp, 192
+	movdqu	[rsp], xmm4
+	movdqu	[rsp+16], xmm5
+	movdqu	[rsp+32], xmm6
+	movdqu	[rsp+48], xmm7
+	movdqu	[rsp+64], xmm8
+	movdqu	[rsp+80], xmm9
+	movdqu	[rsp+96], xmm10
+	movdqu	[rsp+112], xmm11
+	movdqu	[rsp+128], xmm12
+	movdqu	[rsp+144], xmm13
+	movdqu	[rsp+160], xmm14
+	movdqu	[rsp+176], xmm15
+
+.pass:
+	mov	r10, rdx		; r10 = bytes left in this pass.
+
+.chunk:
+	movdqa	xmm0, [rsi]
+	movdqa	xmm1, [rsi+16]
+	movdqa	xmm2, [rsi+32]
+	movdqa	xmm3, [rsi+48]
+
+	movdqa	[rdi], xmm0
+	movdqa	[rdi+16], xmm1
+	movdqa	[rdi+32], xmm2
+	movdqa	[rdi+48], xmm3
+
+	movdqa	xmm4, [rsi+64]
+	movdqa	xmm5, [rsi+80]
+	movdqa	xmm6, [rsi+96]
+	movdqa	xmm7, [rsi+112]
+
+	movdqa	[rdi+64], xmm4
+	movdqa	[rdi+80], xmm5
+	movdqa	[rdi+96], xmm6
+	movdqa	[rdi+112], xmm7
+
+	movdqa	xmm8, [rsi+128]
+	movdqa	xmm9, [rsi+144]
+	movdqa	xmm10, [rsi+160]
+	movdqa	xmm11, [rsi+176]
+
+	movdqa	[rdi+128], xmm8
+	movdqa	[rdi+144], xmm9
+	movdqa	[rdi+160], xmm10
+	movdqa	[rdi+176], xmm11
+
+	movdqa	xmm12, [rsi+192]
+	movdqa	xmm13, [rsi+208]
+	movdqa	xmm14, [rsi+224]
+	movdqa	xmm15, [rsi+240]
+
+	movdqa	[rdi+192], xmm12
+	movdqa	[rdi+208], xmm13
+	movdqa	[rdi+224], xmm14
+	movdqa	[rdi+240], xmm15
+
+	add	rsi, 256
+	add	rdi, 256
+
+	sub	r10, 256
+	jnz	.chunk
+
+	sub	rsi, rdx		; Back to the start of the source.
+	sub	rdi, rdx		; Back to the start of the destination.
+
+	dec	rcx
+	jnz	.pass
+
+	; Put the preserved XMM registers back.
+	movdqu	xmm4, [rsp]
+	movdqu	xmm5, [rsp+16]
+	movdqu	xmm6, [rsp+32]
+	movdqu	xmm7, [rsp+48]
+	movdqu	xmm8, [rsp+64]
+	movdqu	xmm9, [rsp+80]
+	movdqu	xmm10, [rsp+96]
+	movdqu	xmm11, [rsp+112]
+	movdqu	xmm12, [rsp+128]
+	movdqu	xmm13, [rsp+144]
+	movdqu	xmm14, [rsp+160]
+	movdqu	xmm15, [rsp+176]
+	add	rsp, 192
+
+	pop	r10
+
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		CopySSE_128bytes
+; Purpose:	Copies memory chunks that are 16-byte aligned.
+; Params:	rdi = ptr to destination memory area
+;		rsi = ptr to source memory area
+;		rdx = length in bytes (rounded down to a multiple of 128)
+;		rcx = loops (must be >= 1)
+; Notes:	Copies through xmm0-xmm7; xmm4-xmm7 are saved on the stack
+;		on entry and restored before returning. A length below 128
+;		rounds to zero, which the inner loop does not guard against.
+;------------------------------------------------------------------------------
+	align 64
+CopySSE_128bytes:
+_CopySSE_128bytes:
+	push	r10
+
+	shr	rdx, 7			; Ensure length is multiple of 128.
+	shl	rdx, 7
+
+	prefetcht0 [rsi]
+
+	; Preserve the XMM registers that do not carry parameters.
+	sub	rsp, 64
+	movdqu	[rsp], xmm4
+	movdqu	[rsp+16], xmm5
+	movdqu	[rsp+32], xmm6
+	movdqu	[rsp+48], xmm7
+
+.pass:
+	mov	r10, rdx		; r10 = bytes left in this pass.
+
+.chunk:
+	movdqa	xmm0, [rsi]
+	movdqa	xmm1, [rsi+16]
+	movdqa	xmm2, [rsi+32]
+	movdqa	xmm3, [rsi+48]
+
+	movdqa	[rdi], xmm0
+	movdqa	[rdi+16], xmm1
+	movdqa	[rdi+32], xmm2
+	movdqa	[rdi+48], xmm3
+
+	movdqa	xmm4, [rsi+64]
+	movdqa	xmm5, [rsi+80]
+	movdqa	xmm6, [rsi+96]
+	movdqa	xmm7, [rsi+112]
+
+	movdqa	[rdi+64], xmm4
+	movdqa	[rdi+80], xmm5
+	movdqa	[rdi+96], xmm6
+	movdqa	[rdi+112], xmm7
+
+	add	rsi, 128
+	add	rdi, 128
+
+	sub	r10, 128
+	jnz	.chunk
+
+	sub	rsi, rdx		; Back to the start of the source.
+	sub	rdi, rdx		; Back to the start of the destination.
+
+	dec	rcx
+	jnz	.pass
+
+	; Put the preserved XMM registers back.
+	movdqu	xmm4, [rsp]
+	movdqu	xmm5, [rsp+16]
+	movdqu	xmm6, [rsp+32]
+	movdqu	xmm7, [rsp+48]
+	add	rsp, 64
+
+	pop	r10
+
+	ret
+
+
-- cgit v1.1