diff options
26 files changed, 12764 insertions, 0 deletions
@@ -0,0 +1,796 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +//-------------------------------------------------- +// Change Log +// 0.8 ZS Added larger font of my own design. +// 0.9 ZS Removed attempt at anti-aliasing. +//-------------------------------------------------- + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "BMP.h" +#include "font.h" +#include "minifont.h" + +// Narrowest possible numbers. 
+static char* narrow_nums [] = +{ + " # ", + "# #", + "# #", + "# #", + "# #", + "# #", + " # ", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + + " # ", + "# #", + " #", + " ##", + "# ", + "# ", + "###", + + "###", + " #", + " # ", + "## ", + " #", + "# #", + " # ", + + "# #", + "# #", + "# #", + "###", + " #", + " #", + " #", + + "###", + "# ", + "## ", + " #", + " #", + "# #", + " # ", + + + " # ", + "# ", + "# ", + "## ", + "# #", + "# #", + " # ", + + "###", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + + " # ", + "# #", + "# #", + " # ", + "# #", + "# #", + " # ", + + " # ", + "# #", + "# #", + " ##", + " #", + " # ", + "# ", + + " ", + "", + "", + " ", + "", + "", + "#", +}; + + +/*--------------------------------------------------------------------------- + * Name: BMP_new + * Purpose: Creates new image. + *-------------------------------------------------------------------------*/ +BMP* +BMP_new (int w, int h) +{ + unsigned long size; + BMP* nu; + if (w<1 || h<1) + return NULL; + //---------- + + if (w & 3) + w += 4 - (w & 3); + if (h & 3) + h += 4 - (h & 3); + + nu = (BMP*) malloc (sizeof (BMP)); + if (!nu) + return NULL; + memset (nu, 0, sizeof (BMP)); + nu->width = w; + nu->height = h; + size = w * h * sizeof (long); + nu->pixels = (RGB*) malloc (size); + if (!nu->pixels) { + free (nu); + return NULL; + } + memset (nu->pixels, 0, size); + return nu; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_destroy + * Purpose: Deallocates image. + *-------------------------------------------------------------------------*/ +void +BMP_destroy (BMP* bmp) +{ + if (!bmp) + return; + //---------- + + if (bmp->pixels) + free (bmp->pixels); + free (bmp); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_point + * Purpose: Writes pixel into image. 
+ *-------------------------------------------------------------------------*/ +void +BMP_point (BMP *bmp, int x, int y, RGB rgb) +{ + if (!bmp || x<0 || y<0) + return; + if (x >= bmp->width || y >= bmp->height) + return; + if (!bmp->pixels) + return; + //---------- + + bmp->pixels[y*bmp->width + x] = rgb; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line_core + * Purpose: Draws a line in a BMP image. + *-------------------------------------------------------------------------*/ +void +BMP_line_core (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb, + int dashed) +{ + if ((rgb >> 24) == 0xff) + return; + + int dot_counter = 0; + + if (!dashed && x0 == x1 && y0 == y1) + BMP_point (bmp, x0, y0, rgb); + else if (!dashed && x0 == x1) + BMP_vline (bmp, x0, y0, y1, rgb); + else if (!dashed && y0 == y1) + BMP_hline (bmp, x0, x1, y0, rgb); + else { + int j, x, y, dx, dy, e, xchange, s1, s2; + + // DDA, copied from my FramebufferUI project. + + x = x0; + y = y0; + s1 = 1; + s2 = 1; + + dx = x1 - x0; + if (dx < 0) { + dx = -dx; + s1 = -1; + } + + dy = y1 - y0; + if (dy < 0) { + dy = -dy; + s2 = -1; + } + + xchange = 0; + + if (dy > dx) { + int tmp = dx; + dx = dy; + dy = tmp; + xchange = 1; + } + + e = (dy<<1) - dx; + j = 0; + + while (j <= dx) { + j++; + + int draw = 1; + if (dashed && (1 & (dot_counter >> 2))) + draw = 0; + + if (draw) + BMP_point (bmp, x, y, rgb); + + dot_counter++; + + if (e >= 0) { + if (xchange) + x += s1; + else + y += s2; + e -= (dx << 1); + } + if (xchange) + y += s2; + else + x += s1; + e += (dy << 1); + } + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line + * Purpose: Draws a line in a BMP image. 
+ *-------------------------------------------------------------------------*/ +void +BMP_line (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb) +{ + BMP_line_core (bmp, x0, y0, x1, y1, rgb, 0); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_line_dashed + * Purpose: Draws a dashed line in a BMP image. + *-------------------------------------------------------------------------*/ +void +BMP_line_dashed (BMP *bmp, int x0, int y0, int x1, int y1, RGB rgb) +{ + BMP_line_core (bmp, x0, y0, x1, y1, rgb, 1); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_rect + * Purpose: Fills a rectangle with a color. + *-------------------------------------------------------------------------*/ +void +BMP_rect (BMP *bmp, int x, int y, int w, int h, RGB rgb) +{ + BMP_hline (bmp, x, x+w-1, y, rgb); + BMP_hline (bmp, x, x+w-1, y+h-1, rgb); + BMP_vline (bmp, x, y, y+h-1, rgb); + BMP_vline (bmp, x+w-1, y, y+h-1, rgb); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_fillrect + * Purpose: Fills a rectangle with a color. + *-------------------------------------------------------------------------*/ +void +BMP_fillrect (BMP *bmp, int x, int y, int w, int h, RGB rgb) +{ + while (h > 0) { + BMP_hline (bmp, x, x+w-1, y, rgb); + h--; + y++; + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_clear + * Purpose: Sets all pixels to specified color. + *-------------------------------------------------------------------------*/ +void +BMP_clear (BMP *bmp, RGB rgb) +{ + BMP_fillrect (bmp, 0, 0, bmp->width, bmp->height, rgb); +} + +/*--------------------------------------------------------------------------- + * Name: BMP_hline + * Purpose: Draws horizontal line. 
+ *-------------------------------------------------------------------------*/ +void +BMP_hline (BMP *bmp, int x0, int x1, int y, RGB rgb) +{ + if (x0 > x1) { + int tmp=x1; + x1=x0; + x0=tmp; + } + + while (x0 <= x1) { + BMP_point (bmp, x0++, y, rgb); + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_vline + * Purpose: Draws vertical line. + *-------------------------------------------------------------------------*/ +void +BMP_vline (BMP *bmp, int x, int y0, int y1, RGB rgb) +{ + if (y0 > y1) { + int tmp=y1; + y1=y0; + y0=tmp; + } + + while (y0 <= y1) { + BMP_point (bmp, x, y0++, rgb); + } +} + +/*--------------------------------------------------------------------------- + * Name: BMP_draw_string + * Purpose: Draws ature 5x8 characters into the image. + *-------------------------------------------------------------------------*/ +int +BMP_draw_string (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + RGB r,g,b; + RGB light, dark; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 3*0xff; + b += 3*0xff; + g += 3*0xff; + r /= 4; + g /= 4; + b /= 4; + light = b | (g << 8) | (r << 16); + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 0xff; + b += 0xff; + g += 0xff; + r /= 2; + g /= 2; + b /= 2; + dark = b | (g << 8) | (r << 16); + + const char **chars = get_font_chars (); + + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 10; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = FONT_HEIGHT * (ch - 33); + + if (ix >= 0) { + int i; + int width = 0; + + for (i=0; i<FONT_HEIGHT ; i++) { + int j=0; + char ch2; + const char *s2 = chars[ix + i]; + int width2 = s2 ? 
strlen (s2) : 0; + if (width < width2) + width = width2; + while ((ch2 = *s2++)) { + RGB color_to_use; + char draw = 1; + switch (ch2) { + case '#': + color_to_use = color; + break; + default: + draw = 0; + } + if (draw) + BMP_point (bmp,x+j, y+i, color_to_use); + j++; + } + } + + x += width + 2/* kerning */; + } + } + + return x; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_string_width + * Purpose: Gets width of 10x16 characters. + *-------------------------------------------------------------------------*/ +int +BMP_string_width (const char *string) +{ + char ch; + const char *s; + int width = 0; + + if (!string) + return 0; + //---------- + + const char **_chars = get_font_chars (); + + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + width += 10; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = FONT_HEIGHT * (ch - 33); + + if (ix >= 0) { + int j; + int max_w = 0; + for (j = 0; j < FONT_HEIGHT; j++) { + const char *ptr = _chars [j+ix]; + int w = ptr ? strlen (ptr) : 0; + if (max_w < w) max_w = w; + } + + width += max_w + 2/* kerning */; + } + } + + return width; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_draw_mini_string + * Purpose: Draws miniature 5x8 characters into the image. 
+ *-------------------------------------------------------------------------*/ +int +BMP_draw_mini_string (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + unsigned long r,g,b; + unsigned long light, dark; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 3*0xff; + b += 3*0xff; + g += 3*0xff; + r /= 4; + g /= 4; + b /= 4; + light = b | (g << 8) | (r << 16); + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 0xff; + b += 0xff; + g += 0xff; + r /= 2; + g /= 2; + b /= 2; + dark = b | (g << 8) | (r << 16); + + const char **mini_chars = get_minifont_chars (); + +#define MINI_HEIGHT (8) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 5; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int i; + + int width = 0; + for (i=0; i<MINI_HEIGHT; i++) { + int j=0; + char ch2; + const char *s2 = mini_chars[ix + i]; + int width2 = s2 ? strlen (s2) : 0; + if (width < width2) + width = width2; + while ((ch2 = *s2++)) { + RGB color_to_use; + char draw = 1; + switch (ch2) { + case '#': + color_to_use = color; + break; + default: + draw = 0; + } + if (draw) + BMP_point (bmp,x+j, y+i, color_to_use); + j++; + } + } + + x += width + 1/* kerning */; + } + } + + return x; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_mini_string_width + * Purpose: Gets width of miniature 5x8 characters. 
+ *-------------------------------------------------------------------------*/ +int +BMP_mini_string_width (const char *string) +{ + char ch; + const char *s; + int width = 0; + + if (!string) + return 0; + //---------- + + const char **mini_chars = get_minifont_chars (); + + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + width += 5; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int max_w = 0; + int j; + for (j = 0; j < MINI_HEIGHT; j++) { + const char *ptr = mini_chars [j+ix]; + int w = ptr ? strlen (ptr) : 0; + if (max_w < w) max_w = w; + } + + width += max_w + 1/*kerning*/; + } + } + + return width; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_narrow_numbers + * Purpose: Draws miniature 4x7 characters into the image. + *-------------------------------------------------------------------------*/ +int +BMP_draw_narrow_numbers (BMP *bmp, const char *string, int x, int y, RGB color) +{ + char ch; + const char *s; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + +#define NARROW_HEIGHT (7) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 3; + continue; + } + if (ch >= '0' && ch <= '9') + ix = ch - '0'; + else + if (ch == '.') + ix = 10; + + ix *= NARROW_HEIGHT; + + if (ix >= 0) { + int i; + int width = strlen (narrow_nums [ix]); + + for (i=0; i<NARROW_HEIGHT; i++) { + int j=0; + char ch2; + const char *s2 = narrow_nums [ix + i]; + while ((ch2 = *s2++)) { + if (ch2 == '#') { + BMP_point (bmp, + x+j, y+i, color); + } + j++; + } + } + + x += width + 1; + } + } + + return x; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_getpixel + * Purpose: Reads pixel out of image. 
+ *-------------------------------------------------------------------------*/ +RGB +BMP_getpixel (BMP *bmp, int x, int y) +{ + if (!bmp || x<0 || y<0) + return 0; + if (x >= bmp->width || y >= bmp->height) + return 0; + if (!bmp->pixels) + return 0; + //---------- + + return bmp->pixels[y*bmp->width + x]; +} + +/*--------------------------------------------------------------------------- + * Name: BMP_write + * Purpose: Writes image to BMP file. + *-------------------------------------------------------------------------*/ +int +BMP_write (const BMP* bmp, const char *path) +{ + FILE *f; +#define HDRLEN (54) + unsigned char h[HDRLEN]; + unsigned long len; + int i, j; + + if (!bmp || !path) + return -1; + //---------- + + memset (h, 0, HDRLEN); + + //-------------------- + // Create the file. + // + f = fopen (path, "wb"); + if (!f) + return 0; + + //-------------------- + // Prepare header + // + len = HDRLEN + 3 * bmp->width * bmp->height; + h[0] = 'B'; + h[1] = 'M'; + h[2] = len & 0xff; + h[3] = (len >> 8) & 0xff; + h[4] = (len >> 16) & 0xff; + h[5] = (len >> 24) & 0xff; + h[10] = HDRLEN; + h[14] = 40; + h[18] = bmp->width & 0xff; + h[19] = (bmp->width >> 8) & 0xff; + h[20] = (bmp->width >> 16) & 0xff; + h[22] = bmp->height & 0xff; + h[23] = (bmp->height >> 8) & 0xff; + h[24] = (bmp->height >> 16) & 0xff; + h[26] = 1; + h[28] = 24; + h[34] = 16; + h[36] = 0x13; // 2835 pixels/meter + h[37] = 0x0b; + h[42] = 0x13; // 2835 pixels/meter + h[43] = 0x0b; + + //-------------------- + // Write header. + // + if (HDRLEN != fwrite (h, 1, HDRLEN, f)) { + fclose (f); + return 0; + } + + //---------------------------------------- + // Write pixels. + // Note that BMP has lower rows first. 
+ // + for (j=bmp->height-1; j >= 0; j--) { + for (i=0; i < bmp->width; i++) { + unsigned char rgb[3]; + int ix = i + j * bmp->width; + unsigned long pixel = bmp->pixels[ix]; + rgb[0] = pixel & 0xff; + rgb[1] = (pixel >> 8) & 0xff; + rgb[2] = (pixel >> 16) & 0xff; + if (3 != fwrite (rgb, 1, 3, f)) { + fclose (f); + return 0; + } + } + } + + fclose (f); + return 1; +} + + @@ -0,0 +1,100 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
+ *============================================================================*/ + +#ifndef _BMP_H +#define _BMP_H + +#include <stdint.h> + +#define BMPLIB_RELEASE "0.9" +#define BMPLIB_RELEASE_MAJOR 0 +#define BMPLIB_RELEASE_MINOR 9 + +typedef uint32_t RGB; +typedef uint32_t RGBA; + +typedef struct { + int width, height; + RGB *pixels; +} BMP; + +#define FONT_HEIGHT (17) +#define MINIFONT_HEIGHT (8) + +extern BMP* BMP_new (int, int); +extern void BMP_destroy (BMP*); +extern void BMP_clear (BMP*, RGB); +extern int BMP_write (const BMP*, const char *path); +extern void BMP_point (BMP*, int, int, RGB); +extern void BMP_line (BMP *, int x0, int y0, int x1, int y1, RGB); +extern void BMP_line_dashed (BMP *, int x0, int y0, int x1, int y1, RGB); +extern void BMP_hline (BMP *, int x0, int x1, int y, RGB); +extern void BMP_vline (BMP *, int x, int y0, int y1, RGB); +extern void BMP_rect (BMP *, int x, int y, int w, int h, RGB); +extern void BMP_fillrect (BMP *, int x, int y, int w, int h, RGB); +extern RGB BMP_getpixel (BMP*, int, int); + +extern int BMP_draw_string (BMP *, const char *, int x, int y, RGB); +extern int BMP_string_width (const char *); + +extern int BMP_draw_mini_string (BMP *, const char *, int x, int y, RGB); +extern int BMP_mini_string_width (const char *); + +#define RGB_BLACK (0) +#define RGB_BLUE (0xff) +#define RGB_BRASS (0xc3a368) +#define RGB_BROWN (0x8b4513) +#define RGB_CADETBLUE (0x5f9ea0) +#define RGB_CHARTREUSE (0x7fff00) +#define RGB_CORAL (0xff7f50) +#define RGB_CYAN (0xffff) +#define RGB_DARKGREEN (0x6400) +#define RGB_DARKKHAKI (0xbdb76b) +#define RGB_DARKOLIVEGREEN (0x556b2f) +#define RGB_DARKORANGE (0xff8c00) +#define RGB_DODGERBLUE (0x1e90ff) +#define RGB_GOLDENROD (0xdaa520) +#define RGB_GRAY (0xc0c0c0) +#define RGB_GREEN (0xff00) +#define RGB_KHAKI (0xf0e68c) +#define RGB_LEMONYELLOW (0xfde910) +#define RGB_MAGENTA (0xff00ff) +#define RGB_MAROON (0x800000) +#define RGB_NAVYBLUE (0x80) +#define RGB_ORANGE (0xffa500) +#define RGB_PINK 
(0xf77fbe) +#define RGB_PURPLE (0xa020f0) +#define RGB_RED (0xff0000) +#define RGB_ROYALBLUE (0x4169e1) +#define RGB_SALMON (0xfa8072) +#define RGB_TURQUOISE (0x40e0d0) +#define RGB_VIOLET (0xee82ee) +#define RGB_WHITE (0xffffff) +#define RGB_YELLOW (0xffff00) + +#define RGB_GRAY6 (0x606060) +#define RGB_GRAY8 (0x808080) +#define RGB_GRAY10 (0xa0a0a0) +#define RGB_GRAY12 (0xc0c0c0) +#define RGB_GRAY14 (0xe0e0e0) + +#endif + diff --git a/BMPGraphing.c b/BMPGraphing.c new file mode 100755 index 0000000..61ae0d7 --- /dev/null +++ b/BMPGraphing.c @@ -0,0 +1,486 @@ +/*============================================================================ + BMPGraphing, a library for graphing. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> + +#include "BMP.h" +#include "BMPGraphing.h" + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_draw_labels_log2 +// Purpose: Draw the labels and ticks. 
+//---------------------------------------------------------------------------- +void +BMPGraphing_draw_labels_log2 (BMPGraph* graph) +{ + if (!graph || !graph->image) + return; + + //---------------------------------------- + // Horizontal + // + // Establish min & max x values. + // + int i = 0; + Value min_x = 0x4000000000000000; + Value max_x = 0; + for (i = 0; i < graph->data_index; i += 2) { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + if (type == DATUM_X) { + if (value < min_x) + min_x = value; + if (value > max_x) + max_x = value; + } + } + graph->min_x = (long long) log2 (min_x); + graph->max_x = (long long) ceil (log2 (max_x)); + + for (i = graph->min_x; i <= graph->max_x; i++) { + char str [200]; + int x = graph->left_margin + + ((i-graph->min_x) * graph->x_span) / + (graph->max_x - graph->min_x); + int y = graph->height - graph->margin + 10; + + unsigned long y2 = 1 << i; + if (y2 < 1536) + snprintf (str, 199, "%ld B", y2); + else if (y2 < (1<<20)) { + snprintf (str, 199, "%ld kB", y2 >> 10); + } + else { + Value j = y2 >> 20; + switch ((y2 >> 18) & 3) { + case 0: snprintf (str, 199, "%lld MB", j); break; + case 1: snprintf (str, 199, "%lld.25 MB", j); break; + case 2: snprintf (str, 199, "%lld.5 MB", j); break; + case 3: snprintf (str, 199, "%lld.75 MB", j); break; + } + } + + BMP_vline (graph->image, x, y, y - 10, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 10, y + 8, RGB_BLACK); + } + + //---------------------------------------- + // Vertical + // + // Establish min & max y values. 
+ // + Value min_y = 0x4000000000000000; + Value max_y = 0; + for (i = 0; i < graph->data_index; i += 2) { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + if (type == DATUM_Y) { + if (value < min_y) + min_y = value; + if (value > max_y) + max_y = value; + } + } + graph->min_y = min_y; + graph->max_y = max_y; + + int font_height = 10; + int available_height = graph->y_span; + int max_labels = available_height / font_height; + int preferred_n_labels = graph->max_y/10000; + int actual_n_labels; + float multiplier = 1; + if (preferred_n_labels < max_labels) { + actual_n_labels = preferred_n_labels; + } else { + actual_n_labels = max_labels; + multiplier = preferred_n_labels / (float) actual_n_labels; + } + + for (i = 0; i <= actual_n_labels; i++) { + char str [200]; + int x = graph->left_margin - 10; + int y = graph->height - graph->margin - (i * graph->y_span) / (float)actual_n_labels; + + BMP_hline (graph->image, x, x+10, y, RGB_BLACK); + + int value = (int) (i * multiplier); + snprintf (str, 199, "%d GB/s", value); + BMP_draw_mini_string (graph->image, str, x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } +} + +BMPGraph * +BMPGraphing_new (int w, int h, int x_axis_mode) +{ + if (x_axis_mode != MODE_X_AXIS_LINEAR && x_axis_mode != MODE_X_AXIS_LOG2) + return NULL; + + BMPGraph *graph = (BMPGraph*) malloc (sizeof(BMPGraph)); + if (!graph) + return NULL; + + bzero (graph, sizeof(BMPGraph)); + + graph->x_axis_mode = x_axis_mode; + + if (w <= 0 || h <= 0) { + w = 1920; + h = 1080; + } + + graph->width = w; + graph->height = h; + graph->image = BMP_new (w, h); + graph->margin = 40; + graph->left_margin = 80; + + BMP_clear (graph->image, RGB_WHITE); + + BMP_hline (graph->image, graph->left_margin, graph->width - graph->margin, graph->height - graph->margin, RGB_BLACK); + BMP_vline (graph->image, graph->left_margin, graph->margin, graph->height - graph->margin, RGB_BLACK); + + graph->x_span = graph->width - (graph->margin + graph->left_margin); + 
graph->y_span = graph->height - 2 * graph->margin; + + graph->legend_y = graph->margin; + + return graph; +} + +void BMPGraphing_set_title (BMPGraph* graph, const char *title) +{ + if (!graph || !title) + return; + + if (graph->title) + free (graph->title); + graph->title = strdup (title); + + BMP_draw_string (graph->image, graph->title, graph->left_margin, graph->margin/2, RGB_BLACK); +} + +void +BMPGraphing_new_line (BMPGraph *graph, char *str, RGB color) +{ + if (!graph || !graph->image) + return; + + BMP_draw_string (graph->image, str, graph->width - graph->margin - 320, graph->legend_y, 0xffffff & color); + + graph->legend_y += 17; + + graph->fg = 0; + graph->last_x = graph->last_y = -1; + + if (graph->data_index >= MAX_GRAPH_DATA-2) + return; // error ("Too many graph data."); + + graph->data [graph->data_index++] = DATUM_COLOR; + graph->data [graph->data_index++] = color; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_add_point +// Purpose: Adds a point to this list to be drawn. +//---------------------------------------------------------------------------- +void +BMPGraphing_add_point (BMPGraph *graph, Value x, Value y) +{ + if (!graph || !graph->image) + return; + + if (graph->data_index >= MAX_GRAPH_DATA-4) + return; // error ("Too many graph data."); + + graph->data [graph->data_index++] = DATUM_X; + graph->data [graph->data_index++] = x; + graph->data [graph->data_index++] = DATUM_Y; + graph->data [graph->data_index++] = y; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_plot_log2 +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- + +void +BMPGraphing_plot_log2 (BMPGraph *graph, Value x, Value y) +{ + if (!graph || !graph->image) + return; + + int i = 0; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. 
+ // + double tmp = log2 (x); + tmp -= (double) graph->min_x; + tmp *= (double) graph->x_span; + tmp /= (double) (graph->max_x - graph->min_x); + + int x2 = graph->left_margin + (int) tmp; + int y2 = graph->height - graph->margin - (y * graph->y_span) / graph->max_y; + + if (graph->last_x != -1 && graph->last_y != -1) { + if (graph->fg & DASHED) + BMP_line_dashed (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg & 0xffffff); + else + BMP_line (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg); + } + + graph->last_x = x2; + graph->last_y = y2; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_plot_linear +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- + +void +BMPGraphing_plot_linear (BMPGraph *graph, Value x, Value y, Value max_y) +{ + if (!graph || !graph->image) + return; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. The units of the + // y value is kB. + // + double tmp = 10. + log2 (x); + tmp -= (double) XVALUE_MIN; + tmp *= (double) graph->x_span; + tmp /= (double) (XVALUE_MAX - XVALUE_MIN); + int x2 = graph->left_margin + (int) tmp; + int y2 = graph->height - graph->margin - (y * graph->y_span) / max_y; + +//printf ("\tx=%d, y=%d\n",x,y); fflush(stdout); + + if (graph->last_x != -1 && graph->last_y != -1) { + if (graph->fg & DASHED) + BMP_line_dashed (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg & 0xffffff); + else + BMP_line (graph->image, graph->last_x, graph->last_y, x2, y2, graph->fg); + } + + graph->last_x = x2; + graph->last_y = y2; +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_make_log2 +// Purpose: Plots all lines. 
+//---------------------------------------------------------------------------- + +static void +BMPGraphing_make_log2 (BMPGraph *graph) +{ + if (!graph || !graph->image) + return; + + BMPGraphing_draw_labels_log2 (graph); + + //---------------------------------------- + // OK, now draw the lines. + // + int i; + int x = -1, y = -1; + for (i = 0; i < graph->data_index; i += 2) + { + Value type = graph->data[i]; + Value value = graph->data[i+1]; + + switch (type) { + case DATUM_Y: y = value; break; + case DATUM_X: x = value; break; + case DATUM_COLOR: + graph->fg = (unsigned long) value; + graph->last_x = -1; + graph->last_y = -1; + break; + } + + if (x != -1 && y != -1) { + BMPGraphing_plot_log2 (graph, x, y); + x = y = -1; + } + } +} + +//---------------------------------------------------------------------------- +// Name: BMPGraphing_make_linear +// Purpose: Plots all lines for the network test graph. +//---------------------------------------------------------------------------- + +static void +BMPGraphing_make_linear (BMPGraph *graph) +{ + if (!graph || !graph->image) + return; + + int i; + + // No data + if (!graph->data_index) + return; + + //---------------------------------------- + // Get the maximum bandwidth in order to + // properly scale the graph vertically. + // + int max_y = 0; + for (i = 0; i < graph->data_index; i += 2) { + if (graph->data[i] == DATUM_Y) { + int y = graph->data [i+1]; + if (y > max_y) + max_y = y; + } + } + + int range = max_y > 10000 ? 2 : (max_y > 1000 ? 1 : 0); + int y_spacing = 1; + switch (range) { + case 2: + // Round up to the next 100.00 MB/sec. (=10000). + y_spacing = 10000; + break; + case 1: + // Round up to the next 10.00 MB/sec. + y_spacing = 1000; + break; + case 0: + // Round up to the next 1.00 MB/sec. + y_spacing = 100; + break; + } + max_y /= y_spacing; + max_y *= y_spacing; + max_y += y_spacing; + + //---------------------------------------- + // Draw the axes, ticks & labels. 
+ // + // X axis: + if (XVALUE_MIN < 10) + return; // error ("Minimum y is too small."); + + for (i = XVALUE_MIN; i <= XVALUE_MAX; i++) { + char str[200]; + unsigned long y2 = 1 << (i-10); // XX XVALUE_MIN>=10 + if (y2 < 1024) + snprintf (str, 199, "%u kB", (unsigned int) y2); + else + snprintf (str, 199, "%lu MB", (unsigned long) (y2 >> 10)); + + int x = graph->left_margin + ((i - XVALUE_MIN) * graph->x_span) / (XVALUE_MAX - XVALUE_MIN); + int y = graph->height - graph->margin + 10; + + BMP_vline (graph->image, x, y, y-10, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 10, y+8, RGB_BLACK); + } + + //---------- + // Y axis: + // Decide what the tick spacing will be. + for (i = 0; i <= max_y; i += y_spacing) { + char str[200]; + unsigned long whole = i / 100; + unsigned long frac = i % 100; + snprintf (str, 199, "%lu.%02lu MB/s", whole, frac); + + int x = graph->left_margin - 10; + int y = graph->height - graph->margin - (i * graph->y_span) / max_y; + + BMP_hline (graph->image, x, x+10, y, RGB_BLACK); + BMP_draw_mini_string (graph->image, str, x - 60, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } + + //---------------------------------------- + // Draw the data lines. 
+ // + int x = -1, y = -1; + graph->last_x = -1; + graph->last_y = -1; + for (i = 0; i < graph->data_index; i += 2) + { + int type = graph->data[i]; + long value = graph->data[i+1]; + + switch (type) { + case DATUM_Y: y = value; break; + case DATUM_X: x = value; break; + case DATUM_COLOR: + graph->fg = (unsigned long) value; + graph->last_x = -1; + graph->last_y = -1; + break; + } + + if (x != -1 && y != -1) { + BMPGraphing_plot_linear (graph, x, y, max_y); + x = y = -1; + } + } +} + +void +BMPGraphing_make (BMPGraph *graph) +{ + if (!graph) + return; // XX silent error + + switch (graph->x_axis_mode) { + case MODE_X_AXIS_LOG2: + BMPGraphing_make_log2 (graph); + break; + case MODE_X_AXIS_LINEAR: + BMPGraphing_make_linear (graph); + break; + default: + fprintf (stderr, "Invalid graph mode %d.\n", graph->x_axis_mode); + break; + } +} + +void +BMPGraphing_destroy (BMPGraph *graph) +{ + if (!graph) + return; + + if (graph->title) { + free (graph->title); + graph->title = NULL; + } + if (graph->image) { + BMP_destroy (graph->image); + graph->image = NULL; + } + + free (graph); +} diff --git a/BMPGraphing.h b/BMPGraphing.h new file mode 100755 index 0000000..4f13972 --- /dev/null +++ b/BMPGraphing.h @@ -0,0 +1,88 @@ +/*============================================================================ + BMPGraphing, a library for graphing. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#ifndef _SUPERSIMPLEGRAPHING_H +#define _SUPERSIMPLEGRAPHING_H + +#include <stdbool.h> +/* NOTE: BMP and RGB are used below, but this header includes nothing that declares them. */ +#define SSG_RELEASE "0.2" +/* X-axis tick range, as exponents: the X-axis labeling code prints tick i as 2^(i-10) kB. */ +#define XVALUE_MIN (15) /* labeled 32 kB */ +#define XVALUE_MAX (28) /* labeled 256 MB */ +/* Tags for entries in BMPGraph.data; each tag is stored immediately before its value. */ +enum { + DATUM_X=0, + DATUM_Y=1, + DATUM_COLOR=2, +}; + +typedef long Coordinate; +typedef long long Value; +/* Values for BMPGraph.x_axis_mode; BMPGraphing_make picks the builder from it. */ +enum { + MODE_X_AXIS_LINEAR = 0, + MODE_X_AXIS_LOG2 = 1, +}; + +//--------------- +// Graphing data. +// +typedef struct { + BMP *image; /* released with BMP_destroy in BMPGraphing_destroy */ + char *title; /* released with free() in BMPGraphing_destroy */ + + unsigned char x_axis_mode; /* MODE_X_AXIS_LINEAR or MODE_X_AXIS_LOG2 */ + + Coordinate width; + Coordinate height; + Coordinate left_margin; + Coordinate margin; + Coordinate last_x; /* reset to -1 before plotting and on each DATUM_COLOR */ + Coordinate last_y; /* reset together with last_x */ + Coordinate x_span; /* X ticks are spread over left_margin .. left_margin + x_span */ + Coordinate y_span; /* Y ticks are spread upward from height - margin over y_span */ + Coordinate legend_y; + + RGB fg; /* assigned from DATUM_COLOR entries while plotting */ +#define MAX_GRAPH_DATA 50000 + Value data [MAX_GRAPH_DATA]; /* flat pairs: data[i] is a DATUM_* tag, data[i+1] its value */ + int data_index; /* number of data[] slots in use */ +#define DASHED 0x1000000 // dashed line flag + + Value max_y; + Value min_y; + Value min_x; + Value max_x; +} BMPGraph; + +extern void BMPGraphing_set_title (BMPGraph*, const char *); +extern void BMPGraphing_draw_labels_log2 (BMPGraph*); +extern BMPGraph *BMPGraphing_new (int w, int h, int x_axis_mode); +extern void BMPGraphing_new_line (BMPGraph *, char *str, RGB color); +extern void BMPGraphing_add_point (BMPGraph *, Value x, Value y); +extern void BMPGraphing_plot_log2 (BMPGraph *, Value x, Value y); +extern void BMPGraphing_plot_linear (BMPGraph *, Value x, Value y, Value max_amt); +extern void BMPGraphing_make (BMPGraph*); +extern BMP *BMPGraphing_get_graph (BMPGraph*); +extern void BMPGraphing_destroy (BMPGraph*); + +#endif diff --git a/COPYING.txt b/COPYING.txt new file mode 100755 index 0000000..3912109 --- /dev/null +++ b/COPYING.txt @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC
LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100755 index 0000000..913d023 --- /dev/null +++ b/Makefile @@ -0,0 +1,87 @@ +#============================================================================ +# bandwidth, a benchmark to estimate memory transfer bandwidth. +# Copyright (C) 2005-2014 by Zack T Smith. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The author may be reached at veritas@comcast.net. +#============================================================================ + +CFLAGS= -O6 +CFLAGS= -g +CC=gcc +LD=gcc +SRC=main.c +OBJ=main.o +LIB= +AS=nasm + +message: + @echo "" + @echo "To compile for x86 Linux: make bandwidth32" + @echo "To compile for x86_64 Linux: make bandwidth64" + @echo "To compile for x86 Mac OS/X: make bandwidth-mac32" + @echo "To compile for x86_64 Mac OS/X: make bandwidth-mac64" + @echo "To compile for x86 Win32/Cygwin: make bandwidth-win32" + @echo "Note! 
For the Mac you will need to install the latest NASM; Apple's is insufficient." + @echo "" + +bandwidth64: main.c routines64.asm BMP64.a BMPGraphing64.a + ${AS} -f elf64 routines64.asm -o routines64.o + ${CC} ${CFLAGS} -m64 -c ${SRC} + ${LD} -m64 routines64.o ${OBJ} BMP64.a -lm BMPGraphing64.a -o bandwidth64 + +bandwidth32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f elf routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} + ${LD} -m32 routines32.o ${OBJ} BMP32.a -lm BMPGraphing32.a -o bandwidth32 + +bandwidth-mac64: main.c routines64.asm BMPGraphing64.a BMP64.a + ${AS} -f macho64 routines64.asm -o routines64.o + ${CC} ${CFLAGS} -m64 -c ${SRC} + ${LD} -m64 -lm BMPGraphing64.a BMP64.a routines64.o ${OBJ} ${LIB} -o bandwidth-mac64 + +bandwidth-mac32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f macho routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} + ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-mac32 + +bandwidth-win32: main.c routines32.asm BMP32.a BMPGraphing32.a + ${AS} -f win32 routines32.asm -o routines32.o + ${CC} ${CFLAGS} -m32 -c ${SRC} -Wall -O6 -D__WIN32__ -DWINVER=0x0600 + ${LD} -m32 BMP32.a -lm BMPGraphing32.a routines32.o ${OBJ} ${LIB} -o bandwidth-win32 + +BMPGraphing64.a: BMPGraphing.c + ${CC} ${CFLAGS} -m64 -c BMPGraphing.c + ar rvs BMPGraphing64.a BMPGraphing.o + +BMPGraphing32.a: BMPGraphing.c + ${CC} ${CFLAGS} -m32 -c BMPGraphing.c + ar rvs BMPGraphing32.a BMPGraphing.o + +BMP64.a: BMP.c + ${CC} ${CFLAGS} -m64 -c BMP.c font.c minifont.c + ar rvs BMP64.a BMP.o font.o minifont.o + +BMP32.a: BMP.c + ${CC} ${CFLAGS} -m32 -c BMP.c font.c minifont.c + ar rvs BMP32.a BMP.o font.o minifont.o + +clean: + rm -f main.o bandwidth bandwidth32 bandwidth64 routines32.o routines64.o + rm -f bandwidth-win32.exe bandwidth.bmp bandwidth-mac32 bandwidth-mac64 + rm -f BMP.o BMP32.a BMP64.a BMPGraphing.o BMPGraphing32.a BMPGraphing64.a + rm -f font.o minifont.o network_bandwidth.bmp + 
diff --git a/README.txt b/README.txt new file mode 100755 index 0000000..7189a27 --- /dev/null +++ b/README.txt @@ -0,0 +1,167 @@ + +This is the README file for my program, "bandwidth". + +Bandwidth is a benchmark that attempts to measure +memory bandwidth. In December 2010 (and as of +release 0.24), I extended 'bandwidth' to measure +network bandwidth as well. + +Bandwidth is useful because both memory bandwidth +and network bandwidth need to be measured to +give you a clear idea of what your computer(s) can do. +Merely relying on specs does not give a full picture +and indeed specs can be misleading. + +-------------------------------------------------- +MEMORY BANDWIDTH + +My program bandwidth performs sequential and random +reads and writes of varying sizes. This permits +you to infer from the graph how each type of memory +is performing. So for instance when bandwidth +writes a 256-byte chunk, you know that because +caches are normally write-back, this chunk +will reside entirely in the L1 cache. Whereas +a 512 kB chunk will mainly reside in L2. + +You could run a non-artificial benchmark and +observe that a general performance number is lower +on one machine or higher on another, but that may +conceal the cause. + +So the purpose of this program is to help you +pinpoint the cause of a performance problem, +or to affirm a general impression about a memory- +intensive program. + +It also tells you the best-case scenario e.g. +the maximum bandwidth achieved using sequential, +128-bit memory accesses. + +Release 1.1: + - Added larger font. +Release 1.0: + - Moved graphing into BMPGraphing module. + - Finally added LODS benchmarking, which + proves how badly lodsb/lodsw/lodsd/lodsq + perform. + - Added switches --faster and --fastest. +Release 0.32: + - Improved AVX support. +Release 0.31: + - Adds cache detection for Intel 32-bit CPUs + - Adds a little AVX support. + - Fixes vector-to/from-main transfer bugs.
+Release 0.30 adds cache detection for Intel 64-bit CPUs. +Release 0.29 improved graph granularity with more + 128-byte tests and removed ARM support. +Release 0.28 added a proper test of CPU features e.g. SSE 4.1. +Release 0.27 added finer-granularity 128-byte tests. +Release 0.26 fixed an issue with AMD processors. +Release 0.25 made network bandwidth bidirectional. +Release 0.24 added network bandwidth testing. + +Release 0.23 added: + - Mac OS/X 64-bit support. + - Vector-to-vector register transfer test. + - Main register to/from vector register transfer test. + - Main register byte/word/dword/qword to/from + vector register test (pinsr*, pextr* instructions). + - Memory copy test using SSE2. + - Automatic checks under Linux for SSE2 & SSE4. + +Release 0.22 added: + - Register-to-register transfer test. + - Register-to/from-stack transfer tests. + +Release 0.21 added: + - Standardized memory chunks to always be + a multiple of 256-byte mini-chunks. + - Random memory accesses, in which each + 256-byte mini-chunk accessed is accessed + in a random order, but also, inside each + mini-chunk the 32/64/128 data are accessed + pseudo-randomly as well. + - Now 'bandwidth' includes chunk sizes that + are not powers of 2, which increases + data points around the key chunk sizes + corresponding to common L1 and L2 cache + sizes. + - Command-line options: + --fast for 0.25 seconds per test. + --slow for 20 seconds per test. + --title for adding a graph title. + +Release 0.20 added graphing, with the graph +stored in a BMP image file. It also adds the +--slow option for more precise runs. + +Release 0.19 added a second 128-bit SSE writer +routine that bypasses the caches, in addition +to the one that doesn't.
+ +Release 0.18 was my Grand Unified bandwidth +benchmark that brought together support for +four operating systems: + - Linux + - Windows Mobile + - 32-bit Windows + - Mac OS/X 64-bit +and two processor architectures: + - x86 + - Intel64 +I've written custom assembly routines for +each architecture. + +Total run time for the default speed, which +has 5 seconds per test, is about 35 minutes. + +-------------------------------------------------- +NETWORK BANDWIDTH (beginning with release 0.24) + +In mid-December 2010, I extended bandwidth to measure +network bandwidth, which is useful for testing +your home or workplace network setup, and in theory +could be used to test machines across the Internet. + +Release 0.25 adds: + - Bidirectional network bandwidth testing. + - Specifiable port# (default is 49000). + +In the graph: + - Sent data appears as a solid line. + - Received data appears as a dashed line. + +The network test is pretty simple. It sends chunks +of data of varying sizes to whatever computers +(nodes) that you specify. Each of those must be +running 'bandwidth' in transponder mode. + +The chunks of data range from 32 kB up to 32 MB. +These are actually sent as a stream of 1 or more +32 kB sub-chunks. + +Sample output: + output/Network-Linux2.6-Celeron-2.8GHz-32bit-loopback.bmp + output/Network-MacOSX32-Corei5-2.4GHz-64bit-loopback.bmp + output/Network-Mac64-Linux32.bmp + +How to start a transponder: + ./bandwidth-mac64 --transponder + +Example invocation of the test leader: + ./bandwidth64 --network 192.168.1.104 + +I've tested network mode on: + Linux 32-bit + Mac OS/X 32- and 64-bit + Win/Cygwin 32-bit. + +-------------------------------------------------- +This program is provided without any warranty +and AS-IS. See the file COPYING for details. + +Zack Smith +1@zsmith.co +March 2013 + @@ -0,0 +1,147 @@ +/*============================================================================ + bandwidth, a benchmark to estimate memory transfer bandwidth.
+ Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at 1@zsmith.co. + *===========================================================================*/ + +//--------------------------------------------------------------------------- +// Change log +// 0.18 Grand unified version supports x86/intel64/arm, linux/win32/winmo. +// 0.19 Now have 128-bit writer that goes to cache AND one that bypasses. +// 0.20 Added my bmplib and graphing of output. Also added --slow option. +// 0.21 Adds random testing. Min chunk size = 256 B. Allows non-2^n chunks. +// 0.22 Adds register-to-register and register-to/from-stack transfers. +// 0.23 Adds vector-to-vector and register-to-vector transfers, & Mac support. +// 0.24 Adds network bandwidth tests from this PC to specified others. +// 0.25 Made network tests bidirectional to test asymmetric networks. +// 0.26 Fixes to prevent certain vector instructions being used w/AMD chips. +// 0.27 Added 128-byte tests for greater precision. +// 0.28 Added use of CPUID. +// 0.29 Added more 128-byte tests. +// 0.30 Adds cache identification for Intel CPUs in 64-bit mode. +// 0.31 Adds cache identification for Intel CPUs in 32-bit mode. +// 0.32 Added AVX support. +// 1.0 Moved graphing logic into BMPGraphing. Added LODS support. 
+// 1.1 Switched to larger font in graphing module. +//--------------------------------------------------------------------------- + +#ifndef _DEFS_H +#define _DEFS_H + +#define RELEASE "1.1" + +#ifndef bool +typedef char bool; +enum { true = 1, false = 0 }; +#endif + +#define NETWORK_DEFAULT_PORTNUM (49000) +#define NETSIZE_MIN (15) +#define NETSIZE_MAX (28) +#define NETWORK_CHUNK_SIZE (1<<NETSIZE_MIN) + +#define DOING_LODS // lodsq and lodsd + +extern int Reader (void *ptr, unsigned long size, unsigned long loops); + +extern int ReaderLODSQ (void *ptr, unsigned long size, unsigned long loops); +extern int ReaderLODSD (void *ptr, unsigned long size, unsigned long loops); +extern int ReaderLODSW (void *ptr, unsigned long size, unsigned long loops); +extern int ReaderLODSB (void *ptr, unsigned long size, unsigned long loops); + +extern int Reader_128bytes (void *ptr, unsigned long size, unsigned long loops); +extern int RandomReader (void *ptr, unsigned long n_chunks, unsigned long loops); + +extern int Writer (void *ptr, unsigned long size, unsigned long loops, unsigned long value); +extern int Writer_128bytes (void *ptr, unsigned long size, unsigned long loops, unsigned long value); +extern int RandomWriter (void *ptr, unsigned long size, unsigned long loops, unsigned long value); + +extern int RegisterToRegister (unsigned long); + +extern int StackReader (unsigned long); +extern int StackWriter (unsigned long); + +extern int RegisterToVector (unsigned long); // SSE2 +extern int Register8ToVector (unsigned long); // SSE2 +extern int Register16ToVector (unsigned long); // SSE2 +extern int Register32ToVector (unsigned long); // SSE2 +extern int Register64ToVector (unsigned long); // SSE2 + +extern int VectorToVector (unsigned long); // SSE2 + +extern int VectorToVectorAVX (unsigned long); + +extern int VectorToRegister (unsigned long); // SSE2 +extern int Vector8ToRegister (unsigned long); // SSE2 +extern int Vector16ToRegister (unsigned long); // SSE2 +extern int 
Vector32ToRegister (unsigned long); // SSE2 +extern int Vector64ToRegister (unsigned long); // SSE2 + +extern int Copy (void*, void*, unsigned long, unsigned long); +extern int CopySSE (void*, void*, unsigned long, unsigned long); +extern int CopyAVX (void*, void*, unsigned long, unsigned long); +extern int CopySSE_128bytes (void*, void*, unsigned long, unsigned long); + +extern int ReaderAVX (void *ptr, unsigned long, unsigned long); +extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long); +extern int ReaderSSE2_bypass (void *ptr, unsigned long, unsigned long); +extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long); +extern int RandomReaderSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long); + +extern int WriterAVX (void *ptr, unsigned long, unsigned long, unsigned long); +extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long); +extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long); + +extern int ReaderSSE2_128bytes(void *ptr, unsigned long, unsigned long); +extern int WriterSSE2_128bytes(void *ptr, unsigned long, unsigned long, unsigned long); + +extern int ReaderSSE2_128bytes_bypass (void *ptr, unsigned long, unsigned long); +extern int WriterSSE2_128bytes_bypass (void *ptr, unsigned long, unsigned long, unsigned long); + +extern int WriterAVX_bypass (void *ptr, unsigned long, unsigned long, unsigned long); +extern int WriterSSE2_bypass (void *ptr, unsigned long, unsigned long, unsigned long); +extern int RandomWriterSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long, unsigned long); + +extern void get_cpuid_family (char *family_return); +extern void get_cpuid_cache_info (uint32_t *array, int index); +extern unsigned get_cpuid1_ecx (); +extern unsigned get_cpuid1_edx (); +extern unsigned get_cpuid7_ebx (); +extern unsigned get_cpuid_80000001_ecx (); +extern unsigned get_cpuid_80000001_edx (); + +#define CPUID_EDX_MMX (1<<23) +#define 
CPUID_EDX_SSE (1<<25) +#define CPUID_EDX_SSE2 (1<<26) +#define CPUID_EDX_INTEL64 (1<<29) // "Long Mode" on AMD. +#define CPUID_EDX_XD (1<<20) +#define CPUID_ECX_SSE3 (1) +#define CPUID_ECX_SSSE3 (1<<9) +#define CPUID_ECX_SSE4A (1<<6) +#define CPUID_ECX_SSE41 (1<<19) +#define CPUID_ECX_SSE42 (1<<20) +#define CPUID_ECX_AES (1<<25) // Encryption. +#define CPUID_ECX_AVX (1<<28) // 256-bit YMM registers. +#define CPUID_EBX_AVX2 (0x20) + +#define FBLOOPS_R 400 +#define FBLOOPS_W 800 +#define FB_SIZE (640*480*2) + +#endif + @@ -0,0 +1,1655 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +#include <stdio.h> + +#include "BMP.h" + +// Mini characters, 8 pixels high. 
+static const char *font_chars_ [] = +{ + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + " ", + " ", + "##", + "##", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + " # #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ##########", + " ##########", + " ## ## ", + " ## ## ", + " ##########", + " ##########", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + "", + "", + "", + + " ## ", + " ## ", + " ########", + "## ##", + "## ## ", + "## ## ", + " ###### ", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + "########", + " ## ", + " ## ", + "", + "", + "", + + " ## ##", + " # # ##", + " ## ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "## ## ", + "## # #", + "## ## ", + "", + "", + "", + + " #####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ", + " ## ## ##", + " #### ##", + "## ## ##", + "## ####", + "## ##", + "## ##", + " ## ## ##", + " ##### ##", + "", + "", + "", + + "###", + "###", + " ##", + " #", + " #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + "##", + "##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + + "## ", + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "", + + " ", + "", + "", + " ##", + "## ## ##", + " ## ## ##", + " ######", + " ####", + " ######", + " ## ## ##", + "## ## ##", + " ##", + "", + "", + "", + "", + "", + + "", + "", + "", + " ##", + " ##", + " ##", + " ##", + "##########", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + "", + "", + + " ", + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "###", + "###", + " ##", + " ##", + "#", + + " ", + "", + "", + "", + "", + "", + "", + "#######", + "", + "", + "", + "", + "", + "", + "", + "", + 
"", + + " ", + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "###", + "###", + "", + "", + "", + + " ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + +" ##", +" ##", +" ###", +"#####", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ##", +" ## ", + "", +"", +"", + + " #### ", + " ## ##", + "## ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ## ", + "## ", + "## ", + "## ", + "########", + "", + "", + "", + + "########", + " ##", + " ##", + " ##", + " ## ", + " ## ", + " #### ", + " ##", + " ##", + " ##", + " ##", + "## ##", + " ## ##", + " #### ", + "", + "", + "", + + " ##", + " ###", + " ####", + " ## ##", + " ## ##", + "## ##", + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "########", + "## ", + "## ", + "## ", + "## ", + "###", + " ######", + " ##", + " ##", + " ##", + " ##", + "## ##", + " ## ##", + " #### ", + "", + "", + "", + + " ##### ", + " ## #", + "## ", + "## ", + "## ", + "## ", + "#######", + "## ## ", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + + "########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### ", + "", + "", + "", + + " ##### ", + " ## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ", + " ", + "", + "", + "", + "###", + "###", + "", + "", + "", + "", + "###", + "###", + "", + 
"", + "", + "", + + " ", + "", + "", + "", + "", + "###", + "###", + "", + "", + "", + "", + "###", + "###", + " ##", + " ##", + "#", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "############", + "############", + " ", + " ", + "############", + "############", + "", + "", + "", + "", + "", + + "## ", + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + " ##", + " ##", + " ### ", + " ##", + " ##", + " ##", + "", + "", + " ##", + " ##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ####", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ####", + "## ", + "## ", + " ## ##", + " #######", + "", + "", + "", + + " ##", + " ####", + " ####", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ########", + " ## ##", + " ## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "######## ", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ", + "########", + "", + "", + "", + + "##########", + "##", + "##", + "##", + "##", + "##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##########", + "", + "", + "", + + "##########", + "##", + "##", + "##", + "##", + "##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", 
+ "## ", + "## ", + "## ####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ###", + " ###### #", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "##########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ## ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "## ##", + "## ##", + " ## ##", + " ####", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "####", + "####", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##########", + "", + "", + "", + + "## ##", + "### ###", + "#### ####", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ## ## ##", + "## ### ##", + "## ### ##", + "## # ##", + "", + "", + "", + + "## ##", + "### ##", + "#### ##", + "#### ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ####", + "## ####", + "## ###", + "## ##", + "", + "", + "", + + " ###### ", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ######", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ###", + " ## ##", + " ##### ##", + "", + "", + "", + + "########", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "########", + "####", + "## 
##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ######", + " ## ##", + "## ##", + "## ##", + "##", + " ###", + " #####", + " ##", + " ##", + " ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "##########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ######", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ## ## ", + " ### ", + " ### ", + " # ", + "", + "", + "", + +"## ## ##", +"## ## ##", +"## ## ##", +" ## #### ##", +" ## #### ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ## ## ## ##", +" ### ###", +" ### ###", +" # #", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ##", + " ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + "## ##", + "## ##", + "", + "", + "", + + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ####", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "#########", + "", + "", + "", + + "#####", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "##", + "#####", + "", + + "##", + "##", + "##", + "##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "", + "", + "", + + "#####", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "#####", + "", + + " ##", + " ####", + " ## ##", + " ## ##", + "## ##", + "## 
##", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "########", + "", + "", + "", + + "####", + "####", + "##", + " ##", + " #", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + " ######", + " ## ##", + " ##", + " ##", + " ######", + " ## ##", + "## ##", + "## ##", + " ## ##", + " ##### ##", + "", + "", + "", + + "##", + "##", + "##", + "##", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "### ##", + "## ####", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ##", + "## ", + "## ", + "## ", + "## ", + "## ", + "## ", + " ## ##", + " #####", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + " #### ##", + " ## ###", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ###", + " #### ##", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ##", + "## ##", + "## ##", + "#########", + "##", + "##", + "##", + " ## ##", + " ######", + "", + "", + "", + + " ####", + " ##", + " ## ", + " ## ", + "#####", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + " ## ", + "", + "", + "", + + " ", + "", + "", + "", + " ##### #", + " ## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + "## ##", + " ######", + + "##", + "##", + "##", + "##", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + +"##", +"##", +"", +"", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", + "", + "", + "", + + " ##", + " ##", + " ", + " ", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + "###", + + "##", + "##", + "##", + "##", + "## ##", + "## ##", + "## ## ", + "## ## ", + "#### ", + "#### ", + "## ## ", + "## 
## ", + "## ##", + "## ##", + "", + "", + "", + +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", +"##", + "", + "", + "", + + " ", + "", + "", + "", + "## ### ####", + "### ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "## ## ##", + "", + "", + "", + + " ", + "", + "", + "", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "", + "", + "", + + " ", + "", + "", + "", + " #####", + " ## ## ", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ## ", + " ##### ", + "", + "", + "", + + " ", + "", + "", + "", + "## ####", + "### ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "#######", + "##", + "##", + "##", + + " ", + "", + "", + "", + " #### ##", + " ## ###", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " #######", + " ##", + " ##", + " ##", + + " ", + "", + "", + "", + "## ####", + "## ##", + "####", + "###", + "##", + "##", + "##", + "##", + "##", + "##", + "", + "", + "", + + " ", + "", + "", + "", + " #######", + "## ##", + "##", + " ##", + " ###", + " ###", + " ##", + " ##", + "## ##", + " #######", + "", + "", + "", + + " ##", + " ##", + " ##", + " ##", + "######", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ##", + " ####", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + "## ##", + " ## ##", + " ##### #", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " #####", + " ###", + "", + "", + "", + + " ", + "", + "", + "", + "## ## ##", + "## ## ##", + "## ## ##", + " ## #### ##", + " ## #### ##", + " ## #### ##", + " ## ## ## ##", + " ## ## ## ##", + " ##### #####", + " ### ###", + "", + "", + "", + + " ", + "", + "", + 
"", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ##", + " ##", + " ## ##", + " ## ##", + "## ##", + "## ##", + "", + "", + "", + + " ", + "", + "", + "", + "## ##", + "## ##", + "## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ## ##", + " ####", + " ##", + " ##", + "####", + + " ", + "", + "", + "", + "#########", + " ##", + " ##", + " ##", + " ##", + " ##", + " ## ", + " ## ", + "## ", + "#########", + "", + "", + "", + +}; + +const char **get_font_chars () +{ + return font_chars_; +} + @@ -0,0 +1,28 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
+ *============================================================================*/ + +#ifndef _FONT_H +#define _FONT_H + +extern const char **get_font_chars (void); + +#endif + diff --git a/loopback.sh b/loopback.sh new file mode 100755 index 0000000..780d50f --- /dev/null +++ b/loopback.sh @@ -0,0 +1,5 @@ +#!/bin/bash +EXE=bandwidth32 +./$EXE --transponder & +./$EXE --network 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 127.0.0.1 +kill %1 @@ -0,0 +1,2442 @@ +/*============================================================================ + bandwidth 1.1, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
 *===========================================================================*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <wchar.h>
#include <math.h>

#include <netdb.h> // gethostbyname
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

// Dimensions, in pixels, of the output graph image.
#define GRAPH_WIDTH 1440
#define GRAPH_HEIGHT 900

#include "defs.h"
#include "BMP.h"
#include "BMPGraphing.h"

// Graph titles. RELEASE comes from defs.h and is pasted in at compile time.
#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"

#ifdef __WIN32__
#include <windows.h>
#endif

#ifdef __linux__
#include <linux/fb.h>
#include <sys/mman.h>
#endif

// TCP/UDP port for the network tests; starts at the default from defs.h.
static int network_port = NETWORK_DEFAULT_PORTNUM;

//----------------------------------------
// Transfer modes, passed as the "mode" argument of do_write() and do_read().
// NO_SSE2 (and any value a function has no case for) selects the native
// register-width routines. The LODS* modes have cases only in do_read().
//
enum {
	NO_SSE2,
	SSE2,
	SSE2_BYPASS,
	AVX,
	AVX_BYPASS,
	LODSQ,
	LODSD,
	LODSW,
	LODSB
};

// Graph being drawn; NULL until one is created elsewhere in this file.
static BMPGraph *graph = NULL;

// NOTE(review): these four flags are not read in this part of the file;
// presumably they gate which tests run and are set during CPU detection.
static bool use_sse2 = true;
static bool use_sse4 = true;
static bool is_intel = false;
static bool is_amd = false;

// CPU feature flags, all initially clear.
// NOTE(review): presumably filled in from the CPUID_* masks in defs.h --
// confirm against the CPUID probing code.
static uint32_t cpu_has_mmx = 0;
static uint32_t cpu_has_sse = 0;
static uint32_t cpu_has_sse2 = 0;
static uint32_t cpu_has_sse3 = 0;
static uint32_t cpu_has_ssse3 = 0;
static uint32_t cpu_has_sse4a = 0;
static uint32_t cpu_has_sse41 = 0;
static uint32_t cpu_has_sse42 = 0;
static uint32_t cpu_has_aes = 0;
static uint32_t cpu_has_avx = 0;
static uint32_t cpu_has_avx2 = 0;
static uint32_t cpu_has_64bit = 0;
static uint32_t cpu_has_xd = 0;

//----------------------------------------
// Parameters for the tests.
//

// Minimum wall-clock duration of each do_write()/do_read() timing loop,
// in microseconds.
static long usec_per_test = 5000000;	// 5 seconds per memory test.

//----------------------------------------
// Chunk sizes to benchmark, in bytes, in ascending order.
// Every entry is a multiple of 128, which do_write() and do_read() insist on.
// NOTE(review): the trailing 0 looks like an end-of-list sentinel -- confirm
// against the loop that walks this table.
//
static int chunk_sizes[] = {
	128,
	256,
	384,
	512,
	640,
	768,
	896,
	1024,
	1280,
	2048,
	3072,
	4096,
	6144,
	8192,	// Some processors' L1 data caches are only 8 kB.
	12288,
	16384,
	20480,
	24576,
	28672,
	32768,	// Common L1 data cache size.
	34*1024,
	36*1024,
	40960,
	49152,
	65536,
	131072,	// Old L2 cache size.
	192 * 1024,
	256 * 1024,	// Old L2 cache size.
	320 * 1024,
	384 * 1024,
	512 * 1024,	// Old L2 cache size.
	768 * 1024,
	1 << 20,	// 1 MB = common L2 cache size.
	(1024 + 256) * 1024,	// 1.25 MB
	(1024 + 512) * 1024,	// 1.5 MB
	(1024 + 768) * 1024,	// 1.75 MB
	1 << 21,	// 2 MB = common L2 cache size.
	(2048 + 256) * 1024,	// 2.25 MB
	(2048 + 512) * 1024,	// 2.5 MB
	(2048 + 768) * 1024,	// 2.75 MB
	3072 * 1024,	// 3 MB = common L2 cache size.
	3407872,	// 3.25 MB
	3 * 1024 * 1024 + 1024 * 512,	// 3.5 MB
	1 << 22,	// 4 MB
	5242880,	// 5 MB
	6291456,	// 6 MB (common L2 cache size)
	7 * 1024 * 1024,
	8 * 1024 * 1024,	// Xeon E3s often have 8 MB L3.
	9 * 1024 * 1024,
	10 * 1024 * 1024,	// Xeon E5-2609 has 10 MB L3.
	12 * 1024 * 1024,
	14 * 1024 * 1024,
	15 * 1024 * 1024,	// Xeon E5-2630 has 15 MB L3.
	16 * 1024 * 1024,
	20 * 1024 * 1024,	// Xeon E5-2690 has 20 MB L3.
	21 * 1024 * 1024,
	32 * 1024 * 1024,
	48 * 1024 * 1024,
	64 * 1024 * 1024,
	72 * 1024 * 1024,
	96 * 1024 * 1024,
	128 * 1024 * 1024,
	0
};

// One slot per chunk_sizes entry (including the trailing 0).
// NOTE(review): presumably holds log2 of each size for graph placement --
// it is filled in outside this part of the file.
static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)];

//----------------------------------------------------------------------------
// Name:	error
// Purpose:	Complain and exit.
//----------------------------------------------------------------------------
void error (char *s)
{
#ifndef __WIN32__
	fprintf (stderr, "Error: %s\n", s);
	exit (1);
#else
	wchar_t tmp [200];
	size_t i;

	// Widen the message for MessageBoxW, truncating it instead of
	// overrunning tmp when the text is longer than the buffer.
	for (i = 0; s[i] && i < (sizeof (tmp) / sizeof (tmp[0])) - 1; i++)
		tmp[i] = (unsigned char) s[i];
	tmp[i] = 0;
	MessageBoxW (0, tmp, L"Error", 0);
	ExitProcess (1);	// Nonzero, matching exit(1) on other platforms.
#endif
}

//============================================================================
// Output buffer logic.
// This is somewhat vestigial code, originating with Windows Mobile ARM port.
//
// Text accumulates in msg until dump() or flush() writes it out. Every
// append is limited to the space still free in msg: strings are truncated
// and formatted numbers that do not fit are dropped, so nothing is ever
// written past the end of the buffer.
//============================================================================

#define MSGLEN 10000
static wchar_t msg [MSGLEN];

//----------------------------------------------------------------------------
// Name:	print
// Purpose:	Appends a wide string to the output buffer.
//----------------------------------------------------------------------------
void print (wchar_t *s)
{
	// wcsncat's count limits the characters appended (it then adds the
	// terminator), so it must be the free space, not the buffer size.
	size_t used = wcslen (msg);
	wcsncat (msg, s, (MSGLEN - 1) - used);
}

//----------------------------------------------------------------------------
// Name:	newline
// Purpose:	Appends a line break to the output buffer.
//----------------------------------------------------------------------------
void newline (void)
{
	print (L"\n");
}

//----------------------------------------------------------------------------
// Name:	println
// Purpose:	Appends a wide string followed by a line break.
//----------------------------------------------------------------------------
void println (wchar_t *s)
{
	print (s);
	newline ();
}

//----------------------------------------------------------------------------
// Name:	print_int
// Purpose:	Appends a signed decimal integer to the output buffer.
//----------------------------------------------------------------------------
void print_int (int d)
{
	size_t used = wcslen (msg);

	// swprintf reports failure when the text does not fit; restore the
	// old terminator so the buffer contents stay well defined.
	if (swprintf (msg + used, MSGLEN - used, L"%d", d) < 0)
		msg [used] = 0;
}

//----------------------------------------------------------------------------
// Name:	print_uint
// Purpose:	Appends an unsigned decimal integer to the output buffer.
// Note:	The parameter is an unsigned int, so wider values passed by
//		callers are narrowed before they get here.
//----------------------------------------------------------------------------
void print_uint (unsigned int d)
{
	size_t used = wcslen (msg);

	if (swprintf (msg + used, MSGLEN - used, L"%u", d) < 0)
		msg [used] = 0;
}

//----------------------------------------------------------------------------
// Name:	println_int
// Purpose:	Appends a signed decimal integer followed by a line break.
//----------------------------------------------------------------------------
void println_int (int d)
{
	print_int (d);
	newline ();
}

//----------------------------------------------------------------------------
// Name:	print_result
// Purpose:	Appends a bandwidth figure, with one decimal place and the
//		unit "MB/s", to the output buffer.
//----------------------------------------------------------------------------
void print_result (long double result)
{
	size_t used = wcslen (msg);

	if (swprintf (msg + used, MSGLEN - used, L"%.1Lf MB/s", result) < 0)
		msg [used] = 0;
}

//----------------------------------------------------------------------------
// Name:	dump
// Purpose:	Writes the buffered text to a stream, one narrowed character
//		at a time, then empties the buffer.
// Params:	f - destination stream; NULL means stdout.
//----------------------------------------------------------------------------
void dump (FILE *f)
{
	int i;

	if (!f)
		f = stdout;

	for (i = 0; msg[i]; i++)
		fputc ((char) msg[i], f);

	msg [0] = 0;
}

//----------------------------------------------------------------------------
// Name:	flush
// Purpose:	Writes the buffered text to stdout and flushes stdout.
//----------------------------------------------------------------------------
void flush (void)
{
	dump (NULL);
	fflush (stdout);
}

//----------------------------------------------------------------------------
// Name:	print_size
// Purpose:	Appends a byte count in human-readable form: bytes below
//		1536, whole kB below 1 MB, otherwise MB with a quarter-MB
//		fraction (.25, .5 or .75) when one applies.
//----------------------------------------------------------------------------
void print_size (unsigned long size)
{
	if (size < 1536) {
		print_int ((int) size);
		print (L" B");
	}
	else if (size < (1<<20)) {
		print_int ((int) (size >> 10));
		print (L" kB");
	} else {
		print_int ((int) (size >> 20));
		// Bits 18-19 give the number of quarter megabytes.
		switch ((size >> 18) & 3) {
		case 1: print (L".25"); break;
		case 2: print (L".5"); break;
		case 3: print (L".75"); break;
		}
		print (L" MB");
	}
}

//============================================================================
// Timing logic.
//============================================================================

//----------------------------------------------------------------------------
// Name:	mytime
// Purpose:	Reports time in microseconds.
// Returns:	A microsecond timestamp. Where unsigned long is 32 bits the
//		value wraps, but the difference of two readings taken less
//		than one wrap apart is still correct, which is how callers
//		use it.
//----------------------------------------------------------------------------
unsigned long mytime (void)
{
#ifndef __WIN32__
	struct timeval tv;

	// POSIX leaves a non-null timezone argument unspecified; pass NULL.
	gettimeofday (&tv, NULL);

	// Do the arithmetic in unsigned long: a signed multiply of the
	// seconds count by 1000000 overflows when long is 32 bits.
	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;
#else
	return 1000UL * GetTickCount ();	// accurate enough.
#endif
}

//----------------------------------------------------------------------------
// Name:	calculate_result
// Purpose:	Calculates and prints a result.
// Params:	chunk_size  - bytes transferred per loop.
//		total_loops - number of loops performed.
//		diff        - elapsed time in microseconds; must be nonzero,
//		              otherwise error() is called.
// Returns:	10 times the number of megabytes per second.
//----------------------------------------------------------------------------
int
calculate_result (unsigned long chunk_size, long long total_loops, long diff)
{
	if (!diff)
		error ("Zero time difference.");

	long double result = (long double) chunk_size;
	result *= (long double) total_loops;
	result *= 1000000.;	// diff is in microseconds; scale to per-second.
	result /= 1048576.;	// Bytes to megabytes (2^20 bytes).
	result /= (long double) diff;

	print_result (result);

	return (long) (10.0 * result);
}

//============================================================================
// Tests.
//============================================================================

//----------------------------------------------------------------------------
// Name:	do_write
// Purpose:	Performs write on chunk of memory of specified size.
//----------------------------------------------------------------------------
// Params:	size   - chunk size in bytes; must be a multiple of 128.
//		mode   - one of the transfer-mode enum values (SSE2, AVX, ...).
//		random - false for sequential access, true to visit the
//		         chunk's 256-byte pieces in shuffled order.
// Returns:	The value of calculate_result(): 10 times the MB/s figure.
// Notes:	The timing loop repeats until usec_per_test microseconds have
//		elapsed. With mode AVX or AVX_BYPASS and random set, no
//		writer is called, so the loop only spins -- NOTE(review):
//		presumably callers never request that combination.
//----------------------------------------------------------------------------
int
do_write (unsigned long size, int mode, bool random)
{
	unsigned char *chunk;
	unsigned char *chunk0;
	unsigned long loops;
	unsigned long long total_count=0;
#ifdef __x86_64__
	unsigned long value = 0x1234567689abcdef;
#else
	unsigned long value = 0x12345678;
#endif
	unsigned long diff=0, t0;
	unsigned long tmp;
	unsigned long **chunk_ptrs = NULL;

	if (size & 127)
		error ("do_write(): chunk size is not multiple of 128.");

	//-------------------------------------------------
	// Over-allocate so the working pointer can be moved up to a 32-byte
	// boundary. chunk0 keeps the address that must be given to free().
	//
	chunk0 = malloc (size+64);
	chunk = chunk0;
	if (!chunk)
		error ("Out of memory");

	// Round chunk up to the next multiple of 32.
	tmp = (unsigned long) chunk;
	if (tmp & 31) {
		tmp -= (tmp & 31);
		tmp += 32;
		chunk = (unsigned char*) tmp;
	}

	//----------------------------------------
	// Set up random pointers to chunks.
	//
	if (random) {
		// From here on tmp is the number of 256-byte pieces.
		tmp = size/256;
		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
		if (!chunk_ptrs)
			error ("Out of memory.");

		//----------------------------------------
		// Store pointers to all chunks into array.
		//
		int i;
		for (i = 0; i < tmp; i++) {
			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
		}

		//----------------------------------------
		// Randomize the array of chunk pointers.
		// Makes 100 passes, swapping each slot with a randomly
		// chosen one.
		//
		int k = 100;
		while (k--) {
			for (i = 0; i < tmp; i++) {
				int j = rand() % tmp;
				if (i != j) {
					unsigned long *ptr = chunk_ptrs [i];
					chunk_ptrs [i] = chunk_ptrs [j];
					chunk_ptrs [j] = ptr;
				}
			}
		}
	}

	//-------------------------------------------------
	// Describe the test in the output buffer.
	//
	if (random)
		print (L"Random write ");
	else
		print (L"Sequential write ");

	switch (mode) {
	case SSE2:
		print (L"(128-bit), size = ");
		break;
	case AVX:
		print (L"(256-bit), size = ");
		break;
	case AVX_BYPASS:
		print (L"bypassing cache (256-bit), size = ");
		break;
	case SSE2_BYPASS:
		print (L"bypassing cache (128-bit), size = ");
		break;
	default:
#ifdef __x86_64__
		print (L"(64-bit), size = ");
#else
		print (L"(32-bit), size = ");
#endif
	}

	print_size (size);
	print (L", ");

	// Loops per timing iteration: enough to cover 64 MB (1 << 26 bytes)
	// of transfer, but never fewer than one.
	loops = (1 << 26) / size;// XX need to adjust for CPU MHz
	if (loops < 1)
		loops = 1;

	t0 = mytime ();

	// Call the selected writer repeatedly until the time budget is used
	// up. "size & 128" is set when size is an odd multiple of 128; those
	// sizes go to the *_128bytes variants.
	while (diff < usec_per_test) {
		total_count += loops;

		switch (mode) {
		case SSE2:
			if (random)
				RandomWriterSSE2 (chunk_ptrs, size/256, loops, value);
			else {
				if (size & 128)
					WriterSSE2_128bytes (chunk, size, loops, value);
				else
					WriterSSE2 (chunk, size, loops, value);
			}
			break;

		case SSE2_BYPASS:
			if (random)
				RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value);
			else {
				if (size & 128)
					WriterSSE2_128bytes_bypass (chunk, size, loops, value);
				else
					WriterSSE2_bypass (chunk, size, loops, value);
			}
			break;

		case AVX:
			// Sequential only; nothing runs when random is set.
			if (!random) {
				WriterAVX (chunk, size, loops, value);
			}
			break;

		case AVX_BYPASS:
			// Sequential only; nothing runs when random is set.
			if (!random) {
				WriterAVX_bypass (chunk, size, loops, value);
			}
			break;

		default:
			if (random)
				RandomWriter (chunk_ptrs, size/256, loops, value);
			else {
				if (size & 128)
					Writer_128bytes (chunk, size, loops, value);
				else
					Writer (chunk, size, loops, value);
			}
		}

		diff = mytime () - t0;
	}

	print (L"loops = ");
	// print_uint takes an unsigned int, so total_count is narrowed here.
	print_uint (total_count);
	print (L", ");

	flush ();

	int result = calculate_result (size, total_count, diff);
	newline ();

	flush ();

	// Free the pointer malloc returned, not the aligned one.
	free ((void*)chunk0);

	if (chunk_ptrs)
		free (chunk_ptrs);

	return result;
}


//----------------------------------------------------------------------------
// Name:	do_read
// Purpose:	Performs sequential or random read on chunk of memory of
//		specified size.
// Params:	size   - chunk size in bytes; must be a multiple of 128.
//		mode   - one of the transfer-mode enum values (SSE2, AVX,
//		         LODSB, ...).
//		random - false for sequential access, true to visit the
//		         chunk's 256-byte pieces in shuffled order.
// Returns:	The value of calculate_result(): 10 times the MB/s figure.
// Notes:	The timing loop repeats until usec_per_test microseconds have
//		elapsed. With mode AVX or any LODS* mode and random set, no
//		reader is called, so the loop only spins. AVX_BYPASS has no
//		case in the timing loop and therefore runs the default
//		(native-width) readers.
//----------------------------------------------------------------------------
int
do_read (unsigned long size, int mode, bool random)
{
	unsigned long loops;
	unsigned long long total_count = 0;
	unsigned long t0, diff=0;
	unsigned char *chunk;
	unsigned char *chunk0;
	unsigned long tmp;
	unsigned long **chunk_ptrs = NULL;

	if (size & 127)
		error ("do_read(): chunk size is not multiple of 128.");

	//-------------------------------------------------
	// Over-allocate so the working pointer can be moved up to a 32-byte
	// boundary. chunk0 keeps the address that must be given to free().
	//
	chunk0 = chunk = malloc (size+64);
	if (!chunk)
		error ("Out of memory");

	// Zeroes size bytes from the unaligned start, i.e. before chunk is
	// moved up below.
	memset (chunk, 0, size);

	// Round chunk up to the next multiple of 32.
	tmp = (unsigned long) chunk;
	if (tmp & 31) {
		tmp -= (tmp & 31);
		tmp += 32;
		chunk = (unsigned char*) tmp;
	}

	//----------------------------------------
	// Set up random pointers to chunks.
	//
	if (random) {
		// Number of 256-byte pieces. This int shadows the outer tmp.
		int tmp = size/256;
		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
		if (!chunk_ptrs)
			error ("Out of memory.");

		//----------------------------------------
		// Store pointers to all chunks into array.
		//
		int i;
		for (i = 0; i < tmp; i++) {
			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
		}

		//----------------------------------------
		// Randomize the array of chunk pointers.
		// Makes 100 passes, swapping each slot with a randomly
		// chosen one.
		//
		int k = 100;
		while (k--) {
			for (i = 0; i < tmp; i++) {
				int j = rand() % tmp;
				if (i != j) {
					unsigned long *ptr = chunk_ptrs [i];
					chunk_ptrs [i] = chunk_ptrs [j];
					chunk_ptrs [j] = ptr;
				}
			}
		}
	}

	//-------------------------------------------------
	// Describe the test in the output buffer.
	//
	if (random)
		print (L"Random read ");
	else
		print (L"Sequential read ");

	switch (mode) {
	case SSE2:
		print (L"(128-bit), size = ");
		break;
	case LODSB:
		print (L"(8-bit LODSB), size = ");
		break;
	case LODSW:
		print (L"(16-bit LODSW), size = ");
		break;
	case LODSD:
		print (L"(32-bit LODSD), size = ");
		break;
	case LODSQ:
		print (L"(64-bit LODSQ), size = ");
		break;
	case AVX:
		print (L"(256-bit), size = ");
		break;
	case AVX_BYPASS:
		print (L"bypassing cache (256-bit), size = ");
		break;
	case SSE2_BYPASS:
		print (L"bypassing cache (128-bit), size = ");
		break;
	default:
#ifdef __x86_64__
		print (L"(64-bit), size = ");
#else
		print (L"(32-bit), size = ");
#endif
	}

	print_size (size);
	print (L", ");

	flush ();

	// Loops per timing iteration: enough to cover 64 MB (1 << 26 bytes)
	// of transfer, but never fewer than one.
	loops = (1 << 26) / size;	// XX need to adjust for CPU MHz
	if (loops < 1)
		loops = 1;

	t0 = mytime ();

	// Call the selected reader repeatedly until the time budget is used
	// up. "size & 128" is set when size is an odd multiple of 128; those
	// sizes go to the *_128bytes variants.
	while (diff < usec_per_test) {
		total_count += loops;

		switch (mode) {
		case SSE2:
			if (random)
				RandomReaderSSE2 (chunk_ptrs, size/256, loops);
			else {
				if (size & 128)
					ReaderSSE2_128bytes (chunk, size, loops);
				else
					ReaderSSE2 (chunk, size, loops);
			}
			break;

		case SSE2_BYPASS:
			// Both random and sequential bypass readers are used.
			//
			if (random)
				RandomReaderSSE2_bypass (chunk_ptrs, size/256, loops);
			else {
				if (size & 128)
					ReaderSSE2_128bytes_bypass (chunk, size, loops);
				else
					ReaderSSE2_bypass (chunk, size, loops);
			}
			break;

		case AVX:
			// Sequential only; nothing runs when random is set.
			if (!random) {
				ReaderAVX (chunk, size, loops);
			}
			break;

		case LODSB:
			// Sequential only, as for all LODS* modes below.
			if (!random) {
				ReaderLODSB (chunk, size, loops);
			}
			break;

		case LODSW:
			if (!random) {
				ReaderLODSW (chunk, size, loops);
			}
			break;

		case LODSD:
			if (!random) {
				ReaderLODSD (chunk, size, loops);
			}
			break;

		case LODSQ:
			if (!random) {
				ReaderLODSQ (chunk, size, loops);
			}
			break;

		default:
			if (random) {
				RandomReader (chunk_ptrs, size/256, loops);
			} else {
				if (size & 128)
					Reader_128bytes (chunk, size, loops);
				else
					Reader (chunk, size, loops);
			}
		}

		diff = mytime () - t0;
	}

	print (L"loops = ");
	// print_uint takes an unsigned int, so total_count is narrowed here.
	print_uint (total_count);
	print (L", ");

	int result = calculate_result (size, total_count, diff);
	newline ();

	flush ();

	// Free the pointer malloc returned, not the aligned one.
	free (chunk0);

	if (chunk_ptrs)
		free (chunk_ptrs);

	return result;
}



//----------------------------------------------------------------------------
// Name:	do_copy
// Purpose:	Performs sequential memory copy.
+//---------------------------------------------------------------------------- +int +do_copy (unsigned long size, int mode) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk_src; + unsigned char *chunk_dest; + unsigned char *chunk_src0; + unsigned char *chunk_dest0; + unsigned long tmp; + + if (size & 127) + error ("do_copy(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk_src0 = chunk_src = malloc (size+64); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+64); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_dest = (unsigned char*) tmp; + } + + //------------------------------------------------- + print (L"Sequential copy "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else if (mode == AVX) { + print (L"(256-bit), size = "); + } + else { +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + if (mode == SSE2) { +#ifdef __x86_64__ + if (size & 128) + CopySSE_128bytes (chunk_dest, chunk_src, size, loops); + else + CopySSE (chunk_dest, chunk_src, size, loops); +#else + CopySSE (chunk_dest, chunk_src, size, loops); +#endif + } + else if (mode == AVX) { + if (!(size & 128)) + CopyAVX (chunk_dest, chunk_src, size, loops); + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + int 
result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk_src0); + free (chunk_dest0); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: fb_readwrite +// Purpose: Performs sequential read & write tests on framebuffer memory. +//---------------------------------------------------------------------------- +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) +void +fb_readwrite (bool use_sse2) +{ + unsigned long counter, total_count; + unsigned long length; + unsigned long diff, t0; + static struct fb_fix_screeninfo fi; + static struct fb_var_screeninfo vi; + unsigned long *fb = NULL; + unsigned long datum; + int fd; + register unsigned long foo; +#ifdef __x86_64__ + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + + //------------------------------------------------- + + fd = open ("/dev/fb0", O_RDWR); + if (fd < 0) + fd = open ("/dev/fb/0", O_RDWR); + if (fd < 0) { + println (L"Cannot open framebuffer device."); + return; + } + + if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + { + if (fi.visual != FB_VISUAL_TRUECOLOR && + fi.visual != FB_VISUAL_DIRECTCOLOR ) { + close (fd); + println (L"Need direct/truecolor framebuffer device."); + return; + } else { + unsigned long fblen; + + print (L"Framebuffer resolution: "); + print_int (vi.xres); + print (L"x"); + print_int (vi.yres); + print (L", "); + print_int (vi.bits_per_pixel); + println (L" bpp\n"); + + fb = (unsigned long*) fi.smem_start; + fblen = fi.smem_len; + + fb = mmap (fb, fblen, + PROT_WRITE | PROT_READ, + MAP_SHARED, fd, 0); + if (fb == MAP_FAILED) { + close (fd); + println (L"Cannot access framebuffer memory."); + return; + } + } + } + + //------------------- 
+ // Use only the upper half of the display. + // + length = FB_SIZE; + + //------------------- + // READ + // + print (L"Framebuffer memory sequential read "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_R; + + if (use_sse2) + ReaderSSE2 (fb, length, FBLOOPS_R); + else + Reader (fb, length, FBLOOPS_R); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); + + //------------------- + // WRITE + // + print (L"Framebuffer memory sequential write "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_W; + + if (use_sse2) + WriterSSE2_bypass (fb, length, FBLOOPS_W, value); + else + Writer (fb, length, FBLOOPS_W, value); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); +} +#endif + +//---------------------------------------------------------------------------- +// Name: register_test +// Purpose: Determines bandwidth of register-to-register transfers. +//---------------------------------------------------------------------------- +void +register_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to main register transfers (64-bit) "); +#else + print (L"Main register to main register transfers (32-bit) "); +#endif + flush (); +#define REGISTER_COUNT 10000 + + t0 = mytime (); + while (diff < usec_per_test) + { + RegisterToRegister (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to vector register transfers (64-bit) "); +#else + print (L"Main register to vector register transfers (32-bit) "); +#endif + flush (); +#define VREGISTER_COUNT 3333 + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + RegisterToVector 
(VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Vector register to main register transfers (64-bit) "); +#else + print (L"Vector register to main register transfers (32-bit) "); +#endif + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + print (L"Vector register to vector register transfers (128-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (cpu_has_avx) { + print (L"Vector register to vector register transfers (256-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVectorAVX (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 8-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector8ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Vector 16-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; 
+ total_count = 0; + while (diff < usec_per_test) + { + Vector16ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 32-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector32ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 64-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector64ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 8-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register8ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Main register 16-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register16ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 32-bit datum to vector register transfers "); + flush (); + + t0 = 
mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register32ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 64-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register64ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } +} + +//---------------------------------------------------------------------------- +// Name: stack_test +// Purpose: Determines bandwidth of stack-to/from-register transfers. +//---------------------------------------------------------------------------- +void +stack_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + +#ifdef __x86_64__ + print (L"Stack-to-register transfers (64-bit) "); +#else + print (L"Stack-to-register transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackReader (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#ifdef __x86_64__ + print (L"Register-to-stack transfers (64-bit) "); +#else + print (L"Register-to-stack transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackWriter (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +} + 
//----------------------------------------------------------------------------
// Name:	library_test
// Purpose:	Times the C library's memset and memcpy: NT_SIZE2 passes of
//		each over NT_SIZE-byte buffers, reporting each result through
//		calculate_result().
//----------------------------------------------------------------------------
void
library_test ()
{
	#define NT_SIZE (64*1024*1024)
	#define NT_SIZE2 (100)

	char *fill_buf;		// memset target, then memcpy source
	char *copy_buf;		// memcpy destination
	unsigned long started, finished;
	int pass;

	fill_buf = malloc (NT_SIZE);
	if (!fill_buf)
		error ("Out of memory");

	copy_buf = malloc (NT_SIZE);
	if (!copy_buf)
		error ("Out of memory");

	//--------------------------------------
	// memset: the fill value changes on every pass.
	//
	started = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memset (fill_buf, pass, NT_SIZE);
		pass++;
	}
	finished = mytime ();

	print (L"Library: memset ");
	calculate_result (NT_SIZE, NT_SIZE2, finished-started);
	newline ();

	flush ();

	//--------------------------------------
	// memcpy: copy the filled buffer into the second one.
	//
	started = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memcpy (copy_buf, fill_buf, NT_SIZE);
		pass++;
	}
	finished = mytime ();

	print (L"Library: memcpy ");
	calculate_result (NT_SIZE, NT_SIZE2, finished-started);
	newline ();

	flush ();

	free (fill_buf);
	free (copy_buf);
}

//----------------------------------------------------------------------------
// Name:	network_test_core
// Purpose:	Performs the network test, talking to and receiving data
//		back from a transponder node.
// Note:	The port number is taken from the global network_port.
// Returns:	False on error; true on success, with the send and receive
//		durations stored through the two output pointers.
+//---------------------------------------------------------------------------- +bool +network_test_core (const char *hostname, char *chunk, + unsigned long chunk_size, + unsigned long n_chunks, + long *duration_send_return, + long *duration_recv_return) +{ + if (!hostname || !chunk || !n_chunks || !chunk_size || + !duration_send_return || + !duration_recv_return) + return false; + + struct hostent* host = gethostbyname (hostname); + if (!host) + return false; + + char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list); + int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(host_ip); + addr.sin_port = htons(network_port); + + if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) + { + // perror ("connect"); + close (sock); + return false; + } + + //------------------------------------ + // Start stopwatch just before the send. + // It will be stopped on receipt of + // the response. + // + unsigned long t0 = mytime (); + + //------------------------------------ + // Put # of chunks in the chunk. + // Send all of our data. + // + sprintf (chunk, "%lu\n", n_chunks); + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, chunk_size, 0); + +#if 0 + //------------------------------------ + // Set nonblocking mode. + // + int opt = 1; + ioctl (sock, FIONBIO, &opt); +#endif + + unsigned long t1 = mytime (); + + //------------------------------------ + // Read the response. + // + int amount = recv (sock, chunk, chunk_size, 0); + if (amount < 16) { + close (sock); + return false; + } + + unsigned long duration_send = mytime() - t0; + + //------------------------------------ + // Validate the response, which + // contains the transponder's + // perceived read duration. This value + // may be as little as half our number. 
+ // + unsigned long duration2 = -1; + if (strncmp ("OK: ", chunk, 4)) { + close (sock); + return false; + } + if (1 != sscanf (4+chunk, "%lu", &duration2)) { + close (sock); + return false; + } + + unsigned long remaining = chunk_size * n_chunks - amount; + while (remaining > 0) { + int amount = recv (sock, chunk, chunk_size, 0); + if (amount <= 0) { + perror ("recv"); + close (sock); + return false; + } + remaining -= amount; + } + + unsigned long duration_recv = mytime () - t1; + + *duration_send_return = duration_send; + *duration_recv_return = duration_recv; + + close (sock); + return true; +} + +//---------------------------------------------------------------------------- +// Name: ip_to_str +//---------------------------------------------------------------------------- +void +ip_to_str (unsigned long addr, char *str) +{ + if (!str) + return; + + unsigned short a = 0xff & addr; + unsigned short b = 0xff & (addr >> 8); + unsigned short c = 0xff & (addr >> 16); + unsigned short d = 0xff & (addr >> 24); + sprintf (str, "%u.%u.%u.%u", a,b,c,d); +} + +//---------------------------------------------------------------------------- +// Name: network_transponder +// Purpose: Act as a transponder, receiving chunks of data and sending +// back an acknowledgement once the enture chunk is read. +// Returns: False if a problem occurs setting up the network socket. +//---------------------------------------------------------------------------- +bool +network_transponder () +{ + struct sockaddr_in sin, from; + + //------------------------------ + // Get listening socket for port. + // Then listen on given port#. 
+ // + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(network_port); + int listensock; + if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + return false; + } + if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { + perror ("bind"); + close (listensock); + return false; + } + if (listen (listensock, 500) < 0) { + perror ("listen"); + close (listensock); + return false; + } + + bool done = false; + while (!done) { + //---------------------------------------- + // Wait for a client to contact us. + // + socklen_t len = sizeof (struct sockaddr); + int sock = accept (listensock, (struct sockaddr*) &from, &len); + if (sock < 0) { + perror ("accept"); + close (listensock); + return false; + } + + //---------------------------------------- + // Clockwatch starts when we accept the + // connection. + // + unsigned long t0 = mytime (); + + if (len != sizeof (struct sockaddr_in)) { + close (sock); + close (listensock); + return false; + } + +#if 0 + unsigned long ipaddr = from.sin_addr.s_addr; + char ipstring[30]; + ip_to_str (ipaddr, ipstring); + fprintf (stderr, "Incoming connection from %s\n", ipstring); +#endif + + //---------------------------------------- + // Read the first chunk only, in order to + // get the # of bytes that will be sent. + // + char chunk [NETWORK_CHUNK_SIZE+1]; + long n_chunks = 0; + int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + chunk [amount_read] = 0; + if (1 != sscanf (chunk, "%ld", &n_chunks)) { + close (sock); + close (listensock); + return false; + } + + //---------------------------------------- + // If the leader sends us a chunk count of + // -99, this indicates that we should exit. 
+ // + if (n_chunks == -99) { + close (sock); + close (listensock); + return true; + } + +// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); + + unsigned long long remaining = n_chunks; + remaining *= NETWORK_CHUNK_SIZE; + +// printf ("remaining="); dump_hex64(remaining); puts(""); + + remaining -= amount_read; + while (remaining > 0) { + amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + remaining -= amount_read; + + if (amount_read < 0) { + perror ("read"); + break; + } else + if (!amount_read) + break; + } + + unsigned long duration = mytime() - t0; + + //------------------------------------ + // Send response of same size. + // + sprintf (chunk, "OK: %lu\n", duration); + chunk[14] = '\n'; + + //------------------------------------ + // Send all of our data. + // + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, NETWORK_CHUNK_SIZE, 0); + + close (sock); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: network_test +//---------------------------------------------------------------------------- +bool +network_test (char **destinations, int n_destinations) +{ + int i; + + //---------------------------------------- + // The memory chunk starts with a 12-byte + // length of the overall send size. + // The memory chunk will have a list of + // the destinations in it. + // In future, there will be a mechanism + // for testing bandwidth between all nodes, + // not just the leader & each of the + // transponders. + // + char chunk [NETWORK_CHUNK_SIZE]; + memset (chunk, 0, NETWORK_CHUNK_SIZE); + sprintf (chunk, "000000000000\n%d\n", n_destinations); + for (i = 0; i < n_destinations; i++) { + char *s = destinations [i]; + int chunk_len = strlen (chunk); + int len = strlen (s); + if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { + //---------------------------------------- + // "transp" indicates that the given node + // has not yet been a leader. 
+ // In future, "done" will indicate it has. + // + sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); + } + } + + static unsigned long colors [] = { + RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE, + RGB_BLACK, RGB_CORAL, + RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE, + RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW, + RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE, + RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET, + RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD + }; +#define NCOLORS (sizeof(colors)/sizeof(unsigned long)) + + //---------------------------------------- + // For each destination, run the test. + // + for (i = 0; i < n_destinations; i++) { + bool problem = false; + + char *hostname = destinations[i]; + printf ("Bandwidth sending to %s:\n", hostname); + + char title [PATH_MAX]; + sprintf (title, "%s send (solid)", hostname); + BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY); + + //---------------------------------------- + // Cache the receive durations for later. + // + unsigned long recv_rates [NETSIZE_MAX]; + int recv_ix = 0; + + //---------------------------------------- + // Send data of increasing sizes. + // + int j = NETSIZE_MIN; + int n_runs = 64; + while (!problem && j <= NETSIZE_MAX) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + + if (!amt_to_send) // unlikely + break; + + //---------------------------------------- + // Send the data; do this n_runs times. + // + unsigned long long total_duration_send = 0; + unsigned long long total_duration_recv = 0; + + int k = n_runs; + while (k--) { + long duration_send, duration_recv; + + if (! 
network_test_core (hostname, + chunk, NETWORK_CHUNK_SIZE, chunk_count, + &duration_send, &duration_recv)) + { + problem = true; + fprintf (stderr, "\nCan't connect to %s\n", hostname); + break; + } + + total_duration_send += duration_send; + total_duration_recv += duration_recv; + } + + if (problem) + break; + + total_duration_send += n_runs/2; // Round up + total_duration_send /= n_runs; // Get average + long duration = (long) total_duration_send; + + total_duration_recv += n_runs/2; // Round up + total_duration_recv /= n_runs; // Get average + + unsigned long amt_in_kb = amt_to_send / 1024; + unsigned long amt_in_mb = amt_to_send / 1048576; + if (!amt_in_mb) { + printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb, + n_runs); + } else { + printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb, + n_runs); + } + + //------------------------------ + // Calculate send rate in MB/sec. + // + // Get total # bytes. + unsigned long long tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= duration; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + unsigned long whole = tmp / 100; + unsigned long frac = tmp % 100; + printf ("%lu.%02lu MB/s (sent)\t", whole, frac); + fflush (stdout); + + BMPGraphing_add_point (graph, amt_in_kb, tmp); + + //------------------------------ + // Calculate recv rate in MB/sec. + // + // Get total # bytes. + tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= total_duration_recv; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + whole = tmp / 100; + frac = tmp % 100; + printf ("%lu.%02lu MB/s (received)\n", whole, frac); + + recv_rates [recv_ix++] = tmp; + + j++; + n_runs >>= 1; + if (!n_runs) + n_runs = 1; + } + + //---------------------------------------- + // Now add the line for the receive rates. + // + sprintf (title, "%s receive (dashed)", hostname); + BMPGraphing_new_line (graph, title, DASHED | + (i < NCOLORS? 
colors[i] : RGB_GRAY)); + for (j = NETSIZE_MIN; j <= NETSIZE_MAX; j++) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + unsigned long amt_in_kb = amt_to_send / 1024; +// printf ("amt_in_kb=%ld\n",amt_in_kb); + + BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j-NETSIZE_MIN]); + } + + puts (""); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: usage +//---------------------------------------------------------------------------- +void +usage () +{ + printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n"); + printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...] [--port <port#>]\n"); + printf ("Usage for receiving network tests: bandwidth --transponder [--port <port#>]\n"); + + exit (0); +} + +//---------------------------------------------------------------------------- +// Name: main +//---------------------------------------------------------------------------- +int +main (int argc, char **argv) +{ + int i, chunk_size; + + --argc; + ++argv; + + bool network_mode = false; + bool network_leader = false; // false => transponder + int network_destinations_size = 0; + int n_network_destinations = 0; + char **network_destinations = NULL; + + char graph_title [512] = {0}; + + i = 0; + while (i < argc) { + char *s = argv [i++]; + + if (!strcmp ("--network", s)) { + network_mode = true; + network_leader = true; + network_destinations_size = 20; + network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); + } + else + if (!strcmp ("--transponder", s)) { + network_mode = true; + } + else + if (!strcmp ("--port", s)) { + if (i != argc) + network_port = atoi (argv[i++]); + } + else + if (!strcmp ("--slow", s)) { + usec_per_test=20000000; // 20 seconds per test. 
+ } + else + if (!strcmp ("--fast", s)) { + usec_per_test = 500000; // 0.5 seconds per test. + } + else + if (!strcmp ("--faster", s)) { + usec_per_test = 50000; // 0.05 seconds per test. + } + else + if (!strcmp ("--fastest", s)) { + usec_per_test = 5000; // 0.005 seconds per test. + } + else + if (!strcmp ("--nosse2", s)) { + use_sse2 = false; + use_sse4 = false; + } + else + if (!strcmp ("--nosse4", s)) { + use_sse4 = false; + } + else + if (!strcmp ("--help", s)) { + usage (); + } + else + if (!strcmp ("--title", s) && i != argc) { + snprintf (graph_title, 511, "%s", argv[i++]); + } + else { + if ('-' == *s) + usage (); + } + } + + msg[0] = 0; + + for (i = 0; chunk_sizes[i] && i < sizeof(chunk_sizes)/sizeof(int); i++) { + chunk_sizes_log2[i] = log2 (chunk_sizes[i]); + } + + printf ("This is bandwidth version %s.\n", RELEASE); + printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n"); + printf ("This software is covered by the GNU Public License.\n"); + printf ("It is provided AS-IS, use at your own risk.\n"); + printf ("See the file COPYING for more information.\n\n"); + fflush (stdout); + + //---------------------------------------- + // If network mode selected, enter it now. + // Currently cannot combine memory tests + // & network tests. 
+ // + if (network_mode) { + if (network_leader) { + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR); + strcpy (graph_title, TITLE_MEMORY_NET); + BMPGraphing_set_title (graph, graph_title); + + network_test (network_destinations, n_network_destinations); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "network_bandwidth.bmp"); + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) + puts ("Wrote graph to network_bandwidth.bmp."); + puts (""); + puts ("Done."); +#endif + BMPGraphing_destroy (graph); + } else { + network_transponder (); + } + + return 0; + } + + uint32_t ecx = get_cpuid1_ecx (); + uint32_t edx = get_cpuid1_edx (); + cpu_has_mmx = edx & CPUID_EDX_MMX; + cpu_has_sse = edx & CPUID_EDX_SSE; + cpu_has_sse2 = edx & CPUID_EDX_SSE2; + cpu_has_sse3 = ecx & CPUID_ECX_SSE3; + cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3; + cpu_has_sse41 = ecx & CPUID_ECX_SSE41; + cpu_has_sse42 = ecx & CPUID_ECX_SSE42; + cpu_has_aes = ecx & CPUID_ECX_AES; + cpu_has_avx = ecx & CPUID_ECX_AVX; + cpu_has_avx2 = 0; + + if (cpu_has_avx) { + cpu_has_avx2 = get_cpuid7_ebx (); + cpu_has_avx2 &= CPUID_EBX_AVX2; + } + + use_sse2 = true; + use_sse4 = true; + + cpu_has_sse4a = 0; + cpu_has_64bit = 0; + cpu_has_xd = 0; + + static char family [17]; + get_cpuid_family (family); + family [16] = 0; + printf ("CPU family: %s\n", family); + + uint32_t ecx2 = get_cpuid_80000001_ecx (); + uint32_t edx2 = get_cpuid_80000001_edx (); + + if (!strcmp ("AuthenticAMD", family)) { + is_amd = true; + cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A; + } + else + if (!strcmp ("GenuineIntel", family)) { + is_intel = true; + } + + cpu_has_xd = edx2 & CPUID_EDX_XD; + cpu_has_64bit = edx2 & CPUID_EDX_INTEL64; + + printf ("CPU features: "); + if (cpu_has_mmx) printf ("MMX "); + if (cpu_has_sse) printf ("SSE "); + if (cpu_has_sse2) printf ("SSE2 "); + if (cpu_has_sse3) printf ("SSE3 "); + if (cpu_has_ssse3) printf ("SSSE3 "); + if (cpu_has_sse4a) printf ("SSE4A "); + if 
(cpu_has_sse41) printf ("SSE4.1 "); + if (cpu_has_sse42) printf ("SSE4.2 "); + if (cpu_has_aes) printf ("AES "); + if (cpu_has_avx) printf ("AVX "); + if (cpu_has_avx2) printf ("AVX2 "); + if (cpu_has_xd) printf ("XD "); + if (cpu_has_64bit) { + if (!is_amd) + printf ("Intel64 "); + else + printf ("LongMode "); + } + puts ("\n"); + + if (is_intel) { + uint32_t cache_info[4]; + i = 0; + while (1) { + get_cpuid_cache_info (cache_info, i); + if (!(cache_info[0] & 31)) + break; + +#if 0 + printf ("Cache info %d = 0x%08x, 0x%08x, 0x%08x, 0x%08x\n", i, + cache_info [0], + cache_info [1], + cache_info [2], + cache_info [3]); +#endif + printf ("Cache %d: ", i); + switch ((cache_info[0] >> 5) & 7) { + case 1: printf ("L1 "); break; + case 2: printf ("L2 "); break; + case 3: printf ("L3 "); break; + } + switch (cache_info[0] & 31) { + case 1: printf ("data cache, "); break; + case 2: printf ("instruction cache, "); break; + case 3: printf ("unified cache, "); break; + } + uint32_t n_ways = 1 + (cache_info[1] >> 22); + uint32_t line_size = 1 + (cache_info[1] & 2047); + uint32_t n_sets = 1 + cache_info[2]; + printf ("line size %d, ", line_size); + printf ("%2d-way%s, ", n_ways, n_ways>1 ? "s" : ""); + printf ("%5d sets, ", n_sets); + unsigned size = (n_ways * line_size * n_sets) >> 10; + printf ("size %dk ", size); + puts (""); + i++; + } + } + + if (!cpu_has_sse41) + use_sse4 = false; + if (!cpu_has_sse2) + use_sse2 = false; + + println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B."); + + flush (); + + //------------------------------------------------------------ + // Attempt to obtain information about the CPU. 
+ // +#ifdef __linux__ + struct stat st; + if (!stat ("/proc/cpuinfo", &st)) { +#define TMPFILE "/tmp/bandw_tmp" + unlink (TMPFILE); + if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) + perror ("system"); + + FILE *f = fopen (TMPFILE, "r"); + if (f) { + float cpu_speed = 0.0; + + if (1 == fscanf (f, "%g", &cpu_speed)) { + puts (""); + printf ("CPU speed is %g MHz.\n", cpu_speed); + } + fclose (f); + } + } else { + printf ("CPU information is not available (/proc/cpuinfo).\n"); + } + fflush (stdout); +#endif + + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2); + strcpy (graph_title, TITLE_MEMORY_GRAPH); + BMPGraphing_set_title (graph, graph_title); + + //------------------------------------------------------------ + // SSE2 sequential reads. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential reads. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random reads. 
+ // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do not bypass the caches. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential reads that do bypass the caches. 
+ // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE4 random reads that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential writes that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do bypass the caches. + // Currently on Intel CPUs (including Xeon) there is a + // microcode bug that leads to a severe drop in performance + // in this part of the test. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 random writes that bypass the caches. 
+ // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE); +#endif + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN); +#else + BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN); +#endif + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 writes. + // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN); +#endif + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 writes. 
+ // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN); +#else + BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN); +#endif + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential copy. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_copy (chunk_size, SSE2); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential copy. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_copy (chunk_size, AVX); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + +#ifdef DOING_LODS +#ifdef __x86_64__ + //------------------------------------------------------------ + // LODSQ 64-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSQ, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // LODSD 32-bit sequential reads. 
+ // + BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSD, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSW 16-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSW, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSB 8-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSB, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // Register to register. + // + newline (); + register_test (); + + //------------------------------------------------------------ + // Stack to/from register. + // + newline (); + stack_test (); + + //------------------------------------------------------------ + // C library performance. + // + newline (); + library_test (); + + //------------------------------------------------------------ + // Framebuffer read & write. 
+ // +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) + newline (); + fb_readwrite (true); +#endif + +premature_end_for_testing: + flush (); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "bandwidth.bmp"); + + puts ("\nWrote graph to bandwidth.bmp."); + puts (""); + puts ("Done."); + + BMPGraphing_destroy (graph); + + return 0; +} diff --git a/minifont.c b/minifont.c new file mode 100755 index 0000000..8aa939c --- /dev/null +++ b/minifont.c @@ -0,0 +1,845 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *============================================================================*/ + +#include <stdio.h> + +#include "BMP.h" + +// Mini characters, 8 pixels high. 
+static const char *mini_chars_ [] = +{ + "#", + "#", + "#", + "#", + "#", + " ", + "#", + "", + + "## ##", + " # #", + "# #", + " ", + " ", + " ", + " ", + "", + + " # # ", + " # # ", + "#####", + " # # ", + "#####", + " # # ", + " # # ", + "", + + " # ", + " ####", + "# # ", + " ### ", + " # #", + "####", + " # ", + "", + + "## #", + " #", + " #", + " #", + " #", + "#", + "# ##", + "", + + " # ", + "# # ", + "## ", + " ## #", + "# ## ", + "# # ", + " ## #", + "", + + "##", + " #", + "#", + "", + "", + "", + "", + "", + + " #", + "#", + "#", + "#", + "#", + "#", + "#", + " #", + + "# ", + " #", + " #", + " #", + " #", + " #", + " #", + "#", + + " ", + "# # #", + " ###", + " #", + " ###", + "# # #", + "", + "", + + " ", + " #", + " #", + "#####", + " #", + " #", + "", + "", + + " ", + "", + "", + "", + "", + "##", + " #", + "#", + + " ", + "", + "", + "#####", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "#", + "", + + " #", + " #", + " #", + " #", + " #", + "#", + "#", + "", + + " ## ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ## ", + "", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + "", + + " ## ", + "# #", + " #", + " ###", + "# ", + "# ", + "####", + "", + + "####", + " #", + " # ", + " ## ", + " #", + "# #", + " ## ", + "", + + "# # ", + "# #", + "# #", + "####", + " #", + " #", + " #", + "", + + "####", + "# ", + "### ", + " #", + " #", + "# #", + " ## ", + "", + + " ## ", + "# ", + "# ", + "### ", + "# #", + "# #", + " ## ", + "", + + "####", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + "", + + " ## ", + "# #", + "# #", + " ## ", + "# #", + "# #", + " ## ", + "", + + " ## ", + "# #", + "# #", + " ###", + " #", + " # ", + " # ", + "", + + " ", + "", + "", + "#", + "", + "#", + "", + "", + + " ", + "", + " ", + "##", + " ", + "##", + " #", + "#", + + " #", + " #", + " #", + "#", + " #", + " #", + " #", + "", + + " ", + "", + "", + "#####", + " ", + "#####", + "", + "", + + "# ", + " #", + " #", + " #", + " #", 
+ " #", + "#", + "", + + " ### ", + "# #", + " #", + " ## ", + " #", + "", + " #", + "", + + " ### ", + "# #", + "# ##", + "# # #", + "# ##", + "# ", + " ###", + "", + + " # ", + " # # ", + "# #", + "# #", + "#####", + "# #", + "# #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# #", + "# #", + "####", + "", + + " ### ", + "# #", + "# ", + "# ", + "# ", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "# #", + "# #", + "# #", + "####", + "", + + "#####", + "#", + "#", + "###", + "#", + "#", + "#####", + "", + + "#####", + "# ", + "# ", + "###", + "# ", + "# ", + "#", + "", + + " ### ", + "# #", + "# ", + "# ##", + "# #", + "# #", + " ####", + "", + + "# #", + "# #", + "# #", + "#####", + "# #", + "# #", + "# #", + "", + + "###", + " #", + " #", + " #", + " #", + " #", + "###", + "", + + " ###", + " #", + " #", + " #", + " #", + "# #", + " ##", + "", + + "# #", + "# #", + "# #", + "##", + "# #", + "# #", + "# #", + "", + + "# ", + "#", + "#", + "#", + "#", + "#", + "#####", + "", + + "# #", + "## ##", + "# # #", + "# #", + "# #", + "# #", + "# #", + "", + + "# #", + "## #", + "# # #", + "# ##", + "# #", + "# #", + "# #", + "", + + " ### ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# ", + "# ", + "# ", + "", + + " ### ", + "# #", + "# #", + "# #", + "# # #", + "# # ", + " ## #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# # ", + "# # ", + "# #", + "", + + " ### ", + "# #", + "# ", + " ### ", + " #", + "# #", + " ###", + "", + + "#####", + " #", + " #", + " #", + " #", + " #", + " #", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + " # # ", + " #", + "", + + "# #", + "# #", + "# #", + "# # #", + "# # #", + "## ##", + "# #", + "", + + "# #", + "# #", + " # #", + " #", + " # #", + "# #", + "# #", + "", + + "# #", + "# #", + "# #", + " # #", + " #", + " #", + " #", + "", + + "#####", + " #", + " #", + " 
#", + " #", + "#", + "#####", + "", + + "##", + "#", + "#", + "#", + "#", + "#", + "#", + "##", + + "# ", + "#", + " #", + " #", + " #", + " #", + " #", + "", + + "##", + " #", + " #", + " #", + " #", + " #", + " #", + "##", + + " # ", + " # #", + "# #", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "", + "####", + + "##", + "#", + " #", + "", + "", + "", + "", + "", + + " ", + " ", + " ## ", + " #", + " ###", + "# #", + " ###", + "", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "### ", + "", + + " ", + " ", + " ###", + "# ", + "# ", + "# ", + " ###", + "", + + " #", + " #", + " ###", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + " ", + " ## ", + "# #", + "####", + "# ", + " ###", + "", + + " ##", + " # ", + "### ", + " # ", + " # ", + " # ", + "### ", + "", + + " ", + " ", + " ###", + "# #", + "# #", + " ###", + " #", + "### ", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "# #", + "", + + " # ", + " ", + "## ", + " # ", + " # ", + " # ", + "###", + "", + + " #", + " ", + " ##", + " #", + " #", + " #", + " #", + "## ", + + "# ", + "# ", + "# #", + "# # ", + "## ", + "# # ", + "# #", + "", + + "## ", + " # ", + " # ", + " # ", + " # ", + " # ", + "###", + "", + + " ", + "", + "####", + "# # #", + "# # #", + "# # #", + "# # #", + "", + + " ", + " ", + "###", + "# #", + "# #", + "# #", + "# #", + "", + + " ", + " ", + " ## ", + "# #", + "# #", + "# #", + " ## ", + "", + + " ", + "", + "###", + "# #", + "# #", + "###", + "#", + "#", + + " ", + "", + " ###", + "# #", + "# #", + " ###", + " #", + " # ", + + " ", + " ", + "# ##", + "## ", + "# ", + "# ", + "# ", + "", + + " ", + " ", + " ###", + "# ", + " ##", + " #", + "### ", + "", + + " # ", + " #", + "###", + " #", + " #", + " #", + " ##", + "", + + " ", + "", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + "", + "# #", + "# #", + "# #", + " # #", + " #", + "", + + " ", + "", + "# # #", + "# # #", + "# # #", + "# # #", + " # #", + "", + + " ", + 
"", + "# #", + " # #", + " #", + " # #", + "# #", + "", + + " ", + " ", + "# #", + "# #", + "# #", + " ###", + " #", + "### ", + + " ", + "", + "#####", + " #", + " #", + " # ", + "#####", + "", + +}; + +const char **get_minifont_chars () +{ + return mini_chars_; +} + diff --git a/minifont.h b/minifont.h new file mode 100755 index 0000000..a26edb9 --- /dev/null +++ b/minifont.h @@ -0,0 +1,28 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. 
+ *============================================================================*/ + +#ifndef _MINIFONT_H +#define _MINIFONT_H + +extern const char **get_minifont_chars (void); + +#endif + diff --git a/output/._Celeron-2.8GHz-slow.gif b/output/._Celeron-2.8GHz-slow.gif Binary files differnew file mode 100755 index 0000000..bdd6833 --- /dev/null +++ b/output/._Celeron-2.8GHz-slow.gif diff --git a/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png Binary files differnew file mode 100755 index 0000000..826edc9 --- /dev/null +++ b/output/._Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png diff --git a/output/._Corei5-520M-MacOSXLion-32bit-slow.gif b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif Binary files differnew file mode 100755 index 0000000..6fdd000 --- /dev/null +++ b/output/._Corei5-520M-MacOSXLion-32bit-slow.gif diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif Binary files differnew file mode 100755 index 0000000..83d30f5 --- /dev/null +++ b/output/._Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif diff --git a/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif Binary files differnew file mode 100755 index 0000000..5ffb9aa --- /dev/null +++ b/output/._Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif diff --git a/output/Celeron-2.8GHz-slow.gif b/output/Celeron-2.8GHz-slow.gif Binary files differnew file mode 100755 index 0000000..6d89c32 --- /dev/null +++ b/output/Celeron-2.8GHz-slow.gif diff --git a/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png Binary files differnew file mode 100755 index 0000000..d8d268e --- /dev/null +++ b/output/Corei5-2.6GHz-4288U-MacOSXMavericks-64bit.png diff --git a/output/Corei5-520M-MacOSXLion-32bit-slow.gif 
b/output/Corei5-520M-MacOSXLion-32bit-slow.gif Binary files differnew file mode 100755 index 0000000..364adf7 --- /dev/null +++ b/output/Corei5-520M-MacOSXLion-32bit-slow.gif diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif Binary files differnew file mode 100755 index 0000000..4ce6d5d --- /dev/null +++ b/output/Corei5-520M-MacOSXLion-64bit-slow-Crucial-SDRAM.gif diff --git a/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif Binary files differnew file mode 100755 index 0000000..d38a120 --- /dev/null +++ b/output/Corei5-520M-MacOSXLion-64bit-slow-Samsung-SDRAM.gif diff --git a/routines32.asm b/routines32.asm new file mode 100755 index 0000000..44015d9 --- /dev/null +++ b/routines32.asm @@ -0,0 +1,2960 @@ +;============================================================================ +; bandwidth 0.32, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2014 by Zack T Smith. +; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at veritas@comcast.net. 
+;============================================================================= + +bits 32 +cpu ia64 + +global ReaderLODSQ +global _ReaderLODSQ + +global ReaderLODSD +global _ReaderLODSD + +global ReaderLODSW +global _ReaderLODSW + +global ReaderLODSB +global _ReaderLODSB + +; Cygwin requires the underbar-prefixed symbols. +global _WriterSSE2 +global WriterSSE2 + +global _WriterAVX +global WriterAVX + +global _WriterSSE2_128bytes +global WriterSSE2_128bytes + +global _ReaderAVX +global ReaderAVX + +global _ReaderSSE2 +global ReaderSSE2 + +global ReaderSSE2_bypass +global _ReaderSSE2_bypass + +global _ReaderSSE2_128bytes +global ReaderSSE2_128bytes + +global ReaderSSE2_128bytes_bypass +global _ReaderSSE2_128bytes_bypass + +global _RandomReaderSSE2 +global RandomReaderSSE2 + +global _RandomReaderSSE2_bypass +global RandomReaderSSE2_bypass + +global WriterAVX_bypass +global _WriterAVX_bypass + +global _WriterSSE2_bypass +global WriterSSE2_bypass + +global _WriterSSE2_128bytes_bypass +global WriterSSE2_128bytes_bypass + +global _RandomWriterSSE2_bypass +global RandomWriterSSE2_bypass + +global Reader +global _Reader + +global Writer +global _Writer + +global Reader_128bytes +global _Reader_128bytes + +global Writer_128bytes +global _Writer_128bytes + +global RandomReader +global _RandomReader + +global RandomWriter +global _RandomWriter + +global RandomWriterSSE2 +global _RandomWriterSSE2 + +global get_cpuid_family +global _get_cpuid_family + +global get_cpuid_cache_info +global _get_cpuid_cache_info + +global get_cpuid1_ecx +global _get_cpuid1_ecx + +global get_cpuid1_edx +global _get_cpuid1_edx + +global get_cpuid7_ebx +global _get_cpuid7_ebx + +global get_cpuid_80000001_ecx +global _get_cpuid_80000001_ecx + +global get_cpuid_80000001_edx +global _get_cpuid_80000001_edx + +global CopySSE +global _CopySSE + +global CopyAVX +global _CopyAVX + +global CopySSE_128bytes +global _CopySSE_128bytes + +global RegisterToRegister +global _RegisterToRegister + +global 
VectorToVector +global _VectorToVector + +global VectorToVectorAVX +global _VectorToVectorAVX + +global RegisterToVector +global _RegisterToVector + +global VectorToRegister +global _VectorToRegister + +global Register8ToVector +global Register16ToVector +global Register32ToVector +global Register64ToVector +global Vector8ToRegister +global Vector16ToRegister +global Vector32ToRegister +global Vector64ToRegister + +global _Register8ToVector +global _Register16ToVector +global _Register32ToVector +global _Register64ToVector +global _Vector8ToRegister +global _Vector16ToRegister +global _Vector32ToRegister +global _Vector64ToRegister + +global StackReader +global _StackReader + +global StackWriter +global _StackWriter + + section .text + +;------------------------------------------------------------------------------ +; Name: ReaderLODSQ +; Purpose: Placeholder for the LODSQ (64-bit) reader in this 32-bit build. +; LODSQ is a 64-bit-mode instruction, so the routine does nothing +; and returns immediately. +; Params: None are read. +; Returns: Nothing. +;------------------------------------------------------------------------------ + align 32 +ReaderLODSQ: +_ReaderLODSQ: + ; N/A + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSD +; Purpose: Reads 32-bit values sequentially from an area of memory +; using LODSD instruction. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSD: +_ReaderLODSD: + shr dword [esp+8], 2 ; length in double words rounded down. 
+ + push esi ; callee-saved; used as the LODS source pointer + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] ; edx = loops remaining +.L1: + mov esi, [esp+4+12] ; restart at the beginning of the area + mov ecx, [esp+8+12] ; ecx = number of double words + + rep lodsd + + dec edx + jnz .L1 + + pop edx + pop ecx + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSW +; Purpose: Reads 16-bit values sequentially from an area of memory +; using LODSW instruction. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSW: +_ReaderLODSW: + shr dword [esp+8], 1 ; length in words rounded down. + + push esi ; callee-saved; used as the LODS source pointer + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] ; edx = loops remaining +.L1: + mov esi, [esp+4+12] ; restart at the beginning of the area + mov ecx, [esp+8+12] ; ecx = number of words + + rep lodsw + + dec edx + jnz .L1 + + pop edx + pop ecx + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSB +; Purpose: Reads 8-bit values sequentially from an area of memory +; using LODSB instruction. 
+; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSB: +_ReaderLODSB: + push esi ; callee-saved; used as the LODS source pointer + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] ; edx = loops remaining +.L1: + mov esi, [esp+4+12] ; restart at the beginning of the area + mov ecx, [esp+8+12] ; ecx = number of bytes + + rep lodsb + + dec edx + jnz .L1 + + pop edx + pop ecx + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: Reader +; Purpose: Reads 32-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +Reader: +_Reader: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + + mov edx, [esp+4+12] ; ptr to memory chunk. 
+ mov ebx, edx ; ebx = limit in memory + add ebx, [esp+8+12] + +.L1: + mov edx, [esp+4+12] + +.L2: + mov eax, [edx] + mov eax, [4+edx] + mov eax, [8+edx] + mov eax, [12+edx] + mov eax, [16+edx] + mov eax, [20+edx] + mov eax, [24+edx] + mov eax, [28+edx] + mov eax, [32+edx] + mov eax, [36+edx] + mov eax, [40+edx] + mov eax, [44+edx] + mov eax, [48+edx] + mov eax, [52+edx] + mov eax, [56+edx] + mov eax, [60+edx] + mov eax, [64+edx] + mov eax, [68+edx] + mov eax, [72+edx] + mov eax, [76+edx] + mov eax, [80+edx] + mov eax, [84+edx] + mov eax, [88+edx] + mov eax, [92+edx] + mov eax, [96+edx] + mov eax, [100+edx] + mov eax, [104+edx] + mov eax, [108+edx] + mov eax, [112+edx] + mov eax, [116+edx] + mov eax, [120+edx] + mov eax, [124+edx] + + mov eax, [edx+128] + mov eax, [edx+132] + mov eax, [edx+136] + mov eax, [edx+140] + mov eax, [edx+144] + mov eax, [edx+148] + mov eax, [edx+152] + mov eax, [edx+156] + mov eax, [edx+160] + mov eax, [edx+164] + mov eax, [edx+168] + mov eax, [edx+172] + mov eax, [edx+176] + mov eax, [edx+180] + mov eax, [edx+184] + mov eax, [edx+188] + mov eax, [edx+192] + mov eax, [edx+196] + mov eax, [edx+200] + mov eax, [edx+204] + mov eax, [edx+208] + mov eax, [edx+212] + mov eax, [edx+216] + mov eax, [edx+220] + mov eax, [edx+224] + mov eax, [edx+228] + mov eax, [edx+232] + mov eax, [edx+236] + mov eax, [edx+240] + mov eax, [edx+244] + mov eax, [edx+248] + mov eax, [edx+252] + + add edx, 256 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Writer +; Purpose: Writes 32-bit value sequentially to an area of memory. 
+; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +Writer: +_Writer: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] + mov eax, [esp+16+12] + + mov edx, [esp+4+12] ; edx = ptr to chunk + mov ebx, edx + add ebx, [esp+8+12] ; ebx = limit in memory + +.L1: + mov edx, [esp+4+12] + +.L2: + mov [edx], eax + mov [4+edx], eax + mov [8+edx], eax + mov [12+edx], eax + mov [16+edx], eax + mov [20+edx], eax + mov [24+edx], eax + mov [28+edx], eax + mov [32+edx], eax + mov [36+edx], eax + mov [40+edx], eax + mov [44+edx], eax + mov [48+edx], eax + mov [52+edx], eax + mov [56+edx], eax + mov [60+edx], eax + mov [64+edx], eax + mov [68+edx], eax + mov [72+edx], eax + mov [76+edx], eax + mov [80+edx], eax + mov [84+edx], eax + mov [88+edx], eax + mov [92+edx], eax + mov [96+edx], eax + mov [100+edx], eax + mov [104+edx], eax + mov [108+edx], eax + mov [112+edx], eax + mov [116+edx], eax + mov [120+edx], eax + mov [124+edx], eax + + mov [edx+128], eax + mov [edx+132], eax + mov [edx+136], eax + mov [edx+140], eax + mov [edx+144], eax + mov [edx+148], eax + mov [edx+152], eax + mov [edx+156], eax + mov [edx+160], eax + mov [edx+164], eax + mov [edx+168], eax + mov [edx+172], eax + mov [edx+176], eax + mov [edx+180], eax + mov [edx+184], eax + mov [edx+188], eax + mov [edx+192], eax + mov [edx+196], eax + mov [edx+200], eax + mov [edx+204], eax + mov [edx+208], eax + mov [edx+212], eax + mov [edx+216], eax + mov [edx+220], eax + mov [edx+224], eax + mov [edx+228], eax + mov [edx+232], eax + mov [edx+236], eax + mov [edx+240], eax + mov [edx+244], eax + mov [edx+248], eax + mov [edx+252], eax + + add edx, 256 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Reader_128bytes +; Purpose: 
;		Reads 32-bit values sequentially from an area of memory.
; Params:
;	[esp+4]	= ptr to memory area
;	[esp+8] = length in bytes
;	[esp+12] = loops
; Notes:
;	Same structure as the 256-byte readers, but each inner pass covers
;	only 128 bytes. The limit is tested after the pass, so at least 128
;	bytes are always read. A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
Reader_128bytes:
_Reader_128bytes:
	push	ebx
	push	ecx
	push	edx

	; Three pushes = 12 bytes, hence "+12" on the argument offsets.
	mov	ecx, [esp+12+12]	; loops to do.

	mov	edx, [esp+4+12]	; ptr to memory chunk.
	mov	ebx, edx	; ebx = limit in memory
	add	ebx, [esp+8+12]

.L1:
	mov	edx, [esp+4+12]	; restart at the beginning of the area

.L2:
	mov	eax, [edx]
	mov	eax, [4+edx]
	mov	eax, [8+edx]
	mov	eax, [12+edx]
	mov	eax, [16+edx]
	mov	eax, [20+edx]
	mov	eax, [24+edx]
	mov	eax, [28+edx]
	mov	eax, [32+edx]
	mov	eax, [36+edx]
	mov	eax, [40+edx]
	mov	eax, [44+edx]
	mov	eax, [48+edx]
	mov	eax, [52+edx]
	mov	eax, [56+edx]
	mov	eax, [60+edx]
	mov	eax, [64+edx]
	mov	eax, [68+edx]
	mov	eax, [72+edx]
	mov	eax, [76+edx]
	mov	eax, [80+edx]
	mov	eax, [84+edx]
	mov	eax, [88+edx]
	mov	eax, [92+edx]
	mov	eax, [96+edx]
	mov	eax, [100+edx]
	mov	eax, [104+edx]
	mov	eax, [108+edx]
	mov	eax, [112+edx]
	mov	eax, [116+edx]
	mov	eax, [120+edx]
	mov	eax, [124+edx]

	add	edx, 128
	cmp	edx, ebx
	jb	.L2		; unsigned compare: continue while below the limit

	sub	ecx, 1
	jnz	.L1

	pop	edx
	pop	ecx
	pop	ebx
	ret


;------------------------------------------------------------------------------
; Name:		Writer_128bytes
; Purpose:	Writes 32-bit value sequentially to an area of memory.
; Params:
;	[esp+4]	= ptr to memory area
;	[esp+8] = length in bytes
;	[esp+12] = loops
;	[esp+16] = long to write
; Notes:
;	Each inner pass stores 128 bytes and tests the limit afterwards, so
;	at least 128 bytes are always written. A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
Writer_128bytes:
_Writer_128bytes:
	push	ebx
	push	ecx
	push	edx

	; Three pushes = 12 bytes, hence "+12" on the argument offsets.
	mov	ecx, [esp+12+12]	; ecx = loops
	mov	eax, [esp+16+12]	; eax = value to store

	mov	edx, [esp+4+12]	; edx = ptr to chunk
	mov	ebx, edx
	add	ebx, [esp+8+12]	; ebx = limit in memory

.L1:
	mov	edx, [esp+4+12]	; restart at the beginning of the area

.L2:
	mov	[edx], eax
	mov	[4+edx], eax
	mov	[8+edx], eax
	mov	[12+edx], eax
	mov	[16+edx], eax
	mov	[20+edx], eax
	mov	[24+edx], eax
	mov	[28+edx], eax
	mov	[32+edx], eax
	mov	[36+edx], eax
	mov	[40+edx], eax
	mov	[44+edx], eax
	mov	[48+edx], eax
	mov	[52+edx], eax
	mov	[56+edx], eax
	mov	[60+edx], eax
	mov	[64+edx], eax
	mov	[68+edx], eax
	mov	[72+edx], eax
	mov	[76+edx], eax
	mov	[80+edx], eax
	mov	[84+edx], eax
	mov	[88+edx], eax
	mov	[92+edx], eax
	mov	[96+edx], eax
	mov	[100+edx], eax
	mov	[104+edx], eax
	mov	[108+edx], eax
	mov	[112+edx], eax
	mov	[116+edx], eax
	mov	[120+edx], eax
	mov	[124+edx], eax

	add	edx, 128
	cmp	edx, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid_cache_info
; Purpose:	Executes CPUID with EAX=4 and a caller-supplied ECX, and
;		stores the four result registers.
; Params:
;	[esp+4] = ptr to a 16-byte buffer; receives EAX, EBX, ECX, EDX
;		  at offsets 0, 4, 8 and 12
;	[esp+8] = value loaded into ECX before CPUID
; Notes:
;	EAX holds the buffer pointer on return.
;	NOTE(review): per the Intel SDM, leaf 4 enumerates cache parameters
;	with ECX as the cache index -- confirm against the caller.
;
get_cpuid_cache_info:
_get_cpuid_cache_info:
	push	ebp
	push	ebx
	push	ecx
	push	edx
	mov	eax, 4
	mov	ecx, [esp + 16 + 4 + 4]	; 2nd argument (16 = four saved registers)
	cpuid
	mov	ebp, eax		; free eax so it can hold the output pointer
	mov	eax, [esp + 16 + 4]	; 1st argument
	mov	[eax], ebp
	mov	[eax+4], ebx
	mov	[eax+8], ecx
	mov	[eax+12], edx
	pop	edx
	pop	ecx
	pop	ebx
	pop	ebp
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid_family
; Purpose:	Executes CPUID with EAX=0 and stores EBX, EDX, ECX (in that
;		order) followed by a zero byte, i.e. a 12-character
;		NUL-terminated string.
; Params:
;	[esp+4] = ptr to a buffer of at least 13 bytes
; Notes:
;	NOTE(review): per the Intel SDM, leaf 0 returns the vendor
;	identification string in EBX:EDX:ECX, so despite its name this
;	routine appears to return the vendor string, not the CPU family.
;
get_cpuid_family:
_get_cpuid_family:
	push	ebx
	push	ecx
	push	edx
	xor	eax, eax
	cpuid
	mov	eax, [esp + 12 + 4]	; 1st argument (12 = three saved registers)
	mov	[eax], ebx
	mov	[eax+4], edx
	mov	[eax+8], ecx
	mov	byte [eax+12], 0	; terminate the string
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid1_ecx
; Purpose:	Returns in EAX the ECX result of CPUID with EAX=1.
;		EBX, ECX and EDX are preserved.
;
get_cpuid1_ecx:
_get_cpuid1_ecx:
	push	ebx
	push	ecx
	push	edx
	mov	eax, 1
	cpuid
	mov	eax, ecx
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid7_ebx
; Purpose:	Returns in EAX the EBX result of CPUID with EAX=7, ECX=0.
;		EBX, ECX and EDX are preserved.
;
get_cpuid7_ebx:
_get_cpuid7_ebx:
	push	ebx
	push	ecx
	push	edx
	mov	eax, 7
	xor	ecx, ecx	; ECX = 0 before CPUID
	cpuid
	mov	eax, ebx
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid_80000001_ecx
; Purpose:	Returns in EAX the ECX result of CPUID with EAX=0x80000001.
;		EBX, ECX and EDX are preserved.
;
get_cpuid_80000001_ecx:
_get_cpuid_80000001_ecx:
	push	ebx
	push	ecx
	push	edx
	mov	eax, 0x80000001
	cpuid
	mov	eax, ecx
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid_80000001_edx
; Purpose:	Returns in EAX the EDX result of CPUID with EAX=0x80000001.
;		EBX, ECX and EDX are preserved.
;
get_cpuid_80000001_edx:
_get_cpuid_80000001_edx:
	push	ebx
	push	ecx
	push	edx
	mov	eax, 0x80000001
	cpuid
	mov	eax, edx
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		get_cpuid1_edx
; Purpose:	Returns in EAX the EDX result of CPUID with EAX=1.
;		EBX, ECX and EDX are preserved.
;
get_cpuid1_edx:
_get_cpuid1_edx:
	push	ebx
	push	ecx
	push	edx
	mov	eax, 1
	cpuid
	mov	eax, edx
	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		ReaderAVX
; Purpose:	Reads 128-bit values sequentially from an area of memory.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; Notes:
;	NOTE(review): the loads target xmm0 (128 bits) but advance 32 bytes
;	each, so only 16 of every 32 bytes are read -- confirm whether ymm0
;	was intended.
;------------------------------------------------------------------------------
	align 64
ReaderAVX:
_ReaderAVX:
	vzeroupper

	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	vmovdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
; ReaderAVX inner loop, continued: 128-bit loads at a 32-byte stride.
	vmovdqa	xmm0, [32+eax]
	vmovdqa	xmm0, [64+eax]
	vmovdqa	xmm0, [96+eax]
	vmovdqa	xmm0, [128+eax]
	vmovdqa	xmm0, [160+eax]
	vmovdqa	xmm0, [192+eax]
	vmovdqa	xmm0, [224+eax]

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		ReaderSSE2
; Purpose:	Reads 128-bit values sequentially from an area of memory.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; Notes:
;	Uses aligned loads (movdqa), so the area must be 16-byte aligned.
;	Each inner pass reads 256 bytes and tests the limit afterwards.
;	A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
ReaderSSE2:
_ReaderSSE2:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
	movdqa	xmm0, [16+eax]
	movdqa	xmm0, [32+eax]
	movdqa	xmm0, [48+eax]
	movdqa	xmm0, [64+eax]
	movdqa	xmm0, [80+eax]
	movdqa	xmm0, [96+eax]
	movdqa	xmm0, [112+eax]

	movdqa	xmm0, [128+eax]
	movdqa	xmm0, [144+eax]
	movdqa	xmm0, [160+eax]
	movdqa	xmm0, [176+eax]
	movdqa	xmm0, [192+eax]
	movdqa	xmm0, [208+eax]
	movdqa	xmm0, [224+eax]
	movdqa	xmm0, [240+eax]

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		ReaderSSE2_bypass
; Purpose:	Reads 128-bit values sequentially from an area of memory
;		using non-temporal loads (movntdqa).
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; Notes:
;	NOTE(review): per the Intel SDM, MOVNTDQA belongs to SSE4.1 rather
;	than SSE2 -- confirm the caller checks for SSE4.1 before use.
;------------------------------------------------------------------------------
	align 64
ReaderSSE2_bypass:
_ReaderSSE2_bypass:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movntdqa xmm0, [eax]	; Read aligned @ 16-byte boundary.
; ReaderSSE2_bypass inner loop, continued: 256 bytes per pass.
	movntdqa xmm0, [16+eax]
	movntdqa xmm0, [32+eax]
	movntdqa xmm0, [48+eax]
	movntdqa xmm0, [64+eax]
	movntdqa xmm0, [80+eax]
	movntdqa xmm0, [96+eax]
	movntdqa xmm0, [112+eax]

	movntdqa xmm0, [128+eax]
	movntdqa xmm0, [144+eax]
	movntdqa xmm0, [160+eax]
	movntdqa xmm0, [176+eax]
	movntdqa xmm0, [192+eax]
	movntdqa xmm0, [208+eax]
	movntdqa xmm0, [224+eax]
	movntdqa xmm0, [240+eax]

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		ReaderSSE2_128bytes_bypass
; Purpose:	Reads 128-bit values sequentially from an area of memory
;		using non-temporal loads (movntdqa), 128 bytes per pass.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; Notes:
;	NOTE(review): per the Intel SDM, MOVNTDQA belongs to SSE4.1 rather
;	than SSE2 -- confirm the caller checks for SSE4.1 before use.
;------------------------------------------------------------------------------
	align 64
ReaderSSE2_128bytes_bypass:
_ReaderSSE2_128bytes_bypass:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movntdqa xmm0, [eax]	; Read aligned @ 16-byte boundary.
	movntdqa xmm0, [16+eax]
	movntdqa xmm0, [32+eax]
	movntdqa xmm0, [48+eax]
	movntdqa xmm0, [64+eax]
	movntdqa xmm0, [80+eax]
	movntdqa xmm0, [96+eax]
	movntdqa xmm0, [112+eax]

	add	eax, 128
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		ReaderSSE2_128bytes
; Purpose:	Reads 128-bit values sequentially from an area of memory.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; Notes:
;	Same as ReaderSSE2 but each inner pass covers 128 bytes.
;------------------------------------------------------------------------------
	align 64
ReaderSSE2_128bytes:
_ReaderSSE2_128bytes:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.
.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
	movdqa	xmm0, [16+eax]
	movdqa	xmm0, [32+eax]
	movdqa	xmm0, [48+eax]
	movdqa	xmm0, [64+eax]
	movdqa	xmm0, [80+eax]
	movdqa	xmm0, [96+eax]
	movdqa	xmm0, [112+eax]

	add	eax, 128
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterAVX
; Purpose:	Writes a replicated 32-bit value sequentially to an area of
;		memory using VEX-encoded aligned stores.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
; Notes:
;	Each inner pass advances 256 bytes and tests the limit afterwards.
;	A loop count of 0 wraps around.
;	NOTE(review): the stores use xmm0 (128 bits) at a 32-byte stride, so
;	only 16 of every 32 bytes are written -- confirm whether ymm0 was
;	intended.
;------------------------------------------------------------------------------
	align 64
WriterAVX:
_WriterAVX:
	vzeroupper

	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	; (Counts above 15 would clear the register instead.)
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	vmovdqa	[eax], xmm0
	vmovdqa	[32+eax], xmm0
	vmovdqa	[64+eax], xmm0
	vmovdqa	[96+eax], xmm0
	vmovdqa	[128+eax], xmm0
	vmovdqa	[160+eax], xmm0
	vmovdqa	[192+eax], xmm0
	vmovdqa	[224+eax], xmm0

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterSSE2
; Purpose:	Write 128-bit values sequentially from an area of memory.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
; Notes:
;	Uses aligned stores (movdqa), so the area must be 16-byte aligned.
;	Each inner pass writes 256 bytes and tests the limit afterwards.
;	A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
WriterSSE2:
_WriterSSE2:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	; (Counts above 15 would clear the register instead.)
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movdqa	[eax], xmm0
	movdqa	[16+eax], xmm0
	movdqa	[32+eax], xmm0
	movdqa	[48+eax], xmm0
	movdqa	[64+eax], xmm0
	movdqa	[80+eax], xmm0
	movdqa	[96+eax], xmm0
	movdqa	[112+eax], xmm0

	movdqa	[128+eax], xmm0
	movdqa	[144+eax], xmm0
	movdqa	[160+eax], xmm0
	movdqa	[176+eax], xmm0
	movdqa	[192+eax], xmm0
	movdqa	[208+eax], xmm0
	movdqa	[224+eax], xmm0
	movdqa	[240+eax], xmm0

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterSSE2_128bytes
; Purpose:	Writes a replicated 32-bit value sequentially to an area of
;		memory with 128-bit stores, 128 bytes per inner pass.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
;------------------------------------------------------------------------------
	align 64
WriterSSE2_128bytes:
_WriterSSE2_128bytes:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.
.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movdqa	[eax], xmm0
	movdqa	[16+eax], xmm0
	movdqa	[32+eax], xmm0
	movdqa	[48+eax], xmm0
	movdqa	[64+eax], xmm0
	movdqa	[80+eax], xmm0
	movdqa	[96+eax], xmm0
	movdqa	[112+eax], xmm0

	add	eax, 128
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterAVX_bypass
; Purpose:	Writes a replicated 32-bit value sequentially to an area of
;		memory using VEX-encoded non-temporal stores (vmovntdq).
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
; Notes:
;	Each inner pass advances 256 bytes and tests the limit afterwards.
;	A loop count of 0 wraps around.
;	NOTE(review): the stores use xmm0 (128 bits) at a 32-byte stride, so
;	only 16 of every 32 bytes are written -- confirm whether ymm0 was
;	intended.
;------------------------------------------------------------------------------

	align 64
WriterAVX_bypass:
_WriterAVX_bypass:
	vzeroupper

	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	; (Counts above 15 would clear the register instead.)
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	vmovntdq [eax], xmm0	; Write bypassing cache.
	vmovntdq [32+eax], xmm0
	vmovntdq [64+eax], xmm0
	vmovntdq [96+eax], xmm0
	vmovntdq [128+eax], xmm0
	vmovntdq [160+eax], xmm0
	vmovntdq [192+eax], xmm0
	vmovntdq [224+eax], xmm0

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterSSE2_bypass
; Purpose:	Write 128-bit values sequentially from an area of memory,
;		bypassing the cache.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
; Notes:
;	Uses non-temporal stores (movntdq); the area must be 16-byte aligned.
;	Each inner pass writes 256 bytes and tests the limit afterwards.
;	A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
WriterSSE2_bypass:
_WriterSSE2_bypass:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	; (Counts above 15 would clear the register instead.)
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movntdq	[eax], xmm0	; Write bypassing cache.
	movntdq	[16+eax], xmm0
	movntdq	[32+eax], xmm0
	movntdq	[48+eax], xmm0
	movntdq	[64+eax], xmm0
	movntdq	[80+eax], xmm0
	movntdq	[96+eax], xmm0
	movntdq	[112+eax], xmm0

	movntdq	[128+eax], xmm0
	movntdq	[144+eax], xmm0
	movntdq	[160+eax], xmm0
	movntdq	[176+eax], xmm0
	movntdq	[192+eax], xmm0
	movntdq	[208+eax], xmm0
	movntdq	[224+eax], xmm0
	movntdq	[240+eax], xmm0

	add	eax, 256
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		WriterSSE2_128bytes_bypass
; Purpose:	Writes a replicated 32-bit value sequentially to an area of
;		memory with non-temporal 128-bit stores, 128 bytes per pass.
; Params:	[esp+4] = ptr to memory area
; 		[esp+8] = length in bytes
; 		[esp+12] = loops
; 		[esp+16] = 32-bit value to replicate and write
;------------------------------------------------------------------------------
	align 64
WriterSSE2_128bytes_bypass:
_WriterSSE2_128bytes_bypass:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offsets.
	mov	eax, [esp+16+8]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+8]	; ecx = loops

	mov	eax, [esp+4+8]
	mov	ebx, eax
	add	ebx, [esp+8+8]	; ebx points to end.

.L1:
	mov	eax, [esp+4+8]	; restart at the beginning of the area

.L2:
	movntdq	[eax], xmm0	; Write bypassing cache.
	movntdq	[16+eax], xmm0
	movntdq	[32+eax], xmm0
	movntdq	[48+eax], xmm0
	movntdq	[64+eax], xmm0
	movntdq	[80+eax], xmm0
	movntdq	[96+eax], xmm0
	movntdq	[112+eax], xmm0

	add	eax, 128
	cmp	eax, ebx
	jb	.L2

	sub	ecx, 1
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		RandomReader
; Purpose:	Reads 32-bit values randomly from an area of memory.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of 128-byte chunks
;	[esp+12] = loops
;------------------------------------------------------------------------------
	align 64
RandomReader:
_RandomReader:
	push	ebx
	push	ecx
	push	edx

	mov	ecx, [esp+12+12]	; loops to do.

.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
; RandomReader, continued. ebx counts down from (# chunks - 1) to 0 and
; selects an entry of the pointer table. Each chunk is then read as 64
; dwords (256 bytes) in a fixed scrambled order, so the "# of 128-byte
; chunks" wording in the header does not match the 256 bytes touched here.
	mov	edx, [edx + 4*ebx]	; edx = chunk pointer number ebx

	mov	eax, [edx+160]
	mov	eax, [edx+232]
	mov	eax, [edx+224]
	mov	eax, [96+edx]
	mov	eax, [edx+164]
	mov	eax, [76+edx]
	mov	eax, [100+edx]
	mov	eax, [edx+220]
	mov	eax, [edx+248]
	mov	eax, [104+edx]
	mov	eax, [4+edx]
	mov	eax, [edx+136]
	mov	eax, [112+edx]
	mov	eax, [edx+200]
	mov	eax, [12+edx]
	mov	eax, [edx+128]
	mov	eax, [edx+148]
	mov	eax, [edx+196]
	mov	eax, [edx+216]
	mov	eax, [edx]
	mov	eax, [84+edx]
	mov	eax, [edx+140]
	mov	eax, [edx+204]
	mov	eax, [edx+184]
	mov	eax, [124+edx]
	mov	eax, [48+edx]
	mov	eax, [64+edx]
	mov	eax, [edx+212]
	mov	eax, [edx+240]
	mov	eax, [edx+236]
	mov	eax, [24+edx]
	mov	eax, [edx+252]
	mov	eax, [68+edx]
	mov	eax, [20+edx]
	mov	eax, [72+edx]
	mov	eax, [32+edx]
	mov	eax, [28+edx]
	mov	eax, [52+edx]
	mov	eax, [edx+244]
	mov	eax, [edx+180]
	mov	eax, [80+edx]
	mov	eax, [60+edx]
	mov	eax, [8+edx]
	mov	eax, [56+edx]
	mov	eax, [edx+208]
	mov	eax, [edx+228]
	mov	eax, [40+edx]
	mov	eax, [edx+172]
	mov	eax, [120+edx]
	mov	eax, [edx+176]
	mov	eax, [108+edx]
	mov	eax, [edx+132]
	mov	eax, [16+edx]
	mov	eax, [44+edx]
	mov	eax, [92+edx]
	mov	eax, [edx+168]
	mov	eax, [edx+152]
	mov	eax, [edx+156]
	mov	eax, [edx+188]
	mov	eax, [36+edx]
	mov	eax, [88+edx]
	mov	eax, [116+edx]
	mov	eax, [edx+192]
	mov	eax, [edx+144]

	jmp	.L1

.L2:
	; Reached when the chunk counter borrows, i.e. all chunks are done.
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret


;------------------------------------------------------------------------------
; Name:		RandomReaderSSE2
; Purpose:	Reads 128-bit values from each chunk of a pointer table, in
;		a fixed scrambled order within the chunk.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of chunks (256 bytes of each chunk are read)
;	[esp+12] = loops
;------------------------------------------------------------------------------
	align 64
RandomReaderSSE2:
_RandomReaderSSE2:
	push	ebx
	push	ecx
	push	edx

	; Three pushes = 12 bytes, hence "+12" on the argument offsets.
	mov	ecx, [esp+12+12]	; loops to do.
.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1		; next chunk index; borrow means none are left
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
	mov	edx, [edx + 4*ebx]

; Read aligned @ 16-byte boundary.
; Sixteen 16-byte loads cover offsets 0..255 of the chunk once each.
	movdqa	xmm0, [240+edx]
	movdqa	xmm0, [128+edx]
	movdqa	xmm0, [64+edx]
	movdqa	xmm0, [208+edx]
	movdqa	xmm0, [112+edx]
	movdqa	xmm0, [176+edx]
	movdqa	xmm0, [144+edx]
	movdqa	xmm0, [edx]
	movdqa	xmm0, [96+edx]
	movdqa	xmm0, [16+edx]
	movdqa	xmm0, [192+edx]
	movdqa	xmm0, [160+edx]
	movdqa	xmm0, [32+edx]
	movdqa	xmm0, [48+edx]
	movdqa	xmm0, [224+edx]
	movdqa	xmm0, [80+edx]

	jmp	.L1

.L2:
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		RandomReaderSSE2_bypass
; Purpose:	Reads 128-bit values from each chunk of a pointer table with
;		non-temporal loads (movntdqa), in a fixed scrambled order
;		within the chunk.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of chunks (256 bytes of each chunk are read)
;	[esp+12] = loops
; Notes:
;	NOTE(review): per the Intel SDM, MOVNTDQA belongs to SSE4.1 rather
;	than SSE2 -- confirm the caller checks for SSE4.1 before use.
;------------------------------------------------------------------------------
	align 64
RandomReaderSSE2_bypass:
_RandomReaderSSE2_bypass:
	push	ebx
	push	ecx
	push	edx

	; Three pushes = 12 bytes, hence "+12" on the argument offsets.
	mov	ecx, [esp+12+12]	; loops to do.

.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1		; next chunk index; borrow means none are left
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
	mov	edx, [edx + 4*ebx]

; Read aligned @ 16-byte boundary.
; RandomReaderSSE2_bypass, continued: sixteen 16-byte non-temporal loads
; cover offsets 0..255 of the chunk once each.
	movntdqa xmm0, [240+edx]
	movntdqa xmm0, [edx]
	movntdqa xmm0, [128+edx]
	movntdqa xmm0, [64+edx]
	movntdqa xmm0, [208+edx]
	movntdqa xmm0, [112+edx]
	movntdqa xmm0, [32+edx]
	movntdqa xmm0, [176+edx]
	movntdqa xmm0, [144+edx]
	movntdqa xmm0, [96+edx]
	movntdqa xmm0, [16+edx]
	movntdqa xmm0, [160+edx]
	movntdqa xmm0, [192+edx]
	movntdqa xmm0, [48+edx]
	movntdqa xmm0, [224+edx]
	movntdqa xmm0, [80+edx]

	jmp	.L1

.L2:
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		RandomWriter
; Purpose:	Writes a 32-bit value to each chunk of a pointer table, in a
;		fixed scrambled order within the chunk.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of chunks (256 bytes of each chunk are written)
;	[esp+12] = loops
;	[esp+16] = long to write
;------------------------------------------------------------------------------
	align 64
RandomWriter:
_RandomWriter:
	push	ebx
	push	ecx
	push	edx

	; Three pushes = 12 bytes, hence "+12" on the argument offsets.
	mov	eax, [esp+16+12]	; get datum.
	mov	ecx, [esp+12+12]	; loops to do.

.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1		; next chunk index; borrow means none are left
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
; RandomWriter, continued: the chunk is written as 64 dwords (256 bytes)
; in a fixed scrambled order.
	mov	edx, [edx + 4*ebx]	; edx = chunk pointer number ebx

	mov	[edx+212], eax
	mov	[edx+156], eax
	mov	[edx+132], eax
	mov	[20+edx], eax
	mov	[edx+172], eax
	mov	[edx+196], eax
	mov	[edx+248], eax
	mov	[edx], eax
	mov	[edx+136], eax
	mov	[edx+228], eax
	mov	[edx+160], eax
	mov	[80+edx], eax
	mov	[76+edx], eax
	mov	[32+edx], eax
	mov	[64+edx], eax
	mov	[68+edx], eax
	mov	[120+edx], eax
	mov	[edx+216], eax
	mov	[124+edx], eax
	mov	[28+edx], eax
	mov	[edx+152], eax
	mov	[36+edx], eax
	mov	[edx+220], eax
	mov	[edx+188], eax
	mov	[48+edx], eax
	mov	[104+edx], eax
	mov	[72+edx], eax
	mov	[96+edx], eax
	mov	[edx+184], eax
	mov	[112+edx], eax
	mov	[edx+236], eax
	mov	[edx+224], eax
	mov	[edx+252], eax
	mov	[88+edx], eax
	mov	[edx+180], eax
	mov	[60+edx], eax
	mov	[24+edx], eax
	mov	[edx+192], eax
	mov	[edx+164], eax
	mov	[edx+204], eax
	mov	[44+edx], eax
	mov	[edx+168], eax
	mov	[92+edx], eax
	mov	[edx+208], eax
	mov	[8+edx], eax
	mov	[edx+144], eax
	mov	[edx+148], eax
	mov	[edx+128], eax
	mov	[52+edx], eax
	mov	[4+edx], eax
	mov	[108+edx], eax
	mov	[12+edx], eax
	mov	[56+edx], eax
	mov	[edx+200], eax
	mov	[edx+232], eax
	mov	[16+edx], eax
	mov	[edx+244], eax
	mov	[40+edx], eax
	mov	[edx+140], eax
	mov	[84+edx], eax
	mov	[100+edx], eax
	mov	[116+edx], eax
	mov	[edx+176], eax
	mov	[edx+240], eax

	jmp	.L1

.L2:
	; Reached when the chunk counter borrows, i.e. all chunks are done.
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		RandomWriterSSE2
; Purpose:	Writes 128-bit value randomly to an area of memory.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of chunks (256 bytes of each chunk are written)
;	[esp+12] = loops
;	[esp+16] = 32-bit value to replicate and write
; Notes:
;	Uses aligned stores (movdqa), so every chunk must be 16-byte aligned.
;	A loop count of 0 wraps around.
;------------------------------------------------------------------------------
	align 64
RandomWriterSSE2:
_RandomWriterSSE2:
	push	ebx
	push	ecx
	push	edx

	; Three registers were pushed, so arguments sit 12 bytes further up.
	; (An offset of +8 here would fetch the loops argument instead.)
	mov	eax, [esp+16+12]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	; (Counts above 15 would clear the register instead.)
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+12]	; loops to do.

.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1		; next chunk index; borrow means none are left
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
	mov	edx, [edx + 4*ebx]

	; Sixteen 16-byte stores cover offsets 0..255 of the chunk once each.
	movdqa	[64+edx], xmm0
	movdqa	[208+edx], xmm0
	movdqa	[128+edx], xmm0
	movdqa	[112+edx], xmm0
	movdqa	[176+edx], xmm0
	movdqa	[144+edx], xmm0
	movdqa	[edx], xmm0
	movdqa	[96+edx], xmm0
	movdqa	[48+edx], xmm0
	movdqa	[16+edx], xmm0
	movdqa	[192+edx], xmm0
	movdqa	[160+edx], xmm0
	movdqa	[32+edx], xmm0
	movdqa	[240+edx], xmm0
	movdqa	[224+edx], xmm0
	movdqa	[80+edx], xmm0

	jmp	.L1

.L2:
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret


;------------------------------------------------------------------------------
; Name:		RandomWriterSSE2_bypass
; Purpose:	Writes 128-bit value randomly into memory, bypassing caches.
; Params:
;	[esp+4]	= ptr to array of chunk pointers
;	[esp+8] = # of chunks (256 bytes of each chunk are written)
;	[esp+12] = loops
;	[esp+16] = 32-bit value to replicate and write
;------------------------------------------------------------------------------
	align 64
RandomWriterSSE2_bypass:
_RandomWriterSSE2_bypass:
	push	ebx
	push	ecx
	push	edx

	; Three registers were pushed, so arguments sit 12 bytes further up.
	; (An offset of +8 here would fetch the loops argument instead.)
	mov	eax, [esp+16+12]
	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
	movd	xmm1, eax	; value that was provided.
	movd	xmm2, eax
	movd	xmm3, eax
	; PSLLDQ counts BYTES, not bits: move the copies into dwords 1, 2, 3.
	pslldq	xmm1, 4
	pslldq	xmm2, 8
	pslldq	xmm3, 12
	por	xmm0, xmm1
	por	xmm0, xmm2
	por	xmm0, xmm3

	mov	ecx, [esp+12+12]	; loops to do.

.L0:
	mov	ebx, [esp+8+12]	; # chunks to do

.L1:
	sub	ebx, 1		; next chunk index; borrow means none are left
	jc	.L2

	mov	edx, [esp+4+12]	; get ptr to memory chunk.
	mov	edx, [edx + 4*ebx]

	; Sixteen 16-byte non-temporal stores cover offsets 0..255 once each.
	movntdq	[128+edx], xmm0
	movntdq	[240+edx], xmm0
	movntdq	[112+edx], xmm0
	movntdq	[64+edx], xmm0
	movntdq	[176+edx], xmm0
	movntdq	[144+edx], xmm0
	movntdq	[edx], xmm0
	movntdq	[208+edx], xmm0
	movntdq	[80+edx], xmm0
	movntdq	[96+edx], xmm0
	movntdq	[48+edx], xmm0
	movntdq	[16+edx], xmm0
	movntdq	[192+edx], xmm0
	movntdq	[160+edx], xmm0
	movntdq	[224+edx], xmm0
	movntdq	[32+edx], xmm0

	jmp	.L1

.L2:
	sub	ecx, 1
	jnz	.L0

	pop	edx
	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		RegisterToRegister
; Purpose:	Reads/writes 32-bit values between registers of
;		the main register set.
; Params:
;	dword [esp+4]	= loops
;------------------------------------------------------------------------------
	align 64
RegisterToRegister:
_RegisterToRegister:
	push	ebx
	push	ecx

	mov	ecx, [esp+4+8]	; loops to do.
.L1:
	mov	eax, ebx	; 64 transfers by 4 bytes = 256 bytes
	mov	eax, ecx
	mov	eax, edx
	mov	eax, esi
	mov	eax, edi
	mov	eax, ebp
	mov	eax, esp
	mov	eax, ebx
	mov	eax, ebx
	mov	eax, ecx
	mov	eax, edx
	mov	eax, esi
	mov	eax, edi
	mov	eax, ebp
	mov	eax, esp
	mov	eax, ebx
	mov	eax, ebx
	mov	eax, ecx
	mov	eax, edx
	mov	eax, esi
	mov	eax, edi
	mov	eax, ebp
	mov	eax, esp
	mov	eax, ebx
	mov	eax, ebx
	mov	eax, ecx
	mov	eax, edx
	mov	eax, esi
	mov	eax, edi
	mov	eax, ebp
	mov	eax, esp
	mov	eax, ebx

	; Second half targets ebx, which was saved on entry and is restored
	; below.
	mov	ebx, eax
	mov	ebx, ecx
	mov	ebx, edx
	mov	ebx, esi
	mov	ebx, edi
	mov	ebx, ebp
	mov	ebx, esp
	mov	ebx, eax
	mov	ebx, eax
	mov	ebx, ecx
	mov	ebx, edx
	mov	ebx, esi
	mov	ebx, edi
	mov	ebx, ebp
	mov	ebx, esp
	mov	ebx, eax
	mov	ebx, eax
	mov	ebx, ecx
	mov	ebx, edx
	mov	ebx, esi
	mov	ebx, edi
	mov	ebx, ebp
	mov	ebx, esp
	mov	ebx, eax
	mov	ebx, eax
	mov	ebx, ecx
	mov	ebx, edx
	mov	ebx, esi
	mov	ebx, edi
	mov	ebx, ebp
	mov	ebx, esp
	mov	ebx, eax

	dec	ecx
	jnz	.L1

	pop	ecx
	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		VectorToVectorAVX
; Purpose:	Reads/writes 256-bit values between registers of
;		the vector register set, in this case YMM.
; Params:	dword [esp + 4] = loops
; Notes:
;	A loop count of 0 wraps around, because the counter is decremented
;	before it is tested.
;------------------------------------------------------------------------------
	align 64
VectorToVectorAVX:
_VectorToVectorAVX:
	vzeroupper

	mov	eax, [esp + 4]	; nothing is pushed, so the argument is at +4
.L1:
	vmovdqa	ymm0, ymm1	; Each move moves 32 bytes, so we need 8
	vmovdqa	ymm0, ymm2	; moves to transfer a 256 byte chunk.
	vmovdqa	ymm0, ymm3
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm1, ymm2
	vmovdqa	ymm2, ymm1
	vmovdqa	ymm0, ymm3
	vmovdqa	ymm3, ymm1

	dec	eax
	jnz	.L1
	ret

;------------------------------------------------------------------------------
; Name:		VectorToVector
; Purpose:	Reads/writes 128-bit values between registers of
;		the vector register set, in this case XMM.
; Params:	dword [esp + 4] = count.
;------------------------------------------------------------------------------
	align 64
VectorToVector:
_VectorToVector:
	mov	eax, [esp + 4]	; eax = count; nothing is pushed
.L1:
	; 16 moves of 16 bytes = 256 bytes per iteration.
	movdqa	xmm0, xmm1
	movdqa	xmm0, xmm2
	movdqa	xmm0, xmm3
	movdqa	xmm2, xmm0
	movdqa	xmm1, xmm2
	movdqa	xmm2, xmm1
	movdqa	xmm0, xmm3
	movdqa	xmm3, xmm1

	movdqa	xmm3, xmm2
	movdqa	xmm1, xmm3
	movdqa	xmm2, xmm1
	movdqa	xmm0, xmm1
	movdqa	xmm1, xmm2
	movdqa	xmm0, xmm1
	movdqa	xmm0, xmm3
	movdqa	xmm3, xmm0

	dec	eax
	jnz	.L1
	ret

;------------------------------------------------------------------------------
; Name:		RegisterToVector
; Purpose:	Writes 32-bit main register values into 128-bit vector register
;		clearing the upper unused bits.
; Params:	dword [esp + 4] = count.
; Notes:
;	The count is doubled because one pass moves only 128 bytes, so each
;	requested count still corresponds to 256 bytes.
;	eax is both the loop counter and the value being transferred.
;------------------------------------------------------------------------------
	align 64
RegisterToVector:
_RegisterToVector:
	mov	eax, [esp + 4]
	add	eax, eax 	; Double # of loops.
.L1:
	movd	xmm1, eax 	; 32 transfers of 4 bytes = 128 bytes
	movd	xmm2, eax
	movd	xmm3, eax
	movd	xmm0, eax
	movd	xmm1, eax
	movd	xmm2, eax
	movd	xmm3, eax
	movd	xmm0, eax

	movd	xmm1, eax
	movd	xmm3, eax
	movd	xmm2, eax
	movd	xmm0, eax
	movd	xmm1, eax
	movd	xmm2, eax
	movd	xmm3, eax
	movd	xmm0, eax

	movd	xmm0, eax
	movd	xmm2, eax
	movd	xmm0, eax
	movd	xmm3, eax
	movd	xmm1, eax
	movd	xmm3, eax
	movd	xmm2, eax
	movd	xmm0, eax

	movd	xmm0, eax
	movd	xmm3, eax
	movd	xmm1, eax
	movd	xmm2, eax
	movd	xmm0, eax
	movd	xmm2, eax
	movd	xmm3, eax
	movd	xmm0, eax

	dec	eax
	jnz	.L1
	ret

;------------------------------------------------------------------------------
; Name:		VectorToRegister
; Purpose:	Writes lowest 32 bits of vector registers into 32-bit main
;		register.
; Params:	dword [esp + 4] = count.
; Notes:
;	The count is doubled because one pass moves only 128 bytes.
;------------------------------------------------------------------------------
	align 64
VectorToRegister:
_VectorToRegister:
	mov	eax, [esp + 4]	; read before the push below, so offset is +4
	add	eax, eax 	; Double # of loops.
; VectorToRegister, continued: ebx is the destination of every transfer, so
; it is saved here and restored before returning.
	push	ebx
.L1:
	; 32 transfers per pass; the doubled loop count gives 64 per count.
	movd	ebx, xmm1	; 4 bytes per transfer therefore need 64
	movd	ebx, xmm2	; to transfer 256 bytes.
	movd	ebx, xmm3
	movd	ebx, xmm0
	movd	ebx, xmm1
	movd	ebx, xmm2
	movd	ebx, xmm3
	movd	ebx, xmm0

	movd	ebx, xmm1
	movd	ebx, xmm3
	movd	ebx, xmm2
	movd	ebx, xmm0
	movd	ebx, xmm1
	movd	ebx, xmm2
	movd	ebx, xmm3
	movd	ebx, xmm0

	movd	ebx, xmm0
	movd	ebx, xmm2
	movd	ebx, xmm0
	movd	ebx, xmm3
	movd	ebx, xmm1
	movd	ebx, xmm3
	movd	ebx, xmm2
	movd	ebx, xmm0

	movd	ebx, xmm0
	movd	ebx, xmm3
	movd	ebx, xmm1
	movd	ebx, xmm2
	movd	ebx, xmm0
	movd	ebx, xmm2
	movd	ebx, xmm3
	movd	ebx, xmm0

	dec	eax
	jnz	.L1

	pop	ebx
	ret

;------------------------------------------------------------------------------
; Name:		StackReader
; Purpose:	Reads 32-bit values off the stack into registers of
;		the main register set, effectively testing L1 cache access
;		*and* effective-address calculation speed.
; Params:
;	dword [esp+4]	= loops
; Notes:
;	Seven dwords are pushed as a scratch area and read back repeatedly;
;	they are discarded again before returning.
;------------------------------------------------------------------------------
	align 64
StackReader:
_StackReader:
	push	ebx
	push	ecx

	; Two pushes = 8 bytes, hence "+8" on the argument offset.
	mov	ecx, [esp+4+8]	; loops to do.
+ + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + +.L1: + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + mov eax, [esp+4] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + mov ebx, [esp+4] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + + dec ecx + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: StackWriter +; Purpose: Writes 32-bit values into the stack from registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +StackWriter: +_StackWriter: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. 
+ + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + + xor eax, eax + mov ebx, 0xffffffff + +.L1: + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + mov [esp+4], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + mov [esp+4], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + + sub ecx, 1 + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Register8ToVector +; Purpose: Writes 8-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: dword [esp + 4] +;------------------------------------------------------------------------------ + align 64 +Register8ToVector: +_Register8ToVector: + mov eax, [esp + 4] + sal eax, 4 ; Force some repetition. 
+.L1: + pinsrb xmm1, al, 0 ; 64 transfers x 1 byte = 64 bytes + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm1, bl, 1 + pinsrb xmm2, cl, 2 + pinsrb xmm3, dl, 3 + pinsrb xmm3, al, 4 + pinsrb xmm2, bl, 5 + pinsrb xmm1, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, al, 1 + pinsrb xmm3, al, 2 + pinsrb xmm1, al, 3 + pinsrb xmm2, al, 4 + pinsrb xmm3, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm0, al, 1 + pinsrb xmm0, al, 2 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 10 + pinsrb xmm1, bl, 11 + pinsrb xmm2, cl, 12 + pinsrb xmm3, dl, 13 + pinsrb xmm3, dil, 14 + pinsrb xmm2, cl, 15 + pinsrb xmm1, al, 6 + pinsrb xmm0, bpl, 7 + + pinsrb xmm1, al, 10 + pinsrb xmm2, al, 11 + pinsrb xmm3, al, 12 + pinsrb xmm1, al, 13 + pinsrb xmm2, al, 14 + pinsrb xmm3, al, 15 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 9 + pinsrb xmm0, al, 8 + pinsrb xmm0, al, 11 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register16ToVector +; Purpose: Writes 16-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Register16ToVector: +_Register16ToVector: + mov eax, [esp + 4] + sal eax, 3 ; Force some repetition. 
+.L1: + pinsrw xmm1, ax, 0 ; 64 transfers x 2 bytes = 128 bytes + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, ax, 1 + pinsrw xmm3, ax, 2 + pinsrw xmm1, ax, 3 + pinsrw xmm2, ax, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm0, ax, 1 + pinsrw xmm0, ax, 2 + pinsrw xmm0, ax, 3 + pinsrw xmm0, ax, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, ax, 1 + pinsrw xmm3, ax, 2 + pinsrw xmm1, ax, 3 + pinsrw xmm2, ax, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm0, ax, 1 + pinsrw xmm0, ax, 2 + pinsrw xmm0, ax, 3 + pinsrw xmm0, ax, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register32ToVector +; Purpose: Writes 32-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Register32ToVector: +_Register32ToVector: + mov eax, [esp + 4] + sal eax, 2 ; Force some repetition. +.L1: + pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes + pinsrd xmm2, ebx, 1 ; we need 64 transfers. 
+ pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, eax, 1 + pinsrd xmm3, eax, 2 + pinsrd xmm1, eax, 3 + pinsrd xmm2, eax, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 1 + pinsrd xmm0, eax, 2 + pinsrd xmm0, eax, 3 + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, ebx, 1 + pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, eax, 1 + pinsrd xmm3, eax, 2 + pinsrd xmm1, eax, 3 + pinsrd xmm2, eax, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 1 + pinsrd xmm0, eax, 2 + pinsrd xmm0, eax, 3 + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + pinsrd xmm0, esp, 0 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register64ToVector +; Purpose: Writes 64-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register64ToVector: +_Register64ToVector: + ret + + +;------------------------------------------------------------------------------ +; Name: Vector8ToRegister +; Purpose: Writes 8-bit vector register values into main register. 
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+    mov eax, [esp + 4]          ; Read the argument before the push below moves esp.
+    sal eax, 4 ; Force some repetition.
+    push ebx                    ; ebx is the scratch destination; restored before ret.
+.L1:
+    ; 64 single-byte extracts per iteration.
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 15
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 5
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 13
+    pextrb ebx, xmm2, 14
+    pextrb ebx, xmm3, 15
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 10
+    pextrb ebx, xmm1, 11
+    pextrb ebx, xmm2, 12
+    pextrb ebx, xmm3, 13
+    pextrb ebx, xmm3, 14
+    pextrb ebx, xmm2, 15
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm1, 0
+    pextrb ebx, xmm2, 1
+    pextrb ebx, xmm3, 2
+    pextrb ebx, xmm1, 3
+    pextrb ebx, xmm2, 4
+    pextrb ebx, xmm3, 5
+    pextrb ebx, xmm0, 6
+    pextrb ebx, xmm0, 7
+
+    pextrb ebx, xmm0, 0
+    pextrb ebx, xmm1, 1
+    pextrb ebx, xmm2, 2
+    pextrb ebx, xmm3, 3
+    pextrb ebx, xmm3, 4
+    pextrb ebx, xmm2, 5
+    pextrb ebx, xmm1, 6
+    pextrb ebx, xmm0, 7
+
+    dec eax
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector16ToRegister
+; Purpose: Writes 16-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+    mov eax, [esp + 4]          ; Read the argument before the push below moves esp.
+    sal eax, 3 ; Force some repetition.
+    push ebx                    ; ebx is the scratch destination; restored before ret.
+.L1:
+    ; 64 word extracts per iteration: two 8-instruction patterns, alternated.
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm1, 0
+    pextrw ebx, xmm2, 1
+    pextrw ebx, xmm3, 2
+    pextrw ebx, xmm1, 3
+    pextrw ebx, xmm2, 4
+    pextrw ebx, xmm3, 5
+    pextrw ebx, xmm0, 6
+    pextrw ebx, xmm0, 7
+
+    pextrw ebx, xmm0, 0
+    pextrw ebx, xmm1, 1
+    pextrw ebx, xmm2, 2
+    pextrw ebx, xmm3, 3
+    pextrw ebx, xmm3, 4
+    pextrw ebx, xmm2, 5
+    pextrw ebx, xmm1, 6
+    pextrw ebx, xmm0, 7
+
+    dec eax
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector32ToRegister
+; Purpose: Writes 32-bit vector register values into main register.
+; Params: dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+    align 64
+Vector32ToRegister:
+_Vector32ToRegister:
+    mov eax, [esp + 4]          ; Read the argument before the push below moves esp.
+    sal eax, 2 ; Force some repetition.
+    push ebx                    ; ebx is the scratch destination; restored before ret.
+.L1:
+    ; 32 dword extracts (32 x 4 bytes = 128 bytes) per iteration.
+    pextrd ebx, xmm1, 0
+    pextrd ebx, xmm2, 1
+    pextrd ebx, xmm3, 2
+    pextrd ebx, xmm1, 3
+    pextrd ebx, xmm2, 0
+    pextrd ebx, xmm3, 1
+    pextrd ebx, xmm0, 2
+    pextrd ebx, xmm0, 3
+
+    pextrd ebx, xmm0, 0
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm0, 0
+
+    pextrd ebx, xmm1, 0
+    pextrd ebx, xmm2, 1
+    pextrd ebx, xmm3, 2
+    pextrd ebx, xmm1, 3
+    pextrd ebx, xmm2, 0
+    pextrd ebx, xmm3, 1
+    pextrd ebx, xmm0, 2
+    pextrd ebx, xmm0, 3
+
+    pextrd ebx, xmm0, 0
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm3, 3
+    pextrd ebx, xmm2, 2
+    pextrd ebx, xmm1, 1
+    pextrd ebx, xmm0, 0
+
+    dec eax
+    jnz .L1
+    pop ebx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Vector64ToRegister
+; Purpose: Writes 64-bit vector register values into main register.
+; Params: none read; this version is a stub that returns immediately.
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+    ret
+
+;------------------------------------------------------------------------------
+; Name: CopyAVX
+; Purpose: Copies memory chunks that are 32-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+;         [esp + 8] = ptr to source memory area
+;         [esp + 12] = length in bytes
+;         [esp + 16] = loops
+;
+; NOTE(review): if the length rounds down to 0, eax starts at 0 and
+; "sub eax, 256" only reaches zero after wrapping around; callers presumably
+; always pass a length of at least 256 -- confirm.
+;------------------------------------------------------------------------------
+    align 64
+CopyAVX:
+_CopyAVX:
+    vzeroupper
+    ; Register usage:
+    ; esi = source
+    ; edi = dest
+    ; ecx = loops
+    ; edx = length
+    push esi
+    push edi
+    push ecx
+    push edx
+
+    ; The "+ 16" accounts for the four registers pushed above.
+    mov edi, [esp + 4 + 16]
+    mov esi, [esp + 8 + 16]
+    mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 16 + 16]
+
+    shr edx, 8 ; Ensure length is multiple of 256.
+    shl edx, 8
+
+.L1:
+    mov eax, edx                ; eax = bytes remaining in this pass.
+    prefetchnta [esi]
+
+.L2:
+    ; Copy 256 bytes as two groups of four 32-byte loads then stores.
+    vmovdqa ymm0, [esi]
+    vmovdqa ymm1, [32+esi]
+    vmovdqa ymm2, [64+esi]
+    vmovdqa ymm3, [96+esi]
+
+    vmovdqa [edi], ymm0
+    vmovdqa [32+edi], ymm1
+    vmovdqa [64+edi], ymm2
+    vmovdqa [96+edi], ymm3
+
+    vmovdqa ymm0, [128+esi]
+    vmovdqa ymm1, [128+32+esi]
+    vmovdqa ymm2, [128+64+esi]
+    vmovdqa ymm3, [128+96+esi]
+
+    vmovdqa [128+edi], ymm0
+    vmovdqa [128+32+edi], ymm1
+    vmovdqa [128+64+edi], ymm2
+    vmovdqa [128+96+edi], ymm3
+
+    add esi, 256
+    add edi, 256
+
+    sub eax, 256
+    jnz .L2
+
+    sub esi, edx ; esi now points to start.
+    sub edi, edx ; edi now points to start.
+
+    dec ecx
+    jnz .L1
+
+    pop edx
+    pop ecx
+    pop edi
+    pop esi
+    ret
+
+;------------------------------------------------------------------------------
+; Name: CopySSE
+; Purpose: Copies memory chunks that are 16-byte aligned.
+; Params: [esp + 4] = ptr to destination memory area
+;         [esp + 8] = ptr to source memory area
+;         [esp + 12] = length in bytes
+;         [esp + 16] = loops
+;
+; NOTE(review): as in CopyAVX, a length that rounds down to 0 makes the inner
+; loop run until eax wraps around -- confirm callers never pass less than 128.
+;------------------------------------------------------------------------------
+    align 64
+CopySSE:
+_CopySSE:
+    ; Register usage:
+    ; esi = source
+    ; edi = dest
+    ; ecx = loops
+    ; edx = length
+    push esi
+    push edi
+    push ecx
+    push edx
+
+    ; The "+ 16" accounts for the four registers pushed above.
+    mov edi, [esp + 4 + 16]
+    mov esi, [esp + 8 + 16]
+    mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 16 + 16]
+
+    shr edx, 7 ; Ensure length is multiple of 128.
+    shl edx, 7
+
+    ; Save our non-parameter XMM registers.
+    ; (movdqu is used because esp is not known to be 16-byte aligned here.)
+    sub esp, 64
+    movdqu [esp], xmm4
+    movdqu [16+esp], xmm5
+    movdqu [32+esp], xmm6
+    movdqu [48+esp], xmm7
+
+.L1:
+    mov eax, edx                ; eax = bytes remaining in this pass.
+
+.L2:
+    ; Copy 128 bytes: eight 16-byte loads, then eight 16-byte stores.
+    prefetchnta [esi]
+    movdqa xmm0, [esi]
+    movdqa xmm1, [16+esi]
+    movdqa xmm2, [32+esi]
+    movdqa xmm3, [48+esi]
+    movdqa xmm4, [64+esi]
+    movdqa xmm5, [80+esi]
+    movdqa xmm6, [96+esi]
+    movdqa xmm7, [112+esi]
+
+    ; 32-bit lacks xmm8 - xmm15.
+
+    movdqa [edi], xmm0
+    movdqa [16+edi], xmm1
+    movdqa [32+edi], xmm2
+    movdqa [48+edi], xmm3
+    movdqa [64+edi], xmm4
+    movdqa [80+edi], xmm5
+    movdqa [96+edi], xmm6
+    movdqa [112+edi], xmm7
+
+    add esi, 128
+    add edi, 128
+
+    sub eax, 128
+    jnz .L2
+
+    sub esi, edx ; esi now points to start.
+    sub edi, edx ; edi now points to start.
+
+    dec ecx
+    jnz .L1
+
+    ; Restore the XMM registers saved above and release their stack space.
+    movdqu xmm4, [0+esp]
+    movdqu xmm5, [16+esp]
+    movdqu xmm6, [32+esp]
+    movdqu xmm7, [48+esp]
+    add esp, 64
+
+    pop edx
+    pop ecx
+    pop edi
+    pop esi
+    ret
+
+;------------------------------------------------------------------------------
+; Name: CopySSE_128bytes
+; Purpose: Copies memory chunks that are 16-byte aligned.
+;          Tail-jumps to CopySSE, which already copies 128 bytes per step.
+; Params: [esp + 4] = ptr to destination memory area
+;         [esp + 8] = ptr to source memory area
+;         [esp + 12] = length in bytes
+;         [esp + 16] = loops
+;------------------------------------------------------------------------------
+    align 64
+CopySSE_128bytes:
+_CopySSE_128bytes:
+    jmp CopySSE
+
diff --git a/routines64.asm b/routines64.asm
new file mode 100755
index 0000000..e49b75a
--- /dev/null
+++ b/routines64.asm
@@ -0,0 +1,2590 @@
+;============================================================================
+; bandwidth, a benchmark to estimate memory transfer bandwidth.
+; Copyright (C) 2005-2014 by Zack T Smith.
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+;
+; The author may be reached at veritas@comcast.net.
+;=============================================================================
+
+bits 64
+cpu ia64
+
+; Each routine is exported twice: plain and with a leading underscore.
+global CopySSE
+global CopySSE_128bytes
+
+global CopyAVX
+global _CopyAVX
+
+global ReaderLODSQ
+global _ReaderLODSQ
+
+global ReaderLODSD
+global _ReaderLODSD
+
+global ReaderLODSW
+global _ReaderLODSW
+
+global ReaderLODSB
+global _ReaderLODSB
+
+global RandomReader
+global RandomReaderSSE2
+global RandomReaderSSE2_bypass
+global RandomWriter
+global RandomWriterSSE2
+global RandomWriterSSE2_bypass
+global Reader
+global Reader_128bytes
+global ReaderAVX
+global ReaderSSE2
+global ReaderSSE2_128bytes
+global ReaderSSE2_bypass
+global ReaderSSE2_128bytes_bypass
+global Register16ToVector
+global Register32ToVector
+global Register64ToVector
+global Register8ToVector
+global RegisterToRegister
+global RegisterToVector
+global StackReader
+global StackWriter
+global Vector16ToRegister
+global Vector32ToRegister
+global Vector64ToRegister
+global Vector8ToRegister
+global VectorToRegister
+global VectorToVector
+global VectorToVectorAVX
+global Writer
+global Writer_128bytes
+global WriterAVX
+global WriterSSE2
+global WriterSSE2_128bytes
+global WriterSSE2_bypass
+global WriterSSE2_128bytes_bypass
+global WriterAVX_bypass
+global _WriterAVX_bypass
+global _CopySSE
+global _CopySSE_128bytes
+global _RandomReader
+global _RandomReaderSSE2
+global _RandomReaderSSE2_bypass
+global _RandomWriter
+global _RandomWriterSSE2
+global _RandomWriterSSE2_bypass
+global _Reader
+global _ReaderAVX
+global _Reader_128bytes
+global _ReaderSSE2
+global _ReaderSSE2_bypass
+global _ReaderSSE2_128bytes
+global _ReaderSSE2_128bytes_bypass
+global _Register16ToVector
+global _Register32ToVector
+global _Register64ToVector
+global _Register8ToVector
+global _RegisterToRegister
+global _RegisterToVector
+global _StackReader
+global _StackWriter
+global _Vector16ToRegister
+global _Vector32ToRegister
+global _Vector64ToRegister
+global _Vector8ToRegister
+global _VectorToRegister
+global _VectorToVector
+global _VectorToVectorAVX
+global _Writer
+global _Writer_128bytes
+global _WriterSSE2
+global _WriterAVX
+global _WriterSSE2_128bytes
+global _WriterSSE2_bypass
+global _WriterSSE2_128bytes_bypass
+
+global get_cpuid_cache_info
+global _get_cpuid_cache_info
+
+global get_cpuid_family
+global _get_cpuid_family
+
+global get_cpuid1_ecx
+global _get_cpuid1_ecx
+
+global get_cpuid1_edx
+global _get_cpuid1_edx
+
+global get_cpuid7_ebx
+global _get_cpuid7_ebx
+
+global get_cpuid_80000001_ecx
+global _get_cpuid_80000001_ecx
+
+global get_cpuid_80000001_edx
+global _get_cpuid_80000001_edx
+
+; Note:
+; Unix ABI says integer param are put in these registers in this order:
+; rdi, rsi, rdx, rcx, r8, r9
+
+    section .text
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_cache_info
+; Purpose: Runs CPUID with eax = 4 and ecx = the value passed in rsi, then
+;          stores eax, ebx, ecx, edx as four dwords at [rdi] .. [rdi+12].
+; Params: rdi = ptr to 16-byte output area
+;         rsi = value loaded into rcx before CPUID
+;
+get_cpuid_cache_info:
+_get_cpuid_cache_info:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 4
+    mov rcx, rsi
+    cpuid
+    mov [rdi], eax
+    mov [rdi+4], ebx
+    mov [rdi+8], ecx
+    mov [rdi+12], edx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_family
+; Purpose: Runs CPUID with eax = 0 and stores ebx, edx, ecx (in that order)
+;          at [rdi] .. [rdi+8], followed by a zero byte at [rdi+12].
+; Params: rdi = ptr to output area of at least 13 bytes
+;
+get_cpuid_family:
+_get_cpuid_family:
+    push rbx
+    push rcx
+    push rdx
+    xor rax, rax
+    cpuid
+    mov [rdi], ebx
+    mov [rdi+4], edx
+    mov [rdi+8], ecx
+    mov byte [rdi+12], 0
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_ecx
+; Purpose: Runs CPUID with eax = 1 and returns the resulting ecx in rax.
+;
+get_cpuid1_ecx:
+_get_cpuid1_ecx:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 1
+    cpuid
+    mov rax, rcx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid7_ebx
+; Purpose: Runs CPUID with eax = 7, ecx = 0 and returns the resulting ebx
+;          in rax.
+;
+get_cpuid7_ebx:
+_get_cpuid7_ebx:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 7
+    xor rcx, rcx
+    cpuid
+    mov rax, rbx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid1_edx
+; Purpose: Runs CPUID with eax = 1 and returns the resulting edx in rax.
+;
+get_cpuid1_edx:
+_get_cpuid1_edx:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 1
+    cpuid
+    mov rax, rdx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_ecx
+; Purpose: Runs CPUID with eax = 0x80000001 and returns the resulting ecx
+;          in rax.
+;
+get_cpuid_80000001_ecx:
+_get_cpuid_80000001_ecx:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 0x80000001
+    cpuid
+    mov rax, rcx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: get_cpuid_80000001_edx
+; Purpose: Runs CPUID with eax = 0x80000001 and returns the resulting edx
+;          in rax.
+;
+get_cpuid_80000001_edx:
+_get_cpuid_80000001_edx:
+    push rbx
+    push rcx
+    push rdx
+    mov rax, 0x80000001
+    cpuid
+    mov rax, rdx
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSQ
+; Purpose: Reads 64-bit values sequentially from an area of memory
+;          using LODSQ instruction.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 32
+ReaderLODSQ:
+_ReaderLODSQ:
+    push rcx ; REP counter
+    push r10
+    push r11
+    ; Keep the buffer start and element count in r10/r11, because rsi and rcx
+    ; are reloaded for every pass below.
+    mov r10, rdi
+    mov r11, rsi
+    shr r11, 3 ; length in quadwords rounded down.
+
+.L1:
+    mov rsi, r10 ; buffer start
+    mov rcx, r11 ; # of quadwords
+
+    rep lodsq
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    pop rcx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSD
+; Purpose: Reads 32-bit values sequentially from an area of memory
+;          using LODSD instruction.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 32
+ReaderLODSD:
+_ReaderLODSD:
+    push rcx ; REP counter
+    push r10
+    push r11
+    ; Keep the buffer start and element count in r10/r11, because rsi and rcx
+    ; are reloaded for every pass below.
+    mov r10, rdi
+    mov r11, rsi
+    shr r11, 2 ; length in double words rounded down.
+
+.L1:
+    mov rsi, r10 ; buffer start
+    mov rcx, r11 ; # of double words
+
+    rep lodsd
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    pop rcx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSW
+; Purpose: Reads 16-bit values sequentially from an area of memory
+;          using LODSW instruction.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 32
+ReaderLODSW:
+_ReaderLODSW:
+    push rcx ; REP counter
+    push r10
+    push r11
+    ; Keep the buffer start and element count in r10/r11, because rsi and rcx
+    ; are reloaded for every pass below.
+    mov r10, rdi
+    mov r11, rsi
+    shr r11, 1 ; length in words rounded down.
+
+.L1:
+    mov rsi, r10 ; buffer start
+    mov rcx, r11 ; # of words
+
+    rep lodsw
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    pop rcx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderLODSB
+; Purpose: Reads 8-bit values sequentially from an area of memory
+;          using LODSB instruction.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 32
+ReaderLODSB:
+_ReaderLODSB:
+    push rcx ; REP counter
+    push r10
+    push r11
+    ; Keep the buffer start and byte count in r10/r11, because rsi and rcx
+    ; are reloaded for every pass below.
+    mov r10, rdi
+    mov r11, rsi
+
+.L1:
+    mov rsi, r10 ; buffer start
+    mov rcx, r11 ; # of bytes
+
+    rep lodsb
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    pop rcx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Reader
+; Purpose: Reads 64-bit values sequentially from an area of memory.
+;          Each inner step reads 256 bytes; the length is not rounded here,
+;          so the last step reads a full 256 bytes even if fewer remain.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+Reader:
+_Reader:
+    push r10
+
+    add rsi, rdi ; rsi now points to end.
+
+.L1:
+    mov r10, rdi                ; r10 = read cursor, reset each pass.
+
+.L2:
+    mov rax, [r10]
+    mov rax, [8+r10]
+    mov rax, [16+r10]
+    mov rax, [24+r10]
+    mov rax, [32+r10]
+    mov rax, [40+r10]
+    mov rax, [48+r10]
+    mov rax, [56+r10]
+    mov rax, [64+r10]
+    mov rax, [72+r10]
+    mov rax, [80+r10]
+    mov rax, [88+r10]
+    mov rax, [96+r10]
+    mov rax, [104+r10]
+    mov rax, [112+r10]
+    mov rax, [120+r10]
+    mov rax, [128+r10]
+    mov rax, [136+r10]
+    mov rax, [144+r10]
+    mov rax, [152+r10]
+    mov rax, [160+r10]
+    mov rax, [168+r10]
+    mov rax, [176+r10]
+    mov rax, [184+r10]
+    mov rax, [192+r10]
+    mov rax, [200+r10]
+    mov rax, [208+r10]
+    mov rax, [216+r10]
+    mov rax, [224+r10]
+    mov rax, [232+r10]
+    mov rax, [240+r10]
+    mov rax, [248+r10]
+
+    add r10, 256
+    cmp r10, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Reader_128bytes
+; Purpose: Reads 64-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+Reader_128bytes:
+_Reader_128bytes:
+    push r10
+
+    add rsi, rdi ; rsi now points to end.
+
+.L1:
+    mov r10, rdi                ; r10 = read cursor, reset each pass.
+
+.L2:
+    ; 16 qword loads = 128 bytes per step.
+    mov rax, [r10]
+    mov rax, [8+r10]
+    mov rax, [16+r10]
+    mov rax, [24+r10]
+    mov rax, [32+r10]
+    mov rax, [40+r10]
+    mov rax, [48+r10]
+    mov rax, [56+r10]
+    mov rax, [64+r10]
+    mov rax, [72+r10]
+    mov rax, [80+r10]
+    mov rax, [88+r10]
+    mov rax, [96+r10]
+    mov rax, [104+r10]
+    mov rax, [112+r10]
+    mov rax, [120+r10]
+
+    add r10, 128
+    cmp r10, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReader
+; Purpose: Reads 64-bit values randomly from an area of memory.
+;          For each chunk pointer, reads the 32 qwords of that 256-byte
+;          chunk in a fixed shuffled order.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+RandomReader:
+_RandomReader:
+    push r10
+    push r11
+
+.L1:
+    xor r11, r11                ; r11 = chunk index, reset each pass.
+
+.L2:
+    mov r10, [rdi + 8*r11] ; Note, 64-bit pointers.
+
+    mov rax, [96+r10]
+    mov rax, [r10]
+    mov rax, [120+r10]
+    mov rax, [184+r10]
+    mov rax, [160+r10]
+    mov rax, [176+r10]
+    mov rax, [112+r10]
+    mov rax, [80+r10]
+    mov rax, [32+r10]
+    mov rax, [128+r10]
+    mov rax, [88+r10]
+    mov rax, [40+r10]
+    mov rax, [48+r10]
+    mov rax, [72+r10]
+    mov rax, [200+r10]
+    mov rax, [24+r10]
+    mov rax, [152+r10]
+    mov rax, [16+r10]
+    mov rax, [248+r10]
+    mov rax, [56+r10]
+    mov rax, [240+r10]
+    mov rax, [208+r10]
+    mov rax, [104+r10]
+    mov rax, [216+r10]
+    mov rax, [136+r10]
+    mov rax, [232+r10]
+    mov rax, [64+r10]
+    mov rax, [224+r10]
+    mov rax, [144+r10]
+    mov rax, [192+r10]
+    mov rax, [8+r10]
+    mov rax, [168+r10]
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2
+; Purpose: Reads 128-bit values randomly from an area of memory.
+;          For each chunk pointer, reads the 16 aligned 16-byte slots of
+;          that 256-byte chunk in a fixed shuffled order.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;------------------------------------------------------------------------------
+    align 64
+RandomReaderSSE2:
+_RandomReaderSSE2:
+    push r10
+    push r11
+
+.L1:
+    xor r11, r11                ; r11 = chunk index, reset each pass.
+
+.L2:
+    mov r10, [rdi + 8*r11]
+
+    movdqa xmm0, [240+r10]
+    movdqa xmm0, [128+r10]
+    movdqa xmm0, [64+r10]
+    movdqa xmm0, [208+r10]
+    movdqa xmm0, [112+r10]
+    movdqa xmm0, [176+r10]
+    movdqa xmm0, [144+r10]
+    movdqa xmm0, [r10]
+    movdqa xmm0, [96+r10]
+    movdqa xmm0, [16+r10]
+    movdqa xmm0, [192+r10]
+    movdqa xmm0, [160+r10]
+    movdqa xmm0, [32+r10]
+    movdqa xmm0, [48+r10]
+    movdqa xmm0, [224+r10]
+    movdqa xmm0, [80+r10]
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomReaderSSE2_bypass
+; Purpose: Reads 128-bit values randomly from an area of memory.
+; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +;------------------------------------------------------------------------------ + align 64 +RandomReaderSSE2_bypass: +_RandomReaderSSE2_bypass: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] + + ; SSE 4.1 required + movntdqa xmm0, [240+r10] + movntdqa xmm0, [r10] + movntdqa xmm0, [128+r10] + movntdqa xmm0, [64+r10] + movntdqa xmm0, [208+r10] + movntdqa xmm0, [112+r10] + movntdqa xmm0, [48+r10] + movntdqa xmm0, [176+r10] + movntdqa xmm0, [144+r10] + movntdqa xmm0, [96+r10] + movntdqa xmm0, [16+r10] + movntdqa xmm0, [160+r10] + movntdqa xmm0, [32+r10] + movntdqa xmm0, [224+r10] + movntdqa xmm0, [80+r10] + movntdqa xmm0, [192+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomWriter +; Purpose: Writes 64-bit values randomly to an area of memory. +; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +; rcx = datum to write +;------------------------------------------------------------------------------ + align 64 +RandomWriter: +_RandomWriter: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] ; Note, 64-bit pointers. 
+
+	; RandomWriter inner body: 32 qword stores of rcx, one per 8-byte
+	; slot of the 256-byte chunk at r10, in a fixed scrambled order.
+	mov	[96+r10], rcx
+	mov	[r10], rcx
+	mov	[120+r10], rcx
+	mov	[184+r10], rcx
+	mov	[160+r10], rcx
+	mov	[176+r10], rcx
+	mov	[112+r10], rcx
+	mov	[80+r10], rcx
+	mov	[32+r10], rcx
+	mov	[128+r10], rcx
+	mov	[88+r10], rcx
+	mov	[40+r10], rcx
+	mov	[48+r10], rcx
+	mov	[72+r10], rcx
+	mov	[200+r10], rcx
+	mov	[24+r10], rcx
+	mov	[152+r10], rcx
+	mov	[16+r10], rcx
+	mov	[248+r10], rcx
+	mov	[56+r10], rcx
+	mov	[240+r10], rcx
+	mov	[208+r10], rcx
+	mov	[104+r10], rcx
+	mov	[216+r10], rcx
+	mov	[136+r10], rcx
+	mov	[232+r10], rcx
+	mov	[64+r10], rcx
+	mov	[224+r10], rcx
+	mov	[144+r10], rcx
+	mov	[192+r10], rcx
+	mov	[8+r10], rcx
+	mov	[168+r10], rcx
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2			; Next chunk while index < # of chunks.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole pass rdx times.
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2
+; Purpose:	Writes 128-bit values randomly to an area of memory.
+; Params:	rdi = ptr to array of chunk pointers
+;		rsi = # of chunks
+;		rdx = loops
+;		rcx = datum to write
+; Notes:	The 64-bit datum is duplicated into both halves of xmm0 and
+;		stored with movdqa, so every chunk must be 16-byte aligned.
+;		rdx must be nonzero because it is decremented before it is
+;		tested.
+;------------------------------------------------------------------------------
+	align 64
+RandomWriterSSE2:
+_RandomWriterSSE2:
+	push	r10			; r10 = current chunk pointer (restored on exit)
+	push	r11			; r11 = chunk index (restored on exit)
+
+	movq	xmm0, rcx		; Create duplicated 128-bit datum
+	movq	xmm1, rcx
+	pslldq	xmm1, 8			; Shift count is in BYTES: 8 moves the copy
+					; into the upper qword (64 would zero xmm1).
+	por	xmm0, xmm1		; xmm0 = rcx:rcx
+
+.L1:
+	xor	r11, r11		; Restart at the first chunk on every pass.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	; RandomWriterSSE2 inner body: 16 aligned 16-byte stores of xmm0,
+	; one per slot of the 256-byte chunk at r10, in scrambled order.
+	movdqa	[240+r10], xmm0
+	movdqa	[128+r10], xmm0
+	movdqa	[208+r10], xmm0
+	movdqa	[112+r10], xmm0
+	movdqa	[64+r10], xmm0
+	movdqa	[176+r10], xmm0
+	movdqa	[144+r10], xmm0
+	movdqa	[r10], xmm0
+	movdqa	[96+r10], xmm0
+	movdqa	[16+r10], xmm0
+	movdqa	[192+r10], xmm0
+	movdqa	[160+r10], xmm0
+	movdqa	[32+r10], xmm0
+	movdqa	[48+r10], xmm0
+	movdqa	[224+r10], xmm0
+	movdqa	[80+r10], xmm0
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2			; Next chunk while index < # of chunks.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole pass rdx times.
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2_bypass
+; Purpose:	Writes 128-bit values randomly into memory, bypassing caches.
+; Params:	rdi = ptr to array of chunk pointers
+;		rsi = # of chunks
+;		rdx = loops
+;		rcx = datum to write
+; Notes:	The 64-bit datum is duplicated into both halves of xmm0 and
+;		stored with movntdq (non-temporal), which needs 16-byte-aligned
+;		addresses. Each chunk is written at offsets 0..240 (256 bytes).
+;		rdx must be nonzero because it is decremented before it is
+;		tested.
+;------------------------------------------------------------------------------
+	align 64
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+	push	r10			; r10 = current chunk pointer (restored on exit)
+	push	r11			; r11 = chunk index (restored on exit)
+
+	movq	xmm0, rcx		; Create duplicated 128-bit datum
+	movq	xmm1, rcx
+	pslldq	xmm1, 8			; Shift count is in BYTES: 8 moves the copy
+					; into the upper qword (64 would zero xmm1).
+	por	xmm0, xmm1		; xmm0 = rcx:rcx
+
+.L1:
+	xor	r11, r11		; Restart at the first chunk on every pass.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	; 16 non-temporal 16-byte stores, one per slot, in scrambled order.
+	movntdq	[240+r10], xmm0
+	movntdq	[128+r10], xmm0
+	movntdq	[208+r10], xmm0
+	movntdq	[112+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[176+r10], xmm0
+	movntdq	[144+r10], xmm0
+	movntdq	[r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[16+r10], xmm0
+	movntdq	[192+r10], xmm0
+	movntdq	[160+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[224+r10], xmm0
+	movntdq	[80+r10], xmm0
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2			; Next chunk while index < # of chunks.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole pass rdx times.
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_128bytes
+; Purpose:	Reads 128-bit values sequentially from an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+; Notes:	The area is consumed in 128-byte steps (8 movdqa loads per
+;		step), so rdi must be 16-byte aligned. The end test runs after
+;		each step, so a length that is not a multiple of 128 is read
+;		past its end. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_128bytes:
+_ReaderSSE2_128bytes:
+	push	r10			; r10 = read cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movdqa	xmm0, [r10]	; Read aligned to 16-byte boundary.
+	movdqa	xmm0, [16+r10]
+	movdqa	xmm0, [32+r10]
+	movdqa	xmm0, [48+r10]
+	movdqa	xmm0, [64+r10]
+	movdqa	xmm0, [80+r10]
+	movdqa	xmm0, [96+r10]
+	movdqa	xmm0, [112+r10]
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2
+; Purpose:	Reads 128-bit values sequentially from an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+; Notes:	The area is consumed in 256-byte steps (16 movdqa loads per
+;		step), so rdi must be 16-byte aligned. The end test runs after
+;		each step, so a length that is not a multiple of 256 is read
+;		past its end. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2:
+_ReaderSSE2:
+	push	r10			; r10 = read cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movdqa	xmm0, [r10]	; Read aligned to 16-byte boundary.
+	movdqa	xmm0, [16+r10]
+	movdqa	xmm0, [32+r10]
+	movdqa	xmm0, [48+r10]
+	movdqa	xmm0, [64+r10]
+	movdqa	xmm0, [80+r10]
+	movdqa	xmm0, [96+r10]
+	movdqa	xmm0, [112+r10]
+
+	movdqa	xmm0, [128+r10]
+	movdqa	xmm0, [144+r10]
+	movdqa	xmm0, [160+r10]
+	movdqa	xmm0, [176+r10]
+	movdqa	xmm0, [192+r10]
+	movdqa	xmm0, [208+r10]
+	movdqa	xmm0, [224+r10]
+	movdqa	xmm0, [240+r10]
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderAVX
+; Purpose:	Reads 256-bit values sequentially from an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+; Notes:	The area is consumed in 256-byte steps (8 vmovdqa ymm loads
+;		per step), so rdi must be 32-byte aligned. The end test runs
+;		after each step. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+ReaderAVX:
+_ReaderAVX:
+	vzeroupper			; Clear upper halves of all ymm registers.
+
+	push	r10			; r10 = read cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	vmovdqa	ymm0, [r10]	; Read aligned to 32-byte boundary.
+	vmovdqa	ymm0, [32+r10]
+	vmovdqa	ymm0, [64+r10]
+	vmovdqa	ymm0, [96+r10]
+	vmovdqa	ymm0, [128+r10]
+	vmovdqa	ymm0, [160+r10]
+	vmovdqa	ymm0, [192+r10]
+	vmovdqa	ymm0, [224+r10]
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_bypass
+; Purpose:	Reads 128-bit values sequentially from an area of memory,
+;		using non-temporal (movntdqa) loads.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+; Notes:	The area is consumed in 256-byte steps (16 movntdqa loads per
+;		step); movntdqa is an SSE4.1 instruction and needs 16-byte
+;		alignment. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_bypass:
+_ReaderSSE2_bypass:
+	push	r10			; r10 = read cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movntdqa	xmm0, [r10]	; Read aligned to 16-byte boundary.
+	movntdqa	xmm0, [16+r10]
+	movntdqa	xmm0, [32+r10]
+	movntdqa	xmm0, [48+r10]
+	movntdqa	xmm0, [64+r10]
+	movntdqa	xmm0, [80+r10]
+	movntdqa	xmm0, [96+r10]
+	movntdqa	xmm0, [112+r10]
+
+	movntdqa	xmm0, [128+r10]
+	movntdqa	xmm0, [144+r10]
+	movntdqa	xmm0, [160+r10]
+	movntdqa	xmm0, [176+r10]
+	movntdqa	xmm0, [192+r10]
+	movntdqa	xmm0, [208+r10]
+	movntdqa	xmm0, [224+r10]
+	movntdqa	xmm0, [240+r10]
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2_128bytes_bypass
+; Purpose:	Reads 128-bit values sequentially from an area of memory,
+;		using non-temporal (movntdqa) loads in 128-byte steps.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+; Notes:	8 movntdqa loads (SSE4.1, 16-byte alignment required) per
+;		128-byte step. The end test runs after each step. rdx must be
+;		nonzero.
+;------------------------------------------------------------------------------
+	align 64
+ReaderSSE2_128bytes_bypass:
+_ReaderSSE2_128bytes_bypass:
+	push	r10			; r10 = read cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movntdqa	xmm0, [r10]	; Read aligned to 16-byte boundary.
+	movntdqa	xmm0, [16+r10]
+	movntdqa	xmm0, [32+r10]
+	movntdqa	xmm0, [48+r10]
+	movntdqa	xmm0, [64+r10]
+	movntdqa	xmm0, [80+r10]
+	movntdqa	xmm0, [96+r10]
+	movntdqa	xmm0, [112+r10]
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Writer
+; Purpose:	Writes 64-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	The area is filled in 256-byte steps (32 qword stores per
+;		step). rdx must be nonzero because it is decremented before
+;		it is tested.
+;------------------------------------------------------------------------------
+	align 64
+Writer:
+_Writer:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	; Writer step: 32 ascending qword stores fill 256 bytes at r10.
+	mov	[r10], rcx
+	mov	[8+r10], rcx
+	mov	[16+r10], rcx
+	mov	[24+r10], rcx
+	mov	[32+r10], rcx
+	mov	[40+r10], rcx
+	mov	[48+r10], rcx
+	mov	[56+r10], rcx
+	mov	[64+r10], rcx
+	mov	[72+r10], rcx
+	mov	[80+r10], rcx
+	mov	[88+r10], rcx
+	mov	[96+r10], rcx
+	mov	[104+r10], rcx
+	mov	[112+r10], rcx
+	mov	[120+r10], rcx
+	mov	[128+r10], rcx
+	mov	[136+r10], rcx
+	mov	[144+r10], rcx
+	mov	[152+r10], rcx
+	mov	[160+r10], rcx
+	mov	[168+r10], rcx
+	mov	[176+r10], rcx
+	mov	[184+r10], rcx
+	mov	[192+r10], rcx
+	mov	[200+r10], rcx
+	mov	[208+r10], rcx
+	mov	[216+r10], rcx
+	mov	[224+r10], rcx
+	mov	[232+r10], rcx
+	mov	[240+r10], rcx
+	mov	[248+r10], rcx
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Writer_128bytes
+; Purpose:	Writes 64-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	The area is filled in 128-byte steps (16 qword stores per
+;		step). The end test runs after each step. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+Writer_128bytes:
+_Writer_128bytes:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	mov	[r10], rcx
+	mov	[8+r10], rcx
+	mov	[16+r10], rcx
+	mov	[24+r10], rcx
+	mov	[32+r10], rcx
+	mov	[40+r10], rcx
+	mov	[48+r10], rcx
+	mov	[56+r10], rcx
+	mov	[64+r10], rcx
+	mov	[72+r10], rcx
+	mov	[80+r10], rcx
+	mov	[88+r10], rcx
+	mov	[96+r10], rcx
+	mov	[104+r10], rcx
+	mov	[112+r10], rcx
+	mov	[120+r10], rcx
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Writes 128-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	The area is filled in 256-byte steps (16 movdqa stores per
+;		step), so rdi must be 16-byte aligned. movq zero-extends, so
+;		each 16-byte store holds rcx in its low qword and zero in its
+;		high qword. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2:
+_WriterSSE2:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx		; xmm0 = 0:rcx
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movdqa	[r10], xmm0
+	movdqa	[16+r10], xmm0
+	movdqa	[32+r10], xmm0
+	movdqa	[48+r10], xmm0
+	movdqa	[64+r10], xmm0
+	movdqa	[80+r10], xmm0
+	movdqa	[96+r10], xmm0
+	movdqa	[112+r10], xmm0
+
+	movdqa	[128+r10], xmm0
+	movdqa	[144+r10], xmm0
+	movdqa	[160+r10], xmm0
+	movdqa	[176+r10], xmm0
+	movdqa	[192+r10], xmm0
+	movdqa	[208+r10], xmm0
+	movdqa	[224+r10], xmm0
+	movdqa	[240+r10], xmm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX
+; Purpose:	Writes 256-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	The area is filled in 256-byte steps (8 vmovdqa ymm stores per
+;		step), so rdi must be 32-byte aligned. Only the low 128 bits
+;		of ymm0 are loaded with the datum; the upper 128 bits stay
+;		zero from vzeroupper. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterAVX:
+_WriterAVX:
+	vzeroupper			; Clear upper halves of all ymm registers.
+
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	pinsrq	xmm0, rcx, 0		; Low qword of xmm0 = rcx (SSE4.1).
+	pinsrq	xmm0, rcx, 1		; High qword of xmm0 = rcx.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	vmovdqa	[r10], ymm0
+	vmovdqa	[32+r10], ymm0
+	vmovdqa	[64+r10], ymm0
+	vmovdqa	[96+r10], ymm0
+	vmovdqa	[128+r10], ymm0
+	vmovdqa	[160+r10], ymm0
+	vmovdqa	[192+r10], ymm0
+	vmovdqa	[224+r10], ymm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes
+; Purpose:	Writes 128-bit value sequentially to an area of memory,
+;		chunks are 128 bytes rather than 256.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	8 movdqa stores per 128-byte step, so rdi must be 16-byte
+;		aligned. movq zero-extends, so each store holds rcx in its low
+;		qword and zero in its high qword. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes:
+_WriterSSE2_128bytes:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx		; xmm0 = 0:rcx
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movdqa	[r10], xmm0
+	movdqa	[16+r10], xmm0
+	movdqa	[32+r10], xmm0
+	movdqa	[48+r10], xmm0
+	movdqa	[64+r10], xmm0
+	movdqa	[80+r10], xmm0
+	movdqa	[96+r10], xmm0
+	movdqa	[112+r10], xmm0
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Writes 128-bit value sequentially to an area of memory,
+;		using non-temporal (movntdq) stores.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	16 movntdq stores per 256-byte step; movntdq needs 16-byte
+;		alignment. movq zero-extends, so each store holds rcx in its
+;		low qword and zero in its high qword. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx		; xmm0 = 0:rcx
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movntdq	[r10], xmm0	; Write bypassing cache.
+	movntdq	[16+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[80+r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[112+r10], xmm0
+
+	movntdq	[128+r10], xmm0
+	movntdq	[144+r10], xmm0
+	movntdq	[160+r10], xmm0
+	movntdq	[176+r10], xmm0
+	movntdq	[192+r10], xmm0
+	movntdq	[208+r10], xmm0
+	movntdq	[224+r10], xmm0
+	movntdq	[240+r10], xmm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterAVX_bypass
+; Purpose:	Writes 256-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	The area is filled in 256-byte steps with 8 non-temporal
+;		32-byte stores, so every byte of the step is written. The
+;		stores must use ymm0: with xmm0 only 16 of every 32 bytes
+;		would be written. vmovntdq with a ymm source needs a
+;		32-byte-aligned address. Only the low qword of ymm0 carries
+;		the datum; the rest is zero. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterAVX_bypass:
+_WriterAVX_bypass:
+	vzeroupper			; Clear upper halves of all ymm registers.
+
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx		; Low qword = rcx, bits 64..127 zeroed.
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	vmovntdq	[r10], ymm0	; Write bypassing cache.
+	vmovntdq	[32+r10], ymm0
+	vmovntdq	[64+r10], ymm0
+	vmovntdq	[96+r10], ymm0
+	vmovntdq	[128+r10], ymm0
+	vmovntdq	[160+r10], ymm0
+	vmovntdq	[192+r10], ymm0
+	vmovntdq	[224+r10], ymm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_128bytes_bypass
+; Purpose:	Writes 128-bit value sequentially to an area of memory,
+;		using non-temporal (movntdq) stores in 128-byte steps.
+; Params:	rdi = ptr to memory area
+;		rsi = length in bytes
+;		rdx = loops
+;		rcx = quad to write
+; Notes:	8 movntdq stores per 128-byte step; movntdq needs 16-byte
+;		alignment. movq zero-extends, so each store holds rcx in its
+;		low qword and zero in its high qword. rdx must be nonzero.
+;------------------------------------------------------------------------------
+	align 64
+WriterSSE2_128bytes_bypass:
+_WriterSSE2_128bytes_bypass:
+	push	r10			; r10 = write cursor (restored on exit)
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx		; xmm0 = 0:rcx
+
+.L1:
+	mov	r10, rdi		; Rewind to the start of the area.
+
+.L2:
+	movntdq	[r10], xmm0	; Write bypassing cache.
+	movntdq	[16+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[80+r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[112+r10], xmm0
+
+	add	r10, 128
+	cmp	r10, rsi
+	jb	.L2			; Continue while cursor < end.
+
+	dec	rdx
+	jnz	.L1			; Repeat the whole sweep rdx times.
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackReader
+; Purpose:	Reads 64-bit values off the stack into registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops
+; Notes:	Seven qwords (56 bytes) are pushed as a scratch area and
+;		released again with "add rsp, 56" before returning. Each loop
+;		iteration performs 32 qword loads (256 bytes) from that area.
+;		rdi must be nonzero: it is decremented before it is tested.
+;------------------------------------------------------------------------------
+	align 64
+StackReader:
+_StackReader:
+	push	qword 7000	; [rsp+48]
+	push	qword 6000	; [rsp+40]
+	push	qword 5000	; [rsp+32]
+	push	qword 4000	; [rsp+24]
+	push	qword 3000	; [rsp+16]
+	push	qword 2000	; [rsp+8]
+	push	qword 1000	; [rsp]
+
+.L1:
+	; 32 loads; every result lands in rax and is discarded.
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp]
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp]
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+
+	sub	rdi, 1
+	jnz	.L1			; Repeat rdi times.
+
+	add	rsp, 56			; Drop the 7 pushed qwords.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackWriter
+; Purpose:	Writes 64-bit values into the stack from registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops
+; Notes:	Seven qwords (56 bytes) are pushed as a scratch area and
+;		released again with "add rsp, 56" before returning. Each loop
+;		iteration performs 32 qword stores of zero (256 bytes) into
+;		that area. rdi must be nonzero: it is decremented before it
+;		is tested.
+;------------------------------------------------------------------------------
+	align 64
+StackWriter:
+_StackWriter:
+	push	qword 7000	; [rsp+48]
+	push	qword 6000	; [rsp+40]
+	push	qword 5000	; [rsp+32]
+	push	qword 4000	; [rsp+24]
+	push	qword 3000	; [rsp+16]
+	push	qword 2000	; [rsp+8]
+	push	qword 1000	; [rsp]
+
+	xor	rax, rax		; Value stored by every write below.
+
+.L1:
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp], rax
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp], rax
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+	mov	[rsp+8], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+
+	sub	rdi, 1
+	jnz	.L1			; Repeat rdi times.
+
+	add	rsp, 56			; Drop the 7 pushed qwords.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 64-bit values between registers of
+;		the main register set.
+; Params:	rdi = loops
+; Notes:	32 register moves of 8 bytes each (256 bytes) per iteration.
+;		Only rax is written, so no register other than rax and the
+;		loop counter rdi is modified. rdi must be nonzero: it is
+;		decremented before it is tested.
+;------------------------------------------------------------------------------
+	align 64
+RegisterToRegister:
+_RegisterToRegister:
+.L1:
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+
+	sub	rdi, 1
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of
+;		the vector register set, in this case XMM.
+; Params:	rdi = loops
+; Notes:	movdqa is used for the register-to-register moves because it
+;		copies all 128 bits; movq between XMM registers copies only
+;		the low 64 bits, which would make each move 8 bytes instead
+;		of the 16 bytes this routine is documented to transfer.
+;		rdi must be nonzero: it is decremented before it is tested.
+;------------------------------------------------------------------------------
+	align 64
+VectorToVector:
+_VectorToVector:
+.L1:
+	movdqa	xmm0, xmm1	; Each move moves 16 bytes, so we need 16
+	movdqa	xmm0, xmm2	; moves to transfer a 256 byte chunk.
+	movdqa	xmm0, xmm3
+	movdqa	xmm2, xmm0
+	movdqa	xmm1, xmm2
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm1
+
+	movdqa	xmm3, xmm2
+	movdqa	xmm1, xmm3
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm1
+	movdqa	xmm1, xmm2
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm0
+
+	sub	rdi, 1
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVectorAVX
+; Purpose:	Reads/writes 256-bit values between registers of
+;		the vector register set, in this case YMM.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+	align 64
+VectorToVectorAVX:
+_VectorToVectorAVX:
+	vzeroupper			; Clear upper halves of all ymm registers.
+
+.L1:
+	vmovdqa	ymm0, ymm1	; Each move moves 32 bytes, so we need 8
+	vmovdqa	ymm0, ymm2	; moves to transfer a 256 byte chunk.
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm2, ymm0
+	vmovdqa	ymm1, ymm2
+	vmovdqa	ymm2, ymm1
+	vmovdqa	ymm0, ymm3
+	vmovdqa	ymm3, ymm1
+
+	sub	rdi, 1
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		clearing the upper unused bits.
+; Params:	rdi = loops
+; Notes:	32 movq transfers per iteration into xmm0..xmm3. rdi is both
+;		the loop counter and one of the source registers. rdi must be
+;		nonzero: it is decremented before it is tested.
+;------------------------------------------------------------------------------
+	align 64
+RegisterToVector:
+_RegisterToVector:
+.L1:
+	movq	xmm1, rax	; Each movq transfers 8 bytes, so we need
+	movq	xmm2, rsi	; 32 transfers to move a 256-byte chunk.
+	movq	xmm3, rbx
+	movq	xmm1, rcx
+	movq	xmm2, rsi
+	movq	xmm3, rsp
+	movq	xmm0, rdi
+	movq	xmm0, rdx
+
+	movq	xmm0, rax
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	movq	xmm0, rax
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	movq	xmm0, rax
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	dec	rdi
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToRegister
+; Purpose:	Writes lower 64 bits of vector register into 64-bit main
+;		register.
+; Params:	rdi = loops
+; Notes:	32 movq transfers of 8 bytes each (256 bytes) per iteration,
+;		all into rax; xmm0..xmm3 are read without being initialised
+;		here. rdi must be nonzero: it is decremented before it is
+;		tested.
+;------------------------------------------------------------------------------
+	align 64
+VectorToRegister:
+_VectorToRegister:
+.L1:
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm0
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	dec	rdi
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register8ToVector
+; Purpose:	Writes 8-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+; Notes:	The loop count is multiplied by 4 on entry, and each pass of
+;		the loop performs 64 one-byte inserts, i.e. 256 bytes per
+;		requested loop.
+;------------------------------------------------------------------------------
+	align 64
+Register8ToVector:
+_Register8ToVector:
+	sal	rdi, 2		; Force some repetition.
+.L1:
+	; Register8ToVector loop body: 8 groups of 8 pinsrb (SSE4.1) byte
+	; inserts into xmm0..xmm3. dil is read as data although rdi is also
+	; the loop counter; the inserted values are irrelevant to the timing.
+	pinsrb	xmm1, al, 0	; 64 transfers x 1 byte = 64 bytes
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 0
+	pinsrb	xmm1, bl, 1
+	pinsrb	xmm2, cl, 2
+	pinsrb	xmm3, dl, 3
+	pinsrb	xmm3, al, 4
+	pinsrb	xmm2, bl, 5
+	pinsrb	xmm1, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm1, r8b, 0
+	pinsrb	xmm2, r9b, 1
+	pinsrb	xmm3, r10b, 2
+	pinsrb	xmm1, r11b, 3
+	pinsrb	xmm2, r12b, 4
+	pinsrb	xmm3, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 0
+	pinsrb	xmm0, r9b, 1
+	pinsrb	xmm0, r10b, 2
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm1, al, 0
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 10
+	pinsrb	xmm1, bl, 11
+	pinsrb	xmm2, cl, 12
+	pinsrb	xmm3, dl, 13
+	pinsrb	xmm3, dil, 14
+	pinsrb	xmm2, cl, 15
+	pinsrb	xmm1, al, 6
+	pinsrb	xmm0, bpl, 7
+
+	pinsrb	xmm1, r8b, 10
+	pinsrb	xmm2, r9b, 11
+	pinsrb	xmm3, r10b, 12
+	pinsrb	xmm1, r11b, 13
+	pinsrb	xmm2, r12b, 14
+	pinsrb	xmm3, al, 15
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 9
+	pinsrb	xmm0, r9b, 8
+	pinsrb	xmm0, r10b, 11
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	dec	rdi
+	jnz	.L1			; Repeat until the scaled count reaches 0.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register16ToVector
+; Purpose:	Writes 16-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+; Notes:	The loop count is doubled on entry, and each pass of the loop
+;		performs 64 two-byte inserts, i.e. 256 bytes per requested
+;		loop.
+;------------------------------------------------------------------------------
+	align 64
+Register16ToVector:
+_Register16ToVector:
+	sal	rdi, 1		; Force some repetition.
+.L1:
+	; Register16ToVector loop body: 8 groups of 8 pinsrw word inserts
+	; into xmm0..xmm3. di is read as data although rdi is also the loop
+	; counter; the inserted values are irrelevant to the timing.
+	pinsrw	xmm1, ax, 0	; 64 transfers x 2 bytes = 128 bytes
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm1, r8w, 0
+	pinsrw	xmm2, r9w, 1
+	pinsrw	xmm3, r10w, 2
+	pinsrw	xmm1, r11w, 3
+	pinsrw	xmm2, r12w, 4
+	pinsrw	xmm3, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm0, r8w, 0
+	pinsrw	xmm0, r9w, 1
+	pinsrw	xmm0, r10w, 2
+	pinsrw	xmm0, r11w, 3
+	pinsrw	xmm0, r12w, 4
+	pinsrw	xmm0, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm1, ax, 0
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm1, r8w, 0
+	pinsrw	xmm2, r9w, 1
+	pinsrw	xmm3, r10w, 2
+	pinsrw	xmm1, r11w, 3
+	pinsrw	xmm2, r12w, 4
+	pinsrw	xmm3, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm0, r8w, 0
+	pinsrw	xmm0, r9w, 1
+	pinsrw	xmm0, r10w, 2
+	pinsrw	xmm0, r11w, 3
+	pinsrw	xmm0, r12w, 4
+	pinsrw	xmm0, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	dec	rdi
+	jnz	.L1			; Repeat until the scaled count reaches 0.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register32ToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+; Notes:	The loop count is used unscaled; each pass performs 64
+;		four-byte inserts (256 bytes). rdi must be nonzero: it is
+;		decremented before it is tested.
+;------------------------------------------------------------------------------
+	align 64
+Register32ToVector:
+_Register32ToVector:
+.L1:
+	pinsrd	xmm1, eax, 0	; Each xfer moves 4 bytes so to move 256 bytes
+	pinsrd	xmm2, ebx, 1	; we need 64 transfers.
+	; Remainder of the Register32ToVector loop body: pinsrd (SSE4.1)
+	; dword inserts into xmm0..xmm3. edi is read as data although rdi is
+	; also the loop counter.
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	pinsrd	xmm1, r8d, 0
+	pinsrd	xmm2, r9d, 1
+	pinsrd	xmm3, r10d, 2
+	pinsrd	xmm1, r11d, 3
+	pinsrd	xmm2, r12d, 0
+	pinsrd	xmm3, eax, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, ebx, 3
+
+	pinsrd	xmm0, r8d, 0
+	pinsrd	xmm0, r9d, 1
+	pinsrd	xmm0, r10d, 2
+	pinsrd	xmm0, r11d, 3
+	pinsrd	xmm0, r12d, 0
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm0, ebp, 0
+	pinsrd	xmm0, ebx, 0
+
+	pinsrd	xmm1, eax, 0
+	pinsrd	xmm2, ebx, 1
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	pinsrd	xmm1, r8d, 0
+	pinsrd	xmm2, r9d, 1
+	pinsrd	xmm3, r10d, 2
+	pinsrd	xmm1, r11d, 3
+	pinsrd	xmm2, r12d, 0
+	pinsrd	xmm3, eax, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, ebx, 3
+
+	pinsrd	xmm0, r8d, 0
+	pinsrd	xmm0, r9d, 1
+	pinsrd	xmm0, r10d, 2
+	pinsrd	xmm0, r11d, 3
+	pinsrd	xmm0, r12d, 0
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm0, ebp, 0
+	pinsrd	xmm0, ebx, 0
+
+	dec	rdi
+	jnz	.L1			; Repeat rdi times.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register64ToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+; Notes:	The loop count is doubled on entry; each pass of the loop
+;		performs 32 eight-byte inserts (256 bytes), so 512 bytes are
+;		transferred per requested loop.
+;------------------------------------------------------------------------------
+	align 64
+Register64ToVector:
+_Register64ToVector:
+	add	rdi, rdi		; Double the loop count.
+.L1:
+	pinsrq	xmm1, r8, 0	; Each xfer moves 8 bytes, therefore to do
+	pinsrq	xmm2, r9, 1	; 256 bytes we need 32 transfers.
+	; Remainder of the Register64ToVector loop body: pinsrq (SSE4.1)
+	; qword inserts; after the first group every insert targets xmm0.
+	pinsrq	xmm3, r10, 0
+	pinsrq	xmm1, r11, 1
+	pinsrq	xmm2, r12, 0
+	pinsrq	xmm3, rax, 1
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 1
+
+	pinsrq	xmm0, r8, 0
+	pinsrq	xmm0, r9, 1
+	pinsrq	xmm0, r10, 1
+	pinsrq	xmm0, r11, 1
+	pinsrq	xmm0, r12, 0
+	pinsrq	xmm0, rax, 0
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 0
+
+	pinsrq	xmm0, r8, 0
+	pinsrq	xmm0, r9, 1
+	pinsrq	xmm0, r10, 1
+	pinsrq	xmm0, r11, 1
+	pinsrq	xmm0, r12, 0
+	pinsrq	xmm0, rax, 0
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 0
+
+	pinsrq	xmm0, r8, 0
+	pinsrq	xmm0, r9, 1
+	pinsrq	xmm0, r10, 1
+	pinsrq	xmm0, r11, 1
+	pinsrq	xmm0, r12, 0
+	pinsrq	xmm0, rax, 0
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 0
+
+	dec	rdi
+	jnz	.L1			; Repeat until the doubled count reaches 0.
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Vector8ToRegister
+; Purpose:	Writes 8-bit vector register values into main register.
+; Params:	rdi = loops
+; Notes:	The loop count is multiplied by 8 on entry, and each pass of
+;		the loop performs 64 one-byte extracts, i.e. 512 bytes per
+;		requested loop.
+;------------------------------------------------------------------------------
+	align 64
+Vector8ToRegister:
+_Vector8ToRegister:
+	sal	rdi, 3		; Force some repetition.
+.L1:
+	; Vector8ToRegister loop body: 8 groups of 8 pextrb (SSE4.1) byte
+	; extracts from xmm0..xmm3, all into eax; the results are discarded.
+	pextrb	eax, xmm1, 0	; 64 transfers x 1 bytes = 64 bytes
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm1, 1
+	pextrb	eax, xmm2, 2
+	pextrb	eax, xmm3, 3
+	pextrb	eax, xmm3, 4
+	pextrb	eax, xmm2, 5
+	pextrb	eax, xmm1, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm1, 0
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm0, 1
+	pextrb	eax, xmm0, 2
+	pextrb	eax, xmm0, 3
+	pextrb	eax, xmm0, 4
+	pextrb	eax, xmm0, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm1, 0
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm1, 1
+	pextrb	eax, xmm2, 2
+	pextrb	eax, xmm3, 3
+	pextrb	eax, xmm3, 4
+	pextrb	eax, xmm2, 5
+	pextrb	eax, xmm1, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm1, 0
+	pextrb	eax, xmm2, 1
+	pextrb	eax, xmm3, 2
+	pextrb	eax, xmm1, 3
+	pextrb	eax, xmm2, 4
+	pextrb	eax, xmm3, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	pextrb	eax, xmm0, 0
+	pextrb	eax, xmm0, 1
+	pextrb	eax, xmm0, 2
+	pextrb	eax, xmm0, 3
+	pextrb	eax, xmm0, 4
+	pextrb	eax, xmm0, 5
+	pextrb	eax, xmm0, 6
+	pextrb	eax, xmm0, 7
+
+	dec	rdi
+	jnz	.L1			; Repeat until the scaled count reaches 0.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector16ToRegister
+; Purpose:	Writes 16-bit vector register values into main register.
+; Params:	rdi = loops
+; Notes:	The loop count is multiplied by 4 on entry, and each pass of
+;		the loop performs 64 two-byte extracts, i.e. 512 bytes per
+;		requested loop.
+;------------------------------------------------------------------------------
+	align 64
+Vector16ToRegister:
+_Vector16ToRegister:
+	sal	rdi, 2		; Force some repetition.
+.L1:
+	; Vector16ToRegister loop body: 8 groups of 8 pextrw word extracts
+	; from xmm0..xmm3, all into eax; the results are discarded.
+	pextrw	eax, xmm1, 0	; 64 transfers x 2 bytes = 128 bytes
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm1, 1
+	pextrw	eax, xmm2, 2
+	pextrw	eax, xmm3, 3
+	pextrw	eax, xmm3, 4
+	pextrw	eax, xmm2, 5
+	pextrw	eax, xmm1, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm1, 0
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm0, 1
+	pextrw	eax, xmm0, 2
+	pextrw	eax, xmm0, 3
+	pextrw	eax, xmm0, 4
+	pextrw	eax, xmm0, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm1, 0
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm1, 1
+	pextrw	eax, xmm2, 2
+	pextrw	eax, xmm3, 3
+	pextrw	eax, xmm3, 4
+	pextrw	eax, xmm2, 5
+	pextrw	eax, xmm1, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm1, 0
+	pextrw	eax, xmm2, 1
+	pextrw	eax, xmm3, 2
+	pextrw	eax, xmm1, 3
+	pextrw	eax, xmm2, 4
+	pextrw	eax, xmm3, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	pextrw	eax, xmm0, 0
+	pextrw	eax, xmm0, 1
+	pextrw	eax, xmm0, 2
+	pextrw	eax, xmm0, 3
+	pextrw	eax, xmm0, 4
+	pextrw	eax, xmm0, 5
+	pextrw	eax, xmm0, 6
+	pextrw	eax, xmm0, 7
+
+	dec	rdi
+	jnz	.L1			; Repeat until the scaled count reaches 0.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector32ToRegister
+; Purpose:	Writes 32-bit vector register values into main register.
; Params:	rdi = loops (doubled on entry)
; Notes:	Every pass of the loop issues 64 pextrd instructions, each of
;		which copies one 32-bit lane (0-3) of xmm0-xmm3 into eax.
;		Overwrites eax and rdi. The counter is tested only after the
;		body has run (dec/jnz), so a count of 0 wraps around instead
;		of skipping the loop.
;------------------------------------------------------------------------------
	align 64
Vector32ToRegister:
_Vector32ToRegister:
	add	rdi, rdi	; rdi = loops * 2.
.L1:
	pextrd	eax, xmm1, 0	; 64 xfers x 4 bytes = 256 bytes
	pextrd	eax, xmm2, 1
	pextrd	eax, xmm3, 2
	pextrd	eax, xmm1, 3
	pextrd	eax, xmm2, 0
	pextrd	eax, xmm3, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3

	; Lanes 0..3 ascending across xmm0..xmm3, then the same pairs reversed.
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm1, 1
	pextrd	eax, xmm2, 2
	pextrd	eax, xmm3, 3
	pextrd	eax, xmm3, 3
	pextrd	eax, xmm2, 2
	pextrd	eax, xmm1, 1
	pextrd	eax, xmm0, 0

	pextrd	eax, xmm1, 0
	pextrd	eax, xmm2, 1
	pextrd	eax, xmm3, 2
	pextrd	eax, xmm1, 3
	pextrd	eax, xmm2, 0
	pextrd	eax, xmm3, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3

	; xmm0 only: lanes 0-3, then lane 0 four more times.
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 0

	pextrd	eax, xmm1, 0
	pextrd	eax, xmm2, 1
	pextrd	eax, xmm3, 2
	pextrd	eax, xmm1, 3
	pextrd	eax, xmm2, 0
	pextrd	eax, xmm3, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3

	pextrd	eax, xmm0, 0
	pextrd	eax, xmm1, 1
	pextrd	eax, xmm2, 2
	pextrd	eax, xmm3, 3
	pextrd	eax, xmm3, 3
	pextrd	eax, xmm2, 2
	pextrd	eax, xmm1, 1
	pextrd	eax, xmm0, 0

	pextrd	eax, xmm1, 0
	pextrd	eax, xmm2, 1
	pextrd	eax, xmm3, 2
	pextrd	eax, xmm1, 3
	pextrd	eax, xmm2, 0
	pextrd	eax, xmm3, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3

	; xmm0 only: lanes 0-3 twice.
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3
	pextrd	eax, xmm0, 0
	pextrd	eax, xmm0, 1
	pextrd	eax, xmm0, 2
	pextrd	eax, xmm0, 3

	dec	rdi		; One pass done; repeat until the counter hits zero.
	jnz	.L1
	ret

;------------------------------------------------------------------------------
; Name:		Vector64ToRegister
; Purpose:	Writes 64-bit vector register values into main register.
; Params:	rdi = loops (doubled on entry)
; Notes:	Every pass of the loop issues 32 pextrq instructions, each of
;		which copies one 64-bit half (0 or 1) of xmm0-xmm3 into rax.
;		Overwrites rax and rdi. The counter is tested only after the
;		body has run (dec/jnz), so a count of 0 wraps around instead
;		of skipping the loop.
;------------------------------------------------------------------------------
	align 64
Vector64ToRegister:
_Vector64ToRegister:
	add	rdi, rdi	; rdi = loops * 2.
.L1:
	pextrq	rax, xmm1, 0	; 32 transfers by 8 bytes = 256 bytes
	pextrq	rax, xmm2, 1
	pextrq	rax, xmm3, 0
	pextrq	rax, xmm1, 1
	pextrq	rax, xmm2, 0
	pextrq	rax, xmm3, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1

	; xmm0 only, alternating low and high half.
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1

	pextrq	rax, xmm1, 0
	pextrq	rax, xmm2, 1
	pextrq	rax, xmm3, 0
	pextrq	rax, xmm1, 1
	pextrq	rax, xmm2, 0
	pextrq	rax, xmm3, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1

	; xmm0 only, alternating low and high half.
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1
	pextrq	rax, xmm0, 0
	pextrq	rax, xmm0, 1

	dec	rdi		; One pass done; repeat until the counter hits zero.
	jnz	.L1
	ret

;------------------------------------------------------------------------------
; Name:		CopyAVX
; Purpose:	Copies memory chunks that are 32-byte aligned.
;		The length is rounded down to a multiple of 256 bytes and the
;		same region is copied "loops" times, 256 bytes per inner pass.
; Params:	rdi = ptr to destination memory area
;		rsi = ptr to source memory area
;		rdx = length in bytes
;		rcx = loops
; Notes:	Returns without copying when the rounded length or the loop
;		count is zero. Overwrites ymm0-ymm3, rdx and rcx; rdi and rsi
;		point at the start of their areas again on return.
;------------------------------------------------------------------------------
	align 64
CopyAVX:
_CopyAVX:
	vzeroupper

	push	r10

	shr	rdx, 8		; Ensure length is multiple of 256.
	shl	rdx, 8

	; Nothing to do for less than one 256-byte chunk or for zero loops.
	; Without these checks the sub/jnz and dec/jnz counters below would
	; wrap around and the copy would run far past both buffers.
	test	rdx, rdx
	jz	.done
	test	rcx, rcx
	jz	.done

	prefetcht0	[rsi]

.L1:
	mov	r10, rdx	; r10 = bytes left in this pass.

.L2:
	vmovdqa	ymm0, [rsi]
	vmovdqa	ymm1, [32+rsi]
	vmovdqa	ymm2, [64+rsi]
	vmovdqa	ymm3, [96+rsi]

	vmovdqa	[rdi], ymm0
	vmovdqa	[32+rdi], ymm1
	vmovdqa	[64+rdi], ymm2
	vmovdqa	[96+rdi], ymm3

	vmovdqa	ymm0, [128+rsi]
	vmovdqa	ymm1, [128+32+rsi]
	vmovdqa	ymm2, [128+64+rsi]
	vmovdqa	ymm3, [128+96+rsi]

	vmovdqa	[128+rdi], ymm0
	vmovdqa	[128+32+rdi], ymm1
	vmovdqa	[128+64+rdi], ymm2
	vmovdqa	[128+96+rdi], ymm3

	add	rsi, 256
	add	rdi, 256

	sub	r10, 256
	jnz	.L2

	sub	rsi, rdx	; rsi now points to start.
	sub	rdi, rdx	; rdi now points to start.

	dec	rcx
	jnz	.L1

.done:
	; Clear the upper YMM halves dirtied above so that SSE code executed
	; after we return does not pay the AVX/SSE state-transition penalty.
	vzeroupper

	pop	r10

	ret


;------------------------------------------------------------------------------
; Name:		CopySSE
; Purpose:	Copies memory chunks that are 16-byte aligned.
;		The length is rounded down to a multiple of 256 bytes and the
;		same region is copied "loops" times, 256 bytes per inner pass,
;		using all sixteen XMM registers.
; Params:	rdi = ptr to destination memory area
;		rsi = ptr to source memory area
;		rdx = length in bytes
;		rcx = loops
; Notes:	Returns without copying when the rounded length or the loop
;		count is zero. Overwrites xmm0-xmm3, rdx and rcx; xmm4-xmm15
;		and r10 are preserved; rdi and rsi point at the start of
;		their areas again on return.
;------------------------------------------------------------------------------
	align 64
CopySSE:
_CopySSE:
	push	r10

	shr	rdx, 8		; Ensure length is multiple of 256.
	shl	rdx, 8

	; Nothing to do for less than one 256-byte chunk or for zero loops.
	; Checked before the XMM save area is set up so that the early exit
	; only has to undo the push above.
	test	rdx, rdx
	jz	.done
	test	rcx, rcx
	jz	.done

	prefetcht0	[rsi]

	; Save our non-parameter XMM registers.
	; (12 registers x 16 bytes = 192 bytes of stack.)
	sub	rsp, 192
	movdqu	[rsp], xmm4
	movdqu	[16+rsp], xmm5
	movdqu	[32+rsp], xmm6
	movdqu	[48+rsp], xmm7
	movdqu	[64+rsp], xmm8
	movdqu	[80+rsp], xmm9
	movdqu	[96+rsp], xmm10
	movdqu	[112+rsp], xmm11
	movdqu	[128+rsp], xmm12
	movdqu	[144+rsp], xmm13
	movdqu	[160+rsp], xmm14
	movdqu	[176+rsp], xmm15

.L1:
	mov	r10, rdx	; r10 = bytes left in this pass.

.L2:
	movdqa	xmm0, [rsi]
	movdqa	xmm1, [16+rsi]
	movdqa	xmm2, [32+rsi]
	movdqa	xmm3, [48+rsi]

	movdqa	[rdi], xmm0
	movdqa	[16+rdi], xmm1
	movdqa	[32+rdi], xmm2
	movdqa	[48+rdi], xmm3

	movdqa	xmm4, [64+rsi]
	movdqa	xmm5, [80+rsi]
	movdqa	xmm6, [96+rsi]
	movdqa	xmm7, [112+rsi]

	movdqa	[64+rdi], xmm4
	movdqa	[80+rdi], xmm5
	movdqa	[96+rdi], xmm6
	movdqa	[112+rdi], xmm7

	movdqa	xmm8, [128+rsi]
	movdqa	xmm9, [144+rsi]
	movdqa	xmm10, [160+rsi]
	movdqa	xmm11, [176+rsi]

	movdqa	[128+rdi], xmm8
	movdqa	[144+rdi], xmm9
	movdqa	[160+rdi], xmm10
	movdqa	[176+rdi], xmm11

	movdqa	xmm12, [192+rsi]
	movdqa	xmm13, [208+rsi]
	movdqa	xmm14, [224+rsi]
	movdqa	xmm15, [240+rsi]

	movdqa	[192+rdi], xmm12
	movdqa	[208+rdi], xmm13
	movdqa	[224+rdi], xmm14
	movdqa	[240+rdi], xmm15

	add	rsi, 256
	add	rdi, 256

	sub	r10, 256
	jnz	.L2

	sub	rsi, rdx	; rsi now points to start.
	sub	rdi, rdx	; rdi now points to start.

	dec	rcx
	jnz	.L1

	; Restore the XMM registers saved on entry.
	movdqu	xmm4, [rsp]
	movdqu	xmm5, [16+rsp]
	movdqu	xmm6, [32+rsp]
	movdqu	xmm7, [48+rsp]
	movdqu	xmm8, [64+rsp]
	movdqu	xmm9, [80+rsp]
	movdqu	xmm10, [96+rsp]
	movdqu	xmm11, [112+rsp]
	movdqu	xmm12, [128+rsp]
	movdqu	xmm13, [144+rsp]
	movdqu	xmm14, [160+rsp]
	movdqu	xmm15, [176+rsp]
	add	rsp, 192

.done:
	pop	r10

	ret


;------------------------------------------------------------------------------
; Name:		CopySSE_128bytes
; Purpose:	Copies memory chunks that are 16-byte aligned.
; Params:	rdi = ptr to destination memory area
;		rsi = ptr to source memory area
;		rdx = length in bytes
;		rcx = loops
; Notes:	The length is rounded down to a multiple of 128 bytes and the
;		same region is copied "loops" times, 128 bytes per inner pass,
;		using xmm0-xmm7. Returns without copying when the rounded
;		length or the loop count is zero. Overwrites xmm0-xmm3, rdx
;		and rcx; xmm4-xmm7 and r10 are preserved; rdi and rsi point
;		at the start of their areas again on return.
;------------------------------------------------------------------------------
	align 64
CopySSE_128bytes:
_CopySSE_128bytes:
	push	r10

	shr	rdx, 7		; Ensure length is multiple of 128.
	shl	rdx, 7

	; Nothing to do for less than one 128-byte chunk or for zero loops.
	; Without these checks the sub/jnz and dec/jnz counters below would
	; wrap around and the copy would run far past both buffers.
	; Checked before the XMM save area is set up so that the early exit
	; only has to undo the push above.
	test	rdx, rdx
	jz	.done
	test	rcx, rcx
	jz	.done

	prefetcht0	[rsi]

	; Save our non-parameter XMM registers.
	; (4 registers x 16 bytes = 64 bytes of stack.)
	sub	rsp, 64
	movdqu	[rsp], xmm4
	movdqu	[16+rsp], xmm5
	movdqu	[32+rsp], xmm6
	movdqu	[48+rsp], xmm7

.L1:
	mov	r10, rdx	; r10 = bytes left in this pass.

.L2:
	movdqa	xmm0, [rsi]
	movdqa	xmm1, [16+rsi]
	movdqa	xmm2, [32+rsi]
	movdqa	xmm3, [48+rsi]

	movdqa	[rdi], xmm0
	movdqa	[16+rdi], xmm1
	movdqa	[32+rdi], xmm2
	movdqa	[48+rdi], xmm3

	movdqa	xmm4, [64+rsi]
	movdqa	xmm5, [80+rsi]
	movdqa	xmm6, [96+rsi]
	movdqa	xmm7, [112+rsi]

	movdqa	[64+rdi], xmm4
	movdqa	[80+rdi], xmm5
	movdqa	[96+rdi], xmm6
	movdqa	[112+rdi], xmm7

	add	rsi, 128
	add	rdi, 128

	sub	r10, 128
	jnz	.L2

	sub	rsi, rdx	; rsi now points to start.
	sub	rdi, rdx	; rdi now points to start.

	dec	rcx
	jnz	.L1

	; Restore the XMM registers saved on entry.
	movdqu	xmm4, [rsp]
	movdqu	xmm5, [16+rsp]
	movdqu	xmm6, [32+rsp]
	movdqu	xmm7, [48+rsp]
	add	rsp, 64

.done:
	pop	r10

	ret