summaryrefslogtreecommitdiffstats
path: root/main.c
diff options
context:
space:
mode:
Diffstat (limited to 'main.c')
-rwxr-xr-xmain.c2442
1 files changed, 2442 insertions, 0 deletions
diff --git a/main.c b/main.c
new file mode 100755
index 0000000..2d293a8
--- /dev/null
+++ b/main.c
@@ -0,0 +1,2442 @@
+/*============================================================================
+ bandwidth 1.1, a benchmark to estimate memory transfer bandwidth.
+ Copyright (C) 2005-2014 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at veritas@comcast.net.
+ *===========================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <math.h>
+
+#include <netdb.h> // gethostbyname
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define GRAPH_WIDTH 1440
+#define GRAPH_HEIGHT 900
+
+#include "defs.h"
+#include "BMP.h"
+#include "BMPGraphing.h"
+
+#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
+#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co"
+
+#ifdef __WIN32__
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/fb.h>
+#include <sys/mman.h>
+#endif
+
+static int network_port = NETWORK_DEFAULT_PORTNUM;
+
+enum {
+ NO_SSE2,
+ SSE2,
+ SSE2_BYPASS,
+ AVX,
+ AVX_BYPASS,
+ LODSQ,
+ LODSD,
+ LODSW,
+ LODSB
+};
+
+static BMPGraph *graph = NULL;
+
+static bool use_sse2 = true;
+static bool use_sse4 = true;
+static bool is_intel = false;
+static bool is_amd = false;
+
+static uint32_t cpu_has_mmx = 0;
+static uint32_t cpu_has_sse = 0;
+static uint32_t cpu_has_sse2 = 0;
+static uint32_t cpu_has_sse3 = 0;
+static uint32_t cpu_has_ssse3 = 0;
+static uint32_t cpu_has_sse4a = 0;
+static uint32_t cpu_has_sse41 = 0;
+static uint32_t cpu_has_sse42 = 0;
+static uint32_t cpu_has_aes = 0;
+static uint32_t cpu_has_avx = 0;
+static uint32_t cpu_has_avx2 = 0;
+static uint32_t cpu_has_64bit = 0;
+static uint32_t cpu_has_xd = 0;
+
+//----------------------------------------
+// Parameters for the tests.
+//
+
+static long usec_per_test = 5000000; // 5 seconds per memory test.
+
+static int chunk_sizes[] = {
+ 128,
+ 256,
+ 384,
+ 512,
+ 640,
+ 768,
+ 896,
+ 1024,
+ 1280,
+ 2048,
+ 3072,
+ 4096,
+ 6144,
+ 8192, // Some processors' L1 data caches are only 8kB.
+ 12288,
+ 16384,
+ 20480,
+ 24576,
+ 28672,
+ 32768, // Common L1 data cache size.
+ 34*1024,
+ 36*1024,
+ 40960,
+ 49152,
+ 65536,
+ 131072, // Old L2 cache size.
+ 192 * 1024,
+ 256 * 1024, // Old L2 cache size.
+ 320 * 1024,
+ 384 * 1024,
+ 512 * 1024, // Old L2 cache size.
+ 768 * 1024,
+ 1 << 20, // 1 MB = common L2 cache size.
+ (1024 + 256) * 1024, // 1.25
+ (1024 + 512) * 1024, // 1.5
+ (1024 + 768) * 1024, // 1.75
+ 1 << 21, // 2 MB = common L2 cache size.
+ (2048 + 256) * 1024, // 2.25
+ (2048 + 512) * 1024, // 2.5
+ (2048 + 768) * 1024, // 2.75
+ 3072 * 1024, // 3 MB = common L2 cache size.
+ 3407872, // 3.25 MB
+ 3 * 1024 * 1024 + 1024 * 512, // 3.5 MB
+ 1 << 22, // 4 MB
+ 5242880, // 5 megs
+ 6291456, // 6 megs (common L2 cache size)
+ 7 * 1024 * 1024,
+ 8 * 1024 * 1024, // Xeon E3's often has 8MB L3
+ 9 * 1024 * 1024,
+ 10 * 1024 * 1024, // Xeon E5-2609 has 10MB L3
+ 12 * 1024 * 1024,
+ 14 * 1024 * 1024,
+ 15 * 1024 * 1024, // Xeon E6-2630 has 15MB L3
+ 16 * 1024 * 1024,
+ 20 * 1024 * 1024, // Xeon E5-2690 has 20MB L3
+ 21 * 1024 * 1024,
+ 32 * 1024 * 1024,
+ 48 * 1024 * 1024,
+ 64 * 1024 * 1024,
+ 72 * 1024 * 1024,
+ 96 * 1024 * 1024,
+ 128 * 1024 * 1024,
+ 0
+};
+
+static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)];
+
+//----------------------------------------------------------------------------
+// Name: error
+// Purpose: Complain and exit.
+//----------------------------------------------------------------------------
+void error (char *s)
+{
+#ifndef __WIN32__
+ fprintf (stderr, "Error: %s\n", s);
+ exit (1);
+#else
+ wchar_t tmp [200];
+ int i;
+ for (i = 0; s[i]; i++)
+ tmp[i] = s[i];
+ tmp[i] = 0;
+ MessageBoxW (0, tmp, L"Error", 0);
+ ExitProcess (0);
+#endif
+}
+
+//============================================================================
+// Output buffer logic.
+// This is somewhat vestigial code, originating with Windows Mobile ARM port.
+//============================================================================
+
+#define MSGLEN 10000
+static wchar_t msg [MSGLEN];
+
+void print (wchar_t *s)
+{
+ wcsncat (msg, s, MSGLEN-1);
+}
+
+void newline ()
+{
+ wcsncat (msg, L"\n", MSGLEN-1);
+}
+
+void println (wchar_t *s)
+{
+ wcsncat (msg, s, MSGLEN-1);
+ newline ();
+}
+
+void print_int (int d)
+{
+ swprintf (msg + wcslen (msg), MSGLEN, L"%d", d);
+}
+
+void print_uint (unsigned int d)
+{
+ swprintf (msg + wcslen (msg), MSGLEN, L"%lu", d);
+}
+
+void println_int (int d)
+{
+ print_int (d);
+ newline ();
+}
+
+void print_result (long double result)
+{
+ swprintf (msg + wcslen (msg), MSGLEN, L"%.1Lf MB/s", result);
+}
+
+void dump (FILE *f)
+{
+ if (!f)
+ f = stdout;
+
+ int i = 0;
+ while (msg[i]) {
+ char ch = (char) msg[i];
+ fputc (ch, f);
+ i++;
+ }
+
+ msg [0] = 0;
+}
+
+void flush ()
+{
+ dump (NULL);
+ fflush (stdout);
+}
+
+void print_size (unsigned long size)
+{
+ if (size < 1536) {
+ print_int (size);
+ print (L" B");
+ }
+ else if (size < (1<<20)) {
+ print_int (size >> 10);
+ print (L" kB");
+ } else {
+ print_int (size >> 20);
+ switch ((size >> 18) & 3) {
+ case 1: print (L".25"); break;
+ case 2: print (L".5"); break;
+ case 3: print (L".75"); break;
+ }
+ print (L" MB");
+ }
+}
+
+//============================================================================
+// Timing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name: mytime
+// Purpose: Reports time in microseconds.
+//----------------------------------------------------------------------------
+unsigned long mytime ()
+{
+#ifndef __WIN32__
+ struct timeval tv;
+ struct timezone tz;
+ memset (&tz, 0, sizeof(struct timezone));
+ gettimeofday (&tv, &tz);
+ return 1000000 * tv.tv_sec + tv.tv_usec;
+#else
+ return 1000 * GetTickCount (); // accurate enough.
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Name: calculate_result
+// Purpose: Calculates and prints a result.
+// Returns: 10 times the number of megabytes per second.
+//----------------------------------------------------------------------------
+int
+calculate_result (unsigned long chunk_size, long long total_loops, long diff)
+{
+ if (!diff)
+ error ("Zero time difference.");
+
+// printf ("\nIn calculate_result, chunk_size=%ld, total_loops=%lld, diff=%ld\n", chunk_size, total_loops, diff);
+ long double result = (long double) chunk_size;
+ result *= (long double) total_loops;
+ result *= 1000000.; // Convert to microseconds.
+ result /= 1048576.;
+ result /= (long double) diff;
+
+ print_result (result);
+
+ return (long) (10.0 * result);
+}
+
+//============================================================================
+// Tests.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name: do_write
+// Purpose: Performs write on chunk of memory of specified size.
+//----------------------------------------------------------------------------
+int
+do_write (unsigned long size, int mode, bool random)
+{
+ unsigned char *chunk;
+ unsigned char *chunk0;
+ unsigned long loops;
+ unsigned long long total_count=0;
+#ifdef __x86_64__
+ unsigned long value = 0x1234567689abcdef;
+#else
+ unsigned long value = 0x12345678;
+#endif
+ unsigned long diff=0, t0;
+ unsigned long tmp;
+ unsigned long **chunk_ptrs = NULL;
+
+ if (size & 127)
+ error ("do_write(): chunk size is not multiple of 128.");
+
+ //-------------------------------------------------
+ chunk0 = malloc (size+64);
+ chunk = chunk0;
+ if (!chunk)
+ error ("Out of memory");
+
+ tmp = (unsigned long) chunk;
+ if (tmp & 31) {
+ tmp -= (tmp & 31);
+ tmp += 32;
+ chunk = (unsigned char*) tmp;
+ }
+
+ //----------------------------------------
+ // Set up random pointers to chunks.
+ //
+ if (random) {
+ tmp = size/256;
+ chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+ if (!chunk_ptrs)
+ error ("Out of memory.");
+
+ //----------------------------------------
+ // Store pointers to all chunks into array.
+ //
+ int i;
+ for (i = 0; i < tmp; i++) {
+ chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+ }
+
+ //----------------------------------------
+ // Randomize the array of chunk pointers.
+ //
+ int k = 100;
+ while (k--) {
+ for (i = 0; i < tmp; i++) {
+ int j = rand() % tmp;
+ if (i != j) {
+ unsigned long *ptr = chunk_ptrs [i];
+ chunk_ptrs [i] = chunk_ptrs [j];
+ chunk_ptrs [j] = ptr;
+ }
+ }
+ }
+ }
+
+ //-------------------------------------------------
+ if (random)
+ print (L"Random write ");
+ else
+ print (L"Sequential write ");
+
+ switch (mode) {
+ case SSE2:
+ print (L"(128-bit), size = ");
+ break;
+ case AVX:
+ print (L"(256-bit), size = ");
+ break;
+ case AVX_BYPASS:
+ print (L"bypassing cache (256-bit), size = ");
+ break;
+ case SSE2_BYPASS:
+ print (L"bypassing cache (128-bit), size = ");
+ break;
+ default:
+#ifdef __x86_64__
+ print (L"(64-bit), size = ");
+#else
+ print (L"(32-bit), size = ");
+#endif
+ }
+
+ print_size (size);
+ print (L", ");
+
+ loops = (1 << 26) / size;// XX need to adjust for CPU MHz
+ if (loops < 1)
+ loops = 1;
+
+ t0 = mytime ();
+
+ while (diff < usec_per_test) {
+ total_count += loops;
+
+ switch (mode) {
+ case SSE2:
+ if (random)
+ RandomWriterSSE2 (chunk_ptrs, size/256, loops, value);
+ else {
+ if (size & 128)
+ WriterSSE2_128bytes (chunk, size, loops, value);
+ else
+ WriterSSE2 (chunk, size, loops, value);
+ }
+ break;
+
+ case SSE2_BYPASS:
+ if (random)
+ RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value);
+ else {
+ if (size & 128)
+ WriterSSE2_128bytes_bypass (chunk, size, loops, value);
+ else
+ WriterSSE2_bypass (chunk, size, loops, value);
+ }
+ break;
+
+ case AVX:
+ if (!random) {
+ WriterAVX (chunk, size, loops, value);
+ }
+ break;
+
+ case AVX_BYPASS:
+ if (!random) {
+ WriterAVX_bypass (chunk, size, loops, value);
+ }
+ break;
+
+ default:
+ if (random)
+ RandomWriter (chunk_ptrs, size/256, loops, value);
+ else {
+ if (size & 128)
+ Writer_128bytes (chunk, size, loops, value);
+ else
+ Writer (chunk, size, loops, value);
+ }
+ }
+
+ diff = mytime () - t0;
+ }
+
+ print (L"loops = ");
+ print_uint (total_count);
+ print (L", ");
+
+ flush ();
+
+ int result = calculate_result (size, total_count, diff);
+ newline ();
+
+ flush ();
+
+ free ((void*)chunk0);
+
+ if (chunk_ptrs)
+ free (chunk_ptrs);
+
+ return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name: do_read
+// Purpose: Performs sequential read on chunk of memory of specified size.
+//----------------------------------------------------------------------------
+int
+do_read (unsigned long size, int mode, bool random)
+{
+ unsigned long loops;
+ unsigned long long total_count = 0;
+ unsigned long t0, diff=0;
+ unsigned char *chunk;
+ unsigned char *chunk0;
+ unsigned long tmp;
+ unsigned long **chunk_ptrs = NULL;
+
+ if (size & 127)
+ error ("do_read(): chunk size is not multiple of 128.");
+
+ //-------------------------------------------------
+ chunk0 = chunk = malloc (size+64);
+ if (!chunk)
+ error ("Out of memory");
+
+ memset (chunk, 0, size);
+
+ tmp = (unsigned long) chunk;
+ if (tmp & 31) {
+ tmp -= (tmp & 31);
+ tmp += 32;
+ chunk = (unsigned char*) tmp;
+ }
+
+ //----------------------------------------
+ // Set up random pointers to chunks.
+ //
+ if (random) {
+ int tmp = size/256;
+ chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+ if (!chunk_ptrs)
+ error ("Out of memory.");
+
+ //----------------------------------------
+ // Store pointers to all chunks into array.
+ //
+ int i;
+ for (i = 0; i < tmp; i++) {
+ chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+ }
+
+ //----------------------------------------
+ // Randomize the array of chunk pointers.
+ //
+ int k = 100;
+ while (k--) {
+ for (i = 0; i < tmp; i++) {
+ int j = rand() % tmp;
+ if (i != j) {
+ unsigned long *ptr = chunk_ptrs [i];
+ chunk_ptrs [i] = chunk_ptrs [j];
+ chunk_ptrs [j] = ptr;
+ }
+ }
+ }
+ }
+
+ //-------------------------------------------------
+ if (random)
+ print (L"Random read ");
+ else
+ print (L"Sequential read ");
+
+ switch (mode) {
+ case SSE2:
+ print (L"(128-bit), size = ");
+ break;
+ case LODSB:
+ print (L"(8-bit LODSB), size = ");
+ break;
+ case LODSW:
+ print (L"(16-bit LODSW), size = ");
+ break;
+ case LODSD:
+ print (L"(32-bit LODSD), size = ");
+ break;
+ case LODSQ:
+ print (L"(64-bit LODSQ), size = ");
+ break;
+ case AVX:
+ print (L"(256-bit), size = ");
+ break;
+ case AVX_BYPASS:
+ print (L"bypassing cache (256-bit), size = ");
+ break;
+ case SSE2_BYPASS:
+ print (L"bypassing cache (128-bit), size = ");
+ break;
+ default:
+#ifdef __x86_64__
+ print (L"(64-bit), size = ");
+#else
+ print (L"(32-bit), size = ");
+#endif
+ }
+
+ print_size (size);
+ print (L", ");
+
+ flush ();
+
+ loops = (1 << 26) / size; // XX need to adjust for CPU MHz
+ if (loops < 1)
+ loops = 1;
+
+ t0 = mytime ();
+
+ while (diff < usec_per_test) {
+ total_count += loops;
+
+ switch (mode) {
+ case SSE2:
+ if (random)
+ RandomReaderSSE2 (chunk_ptrs, size/256, loops);
+ else {
+ if (size & 128)
+ ReaderSSE2_128bytes (chunk, size, loops);
+ else
+ ReaderSSE2 (chunk, size, loops);
+ }
+ break;
+
+ case SSE2_BYPASS:
+ // No random reader for bypass.
+ //
+ if (random)
+ RandomReaderSSE2_bypass (chunk_ptrs, size/256, loops);
+ else {
+ if (size & 128)
+ ReaderSSE2_128bytes_bypass (chunk, size, loops);
+ else
+ ReaderSSE2_bypass (chunk, size, loops);
+ }
+ break;
+
+ case AVX:
+ if (!random) {
+ ReaderAVX (chunk, size, loops);
+ }
+ break;
+
+ case LODSB:
+ if (!random) {
+ ReaderLODSB (chunk, size, loops);
+ }
+ break;
+
+ case LODSW:
+ if (!random) {
+ ReaderLODSW (chunk, size, loops);
+ }
+ break;
+
+ case LODSD:
+ if (!random) {
+ ReaderLODSD (chunk, size, loops);
+ }
+ break;
+
+ case LODSQ:
+ if (!random) {
+ ReaderLODSQ (chunk, size, loops);
+ }
+ break;
+
+ default:
+ if (random) {
+ RandomReader (chunk_ptrs, size/256, loops);
+ } else {
+ if (size & 128)
+ Reader_128bytes (chunk, size, loops);
+ else
+ Reader (chunk, size, loops);
+ }
+ }
+
+ diff = mytime () - t0;
+ }
+
+ print (L"loops = ");
+ print_uint (total_count);
+ print (L", ");
+
+ int result = calculate_result (size, total_count, diff);
+ newline ();
+
+ flush ();
+
+ free (chunk0);
+
+ if (chunk_ptrs)
+ free (chunk_ptrs);
+
+ return result;
+}
+
+
+
+//----------------------------------------------------------------------------
+// Name: do_copy
+// Purpose: Performs sequential memory copy.
+//----------------------------------------------------------------------------
+int
+do_copy (unsigned long size, int mode)
+{
+ unsigned long loops;
+ unsigned long long total_count = 0;
+ unsigned long t0, diff=0;
+ unsigned char *chunk_src;
+ unsigned char *chunk_dest;
+ unsigned char *chunk_src0;
+ unsigned char *chunk_dest0;
+ unsigned long tmp;
+
+ if (size & 127)
+ error ("do_copy(): chunk size is not multiple of 128.");
+
+ //-------------------------------------------------
+ chunk_src0 = chunk_src = malloc (size+64);
+ if (!chunk_src)
+ error ("Out of memory");
+ chunk_dest0 = chunk_dest = malloc (size+64);
+ if (!chunk_dest)
+ error ("Out of memory");
+
+ memset (chunk_src, 100, size);
+ memset (chunk_dest, 200, size);
+
+ tmp = (unsigned long) chunk_src;
+ if (tmp & 31) {
+ tmp -= (tmp & 31);
+ tmp += 32;
+ chunk_src = (unsigned char*) tmp;
+ }
+ tmp = (unsigned long) chunk_dest;
+ if (tmp & 31) {
+ tmp -= (tmp & 31);
+ tmp += 32;
+ chunk_dest = (unsigned char*) tmp;
+ }
+
+ //-------------------------------------------------
+ print (L"Sequential copy ");
+
+ if (mode == SSE2) {
+ print (L"(128-bit), size = ");
+ }
+ else if (mode == AVX) {
+ print (L"(256-bit), size = ");
+ }
+ else {
+#ifdef __x86_64__
+ print (L"(64-bit), size = ");
+#else
+ print (L"(32-bit), size = ");
+#endif
+ }
+
+ print_size (size);
+ print (L", ");
+
+ flush ();
+
+ loops = (1 << 26) / size; // XX need to adjust for CPU MHz
+ if (loops < 1)
+ loops = 1;
+
+ t0 = mytime ();
+
+ while (diff < usec_per_test) {
+ total_count += loops;
+
+ if (mode == SSE2) {
+#ifdef __x86_64__
+ if (size & 128)
+ CopySSE_128bytes (chunk_dest, chunk_src, size, loops);
+ else
+ CopySSE (chunk_dest, chunk_src, size, loops);
+#else
+ CopySSE (chunk_dest, chunk_src, size, loops);
+#endif
+ }
+ else if (mode == AVX) {
+ if (!(size & 128))
+ CopyAVX (chunk_dest, chunk_src, size, loops);
+ }
+
+ diff = mytime () - t0;
+ }
+
+ print (L"loops = ");
+ print_uint (total_count);
+ print (L", ");
+
+ int result = calculate_result (size, total_count, diff);
+ newline ();
+
+ flush ();
+
+ free (chunk_src0);
+ free (chunk_dest0);
+
+ return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name: fb_readwrite
+// Purpose: Performs sequential read & write tests on framebuffer memory.
+//----------------------------------------------------------------------------
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+void
+fb_readwrite (bool use_sse2)
+{
+ unsigned long counter, total_count;
+ unsigned long length;
+ unsigned long diff, t0;
+ static struct fb_fix_screeninfo fi;
+ static struct fb_var_screeninfo vi;
+ unsigned long *fb = NULL;
+ unsigned long datum;
+ int fd;
+ register unsigned long foo;
+#ifdef __x86_64__
+ unsigned long value = 0x1234567689abcdef;
+#else
+ unsigned long value = 0x12345678;
+#endif
+
+ //-------------------------------------------------
+
+ fd = open ("/dev/fb0", O_RDWR);
+ if (fd < 0)
+ fd = open ("/dev/fb/0", O_RDWR);
+ if (fd < 0) {
+ println (L"Cannot open framebuffer device.");
+ return;
+ }
+
+ if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) {
+ close (fd);
+ println (L"Cannot get framebuffer info");
+ return;
+ }
+ else
+ if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
+ close (fd);
+ println (L"Cannot get framebuffer info");
+ return;
+ }
+ else
+ {
+ if (fi.visual != FB_VISUAL_TRUECOLOR &&
+ fi.visual != FB_VISUAL_DIRECTCOLOR ) {
+ close (fd);
+ println (L"Need direct/truecolor framebuffer device.");
+ return;
+ } else {
+ unsigned long fblen;
+
+ print (L"Framebuffer resolution: ");
+ print_int (vi.xres);
+ print (L"x");
+ print_int (vi.yres);
+ print (L", ");
+ print_int (vi.bits_per_pixel);
+ println (L" bpp\n");
+
+ fb = (unsigned long*) fi.smem_start;
+ fblen = fi.smem_len;
+
+ fb = mmap (fb, fblen,
+ PROT_WRITE | PROT_READ,
+ MAP_SHARED, fd, 0);
+ if (fb == MAP_FAILED) {
+ close (fd);
+ println (L"Cannot access framebuffer memory.");
+ return;
+ }
+ }
+ }
+
+ //-------------------
+ // Use only the upper half of the display.
+ //
+ length = FB_SIZE;
+
+ //-------------------
+ // READ
+ //
+ print (L"Framebuffer memory sequential read ");
+ flush ();
+
+ t0 = mytime ();
+
+ total_count = FBLOOPS_R;
+
+ if (use_sse2)
+ ReaderSSE2 (fb, length, FBLOOPS_R);
+ else
+ Reader (fb, length, FBLOOPS_R);
+
+ diff = mytime () - t0;
+
+ calculate_result (length, total_count, diff);
+ newline ();
+
+ //-------------------
+ // WRITE
+ //
+ print (L"Framebuffer memory sequential write ");
+ flush ();
+
+ t0 = mytime ();
+
+ total_count = FBLOOPS_W;
+
+ if (use_sse2)
+ WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
+ else
+ Writer (fb, length, FBLOOPS_W, value);
+
+ diff = mytime () - t0;
+
+ calculate_result (length, total_count, diff);
+ newline ();
+}
+#endif
+
+//----------------------------------------------------------------------------
+// Name: register_test
+// Purpose: Determines bandwidth of register-to-register transfers.
+//----------------------------------------------------------------------------
+void
+register_test ()
+{
+ long long total_count = 0;
+ unsigned long t0;
+ unsigned long diff = 0;
+
+ //--------------------------------------
+#ifdef __x86_64__
+ print (L"Main register to main register transfers (64-bit) ");
+#else
+ print (L"Main register to main register transfers (32-bit) ");
+#endif
+ flush ();
+#define REGISTER_COUNT 10000
+
+ t0 = mytime ();
+ while (diff < usec_per_test)
+ {
+ RegisterToRegister (REGISTER_COUNT);
+ total_count += REGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+#ifdef __x86_64__
+ print (L"Main register to vector register transfers (64-bit) ");
+#else
+ print (L"Main register to vector register transfers (32-bit) ");
+#endif
+ flush ();
+#define VREGISTER_COUNT 3333
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ RegisterToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+#ifdef __x86_64__
+ print (L"Vector register to main register transfers (64-bit) ");
+#else
+ print (L"Vector register to main register transfers (32-bit) ");
+#endif
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ VectorToRegister (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+ print (L"Vector register to vector register transfers (128-bit) ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ VectorToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+ if (cpu_has_avx) {
+ print (L"Vector register to vector register transfers (256-bit) ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ VectorToVectorAVX (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Vector 8-bit datum to main register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Vector8ToRegister (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (64, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ print (L"Vector 16-bit datum to main register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Vector16ToRegister (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (128, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Vector 32-bit datum to main register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Vector32ToRegister (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Vector 64-bit datum to main register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Vector64ToRegister (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Main register 8-bit datum to vector register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Register8ToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (64, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ print (L"Main register 16-bit datum to vector register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Register16ToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (128, total_count, diff);
+ newline ();
+ flush ();
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Main register 32-bit datum to vector register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Register32ToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+ }
+
+ //--------------------------------------
+ if (use_sse4) {
+ print (L"Main register 64-bit datum to vector register transfers ");
+ flush ();
+
+ t0 = mytime ();
+ diff = 0;
+ total_count = 0;
+ while (diff < usec_per_test)
+ {
+ Register64ToVector (VREGISTER_COUNT);
+ total_count += VREGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+ }
+}
+
+//----------------------------------------------------------------------------
+// Name: stack_test
+// Purpose: Determines bandwidth of stack-to/from-register transfers.
+//----------------------------------------------------------------------------
+void
+stack_test ()
+{
+ long long total_count = 0;
+ unsigned long t0;
+ unsigned long diff = 0;
+
+#ifdef __x86_64__
+ print (L"Stack-to-register transfers (64-bit) ");
+#else
+ print (L"Stack-to-register transfers (32-bit) ");
+#endif
+ flush ();
+
+ //--------------------------------------
+ diff = 0;
+ total_count = 0;
+ t0 = mytime ();
+ while (diff < usec_per_test)
+ {
+ StackReader (REGISTER_COUNT);
+ total_count += REGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+
+#ifdef __x86_64__
+ print (L"Register-to-stack transfers (64-bit) ");
+#else
+ print (L"Register-to-stack transfers (32-bit) ");
+#endif
+ flush ();
+
+ //--------------------------------------
+ diff = 0;
+ total_count = 0;
+ t0 = mytime ();
+ while (diff < usec_per_test)
+ {
+ StackWriter (REGISTER_COUNT);
+ total_count += REGISTER_COUNT;
+
+ diff = mytime () - t0;
+ }
+
+ calculate_result (256, total_count, diff);
+ newline ();
+ flush ();
+}
+
+//----------------------------------------------------------------------------
+// Name: library_test
+// Purpose: Performs C library tests (memset, memcpy).
+//----------------------------------------------------------------------------
+void
+library_test ()
+{
+ char *a1, *a2;
+ unsigned long t, t0;
+ int i;
+
+ #define NT_SIZE (64*1024*1024)
+ #define NT_SIZE2 (100)
+
+ a1 = malloc (NT_SIZE);
+ if (!a1)
+ error ("Out of memory");
+
+ a2 = malloc (NT_SIZE);
+ if (!a2)
+ error ("Out of memory");
+
+ //--------------------------------------
+ t0 = mytime ();
+ for (i=0; i<NT_SIZE2; i++) {
+ memset (a1, i, NT_SIZE);
+ }
+ t = mytime ();
+
+ print (L"Library: memset ");
+ calculate_result (NT_SIZE, NT_SIZE2, t-t0);
+ newline ();
+
+ flush ();
+
+ //--------------------------------------
+ t0 = mytime ();
+ for (i=0; i<NT_SIZE2; i++) {
+ memcpy (a2, a1, NT_SIZE);
+ }
+ t = mytime ();
+
+ print (L"Library: memcpy ");
+ calculate_result (NT_SIZE, NT_SIZE2, t-t0);
+ newline ();
+
+ flush ();
+
+ free (a1);
+ free (a2);
+}
+
+//----------------------------------------------------------------------------
+// Name: network_test_core
+// Purpose: Performs the network test, talking to and receiving data
+// back from a transponder node.
+// Note: Port number specified using server:# notation.
+// Returns: -1 on error, else the network duration in microseconds.
+//----------------------------------------------------------------------------
+bool
+network_test_core (const char *hostname, char *chunk,
+ unsigned long chunk_size,
+ unsigned long n_chunks,
+ long *duration_send_return,
+ long *duration_recv_return)
+{
+ if (!hostname || !chunk || !n_chunks || !chunk_size ||
+ !duration_send_return ||
+ !duration_recv_return)
+ return false;
+
+ struct hostent* host = gethostbyname (hostname);
+ if (!host)
+ return false;
+
+ char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list);
+ int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+ struct sockaddr_in addr;
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr(host_ip);
+ addr.sin_port = htons(network_port);
+
+ if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr)))
+ {
+ // perror ("connect");
+ close (sock);
+ return false;
+ }
+
+ //------------------------------------
+ // Start stopwatch just before the send.
+ // It will be stopped on receipt of
+ // the response.
+ //
+ unsigned long t0 = mytime ();
+
+ //------------------------------------
+ // Put # of chunks in the chunk.
+ // Send all of our data.
+ //
+ sprintf (chunk, "%lu\n", n_chunks);
+ int i;
+ for (i = 0; i < n_chunks; i++)
+ send (sock, chunk, chunk_size, 0);
+
+#if 0
+ //------------------------------------
+ // Set nonblocking mode.
+ //
+ int opt = 1;
+ ioctl (sock, FIONBIO, &opt);
+#endif
+
+ unsigned long t1 = mytime ();
+
+ //------------------------------------
+ // Read the response.
+ //
+ int amount = recv (sock, chunk, chunk_size, 0);
+ if (amount < 16) {
+ close (sock);
+ return false;
+ }
+
+ unsigned long duration_send = mytime() - t0;
+
+ //------------------------------------
+ // Validate the response, which
+ // contains the transponder's
+ // perceived read duration. This value
+ // may be as little as half our number.
+ //
+ unsigned long duration2 = -1;
+ if (strncmp ("OK: ", chunk, 4)) {
+ close (sock);
+ return false;
+ }
+ if (1 != sscanf (4+chunk, "%lu", &duration2)) {
+ close (sock);
+ return false;
+ }
+
+ unsigned long remaining = chunk_size * n_chunks - amount;
+ while (remaining > 0) {
+ int amount = recv (sock, chunk, chunk_size, 0);
+ if (amount <= 0) {
+ perror ("recv");
+ close (sock);
+ return false;
+ }
+ remaining -= amount;
+ }
+
+ unsigned long duration_recv = mytime () - t1;
+
+ *duration_send_return = duration_send;
+ *duration_recv_return = duration_recv;
+
+ close (sock);
+ return true;
+}
+
+//----------------------------------------------------------------------------
+// Name: ip_to_str
+//----------------------------------------------------------------------------
+void
+ip_to_str (unsigned long addr, char *str)
+{
+ if (!str)
+ return;
+
+ unsigned short a = 0xff & addr;
+ unsigned short b = 0xff & (addr >> 8);
+ unsigned short c = 0xff & (addr >> 16);
+ unsigned short d = 0xff & (addr >> 24);
+ sprintf (str, "%u.%u.%u.%u", a,b,c,d);
+}
+
+//----------------------------------------------------------------------------
+// Name: network_transponder
+// Purpose: Act as a transponder, receiving chunks of data and sending
+// back an acknowledgement once the enture chunk is read.
+// Returns: False if a problem occurs setting up the network socket.
+//----------------------------------------------------------------------------
+bool
+network_transponder ()
+{
+ struct sockaddr_in sin, from;
+
+ //------------------------------
+ // Get listening socket for port.
+ // Then listen on given port#.
+ //
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(network_port);
+ int listensock;
+ if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror ("socket");
+ return false;
+ }
+ if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) {
+ perror ("bind");
+ close (listensock);
+ return false;
+ }
+ if (listen (listensock, 500) < 0) {
+ perror ("listen");
+ close (listensock);
+ return false;
+ }
+
+ bool done = false;
+ while (!done) {
+ //----------------------------------------
+ // Wait for a client to contact us.
+ //
+ socklen_t len = sizeof (struct sockaddr);
+ int sock = accept (listensock, (struct sockaddr*) &from, &len);
+ if (sock < 0) {
+ perror ("accept");
+ close (listensock);
+ return false;
+ }
+
+ //----------------------------------------
+ // Clockwatch starts when we accept the
+ // connection.
+ //
+ unsigned long t0 = mytime ();
+
+ if (len != sizeof (struct sockaddr_in)) {
+ close (sock);
+ close (listensock);
+ return false;
+ }
+
+#if 0
+ unsigned long ipaddr = from.sin_addr.s_addr;
+ char ipstring[30];
+ ip_to_str (ipaddr, ipstring);
+ fprintf (stderr, "Incoming connection from %s\n", ipstring);
+#endif
+
+ //----------------------------------------
+ // Read the first chunk only, in order to
+ // get the # of bytes that will be sent.
+ //
+ char chunk [NETWORK_CHUNK_SIZE+1];
+ long n_chunks = 0;
+ int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+ chunk [amount_read] = 0;
+ if (1 != sscanf (chunk, "%ld", &n_chunks)) {
+ close (sock);
+ close (listensock);
+ return false;
+ }
+
+ //----------------------------------------
+ // If the leader sends us a chunk count of
+ // -99, this indicates that we should exit.
+ //
+ if (n_chunks == -99) {
+ close (sock);
+ close (listensock);
+ return true;
+ }
+
+// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE);
+
+ unsigned long long remaining = n_chunks;
+ remaining *= NETWORK_CHUNK_SIZE;
+
+// printf ("remaining="); dump_hex64(remaining); puts("");
+
+ remaining -= amount_read;
+ while (remaining > 0) {
+ amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+ remaining -= amount_read;
+
+ if (amount_read < 0) {
+ perror ("read");
+ break;
+ } else
+ if (!amount_read)
+ break;
+ }
+
+ unsigned long duration = mytime() - t0;
+
+ //------------------------------------
+ // Send response of same size.
+ //
+ sprintf (chunk, "OK: %lu\n", duration);
+ chunk[14] = '\n';
+
+ //------------------------------------
+ // Send all of our data.
+ //
+ int i;
+ for (i = 0; i < n_chunks; i++)
+ send (sock, chunk, NETWORK_CHUNK_SIZE, 0);
+
+ close (sock);
+ }
+
+ return true;
+}
+
+//----------------------------------------------------------------------------
+// Name: network_test
+//----------------------------------------------------------------------------
+bool
+network_test (char **destinations, int n_destinations)
+{
+ int i;
+
+ //----------------------------------------
+ // The memory chunk starts with a 12-byte
+ // length of the overall send size.
+ // The memory chunk will have a list of
+ // the destinations in it.
+ // In future, there will be a mechanism
+ // for testing bandwidth between all nodes,
+ // not just the leader & each of the
+ // transponders.
+ //
+ char chunk [NETWORK_CHUNK_SIZE];
+ memset (chunk, 0, NETWORK_CHUNK_SIZE);
+ sprintf (chunk, "000000000000\n%d\n", n_destinations);
+ for (i = 0; i < n_destinations; i++) {
+ char *s = destinations [i];
+ int chunk_len = strlen (chunk);
+ int len = strlen (s);
+ if (len + chunk_len < NETWORK_CHUNK_SIZE-1) {
+ //----------------------------------------
+ // "transp" indicates that the given node
+ // has not yet been a leader.
+ // In future, "done" will indicate it has.
+ //
+ sprintf (chunk + chunk_len, "%s %s\n", s, "transp");
+ }
+ }
+
+ static unsigned long colors [] = {
+ RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE,
+ RGB_BLACK, RGB_CORAL,
+ RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE,
+ RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW,
+ RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE,
+ RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET,
+ RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD
+ };
+#define NCOLORS (sizeof(colors)/sizeof(unsigned long))
+
+ //----------------------------------------
+ // For each destination, run the test.
+ //
+ for (i = 0; i < n_destinations; i++) {
+ bool problem = false;
+
+ char *hostname = destinations[i];
+ printf ("Bandwidth sending to %s:\n", hostname);
+
+ char title [PATH_MAX];
+ sprintf (title, "%s send (solid)", hostname);
+ BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY);
+
+ //----------------------------------------
+ // Cache the receive durations for later.
+ //
+ unsigned long recv_rates [NETSIZE_MAX];
+ int recv_ix = 0;
+
+ //----------------------------------------
+ // Send data of increasing sizes.
+ //
+ int j = NETSIZE_MIN;
+ int n_runs = 64;
+ while (!problem && j <= NETSIZE_MAX) {
+ unsigned long chunk_count = 1 << (j-NETSIZE_MIN);
+ unsigned long long amt_to_send = chunk_count;
+ amt_to_send *= NETWORK_CHUNK_SIZE;
+
+ if (!amt_to_send) // unlikely
+ break;
+
+ //----------------------------------------
+ // Send the data; do this n_runs times.
+ //
+ unsigned long long total_duration_send = 0;
+ unsigned long long total_duration_recv = 0;
+
+ int k = n_runs;
+ while (k--) {
+ long duration_send, duration_recv;
+
+ if (! network_test_core (hostname,
+ chunk, NETWORK_CHUNK_SIZE, chunk_count,
+ &duration_send, &duration_recv))
+ {
+ problem = true;
+ fprintf (stderr, "\nCan't connect to %s\n", hostname);
+ break;
+ }
+
+ total_duration_send += duration_send;
+ total_duration_recv += duration_recv;
+ }
+
+ if (problem)
+ break;
+
+ total_duration_send += n_runs/2; // Round up
+ total_duration_send /= n_runs; // Get average
+ long duration = (long) total_duration_send;
+
+ total_duration_recv += n_runs/2; // Round up
+ total_duration_recv /= n_runs; // Get average
+
+ unsigned long amt_in_kb = amt_to_send / 1024;
+ unsigned long amt_in_mb = amt_to_send / 1048576;
+ if (!amt_in_mb) {
+ printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb,
+ n_runs);
+ } else {
+ printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb,
+ n_runs);
+ }
+
+ //------------------------------
+ // Calculate send rate in MB/sec.
+ //
+ // Get total # bytes.
+ unsigned long long tmp = NETWORK_CHUNK_SIZE;
+ tmp *= chunk_count;
+
+ // Get total bytes per second.
+ tmp *= 1000000;
+ tmp /= duration;
+
+ // Bytes to megabytes.
+ tmp /= 1000;
+ tmp /= 10;
+ unsigned long whole = tmp / 100;
+ unsigned long frac = tmp % 100;
+ printf ("%lu.%02lu MB/s (sent)\t", whole, frac);
+ fflush (stdout);
+
+ BMPGraphing_add_point (graph, amt_in_kb, tmp);
+
+ //------------------------------
+ // Calculate recv rate in MB/sec.
+ //
+ // Get total # bytes.
+ tmp = NETWORK_CHUNK_SIZE;
+ tmp *= chunk_count;
+
+ // Get total bytes per second.
+ tmp *= 1000000;
+ tmp /= total_duration_recv;
+
+ // Bytes to megabytes.
+ tmp /= 1000;
+ tmp /= 10;
+ whole = tmp / 100;
+ frac = tmp % 100;
+ printf ("%lu.%02lu MB/s (received)\n", whole, frac);
+
+ recv_rates [recv_ix++] = tmp;
+
+ j++;
+ n_runs >>= 1;
+ if (!n_runs)
+ n_runs = 1;
+ }
+
+ //----------------------------------------
+ // Now add the line for the receive rates.
+ //
+ sprintf (title, "%s receive (dashed)", hostname);
+ BMPGraphing_new_line (graph, title, DASHED |
+ (i < NCOLORS? colors[i] : RGB_GRAY));
+ for (j = NETSIZE_MIN; j <= NETSIZE_MAX; j++) {
+ unsigned long chunk_count = 1 << (j-NETSIZE_MIN);
+ unsigned long long amt_to_send = chunk_count;
+ amt_to_send *= NETWORK_CHUNK_SIZE;
+ unsigned long amt_in_kb = amt_to_send / 1024;
+// printf ("amt_in_kb=%ld\n",amt_in_kb);
+
+ BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j-NETSIZE_MIN]);
+ }
+
+ puts ("");
+ }
+
+ return true;
+}
+
+//----------------------------------------------------------------------------
+// Name: usage
+//----------------------------------------------------------------------------
+void
+usage ()
+{
+ printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n");
+ printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...] [--port <port#>]\n");
+ printf ("Usage for receiving network tests: bandwidth --transponder [--port <port#>]\n");
+
+ exit (0);
+}
+
+//----------------------------------------------------------------------------
+// Name: main
+//----------------------------------------------------------------------------
+int
+main (int argc, char **argv)
+{
+ int i, chunk_size;
+
+ --argc;
+ ++argv;
+
+ bool network_mode = false;
+ bool network_leader = false; // false => transponder
+ int network_destinations_size = 0;
+ int n_network_destinations = 0;
+ char **network_destinations = NULL;
+
+ char graph_title [512] = {0};
+
+ i = 0;
+ while (i < argc) {
+ char *s = argv [i++];
+
+ if (!strcmp ("--network", s)) {
+ network_mode = true;
+ network_leader = true;
+ network_destinations_size = 20;
+ network_destinations = (char**) malloc (network_destinations_size * sizeof (char*));
+ }
+ else
+ if (!strcmp ("--transponder", s)) {
+ network_mode = true;
+ }
+ else
+ if (!strcmp ("--port", s)) {
+ if (i != argc)
+ network_port = atoi (argv[i++]);
+ }
+ else
+ if (!strcmp ("--slow", s)) {
+ usec_per_test=20000000; // 20 seconds per test.
+ }
+ else
+ if (!strcmp ("--fast", s)) {
+ usec_per_test = 500000; // 0.5 seconds per test.
+ }
+ else
+ if (!strcmp ("--faster", s)) {
+ usec_per_test = 50000; // 0.05 seconds per test.
+ }
+ else
+ if (!strcmp ("--fastest", s)) {
+ usec_per_test = 5000; // 0.005 seconds per test.
+ }
+ else
+ if (!strcmp ("--nosse2", s)) {
+ use_sse2 = false;
+ use_sse4 = false;
+ }
+ else
+ if (!strcmp ("--nosse4", s)) {
+ use_sse4 = false;
+ }
+ else
+ if (!strcmp ("--help", s)) {
+ usage ();
+ }
+ else
+ if (!strcmp ("--title", s) && i != argc) {
+ snprintf (graph_title, 511, "%s", argv[i++]);
+ }
+ else {
+ if ('-' == *s)
+ usage ();
+ }
+ }
+
+ msg[0] = 0;
+
+ for (i = 0; chunk_sizes[i] && i < sizeof(chunk_sizes)/sizeof(int); i++) {
+ chunk_sizes_log2[i] = log2 (chunk_sizes[i]);
+ }
+
+ printf ("This is bandwidth version %s.\n", RELEASE);
+ printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n");
+ printf ("This software is covered by the GNU Public License.\n");
+ printf ("It is provided AS-IS, use at your own risk.\n");
+ printf ("See the file COPYING for more information.\n\n");
+ fflush (stdout);
+
+ //----------------------------------------
+ // If network mode selected, enter it now.
+ // Currently cannot combine memory tests
+ // & network tests.
+ //
+ if (network_mode) {
+ if (network_leader) {
+ graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR);
+ strcpy (graph_title, TITLE_MEMORY_NET);
+ BMPGraphing_set_title (graph, graph_title);
+
+ network_test (network_destinations, n_network_destinations);
+
+ BMPGraphing_make (graph);
+
+ BMP_write (graph->image, "network_bandwidth.bmp");
+
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__)
+ puts ("Wrote graph to network_bandwidth.bmp.");
+ puts ("");
+ puts ("Done.");
+#endif
+ BMPGraphing_destroy (graph);
+ } else {
+ network_transponder ();
+ }
+
+ return 0;
+ }
+
+ uint32_t ecx = get_cpuid1_ecx ();
+ uint32_t edx = get_cpuid1_edx ();
+ cpu_has_mmx = edx & CPUID_EDX_MMX;
+ cpu_has_sse = edx & CPUID_EDX_SSE;
+ cpu_has_sse2 = edx & CPUID_EDX_SSE2;
+ cpu_has_sse3 = ecx & CPUID_ECX_SSE3;
+ cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3;
+ cpu_has_sse41 = ecx & CPUID_ECX_SSE41;
+ cpu_has_sse42 = ecx & CPUID_ECX_SSE42;
+ cpu_has_aes = ecx & CPUID_ECX_AES;
+ cpu_has_avx = ecx & CPUID_ECX_AVX;
+ cpu_has_avx2 = 0;
+
+ if (cpu_has_avx) {
+ cpu_has_avx2 = get_cpuid7_ebx ();
+ cpu_has_avx2 &= CPUID_EBX_AVX2;
+ }
+
+ use_sse2 = true;
+ use_sse4 = true;
+
+ cpu_has_sse4a = 0;
+ cpu_has_64bit = 0;
+ cpu_has_xd = 0;
+
+ static char family [17];
+ get_cpuid_family (family);
+ family [16] = 0;
+ printf ("CPU family: %s\n", family);
+
+ uint32_t ecx2 = get_cpuid_80000001_ecx ();
+ uint32_t edx2 = get_cpuid_80000001_edx ();
+
+ if (!strcmp ("AuthenticAMD", family)) {
+ is_amd = true;
+ cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A;
+ }
+ else
+ if (!strcmp ("GenuineIntel", family)) {
+ is_intel = true;
+ }
+
+ cpu_has_xd = edx2 & CPUID_EDX_XD;
+ cpu_has_64bit = edx2 & CPUID_EDX_INTEL64;
+
+ printf ("CPU features: ");
+ if (cpu_has_mmx) printf ("MMX ");
+ if (cpu_has_sse) printf ("SSE ");
+ if (cpu_has_sse2) printf ("SSE2 ");
+ if (cpu_has_sse3) printf ("SSE3 ");
+ if (cpu_has_ssse3) printf ("SSSE3 ");
+ if (cpu_has_sse4a) printf ("SSE4A ");
+ if (cpu_has_sse41) printf ("SSE4.1 ");
+ if (cpu_has_sse42) printf ("SSE4.2 ");
+ if (cpu_has_aes) printf ("AES ");
+ if (cpu_has_avx) printf ("AVX ");
+ if (cpu_has_avx2) printf ("AVX2 ");
+ if (cpu_has_xd) printf ("XD ");
+ if (cpu_has_64bit) {
+ if (!is_amd)
+ printf ("Intel64 ");
+ else
+ printf ("LongMode ");
+ }
+ puts ("\n");
+
+ if (is_intel) {
+ uint32_t cache_info[4];
+ i = 0;
+ while (1) {
+ get_cpuid_cache_info (cache_info, i);
+ if (!(cache_info[0] & 31))
+ break;
+
+#if 0
+ printf ("Cache info %d = 0x%08x, 0x%08x, 0x%08x, 0x%08x\n", i,
+ cache_info [0],
+ cache_info [1],
+ cache_info [2],
+ cache_info [3]);
+#endif
+ printf ("Cache %d: ", i);
+ switch ((cache_info[0] >> 5) & 7) {
+ case 1: printf ("L1 "); break;
+ case 2: printf ("L2 "); break;
+ case 3: printf ("L3 "); break;
+ }
+ switch (cache_info[0] & 31) {
+ case 1: printf ("data cache, "); break;
+ case 2: printf ("instruction cache, "); break;
+ case 3: printf ("unified cache, "); break;
+ }
+ uint32_t n_ways = 1 + (cache_info[1] >> 22);
+ uint32_t line_size = 1 + (cache_info[1] & 2047);
+ uint32_t n_sets = 1 + cache_info[2];
+ printf ("line size %d, ", line_size);
+ printf ("%2d-way%s, ", n_ways, n_ways>1 ? "s" : "");
+ printf ("%5d sets, ", n_sets);
+ unsigned size = (n_ways * line_size * n_sets) >> 10;
+ printf ("size %dk ", size);
+ puts ("");
+ i++;
+ }
+ }
+
+ if (!cpu_has_sse41)
+ use_sse4 = false;
+ if (!cpu_has_sse2)
+ use_sse2 = false;
+
+ println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B.");
+
+ flush ();
+
+ //------------------------------------------------------------
+ // Attempt to obtain information about the CPU.
+ //
+#ifdef __linux__
+ struct stat st;
+ if (!stat ("/proc/cpuinfo", &st)) {
+#define TMPFILE "/tmp/bandw_tmp"
+ unlink (TMPFILE);
+ if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE))
+ perror ("system");
+
+ FILE *f = fopen (TMPFILE, "r");
+ if (f) {
+ float cpu_speed = 0.0;
+
+ if (1 == fscanf (f, "%g", &cpu_speed)) {
+ puts ("");
+ printf ("CPU speed is %g MHz.\n", cpu_speed);
+ }
+ fclose (f);
+ }
+ } else {
+ printf ("CPU information is not available (/proc/cpuinfo).\n");
+ }
+ fflush (stdout);
+#endif
+
+ graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2);
+ strcpy (graph_title, TITLE_MEMORY_GRAPH);
+ BMPGraphing_set_title (graph, graph_title);
+
+ //------------------------------------------------------------
+ // SSE2 sequential reads.
+ //
+ if (use_sse2) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, SSE2, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // AVX sequential reads.
+ //
+ if (cpu_has_avx) {
+ BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, AVX, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE2 random reads.
+ //
+ if (use_sse2) {
+ BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON);
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, SSE2, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE2 sequential writes that do not bypass the caches.
+ //
+ if (use_sse2) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_write (chunk_size, SSE2, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // AVX sequential writes that do not bypass the caches.
+ //
+ if (cpu_has_avx) {
+ BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, AVX, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE2 random writes that do not bypass the caches.
+ //
+ if (use_sse2) {
+ BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE);
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, SSE2, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE4 sequential reads that do bypass the caches.
+ //
+ if (use_sse4) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, SSE2_BYPASS, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE4 random reads that do bypass the caches.
+ //
+ if (use_sse4) {
+ BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, SSE2_BYPASS, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE4 sequential writes that do bypass the caches.
+ //
+ if (use_sse4) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_write (chunk_size, SSE2_BYPASS, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // AVX sequential writes that do bypass the caches.
+ // Currently on Intel CPUs (including Xeon) there is a
+ // microcode bug that leads to a severe drop in performance
+ // in this part of the test.
+ //
+ if (cpu_has_avx) {
+ BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, AVX_BYPASS, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE4 random writes that bypass the caches.
+ //
+ if (use_sse4) {
+ BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW);
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, SSE2_BYPASS, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+ //------------------------------------------------------------
+ // Sequential non-SSE2 reads.
+ //
+ newline ();
+#ifdef __x86_64__
+ BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE);
+#else
+ BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE);
+#endif
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, NO_SSE2, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+
+ //------------------------------------------------------------
+ // Random non-SSE2 reads.
+ //
+ newline ();
+#ifdef __x86_64__
+ BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN);
+#else
+ BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN);
+#endif
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_read (chunk_size, NO_SSE2, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // Sequential non-SSE2 writes.
+ //
+#ifdef __x86_64__
+ BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN);
+#else
+ BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN);
+#endif
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_write (chunk_size, NO_SSE2, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+
+ //------------------------------------------------------------
+ // Random non-SSE2 writes.
+ //
+#ifdef __x86_64__
+ BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN);
+#else
+ BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN);
+#endif
+
+ newline ();
+ srand (time (NULL));
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_write (chunk_size, NO_SSE2, true);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // SSE2 sequential copy.
+ //
+ if (use_sse2) {
+ BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_copy (chunk_size, SSE2);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+
+ //------------------------------------------------------------
+ // AVX sequential copy.
+ //
+ if (cpu_has_avx) {
+ BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ if (!(chunk_size & 128)) {
+ int amount = do_copy (chunk_size, AVX);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+ }
+ }
+
+#ifdef DOING_LODS
+#ifdef __x86_64__
+ //------------------------------------------------------------
+ // LODSQ 64-bit sequential reads.
+ //
+ BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, LODSQ, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+#endif
+
+ //------------------------------------------------------------
+ // LODSD 32-bit sequential reads.
+ //
+ BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, LODSD, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+
+ //------------------------------------------------------------
+ // LODSW 16-bit sequential reads.
+ //
+ BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, LODSW, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+
+ //------------------------------------------------------------
+ // LODSB 64-bit sequential reads.
+ //
+ BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12);
+
+ newline ();
+
+ i = 0;
+ while ((chunk_size = chunk_sizes [i++])) {
+ int amount = do_read (chunk_size, LODSB, false);
+ BMPGraphing_add_point (graph, chunk_size, amount);
+ }
+#endif
+
+ //------------------------------------------------------------
+ // Register to register.
+ //
+ newline ();
+ register_test ();
+
+ //------------------------------------------------------------
+ // Stack to/from register.
+ //
+ newline ();
+ stack_test ();
+
+ //------------------------------------------------------------
+ // C library performance.
+ //
+ newline ();
+ library_test ();
+
+ //------------------------------------------------------------
+ // Framebuffer read & write.
+ //
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+ newline ();
+ fb_readwrite (true);
+#endif
+
+premature_end_for_testing:
+ flush ();
+
+ BMPGraphing_make (graph);
+
+ BMP_write (graph->image, "bandwidth.bmp");
+
+ puts ("\nWrote graph to bandwidth.bmp.");
+ puts ("");
+ puts ("Done.");
+
+ BMPGraphing_destroy (graph);
+
+ return 0;
+}
OpenPOWER on IntegriCloud