diff options
Diffstat (limited to 'main.c')
-rwxr-xr-x | main.c | 2442 |
1 files changed, 2442 insertions, 0 deletions
@@ -0,0 +1,2442 @@ +/*============================================================================ + bandwidth 1.1, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2014 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at veritas@comcast.net. + *===========================================================================*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <wchar.h> +#include <math.h> + +#include <netdb.h> // gethostbyname +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#define GRAPH_WIDTH 1440 +#define GRAPH_HEIGHT 900 + +#include "defs.h" +#include "BMP.h" +#include "BMPGraphing.h" + +#define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" +#define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" + +#ifdef __WIN32__ +#include <windows.h> +#endif + +#ifdef __linux__ +#include <linux/fb.h> +#include <sys/mman.h> +#endif + +static int network_port = NETWORK_DEFAULT_PORTNUM; + +enum { + NO_SSE2, + SSE2, + SSE2_BYPASS, + AVX, + AVX_BYPASS, + LODSQ, + LODSD, + LODSW, + LODSB +}; + +static BMPGraph *graph = NULL; + +static bool use_sse2 = true; +static bool use_sse4 = true; +static bool is_intel = false; +static bool is_amd = false; + +static uint32_t cpu_has_mmx = 0; +static uint32_t cpu_has_sse = 0; +static uint32_t cpu_has_sse2 = 0; +static uint32_t cpu_has_sse3 = 0; +static uint32_t cpu_has_ssse3 = 0; +static uint32_t cpu_has_sse4a = 0; +static uint32_t cpu_has_sse41 = 0; +static uint32_t cpu_has_sse42 = 0; +static uint32_t cpu_has_aes = 0; +static uint32_t cpu_has_avx = 0; +static uint32_t cpu_has_avx2 = 0; +static uint32_t cpu_has_64bit = 0; +static uint32_t cpu_has_xd = 0; + +//---------------------------------------- +// Parameters for the tests. +// + +static long usec_per_test = 5000000; // 5 seconds per memory test. + +static int chunk_sizes[] = { + 128, + 256, + 384, + 512, + 640, + 768, + 896, + 1024, + 1280, + 2048, + 3072, + 4096, + 6144, + 8192, // Some processors' L1 data caches are only 8kB. + 12288, + 16384, + 20480, + 24576, + 28672, + 32768, // Common L1 data cache size. + 34*1024, + 36*1024, + 40960, + 49152, + 65536, + 131072, // Old L2 cache size. + 192 * 1024, + 256 * 1024, // Old L2 cache size. + 320 * 1024, + 384 * 1024, + 512 * 1024, // Old L2 cache size. + 768 * 1024, + 1 << 20, // 1 MB = common L2 cache size. + (1024 + 256) * 1024, // 1.25 + (1024 + 512) * 1024, // 1.5 + (1024 + 768) * 1024, // 1.75 + 1 << 21, // 2 MB = common L2 cache size. + (2048 + 256) * 1024, // 2.25 + (2048 + 512) * 1024, // 2.5 + (2048 + 768) * 1024, // 2.75 + 3072 * 1024, // 3 MB = common L2 cache size. + 3407872, // 3.25 MB + 3 * 1024 * 1024 + 1024 * 512, // 3.5 MB + 1 << 22, // 4 MB + 5242880, // 5 megs + 6291456, // 6 megs (common L2 cache size) + 7 * 1024 * 1024, + 8 * 1024 * 1024, // Xeon E3's often has 8MB L3 + 9 * 1024 * 1024, + 10 * 1024 * 1024, // Xeon E5-2609 has 10MB L3 + 12 * 1024 * 1024, + 14 * 1024 * 1024, + 15 * 1024 * 1024, // Xeon E6-2630 has 15MB L3 + 16 * 1024 * 1024, + 20 * 1024 * 1024, // Xeon E5-2690 has 20MB L3 + 21 * 1024 * 1024, + 32 * 1024 * 1024, + 48 * 1024 * 1024, + 64 * 1024 * 1024, + 72 * 1024 * 1024, + 96 * 1024 * 1024, + 128 * 1024 * 1024, + 0 +}; + +static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)]; + +//---------------------------------------------------------------------------- +// Name: error +// Purpose: Complain and exit. +//---------------------------------------------------------------------------- +void error (char *s) +{ +#ifndef __WIN32__ + fprintf (stderr, "Error: %s\n", s); + exit (1); +#else + wchar_t tmp [200]; + int i; + for (i = 0; s[i]; i++) + tmp[i] = s[i]; + tmp[i] = 0; + MessageBoxW (0, tmp, L"Error", 0); + ExitProcess (0); +#endif +} + +//============================================================================ +// Output buffer logic. +// This is somewhat vestigial code, originating with Windows Mobile ARM port. +//============================================================================ + +#define MSGLEN 10000 +static wchar_t msg [MSGLEN]; + +void print (wchar_t *s) +{ + wcsncat (msg, s, MSGLEN-1); +} + +void newline () +{ + wcsncat (msg, L"\n", MSGLEN-1); +} + +void println (wchar_t *s) +{ + wcsncat (msg, s, MSGLEN-1); + newline (); +} + +void print_int (int d) +{ + swprintf (msg + wcslen (msg), MSGLEN, L"%d", d); +} + +void print_uint (unsigned int d) +{ + swprintf (msg + wcslen (msg), MSGLEN, L"%lu", d); +} + +void println_int (int d) +{ + print_int (d); + newline (); +} + +void print_result (long double result) +{ + swprintf (msg + wcslen (msg), MSGLEN, L"%.1Lf MB/s", result); +} + +void dump (FILE *f) +{ + if (!f) + f = stdout; + + int i = 0; + while (msg[i]) { + char ch = (char) msg[i]; + fputc (ch, f); + i++; + } + + msg [0] = 0; +} + +void flush () +{ + dump (NULL); + fflush (stdout); +} + +void print_size (unsigned long size) +{ + if (size < 1536) { + print_int (size); + print (L" B"); + } + else if (size < (1<<20)) { + print_int (size >> 10); + print (L" kB"); + } else { + print_int (size >> 20); + switch ((size >> 18) & 3) { + case 1: print (L".25"); break; + case 2: print (L".5"); break; + case 3: print (L".75"); break; + } + print (L" MB"); + } +} + +//============================================================================ +// Timing logic. +//============================================================================ + +//---------------------------------------------------------------------------- +// Name: mytime +// Purpose: Reports time in microseconds. +//---------------------------------------------------------------------------- +unsigned long mytime () +{ +#ifndef __WIN32__ + struct timeval tv; + struct timezone tz; + memset (&tz, 0, sizeof(struct timezone)); + gettimeofday (&tv, &tz); + return 1000000 * tv.tv_sec + tv.tv_usec; +#else + return 1000 * GetTickCount (); // accurate enough. +#endif +} + +//---------------------------------------------------------------------------- +// Name: calculate_result +// Purpose: Calculates and prints a result. +// Returns: 10 times the number of megabytes per second. +//---------------------------------------------------------------------------- +int +calculate_result (unsigned long chunk_size, long long total_loops, long diff) +{ + if (!diff) + error ("Zero time difference."); + +// printf ("\nIn calculate_result, chunk_size=%ld, total_loops=%lld, diff=%ld\n", chunk_size, total_loops, diff); + long double result = (long double) chunk_size; + result *= (long double) total_loops; + result *= 1000000.; // Convert to microseconds. + result /= 1048576.; + result /= (long double) diff; + + print_result (result); + + return (long) (10.0 * result); +} + +//============================================================================ +// Tests. +//============================================================================ + +//---------------------------------------------------------------------------- +// Name: do_write +// Purpose: Performs write on chunk of memory of specified size. +//---------------------------------------------------------------------------- +int +do_write (unsigned long size, int mode, bool random) +{ + unsigned char *chunk; + unsigned char *chunk0; + unsigned long loops; + unsigned long long total_count=0; +#ifdef __x86_64__ + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + unsigned long diff=0, t0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 127) + error ("do_write(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk0 = malloc (size+64); + chunk = chunk0; + if (!chunk) + error ("Out of memory"); + + tmp = (unsigned long) chunk; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. + // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random write "); + else + print (L"Sequential write "); + + switch (mode) { + case SSE2: + print (L"(128-bit), size = "); + break; + case AVX: + print (L"(256-bit), size = "); + break; + case AVX_BYPASS: + print (L"bypassing cache (256-bit), size = "); + break; + case SSE2_BYPASS: + print (L"bypassing cache (128-bit), size = "); + break; + default: +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + loops = (1 << 26) / size;// XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + switch (mode) { + case SSE2: + if (random) + RandomWriterSSE2 (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + WriterSSE2_128bytes (chunk, size, loops, value); + else + WriterSSE2 (chunk, size, loops, value); + } + break; + + case SSE2_BYPASS: + if (random) + RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + WriterSSE2_128bytes_bypass (chunk, size, loops, value); + else + WriterSSE2_bypass (chunk, size, loops, value); + } + break; + + case AVX: + if (!random) { + WriterAVX (chunk, size, loops, value); + } + break; + + case AVX_BYPASS: + if (!random) { + WriterAVX_bypass (chunk, size, loops, value); + } + break; + + default: + if (random) + RandomWriter (chunk_ptrs, size/256, loops, value); + else { + if (size & 128) + Writer_128bytes (chunk, size, loops, value); + else + Writer (chunk, size, loops, value); + } + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + flush (); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free ((void*)chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: do_read +// Purpose: Performs sequential read on chunk of memory of specified size. +//---------------------------------------------------------------------------- +int +do_read (unsigned long size, int mode, bool random) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk; + unsigned char *chunk0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 127) + error ("do_read(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk0 = chunk = malloc (size+64); + if (!chunk) + error ("Out of memory"); + + memset (chunk, 0, size); + + tmp = (unsigned long) chunk; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + int tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. + // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random read "); + else + print (L"Sequential read "); + + switch (mode) { + case SSE2: + print (L"(128-bit), size = "); + break; + case LODSB: + print (L"(8-bit LODSB), size = "); + break; + case LODSW: + print (L"(16-bit LODSW), size = "); + break; + case LODSD: + print (L"(32-bit LODSD), size = "); + break; + case LODSQ: + print (L"(64-bit LODSQ), size = "); + break; + case AVX: + print (L"(256-bit), size = "); + break; + case AVX_BYPASS: + print (L"bypassing cache (256-bit), size = "); + break; + case SSE2_BYPASS: + print (L"bypassing cache (128-bit), size = "); + break; + default: +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + switch (mode) { + case SSE2: + if (random) + RandomReaderSSE2 (chunk_ptrs, size/256, loops); + else { + if (size & 128) + ReaderSSE2_128bytes (chunk, size, loops); + else + ReaderSSE2 (chunk, size, loops); + } + break; + + case SSE2_BYPASS: + // No random reader for bypass. + // + if (random) + RandomReaderSSE2_bypass (chunk_ptrs, size/256, loops); + else { + if (size & 128) + ReaderSSE2_128bytes_bypass (chunk, size, loops); + else + ReaderSSE2_bypass (chunk, size, loops); + } + break; + + case AVX: + if (!random) { + ReaderAVX (chunk, size, loops); + } + break; + + case LODSB: + if (!random) { + ReaderLODSB (chunk, size, loops); + } + break; + + case LODSW: + if (!random) { + ReaderLODSW (chunk, size, loops); + } + break; + + case LODSD: + if (!random) { + ReaderLODSD (chunk, size, loops); + } + break; + + case LODSQ: + if (!random) { + ReaderLODSQ (chunk, size, loops); + } + break; + + default: + if (random) { + RandomReader (chunk_ptrs, size/256, loops); + } else { + if (size & 128) + Reader_128bytes (chunk, size, loops); + else + Reader (chunk, size, loops); + } + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + + +//---------------------------------------------------------------------------- +// Name: do_copy +// Purpose: Performs sequential memory copy. +//---------------------------------------------------------------------------- +int +do_copy (unsigned long size, int mode) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk_src; + unsigned char *chunk_dest; + unsigned char *chunk_src0; + unsigned char *chunk_dest0; + unsigned long tmp; + + if (size & 127) + error ("do_copy(): chunk size is not multiple of 128."); + + //------------------------------------------------- + chunk_src0 = chunk_src = malloc (size+64); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+64); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 31) { + tmp -= (tmp & 31); + tmp += 32; + chunk_dest = (unsigned char*) tmp; + } + + //------------------------------------------------- + print (L"Sequential copy "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else if (mode == AVX) { + print (L"(256-bit), size = "); + } + else { +#ifdef __x86_64__ + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + if (loops < 1) + loops = 1; + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + + if (mode == SSE2) { +#ifdef __x86_64__ + if (size & 128) + CopySSE_128bytes (chunk_dest, chunk_src, size, loops); + else + CopySSE (chunk_dest, chunk_src, size, loops); +#else + CopySSE (chunk_dest, chunk_src, size, loops); +#endif + } + else if (mode == AVX) { + if (!(size & 128)) + CopyAVX (chunk_dest, chunk_src, size, loops); + } + + diff = mytime () - t0; + } + + print (L"loops = "); + print_uint (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk_src0); + free (chunk_dest0); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: fb_readwrite +// Purpose: Performs sequential read & write tests on framebuffer memory. +//---------------------------------------------------------------------------- +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) +void +fb_readwrite (bool use_sse2) +{ + unsigned long counter, total_count; + unsigned long length; + unsigned long diff, t0; + static struct fb_fix_screeninfo fi; + static struct fb_var_screeninfo vi; + unsigned long *fb = NULL; + unsigned long datum; + int fd; + register unsigned long foo; +#ifdef __x86_64__ + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + + //------------------------------------------------- + + fd = open ("/dev/fb0", O_RDWR); + if (fd < 0) + fd = open ("/dev/fb/0", O_RDWR); + if (fd < 0) { + println (L"Cannot open framebuffer device."); + return; + } + + if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) { + close (fd); + println (L"Cannot get framebuffer info"); + return; + } + else + { + if (fi.visual != FB_VISUAL_TRUECOLOR && + fi.visual != FB_VISUAL_DIRECTCOLOR ) { + close (fd); + println (L"Need direct/truecolor framebuffer device."); + return; + } else { + unsigned long fblen; + + print (L"Framebuffer resolution: "); + print_int (vi.xres); + print (L"x"); + print_int (vi.yres); + print (L", "); + print_int (vi.bits_per_pixel); + println (L" bpp\n"); + + fb = (unsigned long*) fi.smem_start; + fblen = fi.smem_len; + + fb = mmap (fb, fblen, + PROT_WRITE | PROT_READ, + MAP_SHARED, fd, 0); + if (fb == MAP_FAILED) { + close (fd); + println (L"Cannot access framebuffer memory."); + return; + } + } + } + + //------------------- + // Use only the upper half of the display. + // + length = FB_SIZE; + + //------------------- + // READ + // + print (L"Framebuffer memory sequential read "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_R; + + if (use_sse2) + ReaderSSE2 (fb, length, FBLOOPS_R); + else + Reader (fb, length, FBLOOPS_R); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); + + //------------------- + // WRITE + // + print (L"Framebuffer memory sequential write "); + flush (); + + t0 = mytime (); + + total_count = FBLOOPS_W; + + if (use_sse2) + WriterSSE2_bypass (fb, length, FBLOOPS_W, value); + else + Writer (fb, length, FBLOOPS_W, value); + + diff = mytime () - t0; + + calculate_result (length, total_count, diff); + newline (); +} +#endif + +//---------------------------------------------------------------------------- +// Name: register_test +// Purpose: Determines bandwidth of register-to-register transfers. +//---------------------------------------------------------------------------- +void +register_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to main register transfers (64-bit) "); +#else + print (L"Main register to main register transfers (32-bit) "); +#endif + flush (); +#define REGISTER_COUNT 10000 + + t0 = mytime (); + while (diff < usec_per_test) + { + RegisterToRegister (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to vector register transfers (64-bit) "); +#else + print (L"Main register to vector register transfers (32-bit) "); +#endif + flush (); +#define VREGISTER_COUNT 3333 + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + RegisterToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Vector register to main register transfers (64-bit) "); +#else + print (L"Vector register to main register transfers (32-bit) "); +#endif + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + print (L"Vector register to vector register transfers (128-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (cpu_has_avx) { + print (L"Vector register to vector register transfers (256-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVectorAVX (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 8-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector8ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Vector 16-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector16ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 32-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector32ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 64-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector64ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 8-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register8ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (64, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Main register 16-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register16ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (128, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 32-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register32ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 64-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register64ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } +} + +//---------------------------------------------------------------------------- +// Name: stack_test +// Purpose: Determines bandwidth of stack-to/from-register transfers. +//---------------------------------------------------------------------------- +void +stack_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + +#ifdef __x86_64__ + print (L"Stack-to-register transfers (64-bit) "); +#else + print (L"Stack-to-register transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackReader (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#ifdef __x86_64__ + print (L"Register-to-stack transfers (64-bit) "); +#else + print (L"Register-to-stack transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackWriter (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +} + +//---------------------------------------------------------------------------- +// Name: library_test +// Purpose: Performs C library tests (memset, memcpy). +//---------------------------------------------------------------------------- +void +library_test () +{ + char *a1, *a2; + unsigned long t, t0; + int i; + + #define NT_SIZE (64*1024*1024) + #define NT_SIZE2 (100) + + a1 = malloc (NT_SIZE); + if (!a1) + error ("Out of memory"); + + a2 = malloc (NT_SIZE); + if (!a2) + error ("Out of memory"); + + //-------------------------------------- + t0 = mytime (); + for (i=0; i<NT_SIZE2; i++) { + memset (a1, i, NT_SIZE); + } + t = mytime (); + + print (L"Library: memset "); + calculate_result (NT_SIZE, NT_SIZE2, t-t0); + newline (); + + flush (); + + //-------------------------------------- + t0 = mytime (); + for (i=0; i<NT_SIZE2; i++) { + memcpy (a2, a1, NT_SIZE); + } + t = mytime (); + + print (L"Library: memcpy "); + calculate_result (NT_SIZE, NT_SIZE2, t-t0); + newline (); + + flush (); + + free (a1); + free (a2); +} + +//---------------------------------------------------------------------------- +// Name: network_test_core +// Purpose: Performs the network test, talking to and receiving data +// back from a transponder node. +// Note: Port number specified using server:# notation. +// Returns: -1 on error, else the network duration in microseconds. +//---------------------------------------------------------------------------- +bool +network_test_core (const char *hostname, char *chunk, + unsigned long chunk_size, + unsigned long n_chunks, + long *duration_send_return, + long *duration_recv_return) +{ + if (!hostname || !chunk || !n_chunks || !chunk_size || + !duration_send_return || + !duration_recv_return) + return false; + + struct hostent* host = gethostbyname (hostname); + if (!host) + return false; + + char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list); + int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(host_ip); + addr.sin_port = htons(network_port); + + if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) + { + // perror ("connect"); + close (sock); + return false; + } + + //------------------------------------ + // Start stopwatch just before the send. + // It will be stopped on receipt of + // the response. + // + unsigned long t0 = mytime (); + + //------------------------------------ + // Put # of chunks in the chunk. + // Send all of our data. + // + sprintf (chunk, "%lu\n", n_chunks); + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, chunk_size, 0); + +#if 0 + //------------------------------------ + // Set nonblocking mode. + // + int opt = 1; + ioctl (sock, FIONBIO, &opt); +#endif + + unsigned long t1 = mytime (); + + //------------------------------------ + // Read the response. + // + int amount = recv (sock, chunk, chunk_size, 0); + if (amount < 16) { + close (sock); + return false; + } + + unsigned long duration_send = mytime() - t0; + + //------------------------------------ + // Validate the response, which + // contains the transponder's + // perceived read duration. This value + // may be as little as half our number. + // + unsigned long duration2 = -1; + if (strncmp ("OK: ", chunk, 4)) { + close (sock); + return false; + } + if (1 != sscanf (4+chunk, "%lu", &duration2)) { + close (sock); + return false; + } + + unsigned long remaining = chunk_size * n_chunks - amount; + while (remaining > 0) { + int amount = recv (sock, chunk, chunk_size, 0); + if (amount <= 0) { + perror ("recv"); + close (sock); + return false; + } + remaining -= amount; + } + + unsigned long duration_recv = mytime () - t1; + + *duration_send_return = duration_send; + *duration_recv_return = duration_recv; + + close (sock); + return true; +} + +//---------------------------------------------------------------------------- +// Name: ip_to_str +//---------------------------------------------------------------------------- +void +ip_to_str (unsigned long addr, char *str) +{ + if (!str) + return; + + unsigned short a = 0xff & addr; + unsigned short b = 0xff & (addr >> 8); + unsigned short c = 0xff & (addr >> 16); + unsigned short d = 0xff & (addr >> 24); + sprintf (str, "%u.%u.%u.%u", a,b,c,d); +} + +//---------------------------------------------------------------------------- +// Name: network_transponder +// Purpose: Act as a transponder, receiving chunks of data and sending +// back an acknowledgement once the enture chunk is read. +// Returns: False if a problem occurs setting up the network socket. +//---------------------------------------------------------------------------- +bool +network_transponder () +{ + struct sockaddr_in sin, from; + + //------------------------------ + // Get listening socket for port. + // Then listen on given port#. + // + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(network_port); + int listensock; + if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + return false; + } + if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { + perror ("bind"); + close (listensock); + return false; + } + if (listen (listensock, 500) < 0) { + perror ("listen"); + close (listensock); + return false; + } + + bool done = false; + while (!done) { + //---------------------------------------- + // Wait for a client to contact us. + // + socklen_t len = sizeof (struct sockaddr); + int sock = accept (listensock, (struct sockaddr*) &from, &len); + if (sock < 0) { + perror ("accept"); + close (listensock); + return false; + } + + //---------------------------------------- + // Clockwatch starts when we accept the + // connection. + // + unsigned long t0 = mytime (); + + if (len != sizeof (struct sockaddr_in)) { + close (sock); + close (listensock); + return false; + } + +#if 0 + unsigned long ipaddr = from.sin_addr.s_addr; + char ipstring[30]; + ip_to_str (ipaddr, ipstring); + fprintf (stderr, "Incoming connection from %s\n", ipstring); +#endif + + //---------------------------------------- + // Read the first chunk only, in order to + // get the # of bytes that will be sent. + // + char chunk [NETWORK_CHUNK_SIZE+1]; + long n_chunks = 0; + int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + chunk [amount_read] = 0; + if (1 != sscanf (chunk, "%ld", &n_chunks)) { + close (sock); + close (listensock); + return false; + } + + //---------------------------------------- + // If the leader sends us a chunk count of + // -99, this indicates that we should exit. + // + if (n_chunks == -99) { + close (sock); + close (listensock); + return true; + } + +// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); + + unsigned long long remaining = n_chunks; + remaining *= NETWORK_CHUNK_SIZE; + +// printf ("remaining="); dump_hex64(remaining); puts(""); + + remaining -= amount_read; + while (remaining > 0) { + amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + remaining -= amount_read; + + if (amount_read < 0) { + perror ("read"); + break; + } else + if (!amount_read) + break; + } + + unsigned long duration = mytime() - t0; + + //------------------------------------ + // Send response of same size. + // + sprintf (chunk, "OK: %lu\n", duration); + chunk[14] = '\n'; + + //------------------------------------ + // Send all of our data. + // + int i; + for (i = 0; i < n_chunks; i++) + send (sock, chunk, NETWORK_CHUNK_SIZE, 0); + + close (sock); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: network_test +//---------------------------------------------------------------------------- +bool +network_test (char **destinations, int n_destinations) +{ + int i; + + //---------------------------------------- + // The memory chunk starts with a 12-byte + // length of the overall send size. + // The memory chunk will have a list of + // the destinations in it. + // In future, there will be a mechanism + // for testing bandwidth between all nodes, + // not just the leader & each of the + // transponders. + // + char chunk [NETWORK_CHUNK_SIZE]; + memset (chunk, 0, NETWORK_CHUNK_SIZE); + sprintf (chunk, "000000000000\n%d\n", n_destinations); + for (i = 0; i < n_destinations; i++) { + char *s = destinations [i]; + int chunk_len = strlen (chunk); + int len = strlen (s); + if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { + //---------------------------------------- + // "transp" indicates that the given node + // has not yet been a leader. + // In future, "done" will indicate it has. + // + sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); + } + } + + static unsigned long colors [] = { + RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE, + RGB_BLACK, RGB_CORAL, + RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE, + RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW, + RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE, + RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET, + RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD + }; +#define NCOLORS (sizeof(colors)/sizeof(unsigned long)) + + //---------------------------------------- + // For each destination, run the test. + // + for (i = 0; i < n_destinations; i++) { + bool problem = false; + + char *hostname = destinations[i]; + printf ("Bandwidth sending to %s:\n", hostname); + + char title [PATH_MAX]; + sprintf (title, "%s send (solid)", hostname); + BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY); + + //---------------------------------------- + // Cache the receive durations for later. + // + unsigned long recv_rates [NETSIZE_MAX]; + int recv_ix = 0; + + //---------------------------------------- + // Send data of increasing sizes. + // + int j = NETSIZE_MIN; + int n_runs = 64; + while (!problem && j <= NETSIZE_MAX) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + + if (!amt_to_send) // unlikely + break; + + //---------------------------------------- + // Send the data; do this n_runs times. + // + unsigned long long total_duration_send = 0; + unsigned long long total_duration_recv = 0; + + int k = n_runs; + while (k--) { + long duration_send, duration_recv; + + if (! network_test_core (hostname, + chunk, NETWORK_CHUNK_SIZE, chunk_count, + &duration_send, &duration_recv)) + { + problem = true; + fprintf (stderr, "\nCan't connect to %s\n", hostname); + break; + } + + total_duration_send += duration_send; + total_duration_recv += duration_recv; + } + + if (problem) + break; + + total_duration_send += n_runs/2; // Round up + total_duration_send /= n_runs; // Get average + long duration = (long) total_duration_send; + + total_duration_recv += n_runs/2; // Round up + total_duration_recv /= n_runs; // Get average + + unsigned long amt_in_kb = amt_to_send / 1024; + unsigned long amt_in_mb = amt_to_send / 1048576; + if (!amt_in_mb) { + printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb, + n_runs); + } else { + printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb, + n_runs); + } + + //------------------------------ + // Calculate send rate in MB/sec. + // + // Get total # bytes. + unsigned long long tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= duration; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + unsigned long whole = tmp / 100; + unsigned long frac = tmp % 100; + printf ("%lu.%02lu MB/s (sent)\t", whole, frac); + fflush (stdout); + + BMPGraphing_add_point (graph, amt_in_kb, tmp); + + //------------------------------ + // Calculate recv rate in MB/sec. + // + // Get total # bytes. + tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= total_duration_recv; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + whole = tmp / 100; + frac = tmp % 100; + printf ("%lu.%02lu MB/s (received)\n", whole, frac); + + recv_rates [recv_ix++] = tmp; + + j++; + n_runs >>= 1; + if (!n_runs) + n_runs = 1; + } + + //---------------------------------------- + // Now add the line for the receive rates. + // + sprintf (title, "%s receive (dashed)", hostname); + BMPGraphing_new_line (graph, title, DASHED | + (i < NCOLORS? colors[i] : RGB_GRAY)); + for (j = NETSIZE_MIN; j <= NETSIZE_MAX; j++) { + unsigned long chunk_count = 1 << (j-NETSIZE_MIN); + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + unsigned long amt_in_kb = amt_to_send / 1024; +// printf ("amt_in_kb=%ld\n",amt_in_kb); + + BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j-NETSIZE_MIN]); + } + + puts (""); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: usage +//---------------------------------------------------------------------------- +void +usage () +{ + printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n"); + printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...] [--port <port#>]\n"); + printf ("Usage for receiving network tests: bandwidth --transponder [--port <port#>]\n"); + + exit (0); +} + +//---------------------------------------------------------------------------- +// Name: main +//---------------------------------------------------------------------------- +int +main (int argc, char **argv) +{ + int i, chunk_size; + + --argc; + ++argv; + + bool network_mode = false; + bool network_leader = false; // false => transponder + int network_destinations_size = 0; + int n_network_destinations = 0; + char **network_destinations = NULL; + + char graph_title [512] = {0}; + + i = 0; + while (i < argc) { + char *s = argv [i++]; + + if (!strcmp ("--network", s)) { + network_mode = true; + network_leader = true; + network_destinations_size = 20; + network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); + } + else + if (!strcmp ("--transponder", s)) { + network_mode = true; + } + else + if (!strcmp ("--port", s)) { + if (i != argc) + network_port = atoi (argv[i++]); + } + else + if (!strcmp ("--slow", s)) { + usec_per_test=20000000; // 20 seconds per test. + } + else + if (!strcmp ("--fast", s)) { + usec_per_test = 500000; // 0.5 seconds per test. + } + else + if (!strcmp ("--faster", s)) { + usec_per_test = 50000; // 0.05 seconds per test. + } + else + if (!strcmp ("--fastest", s)) { + usec_per_test = 5000; // 0.005 seconds per test. + } + else + if (!strcmp ("--nosse2", s)) { + use_sse2 = false; + use_sse4 = false; + } + else + if (!strcmp ("--nosse4", s)) { + use_sse4 = false; + } + else + if (!strcmp ("--help", s)) { + usage (); + } + else + if (!strcmp ("--title", s) && i != argc) { + snprintf (graph_title, 511, "%s", argv[i++]); + } + else { + if ('-' == *s) + usage (); + } + } + + msg[0] = 0; + + for (i = 0; chunk_sizes[i] && i < sizeof(chunk_sizes)/sizeof(int); i++) { + chunk_sizes_log2[i] = log2 (chunk_sizes[i]); + } + + printf ("This is bandwidth version %s.\n", RELEASE); + printf ("Copyright (C) 2005-2014 by Zack T Smith.\n\n"); + printf ("This software is covered by the GNU Public License.\n"); + printf ("It is provided AS-IS, use at your own risk.\n"); + printf ("See the file COPYING for more information.\n\n"); + fflush (stdout); + + //---------------------------------------- + // If network mode selected, enter it now. + // Currently cannot combine memory tests + // & network tests. + // + if (network_mode) { + if (network_leader) { + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR); + strcpy (graph_title, TITLE_MEMORY_NET); + BMPGraphing_set_title (graph, graph_title); + + network_test (network_destinations, n_network_destinations); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "network_bandwidth.bmp"); + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) + puts ("Wrote graph to network_bandwidth.bmp."); + puts (""); + puts ("Done."); +#endif + BMPGraphing_destroy (graph); + } else { + network_transponder (); + } + + return 0; + } + + uint32_t ecx = get_cpuid1_ecx (); + uint32_t edx = get_cpuid1_edx (); + cpu_has_mmx = edx & CPUID_EDX_MMX; + cpu_has_sse = edx & CPUID_EDX_SSE; + cpu_has_sse2 = edx & CPUID_EDX_SSE2; + cpu_has_sse3 = ecx & CPUID_ECX_SSE3; + cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3; + cpu_has_sse41 = ecx & CPUID_ECX_SSE41; + cpu_has_sse42 = ecx & CPUID_ECX_SSE42; + cpu_has_aes = ecx & CPUID_ECX_AES; + cpu_has_avx = ecx & CPUID_ECX_AVX; + cpu_has_avx2 = 0; + + if (cpu_has_avx) { + cpu_has_avx2 = get_cpuid7_ebx (); + cpu_has_avx2 &= CPUID_EBX_AVX2; + } + + use_sse2 = true; + use_sse4 = true; + + cpu_has_sse4a = 0; + cpu_has_64bit = 0; + cpu_has_xd = 0; + + static char family [17]; + get_cpuid_family (family); + family [16] = 0; + printf ("CPU family: %s\n", family); + + uint32_t ecx2 = get_cpuid_80000001_ecx (); + uint32_t edx2 = get_cpuid_80000001_edx (); + + if (!strcmp ("AuthenticAMD", family)) { + is_amd = true; + cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A; + } + else + if (!strcmp ("GenuineIntel", family)) { + is_intel = true; + } + + cpu_has_xd = edx2 & CPUID_EDX_XD; + cpu_has_64bit = edx2 & CPUID_EDX_INTEL64; + + printf ("CPU features: "); + if (cpu_has_mmx) printf ("MMX "); + if (cpu_has_sse) printf ("SSE "); + if (cpu_has_sse2) printf ("SSE2 "); + if (cpu_has_sse3) printf ("SSE3 "); + if (cpu_has_ssse3) printf ("SSSE3 "); + if (cpu_has_sse4a) printf ("SSE4A "); + if (cpu_has_sse41) printf ("SSE4.1 "); + if (cpu_has_sse42) printf ("SSE4.2 "); + if (cpu_has_aes) printf ("AES "); + if (cpu_has_avx) printf ("AVX "); + if (cpu_has_avx2) printf ("AVX2 "); + if (cpu_has_xd) printf ("XD "); + if (cpu_has_64bit) { + if (!is_amd) + printf ("Intel64 "); + else + printf ("LongMode "); + } + puts ("\n"); + + if (is_intel) { + uint32_t cache_info[4]; + i = 0; + while (1) { + get_cpuid_cache_info (cache_info, i); + if (!(cache_info[0] & 31)) + break; + +#if 0 + printf ("Cache info %d = 0x%08x, 0x%08x, 0x%08x, 0x%08x\n", i, + cache_info [0], + cache_info [1], + cache_info [2], + cache_info [3]); +#endif + printf ("Cache %d: ", i); + switch ((cache_info[0] >> 5) & 7) { + case 1: printf ("L1 "); break; + case 2: printf ("L2 "); break; + case 3: printf ("L3 "); break; + } + switch (cache_info[0] & 31) { + case 1: printf ("data cache, "); break; + case 2: printf ("instruction cache, "); break; + case 3: printf ("unified cache, "); break; + } + uint32_t n_ways = 1 + (cache_info[1] >> 22); + uint32_t line_size = 1 + (cache_info[1] & 2047); + uint32_t n_sets = 1 + cache_info[2]; + printf ("line size %d, ", line_size); + printf ("%2d-way%s, ", n_ways, n_ways>1 ? "s" : ""); + printf ("%5d sets, ", n_sets); + unsigned size = (n_ways * line_size * n_sets) >> 10; + printf ("size %dk ", size); + puts (""); + i++; + } + } + + if (!cpu_has_sse41) + use_sse4 = false; + if (!cpu_has_sse2) + use_sse2 = false; + + println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B."); + + flush (); + + //------------------------------------------------------------ + // Attempt to obtain information about the CPU. + // +#ifdef __linux__ + struct stat st; + if (!stat ("/proc/cpuinfo", &st)) { +#define TMPFILE "/tmp/bandw_tmp" + unlink (TMPFILE); + if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) + perror ("system"); + + FILE *f = fopen (TMPFILE, "r"); + if (f) { + float cpu_speed = 0.0; + + if (1 == fscanf (f, "%g", &cpu_speed)) { + puts (""); + printf ("CPU speed is %g MHz.\n", cpu_speed); + } + fclose (f); + } + } else { + printf ("CPU information is not available (/proc/cpuinfo).\n"); + } + fflush (stdout); +#endif + + graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2); + strcpy (graph_title, TITLE_MEMORY_GRAPH); + BMPGraphing_set_title (graph, graph_title); + + //------------------------------------------------------------ + // SSE2 sequential reads. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential reads. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random reads. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do not bypass the caches. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE2 random writes that do not bypass the caches. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential reads that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE4 random reads that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 sequential writes that do bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential writes that do bypass the caches. + // Currently on Intel CPUs (including Xeon) there is a + // microcode bug that leads to a severe drop in performance + // in this part of the test. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, AVX_BYPASS, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // SSE4 random writes that bypass the caches. + // + if (use_sse4) { + BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, SSE2_BYPASS, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE); +#endif + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 reads. + // + newline (); +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN); +#else + BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN); +#endif + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_read (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // Sequential non-SSE2 writes. + // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN); +#else + BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN); +#endif + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 writes. + // +#ifdef __x86_64__ + BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN); +#else + BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN); +#endif + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_write (chunk_size, NO_SSE2, true); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential copy. + // + if (use_sse2) { + BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_copy (chunk_size, SSE2); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + + //------------------------------------------------------------ + // AVX sequential copy. + // + if (cpu_has_avx) { + BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + if (!(chunk_size & 128)) { + int amount = do_copy (chunk_size, AVX); + BMPGraphing_add_point (graph, chunk_size, amount); + } + } + } + +#ifdef DOING_LODS +#ifdef __x86_64__ + //------------------------------------------------------------ + // LODSQ 64-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSQ, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // LODSD 32-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSD, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSW 16-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSW, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } + + //------------------------------------------------------------ + // LODSB 64-bit sequential reads. + // + BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, LODSB, false); + BMPGraphing_add_point (graph, chunk_size, amount); + } +#endif + + //------------------------------------------------------------ + // Register to register. + // + newline (); + register_test (); + + //------------------------------------------------------------ + // Stack to/from register. + // + newline (); + stack_test (); + + //------------------------------------------------------------ + // C library performance. + // + newline (); + library_test (); + + //------------------------------------------------------------ + // Framebuffer read & write. + // +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) + newline (); + fb_readwrite (true); +#endif + +premature_end_for_testing: + flush (); + + BMPGraphing_make (graph); + + BMP_write (graph->image, "bandwidth.bmp"); + + puts ("\nWrote graph to bandwidth.bmp."); + puts (""); + puts ("Done."); + + BMPGraphing_destroy (graph); + + return 0; +} |