/*============================================================================ bandwidth 1.1, a benchmark to estimate memory transfer bandwidth. Copyright (C) 2005-2014 by Zack T Smith. Copyright (c) 2015 Raptor Engineering This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Zack Smith may be reached at veritas@comcast.net. *===========================================================================*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include // gethostbyname #include #include #include #define GRAPH_WIDTH 1440 #define GRAPH_HEIGHT 900 #include "defs.h" #include "BMP.h" #include "BMPGraphing.h" #if defined(__x86_64__) || defined(__i386__) #define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" #define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith, http://zsmith.co" #else #define TITLE_MEMORY_NET "Network benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering" #define TITLE_MEMORY_GRAPH "Memory benchmark results from bandwidth " RELEASE " by Zack Smith and Raptor Engineering" #endif #ifdef __WIN32__ #include #endif #ifdef __linux__ #include #include #endif static int network_port = NETWORK_DEFAULT_PORTNUM; enum { NO_SSE2, SSE2, SSE2_BYPASS, AVX, AVX_BYPASS, VSX, LODSQ, LODSD, LODSW, LODSB }; static BMPGraph *graph = NULL; static bool use_sse2 = true; static bool use_sse4 = true; static bool is_intel = false; static bool is_amd = false; static uint32_t cpu_has_mmx = 0; static uint32_t cpu_has_sse = 0; static uint32_t cpu_has_sse2 = 0; static uint32_t cpu_has_sse3 = 0; static uint32_t cpu_has_ssse3 = 0; static uint32_t cpu_has_sse4a = 0; static uint32_t cpu_has_sse41 = 0; static uint32_t cpu_has_sse42 = 0; static uint32_t cpu_has_aes = 0; static uint32_t cpu_has_avx = 0; static uint32_t cpu_has_avx2 = 0; static uint32_t cpu_has_vsx = 0; static uint32_t cpu_has_64bit = 0; static uint32_t cpu_has_xd = 0; //---------------------------------------- // Parameters for the tests. // static long usec_per_test = 5000000; // 5 seconds per memory test. static int chunk_sizes[] = { 128, 256, 384, 512, 640, 768, 896, 1024, 1280, 2048, 3072, 4096, 6144, 8192, // Some processors' L1 data caches are only 8kB. 12288, 16384, 20480, 24576, 28672, 32768, // Common L1 data cache size. 34*1024, 36*1024, 40960, 49152, 65536, 131072, // Old L2 cache size. 192 * 1024, 256 * 1024, // Old L2 cache size. 320 * 1024, 384 * 1024, 512 * 1024, // Old L2 cache size. 768 * 1024, 1 << 20, // 1 MB = common L2 cache size. (1024 + 256) * 1024, // 1.25 (1024 + 512) * 1024, // 1.5 (1024 + 768) * 1024, // 1.75 1 << 21, // 2 MB = common L2 cache size. (2048 + 256) * 1024, // 2.25 (2048 + 512) * 1024, // 2.5 (2048 + 768) * 1024, // 2.75 3072 * 1024, // 3 MB = common L2 cache size. 3407872, // 3.25 MB 3 * 1024 * 1024 + 1024 * 512, // 3.5 MB 1 << 22, // 4 MB 5242880, // 5 megs 6291456, // 6 megs (common L2 cache size) 7 * 1024 * 1024, 8 * 1024 * 1024, // Xeon E3's often has 8MB L3 9 * 1024 * 1024, 10 * 1024 * 1024, // Xeon E5-2609 has 10MB L3 12 * 1024 * 1024, 14 * 1024 * 1024, 15 * 1024 * 1024, // Xeon E6-2630 has 15MB L3 16 * 1024 * 1024, 20 * 1024 * 1024, // Xeon E5-2690 has 20MB L3 21 * 1024 * 1024, 32 * 1024 * 1024, 48 * 1024 * 1024, 64 * 1024 * 1024, 72 * 1024 * 1024, 96 * 1024 * 1024, 128 * 1024 * 1024, #if defined(__PPC64__) 256 * 1024 * 1024, 512 * 1024 * 1024, 1024 * 1024 * 1024, #endif 0 }; static double chunk_sizes_log2 [sizeof(chunk_sizes)/sizeof(int)]; //---------------------------------------------------------------------------- // Name: error // Purpose: Complain and exit. //---------------------------------------------------------------------------- void error (char *s) { #ifndef __WIN32__ fprintf (stderr, "Error: %s\n", s); exit (1); #else wchar_t tmp [200]; int i; for (i = 0; s[i]; i++) tmp[i] = s[i]; tmp[i] = 0; MessageBoxW (0, tmp, L"Error", 0); ExitProcess (0); #endif } //============================================================================ // Output buffer logic. // This is somewhat vestigial code, originating with Windows Mobile ARM port. //============================================================================ #define MSGLEN 10000 static wchar_t msg [MSGLEN]; void print (wchar_t *s) { wcsncat (msg, s, MSGLEN-1); } void newline () { wcsncat (msg, L"\n", MSGLEN-1); } void println (wchar_t *s) { wcsncat (msg, s, MSGLEN-1); newline (); } void print_int (int d) { swprintf (msg + wcslen (msg), MSGLEN, L"%d", d); } void print_uint (unsigned int d) { swprintf (msg + wcslen (msg), MSGLEN, L"%lu", d); } void println_int (int d) { print_int (d); newline (); } void print_result (long double result) { swprintf (msg + wcslen (msg), MSGLEN, L"%.1Lf MB/s", result); } void dump (FILE *f) { if (!f) f = stdout; int i = 0; while (msg[i]) { char ch = (char) msg[i]; fputc (ch, f); i++; } msg [0] = 0; } void flush () { dump (NULL); fflush (stdout); } void print_size (unsigned long size) { if (size < 1536) { print_int (size); print (L" B"); } else if (size < (1<<20)) { print_int (size >> 10); print (L" kB"); } else { print_int (size >> 20); switch ((size >> 18) & 3) { case 1: print (L".25"); break; case 2: print (L".5"); break; case 3: print (L".75"); break; } print (L" MB"); } } //============================================================================ // Timing logic. //============================================================================ //---------------------------------------------------------------------------- // Name: mytime // Purpose: Reports time in microseconds. //---------------------------------------------------------------------------- unsigned long mytime () { #ifndef __WIN32__ struct timeval tv; struct timezone tz; memset (&tz, 0, sizeof(struct timezone)); gettimeofday (&tv, &tz); return 1000000 * tv.tv_sec + tv.tv_usec; #else return 1000 * GetTickCount (); // accurate enough. #endif } //---------------------------------------------------------------------------- // Name: calculate_result // Purpose: Calculates and prints a result. // Returns: 10 times the number of megabytes per second. //---------------------------------------------------------------------------- int calculate_result (unsigned long chunk_size, long long total_loops, long diff) { if (!diff) error ("Zero time difference."); // printf ("\nIn calculate_result, chunk_size=%ld, total_loops=%lld, diff=%ld\n", chunk_size, total_loops, diff); long double result = (long double) chunk_size; result *= (long double) total_loops; result *= 1000000.; // Convert to microseconds. result /= 1048576.; result /= (long double) diff; print_result (result); return (long) (10.0 * result); } //============================================================================ // Tests. //============================================================================ //---------------------------------------------------------------------------- // Name: do_write // Purpose: Performs write on chunk of memory of specified size. //---------------------------------------------------------------------------- int do_write (unsigned long size, int mode, bool random) { unsigned char *chunk; unsigned char *chunk0; unsigned long loops; unsigned long long total_count=0; #if defined(__x86_64__) || defined(__PPC64__) unsigned long value = 0x1234567689abcdef; #else unsigned long value = 0x12345678; #endif unsigned long diff=0, t0; unsigned long tmp; unsigned long **chunk_ptrs = NULL; if (size & 127) error ("do_write(): chunk size is not multiple of 128."); //------------------------------------------------- #if defined(__PPC64__) // Align to 128-bit boundaries chunk0 = malloc (size+256); chunk = chunk0; if (!chunk) error ("Out of memory"); tmp = (unsigned long) chunk; if (tmp & 127) { tmp -= (tmp & 127); tmp += 128; chunk = (unsigned char*) tmp; } #else // Align to 32-bit boundaries chunk0 = malloc (size+64); chunk = chunk0; if (!chunk) error ("Out of memory"); tmp = (unsigned long) chunk; if (tmp & 31) { tmp -= (tmp & 31); tmp += 32; chunk = (unsigned char*) tmp; } #endif //---------------------------------------- // Set up random pointers to chunks. // if (random) { tmp = size/256; chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); if (!chunk_ptrs) error ("Out of memory."); //---------------------------------------- // Store pointers to all chunks into array. // int i; for (i = 0; i < tmp; i++) { chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); } //---------------------------------------- // Randomize the array of chunk pointers. // int k = 100; while (k--) { for (i = 0; i < tmp; i++) { int j = rand() % tmp; if (i != j) { unsigned long *ptr = chunk_ptrs [i]; chunk_ptrs [i] = chunk_ptrs [j]; chunk_ptrs [j] = ptr; } } } } //------------------------------------------------- if (random) print (L"Random write "); else print (L"Sequential write "); switch (mode) { case SSE2: print (L"(128-bit), size = "); break; case AVX: print (L"(256-bit), size = "); break; case VSX: print (L"(128-bit), size = "); break; case AVX_BYPASS: print (L"bypassing cache (256-bit), size = "); break; case SSE2_BYPASS: print (L"bypassing cache (128-bit), size = "); break; default: #if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); #endif } print_size (size); print (L", "); loops = (1 << 26) / size;// XX need to adjust for CPU MHz if (loops < 1) loops = 1; t0 = mytime (); while (diff < usec_per_test) { total_count += loops; switch (mode) { #if defined(__x86_64__) || defined(__i386__) case SSE2: if (random) RandomWriterSSE2 (chunk_ptrs, size/256, loops, value); else { if (size & 128) WriterSSE2_128bytes (chunk, size, loops, value); else WriterSSE2 (chunk, size, loops, value); } break; case SSE2_BYPASS: if (random) RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value); else { if (size & 128) WriterSSE2_128bytes_bypass (chunk, size, loops, value); else WriterSSE2_bypass (chunk, size, loops, value); } break; case AVX: if (!random) { WriterAVX (chunk, size, loops, value); } break; case AVX_BYPASS: if (!random) { WriterAVX_bypass (chunk, size, loops, value); } break; #endif #if defined(__PPC64__) case VSX: if (random) RandomWriterVSX (chunk_ptrs, size/256, loops, value); else WriterVSX (chunk, size, loops, value); break; #endif default: if (random) RandomWriter (chunk_ptrs, size/256, loops, value); else { if (size & 128) Writer_128bytes (chunk, size, loops, value); else Writer (chunk, size, loops, value); } } diff = mytime () - t0; } print (L"loops = "); print_uint (total_count); print (L", "); flush (); int result = calculate_result (size, total_count, diff); newline (); flush (); free ((void*)chunk0); if (chunk_ptrs) free (chunk_ptrs); return result; } //---------------------------------------------------------------------------- // Name: do_read // Purpose: Performs sequential read on chunk of memory of specified size. //---------------------------------------------------------------------------- int do_read (unsigned long size, int mode, bool random) { unsigned long loops; unsigned long long total_count = 0; unsigned long t0, diff=0; unsigned char *chunk; unsigned char *chunk0; unsigned long tmp; unsigned long **chunk_ptrs = NULL; if (size & 127) error ("do_read(): chunk size is not multiple of 128."); //------------------------------------------------- #if defined(__PPC64__) // Align to 128-bit boundaries chunk0 = chunk = malloc (size+256); if (!chunk) error ("Out of memory"); memset (chunk, 0, size); tmp = (unsigned long) chunk; if (tmp & 127) { tmp -= (tmp & 127); tmp += 128; chunk = (unsigned char*) tmp; } #else // Align to 32-bit boundaries chunk0 = chunk = malloc (size+64); if (!chunk) error ("Out of memory"); memset (chunk, 0, size); tmp = (unsigned long) chunk; if (tmp & 31) { tmp -= (tmp & 31); tmp += 32; chunk = (unsigned char*) tmp; } #endif //---------------------------------------- // Set up random pointers to chunks. // if (random) { int tmp = size/256; chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); if (!chunk_ptrs) error ("Out of memory."); //---------------------------------------- // Store pointers to all chunks into array. // int i; for (i = 0; i < tmp; i++) { chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); } //---------------------------------------- // Randomize the array of chunk pointers. // int k = 100; while (k--) { for (i = 0; i < tmp; i++) { int j = rand() % tmp; if (i != j) { unsigned long *ptr = chunk_ptrs [i]; chunk_ptrs [i] = chunk_ptrs [j]; chunk_ptrs [j] = ptr; } } } } //------------------------------------------------- if (random) print (L"Random read "); else print (L"Sequential read "); switch (mode) { case SSE2: print (L"(128-bit), size = "); break; case LODSB: print (L"(8-bit LODSB), size = "); break; case LODSW: print (L"(16-bit LODSW), size = "); break; case LODSD: print (L"(32-bit LODSD), size = "); break; case LODSQ: print (L"(64-bit LODSQ), size = "); break; case AVX: print (L"(256-bit), size = "); break; case VSX: print (L"(128-bit), size = "); break; case AVX_BYPASS: print (L"bypassing cache (256-bit), size = "); break; case SSE2_BYPASS: print (L"bypassing cache (128-bit), size = "); break; default: #if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); #endif } print_size (size); print (L", "); flush (); loops = (1 << 26) / size; // XX need to adjust for CPU MHz if (loops < 1) loops = 1; t0 = mytime (); while (diff < usec_per_test) { total_count += loops; switch (mode) { #if defined(__x86_64__) || defined(__i386__) case SSE2: if (random) RandomReaderSSE2 (chunk_ptrs, size/256, loops); else { if (size & 128) ReaderSSE2_128bytes (chunk, size, loops); else ReaderSSE2 (chunk, size, loops); } break; case SSE2_BYPASS: // No random reader for bypass. // if (random) RandomReaderSSE2_bypass (chunk_ptrs, size/256, loops); else { if (size & 128) ReaderSSE2_128bytes_bypass (chunk, size, loops); else ReaderSSE2_bypass (chunk, size, loops); } break; case AVX: if (!random) { ReaderAVX (chunk, size, loops); } break; #endif #if defined(__PPC64__) case VSX: if (random) RandomReaderVSX (chunk_ptrs, size/256, loops); else ReaderVSX (chunk, size, loops); break; #endif case LODSB: if (!random) { ReaderLODSB (chunk, size, loops); } break; case LODSW: if (!random) { ReaderLODSW (chunk, size, loops); } break; case LODSD: if (!random) { ReaderLODSD (chunk, size, loops); } break; case LODSQ: if (!random) { ReaderLODSQ (chunk, size, loops); } break; default: if (random) { RandomReader (chunk_ptrs, size/256, loops); } else { if (size & 128) Reader_128bytes (chunk, size, loops); else Reader (chunk, size, loops); } } diff = mytime () - t0; } print (L"loops = "); print_uint (total_count); print (L", "); int result = calculate_result (size, total_count, diff); newline (); flush (); free (chunk0); if (chunk_ptrs) free (chunk_ptrs); return result; } #if defined(__x86_64__) || defined(__i386__) //---------------------------------------------------------------------------- // Name: do_copy // Purpose: Performs sequential memory copy. //---------------------------------------------------------------------------- int do_copy (unsigned long size, int mode) { unsigned long loops; unsigned long long total_count = 0; unsigned long t0, diff=0; unsigned char *chunk_src; unsigned char *chunk_dest; unsigned char *chunk_src0; unsigned char *chunk_dest0; unsigned long tmp; if (size & 127) error ("do_copy(): chunk size is not multiple of 128."); //------------------------------------------------- #if defined(__PPC64__) // Align to 128-bit boundaries chunk_src0 = chunk_src = malloc (size+256); if (!chunk_src) error ("Out of memory"); chunk_dest0 = chunk_dest = malloc (size+256); if (!chunk_dest) error ("Out of memory"); memset (chunk_src, 100, size); memset (chunk_dest, 200, size); tmp = (unsigned long) chunk_src; if (tmp & 127) { tmp -= (tmp & 127); tmp += 128; chunk_src = (unsigned char*) tmp; } tmp = (unsigned long) chunk_dest; if (tmp & 127) { tmp -= (tmp & 127); tmp += 128; chunk_dest = (unsigned char*) tmp; } #else // Align to 32-bit boundaries chunk_src0 = chunk_src = malloc (size+64); if (!chunk_src) error ("Out of memory"); chunk_dest0 = chunk_dest = malloc (size+64); if (!chunk_dest) error ("Out of memory"); memset (chunk_src, 100, size); memset (chunk_dest, 200, size); tmp = (unsigned long) chunk_src; if (tmp & 31) { tmp -= (tmp & 31); tmp += 32; chunk_src = (unsigned char*) tmp; } tmp = (unsigned long) chunk_dest; if (tmp & 31) { tmp -= (tmp & 31); tmp += 32; chunk_dest = (unsigned char*) tmp; } #endif //------------------------------------------------- print (L"Sequential copy "); if (mode == SSE2) { print (L"(128-bit), size = "); } else if (mode == AVX) { print (L"(256-bit), size = "); } else { #if defined(__x86_64__) || defined(__PPC64__) print (L"(64-bit), size = "); #else print (L"(32-bit), size = "); #endif } print_size (size); print (L", "); flush (); loops = (1 << 26) / size; // XX need to adjust for CPU MHz if (loops < 1) loops = 1; t0 = mytime (); while (diff < usec_per_test) { total_count += loops; if (mode == SSE2) { #if defined(__x86_64__) if (size & 128) CopySSE_128bytes (chunk_dest, chunk_src, size, loops); else CopySSE (chunk_dest, chunk_src, size, loops); #else CopySSE (chunk_dest, chunk_src, size, loops); #endif } else if (mode == AVX) { if (!(size & 128)) CopyAVX (chunk_dest, chunk_src, size, loops); } diff = mytime () - t0; } print (L"loops = "); print_uint (total_count); print (L", "); int result = calculate_result (size, total_count, diff); newline (); flush (); free (chunk_src0); free (chunk_dest0); return result; } #endif //---------------------------------------------------------------------------- // Name: fb_readwrite // Purpose: Performs sequential read & write tests on framebuffer memory. //---------------------------------------------------------------------------- #if defined(__linux__) && defined(FBIOGET_FSCREENINFO) void fb_readwrite (bool use_sse2) { unsigned long counter, total_count; unsigned long length; unsigned long diff, t0; static struct fb_fix_screeninfo fi; static struct fb_var_screeninfo vi; unsigned long *fb = NULL; unsigned long datum; int fd; register unsigned long foo; #if defined(__x86_64__) || defined(__PPC64__) unsigned long value = 0x1234567689abcdef; #else unsigned long value = 0x12345678; #endif //------------------------------------------------- fd = open ("/dev/fb0", O_RDWR); if (fd < 0) fd = open ("/dev/fb/0", O_RDWR); if (fd < 0) { println (L"Cannot open framebuffer device."); return; } if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) { close (fd); println (L"Cannot get framebuffer info"); return; } else if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) { close (fd); println (L"Cannot get framebuffer info"); return; } else { if (fi.visual != FB_VISUAL_TRUECOLOR && fi.visual != FB_VISUAL_DIRECTCOLOR ) { close (fd); println (L"Need direct/truecolor framebuffer device."); return; } else { unsigned long fblen; print (L"Framebuffer resolution: "); print_int (vi.xres); print (L"x"); print_int (vi.yres); print (L", "); print_int (vi.bits_per_pixel); println (L" bpp\n"); fb = (unsigned long*) fi.smem_start; fblen = fi.smem_len; fb = mmap (fb, fblen, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); if (fb == MAP_FAILED) { close (fd); println (L"Cannot access framebuffer memory."); return; } } } //------------------- // Use only the upper half of the display. // length = FB_SIZE; //------------------- // READ // print (L"Framebuffer memory sequential read "); flush (); t0 = mytime (); total_count = FBLOOPS_R; #if defined(__x86_64__) || defined(__i386__) if (use_sse2) ReaderSSE2 (fb, length, FBLOOPS_R); else #endif Reader (fb, length, FBLOOPS_R); diff = mytime () - t0; calculate_result (length, total_count, diff); newline (); //------------------- // WRITE // print (L"Framebuffer memory sequential write "); flush (); t0 = mytime (); total_count = FBLOOPS_W; #if defined(__x86_64__) || defined(__i386__) if (use_sse2) WriterSSE2_bypass (fb, length, FBLOOPS_W, value); else #endif Writer (fb, length, FBLOOPS_W, value); diff = mytime () - t0; calculate_result (length, total_count, diff); newline (); } #endif //---------------------------------------------------------------------------- // Name: register_test // Purpose: Determines bandwidth of register-to-register transfers. //---------------------------------------------------------------------------- #define REGISTER_COUNT 10000 #define VREGISTER_COUNT 3333 void register_test () { long long total_count = 0; unsigned long t0; unsigned long diff = 0; //-------------------------------------- #if defined(__x86_64__) || defined(__PPC64__) print (L"Main register to main register transfers (64-bit) "); #else print (L"Main register to main register transfers (32-bit) "); #endif flush (); t0 = mytime (); while (diff < usec_per_test) { RegisterToRegister (REGISTER_COUNT); total_count += REGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); //-------------------------------------- #if defined(__x86_64__) || defined(__PPC64__) print (L"Main register to vector register transfers (64-bit) "); #else print (L"Main register to vector register transfers (32-bit) "); #endif flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { RegisterToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); //-------------------------------------- #if defined(__x86_64__) || defined(__PPC64__) print (L"Vector register to main register transfers (64-bit) "); #else print (L"Vector register to main register transfers (32-bit) "); #endif flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { VectorToRegister (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); //-------------------------------------- print (L"Vector register to vector register transfers (128-bit) "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { VectorToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); #if defined(__x86_64__) || defined(__i386__) //-------------------------------------- if (cpu_has_avx) { print (L"Vector register to vector register transfers (256-bit) "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { VectorToVectorAVX (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } //-------------------------------------- if (use_sse4) { print (L"Vector 8-bit datum to main register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Vector8ToRegister (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (64, total_count, diff); newline (); flush (); } //-------------------------------------- print (L"Vector 16-bit datum to main register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Vector16ToRegister (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (128, total_count, diff); newline (); flush (); //-------------------------------------- if (use_sse4) { print (L"Vector 32-bit datum to main register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Vector32ToRegister (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } //-------------------------------------- if (use_sse4) { print (L"Vector 64-bit datum to main register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Vector64ToRegister (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } //-------------------------------------- if (use_sse4) { print (L"Main register 8-bit datum to vector register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Register8ToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (64, total_count, diff); newline (); flush (); } //-------------------------------------- print (L"Main register 16-bit datum to vector register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Register16ToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (128, total_count, diff); newline (); flush (); //-------------------------------------- if (use_sse4) { print (L"Main register 32-bit datum to vector register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Register32ToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } //-------------------------------------- if (use_sse4) { print (L"Main register 64-bit datum to vector register transfers "); flush (); t0 = mytime (); diff = 0; total_count = 0; while (diff < usec_per_test) { Register64ToVector (VREGISTER_COUNT); total_count += VREGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } #endif } //---------------------------------------------------------------------------- // Name: stack_test // Purpose: Determines bandwidth of stack-to/from-register transfers. //---------------------------------------------------------------------------- void stack_test () { long long total_count = 0; unsigned long t0; unsigned long diff = 0; #if defined(__x86_64__) || defined(__PPC64__) print (L"Stack-to-register transfers (64-bit) "); #else print (L"Stack-to-register transfers (32-bit) "); #endif flush (); //-------------------------------------- diff = 0; total_count = 0; t0 = mytime (); while (diff < usec_per_test) { StackReader (REGISTER_COUNT); total_count += REGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); #if defined(__x86_64__) || defined(__PPC64__) print (L"Register-to-stack transfers (64-bit) "); #else print (L"Register-to-stack transfers (32-bit) "); #endif flush (); //-------------------------------------- diff = 0; total_count = 0; t0 = mytime (); while (diff < usec_per_test) { StackWriter (REGISTER_COUNT); total_count += REGISTER_COUNT; diff = mytime () - t0; } calculate_result (256, total_count, diff); newline (); flush (); } //---------------------------------------------------------------------------- // Name: library_test // Purpose: Performs C library tests (memset, memcpy). //---------------------------------------------------------------------------- void library_test () { char *a1, *a2; unsigned long t, t0; int i; #define NT_SIZE (64*1024*1024) #define NT_SIZE2 (100) a1 = malloc (NT_SIZE); if (!a1) error ("Out of memory"); a2 = malloc (NT_SIZE); if (!a2) error ("Out of memory"); //-------------------------------------- t0 = mytime (); for (i=0; ih_addr_list); int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); struct sockaddr_in addr; addr.sin_family = AF_INET; addr.sin_addr.s_addr = inet_addr(host_ip); addr.sin_port = htons(network_port); if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) { // perror ("connect"); close (sock); return false; } //------------------------------------ // Start stopwatch just before the send. // It will be stopped on receipt of // the response. // unsigned long t0 = mytime (); //------------------------------------ // Put # of chunks in the chunk. // Send all of our data. // sprintf (chunk, "%lu\n", n_chunks); int i; for (i = 0; i < n_chunks; i++) send (sock, chunk, chunk_size, 0); #if 0 //------------------------------------ // Set nonblocking mode. // int opt = 1; ioctl (sock, FIONBIO, &opt); #endif unsigned long t1 = mytime (); //------------------------------------ // Read the response. // int amount = recv (sock, chunk, chunk_size, 0); if (amount < 16) { close (sock); return false; } unsigned long duration_send = mytime() - t0; //------------------------------------ // Validate the response, which // contains the transponder's // perceived read duration. This value // may be as little as half our number. // unsigned long duration2 = -1; if (strncmp ("OK: ", chunk, 4)) { close (sock); return false; } if (1 != sscanf (4+chunk, "%lu", &duration2)) { close (sock); return false; } unsigned long remaining = chunk_size * n_chunks - amount; while (remaining > 0) { int amount = recv (sock, chunk, chunk_size, 0); if (amount <= 0) { perror ("recv"); close (sock); return false; } remaining -= amount; } unsigned long duration_recv = mytime () - t1; *duration_send_return = duration_send; *duration_recv_return = duration_recv; close (sock); return true; } //---------------------------------------------------------------------------- // Name: ip_to_str //---------------------------------------------------------------------------- void ip_to_str (unsigned long addr, char *str) { if (!str) return; unsigned short a = 0xff & addr; unsigned short b = 0xff & (addr >> 8); unsigned short c = 0xff & (addr >> 16); unsigned short d = 0xff & (addr >> 24); sprintf (str, "%u.%u.%u.%u", a,b,c,d); } //---------------------------------------------------------------------------- // Name: network_transponder // Purpose: Act as a transponder, receiving chunks of data and sending // back an acknowledgement once the enture chunk is read. // Returns: False if a problem occurs setting up the network socket. //---------------------------------------------------------------------------- bool network_transponder () { struct sockaddr_in sin, from; //------------------------------ // Get listening socket for port. // Then listen on given port#. // sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); sin.sin_port = htons(network_port); int listensock; if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { perror ("socket"); return false; } if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { perror ("bind"); close (listensock); return false; } if (listen (listensock, 500) < 0) { perror ("listen"); close (listensock); return false; } bool done = false; while (!done) { //---------------------------------------- // Wait for a client to contact us. // socklen_t len = sizeof (struct sockaddr); int sock = accept (listensock, (struct sockaddr*) &from, &len); if (sock < 0) { perror ("accept"); close (listensock); return false; } //---------------------------------------- // Clockwatch starts when we accept the // connection. // unsigned long t0 = mytime (); if (len != sizeof (struct sockaddr_in)) { close (sock); close (listensock); return false; } #if 0 unsigned long ipaddr = from.sin_addr.s_addr; char ipstring[30]; ip_to_str (ipaddr, ipstring); fprintf (stderr, "Incoming connection from %s\n", ipstring); #endif //---------------------------------------- // Read the first chunk only, in order to // get the # of bytes that will be sent. // char chunk [NETWORK_CHUNK_SIZE+1]; long n_chunks = 0; int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); chunk [amount_read] = 0; if (1 != sscanf (chunk, "%ld", &n_chunks)) { close (sock); close (listensock); return false; } //---------------------------------------- // If the leader sends us a chunk count of // -99, this indicates that we should exit. // if (n_chunks == -99) { close (sock); close (listensock); return true; } // printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); unsigned long long remaining = n_chunks; remaining *= NETWORK_CHUNK_SIZE; // printf ("remaining="); dump_hex64(remaining); puts(""); remaining -= amount_read; while (remaining > 0) { amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); remaining -= amount_read; if (amount_read < 0) { perror ("read"); break; } else if (!amount_read) break; } unsigned long duration = mytime() - t0; //------------------------------------ // Send response of same size. // sprintf (chunk, "OK: %lu\n", duration); chunk[14] = '\n'; //------------------------------------ // Send all of our data. // int i; for (i = 0; i < n_chunks; i++) send (sock, chunk, NETWORK_CHUNK_SIZE, 0); close (sock); } return true; } //---------------------------------------------------------------------------- // Name: network_test //---------------------------------------------------------------------------- bool network_test (char **destinations, int n_destinations) { int i; //---------------------------------------- // The memory chunk starts with a 12-byte // length of the overall send size. // The memory chunk will have a list of // the destinations in it. // In future, there will be a mechanism // for testing bandwidth between all nodes, // not just the leader & each of the // transponders. // char chunk [NETWORK_CHUNK_SIZE]; memset (chunk, 0, NETWORK_CHUNK_SIZE); sprintf (chunk, "000000000000\n%d\n", n_destinations); for (i = 0; i < n_destinations; i++) { char *s = destinations [i]; int chunk_len = strlen (chunk); int len = strlen (s); if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { //---------------------------------------- // "transp" indicates that the given node // has not yet been a leader. // In future, "done" will indicate it has. // sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); } } static unsigned long colors [] = { RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ORANGE, RGB_PURPLE, RGB_BLACK, RGB_CORAL, RGB_CYAN, RGB_NAVYBLUE, RGB_BRASS, RGB_DARKORANGE, RGB_DARKGREEN, RGB_SALMON, RGB_MAGENTA, RGB_LEMONYELLOW, RGB_ROYALBLUE, RGB_DODGERBLUE, RGB_TURQUOISE, RGB_CADETBLUE, RGB_CHARTREUSE, RGB_DARKOLIVEGREEN, RGB_VIOLET, RGB_KHAKI, RGB_DARKKHAKI, RGB_GOLDENROD }; #define NCOLORS (sizeof(colors)/sizeof(unsigned long)) //---------------------------------------- // For each destination, run the test. // for (i = 0; i < n_destinations; i++) { bool problem = false; char *hostname = destinations[i]; printf ("Bandwidth sending to %s:\n", hostname); char title [PATH_MAX]; sprintf (title, "%s send (solid)", hostname); BMPGraphing_new_line (graph, title, i < NCOLORS? colors[i] : RGB_GRAY); //---------------------------------------- // Cache the receive durations for later. // unsigned long recv_rates [NETSIZE_MAX]; int recv_ix = 0; //---------------------------------------- // Send data of increasing sizes. // int j = NETSIZE_MIN; int n_runs = 64; while (!problem && j <= NETSIZE_MAX) { unsigned long chunk_count = 1 << (j-NETSIZE_MIN); unsigned long long amt_to_send = chunk_count; amt_to_send *= NETWORK_CHUNK_SIZE; if (!amt_to_send) // unlikely break; //---------------------------------------- // Send the data; do this n_runs times. // unsigned long long total_duration_send = 0; unsigned long long total_duration_recv = 0; int k = n_runs; while (k--) { long duration_send, duration_recv; if (! network_test_core (hostname, chunk, NETWORK_CHUNK_SIZE, chunk_count, &duration_send, &duration_recv)) { problem = true; fprintf (stderr, "\nCan't connect to %s\n", hostname); break; } total_duration_send += duration_send; total_duration_recv += duration_recv; } if (problem) break; total_duration_send += n_runs/2; // Round up total_duration_send /= n_runs; // Get average long duration = (long) total_duration_send; total_duration_recv += n_runs/2; // Round up total_duration_recv /= n_runs; // Get average unsigned long amt_in_kb = amt_to_send / 1024; unsigned long amt_in_mb = amt_to_send / 1048576; if (!amt_in_mb) { printf ("\r\tChunk %lu kB x %d: \t", amt_in_kb, n_runs); } else { printf ("\r\tChunk %lu MB x %d: \t", amt_in_mb, n_runs); } //------------------------------ // Calculate send rate in MB/sec. // // Get total # bytes. unsigned long long tmp = NETWORK_CHUNK_SIZE; tmp *= chunk_count; // Get total bytes per second. tmp *= 1000000; tmp /= duration; // Bytes to megabytes. tmp /= 1000; tmp /= 10; unsigned long whole = tmp / 100; unsigned long frac = tmp % 100; printf ("%lu.%02lu MB/s (sent)\t", whole, frac); fflush (stdout); BMPGraphing_add_point (graph, amt_in_kb, tmp); //------------------------------ // Calculate recv rate in MB/sec. // // Get total # bytes. tmp = NETWORK_CHUNK_SIZE; tmp *= chunk_count; // Get total bytes per second. tmp *= 1000000; tmp /= total_duration_recv; // Bytes to megabytes. tmp /= 1000; tmp /= 10; whole = tmp / 100; frac = tmp % 100; printf ("%lu.%02lu MB/s (received)\n", whole, frac); recv_rates [recv_ix++] = tmp; j++; n_runs >>= 1; if (!n_runs) n_runs = 1; } //---------------------------------------- // Now add the line for the receive rates. // sprintf (title, "%s receive (dashed)", hostname); BMPGraphing_new_line (graph, title, DASHED | (i < NCOLORS? colors[i] : RGB_GRAY)); for (j = NETSIZE_MIN; j <= NETSIZE_MAX; j++) { unsigned long chunk_count = 1 << (j-NETSIZE_MIN); unsigned long long amt_to_send = chunk_count; amt_to_send *= NETWORK_CHUNK_SIZE; unsigned long amt_in_kb = amt_to_send / 1024; // printf ("amt_in_kb=%ld\n",amt_in_kb); BMPGraphing_add_point (graph, amt_in_kb, recv_rates[j-NETSIZE_MIN]); } puts (""); } return true; } //---------------------------------------------------------------------------- // Name: usage //---------------------------------------------------------------------------- void usage () { printf ("Usage: bandwidth [--slow] [--fast] [--faster] [--fastest] [--title string]\n"); printf ("Usage for starting network tests: bandwidth --network []\n"); printf ("Usage for receiving network tests: bandwidth --transponder [--port ]\n"); exit (0); } //---------------------------------------------------------------------------- // Name: main //---------------------------------------------------------------------------- int main (int argc, char **argv) { int i, chunk_size; --argc; ++argv; bool network_mode = false; bool network_leader = false; // false => transponder int network_destinations_size = 0; int n_network_destinations = 0; char **network_destinations = NULL; char graph_title [512] = {0}; i = 0; while (i < argc) { char *s = argv [i++]; if (!strcmp ("--network", s)) { network_mode = true; network_leader = true; network_destinations_size = 20; network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); } else if (!strcmp ("--transponder", s)) { network_mode = true; } else if (!strcmp ("--port", s)) { if (i != argc) network_port = atoi (argv[i++]); } else if (!strcmp ("--slow", s)) { usec_per_test=20000000; // 20 seconds per test. } else if (!strcmp ("--fast", s)) { usec_per_test = 500000; // 0.5 seconds per test. } else if (!strcmp ("--faster", s)) { usec_per_test = 50000; // 0.05 seconds per test. } else if (!strcmp ("--fastest", s)) { usec_per_test = 5000; // 0.005 seconds per test. } else if (!strcmp ("--nosse2", s)) { use_sse2 = false; use_sse4 = false; } else if (!strcmp ("--nosse4", s)) { use_sse4 = false; } else if (!strcmp ("--help", s)) { usage (); } else if (!strcmp ("--title", s) && i != argc) { snprintf (graph_title, 511, "%s", argv[i++]); } else { if ('-' == *s) usage (); } } msg[0] = 0; for (i = 0; chunk_sizes[i] && i < sizeof(chunk_sizes)/sizeof(int); i++) { chunk_sizes_log2[i] = log2 (chunk_sizes[i]); } printf ("This is bandwidth version %s.\n", RELEASE); printf ("Copyright (C) 2005-2014 by Zack T Smith.\n"); printf ("Copyright (C) 2015 Raptor Engineering.\n\n"); printf ("This software is covered by the GNU Public License.\n"); printf ("It is provided AS-IS, use at your own risk.\n"); printf ("See the file COPYING for more information.\n\n"); fflush (stdout); //---------------------------------------- // If network mode selected, enter it now. // Currently cannot combine memory tests // & network tests. // if (network_mode) { if (network_leader) { graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LINEAR); strcpy (graph_title, TITLE_MEMORY_NET); BMPGraphing_set_title (graph, graph_title); network_test (network_destinations, n_network_destinations); BMPGraphing_make (graph); BMP_write (graph->image, "network_bandwidth.bmp"); #if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) puts ("Wrote graph to network_bandwidth.bmp."); puts (""); puts ("Done."); #endif BMPGraphing_destroy (graph); } else { network_transponder (); } return 0; } #if defined(__PPC64__) cpu_has_vsx = 1; #endif #if defined(__x86_64__) || defined(__i386__) uint32_t ecx = get_cpuid1_ecx (); uint32_t edx = get_cpuid1_edx (); cpu_has_mmx = edx & CPUID_EDX_MMX; cpu_has_sse = edx & CPUID_EDX_SSE; cpu_has_sse2 = edx & CPUID_EDX_SSE2; cpu_has_sse3 = ecx & CPUID_ECX_SSE3; cpu_has_ssse3 = ecx & CPUID_ECX_SSSE3; cpu_has_sse41 = ecx & CPUID_ECX_SSE41; cpu_has_sse42 = ecx & CPUID_ECX_SSE42; cpu_has_aes = ecx & CPUID_ECX_AES; cpu_has_avx = ecx & CPUID_ECX_AVX; cpu_has_avx2 = 0; if (cpu_has_avx) { cpu_has_avx2 = get_cpuid7_ebx (); cpu_has_avx2 &= CPUID_EBX_AVX2; } use_sse2 = true; use_sse4 = true; cpu_has_sse4a = 0; cpu_has_64bit = 0; cpu_has_xd = 0; static char family [17]; get_cpuid_family (family); family [16] = 0; printf ("CPU family: %s\n", family); uint32_t ecx2 = get_cpuid_80000001_ecx (); uint32_t edx2 = get_cpuid_80000001_edx (); if (!strcmp ("AuthenticAMD", family)) { is_amd = true; cpu_has_sse4a = ecx2 & CPUID_ECX_SSE4A; } else if (!strcmp ("GenuineIntel", family)) { is_intel = true; } cpu_has_xd = edx2 & CPUID_EDX_XD; cpu_has_64bit = edx2 & CPUID_EDX_INTEL64; printf ("CPU features: "); if (cpu_has_mmx) printf ("MMX "); if (cpu_has_sse) printf ("SSE "); if (cpu_has_sse2) printf ("SSE2 "); if (cpu_has_sse3) printf ("SSE3 "); if (cpu_has_ssse3) printf ("SSSE3 "); if (cpu_has_sse4a) printf ("SSE4A "); if (cpu_has_sse41) printf ("SSE4.1 "); if (cpu_has_sse42) printf ("SSE4.2 "); if (cpu_has_aes) printf ("AES "); if (cpu_has_avx) printf ("AVX "); if (cpu_has_avx2) printf ("AVX2 "); if (cpu_has_xd) printf ("XD "); if (cpu_has_64bit) { if (!is_amd) printf ("Intel64 "); else printf ("LongMode "); } puts ("\n"); if (is_intel) { uint32_t cache_info[4]; i = 0; while (1) { get_cpuid_cache_info (cache_info, i); if (!(cache_info[0] & 31)) break; #if 0 printf ("Cache info %d = 0x%08x, 0x%08x, 0x%08x, 0x%08x\n", i, cache_info [0], cache_info [1], cache_info [2], cache_info [3]); #endif printf ("Cache %d: ", i); switch ((cache_info[0] >> 5) & 7) { case 1: printf ("L1 "); break; case 2: printf ("L2 "); break; case 3: printf ("L3 "); break; } switch (cache_info[0] & 31) { case 1: printf ("data cache, "); break; case 2: printf ("instruction cache, "); break; case 3: printf ("unified cache, "); break; } uint32_t n_ways = 1 + (cache_info[1] >> 22); uint32_t line_size = 1 + (cache_info[1] & 2047); uint32_t n_sets = 1 + cache_info[2]; printf ("line size %d, ", line_size); printf ("%2d-way%s, ", n_ways, n_ways>1 ? "s" : ""); printf ("%5d sets, ", n_sets); unsigned size = (n_ways * line_size * n_sets) >> 10; printf ("size %dk ", size); puts (""); i++; } } if (!cpu_has_sse41) use_sse4 = false; if (!cpu_has_sse2) use_sse2 = false; #endif println (L"\nNotation: B = byte, kB = 1024 B, MB = 1048576 B."); flush (); //------------------------------------------------------------ // Attempt to obtain information about the CPU. // #ifdef __linux__ struct stat st; if (!stat ("/proc/cpuinfo", &st)) { #define TMPFILE "/tmp/bandw_tmp" unlink (TMPFILE); if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) perror ("system"); FILE *f = fopen (TMPFILE, "r"); if (f) { float cpu_speed = 0.0; if (1 == fscanf (f, "%g", &cpu_speed)) { puts (""); printf ("CPU speed is %g MHz.\n", cpu_speed); } fclose (f); } } else { printf ("CPU information is not available (/proc/cpuinfo).\n"); } fflush (stdout); #endif graph = BMPGraphing_new (GRAPH_WIDTH, GRAPH_HEIGHT, MODE_X_AXIS_LOG2); strcpy (graph_title, TITLE_MEMORY_GRAPH); BMPGraphing_set_title (graph, graph_title); #if defined(__x86_64__) || defined(__i386__) //------------------------------------------------------------ // SSE2 sequential reads. // if (use_sse2) { BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_RED); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, SSE2, false); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // AVX sequential reads. // if (cpu_has_avx) { BMPGraphing_new_line (graph, "Sequential 256-bit reads", RGB_TURQUOISE); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, AVX, false); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE2 random reads. // if (use_sse2) { BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, SSE2, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE2 sequential writes that do not bypass the caches. // if (use_sse2) { BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PURPLE); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_write (chunk_size, SSE2, false); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // AVX sequential writes that do not bypass the caches. // if (cpu_has_avx) { BMPGraphing_new_line (graph, "Sequential 256-bit cache writes", RGB_PINK); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, AVX, false); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE2 random writes that do not bypass the caches. // if (use_sse2) { BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, SSE2, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE4 sequential reads that do bypass the caches. // if (use_sse4) { BMPGraphing_new_line (graph, "Sequential 128-bit bypassing reads", RGB_BLACK); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, SSE2_BYPASS, false); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // SSE4 random reads that do bypass the caches. // if (use_sse4) { BMPGraphing_new_line (graph, "Random 128-bit bypassing reads", 0xdeadbeef); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, SSE2_BYPASS, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE4 sequential writes that do bypass the caches. // if (use_sse4) { BMPGraphing_new_line (graph, "Sequential 128-bit bypassing writes", RGB_DARKORANGE); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_write (chunk_size, SSE2_BYPASS, false); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // AVX sequential writes that do bypass the caches. // Currently on Intel CPUs (including Xeon) there is a // microcode bug that leads to a severe drop in performance // in this part of the test. // if (cpu_has_avx) { BMPGraphing_new_line (graph, "Sequential 256-bit bypassing writes", RGB_DARKOLIVEGREEN); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, AVX_BYPASS, false); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // SSE4 random writes that bypass the caches. // if (use_sse4) { BMPGraphing_new_line (graph, "Random 128-bit bypassing writes", RGB_LEMONYELLOW); newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, SSE2_BYPASS, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } #endif #if defined(__PPC64__) //------------------------------------------------------------ // VSX sequential reads. // if (cpu_has_vsx) { BMPGraphing_new_line (graph, "Sequential 128-bit reads", RGB_TURQUOISE); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, VSX, false); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // VSX random reads. // if (cpu_has_vsx) { BMPGraphing_new_line (graph, "Random 128-bit reads", RGB_MAROON); newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, VSX, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // VSX sequential writes that do not bypass the caches. // if (cpu_has_vsx) { BMPGraphing_new_line (graph, "Sequential 128-bit cache writes", RGB_PINK); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, VSX, false); BMPGraphing_add_point (graph, chunk_size, amount); } } } //------------------------------------------------------------ // VSX random writes that do not bypass the caches. // if (cpu_has_vsx) { BMPGraphing_new_line (graph, "Random 128-bit cache writes", RGB_NAVYBLUE); newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, VSX, true); BMPGraphing_add_point (graph, chunk_size, amount); } } } #endif //------------------------------------------------------------ // Sequential non-SSE2 reads. // newline (); #if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Sequential 64-bit reads", RGB_BLUE); #else BMPGraphing_new_line (graph, "Sequential 32-bit reads", RGB_BLUE); #endif i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, NO_SSE2, false); BMPGraphing_add_point (graph, chunk_size, amount); } //------------------------------------------------------------ // Random non-SSE2 reads. // newline (); #if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Random 64-bit reads", RGB_CYAN); #else BMPGraphing_new_line (graph, "Random 32-bit reads", RGB_CYAN); #endif srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_read (chunk_size, NO_SSE2, true); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // Sequential non-SSE2 writes. // #if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Sequential 64-bit writes", RGB_DARKGREEN); #else BMPGraphing_new_line (graph, "Sequential 32-bit writes", RGB_DARKGREEN); #endif newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_write (chunk_size, NO_SSE2, false); BMPGraphing_add_point (graph, chunk_size, amount); } //------------------------------------------------------------ // Random non-SSE2 writes. // #if defined(__x86_64__) || defined(__PPC64__) BMPGraphing_new_line (graph, "Random 64-bit writes", RGB_GREEN); #else BMPGraphing_new_line (graph, "Random 32-bit writes", RGB_GREEN); #endif newline (); srand (time (NULL)); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_write (chunk_size, NO_SSE2, true); BMPGraphing_add_point (graph, chunk_size, amount); } } #if defined(__x86_64__) || defined(__i386__) //------------------------------------------------------------ // SSE2 sequential copy. // if (use_sse2) { BMPGraphing_new_line (graph, "Sequential 128-bit copy", 0x8f8844); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_copy (chunk_size, SSE2); BMPGraphing_add_point (graph, chunk_size, amount); } } //------------------------------------------------------------ // AVX sequential copy. // if (cpu_has_avx) { BMPGraphing_new_line (graph, "Sequential 256-bit copy", RGB_CHARTREUSE); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { if (!(chunk_size & 128)) { int amount = do_copy (chunk_size, AVX); BMPGraphing_add_point (graph, chunk_size, amount); } } } #endif #ifdef DOING_LODS #if defined(__x86_64__) || defined(__PPC64__) //------------------------------------------------------------ // LODSQ 64-bit sequential reads. // BMPGraphing_new_line (graph, "Sequential 64-bit LODSQ reads", RGB_GRAY6); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, LODSQ, false); BMPGraphing_add_point (graph, chunk_size, amount); } #endif //------------------------------------------------------------ // LODSD 32-bit sequential reads. // BMPGraphing_new_line (graph, "Sequential 32-bit LODSD reads", RGB_GRAY8); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, LODSD, false); BMPGraphing_add_point (graph, chunk_size, amount); } //------------------------------------------------------------ // LODSW 16-bit sequential reads. // BMPGraphing_new_line (graph, "Sequential 16-bit LODSW reads", RGB_GRAY10); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, LODSW, false); BMPGraphing_add_point (graph, chunk_size, amount); } //------------------------------------------------------------ // LODSB 64-bit sequential reads. // BMPGraphing_new_line (graph, "Sequential 8-bit LODSB reads", RGB_GRAY12); newline (); i = 0; while ((chunk_size = chunk_sizes [i++])) { int amount = do_read (chunk_size, LODSB, false); BMPGraphing_add_point (graph, chunk_size, amount); } #endif //------------------------------------------------------------ // Register to register. // newline (); register_test (); //------------------------------------------------------------ // Stack to/from register. // newline (); stack_test (); //------------------------------------------------------------ // C library performance. // newline (); library_test (); //------------------------------------------------------------ // Framebuffer read & write. // #if defined(__linux__) && defined(FBIOGET_FSCREENINFO) newline (); fb_readwrite (true); #endif premature_end_for_testing: flush (); BMPGraphing_make (graph); BMP_write (graph->image, "bandwidth.bmp"); puts ("\nWrote graph to bandwidth.bmp."); puts (""); puts ("Done."); BMPGraphing_destroy (graph); return 0; }