diff options
author | Zack Smith <1@zsmith.co> | 2015-12-23 04:52:54 +0000 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineeringinc.com> | 2015-12-23 04:54:03 +0000 |
commit | 45d37750fc6f3f1f43d23732816ffccc3820e215 (patch) | |
tree | 84843c43c2e303f3755f0d68b4f1166490d53a62 /routines32.asm | |
download | bandwidth-benchmark-45d37750fc6f3f1f43d23732816ffccc3820e215.zip bandwidth-benchmark-45d37750fc6f3f1f43d23732816ffccc3820e215.tar.gz |
Initial import of GPLed bandwidthd 1.1b source from author's site
Diffstat (limited to 'routines32.asm')
-rwxr-xr-x | routines32.asm | 2960 |
1 files changed, 2960 insertions, 0 deletions
diff --git a/routines32.asm b/routines32.asm new file mode 100755 index 0000000..44015d9 --- /dev/null +++ b/routines32.asm @@ -0,0 +1,2960 @@ +;============================================================================ +; bandwidth 0.32, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2014 by Zack T Smith. +; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at veritas@comcast.net. +;============================================================================= + +bits 32 +cpu ia64 + +global ReaderLODSQ +global _ReaderLODSQ + +global ReaderLODSD +global _ReaderLODSD + +global ReaderLODSW +global _ReaderLODSW + +global ReaderLODSB +global _ReaderLODSB + +; Cygwin requires the underbar-prefixed symbols. +global _WriterSSE2 +global WriterSSE2 + +global _WriterAVX +global WriterAVX + +global _WriterSSE2_128bytes +global WriterSSE2_128bytes + +global _ReaderAVX +global ReaderAVX + +global _ReaderSSE2 +global ReaderSSE2 + +global ReaderSSE2_bypass +global _ReaderSSE2_bypass + +global _ReaderSSE2_128bytes +global ReaderSSE2_128bytes + +global ReaderSSE2_128bytes_bypass +global _ReaderSSE2_128bytes_bypass + +global _RandomReaderSSE2 +global RandomReaderSSE2 + +global _RandomReaderSSE2_bypass +global RandomReaderSSE2_bypass + +global WriterAVX_bypass +global _WriterAVX_bypass + +global _WriterSSE2_bypass +global WriterSSE2_bypass + +global _WriterSSE2_128bytes_bypass +global WriterSSE2_128bytes_bypass + +global _RandomWriterSSE2_bypass +global RandomWriterSSE2_bypass + +global Reader +global _Reader + +global Writer +global _Writer + +global Reader_128bytes +global _Reader_128bytes + +global Writer_128bytes +global _Writer_128bytes + +global RandomReader +global _RandomReader + +global RandomWriter +global _RandomWriter + +global RandomWriterSSE2 +global _RandomWriterSSE2 + +global get_cpuid_family +global _get_cpuid_family + +global get_cpuid_cache_info +global _get_cpuid_cache_info + +global get_cpuid1_ecx +global _get_cpuid1_ecx + +global get_cpuid1_edx +global _get_cpuid1_edx + +global get_cpuid7_ebx +global _get_cpuid7_ebx + +global get_cpuid_80000001_ecx +global _get_cpuid_80000001_ecx + +global get_cpuid_80000001_edx +global _get_cpuid_80000001_edx + +global CopySSE +global _CopySSE + +global CopyAVX +global _CopyAVX + +global CopySSE_128bytes +global _CopySSE_128bytes + +global RegisterToRegister +global _RegisterToRegister + +global VectorToVector +global _VectorToVector + +global VectorToVectorAVX +global _VectorToVectorAVX + +global RegisterToVector +global _RegisterToVector + +global VectorToRegister +global _VectorToRegister + +global Register8ToVector +global Register16ToVector +global Register32ToVector +global Register64ToVector +global Vector8ToRegister +global Vector16ToRegister +global Vector32ToRegister +global Vector64ToRegister + +global _Register8ToVector +global _Register16ToVector +global _Register32ToVector +global _Register64ToVector +global _Vector8ToRegister +global _Vector16ToRegister +global _Vector32ToRegister +global _Vector64ToRegister + +global StackReader +global _StackReader + +global StackWriter +global _StackWriter + + section .text + +;------------------------------------------------------------------------------ +; Name: ReaderLODSQ +; Purpose: Reads 64-bit values sequentially from an area of memory +; using LODSQ instruction. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSQ: +_ReaderLODSQ: + ; N/A + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSD +; Purpose: Reads 32-bit values sequentially from an area of memory +; using LODSD instruction. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSD: +_ReaderLODSD: + shr dword [esp+8], 2 ; length in double words rounded down. + + push ebx + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] +.L1: + mov esi, [esp+4+12] + mov ecx, [esp+8+12] + + rep lodsd + + dec edx + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSW +; Purpose: Reads 16-bit values sequentially from an area of memory +; using LODSW instruction. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSW: +_ReaderLODSW: + shr dword [esp+8], 1 ; length in words rounded down. + + push ebx + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] +.L1: + mov esi, [esp+4+12] + mov ecx, [esp+8+12] + + rep lodsw + + dec edx + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderLODSB +; Purpose: Reads 8-bit values sequentially from an area of memory +; using LODSB instruction. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 32 +ReaderLODSB: +_ReaderLODSB: + push ebx + push ecx ; REP counter + push edx + + mov edx, [esp+12+12] +.L1: + mov esi, [esp+4+12] + mov ecx, [esp+8+12] + + rep lodsb + + dec edx + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Reader +; Purpose: Reads 32-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +Reader: +_Reader: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + + mov edx, [esp+4+12] ; ptr to memory chunk. + mov ebx, edx ; ebx = limit in memory + add ebx, [esp+8+12] + +.L1: + mov edx, [esp+4+12] + +.L2: + mov eax, [edx] + mov eax, [4+edx] + mov eax, [8+edx] + mov eax, [12+edx] + mov eax, [16+edx] + mov eax, [20+edx] + mov eax, [24+edx] + mov eax, [28+edx] + mov eax, [32+edx] + mov eax, [36+edx] + mov eax, [40+edx] + mov eax, [44+edx] + mov eax, [48+edx] + mov eax, [52+edx] + mov eax, [56+edx] + mov eax, [60+edx] + mov eax, [64+edx] + mov eax, [68+edx] + mov eax, [72+edx] + mov eax, [76+edx] + mov eax, [80+edx] + mov eax, [84+edx] + mov eax, [88+edx] + mov eax, [92+edx] + mov eax, [96+edx] + mov eax, [100+edx] + mov eax, [104+edx] + mov eax, [108+edx] + mov eax, [112+edx] + mov eax, [116+edx] + mov eax, [120+edx] + mov eax, [124+edx] + + mov eax, [edx+128] + mov eax, [edx+132] + mov eax, [edx+136] + mov eax, [edx+140] + mov eax, [edx+144] + mov eax, [edx+148] + mov eax, [edx+152] + mov eax, [edx+156] + mov eax, [edx+160] + mov eax, [edx+164] + mov eax, [edx+168] + mov eax, [edx+172] + mov eax, [edx+176] + mov eax, [edx+180] + mov eax, [edx+184] + mov eax, [edx+188] + mov eax, [edx+192] + mov eax, [edx+196] + mov eax, [edx+200] + mov eax, [edx+204] + mov eax, [edx+208] + mov eax, [edx+212] + mov eax, [edx+216] + mov eax, [edx+220] + mov eax, [edx+224] + mov eax, [edx+228] + mov eax, [edx+232] + mov eax, [edx+236] + mov eax, [edx+240] + mov eax, [edx+244] + mov eax, [edx+248] + mov eax, [edx+252] + + add edx, 256 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Writer +; Purpose: Writes 32-bit value sequentially to an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +Writer: +_Writer: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] + mov eax, [esp+16+12] + + mov edx, [esp+4+12] ; edx = ptr to chunk + mov ebx, edx + add ebx, [esp+8+12] ; ebx = limit in memory + +.L1: + mov edx, [esp+4+12] + +.L2: + mov [edx], eax + mov [4+edx], eax + mov [8+edx], eax + mov [12+edx], eax + mov [16+edx], eax + mov [20+edx], eax + mov [24+edx], eax + mov [28+edx], eax + mov [32+edx], eax + mov [36+edx], eax + mov [40+edx], eax + mov [44+edx], eax + mov [48+edx], eax + mov [52+edx], eax + mov [56+edx], eax + mov [60+edx], eax + mov [64+edx], eax + mov [68+edx], eax + mov [72+edx], eax + mov [76+edx], eax + mov [80+edx], eax + mov [84+edx], eax + mov [88+edx], eax + mov [92+edx], eax + mov [96+edx], eax + mov [100+edx], eax + mov [104+edx], eax + mov [108+edx], eax + mov [112+edx], eax + mov [116+edx], eax + mov [120+edx], eax + mov [124+edx], eax + + mov [edx+128], eax + mov [edx+132], eax + mov [edx+136], eax + mov [edx+140], eax + mov [edx+144], eax + mov [edx+148], eax + mov [edx+152], eax + mov [edx+156], eax + mov [edx+160], eax + mov [edx+164], eax + mov [edx+168], eax + mov [edx+172], eax + mov [edx+176], eax + mov [edx+180], eax + mov [edx+184], eax + mov [edx+188], eax + mov [edx+192], eax + mov [edx+196], eax + mov [edx+200], eax + mov [edx+204], eax + mov [edx+208], eax + mov [edx+212], eax + mov [edx+216], eax + mov [edx+220], eax + mov [edx+224], eax + mov [edx+228], eax + mov [edx+232], eax + mov [edx+236], eax + mov [edx+240], eax + mov [edx+244], eax + mov [edx+248], eax + mov [edx+252], eax + + add edx, 256 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Reader_128bytes +; Purpose: Reads 32-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +Reader_128bytes: +_Reader_128bytes: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + + mov edx, [esp+4+12] ; ptr to memory chunk. + mov ebx, edx ; ebx = limit in memory + add ebx, [esp+8+12] + +.L1: + mov edx, [esp+4+12] + +.L2: + mov eax, [edx] + mov eax, [4+edx] + mov eax, [8+edx] + mov eax, [12+edx] + mov eax, [16+edx] + mov eax, [20+edx] + mov eax, [24+edx] + mov eax, [28+edx] + mov eax, [32+edx] + mov eax, [36+edx] + mov eax, [40+edx] + mov eax, [44+edx] + mov eax, [48+edx] + mov eax, [52+edx] + mov eax, [56+edx] + mov eax, [60+edx] + mov eax, [64+edx] + mov eax, [68+edx] + mov eax, [72+edx] + mov eax, [76+edx] + mov eax, [80+edx] + mov eax, [84+edx] + mov eax, [88+edx] + mov eax, [92+edx] + mov eax, [96+edx] + mov eax, [100+edx] + mov eax, [104+edx] + mov eax, [108+edx] + mov eax, [112+edx] + mov eax, [116+edx] + mov eax, [120+edx] + mov eax, [124+edx] + + add edx, 128 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Writer_128bytes +; Purpose: Writes 32-bit value sequentially to an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +Writer_128bytes: +_Writer_128bytes: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] + mov eax, [esp+16+12] + + mov edx, [esp+4+12] ; edx = ptr to chunk + mov ebx, edx + add ebx, [esp+8+12] ; ebx = limit in memory + +.L1: + mov edx, [esp+4+12] + +.L2: + mov [edx], eax + mov [4+edx], eax + mov [8+edx], eax + mov [12+edx], eax + mov [16+edx], eax + mov [20+edx], eax + mov [24+edx], eax + mov [28+edx], eax + mov [32+edx], eax + mov [36+edx], eax + mov [40+edx], eax + mov [44+edx], eax + mov [48+edx], eax + mov [52+edx], eax + mov [56+edx], eax + mov [60+edx], eax + mov [64+edx], eax + mov [68+edx], eax + mov [72+edx], eax + mov [76+edx], eax + mov [80+edx], eax + mov [84+edx], eax + mov [88+edx], eax + mov [92+edx], eax + mov [96+edx], eax + mov [100+edx], eax + mov [104+edx], eax + mov [108+edx], eax + mov [112+edx], eax + mov [116+edx], eax + mov [120+edx], eax + mov [124+edx], eax + + add edx, 128 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_cache_info +; +get_cpuid_cache_info: +_get_cpuid_cache_info: + push ebp + push ebx + push ecx + push edx + mov eax, 4 + mov ecx, [esp + 16 + 4 + 4] + cpuid + mov ebp, eax + mov eax, [esp + 16 + 4] + mov [eax], ebp + mov [eax+4], ebx + mov [eax+8], ecx + mov [eax+12], edx + pop edx + pop ecx + pop ebx + pop ebp + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_family +; +get_cpuid_family: +_get_cpuid_family: + push ebx + push ecx + push edx + xor eax, eax + cpuid + mov eax, [esp + 12 + 4] + mov [eax], ebx + mov [eax+4], edx + mov [eax+8], ecx + mov byte [eax+12], 0 + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid1_ecx +; +get_cpuid1_ecx: +_get_cpuid1_ecx: + push ebx + push ecx + push edx + mov eax, 1 + cpuid + mov eax, ecx + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid7_ebx +; +get_cpuid7_ebx: +_get_cpuid7_ebx: + push ebx + push ecx + push edx + mov eax, 7 + xor ecx, ecx + cpuid + mov eax, ebx + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_80000001_ecx +; +get_cpuid_80000001_ecx: +_get_cpuid_80000001_ecx: + push ebx + push ecx + push edx + mov eax, 0x80000001 + cpuid + mov eax, ecx + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid_80000001_edx +; +get_cpuid_80000001_edx: +_get_cpuid_80000001_edx: + push ebx + push ecx + push edx + mov eax, 0x80000001 + cpuid + mov eax, edx + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: get_cpuid1_edx +; +get_cpuid1_edx: +_get_cpuid1_edx: + push ebx + push ecx + push edx + mov eax, 1 + cpuid + mov eax, edx + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderAVX +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +ReaderAVX: +_ReaderAVX: + vzeroupper + + push ebx + push ecx + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + vmovdqa xmm0, [eax] ; Read aligned @ 16-byte boundary. + vmovdqa xmm0, [32+eax] + vmovdqa xmm0, [64+eax] + vmovdqa xmm0, [96+eax] + vmovdqa xmm0, [128+eax] + vmovdqa xmm0, [160+eax] + vmovdqa xmm0, [192+eax] + vmovdqa xmm0, [224+eax] + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2 +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2: +_ReaderSSE2: + push ebx + push ecx + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movdqa xmm0, [eax] ; Read aligned @ 16-byte boundary. + movdqa xmm0, [16+eax] + movdqa xmm0, [32+eax] + movdqa xmm0, [48+eax] + movdqa xmm0, [64+eax] + movdqa xmm0, [80+eax] + movdqa xmm0, [96+eax] + movdqa xmm0, [112+eax] + + movdqa xmm0, [128+eax] + movdqa xmm0, [144+eax] + movdqa xmm0, [160+eax] + movdqa xmm0, [176+eax] + movdqa xmm0, [192+eax] + movdqa xmm0, [208+eax] + movdqa xmm0, [224+eax] + movdqa xmm0, [240+eax] + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2_bypass +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2_bypass: +_ReaderSSE2_bypass: + push ebx + push ecx + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movntdqa xmm0, [eax] ; Read aligned @ 16-byte boundary. + movntdqa xmm0, [16+eax] + movntdqa xmm0, [32+eax] + movntdqa xmm0, [48+eax] + movntdqa xmm0, [64+eax] + movntdqa xmm0, [80+eax] + movntdqa xmm0, [96+eax] + movntdqa xmm0, [112+eax] + + movntdqa xmm0, [128+eax] + movntdqa xmm0, [144+eax] + movntdqa xmm0, [160+eax] + movntdqa xmm0, [176+eax] + movntdqa xmm0, [192+eax] + movntdqa xmm0, [208+eax] + movntdqa xmm0, [224+eax] + movntdqa xmm0, [240+eax] + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2_128bytes_bypass +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2_128bytes_bypass: +_ReaderSSE2_128bytes_bypass: + push ebx + push ecx + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movntdqa xmm0, [eax] ; Read aligned @ 16-byte boundary. + movntdqa xmm0, [16+eax] + movntdqa xmm0, [32+eax] + movntdqa xmm0, [48+eax] + movntdqa xmm0, [64+eax] + movntdqa xmm0, [80+eax] + movntdqa xmm0, [96+eax] + movntdqa xmm0, [112+eax] + + add eax, 128 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: ReaderSSE2_128bytes +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +ReaderSSE2_128bytes: +_ReaderSSE2_128bytes: + push ebx + push ecx + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movdqa xmm0, [eax] ; Read aligned @ 16-byte boundary. + movdqa xmm0, [16+eax] + movdqa xmm0, [32+eax] + movdqa xmm0, [48+eax] + movdqa xmm0, [64+eax] + movdqa xmm0, [80+eax] + movdqa xmm0, [96+eax] + movdqa xmm0, [112+eax] + + add eax, 128 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterAVX +; Purpose: Write 256-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + align 64 +WriterAVX: +_WriterAVX: + vzeroupper + + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + vmovdqa [eax], xmm0 + vmovdqa [32+eax], xmm0 + vmovdqa [64+eax], xmm0 + vmovdqa [96+eax], xmm0 + vmovdqa [128+eax], xmm0 + vmovdqa [160+eax], xmm0 + vmovdqa [192+eax], xmm0 + vmovdqa [224+eax], xmm0 + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2 +; Purpose: Write 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + align 64 +WriterSSE2: +_WriterSSE2: + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movdqa [eax], xmm0 + movdqa [16+eax], xmm0 + movdqa [32+eax], xmm0 + movdqa [48+eax], xmm0 + movdqa [64+eax], xmm0 + movdqa [80+eax], xmm0 + movdqa [96+eax], xmm0 + movdqa [112+eax], xmm0 + + movdqa [128+eax], xmm0 + movdqa [144+eax], xmm0 + movdqa [160+eax], xmm0 + movdqa [176+eax], xmm0 + movdqa [192+eax], xmm0 + movdqa [208+eax], xmm0 + movdqa [224+eax], xmm0 + movdqa [240+eax], xmm0 + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2 +; Purpose: Write 128-bit values sequentially from an area of memory. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + align 64 +WriterSSE2_128bytes: +_WriterSSE2_128bytes: + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movdqa [eax], xmm0 + movdqa [16+eax], xmm0 + movdqa [32+eax], xmm0 + movdqa [48+eax], xmm0 + movdqa [64+eax], xmm0 + movdqa [80+eax], xmm0 + movdqa [96+eax], xmm0 + movdqa [112+eax], xmm0 + + add eax, 128 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterAVX_bypass +; Purpose: Write 256-bit values sequentially from an area of memory, +; bypassing the cache. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + + align 64 +WriterAVX_bypass: +_WriterAVX_bypass: + vzeroupper + + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + vmovntdq [eax], xmm0 ; Write bypassing cache. + vmovntdq [32+eax], xmm0 + vmovntdq [64+eax], xmm0 + vmovntdq [96+eax], xmm0 + vmovntdq [128+eax], xmm0 + vmovntdq [160+eax], xmm0 + vmovntdq [192+eax], xmm0 + vmovntdq [224+eax], xmm0 + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2_bypass +; Purpose: Write 128-bit values sequentially from an area of memory, +; bypassing the cache. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + align 64 +WriterSSE2_bypass: +_WriterSSE2_bypass: + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movntdq [eax], xmm0 ; Write bypassing cache. + movntdq [16+eax], xmm0 + movntdq [32+eax], xmm0 + movntdq [48+eax], xmm0 + movntdq [64+eax], xmm0 + movntdq [80+eax], xmm0 + movntdq [96+eax], xmm0 + movntdq [112+eax], xmm0 + + movntdq [128+eax], xmm0 + movntdq [144+eax], xmm0 + movntdq [160+eax], xmm0 + movntdq [176+eax], xmm0 + movntdq [192+eax], xmm0 + movntdq [208+eax], xmm0 + movntdq [224+eax], xmm0 + movntdq [240+eax], xmm0 + + add eax, 256 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2_128bytes_bypass +; Purpose: Write 128-bit values sequentially from an area of memory, +; bypassing the cache. +; Params: [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = value (ignored) +;------------------------------------------------------------------------------ + align 64 +WriterSSE2_128bytes_bypass: +_WriterSSE2_128bytes_bypass: + push ebx + push ecx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+8] + + mov eax, [esp+4+8] + mov ebx, eax + add ebx, [esp+8+8] ; ebx points to end. + +.L1: + mov eax, [esp+4+8] + +.L2: + movntdq [eax], xmm0 ; Write bypassing cache. + movntdq [16+eax], xmm0 + movntdq [32+eax], xmm0 + movntdq [48+eax], xmm0 + movntdq [64+eax], xmm0 + movntdq [80+eax], xmm0 + movntdq [96+eax], xmm0 + movntdq [112+eax], xmm0 + + add eax, 128 + cmp eax, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: RandomReader +; Purpose: Reads 32-bit values randomly from an area of memory. +; Params: +; [esp+4] = ptr to array of chunk pointers +; [esp+8] = # of 128-byte chunks +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +RandomReader: +_RandomReader: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + + mov eax, [edx+160] + mov eax, [edx+232] + mov eax, [edx+224] + mov eax, [96+edx] + mov eax, [edx+164] + mov eax, [76+edx] + mov eax, [100+edx] + mov eax, [edx+220] + mov eax, [edx+248] + mov eax, [104+edx] + mov eax, [4+edx] + mov eax, [edx+136] + mov eax, [112+edx] + mov eax, [edx+200] + mov eax, [12+edx] + mov eax, [edx+128] + mov eax, [edx+148] + mov eax, [edx+196] + mov eax, [edx+216] + mov eax, [edx] + mov eax, [84+edx] + mov eax, [edx+140] + mov eax, [edx+204] + mov eax, [edx+184] + mov eax, [124+edx] + mov eax, [48+edx] + mov eax, [64+edx] + mov eax, [edx+212] + mov eax, [edx+240] + mov eax, [edx+236] + mov eax, [24+edx] + mov eax, [edx+252] + mov eax, [68+edx] + mov eax, [20+edx] + mov eax, [72+edx] + mov eax, [32+edx] + mov eax, [28+edx] + mov eax, [52+edx] + mov eax, [edx+244] + mov eax, [edx+180] + mov eax, [80+edx] + mov eax, [60+edx] + mov eax, [8+edx] + mov eax, [56+edx] + mov eax, [edx+208] + mov eax, [edx+228] + mov eax, [40+edx] + mov eax, [edx+172] + mov eax, [120+edx] + mov eax, [edx+176] + mov eax, [108+edx] + mov eax, [edx+132] + mov eax, [16+edx] + mov eax, [44+edx] + mov eax, [92+edx] + mov eax, [edx+168] + mov eax, [edx+152] + mov eax, [edx+156] + mov eax, [edx+188] + mov eax, [36+edx] + mov eax, [88+edx] + mov eax, [116+edx] + mov eax, [edx+192] + mov eax, [edx+144] + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: RandomReaderSSE2 +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to array of chunk pointers +; [esp+8] = # of 128-byte chunks +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +RandomReaderSSE2: +_RandomReaderSSE2: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + +; Read aligned @ 16-byte boundary. + movdqa xmm0, [240+edx] + movdqa xmm0, [128+edx] + movdqa xmm0, [64+edx] + movdqa xmm0, [208+edx] + movdqa xmm0, [112+edx] + movdqa xmm0, [176+edx] + movdqa xmm0, [144+edx] + movdqa xmm0, [edx] + movdqa xmm0, [96+edx] + movdqa xmm0, [16+edx] + movdqa xmm0, [192+edx] + movdqa xmm0, [160+edx] + movdqa xmm0, [32+edx] + movdqa xmm0, [48+edx] + movdqa xmm0, [224+edx] + movdqa xmm0, [80+edx] + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: RandomReaderSSE2_bypass +; Purpose: Reads 128-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to array of chunk pointers +; [esp+8] = # of 128-byte chunks +; [esp+12] = loops +;------------------------------------------------------------------------------ + align 64 +RandomReaderSSE2_bypass: +_RandomReaderSSE2_bypass: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + +; Read aligned @ 16-byte boundary. + movntdqa xmm0, [240+edx] + movntdqa xmm0, [edx] + movntdqa xmm0, [128+edx] + movntdqa xmm0, [64+edx] + movntdqa xmm0, [208+edx] + movntdqa xmm0, [112+edx] + movntdqa xmm0, [32+edx] + movntdqa xmm0, [176+edx] + movntdqa xmm0, [144+edx] + movntdqa xmm0, [96+edx] + movntdqa xmm0, [16+edx] + movntdqa xmm0, [160+edx] + movntdqa xmm0, [192+edx] + movntdqa xmm0, [48+edx] + movntdqa xmm0, [224+edx] + movntdqa xmm0, [80+edx] + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: RandomWriter +; Purpose: Writes 32-bit value sequentially to an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +RandomWriter: +_RandomWriter: + push ebx + push ecx + push edx + + mov eax, [esp+16+12] ; get datum. + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + + mov [edx+212], eax + mov [edx+156], eax + mov [edx+132], eax + mov [20+edx], eax + mov [edx+172], eax + mov [edx+196], eax + mov [edx+248], eax + mov [edx], eax + mov [edx+136], eax + mov [edx+228], eax + mov [edx+160], eax + mov [80+edx], eax + mov [76+edx], eax + mov [32+edx], eax + mov [64+edx], eax + mov [68+edx], eax + mov [120+edx], eax + mov [edx+216], eax + mov [124+edx], eax + mov [28+edx], eax + mov [edx+152], eax + mov [36+edx], eax + mov [edx+220], eax + mov [edx+188], eax + mov [48+edx], eax + mov [104+edx], eax + mov [72+edx], eax + mov [96+edx], eax + mov [edx+184], eax + mov [112+edx], eax + mov [edx+236], eax + mov [edx+224], eax + mov [edx+252], eax + mov [88+edx], eax + mov [edx+180], eax + mov [60+edx], eax + mov [24+edx], eax + mov [edx+192], eax + mov [edx+164], eax + mov [edx+204], eax + mov [44+edx], eax + mov [edx+168], eax + mov [92+edx], eax + mov [edx+208], eax + mov [8+edx], eax + mov [edx+144], eax + mov [edx+148], eax + mov [edx+128], eax + mov [52+edx], eax + mov [4+edx], eax + mov [108+edx], eax + mov [12+edx], eax + mov [56+edx], eax + mov [edx+200], eax + mov [edx+232], eax + mov [16+edx], eax + mov [edx+244], eax + mov [40+edx], eax + mov [edx+140], eax + mov [84+edx], eax + mov [100+edx], eax + mov [116+edx], eax + mov [edx+176], eax + mov [edx+240], eax + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: RandomWriterSSE2 +; Purpose: Writes 128-bit value randomly to an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +RandomWriterSSE2: +_RandomWriterSSE2: + push ebx + push ecx + push edx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + + movdqa [64+edx], xmm0 + movdqa [208+edx], xmm0 + movdqa [128+edx], xmm0 + movdqa [112+edx], xmm0 + movdqa [176+edx], xmm0 + movdqa [144+edx], xmm0 + movdqa [edx], xmm0 + movdqa [96+edx], xmm0 + movdqa [48+edx], xmm0 + movdqa [16+edx], xmm0 + movdqa [192+edx], xmm0 + movdqa [160+edx], xmm0 + movdqa [32+edx], xmm0 + movdqa [240+edx], xmm0 + movdqa [224+edx], xmm0 + movdqa [80+edx], xmm0 + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: RandomWriterSSE2_bypass +; Purpose: Writes 128-bit value randomly into memory, bypassing caches. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +; [esp+16] = long to write +;------------------------------------------------------------------------------ + align 64 +RandomWriterSSE2_bypass: +_RandomWriterSSE2_bypass: + push ebx + push ecx + push edx + + mov eax, [esp+16+8] + movd xmm0, eax ; Create a 128-bit replication of the 32-bit + movd xmm1, eax ; value that was provided. + movd xmm2, eax + movd xmm3, eax + pslldq xmm1, 32 + pslldq xmm2, 64 + pslldq xmm3, 96 + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + + mov ecx, [esp+12+12] ; loops to do. + +.L0: + mov ebx, [esp+8+12] ; # chunks to do + +.L1: + sub ebx, 1 + jc .L2 + + mov edx, [esp+4+12] ; get ptr to memory chunk. + mov edx, [edx + 4*ebx] + + movntdq [128+edx], xmm0 + movntdq [240+edx], xmm0 + movntdq [112+edx], xmm0 + movntdq [64+edx], xmm0 + movntdq [176+edx], xmm0 + movntdq [144+edx], xmm0 + movntdq [edx], xmm0 + movntdq [208+edx], xmm0 + movntdq [80+edx], xmm0 + movntdq [96+edx], xmm0 + movntdq [48+edx], xmm0 + movntdq [16+edx], xmm0 + movntdq [192+edx], xmm0 + movntdq [160+edx], xmm0 + movntdq [224+edx], xmm0 + movntdq [32+edx], xmm0 + + jmp .L1 + +.L2: + sub ecx, 1 + jnz .L0 + + pop edx + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: RegisterToRegister +; Purpose: Reads/writes 32-bit values between registers of +; the main register set. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +RegisterToRegister: +_RegisterToRegister: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. + +.L1: + mov eax, ebx ; 64 transfers by 4 bytes = 256 bytes + mov eax, ecx + mov eax, edx + mov eax, esi + mov eax, edi + mov eax, ebp + mov eax, esp + mov eax, ebx + mov eax, ebx + mov eax, ecx + mov eax, edx + mov eax, esi + mov eax, edi + mov eax, ebp + mov eax, esp + mov eax, ebx + mov eax, ebx + mov eax, ecx + mov eax, edx + mov eax, esi + mov eax, edi + mov eax, ebp + mov eax, esp + mov eax, ebx + mov eax, ebx + mov eax, ecx + mov eax, edx + mov eax, esi + mov eax, edi + mov eax, ebp + mov eax, esp + mov eax, ebx + + mov ebx, eax + mov ebx, ecx + mov ebx, edx + mov ebx, esi + mov ebx, edi + mov ebx, ebp + mov ebx, esp + mov ebx, eax + mov ebx, eax + mov ebx, ecx + mov ebx, edx + mov ebx, esi + mov ebx, edi + mov ebx, ebp + mov ebx, esp + mov ebx, eax + mov ebx, eax + mov ebx, ecx + mov ebx, edx + mov ebx, esi + mov ebx, edi + mov ebx, ebp + mov ebx, esp + mov ebx, eax + mov ebx, eax + mov ebx, ecx + mov ebx, edx + mov ebx, esi + mov ebx, edi + mov ebx, ebp + mov ebx, esp + mov ebx, eax + + dec ecx + jnz .L1 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: VectorToVectorAVX +; Purpose: Reads/writes 256-bit values between registers of +; the vector register set, in this case YMM. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +VectorToVectorAVX: +_VectorToVectorAVX: + vzeroupper + + mov eax, [esp + 4] +.L1: + vmovdqa ymm0, ymm1 ; Each move moves 32 bytes, so we need 8 + vmovdqa ymm0, ymm2 ; moves to transfer a 256 byte chunk. + vmovdqa ymm0, ymm3 + vmovdqa ymm2, ymm0 + vmovdqa ymm1, ymm2 + vmovdqa ymm2, ymm1 + vmovdqa ymm0, ymm3 + vmovdqa ymm3, ymm1 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: VectorToVector +; Purpose: Reads/writes 128-bit values between registers of +; the vector register set, in this case XMM. +; Params: dword [esp + 4] = count. +;------------------------------------------------------------------------------ + align 64 +VectorToVector: +_VectorToVector: + mov eax, [esp + 4] +.L1: + movdqa xmm0, xmm1 + movdqa xmm0, xmm2 + movdqa xmm0, xmm3 + movdqa xmm2, xmm0 + movdqa xmm1, xmm2 + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm1 + + movdqa xmm3, xmm2 + movdqa xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm0, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm0 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: RegisterToVector +; Purpose: Writes 32-bit main register values into 128-bit vector register +; clearing the upper unused bits. +; Params: dword [esp + 4] = count. +;------------------------------------------------------------------------------ + align 64 +RegisterToVector: +_RegisterToVector: + mov eax, [esp + 4] + add eax, eax ; Double # of loops. +.L1: + movd xmm1, eax ; 32 transfers of 4 bytes = 128 bytes + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + movd xmm1, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + movd xmm1, eax + movd xmm3, eax + movd xmm2, eax + movd xmm0, eax + movd xmm1, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + movd xmm0, eax + movd xmm2, eax + movd xmm0, eax + movd xmm3, eax + movd xmm1, eax + movd xmm3, eax + movd xmm2, eax + movd xmm0, eax + + movd xmm0, eax + movd xmm3, eax + movd xmm1, eax + movd xmm2, eax + movd xmm0, eax + movd xmm2, eax + movd xmm3, eax + movd xmm0, eax + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: VectorToRegister +; Purpose: Writes lowest 32 bits of vector registers into 32-bit main +; register. +; Params: dword [esp + 4] = count. +;------------------------------------------------------------------------------ + align 64 +VectorToRegister: +_VectorToRegister: + mov eax, [esp + 4] + add eax, eax ; Double # of loops. + push ebx +.L1: + movd ebx, xmm1 ; 4 bytes per transfer therefore need 64 + movd ebx, xmm2 ; to transfer 256 bytes. + movd ebx, xmm3 + movd ebx, xmm0 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + movd ebx, xmm1 + movd ebx, xmm3 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + movd ebx, xmm0 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm3 + movd ebx, xmm1 + movd ebx, xmm3 + movd ebx, xmm2 + movd ebx, xmm0 + + movd ebx, xmm0 + movd ebx, xmm3 + movd ebx, xmm1 + movd ebx, xmm2 + movd ebx, xmm0 + movd ebx, xmm2 + movd ebx, xmm3 + movd ebx, xmm0 + + dec eax + jnz .L1 + + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: StackReader +; Purpose: Reads 32-bit values off the stack into registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +StackReader: +_StackReader: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. + + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + +.L1: + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp] + mov eax, [esp] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + mov eax, [esp+4] + mov eax, [esp+8] + mov eax, [esp+12] + mov eax, [esp+16] + mov eax, [esp+20] + mov eax, [esp+4] + mov eax, [esp+24] + mov eax, [esp+4] + + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp] + mov ebx, [esp] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + mov ebx, [esp+4] + mov ebx, [esp+8] + mov ebx, [esp+12] + mov ebx, [esp+16] + mov ebx, [esp+20] + mov ebx, [esp+4] + mov ebx, [esp+24] + mov ebx, [esp+4] + + dec ecx + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: StackWriter +; Purpose: Writes 32-bit values into the stack from registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. +; Params: +; dword [esp+4] = loops +;------------------------------------------------------------------------------ + align 64 +StackWriter: +_StackWriter: + push ebx + push ecx + + mov ecx, [esp+4+8] ; loops to do. + + push dword 7000 ; [esp+24] + push dword 6000 ; [esp+20] + push dword 5000 ; [esp+16] + push dword 4000 ; [esp+12] + push dword 3000 ; [esp+8] + push dword 2000 ; [esp+4] + push dword 1000 ; [esp] + + xor eax, eax + mov ebx, 0xffffffff + +.L1: + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp], eax + mov [esp], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + mov [esp+4], eax + mov [esp+8], eax + mov [esp+12], eax + mov [esp+16], eax + mov [esp+20], eax + mov [esp+4], eax + mov [esp+24], eax + mov [esp+4], eax + + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp], ebx + mov [esp], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + mov [esp+4], ebx + mov [esp+8], ebx + mov [esp+12], ebx + mov [esp+16], ebx + mov [esp+20], ebx + mov [esp+4], ebx + mov [esp+24], ebx + mov [esp+4], ebx + + sub ecx, 1 + jnz .L1 + + add esp, 28 + + pop ecx + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Register8ToVector +; Purpose: Writes 8-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: dword [esp + 4] +;------------------------------------------------------------------------------ + align 64 +Register8ToVector: +_Register8ToVector: + mov eax, [esp + 4] + sal eax, 4 ; Force some repetition. +.L1: + pinsrb xmm1, al, 0 ; 64 transfers x 1 byte = 64 bytes + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm1, bl, 1 + pinsrb xmm2, cl, 2 + pinsrb xmm3, dl, 3 + pinsrb xmm3, al, 4 + pinsrb xmm2, bl, 5 + pinsrb xmm1, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, al, 1 + pinsrb xmm3, al, 2 + pinsrb xmm1, al, 3 + pinsrb xmm2, al, 4 + pinsrb xmm3, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm0, al, 1 + pinsrb xmm0, al, 2 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 10 + pinsrb xmm1, bl, 11 + pinsrb xmm2, cl, 12 + pinsrb xmm3, dl, 13 + pinsrb xmm3, dil, 14 + pinsrb xmm2, cl, 15 + pinsrb xmm1, al, 6 + pinsrb xmm0, bpl, 7 + + pinsrb xmm1, al, 10 + pinsrb xmm2, al, 11 + pinsrb xmm3, al, 12 + pinsrb xmm1, al, 13 + pinsrb xmm2, al, 14 + pinsrb xmm3, al, 15 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, al, 9 + pinsrb xmm0, al, 8 + pinsrb xmm0, al, 11 + pinsrb xmm0, al, 3 + pinsrb xmm0, al, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register16ToVector +; Purpose: Writes 16-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Register16ToVector: +_Register16ToVector: + mov eax, [esp + 4] + sal eax, 3 ; Force some repetition. +.L1: + pinsrw xmm1, ax, 0 ; 64 transfers x 2 bytes = 128 bytes + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, ax, 1 + pinsrw xmm3, ax, 2 + pinsrw xmm1, ax, 3 + pinsrw xmm2, ax, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm0, ax, 1 + pinsrw xmm0, ax, 2 + pinsrw xmm0, ax, 3 + pinsrw xmm0, ax, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, ax, 1 + pinsrw xmm3, ax, 2 + pinsrw xmm1, ax, 3 + pinsrw xmm2, ax, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm0, ax, 1 + pinsrw xmm0, ax, 2 + pinsrw xmm0, ax, 3 + pinsrw xmm0, ax, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register32ToVector +; Purpose: Writes 32-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Register32ToVector: +_Register32ToVector: + mov eax, [esp + 4] + sal eax, 2 ; Force some repetition. +.L1: + pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes + pinsrd xmm2, ebx, 1 ; we need 64 transfers. + pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, eax, 1 + pinsrd xmm3, eax, 2 + pinsrd xmm1, eax, 3 + pinsrd xmm2, eax, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 1 + pinsrd xmm0, eax, 2 + pinsrd xmm0, eax, 3 + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, ebx, 1 + pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, eax, 1 + pinsrd xmm3, eax, 2 + pinsrd xmm1, eax, 3 + pinsrd xmm2, eax, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 1 + pinsrd xmm0, eax, 2 + pinsrd xmm0, eax, 3 + pinsrd xmm0, eax, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + pinsrd xmm0, esp, 0 + + dec eax + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register64ToVector +; Purpose: Writes 64-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register64ToVector: +_Register64ToVector: + ret + + +;------------------------------------------------------------------------------ +; Name: Vector8ToRegister +; Purpose: Writes 8-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Vector8ToRegister: +_Vector8ToRegister: + mov eax, [esp + 4] + sal eax, 4 ; Force some repetition. + push ebx +.L1: + pextrb ebx, xmm1, 0 + pextrb ebx, xmm2, 1 + pextrb ebx, xmm3, 2 + pextrb ebx, xmm1, 3 + pextrb ebx, xmm2, 4 + pextrb ebx, xmm3, 5 + pextrb ebx, xmm0, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm0, 0 + pextrb ebx, xmm1, 1 + pextrb ebx, xmm2, 2 + pextrb ebx, xmm3, 3 + pextrb ebx, xmm3, 4 + pextrb ebx, xmm2, 15 + pextrb ebx, xmm1, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm1, 0 + pextrb ebx, xmm2, 1 + pextrb ebx, xmm3, 2 + pextrb ebx, xmm1, 3 + pextrb ebx, xmm2, 4 + pextrb ebx, xmm3, 5 + pextrb ebx, xmm0, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm0, 0 + pextrb ebx, xmm1, 1 + pextrb ebx, xmm2, 2 + pextrb ebx, xmm3, 3 + pextrb ebx, xmm3, 4 + pextrb ebx, xmm2, 5 + pextrb ebx, xmm1, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm1, 0 + pextrb ebx, xmm2, 1 + pextrb ebx, xmm3, 2 + pextrb ebx, xmm1, 13 + pextrb ebx, xmm2, 14 + pextrb ebx, xmm3, 15 + pextrb ebx, xmm0, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm0, 10 + pextrb ebx, xmm1, 11 + pextrb ebx, xmm2, 12 + pextrb ebx, xmm3, 13 + pextrb ebx, xmm3, 14 + pextrb ebx, xmm2, 15 + pextrb ebx, xmm1, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm1, 0 + pextrb ebx, xmm2, 1 + pextrb ebx, xmm3, 2 + pextrb ebx, xmm1, 3 + pextrb ebx, xmm2, 4 + pextrb ebx, xmm3, 5 + pextrb ebx, xmm0, 6 + pextrb ebx, xmm0, 7 + + pextrb ebx, xmm0, 0 + pextrb ebx, xmm1, 1 + pextrb ebx, xmm2, 2 + pextrb ebx, xmm3, 3 + pextrb ebx, xmm3, 4 + pextrb ebx, xmm2, 5 + pextrb ebx, xmm1, 6 + pextrb ebx, xmm0, 7 + + dec eax + jnz .L1 + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Vector16ToRegister +; Purpose: Writes 16-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Vector16ToRegister: +_Vector16ToRegister: + mov eax, [esp + 4] + sal eax, 3 ; Force some repetition. + push ebx +.L1: + pextrw ebx, xmm1, 0 + pextrw ebx, xmm2, 1 + pextrw ebx, xmm3, 2 + pextrw ebx, xmm1, 3 + pextrw ebx, xmm2, 4 + pextrw ebx, xmm3, 5 + pextrw ebx, xmm0, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm0, 0 + pextrw ebx, xmm1, 1 + pextrw ebx, xmm2, 2 + pextrw ebx, xmm3, 3 + pextrw ebx, xmm3, 4 + pextrw ebx, xmm2, 5 + pextrw ebx, xmm1, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm1, 0 + pextrw ebx, xmm2, 1 + pextrw ebx, xmm3, 2 + pextrw ebx, xmm1, 3 + pextrw ebx, xmm2, 4 + pextrw ebx, xmm3, 5 + pextrw ebx, xmm0, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm0, 0 + pextrw ebx, xmm1, 1 + pextrw ebx, xmm2, 2 + pextrw ebx, xmm3, 3 + pextrw ebx, xmm3, 4 + pextrw ebx, xmm2, 5 + pextrw ebx, xmm1, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm1, 0 + pextrw ebx, xmm2, 1 + pextrw ebx, xmm3, 2 + pextrw ebx, xmm1, 3 + pextrw ebx, xmm2, 4 + pextrw ebx, xmm3, 5 + pextrw ebx, xmm0, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm0, 0 + pextrw ebx, xmm1, 1 + pextrw ebx, xmm2, 2 + pextrw ebx, xmm3, 3 + pextrw ebx, xmm3, 4 + pextrw ebx, xmm2, 5 + pextrw ebx, xmm1, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm1, 0 + pextrw ebx, xmm2, 1 + pextrw ebx, xmm3, 2 + pextrw ebx, xmm1, 3 + pextrw ebx, xmm2, 4 + pextrw ebx, xmm3, 5 + pextrw ebx, xmm0, 6 + pextrw ebx, xmm0, 7 + + pextrw ebx, xmm0, 0 + pextrw ebx, xmm1, 1 + pextrw ebx, xmm2, 2 + pextrw ebx, xmm3, 3 + pextrw ebx, xmm3, 4 + pextrw ebx, xmm2, 5 + pextrw ebx, xmm1, 6 + pextrw ebx, xmm0, 7 + + dec eax + jnz .L1 + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Vector32ToRegister +; Purpose: Writes 32-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ + align 64 +Vector32ToRegister: +_Vector32ToRegister: + mov eax, [esp + 4] + sal eax, 2 ; Force some repetition. + push ebx +.L1: + pextrd ebx, xmm1, 0 + pextrd ebx, xmm2, 1 + pextrd ebx, xmm3, 2 + pextrd ebx, xmm1, 3 + pextrd ebx, xmm2, 0 + pextrd ebx, xmm3, 1 + pextrd ebx, xmm0, 2 + pextrd ebx, xmm0, 3 + + pextrd ebx, xmm0, 0 + pextrd ebx, xmm1, 1 + pextrd ebx, xmm2, 2 + pextrd ebx, xmm3, 3 + pextrd ebx, xmm3, 3 + pextrd ebx, xmm2, 2 + pextrd ebx, xmm1, 1 + pextrd ebx, xmm0, 0 + + pextrd ebx, xmm1, 0 + pextrd ebx, xmm2, 1 + pextrd ebx, xmm3, 2 + pextrd ebx, xmm1, 3 + pextrd ebx, xmm2, 0 + pextrd ebx, xmm3, 1 + pextrd ebx, xmm0, 2 + pextrd ebx, xmm0, 3 + + pextrd ebx, xmm0, 0 + pextrd ebx, xmm1, 1 + pextrd ebx, xmm2, 2 + pextrd ebx, xmm3, 3 + pextrd ebx, xmm3, 3 + pextrd ebx, xmm2, 2 + pextrd ebx, xmm1, 1 + pextrd ebx, xmm0, 0 + + dec eax + jnz .L1 + pop ebx + ret + +;------------------------------------------------------------------------------ +; Name: Vector64ToRegister +; Purpose: Writes 64-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Vector64ToRegister: +_Vector64ToRegister: + ret + +;------------------------------------------------------------------------------ +; Name: CopyAVX +; Purpose: Copies memory chunks that are 32-byte aligned. +; Params: [esp + 4] = ptr to destination memory area +; [esp + 8] = ptr to source memory area +; [esp + 12] = length in bytes +; [esp + 16] = loops +;------------------------------------------------------------------------------ + align 64 +CopyAVX: +_CopyAVX: + vzeroupper + ; Register usage: + ; esi = source + ; edi = dest + ; ecx = loops + ; edx = length + push esi + push edi + push ecx + push edx + + mov edi, [esp + 4 + 16] + mov esi, [esp + 8 + 16] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 16 + 16] + + shr edx, 8 ; Ensure length is multiple of 256. + shl edx, 8 + +.L1: + mov eax, edx + prefetchnta [esi] + +.L2: + vmovdqa ymm0, [esi] + vmovdqa ymm1, [32+esi] + vmovdqa ymm2, [64+esi] + vmovdqa ymm3, [96+esi] + + vmovdqa [edi], ymm0 + vmovdqa [32+edi], ymm1 + vmovdqa [64+edi], ymm2 + vmovdqa [96+edi], ymm3 + + vmovdqa ymm0, [128+esi] + vmovdqa ymm1, [128+32+esi] + vmovdqa ymm2, [128+64+esi] + vmovdqa ymm3, [128+96+esi] + + vmovdqa [128+edi], ymm0 + vmovdqa [128+32+edi], ymm1 + vmovdqa [128+64+edi], ymm2 + vmovdqa [128+96+edi], ymm3 + + add esi, 256 + add edi, 256 + + sub eax, 256 + jnz .L2 + + sub esi, edx ; rsi now points to start. + sub edi, edx ; rdi now points to start. + + dec ecx + jnz .L1 + + pop edx + pop ecx + pop edi + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: CopySSE +; Purpose: Copies memory chunks that are 16-byte aligned. +; Params: [esp + 4] = ptr to destination memory area +; [esp + 8] = ptr to source memory area +; [esp + 12] = length in bytes +; [esp + 16] = loops +;------------------------------------------------------------------------------ + align 64 +CopySSE: +_CopySSE: + ; Register usage: + ; esi = source + ; edi = dest + ; ecx = loops + ; edx = length + push esi + push edi + push ecx + push edx + + mov edi, [esp + 4 + 16] + mov esi, [esp + 8 + 16] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 16 + 16] + + shr edx, 7 ; Ensure length is multiple of 128. + shl edx, 7 + + ; Save our non-parameter XMM registers. + sub esp, 64 + movdqu [esp], xmm4 + movdqu [16+esp], xmm5 + movdqu [32+esp], xmm6 + movdqu [48+esp], xmm7 + +.L1: + mov eax, edx + +.L2: + prefetchnta [esi] + movdqa xmm0, [esi] + movdqa xmm1, [16+esi] + movdqa xmm2, [32+esi] + movdqa xmm3, [48+esi] + movdqa xmm4, [64+esi] + movdqa xmm5, [80+esi] + movdqa xmm6, [96+esi] + movdqa xmm7, [112+esi] + + ; 32-bit lacks xmm8 - xmm15. + + movdqa [edi], xmm0 + movdqa [16+edi], xmm1 + movdqa [32+edi], xmm2 + movdqa [48+edi], xmm3 + movdqa [64+edi], xmm4 + movdqa [80+edi], xmm5 + movdqa [96+edi], xmm6 + movdqa [112+edi], xmm7 + + add esi, 128 + add edi, 128 + + sub eax, 128 + jnz .L2 + + sub esi, edx ; rsi now points to start. + sub edi, edx ; rdi now points to start. + + dec ecx + jnz .L1 + + movdqu xmm4, [0+esp] + movdqu xmm5, [16+esp] + movdqu xmm6, [32+esp] + movdqu xmm7, [48+esp] + add esp, 64 + + pop edx + pop ecx + pop edi + pop esi + ret + +;------------------------------------------------------------------------------ +; Name: CopySSE_128bytes +; Purpose: Copies memory chunks that are 16-byte aligned. +; Params: [esp + 4] = ptr to destination memory area +; [esp + 8] = ptr to source memory area +; [esp + 12] = length in bytes +; [esp + 16] = loops +;------------------------------------------------------------------------------ + align 64 +CopySSE_128bytes: +_CopySSE_128bytes: + jmp CopySSE + |