| | 1 | = memcpy = |
| | 2 | |
| | 3 | == Variants == |
| | 4 | |
| | 5 | ||= '''Name''' =||= '''Description''' =|| |
| | 6 | || stock || MD amd64 version {{{rep movsq}}} || |
| | 7 | || SSE2 || {{{movdqu}}} for block-copy || |
| | 8 | || SSE2 aligned || align source to always use {{{movaps}}}, then use {{{movaps}}} for aligned destination and {{{movdqu}}} for unaligned destination || |
| | 9 | || AVX || 256-bit {{{vmovdqu}}} for block-copy with 128-byte block as common loop || |
| | 10 | || ERMS || {{{rep movsb}}} for machines with ERMS || |
| | 11 | |
| | 12 | |
| | 13 | == Machines Tested == |
| | 14 | |
| | 15 | ||= '''CPU''' =||= '''Speed (GHz)''' =||= '''Notes''' =|| |
| | 16 | || AMD FX-8120 || 3.11 || 1 x 8 zoo.freebsd.org || |
| | 17 | || AMD Opteron 6328 || 3.20 || 2 x 8 Supermicro H8DG6/H8DGi || |
| | 18 | || Intel Xeon X5365 || 3.00 || 2 x 4 Supermicro X7DBU || |
| | 19 | || Intel Xeon X5482 || 3.20 || 2 x 4 Supermicro X7DWN+ || |
| | 20 | || Intel Xeon X5675 || 3.07 || Westmere 2 x 6 Supermicro X8DTU || |
| | 21 | || Intel Core i5-2520M || 2.50 || Sandy Bridge 1 x 4 Thinkpad X220 (4286) || |
| | 22 | || Intel Core i5-2500K || 3.30 || Sandy Bridge 1 x 4 MSI Z77A-G45 (MS-7752) || |
| | 23 | || Intel Xeon E5-2680 || 2.70 || Romley 2 x 8 Supermicro X9DRW || |
| | 24 | || Intel Xeon E5-2667 v2 || 3.30 || Romley V2 2 x 8 Supermicro X9DRW (supports ERMS) || |
| | 25 | |
| | 26 | == Test Cases == |
| | 27 | |
| | 28 | ||= '''Name''' =||= '''Description''' =|| |
| | 29 | || page || copy aligned page to aligned page || |
| | 30 | || overlap || overlapping copy of (page size - 16) bytes within a page || |
| | 31 | || short || aligned copy of 15 bytes || |
| | 32 | || short2 || aligned copy of 32 bytes || |
| | 33 | || short3 || aligned copy of 48 bytes || |
| | 34 | || offset || 4 byte offset copy of 128 bytes || |
| | 35 | || offset2 || 7 byte offset copy of 97 bytes || |
| | 36 | |
| | 37 | == Results == |
| | 38 | |
| | 39 | Each number is the minimum of a distribution of samples, where each sample is the TSC delta measured across a single invocation of the test. |
| | 40 | |
| | 41 | Bold indicates the lowest time among the variants for a given Test and CPU combination. Green text is used for times faster than the stock implementation, and red text is used for times slower than the stock implementation. |
| | 42 | |
| | 43 | {{{#!th rowspan=3 |
| | 44 | '''CPU''' |
| | 45 | }}} |
| | 46 | {{{#!th colspan=35 |
| | 47 | '''Test / Variant''' |
| | 48 | }}} |
| | 49 | |-- |
| | 50 | {{{#!th colspan=5 |
| | 51 | '''page''' |
| | 52 | }}} |
| | 53 | {{{#!th colspan=5 |
| | 54 | '''overlap''' |
| | 55 | }}} |
| | 56 | {{{#!th colspan=5 |
| | 57 | '''short''' |
| | 58 | }}} |
| | 59 | {{{#!th colspan=5 |
| | 60 | '''short2''' |
| | 61 | }}} |
| | 62 | {{{#!th colspan=5 |
| | 63 | '''short3''' |
| | 64 | }}} |
| | 65 | {{{#!th colspan=5 |
| | 66 | '''offset''' |
| | 67 | }}} |
| | 68 | {{{#!th colspan=5 |
| | 69 | '''offset2''' |
| | 70 | }}} |
| | 71 | |-- |
| | 72 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 73 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 74 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 75 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 76 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 77 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| \ |
| | 78 | ||= '''stock''' =||= '''SSE2''' =||= '''SSE2 aligned''' =||= '''AVX''' =||= '''ERMS''' =|| |
| | 79 | || AMD FX-8120 || |
| | 80 | || AMD Opteron 6328 || |
| | 81 | || Intel Xeon X5365 || |
| | 82 | || Intel Xeon X5482 || |
| | 83 | || Intel Xeon X5675 || |
| | 84 | || Intel Core i5-2520M || |
| | 85 | || Intel Core i5-2500K || |
| | 86 | || Intel Xeon E5-2680 || |
| | 87 | || Intel Xeon E5-2667 v2 || |
| | 88 | |
| | 89 | == Conclusions == |
| | 90 | |
| | 91 | == Early Notes == |
| | 92 | |