본문 바로가기

3.구현/C or C++

메모리 복사 성능시험 (memcpy)

이 글은 여러 환경과 사용 방법에 따른 memcpy의 성능을 비교 실험한 글을 가져온 것이다.
특정한 시스템에서 테스트한 것이기 때문에 모든 시스템에 동일하게 적용된다고 볼 수는 없다. 단지 참고하기 바란다.
- ospace

####################################################

Subject: Performance of various memcpy()'s
To: None _tech-perform@netbsd.org_
From: Bang Jun-Young _junyoung@mogua.com_
List: tech-perform
Date: 10/16/2002 04:18:30

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi,

About 14 months ago, I had some discussion on memcpy performance on
i386 platform here. Months later, I took a look into it again, and
now am coming with (not-so-)new benchmark results (attached). The
tests were performed on Athlon XP 1800 and DDR 256MB.

From the results, it's obvious that memcpy() using MMX insns is the
best for in-cache sized data, typically 50-100% faster than plain old
memcpy for data <= 32 KB.

Another attached patch is i686 version of copyin(9) that makes use
of MMX insns. It works well with intops-only programs, but doesn't
with ones like XFree86 that uses FP ops. In this case, it would be
helpful if NPX handling code was imported from FreeBSD (they have
i586 optimized version of copyin/out(9)). Can anybody give me some
comments wrt this?

Jun-Young

--
Bang Jun-Young junyoung@mogua.com

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="memcpy_bench.txt"

addr1=0x804c000 addr2=0x804c080
memcpy 64B -- 16777216 loops
  aligned blocks
      libc memcpy                                        0.796562 s
      MMX memcpy using MOVQ                              0.332473 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.437861 s
      with simple MOVUSB (no prefetch)                   0.477142 s
      arjanv's MOVQ (with prefetch)                      0.397613 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.386256 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.468275 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.794225 s
      MMX memcpy using MOVQ                              0.408814 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.469252 s
      with simple MOVUSB (no prefetch)                   0.542820 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.931550 s
      MMX memcpy using MOVQ                              0.465778 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.556663 s
      with simple MOVUSB (no prefetch)                   0.545896 s

addr1=0x804c000 addr2=0x804c100
memcpy 128B -- 8388608 loops
  aligned blocks
      libc memcpy                                        0.511865 s
      MMX memcpy using MOVQ                              0.233085 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.314226 s
      with simple MOVUSB (no prefetch)                   0.363533 s
      arjanv's MOVQ (with prefetch)                      0.266980 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.255603 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.273115 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.513906 s
      MMX memcpy using MOVQ                              0.295375 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.351025 s
      with simple MOVUSB (no prefetch)                   0.412870 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.647541 s
      MMX memcpy using MOVQ                              0.381870 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.403322 s
      with simple MOVUSB (no prefetch)                   0.421661 s

addr1=0x804c000 addr2=0x804c200
memcpy 256B -- 4194304 loops
  aligned blocks
      libc memcpy                                        0.380581 s
      MMX memcpy using MOVQ                              0.173247 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.252765 s
      with simple MOVUSB (no prefetch)                   0.320588 s
      arjanv's MOVQ (with prefetch)                      0.196009 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.211234 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.198807 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.379022 s
      MMX memcpy using MOVQ                              0.241409 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.295835 s
      with simple MOVUSB (no prefetch)                   0.388839 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.505536 s
      MMX memcpy using MOVQ                              0.343646 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.318098 s
      with simple MOVUSB (no prefetch)                   0.359642 s

addr1=0x804c000 addr2=0x804c400
memcpy 512B -- 2097152 loops
  aligned blocks
      libc memcpy                                        0.309567 s
      MMX memcpy using MOVQ                              0.161895 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.259866 s
      with simple MOVUSB (no prefetch)                   0.299634 s
      arjanv's MOVQ (with prefetch)                      0.171824 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.204493 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.159063 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.310000 s
      MMX memcpy using MOVQ                              0.210169 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.293950 s
      with simple MOVUSB (no prefetch)                   0.347535 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.434541 s
      MMX memcpy using MOVQ                              0.318089 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.301053 s
      with simple MOVUSB (no prefetch)                   0.350758 s

addr1=0x804c000 addr2=0x804c800
memcpy 1024B -- 1048576 loops
  aligned blocks
      libc memcpy                                        0.276199 s
      MMX memcpy using MOVQ                              0.170408 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.232004 s
      with simple MOVUSB (no prefetch)                   0.274786 s
      arjanv's MOVQ (with prefetch)                      0.168275 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.192419 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.157286 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.276402 s
      MMX memcpy using MOVQ                              0.208041 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264838 s
      with simple MOVUSB (no prefetch)                   0.321226 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.399037 s
      MMX memcpy using MOVQ                              0.317386 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.269808 s
      with simple MOVUSB (no prefetch)                   0.323063 s

addr1=0x804c000 addr2=0x804f000
memcpy 2048B -- 524288 loops
  aligned blocks
      libc memcpy                                        0.259386 s
      MMX memcpy using MOVQ                              0.164728 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.216017 s
      with simple MOVUSB (no prefetch)                   0.262353 s
      arjanv's MOVQ (with prefetch)                      0.160822 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.188910 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.148048 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.259205 s
      MMX memcpy using MOVQ                              0.194549 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264177 s
      with simple MOVUSB (no prefetch)                   0.308492 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.381286 s
      MMX memcpy using MOVQ                              0.306385 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.256044 s
      with simple MOVUSB (no prefetch)                   0.309575 s

addr1=0x8050000 addr2=0x8052000
memcpy 4kB -- 262144 loops
  aligned blocks
      libc memcpy                                        0.251069 s
      MMX memcpy using MOVQ                              0.161883 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.250987 s
      with simple MOVUSB (no prefetch)                   0.256146 s
      arjanv's MOVQ (with prefetch)                      0.251169 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.256027 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.207190 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.250998 s
      MMX memcpy using MOVQ                              0.188332 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.267415 s
      with simple MOVUSB (no prefetch)                   0.301825 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.372422 s
      MMX memcpy using MOVQ                              0.300877 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264033 s
      with simple MOVUSB (no prefetch)                   0.302476 s

addr1=0x804f000 addr2=0x8054000
memcpy 8kB -- 131072 loops
  aligned blocks
      libc memcpy                                        0.246683 s
      MMX memcpy using MOVQ                              0.160469 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.223785 s
      with simple MOVUSB (no prefetch)                   0.253043 s
      arjanv's MOVQ (with prefetch)                      0.198100 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.220333 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.165994 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.246569 s
      MMX memcpy using MOVQ                              0.184975 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.244896 s
      with simple MOVUSB (no prefetch)                   0.298646 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.367977 s
      MMX memcpy using MOVQ                              0.298119 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.245495 s
      with simple MOVUSB (no prefetch)                   0.298924 s

addr1=0x804f000 addr2=0x8057000
memcpy 16kB -- 65536 loops
  aligned blocks
      libc memcpy                                        0.246980 s
      MMX memcpy using MOVQ                              0.159769 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.274188 s
      with simple MOVUSB (no prefetch)                   0.251510 s
      arjanv's MOVQ (with prefetch)                      0.174101 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.278145 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.306673 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.247309 s
      MMX memcpy using MOVQ                              0.183421 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.460254 s
      with simple MOVUSB (no prefetch)                   0.297058 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.368122 s
      MMX memcpy using MOVQ                              0.296768 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.299830 s
      with simple MOVUSB (no prefetch)                   0.297180 s

addr1=0x804f000 addr2=0x805c000
memcpy 32kB -- 32768 loops
  aligned blocks
      libc memcpy                                        0.246418 s
      MMX memcpy using MOVQ                              0.161774 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.291646 s
      with simple MOVUSB (no prefetch)                   0.252990 s
      arjanv's MOVQ (with prefetch)                      0.168720 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.220957 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.279949 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.253483 s
      MMX memcpy using MOVQ                              0.189459 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.330665 s
      with simple MOVUSB (no prefetch)                   0.299876 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.371691 s
      MMX memcpy using MOVQ                              0.280076 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.325644 s
      with simple MOVUSB (no prefetch)                   0.299598 s

addr1=0x805f000 addr2=0x8070000
memcpy 64kB -- 16384 loops
  aligned blocks
      libc memcpy                                        0.557651 s
      MMX memcpy using MOVQ                              0.484263 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.513905 s
      with simple MOVUSB (no prefetch)                   0.504620 s
      arjanv's MOVQ (with prefetch)                      0.481128 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.514562 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.513256 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.573247 s
      MMX memcpy using MOVQ                              0.577181 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511285 s
      with simple MOVUSB (no prefetch)                   0.596480 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.742978 s
      MMX memcpy using MOVQ                              0.657358 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.523992 s
      with simple MOVUSB (no prefetch)                   0.550855 s

addr1=0x805f000 addr2=0x8080000
memcpy 128kB -- 8192 loops
  aligned blocks
      libc memcpy                                        0.557362 s
      MMX memcpy using MOVQ                              0.480659 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.512456 s
      with simple MOVUSB (no prefetch)                   0.503718 s
      arjanv's MOVQ (with prefetch)                      0.477681 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.512652 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511952 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.568122 s
      MMX memcpy using MOVQ                              0.575231 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511856 s
      with simple MOVUSB (no prefetch)                   0.594627 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.742519 s
      MMX memcpy using MOVQ                              0.642598 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.514444 s
      with simple MOVUSB (no prefetch)                   0.549701 s

addr1=0x805f000 addr2=0x80a0000
memcpy 256kB -- 4096 loops
  aligned blocks
      libc memcpy                                        3.312519 s
      MMX memcpy using MOVQ                              2.991133 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511574 s
      with simple MOVUSB (no prefetch)                   3.132933 s
      arjanv's MOVQ (with prefetch)                      2.855973 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.511921 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511443 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        3.287035 s
      MMX memcpy using MOVQ                              3.142660 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511720 s
      with simple MOVUSB (no prefetch)                   3.266173 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.434399 s
      MMX memcpy using MOVQ                              3.422942 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.514526 s
      with simple MOVUSB (no prefetch)                   3.203671 s

addr1=0x805f000 addr2=0x80e0000
memcpy 512kB -- 2048 loops
  aligned blocks
      libc memcpy                                        3.318688 s
      MMX memcpy using MOVQ                              2.991518 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.113506 s
      with simple MOVUSB (no prefetch)                   3.136624 s
      arjanv's MOVQ (with prefetch)                      2.856525 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        1.760890 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.892791 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        3.288139 s
      MMX memcpy using MOVQ                              3.144040 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.998237 s
      with simple MOVUSB (no prefetch)                   3.270226 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.441033 s
      MMX memcpy using MOVQ                              3.426393 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.332106 s
      with simple MOVUSB (no prefetch)                   3.209592 s

addr1=0x805f000 addr2=0x8160000
memcpy 1024kB -- 1024 loops
  aligned blocks
      libc memcpy                                        3.158626 s
      MMX memcpy using MOVQ                              2.801466 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.963610 s
      with simple MOVUSB (no prefetch)                   2.986171 s
      arjanv's MOVQ (with prefetch)                      2.656920 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        1.601385 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.727029 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        2.989358 s
      MMX memcpy using MOVQ                              2.831822 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.909301 s
      with simple MOVUSB (no prefetch)                   3.057700 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.195734 s
      MMX memcpy using MOVQ                              3.108697 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.108903 s
      with simple MOVUSB (no prefetch)                   3.039293 s

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="memcpy_bench.c"

/* -*- c-file-style: "linux" -*- */

/* memcpy speed benchmark using different i86-specific routines.
 *
 * Framework (C) 2001 by Martin Pool <mbp@samba.org>, based on speed.c
 * by tridge.
 *
 * Routines lifted from all kinds of places.
 *
 * You must not use floating-point code anywhere in this application
 * because it scribbles on the FP state and does not reset it. */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/* Classic two-argument maximum/minimum macros.  Each argument is evaluated
 * twice, so do not pass expressions with side effects.  Neither macro is
 * referenced in the visible part of this file. */
#define MAX(a,b) ((a)>(b)?(a):(b))  
#define MIN(a,b) ((a)<(b)?(a):(b))

#include <sys/resource.h>

/* Resource-usage snapshots: tp1 is filled by start_timer(), tp2 by
 * end_timer(). */
struct rusage tp1,tp2;

/*
 * Begin a timed section: store the calling process's current resource
 * usage in tp1.  The return value of getrusage() is not checked.
 */
static void start_timer(void)
{
	getrusage(RUSAGE_SELF, &tp1);
}

static long end\_timer()  
{  
getrusage(RUSAGE_SELF,&tp2);  
#if 0  
printf ("tp1 = %ld.%05ld, tp2 = %ld.%05ld\\n",  
(long) tp1.ru_utime.tv_sec, (long) tp1.ru_utime.tv_usec,  
(long) tp2.ru_utime.tv_sec, (long) tp2.ru_utime.tv_usec);  
#endif

return ((tp2.ru_utime.tv_sec - tp1.ru_utime.tv_sec) * 1000000 + 
    (tp2.ru_utime.tv_usec - tp1.ru_utime.tv_usec));

}

/*
 * By Ingo Molnar and Doug Ledford; hacked up to remove
 * kernel-specific stuff like saving/restoring float registers.
 *
 * http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 *
 * memcpy_movusb: copy n bytes from `from' to `to' with SSE: unaligned
 * 16-byte loads (MOVUPS), non-temporal 16-byte stores (MOVNTPS) to the
 * 16-byte-aligned part of the destination, and PREFETCHNTA hints on the
 * source.  The buffers must not overlap.
 *
 * Returns the destination pointer as advanced by the copy, NOT the
 * original `to' (unlike libc memcpy).  The benchmark driver in this file
 * ignores the return value.
 *
 * %xmm0/%xmm1 are overwritten without being declared as clobbered; this
 * is only safe because the program uses no floating-point/SSE code of
 * its own.
 */
void *
memcpy_movusb (void *to, const void *from, size_t n)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t size;

#define STEP 0x20
#define ALIGN 0x10
	/*
	 * The head and tail fix-ups below always move a full 16 bytes, so
	 * they would overrun (and `n - ALIGN' would wrap around) on short
	 * requests.  Copy those one byte at a time instead.
	 */
	if (n < STEP) {
		while (n-- > 0)
			*dst++ = *src++;
		return dst;
	}

	/*
	 * Head: if the destination is not 16-byte aligned, copy the first
	 * 16 bytes unaligned and then step forward to the next aligned
	 * destination address.  The bytes in between get written twice.
	 */
	if ((unsigned long)dst & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)dst & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src),
				       "r" (dst)
				     : "memory");
		n -= size;
		src += size;
		dst += size;
	}
	/*
	 * If the copy would have tailings, take care of them
	 * now instead of later: copy the last 16 bytes unaligned and round
	 * the remaining count down to a multiple of 16.
	 */
	if (n & (ALIGN-1)) {
		size = n - ALIGN;
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src + size),
				       "r" (dst + size)
				     : "memory");
		n &= ~(ALIGN-1);
	}
	/*
	 * Prefetch the first two cachelines now.
	 */
	__asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
			     "prefetchnta 0x20(%0)\n\t"
			     :
			     : "r" (src));

	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movntps %%xmm0,0x00(%1)\n\t"
			"movntps %%xmm1,0x10(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory");
		src += STEP;
		/*
		 * Note: Intermixing the prefetch at *exactly* this point
		 * in time has been shown to be the fastest possible.
		 * Timing these prefetch instructions is a complete black
		 * art with nothing but trial and error showing the way.
		 * To that extent, this optimum version was found by using
		 * a userland version of this routine that we clocked for
		 * lots of runs.  We then fiddled with ordering until we
		 * settled on our highest speed routines.  So, the long
		 * and short of this is, don't mess with instruction ordering
		 * here or suffer performance penalties you will.
		 */
		__asm__ __volatile__(
			"prefetchnta 0x20(%0)\n\t"
			:
			: "r" (src));
		dst += STEP;
		n -= STEP;
	}

	/*
	 * n is a multiple of 16 here, so exactly one aligned 16-byte chunk
	 * can be left over when it was not a multiple of STEP.  Without
	 * this step those 16 bytes would never be copied.
	 */
	if (n >= ALIGN) {
		__asm__ __volatile__(
			"movups (%0),%%xmm0\n\t"
			"movntps %%xmm0,(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory");
		dst += ALIGN;
	}

	return dst;
}

/*
 * memcpy_simple_movusb: copy n bytes from `from' to `to' with plain
 * unaligned SSE moves (MOVUPS for both loads and stores), without any
 * prefetching or non-temporal stores.  The buffers must not overlap.
 *
 * Returns the destination pointer as advanced by the copy, NOT the
 * original `to' (unlike libc memcpy).  The benchmark driver in this file
 * ignores the return value.
 *
 * %xmm0/%xmm1 are overwritten without being declared as clobbered; this
 * is only safe because the program uses no floating-point/SSE code of
 * its own.
 */
void *
memcpy_simple_movusb (void *to, const void *from, size_t n)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t size;

#define STEP 0x20
#define ALIGN 0x10
	/*
	 * The head and tail fix-ups below always move a full 16 bytes, so
	 * they would overrun (and `n - ALIGN' would wrap around) on short
	 * requests.  Copy those one byte at a time instead.
	 */
	if (n < STEP) {
		while (n-- > 0)
			*dst++ = *src++;
		return dst;
	}

	/*
	 * Head: if the destination is not 16-byte aligned, copy the first
	 * 16 bytes and then step forward to the next aligned destination
	 * address.  The bytes in between get written twice.
	 */
	if ((unsigned long)dst & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)dst & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src),
				       "r" (dst)
				     : "memory");
		n -= size;
		src += size;
		dst += size;
	}
	/*
	 * If the copy would have tailings, take care of them
	 * now instead of later: copy the last 16 bytes and round the
	 * remaining count down to a multiple of 16.
	 */
	if (n & (ALIGN-1)) {
		size = n - ALIGN;
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src + size),
				       "r" (dst + size)
				     : "memory");
		n &= ~(ALIGN-1);
	}

	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movups %%xmm0,0x00(%1)\n\t"
			"movups %%xmm1,0x10(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory");
		src += STEP;
		dst += STEP;
		n -= STEP;
	}

	/*
	 * n is a multiple of 16 here, so exactly one 16-byte chunk can be
	 * left over when it was not a multiple of STEP.  Without this step
	 * those 16 bytes would never be copied.
	 */
	if (n >= ALIGN) {
		__asm__ __volatile__(
			"movups (%0),%%xmm0\n\t"
			"movups %%xmm0,(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory");
		dst += ALIGN;
	}

	return dst;
}

/*
 * From Linux 2.4.8.  I think this must be aligned.
 *
 * Copy len bytes from `from' to `to': whole 64-byte blocks go through the
 * MMX registers %mm0-%mm3 (two groups of four MOVQ loads followed by four
 * MOVQ stores), the remaining len % 64 bytes through libc memcpy().
 *
 * Returns the destination pointer advanced past the 64-byte blocks only
 * (i.e. to + (len & ~63)), not the original `to'.  No EMMS is issued, so
 * the x87/MMX state is left dirty on return.
 */
void *
memcpy_mmx (void *to, const void *from, size_t len)
{
	const char *src = from;
	char *dst = to;
	size_t blocks = len / 64;
	size_t rest = len & 63;

	while (blocks != 0) {
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n"
			"\tmovq 8(%0), %%mm1\n"
			"\tmovq 16(%0), %%mm2\n"
			"\tmovq 24(%0), %%mm3\n"
			"\tmovq %%mm0, (%1)\n"
			"\tmovq %%mm1, 8(%1)\n"
			"\tmovq %%mm2, 16(%1)\n"
			"\tmovq %%mm3, 24(%1)\n"
			"\tmovq 32(%0), %%mm0\n"
			"\tmovq 40(%0), %%mm1\n"
			"\tmovq 48(%0), %%mm2\n"
			"\tmovq 56(%0), %%mm3\n"
			"\tmovq %%mm0, 32(%1)\n"
			"\tmovq %%mm1, 40(%1)\n"
			"\tmovq %%mm2, 48(%1)\n"
			"\tmovq %%mm3, 56(%1)\n"
			: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
		blocks--;
	}

	/* Bytes that do not fill a whole 64-byte block. */
	if (rest != 0)
		memcpy(dst, src, rest);

	return dst;
}

/*
 * Print one benchmark result line: the label msg left-justified in a
 * 50-column field, followed by t (a duration in microseconds) formatted
 * as seconds with six decimal places.  `loops' is part of the signature
 * but is not used.
 */
static void print_time (char const *msg,
			long long loops,
			long t)
{
	(void) loops;
	printf(" %-50s %ld.%06ld s\n", msg, t/1000000,
	       t % 1000000);
}

/*
 * memcpy_arjanv: MMX block copy with software prefetch.
 *
 * Five PREFETCHNTA hints cover source offsets 0..256 up front; each
 * 64-byte block then prefetches offset 320 and is moved through
 * %mm0-%mm3 with MOVQ loads and stores.  The remaining len % 64 bytes are
 * copied by libc memcpy().
 *
 * Returns the destination pointer advanced past the 64-byte blocks only
 * (to + (len & ~63)), not the original `to'.  No EMMS is issued.
 */
void *
memcpy_arjanv (void *to, const void *from, size_t len)
{
	const char *src = from;
	char *dst = to;
	size_t blocks;
	size_t rest = len & 63;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		"   prefetchnta 256(%0)\n"
		: : "r" (src) );

	for (blocks = len / 64; blocks != 0; blocks--) {
		__asm__ __volatile__ (
			"1: prefetchnta 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}

	/* Tail of the block: whatever did not fill 64 bytes. */
	if (rest != 0)
		memcpy(dst, src, rest);

	return dst;
}

/*
 * memcpy_arjanv_movntq: MMX block copy with non-temporal stores.
 *
 * Four PREFETCHNTA hints cover source offsets 0..192 up front; each
 * 64-byte block then prefetches offset 200, loads the whole block into
 * %mm0-%mm7 with MOVQ and writes it out with eight MOVNTQ stores.  The
 * remaining len % 64 bytes are copied by libc memcpy().
 *
 * Returns the destination pointer advanced past the 64-byte blocks only
 * (to + (len & ~63)), not the original `to'.  No EMMS or SFENCE is
 * issued.
 */
void *
memcpy_arjanv_movntq (void *to, const void *from, size_t len)
{
	const char *src = from;
	char *dst = to;
	size_t blocks;
	size_t rest = len & 63;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (src) );

	for (blocks = len / 64; blocks != 0; blocks--) {
		__asm__ __volatile__ (
			"   prefetchnta 200(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq 32(%0), %%mm4\n"
			"   movq 40(%0), %%mm5\n"
			"   movq 48(%0), %%mm6\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm0, (%1)\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}

	/* Tail of the block: whatever did not fill 64 bytes. */
	if (rest != 0)
		memcpy(dst, src, rest);

	return dst;
}

/*
 * memcpy_arjanv_interleave: MMX block copy that alternates each MOVQ load
 * with the MOVNTQ (non-temporal) store of the same quadword.
 *
 * Four PREFETCHNTA hints cover source offsets 0..192 up front; each
 * 64-byte block then prefetches offset 168 and is moved through
 * %mm0-%mm7 one quadword at a time.  The remaining len % 64 bytes are
 * copied by libc memcpy().
 *
 * Returns the destination pointer advanced past the 64-byte blocks only
 * (to + (len & ~63)), not the original `to'.  No EMMS or SFENCE is
 * issued.
 */
void *
memcpy_arjanv_interleave (void *to, const void *from, size_t len)
{
	const char *src = from;
	char *dst = to;
	size_t blocks;
	size_t rest = len & 63;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (src) );

	for (blocks = len / 64; blocks != 0; blocks--) {
		__asm__ __volatile__ (
			"   prefetchnta 168(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}

	/* Tail of the block: whatever did not fill 64 bytes. */
	if (rest != 0)
		memcpy(dst, src, rest);

	return dst;
}

/*
 * Time one copy routine and print the result.
 *
 *   p1    destination buffer handed to bfn
 *   p2    source buffer handed to bfn; filled with the byte 42 first
 *   size  number of bytes copied per call
 *   loops number of times bfn is called inside the timed section
 *   bfn   memcpy-compatible routine under test
 *   msg   label printed in front of the measured time
 *
 * Only the loop of bfn() calls is timed; the memset() that initialises
 * the source runs before the timer starts.
 */
static void wrap (char *p1,
		  char *p2,
		  size_t size,
		  long loops,
		  void *(*bfn) (void *, const void *, size_t),
		  const char *msg)
{
	long t;
	long i;		/* same type as loops, so the comparison cannot truncate */

	memset(p2,42,size);

	start_timer();

	for (i=0; i<loops; i++)
		bfn (p1, p2, size);

	t = end_timer();

	print_time (msg, loops, t);
}

/*
 * Run the whole benchmark matrix for one block size.
 *
 * Each routine is called often enough to move 1 GiB in total
 * (loops = 2^30 / size), so timings for different sizes are directly
 * comparable.  size must be non-zero.
 *
 * Three placements are measured: both buffers as returned by malloc(),
 * the source shifted by 4 bytes, and destination/source shifted by 10 and
 * 13 bytes.  Only libc memcpy, the MOVQ routine and the two MOVUPS
 * routines are run on the shifted buffers.  The 64 spare bytes allocated
 * per buffer leave room for those shifts.
 */
static void memcpy_test(size_t size)
{
	long loops = 1024*1024*1024 / size;

	/* We need to make sure the blocks are *VERY* aligned, because
	   MMX is potentially pretty fussy.
	   NOTE(review): nothing here enforces more alignment than malloc()
	   itself provides. */

	char *p1 = (char *) malloc (size+64);
	char *p2 = (char *) malloc (size+64);

	if (p1 == NULL || p2 == NULL) {
		fprintf(stderr, "memcpy_test: cannot allocate buffers for size %lu\n",
			(unsigned long) size);
		free(p1);
		free(p2);
		return;
	}

	printf("addr1=%p addr2=%p\n", (void *) p1, (void *) p2);

	/* size is a size_t; convert explicitly to match the %d conversions */
	if (size > 2048)
		printf ("memcpy %dkB -- %ld loops\n", (int) (size>>10), loops);
	else
		printf ("memcpy %dB -- %ld loops\n", (int) size, loops);


	printf ("  aligned blocks\n");

	wrap (p1, p2, size, loops, memcpy, "libc memcpy");
	wrap (p1, p2, size, loops, memcpy_mmx,
	      "MMX memcpy using MOVQ");
	wrap (p1, p2, size, loops, memcpy_movusb,
	      "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1, p2, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");
	wrap (p1, p2, size, loops, memcpy_arjanv,
	      "arjanv's MOVQ (with prefetch)");
	wrap (p1, p2, size, loops, memcpy_arjanv_movntq,
	      "arjanv's MOVNTQ (with prefetch, for Athlon)");
	wrap (p1, p2, size, loops, memcpy_arjanv_interleave,
	      "arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA");

	printf ("  +0/+4 moderately unaligned blocks\n");

	wrap (p1, p2+4, size, loops, memcpy, "libc memcpy");
	wrap (p1, p2+4, size, loops, memcpy_mmx,
	      "MMX memcpy using MOVQ");
	wrap (p1, p2+4, size, loops, memcpy_movusb,
	      "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1, p2+4, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");

	printf ("  +10/+13 cruelly unaligned blocks\n");

	wrap (p1+10, p2+13, size, loops, memcpy, "libc memcpy");
	wrap (p1+10, p2+13, size, loops, memcpy_mmx,
	      "MMX memcpy using MOVQ");
	wrap (p1+10, p2+13, size, loops, memcpy_movusb,
	      "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1+10, p2+13, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");

	puts("");

	free(p1); free(p2);
}

/*
 * Run the benchmark for every power-of-two block size from 64 bytes
 * (1 << 6) up to 1 MiB (1 << 20), in increasing order.
 */
int main (void)
{
	int shift;

	for (shift = 6; shift <= 20; shift++)
		memcpy_test((size_t) 1 << shift);

	return 0;
}

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="locore.s.diff"

Index: locore.s
===================================================================
RCS file: /usr/local/cvs/moguawin/sys/arch/i386/i386/locore.s,v
retrieving revision 1.5
diff -u -r1.5 locore.s
--- locore.s    2002/10/10 03:59:38    1.5
+++ locore.s    2002/10/15 18:55:38
@@ -951,7 +951,7 @@
 #define    DEFAULT_COPYIN        _C_LABEL(i386_copyin)    /* XXX */
 #elif defined(I686_CPU)
 #define    DEFAULT_COPYOUT        _C_LABEL(i486_copyout)    /* XXX */
-#define    DEFAULT_COPYIN        _C_LABEL(i386_copyin)    /* XXX */
+#define    DEFAULT_COPYIN        _C_LABEL(i686_copyin)    /* XXX */
 #endif

     .data
@@ -1159,6 +1159,103 @@
     xorl    %eax,%eax
     ret
 #endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)
+    pushl    %esi
+    pushl    %edi
+    pushl    %ebx
+    GET_CURPCB(%eax)
+    movl    $_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)    /* register i686_copy_fault in the PCB's onfault slot */
+    
+    movl    16(%esp),%eax    /* %eax = uaddr: 3 saved registers + return address = 16 bytes */
+    movl    20(%esp),%ecx    /* %ecx = kaddr */
+    movl    24(%esp),%esi    /* %esi = len */
+
+    /*
+     * We check that the end of the source (user) buffer is not past the
+     * end of the user's address space.  If it's not, then we only need to
+     * check that each page is readable, and the CPU will do that for us.
+     */
+    movl    %eax,%edx
+    addl    %esi,%edx    /* %edx = uaddr + len */
+    jc    _C_LABEL(i686_copy_efault)    /* the sum wrapped around */
+    cmpl    $VM_MAXUSER_ADDRESS,%edx
+    ja    _C_LABEL(i686_copy_efault)    /* unsigned: end lies above the user limit */
+
+    xorl    %ebx,%ebx    /* %ebx = blocks copied so far */
+    movl    %esi,%edx
+    shrl    $6,%edx    /* %edx = number of whole 64-byte blocks */
+    cmpl    %edx,%ebx
+    jae    2f    /* no whole block: go straight to the tail copy */
+
+1:    /* MMX loop, 64 bytes per pass; %mm0-%mm3 are used without saving any FPU/MMX state */
+    movq     (%eax),%mm0
+    movq    8(%eax),%mm1
+    movq    16(%eax),%mm2
+    movq    24(%eax),%mm3
+    movq    %mm0,(%ecx)
+    movq    %mm1,8(%ecx)
+    movq    %mm2,16(%ecx)
+    movq    %mm3,24(%ecx)
+    movq    32(%eax),%mm0
+    movq    40(%eax),%mm1
+    movq    48(%eax),%mm2
+    movq    56(%eax),%mm3
+    movq    %mm0,32(%ecx)
+    movq    %mm1,40(%ecx)
+    movq    %mm2,48(%ecx)
+    movq    %mm3,56(%ecx)
+
+    addl    $64,%eax    /* advance source */
+    addl    $64,%ecx    /* advance destination */
+    incl    %ebx
+    cmpl    %edx,%ebx
+    jb    1b    /* unsigned: more blocks remain */
+
+2:    /* tail: copy the remaining len % 64 bytes with string moves */
+    movl    %esi,%edx
+    andl    $63,%edx    /* %edx = len % 64 */
+    je    3f    /* nothing left */
+
+    movl    %eax,%esi    /* %esi = current source */
+    movl    %edx,%eax    /* %eax = remaining byte count */
+    movl    %ecx,%edi    /* %edi = current destination */
+
+    /* bcopy(%esi, %edi, %eax); */
+    cld
+    movl    %eax,%ecx
+    shrl    $2,%ecx    /* whole 32-bit words first */
+    rep
+    movsl
+    movb    %al,%cl
+    andb    $3,%cl    /* then the last 0-3 bytes; rep movsl left %ecx at 0 */
+    rep
+    movsb
+
+3:    /* success: clear the onfault slot, restore registers, return 0 */
+    GET_CURPCB(%edx)
+    xorl    %eax,%eax
+    popl    %ebx
+    popl    %edi
+    popl    %esi
+    movl    %eax,PCB_ONFAULT(%edx)
+    ret
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+    movl    $EFAULT,%eax    /* range check failed: return EFAULT, then fall through */
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+    GET_CURPCB(%edx)
+    movl    $0,PCB_ONFAULT(%edx)    /* clear the handler; %eax holds the error code to return and must not be stored here */
+    popl    %ebx
+    popl    %edi
+    popl    %esi
+    ret
+#endif /* I686_CPU */

 /* LINTSTUB: Ignore */
 NENTRY(copy_efault)

--mYCpIKhGyMATD0i+--

반응형