diff options
author | John J. Ellis <jje@gentoo.org> | 2003-08-05 09:58:13 +0000 |
---|---|---|
committer | John J. Ellis <jje@gentoo.org> | 2003-08-05 09:58:13 +0000 |
commit | a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8 (patch) | |
tree | 1b38a01b0d32c0587f20de9b531b03b23cc424fc /media-libs/libvorbis | |
parent | Added simd patch (use sse) ~x86 masked. Closes #21585. (diff) | |
download | historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.tar.gz historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.tar.bz2 historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.zip |
Added simd patch (use sse) ~x86 masked. Closes #21585.
Diffstat (limited to 'media-libs/libvorbis')
-rw-r--r-- | media-libs/libvorbis/Manifest | 14 | ||||
-rw-r--r-- | media-libs/libvorbis/files/digest-libvorbis-1.0-r1 | 1 | ||||
-rw-r--r-- | media-libs/libvorbis/files/digest-libvorbis-1.0-r3 (renamed from media-libs/libvorbis/files/digest-libvorbis-1.0) | 0 | ||||
-rw-r--r-- | media-libs/libvorbis/files/libvorbis-1.0-m4.patch | 31 | ||||
-rw-r--r-- | media-libs/libvorbis/files/libvorbis-m4.patch (renamed from media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch) | 0 | ||||
-rw-r--r-- | media-libs/libvorbis/files/libvorbis-simd.patch | 1019 | ||||
-rw-r--r-- | media-libs/libvorbis/libvorbis-1.0-r2.ebuild | 4 | ||||
-rw-r--r-- | media-libs/libvorbis/libvorbis-1.0-r3.ebuild (renamed from media-libs/libvorbis/libvorbis-1.0-r1.ebuild) | 19 | ||||
-rw-r--r-- | media-libs/libvorbis/libvorbis-1.0.ebuild | 61 |
9 files changed, 1039 insertions, 110 deletions
diff --git a/media-libs/libvorbis/Manifest b/media-libs/libvorbis/Manifest index 3aefcf7106cc..370f1349411c 100644 --- a/media-libs/libvorbis/Manifest +++ b/media-libs/libvorbis/Manifest @@ -1,13 +1,7 @@ -MD5 13ba2657dde7f84a0389ee0f31fd2523 ChangeLog 2029 -MD5 e9a0d9e24916cc5c07ae807bf644b5e8 libvorbis-1.0-r1.ebuild 1562 -MD5 547a992279bd48bc8c0ec30ba5a276f2 libvorbis-1.0.ebuild 1490 -MD5 41684381f21d3559c14320860478658c libvorbis-1.0-r2.ebuild 1710 -MD5 5603100e019622c95077292668929b6e libvorbis-1.0-r3.ebuild 1777 -MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0 65 -MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r1 65 -MD5 b8048905ed8638913c7b6302fdc3eb4b files/libvorbis-1.0-m4.patch 1032 +MD5 3df66b52d2f66cad749719b268ee3dce ChangeLog 2321 +MD5 073b0e5bb9e3c2061a1fa745f4130320 libvorbis-1.0-r2.ebuild 1707 +MD5 caf792187e22d3bcdee52fa25bf4283b libvorbis-1.0-r3.ebuild 1773 MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r2 65 -MD5 069e26021a32d3d105c544229e071e5f files/libvorbis-1.0-r2-m4.patch 919 MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r3 65 MD5 174ab9630810bce8aac9eac4f4f20247 files/libvorbis-simd.patch 28887 -MD5 b8048905ed8638913c7b6302fdc3eb4b files/libvorbis-m4.patch 1032 +MD5 069e26021a32d3d105c544229e071e5f files/libvorbis-m4.patch 919 diff --git a/media-libs/libvorbis/files/digest-libvorbis-1.0-r1 b/media-libs/libvorbis/files/digest-libvorbis-1.0-r1 deleted file mode 100644 index 6c800c478006..000000000000 --- a/media-libs/libvorbis/files/digest-libvorbis-1.0-r1 +++ /dev/null @@ -1 +0,0 @@ -MD5 d1ad94fe8e240269c790e18992171e53 libvorbis-1.0.tar.gz 749064 diff --git a/media-libs/libvorbis/files/digest-libvorbis-1.0 b/media-libs/libvorbis/files/digest-libvorbis-1.0-r3 index 6c800c478006..6c800c478006 100644 --- a/media-libs/libvorbis/files/digest-libvorbis-1.0 +++ b/media-libs/libvorbis/files/digest-libvorbis-1.0-r3 diff --git a/media-libs/libvorbis/files/libvorbis-1.0-m4.patch b/media-libs/libvorbis/files/libvorbis-1.0-m4.patch deleted file mode 100644 index 43828e527c6a..000000000000 --- a/media-libs/libvorbis/files/libvorbis-1.0-m4.patch +++ /dev/null @@ -1,31 +0,0 @@ -diff -u -r libvorbis-1.0/vorbis.m4 libvorbis-cvs/vorbis.m4 ---- libvorbis-1.0/vorbis.m4 2002-07-09 23:08:57.000000000 +1000 -+++ libvorbis-cvs/vorbis.m4 2002-08-07 02:01:40.000000000 +1000 -@@ -54,6 +54,7 @@ - #include <stdlib.h> - #include <string.h> - #include <vorbis/codec.h> -+#include <vorbis/vorbisenc.h> - - int main () - { -@@ -62,7 +63,7 @@ - vorbis_info vi; - - vorbis_info_init (&vi); -- vorbis_encode_init (&vi, 2, 44100, -1, 128, -1); -+ vorbis_encode_init (&vi, 2, 44100, -1, 128000, -1); - vorbis_analysis_init (&vd, &vi); - vorbis_block_init (&vd, &vb); - /* this function was added in 1.0rc3, so this is what we're testing for */ -@@ -86,8 +87,8 @@ - : - else - echo "*** Could not run Vorbis test program, checking why..." -- CFLAGS="$CFLAGS $VORBIS_CFLAGS" -- LIBS="$LIBS $VORBIS_LIBS $OGG_LIBS" -+ CFLAGS="$CFLAGS $VORBIS_CFLAGS $OGG_CFLAGS" -+ LIBS="$LIBS $VORBIS_LIBS $VORBISENC_LIBS $OGG_LIBS" - AC_TRY_LINK([ - #include <stdio.h> - #include <vorbis/codec.h> diff --git a/media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch b/media-libs/libvorbis/files/libvorbis-m4.patch index 55a82bc98411..55a82bc98411 100644 --- a/media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch +++ b/media-libs/libvorbis/files/libvorbis-m4.patch diff --git a/media-libs/libvorbis/files/libvorbis-simd.patch b/media-libs/libvorbis/files/libvorbis-simd.patch new file mode 100644 index 000000000000..85f1d1aef7be --- /dev/null +++ b/media-libs/libvorbis/files/libvorbis-simd.patch @@ -0,0 +1,1019 @@ +diff -ur libvorbis-1.0/lib/block.c libvorbis-1.0-simd/lib/block.c +--- libvorbis-1.0/lib/block.c 2002-07-11 08:40:48.000000000 +0200 ++++ libvorbis-1.0-simd/lib/block.c 2003-04-26 20:32:07.000000000 +0200 +@@ -22,6 +22,7 @@ + #include <stdlib.h> + #include <string.h> + #include <ogg/ogg.h> ++#include <assert.h> + #include "vorbis/codec.h" + #include "codec_internal.h" + +@@ -31,6 +32,10 @@ + #include "registry.h" + #include "misc.h" + ++#ifdef __SSE__ ++#include <xmmintrin.h> ++#endif ++ + static int ilog2(unsigned int v){ + int ret=0; + if(v)--v; +@@ -701,11 +706,95 @@ + /* the overlap/add section */ + if(v->lW){ + if(v->W){ ++#ifdef __SSE__ ++ /* large/large */ ++ float *pcm=v->pcm[j]+prevCenter; ++ float *p=vb->pcm[j]; ++ unsigned long _pcm=(unsigned long)pcm&15; ++ unsigned long _p=(unsigned long)p&15; ++ register __m128* PCM,* P; ++ i=0; ++ /* n1 is always 1024, prevCenter is either 0 or 1024 */ ++ /* both pcm and p can be unaligned, and they usually are. ++ * This code assumes unaligned addresses are still 8-byte ++ * aligned (which is true because glibc's malloc does 8-byte ++ * alignment */ ++ if (_pcm) { ++ pcm[0]+=p[0]; pcm[1]+=p[1]; ++ i=2; ++ PCM=(__m128*)(pcm+2); ++ P=(__m128*)(p+2); ++ } else { ++ PCM=(__m128*)pcm; ++ P=(__m128*)p; ++ } ++ if (_pcm ^ _p) { ++ /* one is properly aligned, the other is not */ ++ register __m128 a; ++ P=(__m128*)((float*)P-2); ++ a=P[0]; ++ for (; i+7<n1; i+=8) { ++ register __m128 b=P[1]; ++ /* now we need the upper most floats from a and the lower ++ * most floats from b */ ++ *PCM=_mm_add_ps(_mm_movehl_ps(_mm_shuffle_ps(b,b,_MM_SHUFFLE(1,0,3,2)),a),*PCM); ++ a=P[2]; ++ PCM[1]=_mm_add_ps(_mm_movehl_ps(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)),b),PCM[1]); ++ PCM+=2; P+=2; ++ } ++ } else { ++ for (; i+3<n1; i+=4) { ++ *PCM=_mm_add_ps(*P,*PCM); ++ ++PCM; ++P; ++ } ++ } ++ /* strange, does not appear to happen */ ++ for(;i<n1;i++) ++ pcm[i]+=p[i]; ++#elif defined(simd_3dn) ++ assert((n1&1)==0); ++ asm volatile("jecxz 2f\n\t" ++ "1: " ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "pfadd (%2),%%mm0\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "pfadd (%2),%%mm0\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "pfadd (%2),%%mm0\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "pfadd (%2),%%mm0\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jz 2f\n\t" ++ "jmp 1b\n\t" ++ "2: femms\n\t" ++ : : "c" (n1), ++ "r" ((float*)(vb->pcm[j])), ++ "r" ((float*)(v->pcm[j]+prevCenter)) : "memory" ); ++#else + /* large/large */ + float *pcm=v->pcm[j]+prevCenter; + float *p=vb->pcm[j]; + for(i=0;i<n1;i++) + pcm[i]+=p[i]; ++#endif + }else{ + /* large/small */ + float *pcm=v->pcm[j]+prevCenter+n1/2-n0/2; +@@ -732,12 +821,60 @@ + } + + /* the copy section */ ++#ifdef simd_3dn ++ assert((n&1)==0); ++ asm volatile("jecxz 2f\n\t" ++ "testl $1,%%ecx\n\t" ++ "jz 1f\n\t" ++ "3: movl (%1),%%eax\n\t" ++ "addl $4,%1\n\t" ++ "movl %%eax,(%2)\n\t" ++ "addl $4,%2\n\t" ++ "decl %%ecx\n\t" ++ "jz 2f\n\t" ++ "1: " ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jc 3b\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jc 3b\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jc 3b\n\t" ++ "jz 2f\n\t" ++ "movq (%1),%%mm0\n\t" ++ "addl $8,%1\n\t" ++ "movq %%mm0,(%2)\n\t" ++ "addl $8,%2\n\t" ++ "subl $2,%%ecx\n\t" ++ "jc 3b\n\t" ++ "jz 2f\n\t" ++ "jmp 1b\n\t" ++ "2: femms\n\t" ++ : : "c" (n), ++ "r" ((float*)(vb->pcm[j]+n)), ++ "r" ((float*)(v->pcm[j]+thisCenter)) ++ : "%eax", "memory" ); ++#else + { + float *pcm=v->pcm[j]+thisCenter; + float *p=vb->pcm[j]+n; + for(i=0;i<n;i++) + pcm[i]=p[i]; + } ++#endif + } + + if(v->centerW) +diff -ur libvorbis-1.0/lib/lsp.c libvorbis-1.0-simd/lib/lsp.c +--- libvorbis-1.0/lib/lsp.c 2002-07-17 23:28:37.000000000 +0200 ++++ libvorbis-1.0-simd/lib/lsp.c 2003-04-26 20:32:27.000000000 +0200 +@@ -54,7 +54,12 @@ + #define FLOAT_LOOKUP + #undef INT_LOOKUP + ++#ifdef __SSE__ ++#include <xmmintrin.h> ++#endif ++ + #ifdef FLOAT_LOOKUP ++ + #include "lookup.c" /* catch this in the build system; we #include for + compilers (like gcc) that can't inline across + modules */ +@@ -73,17 +78,88 @@ + while(i<n){ + int k=map[i]; + int qexp; ++#ifdef __SSE__ ++ register __m128 pqpq; ++ static float __attribute__((aligned(16))) PQPQ[4]; ++ register __m128 wwww; ++ float pq[2]; ++#define p pq[1] ++#define q pq[0] ++#else ++#ifdef simd_3dn ++ float pq[2],ww[2]; ++#define p pq[1] ++#define q pq[0] ++#define w ww[0] ++#else + float p=.7071067812f; + float q=.7071067812f; + float w=vorbis_coslook(wdel*k); ++#endif ++#endif + float *ftmp=lsp; + int c=m>>1; + ++#ifdef __SSE__ ++ static float __attribute__((aligned(16))) w; ++ w=vorbis_coslook(wdel*k); ++ PQPQ[0]=PQPQ[1]=.7071067812f; ++ if ((((long)ftmp)&15)==8) { ++ PQPQ[2]=ftmp[0]-w; ++ PQPQ[3]=ftmp[1]-w; ++ --c; ++ ftmp+=2; ++ } else { ++ PQPQ[2]=PQPQ[3]=1.f; ++ } ++ pqpq=*(__m128*)&(PQPQ[0]); ++ wwww=_mm_load1_ps(&w); ++#define UNROLL ++#ifdef UNROLL ++ while (c>3) { ++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww)); ++ ftmp+=4; ++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww)); ++ ftmp+=4; ++ c-=4; ++ } ++#endif ++ while (c>1) { ++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww)); ++ ftmp+=4; ++ c-=2; ++ } ++ pqpq=_mm_mul_ps(pqpq,_mm_shuffle_ps(pqpq,pqpq,_MM_SHUFFLE(1,0,3,2))); ++ _mm_storel_pi((__m64*)(&(pq[0])),pqpq); ++ if (c) { ++ q*=ftmp[0]-w; ++ p*=ftmp[1]-w; ++ ftmp+=2; ++ }; ++#else ++#ifdef simd_3dn ++ pq[0]=pq[1]=.7071067812f; ++ ww[0]=ww[1]=vorbis_coslook(wdel*k); ++ ++ asm volatile("movq (%2),%%mm1\n\t" /* mm1 = (w,w) */ ++ "movq (%3),%%mm2\n\t" /* mm2 = (p,q) */ ++ "1: movq (%0),%%mm0\n\t" /* mm0 = (ftmp[0],ftmp[1]) */ ++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (ftmp[0]-w,ftmp[1]-w) */ ++ "pfmul %%mm0,%%mm2\n\t" /* mm2 *= (ftmp[0]-w,ftmp[1]-w) */ ++ "addl $8,%0\n\t" /* ftmp += 2 */ ++ "decl %1\n\t" /* --c */ ++ "jnz 1b\n\t" ++ "movq %%mm2,(%3)\n\t" /* pq = mm0 */ ++ "femms\n\t" ++ : "+r" (ftmp), "+r" (c) : "r" (ww), "r" (pq) : "memory" ); ++#else + do{ + q*=ftmp[0]-w; + p*=ftmp[1]-w; + ftmp+=2; + }while(--c); ++#endif ++#endif + + if(m&1){ + /* odd order filter; slightly assymetric */ +@@ -107,6 +183,9 @@ + curve[i++]*=q; + }while(map[i]==k); + } ++#undef p ++#undef q ++#undef w + vorbis_fpu_restore(fpu); + } + +diff -ur libvorbis-1.0/lib/mdct.c libvorbis-1.0-simd/lib/mdct.c +--- libvorbis-1.0/lib/mdct.c 2002-06-29 00:19:36.000000000 +0200 ++++ libvorbis-1.0-simd/lib/mdct.c 2003-01-26 07:09:54.000000000 +0100 +@@ -271,41 +271,158 @@ + REG_TYPE r0; + REG_TYPE r1; + ++#if DATA_TYPE != float ++#undef simd_3dn ++#endif ++ + do{ + ++#ifdef simd_3dn ++ static unsigned long negxor[2]={0x80000000,0}; ++ asm ( /* mm0 = (r0,r1) */ ++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */ ++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */ ++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */ ++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */ ++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */ ++ "movq %%mm2,%0\n\t" ++ ++ "movq %2,%%mm1\n\t" ++ "movq %%mm0,%%mm2\n\t" ++ "pfmul %%mm1,%%mm0\n\t" ++ "pxor %3,%%mm2\n\t" ++#ifdef simd_sse ++ "pshufw $0x4e,%%mm1,%%mm1\n\t" ++#else ++ "movq %%mm1,%%mm3\n\t" ++ "psllq $32,%%mm3\n\t" ++ "psrlq $32,%%mm1\n\t" ++ "por %%mm3,%%mm1\n\t" ++#endif ++ "pfmul %%mm2,%%mm1\n\t" ++ "pfacc %%mm1,%%mm0\n\t" ++ "movq %%mm0,%1\n\t" ++ "femms\n\t" ++ : : "m" (x1[6]), "m" (x2[6]), "m" (T[0]), "m" (negxor[0]) : "memory"); ++#else + r0 = x1[6] - x2[6]; + r1 = x1[7] - x2[7]; + x1[6] += x2[6]; + x1[7] += x2[7]; + x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]); ++#endif + + T+=trigint; + ++#ifdef simd_3dn ++ asm ( /* mm0 = (r0,r1) */ ++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */ ++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */ ++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */ ++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */ ++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */ ++ "movq %%mm2,%0\n\t" ++ ++ "movq %2,%%mm1\n\t" ++ "movq %%mm0,%%mm2\n\t" ++ "pfmul %%mm1,%%mm0\n\t" ++ "pxor %3,%%mm2\n\t" ++#ifdef simd_sse ++ "pshufw $0x4e,%%mm1,%%mm1\n\t" ++#else ++ "movq %%mm1,%%mm3\n\t" ++ "psllq $32,%%mm3\n\t" ++ "psrlq $32,%%mm1\n\t" ++ "por %%mm3,%%mm1\n\t" ++#endif ++ "pfmul %%mm2,%%mm1\n\t" ++ "pfacc %%mm1,%%mm0\n\t" ++ "movq %%mm0,%1\n\t" ++ "femms\n\t" ++ : : "m" (x1[4]), "m" (x2[4]), "m" (T[0]), "m" (negxor[0]) : "memory"); ++#else + r0 = x1[4] - x2[4]; + r1 = x1[5] - x2[5]; + x1[4] += x2[4]; + x1[5] += x2[5]; + x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]); ++#endif + + T+=trigint; + ++#ifdef simd_3dn ++ asm ( /* mm0 = (r0,r1) */ ++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */ ++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */ ++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */ ++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */ ++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */ ++ "movq %%mm2,%0\n\t" ++ ++ "movq %2,%%mm1\n\t" ++ "movq %%mm0,%%mm2\n\t" ++ "pfmul %%mm1,%%mm0\n\t" ++ "pxor %3,%%mm2\n\t" ++#ifdef simd_sse ++ "pshufw $0x4e,%%mm1,%%mm1\n\t" ++#else ++ "movq %%mm1,%%mm3\n\t" ++ "psllq $32,%%mm3\n\t" ++ "psrlq $32,%%mm1\n\t" ++ "por %%mm3,%%mm1\n\t" ++#endif ++ "pfmul %%mm2,%%mm1\n\t" ++ "pfacc %%mm1,%%mm0\n\t" ++ "movq %%mm0,%1\n\t" ++ "femms\n\t" ++ : : "m" (x1[2]), "m" (x2[2]), "m" (T[0]), "m" (negxor[0]) : "memory"); ++#else + r0 = x1[2] - x2[2]; + r1 = x1[3] - x2[3]; + x1[2] += x2[2]; + x1[3] += x2[3]; + x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]); ++#endif + + T+=trigint; + ++#ifdef simd_3dn ++ asm ( /* mm0 = (r0,r1) */ ++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */ ++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */ ++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */ ++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */ ++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */ ++ "movq %%mm2,%0\n\t" ++ ++ "movq %2,%%mm1\n\t" ++ "movq %%mm0,%%mm2\n\t" ++ "pfmul %%mm1,%%mm0\n\t" ++ "pxor %3,%%mm2\n\t" ++#ifdef simd_sse ++ "pshufw $0x4e,%%mm1,%%mm1\n\t" ++#else ++ "movq %%mm1,%%mm3\n\t" ++ "psllq $32,%%mm3\n\t" ++ "psrlq $32,%%mm1\n\t" ++ "por %%mm3,%%mm1\n\t" ++#endif ++ "pfmul %%mm2,%%mm1\n\t" ++ "pfacc %%mm1,%%mm0\n\t" ++ "movq %%mm0,%1\n\t" ++ "femms\n\t" ++ : : "m" (x1[0]), "m" (x2[0]), "m" (T[0]), "m" (negxor[0]) : "memory"); ++#else + r0 = x1[0] - x2[0]; + r1 = x1[1] - x2[1]; + x1[0] += x2[0]; + x1[1] += x2[1]; + x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]); + x2[1] = MULT_NORM(r1 * T[0] - r0 * T[1]); ++#endif + + T+=trigint; + x1-=8; +diff -ur libvorbis-1.0/lib/psy.c libvorbis-1.0-simd/lib/psy.c +--- libvorbis-1.0/lib/psy.c 2002-07-13 12:18:33.000000000 +0200 ++++ libvorbis-1.0-simd/lib/psy.c 2003-04-17 20:50:13.000000000 +0200 +@@ -29,9 +29,25 @@ + #include "scales.h" + #include "misc.h" + ++#ifdef __SSE__ ++#include <mmintrin.h> ++#include <xmmintrin.h> ++#endif ++ + #define NEGINF -9999.f + static double stereo_threshholds[]={0.0, .5, 1.0, 1.5, 2.5, 4.5, 8.5, 16.5, 9e10}; + ++#ifdef __SSE__ ++static void* align(void* x) { ++ long l=(long)x; ++ int r=l&0xf; ++ if (r) ++ return (void*)(l+16-(l&0xf)); ++ else ++ return x; ++} ++#endif ++ + vorbis_look_psy_global *_vp_global_look(vorbis_info *vi){ + codec_setup_info *ci=vi->codec_setup; + vorbis_info_psy_global *gi=&ci->psy_g_param; +@@ -530,17 +546,31 @@ + + } + ++#ifdef __SSE__ ++__m128 tmp __attribute__((aligned(16))); ++#endif ++ + static void bark_noise_hybridmp(int n,const long *b, + const float *f, + float *noise, + const float offset, + const int fixed){ + ++#ifdef __SSE__ ++ /* we need to be 16-bytes aligned for SSE */ ++ /* so we can be at most 16-4=12 bytes off, allocate three more floats */ ++ float *N=align(alloca((n+4)*sizeof(*N))); ++ float *X=align(alloca((n+4)*sizeof(*N))); ++ float *XX=align(alloca((n+4)*sizeof(*N))); ++ float *Y=align(alloca((n+4)*sizeof(*N))); ++ float *XY=align(alloca((n+4)*sizeof(*N))); ++#else + float *N=alloca((n+1)*sizeof(*N)); + float *X=alloca((n+1)*sizeof(*N)); + float *XX=alloca((n+1)*sizeof(*N)); + float *Y=alloca((n+1)*sizeof(*N)); + float *XY=alloca((n+1)*sizeof(*N)); ++#endif + + float tN, tX, tXX, tY, tXY; + float fi; +@@ -548,9 +578,131 @@ + + int lo, hi; + float R, A, B, D; ++#ifdef __SSE__ ++ register __m128 offset4=_mm_set_ps1(offset); ++#endif + + tN = tX = tXX = tY = tXY = 0.f; ++#ifdef __SSE__ ++ for (i=0, fi=0.f; i+4<n; i+=4, fi+=4.f) { ++ static float __attribute__((aligned(16))) c1111[4]={1.f,1.f,1.f,1.f}; ++ static float __attribute__((aligned(16))) c0123[4]={0.f,1.f,2.f,3.f}; ++ register __m128 xxxx = _mm_add_ps(_mm_set_ps1(fi),*(__m128*)c0123); ++ /* turns out that f may be unaligned and there is nothing I can do ++ * about it */ ++ register __m128 yyyy=_mm_max_ps(*(__m128*)c1111,_mm_add_ps(_mm_loadu_ps((float*)f+i),offset4)); ++ register __m128 wwww=_mm_mul_ps(yyyy,yyyy); ++ /* now it gets hairy */ ++ ++ float* a=(float*)&tmp; ++ ++#ifdef OLD_AND_WORKING ++ tmp=wwww; ++ N[i]=tN; ++ N[i+1]=(tN+=a[0]); ++ N[i+2]=(tN+=a[1]); ++ N[i+3]=(tN+=a[2]); ++ tN+=a[3]; ++ ++ tmp=_mm_mul_ps(wwww,xxxx); ++ X[i]=tX; ++ X[i+1]=(tX+=a[0]); ++ X[i+2]=(tX+=a[1]); ++ X[i+3]=(tX+=a[2]); ++ tX+=a[3]; ++ ++ tmp=_mm_mul_ps(tmp,xxxx); ++ XX[i]=tXX; ++ XX[i+1]=(tXX+=a[0]); ++ XX[i+2]=(tXX+=a[1]); ++ XX[i+3]=(tXX+=a[2]); ++ tXX+=a[3]; ++ ++ tmp=_mm_mul_ps(wwww,yyyy); ++ Y[i]=tY; ++ Y[i+1]=(tY+=a[0]); ++ Y[i+2]=(tY+=a[1]); ++ Y[i+3]=(tY+=a[2]); ++ tY+=a[3]; ++ ++ tmp=_mm_mul_ps(tmp,xxxx); ++ XY[i]=tXY; ++ XY[i+1]=(tXY+=a[0]); ++ XY[i+2]=(tXY+=a[1]); ++ XY[i+3]=(tXY+=a[2]); ++ tXY+=a[3]; ++#else ++ { ++ register __m128 o=_mm_set_ss(tN); ++ register __m128 x=wwww; ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1))); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2))); ++ _mm_storeu_ps(N+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3))); ++ _mm_store_ss(&tN,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3)))); ++ ++ o=_mm_set_ss(tX); x=_mm_mul_ps(wwww,xxxx); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1))); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2))); ++ _mm_storeu_ps(X+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3))); ++ _mm_store_ss(&tX,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3)))); ++ ++ o=_mm_set_ss(tXX); x=_mm_mul_ps(x,xxxx); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1))); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2))); ++ _mm_storeu_ps(XX+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3))); ++ _mm_store_ss(&tXX,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3)))); ++ ++ o=_mm_set_ss(tY); x=_mm_mul_ps(wwww,yyyy); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1))); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2))); ++ _mm_storeu_ps(Y+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3))); ++ _mm_store_ss(&tY,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3)))); ++ ++ o=_mm_set_ss(tXY); x=_mm_mul_ps(x,xxxx); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1))); ++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2))); ++ _mm_storeu_ps(XY+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3))); ++ _mm_store_ss(&tXY,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3)))); ++ ++ } ++#endif ++ ++#if 0 ++ N[i] = tN; ++ N[i+1] = tN+w[0]; ++ N[i+2] = tN+w[0]+w[1]; ++ N[i+3] = tN+w[0]+w[1]+w[2]; ++ ++ X[i] = tX; ++ X[i+1] = tX+w[0]*x[0]; ++ X[i+2] = tX+w[0]*x[0]+w[1]*x[1]; ++ X[i+3] = tX+w[0]*x[0]+w[1]*x[1]+w[2]*x[2]; ++ ++ Y[i] = tY; ++ Y[i+1] = tY+w[0]*y[0]; ++ Y[i+2] = tY+w[0]*y[0]+w[1]*y[1]; ++ Y[i+3] = tY+w[0]*y[0]+w[1]*y[1]+w[2]*y[2]; ++ ++ XX[i] = tXX; ++ XX[i+1] = tXX+w[0]*x[0]*x[0]; ++ XX[i+2] = tXX+w[0]*x[0]*x[0]+w[1]*x[1]*x[1]; ++ XX[i+3] = tXX+w[0]*x[0]*x[0]+w[1]*x[1]*x[1]+w[2]*x[2]*x[2]; ++ ++ XY[i] = tXY; ++ XY[i+1] = tXY+w[0]*x[0]*y[0]; ++ XY[i+2] = tXY+w[0]*x[0]*y[0]+w[1]*x[1]*y[1]; ++ XY[i+3] = tXY+w[0]*x[0]*y[0]+w[1]*x[1]*y[1]+w[2]*x[2]*y[2]; ++#endif ++ } ++ for (; i < n; i++, fi += 1.f) { ++#else + for (i = 0, fi = 0.f; i < n; i++, fi += 1.f) { ++#endif + float w, x, y; + + x = fi; +@@ -597,6 +749,7 @@ + } + + for ( ; hi < n; i++, fi += 1.f) { ++ /* TODO hotspot */ + + lo = b[i] >> 16; + hi = b[i] & 0xffff; +@@ -644,6 +797,7 @@ + if (R > 0.f && R - offset < noise[i]) noise[i] = R - offset; + } + for ( ; hi < n; i++, fi += 1.f) { ++ /* TODO hotspot */ + + hi = i + fixed / 2; + lo = hi - fixed; +@@ -744,6 +898,7 @@ + + if(sliding_lowpass>n)sliding_lowpass=n; + ++ /* TODO hotspot */ + for(i=0;i<sliding_lowpass;i++){ + residue[i]= + mdct[i]*FLOOR1_fromdB_INV_LOOKUP[codedflr[i]]; +@@ -792,6 +947,7 @@ + } + #endif + ++ /* TODO hotspot */ + for(i=0;i<n;i++){ + int dB=logmask[i]+.5; + if(dB>=NOISE_COMPAND_LEVELS)dB=NOISE_COMPAND_LEVELS-1; +@@ -816,7 +972,28 @@ + specified att) */ + if(att<p->vi->ath_maxatt)att=p->vi->ath_maxatt; + ++ /* TODO hotspot */ ++#ifdef __SSE__ ++/* this optimization does not make a difference for me */ ++ { ++ register __m128 att4=_mm_set_ps1(att); ++#if 0 ++ for (i=0;i+4<n && (long)(p->ath+i)&0xf;++i) ++ logmask[i]=p->ath[i]+att; ++#endif ++ for (i=0;i+16<n;i+=16) { ++ _mm_storeu_ps(logmask+i,_mm_add_ps(_mm_loadu_ps(p->ath+i),att4)); ++ _mm_storeu_ps(logmask+i+4,_mm_add_ps(_mm_loadu_ps(p->ath+i+4),att4)); ++ _mm_storeu_ps(logmask+i+8,_mm_add_ps(_mm_loadu_ps(p->ath+i+8),att4)); ++ _mm_storeu_ps(logmask+i+12,_mm_add_ps(_mm_loadu_ps(p->ath+i+12),att4)); ++ } ++ for (;i+4<n;i+=4) ++ _mm_storeu_ps(logmask+i,_mm_add_ps(_mm_loadu_ps(p->ath+i),att4)); ++ } ++ for(;i<n;i++) ++#else + for(i=0;i<n;i++) ++#endif + logmask[i]=p->ath[i]+att; + + /* tone masking */ +@@ -833,7 +1010,21 @@ + int i,n=p->n; + float toneatt=p->vi->tone_masteratt[offset_select]; + ++ /* TODO hotspot */ ++#ifdef __SSE__ ++ register float* no=p->noiseoffset[offset_select]; ++ register __m128 toneatt4=_mm_set_ps1(toneatt); ++ register __m128 noisemax4=_mm_set_ps1(p->vi->noisemaxsupp); ++ for(i=0;i+4<n;i+=4) { ++ register __m128 x=_mm_add_ps(_mm_loadu_ps(noise+i),_mm_loadu_ps(no+i)); ++ x=_mm_min_ps(x,noisemax4); ++ x=_mm_max_ps(x,_mm_add_ps(_mm_loadu_ps(tone+i),toneatt4)); ++ _mm_storeu_ps(logmask+i,x); ++ } ++ for(;i<n;i++){ ++#else + for(i=0;i<n;i++){ ++#endif + float val= noise[i]+p->noiseoffset[offset_select][i]; + if(val>p->vi->noisemaxsupp)val=p->vi->noisemaxsupp; + logmask[i]=max(val,tone[i]+toneatt); +@@ -883,6 +1074,7 @@ + -0.159093, -0.175146, -0.192286, -0.210490, + -0.229718, -0.249913, -0.271001, -0.292893}; + ++/* minor hotspot */ + static void precomputed_couple_point(float premag, + int floorA,int floorB, + float *mag, float *ang){ +@@ -949,6 +1141,7 @@ + } + + /* this is for per-channel noise normalization */ ++/* TODO hotspot */ + static int apsort(const void *a, const void *b){ + if(fabs(**(float **)a)>fabs(**(float **)b))return -1; + return 1; +@@ -972,6 +1165,8 @@ + for(j=0;j<n;j+=partition){ + for(k=0;k<partition;k++)work[k]=mags[i]+k+j; + qsort(work,partition,sizeof(*work),apsort); ++ /* TODO hotspot */ ++ /* the obvious MMX version is not faster */ + for(k=0;k<partition;k++)ret[i][k+j]=work[k]-mags[i]; + } + } +@@ -1008,8 +1203,26 @@ + if(start>n)start=n; + + if(vi->normal_channel_p){ ++ /* TODO hotspot */ ++#ifdef __SSE__DISABLED ++ /* this optimization does not make a difference */ ++ /* also, it appears to distort the results :( */ ++ register __m128 half4=_mm_set1_ps(0.5f); ++ for(;j+4<start;j+=4) { ++ register __m128 x=_mm_add_ps(_mm_loadu_ps(in+j),half4); ++ register __m64 y=_mm_cvtps_pi32(x); ++ register __m64 z=_mm_cvtps_pi32(_mm_shuffle_ps(x,x,0x1b)); ++ x=_mm_cvtpi32_ps(x,z); ++ x=_mm_cvtpi32_ps(_mm_shuffle_ps(x,x,0x1b),y); ++ _mm_storeu_ps(out+j,x); ++ } ++ _mm_empty(); ++ for(;j<start;j++) ++ out[j]=rint(in[j]); ++#else + for(;j<start;j++) + out[j]=rint(in[j]); ++#endif + + for(;j+partition<=n;j+=partition){ + float acc=0.; +@@ -1018,6 +1231,7 @@ + for(i=j;i<j+partition;i++) + acc+=in[i]*in[i]; + ++ /* TODO hotspot */ + for(i=0;i<partition;i++){ + k=sortedindex[i+j-start]; + +@@ -1032,6 +1246,7 @@ + } + } + ++ /* TODO hotspot */ + for(;i<partition;i++){ + k=sortedindex[i+j-start]; + out[k]=0.; +@@ -1093,6 +1308,7 @@ + for(j=0;j<p->n;j+=partition){ + float acc=0.f; + ++ /* TODO hotspot */ + for(k=0;k<partition;k++){ + int l=k+j; + +@@ -1116,6 +1332,7 @@ + } + + if(p->vi->normal_point_p){ ++ /* TODO minor hotspot */ + for(k=0;k<partition && acc>=p->vi->normal_thresh;k++){ + int l=mag_sort[i][j+k]; + if(l<sliding_lowpass && l>=pointlimit && rint(qM[l])==0.f){ +diff -ur libvorbis-1.0/lib/vorbisfile.c libvorbis-1.0-simd/lib/vorbisfile.c +--- libvorbis-1.0/lib/vorbisfile.c 2002-07-06 06:20:03.000000000 +0200 ++++ libvorbis-1.0-simd/lib/vorbisfile.c 2003-04-26 21:55:49.000000000 +0200 +@@ -21,12 +21,19 @@ + #include <string.h> + #include <math.h> + ++#include <assert.h> ++ + #include "vorbis/codec.h" + #include "vorbis/vorbisfile.h" + + #include "os.h" + #include "misc.h" + ++#ifdef __SSE__ ++#include <xmmintrin.h> ++#include <mmintrin.h> ++#endif ++ + /* A 'chained bitstream' is a Vorbis bitstream that contains more than + one logical bitstream arranged end to end (the only form of Ogg + multiplexing allowed in a Vorbis bitstream; grouping [parallel +@@ -1500,7 +1507,72 @@ + + if(host_endian==bigendianp){ + if(sgned){ +- ++#ifdef __SSE__ ++ /* Oh no! On my box, exactly one of pcm[0][j] and pcm[1][j] ++ * is always misaligned! SSE required 16-byte alignment. ++ * Also, the pcm[][] array layout is bad for vectorizing, ++ * but we can fix that with mmx unpack magic for the common ++ * case where channels==2. */ ++ if (channels==2 && samples>8) { ++ register __m128 scale=_mm_set1_ps(32768.f); ++ for (j=0; j+8<samples; j+=8) { ++ register __m128 x=_mm_loadu_ps(pcm[0]+j); ++ register __m128 y=_mm_loadu_ps(pcm[1]+j); ++ ++ *(__m64*)buffer=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale)); ++ *(__m64*)(buffer+8)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale)); ++ ++ x=_mm_loadu_ps(pcm[0]+j+4); ++ y=_mm_loadu_ps(pcm[1]+j+4); ++ ++ *(__m64*)(buffer+16)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale)); ++ *(__m64*)(buffer+24)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale)); ++ ++ buffer+=32; ++ } ++ for (; j+4<samples; j+=4) { ++ register __m128 x=_mm_loadu_ps(pcm[0]+j); ++ register __m128 y=_mm_loadu_ps(pcm[1]+j); ++ ++ *(__m64*)buffer=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale)); ++ *(__m64*)(buffer+8)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale)); ++ ++ buffer+=16; ++ } ++ _mm_empty(); ++ ++ for (; j<samples; ++j) { ++ val=vorbis_ftoi(pcm[0][j]*32768.f); ++ if(val>32767)val=32767; ++ else if(val<-32768)val=-32768; ++ *(short*)buffer=val; ++ val=vorbis_ftoi(pcm[1][j]*32768.f); ++ if(val>32767)val=32767; ++ else if(val<-32768)val=-32768; ++ *(short*)(buffer+2)=val; ++ buffer+=4; ++ } ++ } else { ++#elif defined(simd_3dn) ++ if (channels==2) { ++ static float scale[2]={32768.f,32768.f}; ++ for(j=0;j<samples;j+=2) { ++ asm("movq %0,%%mm0\n\t" /* (pcm[0][j],pcm[0][j+1]) */ ++ "pfmul %1,%%mm0\n\t" ++ "pf2id %%mm0,%%mm0\n\t" ++ "packssdw %%mm0,%%mm0\n\t" ++ "movq %3,%%mm1\n\t" /* (pcm[1][j],pcm[1][j+1]) */ ++ "pfmul %1,%%mm1\n\t" ++ "pf2id %%mm1,%%mm1\n\t" ++ "packssdw %%mm1,%%mm1\n\t" ++ "punpckhwd %%mm1,%%mm0\n\t" ++ "movq %%mm0,(%2)\n\t" ++ : : "m" (pcm[0][j]), "m" (scale), "r" (buffer), "m" (pcm[1][j]) ); ++ buffer += 8; ++ } ++ asm volatile("femms\n\t"); ++ } else { ++#endif + vorbis_fpu_setround(&fpu); + for(i=0;i<channels;i++) { /* It's faster in this order */ + float *src=pcm[i]; +@@ -1514,6 +1586,9 @@ + } + } + vorbis_fpu_restore(fpu); ++#if defined(__SSE__) || defined(simd_3dn) ++ } ++#endif + + }else{ + +@@ -1548,6 +1623,7 @@ + + }else{ + int val; ++ write(1,"a",1); + vorbis_fpu_setround(&fpu); + for(j=0;j<samples;j++) + for(i=0;i<channels;i++){ +diff -ur libvorbis-1.0/lib/window.c libvorbis-1.0-simd/lib/window.c +--- libvorbis-1.0/lib/window.c 2002-03-23 04:17:34.000000000 +0100 ++++ libvorbis-1.0-simd/lib/window.c 2003-02-08 02:53:04.000000000 +0100 +@@ -17,9 +17,14 @@ + + #include <stdlib.h> + #include <math.h> ++#include <stdio.h> + #include "os.h" + #include "misc.h" + ++#ifdef __SSE__ ++#include <xmmintrin.h> ++#endif ++ + float *_vorbis_window(int type, int left){ + float *ret=_ogg_calloc(left,sizeof(*ret)); + int i; +@@ -67,8 +72,68 @@ + for(i=0;i<leftbegin;i++) + d[i]=0.f; + ++#ifdef __SSE__ ++// printf("%p %p\n",d+i,window[lW]); ++ /* alignment issues. Again. Who saw that one coming? :-/ */ ++ p=0; ++ if (leftend-i>8) { /* make it worth our while */ ++ unsigned long _d=(long)(d+i)&15; ++ unsigned long _win=(long)(window[lW])&15; ++ register __m128* D,* WIN; ++ if (_d) { ++ d[i]*=window[lW][p]; ++ d[i+1]*=window[lW][p+1]; ++ i+=2; p+=2; ++ } ++ D=(__m128*)(d+i); ++ WIN=(__m128*)(window[lW]+p); ++ if (_d ^ _win) { ++ /* one is properly aligned, the other is not */ ++ register __m128 a; ++ WIN=(__m128*)((float*)WIN-2); ++ a=WIN[0]; ++ for(;i+7<leftend;i+=8,p+=8) { ++ /* now we need the upper most floats from a and the lower ++ * most floats from b */ ++ register __m128 b=WIN[1]; ++ *D=_mm_mul_ps(_mm_movehl_ps(_mm_shuffle_ps(b,b,_MM_SHUFFLE(1,0,3,2)),a),*D); ++ a=WIN[2]; ++ D[1]=_mm_mul_ps(_mm_movehl_ps(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)),b),D[1]); ++ D+=2; WIN+=2; ++ } ++ } else { ++ for(;i+3<leftend;i+=4,p+=4) { ++ *D=_mm_mul_ps(*D,*WIN); ++ ++D; ++WIN; ++// d[i]*=window[lW][p]; ++ } ++ } ++ } ++ for(;i<leftend;i++,p++) ++ d[i]*=window[lW][p]; ++#elif defined(simd_3dn) ++ if ((i&1) || ((leftend-i)&1)) { /* d[i] is unaligned */ ++ for(p=0;i<leftend;i++,p++) ++ d[i]*=window[lW][p]; ++ } else { ++ asm("orl %0,%0\n\t" ++ "jbe 2f\n\t" ++ "1: movq (%1),%%mm0\n\t" ++ "pfmul (%2),%%mm0\n\t" ++ "addl $8,%2\n\t" ++ "movq %%mm0,(%1)\n\t" ++ "addl $8,%1\n\t" ++ "decl %0\n\t" ++ "jz 2f\n\t" ++ "jmp 1b\n\t" ++ "2: femms\n\t" ++ : : "r" ((leftend-i)/2), "r" (&d[i]), "r" (&window[lW][0]) : "memory" ); ++ i=leftend; ++ } ++#else + for(p=0;i<leftend;i++,p++) + d[i]*=window[lW][p]; ++#endif + + for(i=rightbegin,p=rn/2-1;i<rightend;i++,p--) + d[i]*=window[nW][p]; + diff --git a/media-libs/libvorbis/libvorbis-1.0-r2.ebuild b/media-libs/libvorbis/libvorbis-1.0-r2.ebuild index ce958dcd9958..8827670acd33 100644 --- a/media-libs/libvorbis/libvorbis-1.0-r2.ebuild +++ b/media-libs/libvorbis/libvorbis-1.0-r2.ebuild @@ -1,6 +1,6 @@ # Copyright 1999-2003 Gentoo Technologies, Inc. # Distributed under the terms of the GNU General Public License v2 -# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r2.ebuild,v 1.3 2003/07/18 21:55:45 tester Exp $ +# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r2.ebuild,v 1.4 2003/08/05 09:58:07 jje Exp $ inherit libtool eutils @@ -19,7 +19,7 @@ src_unpack() { unpack ${A} cd ${S} - epatch ${FILESDIR}/${PF}-m4.patch || die "Patching failed" + epatch ${FILESDIR}/${PN}-m4.patch || die "Patching failed" # Fix a gcc crash. With the new atexit patch to gcc, it # seems it do not handle -mno-ieee-fp too well. cp configure configure.orig diff --git a/media-libs/libvorbis/libvorbis-1.0-r1.ebuild b/media-libs/libvorbis/libvorbis-1.0-r3.ebuild index 0c573a249fcc..397c1c4573da 100644 --- a/media-libs/libvorbis/libvorbis-1.0-r1.ebuild +++ b/media-libs/libvorbis/libvorbis-1.0-r3.ebuild @@ -1,25 +1,30 @@ # Copyright 1999-2003 Gentoo Technologies, Inc. # Distributed under the terms of the GNU General Public License v2 -# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r1.ebuild,v 1.7 2003/02/13 12:51:32 vapier Exp $ +# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r3.ebuild,v 1.1 2003/08/05 09:58:07 jje Exp $ -inherit libtool +inherit libtool eutils S=${WORKDIR}/${P} DESCRIPTION="the Ogg Vorbis sound file format library" SRC_URI="http://fatpipe.vorbis.com/files/1.0/unix/${P}.tar.gz" HOMEPAGE="http://www.xiph.org/ogg/vorbis/index.html" +IUSE="sse" DEPEND=">=media-libs/libogg-1.0" SLOT="0" LICENSE="as-is" -KEYWORDS="x86 ppc sparc alpha" +KEYWORDS="~x86" src_unpack() { unpack ${A} cd ${S} - patch -p1 < ${FILESDIR}/${P}-m4.patch || die "Patching failed" + if [ `use x86` ] ; then + use sse && epatch ${FILESDIR}/${PN}-simd.patch + fi + + epatch ${FILESDIR}/${PN}-m4.patch || die "Patching failed" # Fix a gcc crash. With the new atexit patch to gcc, it # seems it do not handle -mno-ieee-fp too well. cp configure configure.orig @@ -30,7 +35,7 @@ src_unpack() { src_compile() { elibtoolize - export CFLAGS="${CFLAGS/-march=*/}" + #export CFLAGS="${CFLAGS/-march=*/}" ./configure --prefix=/usr \ --host=${CHOST} || die @@ -41,6 +46,9 @@ src_compile() { src_install () { make DESTDIR=${D} install || die + dosym /usr/lib/libvorbisfile.so.3.0.0 /usr/lib/libvorbisfile.so.0 + dosym /usr/lib/libvorbisenc.so.2.0.0 /usr/lib/libvorbisenc.so.0 + echo "Removing docs installed by make install" rm -rf ${D}/usr/share/doc @@ -60,3 +68,4 @@ pkg_postinst() { einfo "recompilation is needed for these things." einfo } + diff --git a/media-libs/libvorbis/libvorbis-1.0.ebuild b/media-libs/libvorbis/libvorbis-1.0.ebuild deleted file mode 100644 index 0114be22ee2b..000000000000 --- a/media-libs/libvorbis/libvorbis-1.0.ebuild +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 1999-2003 Gentoo Technologies, Inc. -# Distributed under the terms of the GNU General Public License v2 -# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0.ebuild,v 1.6 2003/02/13 12:51:36 vapier Exp $ - -inherit libtool - -S=${WORKDIR}/${P} -DESCRIPTION="the Ogg Vorbis sound file format library" -SRC_URI="http://fatpipe.vorbis.com/files/1.0/unix/${P}.tar.gz" -HOMEPAGE="http://www.xiph.org/ogg/vorbis/index.html" - -DEPEND=">=media-libs/libogg-1.0" - -SLOT="0" -LICENSE="as-is" -KEYWORDS="x86 ppc sparc " - -src_unpack() { - unpack ${A} - - cd ${S} - # Fix a gcc crash. With the new atexit patch to gcc, it - # seems it do not handle -mno-ieee-fp too well. - cp configure configure.orig - sed -e "s:-mno-ieee-fp::g" \ - configure.orig >configure -} - -src_compile() { - elibtoolize - - export CFLAGS="${CFLAGS/-march=*/}" - - ./configure --prefix=/usr \ - --host=${CHOST} || die - - emake || die -} - -src_install () { - make DESTDIR=${D} install || die - - echo "Removing docs installed by make install" - rm -rf ${D}/usr/share/doc - - dodoc AUTHORS COPYING README todo.txt - docinto txt - dodoc doc/*.txt - dohtml -r doc -} - -pkg_postinst() { - einfo - einfo "Note the 1.0 version of libvorbis has been installed" - einfo "Applications that used pre-1.0 vorbis libraries will" - einfo "need to be recompiled for the new version." - einfo "Now that the vorbis folks have finalized the API" - einfo "this should be the last time for a while that" - einfo "recompilation is needed for these things." - einfo -} |