summary refs log tree commit diff
diff options
context:
space:
mode:
author John J. Ellis <jje@gentoo.org> 2003-08-05 09:58:13 +0000
committer John J. Ellis <jje@gentoo.org> 2003-08-05 09:58:13 +0000
commit a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8 (patch)
tree 1b38a01b0d32c0587f20de9b531b03b23cc424fc /media-libs/libvorbis
parent Added simd patch (use sse) ~x86 masked. Closes #21585. (diff)
download historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.tar.gz
historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.tar.bz2
historical-a1dd813f6bfc0ab7f3cb64ed4081c914c39803c8.zip
Added simd patch (use sse) ~x86 masked. Closes #21585.
Diffstat (limited to 'media-libs/libvorbis')
-rw-r--r-- media-libs/libvorbis/Manifest 14
-rw-r--r-- media-libs/libvorbis/files/digest-libvorbis-1.0-r1 1
-rw-r--r-- media-libs/libvorbis/files/digest-libvorbis-1.0-r3 (renamed from media-libs/libvorbis/files/digest-libvorbis-1.0) 0
-rw-r--r-- media-libs/libvorbis/files/libvorbis-1.0-m4.patch 31
-rw-r--r-- media-libs/libvorbis/files/libvorbis-m4.patch (renamed from media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch) 0
-rw-r--r-- media-libs/libvorbis/files/libvorbis-simd.patch 1019
-rw-r--r-- media-libs/libvorbis/libvorbis-1.0-r2.ebuild 4
-rw-r--r-- media-libs/libvorbis/libvorbis-1.0-r3.ebuild (renamed from media-libs/libvorbis/libvorbis-1.0-r1.ebuild) 19
-rw-r--r-- media-libs/libvorbis/libvorbis-1.0.ebuild 61
9 files changed, 1039 insertions, 110 deletions
diff --git a/media-libs/libvorbis/Manifest b/media-libs/libvorbis/Manifest
index 3aefcf7106cc..370f1349411c 100644
--- a/media-libs/libvorbis/Manifest
+++ b/media-libs/libvorbis/Manifest
@@ -1,13 +1,7 @@
-MD5 13ba2657dde7f84a0389ee0f31fd2523 ChangeLog 2029
-MD5 e9a0d9e24916cc5c07ae807bf644b5e8 libvorbis-1.0-r1.ebuild 1562
-MD5 547a992279bd48bc8c0ec30ba5a276f2 libvorbis-1.0.ebuild 1490
-MD5 41684381f21d3559c14320860478658c libvorbis-1.0-r2.ebuild 1710
-MD5 5603100e019622c95077292668929b6e libvorbis-1.0-r3.ebuild 1777
-MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0 65
-MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r1 65
-MD5 b8048905ed8638913c7b6302fdc3eb4b files/libvorbis-1.0-m4.patch 1032
+MD5 3df66b52d2f66cad749719b268ee3dce ChangeLog 2321
+MD5 073b0e5bb9e3c2061a1fa745f4130320 libvorbis-1.0-r2.ebuild 1707
+MD5 caf792187e22d3bcdee52fa25bf4283b libvorbis-1.0-r3.ebuild 1773
MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r2 65
-MD5 069e26021a32d3d105c544229e071e5f files/libvorbis-1.0-r2-m4.patch 919
MD5 0a0d3872bdf2c6da4e3fe5471fd3eb16 files/digest-libvorbis-1.0-r3 65
MD5 174ab9630810bce8aac9eac4f4f20247 files/libvorbis-simd.patch 28887
-MD5 b8048905ed8638913c7b6302fdc3eb4b files/libvorbis-m4.patch 1032
+MD5 069e26021a32d3d105c544229e071e5f files/libvorbis-m4.patch 919
diff --git a/media-libs/libvorbis/files/digest-libvorbis-1.0-r1 b/media-libs/libvorbis/files/digest-libvorbis-1.0-r1
deleted file mode 100644
index 6c800c478006..000000000000
--- a/media-libs/libvorbis/files/digest-libvorbis-1.0-r1
+++ /dev/null
@@ -1 +0,0 @@
-MD5 d1ad94fe8e240269c790e18992171e53 libvorbis-1.0.tar.gz 749064
diff --git a/media-libs/libvorbis/files/digest-libvorbis-1.0 b/media-libs/libvorbis/files/digest-libvorbis-1.0-r3
index 6c800c478006..6c800c478006 100644
--- a/media-libs/libvorbis/files/digest-libvorbis-1.0
+++ b/media-libs/libvorbis/files/digest-libvorbis-1.0-r3
diff --git a/media-libs/libvorbis/files/libvorbis-1.0-m4.patch b/media-libs/libvorbis/files/libvorbis-1.0-m4.patch
deleted file mode 100644
index 43828e527c6a..000000000000
--- a/media-libs/libvorbis/files/libvorbis-1.0-m4.patch
+++ /dev/null
@@ -1,31 +0,0 @@
-diff -u -r libvorbis-1.0/vorbis.m4 libvorbis-cvs/vorbis.m4
---- libvorbis-1.0/vorbis.m4 2002-07-09 23:08:57.000000000 +1000
-+++ libvorbis-cvs/vorbis.m4 2002-08-07 02:01:40.000000000 +1000
-@@ -54,6 +54,7 @@
- #include <stdlib.h>
- #include <string.h>
- #include <vorbis/codec.h>
-+#include <vorbis/vorbisenc.h>
-
- int main ()
- {
-@@ -62,7 +63,7 @@
- vorbis_info vi;
-
- vorbis_info_init (&vi);
-- vorbis_encode_init (&vi, 2, 44100, -1, 128, -1);
-+ vorbis_encode_init (&vi, 2, 44100, -1, 128000, -1);
- vorbis_analysis_init (&vd, &vi);
- vorbis_block_init (&vd, &vb);
- /* this function was added in 1.0rc3, so this is what we're testing for */
-@@ -86,8 +87,8 @@
- :
- else
- echo "*** Could not run Vorbis test program, checking why..."
-- CFLAGS="$CFLAGS $VORBIS_CFLAGS"
-- LIBS="$LIBS $VORBIS_LIBS $OGG_LIBS"
-+ CFLAGS="$CFLAGS $VORBIS_CFLAGS $OGG_CFLAGS"
-+ LIBS="$LIBS $VORBIS_LIBS $VORBISENC_LIBS $OGG_LIBS"
- AC_TRY_LINK([
- #include <stdio.h>
- #include <vorbis/codec.h>
diff --git a/media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch b/media-libs/libvorbis/files/libvorbis-m4.patch
index 55a82bc98411..55a82bc98411 100644
--- a/media-libs/libvorbis/files/libvorbis-1.0-r2-m4.patch
+++ b/media-libs/libvorbis/files/libvorbis-m4.patch
diff --git a/media-libs/libvorbis/files/libvorbis-simd.patch b/media-libs/libvorbis/files/libvorbis-simd.patch
new file mode 100644
index 000000000000..85f1d1aef7be
--- /dev/null
+++ b/media-libs/libvorbis/files/libvorbis-simd.patch
@@ -0,0 +1,1019 @@
+diff -ur libvorbis-1.0/lib/block.c libvorbis-1.0-simd/lib/block.c
+--- libvorbis-1.0/lib/block.c 2002-07-11 08:40:48.000000000 +0200
++++ libvorbis-1.0-simd/lib/block.c 2003-04-26 20:32:07.000000000 +0200
+@@ -22,6 +22,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <ogg/ogg.h>
++#include <assert.h>
+ #include "vorbis/codec.h"
+ #include "codec_internal.h"
+
+@@ -31,6 +32,10 @@
+ #include "registry.h"
+ #include "misc.h"
+
++#ifdef __SSE__
++#include <xmmintrin.h>
++#endif
++
+ static int ilog2(unsigned int v){
+ int ret=0;
+ if(v)--v;
+@@ -701,11 +706,95 @@
+ /* the overlap/add section */
+ if(v->lW){
+ if(v->W){
++#ifdef __SSE__
++ /* large/large */
++ float *pcm=v->pcm[j]+prevCenter;
++ float *p=vb->pcm[j];
++ unsigned long _pcm=(unsigned long)pcm&15;
++ unsigned long _p=(unsigned long)p&15;
++ register __m128* PCM,* P;
++ i=0;
++ /* n1 is always 1024, prevCenter is either 0 or 1024 */
++ /* both pcm and p can be unaligned, and they usually are.
++ * This code assumes unaligned addresses are still 8-byte
++ * aligned (which is true because glibc's malloc does 8-byte
++ * alignment */
++ if (_pcm) {
++ pcm[0]+=p[0]; pcm[1]+=p[1];
++ i=2;
++ PCM=(__m128*)(pcm+2);
++ P=(__m128*)(p+2);
++ } else {
++ PCM=(__m128*)pcm;
++ P=(__m128*)p;
++ }
++ if (_pcm ^ _p) {
++ /* one is properly aligned, the other is not */
++ register __m128 a;
++ P=(__m128*)((float*)P-2);
++ a=P[0];
++ for (; i+7<n1; i+=8) {
++ register __m128 b=P[1];
++ /* now we need the upper most floats from a and the lower
++ * most floats from b */
++ *PCM=_mm_add_ps(_mm_movehl_ps(_mm_shuffle_ps(b,b,_MM_SHUFFLE(1,0,3,2)),a),*PCM);
++ a=P[2];
++ PCM[1]=_mm_add_ps(_mm_movehl_ps(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)),b),PCM[1]);
++ PCM+=2; P+=2;
++ }
++ } else {
++ for (; i+3<n1; i+=4) {
++ *PCM=_mm_add_ps(*P,*PCM);
++ ++PCM; ++P;
++ }
++ }
++ /* strange, does not appear to happen */
++ for(;i<n1;i++)
++ pcm[i]+=p[i];
++#elif defined(simd_3dn)
++ assert((n1&1)==0);
++ asm volatile("jecxz 2f\n\t"
++ "1: "
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "pfadd (%2),%%mm0\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "pfadd (%2),%%mm0\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "pfadd (%2),%%mm0\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "pfadd (%2),%%mm0\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jz 2f\n\t"
++ "jmp 1b\n\t"
++ "2: femms\n\t"
++ : : "c" (n1),
++ "r" ((float*)(vb->pcm[j])),
++ "r" ((float*)(v->pcm[j]+prevCenter)) : "memory" );
++#else
+ /* large/large */
+ float *pcm=v->pcm[j]+prevCenter;
+ float *p=vb->pcm[j];
+ for(i=0;i<n1;i++)
+ pcm[i]+=p[i];
++#endif
+ }else{
+ /* large/small */
+ float *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
+@@ -732,12 +821,60 @@
+ }
+
+ /* the copy section */
++#ifdef simd_3dn
++ assert((n&1)==0);
++ asm volatile("jecxz 2f\n\t"
++ "testl $1,%%ecx\n\t"
++ "jz 1f\n\t"
++ "3: movl (%1),%%eax\n\t"
++ "addl $4,%1\n\t"
++ "movl %%eax,(%2)\n\t"
++ "addl $4,%2\n\t"
++ "decl %%ecx\n\t"
++ "jz 2f\n\t"
++ "1: "
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jc 3b\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jc 3b\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jc 3b\n\t"
++ "jz 2f\n\t"
++ "movq (%1),%%mm0\n\t"
++ "addl $8,%1\n\t"
++ "movq %%mm0,(%2)\n\t"
++ "addl $8,%2\n\t"
++ "subl $2,%%ecx\n\t"
++ "jc 3b\n\t"
++ "jz 2f\n\t"
++ "jmp 1b\n\t"
++ "2: femms\n\t"
++ : : "c" (n),
++ "r" ((float*)(vb->pcm[j]+n)),
++ "r" ((float*)(v->pcm[j]+thisCenter))
++ : "%eax", "memory" );
++#else
+ {
+ float *pcm=v->pcm[j]+thisCenter;
+ float *p=vb->pcm[j]+n;
+ for(i=0;i<n;i++)
+ pcm[i]=p[i];
+ }
++#endif
+ }
+
+ if(v->centerW)
+diff -ur libvorbis-1.0/lib/lsp.c libvorbis-1.0-simd/lib/lsp.c
+--- libvorbis-1.0/lib/lsp.c 2002-07-17 23:28:37.000000000 +0200
++++ libvorbis-1.0-simd/lib/lsp.c 2003-04-26 20:32:27.000000000 +0200
+@@ -54,7 +54,12 @@
+ #define FLOAT_LOOKUP
+ #undef INT_LOOKUP
+
++#ifdef __SSE__
++#include <xmmintrin.h>
++#endif
++
+ #ifdef FLOAT_LOOKUP
++
+ #include "lookup.c" /* catch this in the build system; we #include for
+ compilers (like gcc) that can't inline across
+ modules */
+@@ -73,17 +78,88 @@
+ while(i<n){
+ int k=map[i];
+ int qexp;
++#ifdef __SSE__
++ register __m128 pqpq;
++ static float __attribute__((aligned(16))) PQPQ[4];
++ register __m128 wwww;
++ float pq[2];
++#define p pq[1]
++#define q pq[0]
++#else
++#ifdef simd_3dn
++ float pq[2],ww[2];
++#define p pq[1]
++#define q pq[0]
++#define w ww[0]
++#else
+ float p=.7071067812f;
+ float q=.7071067812f;
+ float w=vorbis_coslook(wdel*k);
++#endif
++#endif
+ float *ftmp=lsp;
+ int c=m>>1;
+
++#ifdef __SSE__
++ static float __attribute__((aligned(16))) w;
++ w=vorbis_coslook(wdel*k);
++ PQPQ[0]=PQPQ[1]=.7071067812f;
++ if ((((long)ftmp)&15)==8) {
++ PQPQ[2]=ftmp[0]-w;
++ PQPQ[3]=ftmp[1]-w;
++ --c;
++ ftmp+=2;
++ } else {
++ PQPQ[2]=PQPQ[3]=1.f;
++ }
++ pqpq=*(__m128*)&(PQPQ[0]);
++ wwww=_mm_load1_ps(&w);
++#define UNROLL
++#ifdef UNROLL
++ while (c>3) {
++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww));
++ ftmp+=4;
++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww));
++ ftmp+=4;
++ c-=4;
++ }
++#endif
++ while (c>1) {
++ pqpq=_mm_mul_ps(pqpq,_mm_sub_ps(*(__m128*)ftmp,wwww));
++ ftmp+=4;
++ c-=2;
++ }
++ pqpq=_mm_mul_ps(pqpq,_mm_shuffle_ps(pqpq,pqpq,_MM_SHUFFLE(1,0,3,2)));
++ _mm_storel_pi((__m64*)(&(pq[0])),pqpq);
++ if (c) {
++ q*=ftmp[0]-w;
++ p*=ftmp[1]-w;
++ ftmp+=2;
++ };
++#else
++#ifdef simd_3dn
++ pq[0]=pq[1]=.7071067812f;
++ ww[0]=ww[1]=vorbis_coslook(wdel*k);
++
++ asm volatile("movq (%2),%%mm1\n\t" /* mm1 = (w,w) */
++ "movq (%3),%%mm2\n\t" /* mm2 = (p,q) */
++ "1: movq (%0),%%mm0\n\t" /* mm0 = (ftmp[0],ftmp[1]) */
++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (ftmp[0]-w,ftmp[1]-w) */
++ "pfmul %%mm0,%%mm2\n\t" /* mm2 *= (ftmp[0]-w,ftmp[1]-w) */
++ "addl $8,%0\n\t" /* ftmp += 2 */
++ "decl %1\n\t" /* --c */
++ "jnz 1b\n\t"
++ "movq %%mm2,(%3)\n\t" /* pq = mm0 */
++ "femms\n\t"
++ : "+r" (ftmp), "+r" (c) : "r" (ww), "r" (pq) : "memory" );
++#else
+ do{
+ q*=ftmp[0]-w;
+ p*=ftmp[1]-w;
+ ftmp+=2;
+ }while(--c);
++#endif
++#endif
+
+ if(m&1){
+ /* odd order filter; slightly assymetric */
+@@ -107,6 +183,9 @@
+ curve[i++]*=q;
+ }while(map[i]==k);
+ }
++#undef p
++#undef q
++#undef w
+ vorbis_fpu_restore(fpu);
+ }
+
+diff -ur libvorbis-1.0/lib/mdct.c libvorbis-1.0-simd/lib/mdct.c
+--- libvorbis-1.0/lib/mdct.c 2002-06-29 00:19:36.000000000 +0200
++++ libvorbis-1.0-simd/lib/mdct.c 2003-01-26 07:09:54.000000000 +0100
+@@ -271,41 +271,158 @@
+ REG_TYPE r0;
+ REG_TYPE r1;
+
++#if DATA_TYPE != float
++#undef simd_3dn
++#endif
++
+ do{
+
++#ifdef simd_3dn
++ static unsigned long negxor[2]={0x80000000,0};
++ asm ( /* mm0 = (r0,r1) */
++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */
++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */
++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */
++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */
++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */
++ "movq %%mm2,%0\n\t"
++
++ "movq %2,%%mm1\n\t"
++ "movq %%mm0,%%mm2\n\t"
++ "pfmul %%mm1,%%mm0\n\t"
++ "pxor %3,%%mm2\n\t"
++#ifdef simd_sse
++ "pshufw $0x4e,%%mm1,%%mm1\n\t"
++#else
++ "movq %%mm1,%%mm3\n\t"
++ "psllq $32,%%mm3\n\t"
++ "psrlq $32,%%mm1\n\t"
++ "por %%mm3,%%mm1\n\t"
++#endif
++ "pfmul %%mm2,%%mm1\n\t"
++ "pfacc %%mm1,%%mm0\n\t"
++ "movq %%mm0,%1\n\t"
++ "femms\n\t"
++ : : "m" (x1[6]), "m" (x2[6]), "m" (T[0]), "m" (negxor[0]) : "memory");
++#else
+ r0 = x1[6] - x2[6];
+ r1 = x1[7] - x2[7];
+ x1[6] += x2[6];
+ x1[7] += x2[7];
+ x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]);
++#endif
+
+ T+=trigint;
+
++#ifdef simd_3dn
++ asm ( /* mm0 = (r0,r1) */
++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */
++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */
++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */
++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */
++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */
++ "movq %%mm2,%0\n\t"
++
++ "movq %2,%%mm1\n\t"
++ "movq %%mm0,%%mm2\n\t"
++ "pfmul %%mm1,%%mm0\n\t"
++ "pxor %3,%%mm2\n\t"
++#ifdef simd_sse
++ "pshufw $0x4e,%%mm1,%%mm1\n\t"
++#else
++ "movq %%mm1,%%mm3\n\t"
++ "psllq $32,%%mm3\n\t"
++ "psrlq $32,%%mm1\n\t"
++ "por %%mm3,%%mm1\n\t"
++#endif
++ "pfmul %%mm2,%%mm1\n\t"
++ "pfacc %%mm1,%%mm0\n\t"
++ "movq %%mm0,%1\n\t"
++ "femms\n\t"
++ : : "m" (x1[4]), "m" (x2[4]), "m" (T[0]), "m" (negxor[0]) : "memory");
++#else
+ r0 = x1[4] - x2[4];
+ r1 = x1[5] - x2[5];
+ x1[4] += x2[4];
+ x1[5] += x2[5];
+ x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]);
++#endif
+
+ T+=trigint;
+
++#ifdef simd_3dn
++ asm ( /* mm0 = (r0,r1) */
++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */
++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */
++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */
++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */
++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */
++ "movq %%mm2,%0\n\t"
++
++ "movq %2,%%mm1\n\t"
++ "movq %%mm0,%%mm2\n\t"
++ "pfmul %%mm1,%%mm0\n\t"
++ "pxor %3,%%mm2\n\t"
++#ifdef simd_sse
++ "pshufw $0x4e,%%mm1,%%mm1\n\t"
++#else
++ "movq %%mm1,%%mm3\n\t"
++ "psllq $32,%%mm3\n\t"
++ "psrlq $32,%%mm1\n\t"
++ "por %%mm3,%%mm1\n\t"
++#endif
++ "pfmul %%mm2,%%mm1\n\t"
++ "pfacc %%mm1,%%mm0\n\t"
++ "movq %%mm0,%1\n\t"
++ "femms\n\t"
++ : : "m" (x1[2]), "m" (x2[2]), "m" (T[0]), "m" (negxor[0]) : "memory");
++#else
+ r0 = x1[2] - x2[2];
+ r1 = x1[3] - x2[3];
+ x1[2] += x2[2];
+ x1[3] += x2[3];
+ x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]);
++#endif
+
+ T+=trigint;
+
++#ifdef simd_3dn
++ asm ( /* mm0 = (r0,r1) */
++ "movq %0,%%mm0\n\t" /* mm0 = (x1[6],x1[7]) */
++ "movq %1,%%mm1\n\t" /* mm1 = (x2[6],x2[7]) */
++ "movq %%mm0,%%mm2\n\t" /* mm2 = (x1[6],x1[7]) */
++ "pfsub %%mm1,%%mm0\n\t" /* mm0 = (x1[6]-x2[6],x1[7]-x2[7]) */
++ "pfadd %%mm1,%%mm2\n\t" /* mm2 = (x1[6]+x2[6],x1[7]+x2[7]) */
++ "movq %%mm2,%0\n\t"
++
++ "movq %2,%%mm1\n\t"
++ "movq %%mm0,%%mm2\n\t"
++ "pfmul %%mm1,%%mm0\n\t"
++ "pxor %3,%%mm2\n\t"
++#ifdef simd_sse
++ "pshufw $0x4e,%%mm1,%%mm1\n\t"
++#else
++ "movq %%mm1,%%mm3\n\t"
++ "psllq $32,%%mm3\n\t"
++ "psrlq $32,%%mm1\n\t"
++ "por %%mm3,%%mm1\n\t"
++#endif
++ "pfmul %%mm2,%%mm1\n\t"
++ "pfacc %%mm1,%%mm0\n\t"
++ "movq %%mm0,%1\n\t"
++ "femms\n\t"
++ : : "m" (x1[0]), "m" (x2[0]), "m" (T[0]), "m" (negxor[0]) : "memory");
++#else
+ r0 = x1[0] - x2[0];
+ r1 = x1[1] - x2[1];
+ x1[0] += x2[0];
+ x1[1] += x2[1];
+ x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[1] = MULT_NORM(r1 * T[0] - r0 * T[1]);
++#endif
+
+ T+=trigint;
+ x1-=8;
+diff -ur libvorbis-1.0/lib/psy.c libvorbis-1.0-simd/lib/psy.c
+--- libvorbis-1.0/lib/psy.c 2002-07-13 12:18:33.000000000 +0200
++++ libvorbis-1.0-simd/lib/psy.c 2003-04-17 20:50:13.000000000 +0200
+@@ -29,9 +29,25 @@
+ #include "scales.h"
+ #include "misc.h"
+
++#ifdef __SSE__
++#include <mmintrin.h>
++#include <xmmintrin.h>
++#endif
++
+ #define NEGINF -9999.f
+ static double stereo_threshholds[]={0.0, .5, 1.0, 1.5, 2.5, 4.5, 8.5, 16.5, 9e10};
+
++#ifdef __SSE__
++static void* align(void* x) {
++ long l=(long)x;
++ int r=l&0xf;
++ if (r)
++ return (void*)(l+16-(l&0xf));
++ else
++ return x;
++}
++#endif
++
+ vorbis_look_psy_global *_vp_global_look(vorbis_info *vi){
+ codec_setup_info *ci=vi->codec_setup;
+ vorbis_info_psy_global *gi=&ci->psy_g_param;
+@@ -530,17 +546,31 @@
+
+ }
+
++#ifdef __SSE__
++__m128 tmp __attribute__((aligned(16)));
++#endif
++
+ static void bark_noise_hybridmp(int n,const long *b,
+ const float *f,
+ float *noise,
+ const float offset,
+ const int fixed){
+
++#ifdef __SSE__
++ /* we need to be 16-bytes aligned for SSE */
++ /* so we can be at most 16-4=12 bytes off, allocate three more floats */
++ float *N=align(alloca((n+4)*sizeof(*N)));
++ float *X=align(alloca((n+4)*sizeof(*N)));
++ float *XX=align(alloca((n+4)*sizeof(*N)));
++ float *Y=align(alloca((n+4)*sizeof(*N)));
++ float *XY=align(alloca((n+4)*sizeof(*N)));
++#else
+ float *N=alloca((n+1)*sizeof(*N));
+ float *X=alloca((n+1)*sizeof(*N));
+ float *XX=alloca((n+1)*sizeof(*N));
+ float *Y=alloca((n+1)*sizeof(*N));
+ float *XY=alloca((n+1)*sizeof(*N));
++#endif
+
+ float tN, tX, tXX, tY, tXY;
+ float fi;
+@@ -548,9 +578,131 @@
+
+ int lo, hi;
+ float R, A, B, D;
++#ifdef __SSE__
++ register __m128 offset4=_mm_set_ps1(offset);
++#endif
+
+ tN = tX = tXX = tY = tXY = 0.f;
++#ifdef __SSE__
++ for (i=0, fi=0.f; i+4<n; i+=4, fi+=4.f) {
++ static float __attribute__((aligned(16))) c1111[4]={1.f,1.f,1.f,1.f};
++ static float __attribute__((aligned(16))) c0123[4]={0.f,1.f,2.f,3.f};
++ register __m128 xxxx = _mm_add_ps(_mm_set_ps1(fi),*(__m128*)c0123);
++ /* turns out that f may be unaligned and there is nothing I can do
++ * about it */
++ register __m128 yyyy=_mm_max_ps(*(__m128*)c1111,_mm_add_ps(_mm_loadu_ps((float*)f+i),offset4));
++ register __m128 wwww=_mm_mul_ps(yyyy,yyyy);
++ /* now it gets hairy */
++
++ float* a=(float*)&tmp;
++
++#ifdef OLD_AND_WORKING
++ tmp=wwww;
++ N[i]=tN;
++ N[i+1]=(tN+=a[0]);
++ N[i+2]=(tN+=a[1]);
++ N[i+3]=(tN+=a[2]);
++ tN+=a[3];
++
++ tmp=_mm_mul_ps(wwww,xxxx);
++ X[i]=tX;
++ X[i+1]=(tX+=a[0]);
++ X[i+2]=(tX+=a[1]);
++ X[i+3]=(tX+=a[2]);
++ tX+=a[3];
++
++ tmp=_mm_mul_ps(tmp,xxxx);
++ XX[i]=tXX;
++ XX[i+1]=(tXX+=a[0]);
++ XX[i+2]=(tXX+=a[1]);
++ XX[i+3]=(tXX+=a[2]);
++ tXX+=a[3];
++
++ tmp=_mm_mul_ps(wwww,yyyy);
++ Y[i]=tY;
++ Y[i+1]=(tY+=a[0]);
++ Y[i+2]=(tY+=a[1]);
++ Y[i+3]=(tY+=a[2]);
++ tY+=a[3];
++
++ tmp=_mm_mul_ps(tmp,xxxx);
++ XY[i]=tXY;
++ XY[i+1]=(tXY+=a[0]);
++ XY[i+2]=(tXY+=a[1]);
++ XY[i+3]=(tXY+=a[2]);
++ tXY+=a[3];
++#else
++ {
++ register __m128 o=_mm_set_ss(tN);
++ register __m128 x=wwww;
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1)));
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2)));
++ _mm_storeu_ps(N+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3)));
++ _mm_store_ss(&tN,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3))));
++
++ o=_mm_set_ss(tX); x=_mm_mul_ps(wwww,xxxx);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1)));
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2)));
++ _mm_storeu_ps(X+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3)));
++ _mm_store_ss(&tX,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3))));
++
++ o=_mm_set_ss(tXX); x=_mm_mul_ps(x,xxxx);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1)));
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2)));
++ _mm_storeu_ps(XX+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3)));
++ _mm_store_ss(&tXX,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3))));
++
++ o=_mm_set_ss(tY); x=_mm_mul_ps(wwww,yyyy);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1)));
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2)));
++ _mm_storeu_ps(Y+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3)));
++ _mm_store_ss(&tY,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3))));
++
++ o=_mm_set_ss(tXY); x=_mm_mul_ps(x,xxxx);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),x);
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,1)));
++ o=_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,2)));
++ _mm_storeu_ps(XY+i,_mm_shuffle_ps(o,o,_MM_SHUFFLE(0,1,2,3)));
++ _mm_store_ss(&tXY,_mm_add_ss(_mm_shuffle_ps(o,o,_MM_SHUFFLE(2,1,0,0)),_mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,3))));
++
++ }
++#endif
++
++#if 0
++ N[i] = tN;
++ N[i+1] = tN+w[0];
++ N[i+2] = tN+w[0]+w[1];
++ N[i+3] = tN+w[0]+w[1]+w[2];
++
++ X[i] = tX;
++ X[i+1] = tX+w[0]*x[0];
++ X[i+2] = tX+w[0]*x[0]+w[1]*x[1];
++ X[i+3] = tX+w[0]*x[0]+w[1]*x[1]+w[2]*x[2];
++
++ Y[i] = tY;
++ Y[i+1] = tY+w[0]*y[0];
++ Y[i+2] = tY+w[0]*y[0]+w[1]*y[1];
++ Y[i+3] = tY+w[0]*y[0]+w[1]*y[1]+w[2]*y[2];
++
++ XX[i] = tXX;
++ XX[i+1] = tXX+w[0]*x[0]*x[0];
++ XX[i+2] = tXX+w[0]*x[0]*x[0]+w[1]*x[1]*x[1];
++ XX[i+3] = tXX+w[0]*x[0]*x[0]+w[1]*x[1]*x[1]+w[2]*x[2]*x[2];
++
++ XY[i] = tXY;
++ XY[i+1] = tXY+w[0]*x[0]*y[0];
++ XY[i+2] = tXY+w[0]*x[0]*y[0]+w[1]*x[1]*y[1];
++ XY[i+3] = tXY+w[0]*x[0]*y[0]+w[1]*x[1]*y[1]+w[2]*x[2]*y[2];
++#endif
++ }
++ for (; i < n; i++, fi += 1.f) {
++#else
+ for (i = 0, fi = 0.f; i < n; i++, fi += 1.f) {
++#endif
+ float w, x, y;
+
+ x = fi;
+@@ -597,6 +749,7 @@
+ }
+
+ for ( ; hi < n; i++, fi += 1.f) {
++ /* TODO hotspot */
+
+ lo = b[i] >> 16;
+ hi = b[i] & 0xffff;
+@@ -644,6 +797,7 @@
+ if (R > 0.f && R - offset < noise[i]) noise[i] = R - offset;
+ }
+ for ( ; hi < n; i++, fi += 1.f) {
++ /* TODO hotspot */
+
+ hi = i + fixed / 2;
+ lo = hi - fixed;
+@@ -744,6 +898,7 @@
+
+ if(sliding_lowpass>n)sliding_lowpass=n;
+
++ /* TODO hotspot */
+ for(i=0;i<sliding_lowpass;i++){
+ residue[i]=
+ mdct[i]*FLOOR1_fromdB_INV_LOOKUP[codedflr[i]];
+@@ -792,6 +947,7 @@
+ }
+ #endif
+
++ /* TODO hotspot */
+ for(i=0;i<n;i++){
+ int dB=logmask[i]+.5;
+ if(dB>=NOISE_COMPAND_LEVELS)dB=NOISE_COMPAND_LEVELS-1;
+@@ -816,7 +972,28 @@
+ specified att) */
+ if(att<p->vi->ath_maxatt)att=p->vi->ath_maxatt;
+
++ /* TODO hotspot */
++#ifdef __SSE__
++/* this optimization does not make a difference for me */
++ {
++ register __m128 att4=_mm_set_ps1(att);
++#if 0
++ for (i=0;i+4<n && (long)(p->ath+i)&0xf;++i)
++ logmask[i]=p->ath[i]+att;
++#endif
++ for (i=0;i+16<n;i+=16) {
++ _mm_storeu_ps(logmask+i,_mm_add_ps(_mm_loadu_ps(p->ath+i),att4));
++ _mm_storeu_ps(logmask+i+4,_mm_add_ps(_mm_loadu_ps(p->ath+i+4),att4));
++ _mm_storeu_ps(logmask+i+8,_mm_add_ps(_mm_loadu_ps(p->ath+i+8),att4));
++ _mm_storeu_ps(logmask+i+12,_mm_add_ps(_mm_loadu_ps(p->ath+i+12),att4));
++ }
++ for (;i+4<n;i+=4)
++ _mm_storeu_ps(logmask+i,_mm_add_ps(_mm_loadu_ps(p->ath+i),att4));
++ }
++ for(;i<n;i++)
++#else
+ for(i=0;i<n;i++)
++#endif
+ logmask[i]=p->ath[i]+att;
+
+ /* tone masking */
+@@ -833,7 +1010,21 @@
+ int i,n=p->n;
+ float toneatt=p->vi->tone_masteratt[offset_select];
+
++ /* TODO hotspot */
++#ifdef __SSE__
++ register float* no=p->noiseoffset[offset_select];
++ register __m128 toneatt4=_mm_set_ps1(toneatt);
++ register __m128 noisemax4=_mm_set_ps1(p->vi->noisemaxsupp);
++ for(i=0;i+4<n;i+=4) {
++ register __m128 x=_mm_add_ps(_mm_loadu_ps(noise+i),_mm_loadu_ps(no+i));
++ x=_mm_min_ps(x,noisemax4);
++ x=_mm_max_ps(x,_mm_add_ps(_mm_loadu_ps(tone+i),toneatt4));
++ _mm_storeu_ps(logmask+i,x);
++ }
++ for(;i<n;i++){
++#else
+ for(i=0;i<n;i++){
++#endif
+ float val= noise[i]+p->noiseoffset[offset_select][i];
+ if(val>p->vi->noisemaxsupp)val=p->vi->noisemaxsupp;
+ logmask[i]=max(val,tone[i]+toneatt);
+@@ -883,6 +1074,7 @@
+ -0.159093, -0.175146, -0.192286, -0.210490,
+ -0.229718, -0.249913, -0.271001, -0.292893};
+
++/* minor hotspot */
+ static void precomputed_couple_point(float premag,
+ int floorA,int floorB,
+ float *mag, float *ang){
+@@ -949,6 +1141,7 @@
+ }
+
+ /* this is for per-channel noise normalization */
++/* TODO hotspot */
+ static int apsort(const void *a, const void *b){
+ if(fabs(**(float **)a)>fabs(**(float **)b))return -1;
+ return 1;
+@@ -972,6 +1165,8 @@
+ for(j=0;j<n;j+=partition){
+ for(k=0;k<partition;k++)work[k]=mags[i]+k+j;
+ qsort(work,partition,sizeof(*work),apsort);
++ /* TODO hotspot */
++ /* the obvious MMX version is not faster */
+ for(k=0;k<partition;k++)ret[i][k+j]=work[k]-mags[i];
+ }
+ }
+@@ -1008,8 +1203,26 @@
+ if(start>n)start=n;
+
+ if(vi->normal_channel_p){
++ /* TODO hotspot */
++#ifdef __SSE__DISABLED
++ /* this optimization does not make a difference */
++ /* also, it appears to distort the results :( */
++ register __m128 half4=_mm_set1_ps(0.5f);
++ for(;j+4<start;j+=4) {
++ register __m128 x=_mm_add_ps(_mm_loadu_ps(in+j),half4);
++ register __m64 y=_mm_cvtps_pi32(x);
++ register __m64 z=_mm_cvtps_pi32(_mm_shuffle_ps(x,x,0x1b));
++ x=_mm_cvtpi32_ps(x,z);
++ x=_mm_cvtpi32_ps(_mm_shuffle_ps(x,x,0x1b),y);
++ _mm_storeu_ps(out+j,x);
++ }
++ _mm_empty();
++ for(;j<start;j++)
++ out[j]=rint(in[j]);
++#else
+ for(;j<start;j++)
+ out[j]=rint(in[j]);
++#endif
+
+ for(;j+partition<=n;j+=partition){
+ float acc=0.;
+@@ -1018,6 +1231,7 @@
+ for(i=j;i<j+partition;i++)
+ acc+=in[i]*in[i];
+
++ /* TODO hotspot */
+ for(i=0;i<partition;i++){
+ k=sortedindex[i+j-start];
+
+@@ -1032,6 +1246,7 @@
+ }
+ }
+
++ /* TODO hotspot */
+ for(;i<partition;i++){
+ k=sortedindex[i+j-start];
+ out[k]=0.;
+@@ -1093,6 +1308,7 @@
+ for(j=0;j<p->n;j+=partition){
+ float acc=0.f;
+
++ /* TODO hotspot */
+ for(k=0;k<partition;k++){
+ int l=k+j;
+
+@@ -1116,6 +1332,7 @@
+ }
+
+ if(p->vi->normal_point_p){
++ /* TODO minor hotspot */
+ for(k=0;k<partition && acc>=p->vi->normal_thresh;k++){
+ int l=mag_sort[i][j+k];
+ if(l<sliding_lowpass && l>=pointlimit && rint(qM[l])==0.f){
+diff -ur libvorbis-1.0/lib/vorbisfile.c libvorbis-1.0-simd/lib/vorbisfile.c
+--- libvorbis-1.0/lib/vorbisfile.c 2002-07-06 06:20:03.000000000 +0200
++++ libvorbis-1.0-simd/lib/vorbisfile.c 2003-04-26 21:55:49.000000000 +0200
+@@ -21,12 +21,19 @@
+ #include <string.h>
+ #include <math.h>
+
++#include <assert.h>
++
+ #include "vorbis/codec.h"
+ #include "vorbis/vorbisfile.h"
+
+ #include "os.h"
+ #include "misc.h"
+
++#ifdef __SSE__
++#include <xmmintrin.h>
++#include <mmintrin.h>
++#endif
++
+ /* A 'chained bitstream' is a Vorbis bitstream that contains more than
+ one logical bitstream arranged end to end (the only form of Ogg
+ multiplexing allowed in a Vorbis bitstream; grouping [parallel
+@@ -1500,7 +1507,72 @@
+
+ if(host_endian==bigendianp){
+ if(sgned){
+-
++#ifdef __SSE__
++ /* Oh no! On my box, exactly one of pcm[0][j] and pcm[1][j]
++ * is always misaligned! SSE required 16-byte alignment.
++ * Also, the pcm[][] array layout is bad for vectorizing,
++ * but we can fix that with mmx unpack magic for the common
++ * case where channels==2. */
++ if (channels==2 && samples>8) {
++ register __m128 scale=_mm_set1_ps(32768.f);
++ for (j=0; j+8<samples; j+=8) {
++ register __m128 x=_mm_loadu_ps(pcm[0]+j);
++ register __m128 y=_mm_loadu_ps(pcm[1]+j);
++
++ *(__m64*)buffer=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale));
++ *(__m64*)(buffer+8)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale));
++
++ x=_mm_loadu_ps(pcm[0]+j+4);
++ y=_mm_loadu_ps(pcm[1]+j+4);
++
++ *(__m64*)(buffer+16)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale));
++ *(__m64*)(buffer+24)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale));
++
++ buffer+=32;
++ }
++ for (; j+4<samples; j+=4) {
++ register __m128 x=_mm_loadu_ps(pcm[0]+j);
++ register __m128 y=_mm_loadu_ps(pcm[1]+j);
++
++ *(__m64*)buffer=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpacklo_ps(x,y),scale));
++ *(__m64*)(buffer+8)=_mm_cvtps_pi16(_mm_mul_ps(_mm_unpackhi_ps(x,y),scale));
++
++ buffer+=16;
++ }
++ _mm_empty();
++
++ for (; j<samples; ++j) {
++ val=vorbis_ftoi(pcm[0][j]*32768.f);
++ if(val>32767)val=32767;
++ else if(val<-32768)val=-32768;
++ *(short*)buffer=val;
++ val=vorbis_ftoi(pcm[1][j]*32768.f);
++ if(val>32767)val=32767;
++ else if(val<-32768)val=-32768;
++ *(short*)(buffer+2)=val;
++ buffer+=4;
++ }
++ } else {
++#elif defined(simd_3dn)
++ if (channels==2) {
++ static float scale[2]={32768.f,32768.f};
++ for(j=0;j<samples;j+=2) {
++ asm("movq %0,%%mm0\n\t" /* (pcm[0][j],pcm[0][j+1]) */
++ "pfmul %1,%%mm0\n\t"
++ "pf2id %%mm0,%%mm0\n\t"
++ "packssdw %%mm0,%%mm0\n\t"
++ "movq %3,%%mm1\n\t" /* (pcm[1][j],pcm[1][j+1]) */
++ "pfmul %1,%%mm1\n\t"
++ "pf2id %%mm1,%%mm1\n\t"
++ "packssdw %%mm1,%%mm1\n\t"
++ "punpckhwd %%mm1,%%mm0\n\t"
++ "movq %%mm0,(%2)\n\t"
++ : : "m" (pcm[0][j]), "m" (scale), "r" (buffer), "m" (pcm[1][j]) );
++ buffer += 8;
++ }
++ asm volatile("femms\n\t");
++ } else {
++#endif
+ vorbis_fpu_setround(&fpu);
+ for(i=0;i<channels;i++) { /* It's faster in this order */
+ float *src=pcm[i];
+@@ -1514,6 +1586,9 @@
+ }
+ }
+ vorbis_fpu_restore(fpu);
++#if defined(__SSE__) || defined(simd_3dn)
++ }
++#endif
+
+ }else{
+
+@@ -1548,6 +1623,7 @@
+
+ }else{
+ int val;
++ write(1,"a",1);
+ vorbis_fpu_setround(&fpu);
+ for(j=0;j<samples;j++)
+ for(i=0;i<channels;i++){
+diff -ur libvorbis-1.0/lib/window.c libvorbis-1.0-simd/lib/window.c
+--- libvorbis-1.0/lib/window.c 2002-03-23 04:17:34.000000000 +0100
++++ libvorbis-1.0-simd/lib/window.c 2003-02-08 02:53:04.000000000 +0100
+@@ -17,9 +17,14 @@
+
+ #include <stdlib.h>
+ #include <math.h>
++#include <stdio.h>
+ #include "os.h"
+ #include "misc.h"
+
++#ifdef __SSE__
++#include <xmmintrin.h>
++#endif
++
+ float *_vorbis_window(int type, int left){
+ float *ret=_ogg_calloc(left,sizeof(*ret));
+ int i;
+@@ -67,8 +72,68 @@
+ for(i=0;i<leftbegin;i++)
+ d[i]=0.f;
+
++#ifdef __SSE__
++// printf("%p %p\n",d+i,window[lW]);
++ /* alignment issues. Again. Who saw that one coming? :-/ */
++ p=0;
++ if (leftend-i>8) { /* make it worth our while */
++ unsigned long _d=(long)(d+i)&15;
++ unsigned long _win=(long)(window[lW])&15;
++ register __m128* D,* WIN;
++ if (_d) {
++ d[i]*=window[lW][p];
++ d[i+1]*=window[lW][p+1];
++ i+=2; p+=2;
++ }
++ D=(__m128*)(d+i);
++ WIN=(__m128*)(window[lW]+p);
++ if (_d ^ _win) {
++ /* one is properly aligned, the other is not */
++ register __m128 a;
++ WIN=(__m128*)((float*)WIN-2);
++ a=WIN[0];
++ for(;i+7<leftend;i+=8,p+=8) {
++ /* now we need the upper most floats from a and the lower
++ * most floats from b */
++ register __m128 b=WIN[1];
++ *D=_mm_mul_ps(_mm_movehl_ps(_mm_shuffle_ps(b,b,_MM_SHUFFLE(1,0,3,2)),a),*D);
++ a=WIN[2];
++ D[1]=_mm_mul_ps(_mm_movehl_ps(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)),b),D[1]);
++ D+=2; WIN+=2;
++ }
++ } else {
++ for(;i+3<leftend;i+=4,p+=4) {
++ *D=_mm_mul_ps(*D,*WIN);
++ ++D; ++WIN;
++// d[i]*=window[lW][p];
++ }
++ }
++ }
++ for(;i<leftend;i++,p++)
++ d[i]*=window[lW][p];
++#elif defined(simd_3dn)
++ if ((i&1) || ((leftend-i)&1)) { /* d[i] is unaligned */
++ for(p=0;i<leftend;i++,p++)
++ d[i]*=window[lW][p];
++ } else {
++ asm("orl %0,%0\n\t"
++ "jbe 2f\n\t"
++ "1: movq (%1),%%mm0\n\t"
++ "pfmul (%2),%%mm0\n\t"
++ "addl $8,%2\n\t"
++ "movq %%mm0,(%1)\n\t"
++ "addl $8,%1\n\t"
++ "decl %0\n\t"
++ "jz 2f\n\t"
++ "jmp 1b\n\t"
++ "2: femms\n\t"
++ : : "r" ((leftend-i)/2), "r" (&d[i]), "r" (&window[lW][0]) : "memory" );
++ i=leftend;
++ }
++#else
+ for(p=0;i<leftend;i++,p++)
+ d[i]*=window[lW][p];
++#endif
+
+ for(i=rightbegin,p=rn/2-1;i<rightend;i++,p--)
+ d[i]*=window[nW][p];
+
diff --git a/media-libs/libvorbis/libvorbis-1.0-r2.ebuild b/media-libs/libvorbis/libvorbis-1.0-r2.ebuild
index ce958dcd9958..8827670acd33 100644
--- a/media-libs/libvorbis/libvorbis-1.0-r2.ebuild
+++ b/media-libs/libvorbis/libvorbis-1.0-r2.ebuild
@@ -1,6 +1,6 @@
# Copyright 1999-2003 Gentoo Technologies, Inc.
# Distributed under the terms of the GNU General Public License v2
-# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r2.ebuild,v 1.3 2003/07/18 21:55:45 tester Exp $
+# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r2.ebuild,v 1.4 2003/08/05 09:58:07 jje Exp $
inherit libtool eutils
@@ -19,7 +19,7 @@ src_unpack() {
unpack ${A}
cd ${S}
- epatch ${FILESDIR}/${PF}-m4.patch || die "Patching failed"
+ epatch ${FILESDIR}/${PN}-m4.patch || die "Patching failed"
# Fix a gcc crash. With the new atexit patch to gcc, it
# seems it do not handle -mno-ieee-fp too well.
cp configure configure.orig
diff --git a/media-libs/libvorbis/libvorbis-1.0-r1.ebuild b/media-libs/libvorbis/libvorbis-1.0-r3.ebuild
index 0c573a249fcc..397c1c4573da 100644
--- a/media-libs/libvorbis/libvorbis-1.0-r1.ebuild
+++ b/media-libs/libvorbis/libvorbis-1.0-r3.ebuild
@@ -1,25 +1,30 @@
# Copyright 1999-2003 Gentoo Technologies, Inc.
# Distributed under the terms of the GNU General Public License v2
-# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r1.ebuild,v 1.7 2003/02/13 12:51:32 vapier Exp $
+# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0-r3.ebuild,v 1.1 2003/08/05 09:58:07 jje Exp $
-inherit libtool
+inherit libtool eutils
S=${WORKDIR}/${P}
DESCRIPTION="the Ogg Vorbis sound file format library"
SRC_URI="http://fatpipe.vorbis.com/files/1.0/unix/${P}.tar.gz"
HOMEPAGE="http://www.xiph.org/ogg/vorbis/index.html"
+IUSE="sse"
DEPEND=">=media-libs/libogg-1.0"
SLOT="0"
LICENSE="as-is"
-KEYWORDS="x86 ppc sparc alpha"
+KEYWORDS="~x86"
src_unpack() {
unpack ${A}
cd ${S}
- patch -p1 < ${FILESDIR}/${P}-m4.patch || die "Patching failed"
+ if [ `use x86` ] ; then
+ use sse && epatch ${FILESDIR}/${PN}-simd.patch
+ fi
+
+ epatch ${FILESDIR}/${PN}-m4.patch || die "Patching failed"
# Fix a gcc crash. With the new atexit patch to gcc, it
# seems it do not handle -mno-ieee-fp too well.
cp configure configure.orig
@@ -30,7 +35,7 @@ src_unpack() {
src_compile() {
elibtoolize
- export CFLAGS="${CFLAGS/-march=*/}"
+ #export CFLAGS="${CFLAGS/-march=*/}"
./configure --prefix=/usr \
--host=${CHOST} || die
@@ -41,6 +46,9 @@ src_compile() {
src_install () {
make DESTDIR=${D} install || die
+ dosym /usr/lib/libvorbisfile.so.3.0.0 /usr/lib/libvorbisfile.so.0
+ dosym /usr/lib/libvorbisenc.so.2.0.0 /usr/lib/libvorbisenc.so.0
+
echo "Removing docs installed by make install"
rm -rf ${D}/usr/share/doc
@@ -60,3 +68,4 @@ pkg_postinst() {
einfo "recompilation is needed for these things."
einfo
}
+
diff --git a/media-libs/libvorbis/libvorbis-1.0.ebuild b/media-libs/libvorbis/libvorbis-1.0.ebuild
deleted file mode 100644
index 0114be22ee2b..000000000000
--- a/media-libs/libvorbis/libvorbis-1.0.ebuild
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 1999-2003 Gentoo Technologies, Inc.
-# Distributed under the terms of the GNU General Public License v2
-# $Header: /var/cvsroot/gentoo-x86/media-libs/libvorbis/libvorbis-1.0.ebuild,v 1.6 2003/02/13 12:51:36 vapier Exp $
-
-inherit libtool
-
-S=${WORKDIR}/${P}
-DESCRIPTION="the Ogg Vorbis sound file format library"
-SRC_URI="http://fatpipe.vorbis.com/files/1.0/unix/${P}.tar.gz"
-HOMEPAGE="http://www.xiph.org/ogg/vorbis/index.html"
-
-DEPEND=">=media-libs/libogg-1.0"
-
-SLOT="0"
-LICENSE="as-is"
-KEYWORDS="x86 ppc sparc "
-
-src_unpack() {
- unpack ${A}
-
- cd ${S}
- # Fix a gcc crash. With the new atexit patch to gcc, it
- # seems it do not handle -mno-ieee-fp too well.
- cp configure configure.orig
- sed -e "s:-mno-ieee-fp::g" \
- configure.orig >configure
-}
-
-src_compile() {
- elibtoolize
-
- export CFLAGS="${CFLAGS/-march=*/}"
-
- ./configure --prefix=/usr \
- --host=${CHOST} || die
-
- emake || die
-}
-
-src_install () {
- make DESTDIR=${D} install || die
-
- echo "Removing docs installed by make install"
- rm -rf ${D}/usr/share/doc
-
- dodoc AUTHORS COPYING README todo.txt
- docinto txt
- dodoc doc/*.txt
- dohtml -r doc
-}
-
-pkg_postinst() {
- einfo
- einfo "Note the 1.0 version of libvorbis has been installed"
- einfo "Applications that used pre-1.0 vorbis libraries will"
- einfo "need to be recompiled for the new version."
- einfo "Now that the vorbis folks have finalized the API"
- einfo "this should be the last time for a while that"
- einfo "recompilation is needed for these things."
- einfo
-}