/*
 * FFT/MDCT transform with 3DNow! optimizations
 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil.h"

#include "x86_cpu.h"



/* 64-bit XOR mask with only the sign bit of the upper 32-bit lane set:
 * applied to a packed pair of floats it negates the upper one and leaves
 * the lower one untouched.  ff_fft_calc_3dn() loads it when s->inverse is 0.
 * The bit is built with an unsigned shift because `1 << 31` on a 32-bit int
 * shifts into the sign bit, which is undefined behaviour; the conversion
 * back to int keeps the 0x80000000 bit pattern on GCC-compatible compilers. */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };


/* 64-bit XOR mask with only the sign bit of the lower 32-bit lane set:
 * applied to a packed pair of floats it negates the lower one and leaves
 * the upper one untouched.  ff_fft_calc_3dn() loads it when s->inverse is
 * non-zero.
 * The bit is built with an unsigned shift because `1 << 31` on a 32-bit int
 * shifts into the sign bit, which is undefined behaviour; the conversion
 * back to int keeps the 0x80000000 bit pattern on GCC-compatible compilers. */
static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };


31 | ```
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
``` |
{
33 | ```
int ln = s->nbits;
``` |
long j;
long j;
``` |

x86_reg i;
||

long nblocks, nloops;
long nblocks, nloops;
``` |

FFTComplex *p, *cptr;


asm volatile(
40 | ```
/* FEMMS is not a must here but recommended by AMD */
``` |
"femms \n\t"
"femms \n\t"
``` |
42 | ```
"movq %0, %%mm7 \n\t"
``` |
43 | ```
::"m"(*(s->inverse ? m1p1 : p1m1))
``` |
);
||



i = 8 << ln;
i = 8 << ln;
``` |
asm volatile(
||

"1: \n\t"
"1: \n\t"
``` |
49 | ```
"sub $32, %0 \n\t"
``` |
50 | ```
"movq (%0,%1), %%mm0 \n\t"
``` |
51 | ```
"movq 16(%0,%1), %%mm1 \n\t"
``` |
52 | ```
"movq 8(%0,%1), %%mm2 \n\t"
``` |
53 | ```
"movq 24(%0,%1), %%mm3 \n\t"
``` |
54 | ```
"movq %%mm0, %%mm4 \n\t"
``` |
55 | ```
"movq %%mm1, %%mm5 \n\t"
``` |
56 | ```
"pfadd %%mm2, %%mm0 \n\t"
``` |
57 | ```
"pfadd %%mm3, %%mm1 \n\t"
``` |
58 | ```
"pfsub %%mm2, %%mm4 \n\t"
``` |
59 | ```
"pfsub %%mm3, %%mm5 \n\t"
``` |
60 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
61 | ```
"punpckldq %%mm5, %%mm6 \n\t"
``` |
62 | ```
"punpckhdq %%mm6, %%mm5 \n\t"
``` |
63 | ```
"movq %%mm4, %%mm3 \n\t"
``` |
64 | ```
"pxor %%mm7, %%mm5 \n\t"
``` |
65 | ```
"pfadd %%mm1, %%mm0 \n\t"
``` |
66 | ```
"pfadd %%mm5, %%mm4 \n\t"
``` |
67 | ```
"pfsub %%mm1, %%mm2 \n\t"
``` |
68 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
69 | ```
"movq %%mm0, (%0,%1) \n\t"
``` |
70 | ```
"movq %%mm4, 8(%0,%1) \n\t"
``` |
71 | ```
"movq %%mm2, 16(%0,%1) \n\t"
``` |
72 | ```
"movq %%mm3, 24(%0,%1) \n\t"
``` |
"jg 1b \n\t"
"jg 1b \n\t"
``` |
:"+r"(i)
:"+r"(i)
``` |
:"r"(z)
:"r"(z)
``` |
);
/* pass 2 .. ln-1 */
/* pass 2 .. ln-1 */
``` |

nblocks = 1 << (ln-3);

nloops = 1 << 2;

cptr = s->exptab1;

do {
do {
``` |

p = z;
j = nblocks;
||

do {
do {
``` |
i = nloops*8;
i = nloops*8;
``` |

asm volatile(
"1: \n\t"
"1: \n\t"
``` |
89 | ```
"sub $16, %0 \n\t"
``` |
90 | ```
"movq (%1,%0), %%mm0 \n\t"
``` |
91 | ```
"movq 8(%1,%0), %%mm1 \n\t"
``` |
92 | ```
"movq (%2,%0), %%mm2 \n\t"
``` |
93 | ```
"movq 8(%2,%0), %%mm3 \n\t"
``` |
94 | ```
"movq %%mm2, %%mm4 \n\t"
``` |
95 | ```
"movq %%mm3, %%mm5 \n\t"
``` |
96 | ```
"punpckldq %%mm2, %%mm2 \n\t"
``` |
97 | ```
"punpckldq %%mm3, %%mm3 \n\t"
``` |
98 | ```
"punpckhdq %%mm4, %%mm4 \n\t"
``` |
99 | ```
"punpckhdq %%mm5, %%mm5 \n\t"
``` |
"pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re
101 | ```
"pfmul 8(%3,%0,2), %%mm3 \n\t"
``` |
"pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
103 | ```
"pfmul 24(%3,%0,2), %%mm5 \n\t"
``` |
"pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
105 | ```
"pfadd %%mm3, %%mm5 \n\t"
``` |
106 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
107 | ```
"movq %%mm1, %%mm3 \n\t"
``` |
108 | ```
"pfadd %%mm4, %%mm0 \n\t"
``` |
109 | ```
"pfadd %%mm5, %%mm1 \n\t"
``` |
110 | ```
"pfsub %%mm4, %%mm2 \n\t"
``` |
111 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
112 | ```
"movq %%mm0, (%1,%0) \n\t"
``` |
113 | ```
"movq %%mm1, 8(%1,%0) \n\t"
``` |
114 | ```
"movq %%mm2, (%2,%0) \n\t"
``` |
115 | ```
"movq %%mm3, 8(%2,%0) \n\t"
``` |
"jg 1b \n\t"
"jg 1b \n\t"
``` |
:"+r"(i)
:"+r"(i)
``` |
:"r"(p), "r"(p + nloops), "r"(cptr)
);
120 | ```
p += nloops*2;
``` |
} while (--j);
} while (--j);
``` |

cptr += nloops*2;
cptr += nloops*2;
``` |

123 | ```
nblocks >>= 1;
``` |
nloops <<= 1;
nloops <<= 1;
``` |
} while (nblocks != 0);

asm volatile("femms");

}