## ffmpeg / libavcodec / i386 / fft_3dn.c @ 40d0e665

History | View | Annotate | Download (4.24 KB)

1 | 82eb4b0f | Zuxy Meng | ```
/*
``` |
---|---|---|---|

2 | ```
* FFT/MDCT transform with 3DNow! optimizations
``` |
||

3 | 1e4ecf26 | Loren Merritt | ```
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
``` |

4 | 82eb4b0f | Zuxy Meng | ```
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
``` |

5 | ```
*
``` |
||

6 | b78e7197 | Diego Biurrun | ```
* This file is part of FFmpeg.
``` |

7 | ```
*
``` |
||

8 | ```
* FFmpeg is free software; you can redistribute it and/or
``` |
||

9 | 82eb4b0f | Zuxy Meng | ```
* modify it under the terms of the GNU Lesser General Public
``` |

10 | ```
* License as published by the Free Software Foundation; either
``` |
||

11 | b78e7197 | Diego Biurrun | ```
* version 2.1 of the License, or (at your option) any later version.
``` |

12 | 82eb4b0f | Zuxy Meng | ```
*
``` |

13 | b78e7197 | Diego Biurrun | ```
* FFmpeg is distributed in the hope that it will be useful,
``` |

14 | 82eb4b0f | Zuxy Meng | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |

15 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

16 | ```
* Lesser General Public License for more details.
``` |
||

17 | ```
*
``` |
||

18 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

19 | b78e7197 | Diego Biurrun | ```
* License along with FFmpeg; if not, write to the Free Software
``` |

20 | 82eb4b0f | Zuxy Meng | ```
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
``` |

21 | ```
*/
``` |
||

22 | b550bfaa | Ronald S. Bultje | #include "dsputil.h" |

23 | 40d0e665 | Ramiro Polla | #include "x86_cpu.h" |

24 | 82eb4b0f | Zuxy Meng | |

/*
 * Sign mask "plus one, minus one": XORing a packed pair of floats with it
 * leaves the first (low) element unchanged and flips the sign bit of the
 * second (high) element.  ff_fft_calc_3dn() loads it into %mm7 when
 * s->inverse is zero.
 *
 * The sign bit is built with an unsigned shift: `1 << 31` on a 32-bit int
 * shifts into the sign bit, which is undefined behaviour in C.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };
||

27 | |||

/*
 * Sign mask "minus one, plus one": XORing a packed pair of floats with it
 * flips the sign bit of the first (low) element and leaves the second
 * (high) element unchanged.  ff_fft_calc_3dn() loads it into %mm7 when
 * s->inverse is non-zero.
 *
 * The sign bit is built with an unsigned shift: `1 << 31` on a 32-bit int
 * shifts into the sign bit, which is undefined behaviour in C.
 */
static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };
||

30 | |||

31 | ```
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
``` |
||

32 | { |
||

33 | ```
int ln = s->nbits;
``` |
||

34 | 40d0e665 | Ramiro Polla | ```
long j;
``` |

35 | x86_reg i; |
||

36 | 1e4ecf26 | Loren Merritt | ```
long nblocks, nloops;
``` |

37 | FFTComplex *p, *cptr; |
||

38 | |||

39 | asm volatile( |
||

40 | ```
/* FEMMS is not a must here but recommended by AMD */
``` |
||

41 | ```
"femms \n\t"
``` |
||

42 | ```
"movq %0, %%mm7 \n\t"
``` |
||

43 | ```
::"m"(*(s->inverse ? m1p1 : p1m1))
``` |
||

44 | ); |
||

45 | |||

46 | ```
i = 8 << ln;
``` |
||

47 | asm volatile( |
||

48 | ```
"1: \n\t"
``` |
||

49 | ```
"sub $32, %0 \n\t"
``` |
||

50 | ```
"movq (%0,%1), %%mm0 \n\t"
``` |
||

51 | ```
"movq 16(%0,%1), %%mm1 \n\t"
``` |
||

52 | ```
"movq 8(%0,%1), %%mm2 \n\t"
``` |
||

53 | ```
"movq 24(%0,%1), %%mm3 \n\t"
``` |
||

54 | ```
"movq %%mm0, %%mm4 \n\t"
``` |
||

55 | ```
"movq %%mm1, %%mm5 \n\t"
``` |
||

56 | ```
"pfadd %%mm2, %%mm0 \n\t"
``` |
||

57 | ```
"pfadd %%mm3, %%mm1 \n\t"
``` |
||

58 | ```
"pfsub %%mm2, %%mm4 \n\t"
``` |
||

59 | ```
"pfsub %%mm3, %%mm5 \n\t"
``` |
||

60 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

61 | ```
"punpckldq %%mm5, %%mm6 \n\t"
``` |
||

62 | ```
"punpckhdq %%mm6, %%mm5 \n\t"
``` |
||

63 | ```
"movq %%mm4, %%mm3 \n\t"
``` |
||

64 | ```
"pxor %%mm7, %%mm5 \n\t"
``` |
||

65 | ```
"pfadd %%mm1, %%mm0 \n\t"
``` |
||

66 | ```
"pfadd %%mm5, %%mm4 \n\t"
``` |
||

67 | ```
"pfsub %%mm1, %%mm2 \n\t"
``` |
||

68 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
||

69 | ```
"movq %%mm0, (%0,%1) \n\t"
``` |
||

70 | ```
"movq %%mm4, 8(%0,%1) \n\t"
``` |
||

71 | ```
"movq %%mm2, 16(%0,%1) \n\t"
``` |
||

72 | ```
"movq %%mm3, 24(%0,%1) \n\t"
``` |
||

73 | ```
"jg 1b \n\t"
``` |
||

74 | ```
:"+r"(i)
``` |
||

75 | ```
:"r"(z)
``` |
||

76 | ); |
||

77 | 82eb4b0f | Zuxy Meng | ```
/* pass 2 .. ln-1 */
``` |

78 | |||

79 | 1e4ecf26 | Loren Merritt | nblocks = 1 << (ln-3); |

80 | 82eb4b0f | Zuxy Meng | nloops = 1 << 2; |

81 | 1e4ecf26 | Loren Merritt | cptr = s->exptab1; |

82 | 82eb4b0f | Zuxy Meng | ```
do {
``` |

83 | p = z; |
||

84 | j = nblocks; |
||

85 | ```
do {
``` |
||

86 | 1e4ecf26 | Loren Merritt | ```
i = nloops*8;
``` |

87 | asm volatile( |
||

88 | ```
"1: \n\t"
``` |
||

89 | ```
"sub $16, %0 \n\t"
``` |
||

90 | ```
"movq (%1,%0), %%mm0 \n\t"
``` |
||

91 | ```
"movq 8(%1,%0), %%mm1 \n\t"
``` |
||

92 | ```
"movq (%2,%0), %%mm2 \n\t"
``` |
||

93 | ```
"movq 8(%2,%0), %%mm3 \n\t"
``` |
||

94 | ```
"movq %%mm2, %%mm4 \n\t"
``` |
||

95 | ```
"movq %%mm3, %%mm5 \n\t"
``` |
||

96 | ```
"punpckldq %%mm2, %%mm2 \n\t"
``` |
||

97 | ```
"punpckldq %%mm3, %%mm3 \n\t"
``` |
||

98 | ```
"punpckhdq %%mm4, %%mm4 \n\t"
``` |
||

99 | ```
"punpckhdq %%mm5, %%mm5 \n\t"
``` |
||

100 | "pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re |
||

101 | ```
"pfmul 8(%3,%0,2), %%mm3 \n\t"
``` |
||

102 | "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im |
||

103 | ```
"pfmul 24(%3,%0,2), %%mm5 \n\t"
``` |
||

104 | "pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im |
||

105 | ```
"pfadd %%mm3, %%mm5 \n\t"
``` |
||

106 | ```
"movq %%mm0, %%mm2 \n\t"
``` |
||

107 | ```
"movq %%mm1, %%mm3 \n\t"
``` |
||

108 | ```
"pfadd %%mm4, %%mm0 \n\t"
``` |
||

109 | ```
"pfadd %%mm5, %%mm1 \n\t"
``` |
||

110 | ```
"pfsub %%mm4, %%mm2 \n\t"
``` |
||

111 | ```
"pfsub %%mm5, %%mm3 \n\t"
``` |
||

112 | ```
"movq %%mm0, (%1,%0) \n\t"
``` |
||

113 | ```
"movq %%mm1, 8(%1,%0) \n\t"
``` |
||

114 | ```
"movq %%mm2, (%2,%0) \n\t"
``` |
||

115 | ```
"movq %%mm3, 8(%2,%0) \n\t"
``` |
||

116 | ```
"jg 1b \n\t"
``` |
||

117 | ```
:"+r"(i)
``` |
||

118 | :"r"(p), "r"(p + nloops), "r"(cptr) |
||

119 | ); |
||

120 | ```
p += nloops*2;
``` |
||

121 | 82eb4b0f | Zuxy Meng | ```
} while (--j);
``` |

122 | 1e4ecf26 | Loren Merritt | ```
cptr += nloops*2;
``` |

123 | ```
nblocks >>= 1;
``` |
||

124 | ```
nloops <<= 1;
``` |
||

125 | 82eb4b0f | Zuxy Meng | } while (nblocks != 0); |

126 | 1e4ecf26 | Loren Merritt | asm volatile("femms"); |

127 | 82eb4b0f | Zuxy Meng | } |