summaryrefslogtreecommitdiff
authorMichael Niedermayer <michaelni@gmx.at>2013-10-08 09:22:54 (GMT)
committer Michael Niedermayer <michaelni@gmx.at>2013-10-08 09:23:00 (GMT)
commit1f17619fe4800dce32a81fda0cec9afad91ff095 (patch)
treed8da9d339ec07d6a0910dbe6563ad96d47cfa2b9
parent17d9c7c208915c6c090e10508056f68b93440839 (diff)
parentbbe4a6db44f0b55b424a5cc9d3e89cd88e250450 (diff)
downloadffmpeg-1f17619fe4800dce32a81fda0cec9afad91ff095.zip
ffmpeg-1f17619fe4800dce32a81fda0cec9afad91ff095.tar.gz
ffmpeg-1f17619fe4800dce32a81fda0cec9afad91ff095.tar.bz2
Merge commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450'
* commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450': x86inc: Utilize the shadow space on 64-bit Windows Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat
-rw-r--r--libavcodec/x86/fft.asm4
-rw-r--r--libavcodec/x86/h264_deblock.asm19
-rw-r--r--libavutil/x86/x86inc.asm83
3 files changed, 55 insertions, 51 deletions
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index 5071741..879b84e 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -672,13 +672,13 @@ cglobal imdct_calc, 3,5,3
push r1
push r0
%else
- sub rsp, 8
+ sub rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
call r4
%if ARCH_X86_32
add esp, 12
%else
- add rsp, 8
+ add rsp, 8+32*WIN64
%endif
POP r1
POP r3
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index d58e16c..1317783 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
movsxd r7, r1d
lea r8, [r7+r7*2]
lea r6, [r0-4]
lea r5, [r0-4+r8]
%if WIN64
- sub rsp, 0x98
- %define pix_tmp rsp+0x30
+ %define pix_tmp rsp+0x30 ; shadow space + r4
%else
- sub rsp, 0x68
%define pix_tmp rsp
%endif
@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%if WIN64
- add rsp, 0x98
-%else
- add rsp, 0x68
-%endif
RET
%endmacro
@@ -708,13 +701,16 @@ INIT_MMX cpuname
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
movsxd r7, r1d
lea r8, [r7*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
- sub rsp, 0x88
+%if WIN64
+ %define pix_tmp rsp+0x20 ; shadow space
+%else
%define pix_tmp rsp
+%endif
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -734,7 +730,6 @@ cglobal deblock_h_luma_intra_8, 4,9
sub r5, r7
shr r7, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
- add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index f311866..cb3f82b 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -353,14 +353,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if stack_size < 0
%assign stack_size -stack_size
%endif
- %if mmsize != 8
- %assign xmm_regs_used %2
+ %assign stack_size_padded stack_size
+ %if WIN64
+ %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+ %endif
+ %endif
%endif
%if mmsize <= 16 && HAVE_ALIGNED_STACK
- %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
- %if xmm_regs_used > 6
- %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
- %endif
+ %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
@@ -370,14 +374,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
; stack in a single instruction (i.e. mov rsp, rstk or mov
; rsp, [rsp+stack_size_padded])
mov rstk, rsp
- %assign stack_size_padded stack_size
- %if xmm_regs_used > 6
- %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
- %if mmsize == 32 && xmm_regs_used & 1
- ; re-align to 32 bytes
- %assign stack_size_padded (stack_size_padded + 16)
- %endif
- %endif
%if %1 < 0 ; need to store rsp on stack
sub rsp, gprsize+stack_size_padded
and rsp, ~(%%stack_alignment-1)
@@ -389,9 +385,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%xdefine rstkm rstk
%endif
%endif
- %if xmm_regs_used > 6
- WIN64_PUSH_XMM
- %endif
+ WIN64_PUSH_XMM
%endif
%endif
%endmacro
@@ -452,40 +446,55 @@ DECLARE_REG 14, R15, 120
%endmacro
%macro WIN64_PUSH_XMM 0
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
- %endrep
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %if xmm_regs_used > 8
+ %assign %%i 8
+ %rep xmm_regs_used-8
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
%endmacro
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 6
- SUB rsp, (xmm_regs_used-6)*16+16
- WIN64_PUSH_XMM
+ %if xmm_regs_used > 8
+ %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+ SUB rsp, stack_size_padded
%endif
+ WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 1
- %if xmm_regs_used > 6
+ %assign %%pad_size 0
+ %if xmm_regs_used > 8
%assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
+ %rep xmm_regs_used-8
%assign %%i %%i-1
- movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+ movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
%endrep
- %if stack_size_padded == 0
- add %1, (xmm_regs_used-6)*16+16
- %endif
%endif
%if stack_size_padded > 0
%if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
mov rsp, rstkm
%else
add %1, stack_size_padded
+ %assign %%pad_size stack_size_padded
%endif
%endif
+ %if xmm_regs_used > 7
+ movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6
+ movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ %endif
%endmacro
%macro WIN64_RESTORE_XMM 1
@@ -702,12 +711,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
align function_align
%2:
- RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
- %xdefine rstk rsp
- %assign stack_offset 0
- %assign stack_size 0
- %assign stack_size_padded 0
- %assign xmm_regs_used 0
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%ifnidn %3, ""
PROLOGUE %3
%endif