Commit 7c278354 authored by Aske Simon Christensen's avatar Aske Simon Christensen
Browse files

Calling conventions for Mac/Linux

parent 57c22524
......@@ -15,15 +15,11 @@
%define NAME(n) n
%endif
; Registers and stack layout
; Registers
%if __BITS__ == 32
%define r(n) e%+n
%define PSIZE 4
%define STACK_OFFSET (4*4 + 4)
%else
%define r(n) r%+n
%define PSIZE 8
%define STACK_OFFSET (4*8 + 2*16 + 8)
default rel
%endif
......@@ -40,6 +36,96 @@ c_zero: dq 0.0, 0.0, 0.0, 0.0
c_one: dq 1.0, 1.0, 1.0, 1.0
; Register assignment
%define STATE_RE r(di)
%define STATE_IM r(si)
%define STEP_RE r(dx)
%define STEP_IM r(cx)
%define FILTER_LOW r(ax)
%define FILTER_HIGH r(bx)
%define COUNT r(bp)
; Argument to ENTRY and EXIT macros to specify encoding
%define LEGACY(i) i
%define VEX(i) v%+i
%macro ENTRY 1
; Save general purpose registers
push r(bx)
push r(bp)
push r(si)
push r(di)
; Get arguments
%if __BITS__ == 32
mov STATE_RE, [esp + 5*4 + 0*4]
mov STATE_IM, [esp + 5*4 + 1*4]
mov STEP_RE, [esp + 5*4 + 2*4]
mov STEP_IM, [esp + 5*4 + 3*4]
mov FILTER_LOW, [esp + 5*4 + 4*4]
mov FILTER_HIGH, [esp + 5*4 + 5*4]
%1(movsd) xmm0, [esp + 5*4 + 6*4 + 0*8]
%1(movsd) xmm1, [esp + 5*4 + 6*4 + 1*8]
mov COUNT, [esp + 5*4 + 6*4 + 2*8 + 0*4]
%elif WINDOWS
mov STATE_RE, rcx
mov STATE_IM, rdx
mov STEP_RE, r8
mov STEP_IM, r9
mov FILTER_LOW, [rsp + 5*8 + 32 + 0*8]
mov FILTER_HIGH, [rsp + 5*8 + 32 + 1*8]
%1(movsd) xmm0, [rsp + 5*8 + 32 + 2*8 + 0*8]
%1(movsd) xmm1, [rsp + 5*8 + 32 + 2*8 + 1*8]
mov COUNT, [rsp + 5*8 + 32 + 2*8 + 2*8 + 0*8]
%else
mov FILTER_LOW, r8
mov FILTER_HIGH, r9
mov COUNT, [rsp + 5*8 + 0*8]
%endif
; Save float registers
%if __BITS__ == 64
sub rsp, 2*16
%1(movupd) [rsp + 0*16], xmm6
%1(movupd) [rsp + 1*16], xmm7
%endif
; Disable denormals
sub r(sp), 4
%1(stmxcsr) [r(sp)]
or dword [r(sp)], 0x8040
%1(ldmxcsr) [r(sp)]
add r(sp), 4
%endmacro
%macro EXIT 1
; Restore float registers
%if __BITS__ == 64
%1(movupd) xmm6, [rsp + 0*16]
%1(movupd) xmm7, [rsp + 1*16]
add rsp, 2*16
%else
; Return result on FP stack
sub esp, 8
%1(movsd) [esp], xmm0
fld qword [esp]
add esp, 8
%endif
; Restore general purpose registers
pop r(di)
pop r(si)
pop r(bp)
pop r(bx)
%endmacro
; Supports AVX?
section .text
NAME(supports_avx):
push r(bx)
......@@ -55,76 +141,44 @@ NAME(supports_avx):
ret
; SSE2 core
section .text
NAME(additive_core_sse2):
; Disable denormals
push r(ax)
stmxcsr [r(sp)]
or dword [r(sp)], 0x8040
ldmxcsr [r(sp)]
pop r(ax)
%if __BITS__ == 64
; Save register arguments to stack
mov [rsp + 8], rcx
mov [rsp + 16], rdx
mov [rsp + 24], r8
mov [rsp + 32], r9
; Save callee-save registers
sub rsp, 2*16
movupd [rsp + 0*16], xmm6
movupd [rsp + 1*16], xmm7
%endif
push r(bx)
push r(bp)
push r(si)
push r(di)
ENTRY LEGACY
; Initialize
xorpd xmm0, xmm0
movsd xmm6, [r(sp) + STACK_OFFSET + 6*PSIZE + 0*8]
movsd xmm6, xmm0
unpcklpd xmm6, xmm6
movsd xmm7, [r(sp) + STACK_OFFSET + 6*PSIZE + 1*8]
movsd xmm7, xmm1
unpcklpd xmm7, xmm7
; Pointers
mov r(ax), [r(sp) + STACK_OFFSET + 0*PSIZE] ; state_re
mov r(dx), [r(sp) + STACK_OFFSET + 1*PSIZE] ; state_im
mov r(bx), [r(sp) + STACK_OFFSET + 2*PSIZE] ; step_re
mov r(bp), [r(sp) + STACK_OFFSET + 3*PSIZE] ; step_im
mov r(si), [r(sp) + STACK_OFFSET + 4*PSIZE] ; filter_low
mov r(di), [r(sp) + STACK_OFFSET + 5*PSIZE] ; filter_high
; Count
mov r(cx), [r(sp) + STACK_OFFSET + 6*PSIZE + 2*8]
xorpd xmm0, xmm0
.loop:
; Update oscillator
movupd xmm2, [r(ax)]
movupd xmm3, [r(dx)]
movupd xmm2, [STATE_RE]
movupd xmm3, [STATE_IM]
movapd xmm4, xmm2
movapd xmm5, xmm3
movupd xmm1, [r(bx)]
movupd xmm1, [STEP_RE]
mulpd xmm2, xmm1
mulpd xmm3, xmm1
movupd xmm1, [r(bp)]
movupd xmm1, [STEP_IM]
mulpd xmm4, xmm1
mulpd xmm5, xmm1
subpd xmm2, xmm5
addpd xmm3, xmm4
movupd [r(ax)], xmm2
movupd [r(dx)], xmm3
movupd [STATE_RE], xmm2
movupd [STATE_IM], xmm3
; Update filter
movupd xmm4, [r(si)]
movupd xmm5, [r(di)]
movupd xmm4, [FILTER_LOW]
movupd xmm5, [FILTER_HIGH]
movapd xmm3, xmm4
minpd xmm3, xmm5
addpd xmm4, xmm6
addpd xmm5, xmm7
movupd [r(si)], xmm4
movupd [r(di)], xmm5
movupd [FILTER_LOW], xmm4
movupd [FILTER_HIGH], xmm5
maxpd xmm3, [c_zero]
minpd xmm3, [c_one]
......@@ -133,14 +187,14 @@ NAME(additive_core_sse2):
addpd xmm0, xmm2
; Advance pointers
add r(ax), 16
add r(dx), 16
add r(bx), 16
add r(bp), 16
add r(si), 16
add r(di), 16
sub r(cx), 2
add STATE_RE, 16
add STATE_IM, 16
add STEP_RE, 16
add STEP_IM, 16
add FILTER_LOW, 16
add FILTER_HIGH, 16
sub COUNT, 2
ja .loop
; Final summation
......@@ -148,90 +202,41 @@ NAME(additive_core_sse2):
unpckhpd xmm1, xmm1
addsd xmm0, xmm1
; Restore callee-save registers
pop r(di)
pop r(si)
pop r(bp)
pop r(bx)
%if __BITS__ == 64
movupd xmm6, [rsp + 0*16]
movupd xmm7, [rsp + 1*16]
add rsp, 2*16
%else
; Return result on FP stack
sub esp, 8
movsd [esp], xmm0
fld qword [esp]
add esp, 8
%endif
EXIT LEGACY
ret
; AVX core
section .text
NAME(additive_core_avx):
; Disable denormals
push r(ax)
vstmxcsr [r(sp)]
or dword [r(sp)], 0x8040
vldmxcsr [r(sp)]
pop r(ax)
%if __BITS__ == 64
; Save register arguments to stack
mov [rsp + 8], rcx
mov [rsp + 16], rdx
mov [rsp + 24], r8
mov [rsp + 32], r9
; Save callee-save registers
sub rsp, 2*16
vmovupd [rsp + 0*16], xmm6
vmovupd [rsp + 1*16], xmm7
%endif
push r(bx)
push r(bp)
push r(si)
push r(di)
ENTRY VEX
; Initialize
vbroadcastsd ymm6, xmm0
vbroadcastsd ymm7, xmm1
vxorpd ymm0, ymm0
vbroadcastsd ymm6, [r(sp) + STACK_OFFSET + 6*PSIZE + 0*8]
vbroadcastsd ymm7, [r(sp) + STACK_OFFSET + 6*PSIZE + 1*8]
; Pointers
mov r(ax), [r(sp) + STACK_OFFSET + 0*PSIZE] ; state_re
mov r(dx), [r(sp) + STACK_OFFSET + 1*PSIZE] ; state_im
mov r(bx), [r(sp) + STACK_OFFSET + 2*PSIZE] ; step_re
mov r(bp), [r(sp) + STACK_OFFSET + 3*PSIZE] ; step_im
mov r(si), [r(sp) + STACK_OFFSET + 4*PSIZE] ; filter_low
mov r(di), [r(sp) + STACK_OFFSET + 5*PSIZE] ; filter_high
; Count
mov r(cx), [r(sp) + STACK_OFFSET + 6*PSIZE + 2*8]
.loop:
; Update oscillator
vmovupd ymm2, [r(ax)]
vmovupd ymm3, [r(dx)]
vmulpd ymm4, ymm2, [r(bx)]
vmulpd ymm5, ymm2, [r(bp)]
vmulpd ymm2, ymm3, [r(bp)]
vmulpd ymm3, ymm3, [r(bx)]
vmovupd ymm2, [STATE_RE]
vmovupd ymm3, [STATE_IM]
vmulpd ymm4, ymm2, [STEP_RE]
vmulpd ymm5, ymm2, [STEP_IM]
vmulpd ymm2, ymm3, [STEP_IM]
vmulpd ymm3, ymm3, [STEP_RE]
vsubpd ymm2, ymm4, ymm2
vaddpd ymm3, ymm3, ymm5
vmovupd [r(ax)], ymm2
vmovupd [r(dx)], ymm3
vmovupd [STATE_RE], ymm2
vmovupd [STATE_IM], ymm3
; Update filter
vmovupd ymm4, [r(si)]
vmovupd ymm5, [r(di)]
vmovupd ymm4, [FILTER_LOW]
vmovupd ymm5, [FILTER_HIGH]
vminpd ymm3, ymm4, ymm5
vaddpd ymm4, ymm4, ymm6
vaddpd ymm5, ymm5, ymm7
vmovupd [r(si)], ymm4
vmovupd [r(di)], ymm5
vmovupd [FILTER_LOW], ymm4
vmovupd [FILTER_HIGH], ymm5
vmaxpd ymm3, ymm3, [c_zero]
vminpd ymm3, ymm3, [c_one]
......@@ -240,14 +245,14 @@ NAME(additive_core_avx):
vaddpd ymm0, ymm0, ymm2
; Advance pointers
add r(ax), 32
add r(dx), 32
add r(bx), 32
add r(bp), 32
add r(si), 32
add r(di), 32
sub r(cx), 4
add STATE_RE, 32
add STATE_IM, 32
add STEP_RE, 32
add STEP_IM, 32
add FILTER_LOW, 32
add FILTER_HIGH, 32
sub COUNT, 4
ja .loop
; Final summation
......@@ -255,23 +260,6 @@ NAME(additive_core_avx):
vaddpd xmm0, xmm0, xmm1
vhaddpd xmm0, xmm0, xmm0
; Restore callee-save registers
pop r(di)
pop r(si)
pop r(bp)
pop r(bx)
%if __BITS__ == 64
vmovupd xmm6, [rsp + 0*16]
vmovupd xmm7, [rsp + 1*16]
add rsp, 2*16
%else
; Return result on FP stack
sub esp, 8
vmovsd [esp], xmm0
fld qword [esp]
add esp, 8
%endif
EXIT VEX
ret
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment