Commit 9131032b authored by Aske Simon Christensen
Browse files

SSE2 version of additive core

parent 45316802
; Per-target configuration, chosen from the assembler's __BITS__ value.
;   NAME / NAME(n)  spelling of exported symbols (a leading underscore is
;                   added in the 32-bit branch, none in the 64-bit branch)
;   r(n)            pointer-width register name: e<n> (32-bit) or r<n> (64-bit)
;   PSIZE           bytes per pointer-sized argument slot (4 or 8)
;   STACK_OFFSET    distance from the stack pointer to the first argument as
;                   used by additive_core_sse2 after its prologue: four
;                   pushed registers, (64-bit only) two 16-byte XMM save
;                   slots, and the return address
; NOTE(review): each branch defines NAME both without and with a parameter;
; this looks like old and new lines of the diff shown together - confirm
; which definition the real file keeps.
%if __BITS__ == 32
%define NAME _additive_core
%define NAME(n) _%+n
%define r(n) e%+n
%define PSIZE 4
%define STACK_OFFSET (4*4 + 4)
%else
%define NAME additive_core
%define NAME(n) n
%define r(n) r%+n
%define PSIZE 8
%define STACK_OFFSET (4*8 + 2*16 + 8)
; 64-bit only: address static data (c_zero, c_one) RIP-relative by default
default rel
%endif
; Exported entry points
global NAME
global NAME(supports_avx)
global NAME(additive_core_sse2)
global NAME(additive_core_avx)
; NOTE(review): the label below has no instructions before the next section
; switch; presumably a leftover of the pre-change single entry point - verify.
section sec text align=1
NAME:
; Read-only clamp bounds for the filter weight. Four doubles each (32 bytes),
; so they serve both the 128-bit maxpd/minpd and the 256-bit vmaxpd/vminpd
; memory operands. The section is 16-byte aligned, which the legacy SSE
; memory operands require.
section con rdata align=16
c_zero: dq 0.0, 0.0, 0.0, 0.0
c_one: dq 1.0, 1.0, 1.0, 1.0
section sup text
;-----------------------------------------------------------------------
; bool supports_avx(void)
;
; Returns 1 in eax when AVX instructions can be executed safely, else 0.
; AVX is usable only when all of the following hold:
;   * CPUID.1:ECX bit 28 (AVX)     - the CPU implements AVX
;   * CPUID.1:ECX bit 27 (OSXSAVE) - the OS enabled XSAVE/XGETBV
;   * XCR0 bits 1 and 2            - the OS saves XMM and YMM state
; Testing the AVX bit alone would report support on systems where the OS
; has not enabled YMM state, and the first AVX instruction would fault.
;
; Clobbers: eax, ecx, edx, flags (ebx/rbx is preserved).
;-----------------------------------------------------------------------
%define CPUID1_ECX_OSXSAVE_AVX 0x18000000
%define XCR0_XMM_YMM 0x6
NAME(supports_avx):
    push    r(bx)                       ; cpuid overwrites ebx (callee-saved)
    mov     eax, 1
    cpuid                               ; ecx = leaf 1 feature flags
    and     ecx, CPUID1_ECX_OSXSAVE_AVX
    cmp     ecx, CPUID1_ECX_OSXSAVE_AVX
    jne     .no_avx                     ; need both OSXSAVE and AVX
    xor     ecx, ecx                    ; XCR index 0
    xgetbv                              ; edx:eax = XCR0 (legal: OSXSAVE is set)
    and     eax, XCR0_XMM_YMM
    cmp     eax, XCR0_XMM_YMM
    jne     .no_avx                     ; OS does not preserve XMM+YMM state
    mov     eax, 1
    jmp     .done
.no_avx:
    xor     eax, eax
.done:
    pop     r(bx)
    ret
section sse2 text
;-----------------------------------------------------------------------
; double additive_core_sse2(double *state_re, double *state_im,
;                           const double *step_re, const double *step_im,
;                           double *filter_low, double *filter_high,
;                           double f_add_low, double f_add_high, size_t n)
;
; Processes the partials two at a time. For each pair:
;   (state_re, state_im) *= (step_re, step_im)      complex multiply, stored back
;   w            = clamp(min(filter_low, filter_high), 0.0, 1.0)
;   filter_low  += f_add_low                        stored back
;   filter_high += f_add_high                       stored back
;   sum         += state_re * w                     (updated state_re)
; Returns the sum of both accumulator lanes: in xmm0 on 64-bit, on the
; x87 stack on 32-bit.
;
; All arguments are read from the stack. On 64-bit the four register
; arguments (rcx, rdx, r8, r9) are first spilled to the home slots above
; the return address, and xmm6/xmm7 are saved and restored.
;
; The loop body always runs at least once and handles two elements per
; pass: ceil(n/2) passes for n >= 1, one pass for n == 0. For odd n the
; last pass also reads and writes element n.
; NOTE(review): presumably the caller pads the arrays to an even length
; and never passes n == 0 - confirm against the caller.
;
; Denormals are disabled (FTZ + DAZ) only for the duration of the call;
; the caller's MXCSR is reloaded before returning because its control
; bits are callee-preserved.
;-----------------------------------------------------------------------
%define MXCSR_FTZ_DAZ 0x8040
; Arguments sit one slot higher than STACK_OFFSET: the saved MXCSR slot
; stays on the stack until the epilogue.
%define SSE2_ARGS (STACK_OFFSET + PSIZE)
NAME(additive_core_sse2):
    ; Save the caller's MXCSR in a slot that lives until return
    push    r(ax)
    stmxcsr [r(sp)]
    ; Disable denormals using a temporary copy of MXCSR
    mov     eax, [r(sp)]
    or      eax, MXCSR_FTZ_DAZ
    push    r(ax)
    ldmxcsr [r(sp)]
    pop     r(ax)
%if __BITS__ == 64
    ; Save register arguments to stack (skip the saved-MXCSR slot)
    mov     [rsp + PSIZE + 8], rcx
    mov     [rsp + PSIZE + 16], rdx
    mov     [rsp + PSIZE + 24], r8
    mov     [rsp + PSIZE + 32], r9
    ; Save callee-save registers
    sub     rsp, 2*16
    movupd  [rsp + 0*16], xmm6
    movupd  [rsp + 1*16], xmm7
%endif
    push    r(bx)
    push    r(bp)
    push    r(si)
    push    r(di)
    ; Initialize: xmm0 = accumulator, xmm6/xmm7 = f_add_low/f_add_high in both lanes
    xorpd   xmm0, xmm0
    movsd   xmm6, [r(sp) + SSE2_ARGS + 6*PSIZE + 0*8]
    unpcklpd xmm6, xmm6
    movsd   xmm7, [r(sp) + SSE2_ARGS + 6*PSIZE + 1*8]
    unpcklpd xmm7, xmm7
    ; Pointers
    mov     r(ax), [r(sp) + SSE2_ARGS + 0*PSIZE] ; state_re
    mov     r(dx), [r(sp) + SSE2_ARGS + 1*PSIZE] ; state_im
    mov     r(bx), [r(sp) + SSE2_ARGS + 2*PSIZE] ; step_re
    mov     r(bp), [r(sp) + SSE2_ARGS + 3*PSIZE] ; step_im
    mov     r(si), [r(sp) + SSE2_ARGS + 4*PSIZE] ; filter_low
    mov     r(di), [r(sp) + SSE2_ARGS + 5*PSIZE] ; filter_high
    ; Count
    mov     r(cx), [r(sp) + SSE2_ARGS + 6*PSIZE + 2*8]
.loop:
    ; Update oscillator: (re, im) *= (step_re, step_im)
    movupd  xmm2, [r(ax)]
    movupd  xmm3, [r(dx)]
    movapd  xmm4, xmm2
    movapd  xmm5, xmm3
    movupd  xmm1, [r(bx)]
    mulpd   xmm2, xmm1                  ; re * step_re
    mulpd   xmm3, xmm1                  ; im * step_re
    movupd  xmm1, [r(bp)]
    mulpd   xmm4, xmm1                  ; re * step_im
    mulpd   xmm5, xmm1                  ; im * step_im
    subpd   xmm2, xmm5                  ; new re
    addpd   xmm3, xmm4                  ; new im
    movupd  [r(ax)], xmm2
    movupd  [r(dx)], xmm3
    ; Update filter: weight uses the values from before the increment
    movupd  xmm4, [r(si)]
    movupd  xmm5, [r(di)]
    movapd  xmm3, xmm4
    minpd   xmm3, xmm5
    addpd   xmm4, xmm6
    addpd   xmm5, xmm7
    movupd  [r(si)], xmm4
    movupd  [r(di)], xmm5
    maxpd   xmm3, [c_zero]
    minpd   xmm3, [c_one]
    ; Accumulate filtered oscillator
    mulpd   xmm2, xmm3
    addpd   xmm0, xmm2
    ; Advance pointers by two doubles
    add     r(ax), 16
    add     r(dx), 16
    add     r(bx), 16
    add     r(bp), 16
    add     r(si), 16
    add     r(di), 16
    sub     r(cx), 2
    ja      .loop                       ; unsigned: stop on zero or borrow
    ; Final summation: add the high lane into the low lane
    movapd  xmm1, xmm0
    unpckhpd xmm1, xmm1
    addsd   xmm0, xmm1
    ; Restore callee-save registers
    pop     r(di)
    pop     r(si)
    pop     r(bp)
    pop     r(bx)
%if __BITS__ == 64
    movupd  xmm6, [rsp + 0*16]
    movupd  xmm7, [rsp + 1*16]
    add     rsp, 2*16
%else
    ; Return result on FP stack
    sub     esp, 8
    movsd   [esp], xmm0
    fld     qword [esp]
    add     esp, 8
%endif
    ; Restore the caller's MXCSR and release its slot
    ldmxcsr [r(sp)]
    add     r(sp), PSIZE
    ret
section avx text
NAME(additive_core_avx):
; Disable denormals
push r(ax)
vstmxcsr [r(sp)]
......@@ -41,9 +179,6 @@ NAME:
; Initialize
vxorpd ymm0, ymm0
mov eax, 1
vcvtsi2sd xmm1, eax
vbroadcastsd ymm1, xmm1
vbroadcastsd ymm6, [r(sp) + STACK_OFFSET + 6*PSIZE + 0*8]
vbroadcastsd ymm7, [r(sp) + STACK_OFFSET + 6*PSIZE + 1*8]
......@@ -57,8 +192,6 @@ NAME:
; Count
mov r(cx), [r(sp) + STACK_OFFSET + 6*PSIZE + 2*8]
add r(cx), 3
shr r(cx), 2
.loop:
; Update oscillator
......@@ -81,9 +214,8 @@ NAME:
vaddpd ymm5, ymm5, ymm7
vmovupd [r(si)], ymm4
vmovupd [r(di)], ymm5
vxorpd ymm4, ymm4
vminpd ymm3, ymm3, ymm1
vmaxpd ymm3, ymm3, ymm4
vmaxpd ymm3, ymm3, [c_zero]
vminpd ymm3, ymm3, [c_one]
; Accumulate filtered oscillator
vmulpd ymm2, ymm2, ymm3
......@@ -97,7 +229,8 @@ NAME:
add r(si), 32
add r(di), 32
loop .loop
sub r(cx), 4
ja .loop
; Final summation
vextractf128 xmm1, ymm0, 1
......
......@@ -321,7 +321,9 @@ pub struct OidosSoundGenerator {
f_add_low: f64,
f_add_high: f64,
gain: f64
gain: f64,
avx_support: bool
}
impl SoundGenerator for OidosSoundGenerator {
......@@ -345,7 +347,9 @@ impl SoundGenerator for OidosSoundGenerator {
f_add_low: (-param.f_sweeplow * param.f_slopelow) as f64,
f_add_high: (param.f_sweephigh * param.f_slopehigh) as f64,
gain: param.gain as f64
gain: param.gain as f64,
avx_support: unsafe { supports_avx() }
};
let f_lowlimit = param.f_low as f64 + tone as f64;
......@@ -403,16 +407,26 @@ impl SoundGenerator for OidosSoundGenerator {
fn produce_sample(&mut self) -> f32 {
let s = unsafe {
additive_core(self.state_re.as_mut_ptr(), self.state_im.as_mut_ptr(),
self.step_re.as_ptr(), self.step_im.as_ptr(),
self.filter_low.as_mut_ptr(), self.filter_high.as_mut_ptr(),
self.f_add_low, self.f_add_high, self.n_partials)
if self.avx_support {
additive_core_avx(self.state_re.as_mut_ptr(), self.state_im.as_mut_ptr(),
self.step_re.as_ptr(), self.step_im.as_ptr(),
self.filter_low.as_mut_ptr(), self.filter_high.as_mut_ptr(),
self.f_add_low, self.f_add_high, self.n_partials)
} else {
additive_core_sse2(self.state_re.as_mut_ptr(), self.state_im.as_mut_ptr(),
self.step_re.as_ptr(), self.step_im.as_ptr(),
self.filter_low.as_mut_ptr(), self.filter_high.as_mut_ptr(),
self.f_add_low, self.f_add_high, self.n_partials)
}
};
(s * (self.gain / (self.n_partials as f64 + (self.gain - 1.0) * s * s)).sqrt()) as f32
}
}
extern "cdecl" {
fn additive_core(state_re: *mut f64, state_im: *mut f64, step_re: *const f64, step_im: *const f64,
filter_low: *mut f64, filter_high: *mut f64, f_add_low: f64, f_add_high: f64, n: usize) -> f64;
fn supports_avx() -> bool;
fn additive_core_sse2(state_re: *mut f64, state_im: *mut f64, step_re: *const f64, step_im: *const f64,
filter_low: *mut f64, filter_high: *mut f64, f_add_low: f64, f_add_high: f64, n: usize) -> f64;
fn additive_core_avx(state_re: *mut f64, state_im: *mut f64, step_re: *const f64, step_im: *const f64,
filter_low: *mut f64, filter_high: *mut f64, f_add_low: f64, f_add_high: f64, n: usize) -> f64;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment