-
Notifications
You must be signed in to change notification settings - Fork 4
perf: optimize MlpgAdjust #116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Codecov Report❌ Patch coverage is
Additional details and impacted files
@@ Coverage Diff @@
## main #116 +/- ##
==========================================
- Coverage 71.46% 71.28% -0.18%
==========================================
Files 37 37
Lines 1675 1689 +14
==========================================
+ Hits 1197 1204 +7
- Misses 478 485 +7
☔ View full report in Codecov by Sentry.
🚀 New features to boost your workflow:
|
mlsafir diagnostics
ref:
Artifact: aarch64-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
// jbonsai::vocoder::mlsa::fir -- AArch64 compiler output ("aarch64-base" artifact).
// Register usage as read from this listing:
//   x0 / x1 : pointer to and element count of an f64 buffer that is updated in place
//   x2 / x3 : pointer to and element count of a second f64 buffer that is only read
//   d0, d1  : two f64 scalar inputs; the result is returned in d0
// NOTE(review): presumably x0/x1 and x2/x3 are Rust slices -- confirm against the Rust source.
// Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
// rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
// matching elements of the second buffer are summed into the return value.
.p2align 2
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
// Frame record: push x29/x30 and point x29 at them.
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
// Fewer than two elements in the first buffer (unsigned x1 <= 1) -> panic path.
cmp x1, #1
b.ls .LBB265_13
// d4 = d1*d1, d2 = 1 - d1*d1 (after the fsub below), d3 = old element 1.
fmul d4, d1, d1
fmov d2, #1.00000000
ldr d3, [x0, #8]
// The flags set here are consumed by the b.hi after the stores.
cmp x1, x3
fmul d6, d1, d3
fsub d2, d2, d4
fmul d5, d0, d2
fmul d0, d0, d1
fadd d5, d5, d6
// element 0 = in0*d1, element 1 = in0*(1 - d1*d1) + d1*(old element 1), in0 = d0 on entry.
stp d0, d5, [x0]
// x1 > x3 (unsigned): the first count exceeds the second -> slice index failure path.
b.hi .LBB265_14
// d3 = old element 1 - new element 0; d3 is the value carried from element to element.
fsub d3, d3, d0
// x12 = count - 2 = elements still to process; x9 / x11 = address of element 2 in each buffer.
sub x12, x1, #2
add x9, x2, #16
// x10 = x12 with its low two and top four bits cleared (elements handled four at a time);
// x8 = x12 mod 4 (elements left for the scalar tail).
ands x10, x12, #0xffffffffffffffc
and x8, x12, #0x3
add x11, x0, #16
// No complete group of four: skip the vector loop.
b.eq .LBB265_11
// Loop constants: d5 = d1^4, d6 = -d1. v0 is a two-lane running sum of products.
fmul d5, d4, d4
fneg d6, d1
movi v0.2d, #0000000000000000
// x12 becomes the negated masked count and is stepped by +4 until it reaches zero.
// x13 / x14 are the moving cursors into the second / first buffer.
and x12, x12, #0xffffffffffffffc
mov x13, x9
mov x14, x11
neg x12, x12
.LBB265_4:
// Each iteration applies four steps of the recurrence used by the scalar tail,
//   new[i] = d2*s + d1*old[i];   s = old[i] - d1*s     (s lives in d3)
// expanded so that the four outputs and the next s are computed from s directly.
cbz x12, .LBB265_6
// q7 = old elements (i, i+1); q17 = old elements (i+1, i+2), loaded unaligned at +8.
ldr q7, [x14]
ldur q17, [x14, #8]
mov v18.16b, v3.16b
// Advance the counter; the flags from this adds feed the b.ne at the bottom.
adds x12, x12, #4
fmadd d16, d6, d3, d7
fmla v17.2d, v7.2d, v6.d[0]
fmul v7.2d, v7.2d, v1.d[0]
// v18 = (s, s after one step).
mov v18.d[1], v16.d[0]
mov v19.16b, v17.16b
// q16 = old elements (i+2, i+3).
ldr q16, [x14, #16]
fmul v20.2d, v16.2d, v1.d[0]
mov d16, v16.d[1]
fmla v19.2d, v18.2d, v4.d[0]
// v7 = new elements (i, i+1).
fmla v7.2d, v18.2d, v2.d[0]
ldr d18, [x14, #16]
fmadd d16, d6, d18, d16
// v20 = new elements (i+2, i+3).
fmla v20.2d, v19.2d, v2.d[0]
// Four elements of the second buffer; x13 is post-incremented by 32 bytes.
ldp q19, q21, [x13], #32
fmadd d16, d4, d17, d16
fmul v18.2d, v7.2d, v19.2d
fmul v19.2d, v20.2d, v21.2d
// Write the four updated elements back; x14 is post-incremented by 32 bytes.
stp q7, q20, [x14], #32
// d3 = carried value after these four elements.
fmadd d3, d5, d3, d16
// Accumulate the four products into the two lanes of v0.
fadd v17.2d, v18.2d, v19.2d
fadd v0.2d, v0.2d, v17.2d
b.ne .LBB265_4
.LBB265_6:
// Vector loop done; return directly if no tail elements remain.
cbz x8, .LBB265_12
.LBB265_7:
// Scalar tail: x8 (1..3) elements remain, starting x10 elements past element 2.
lsl x12, x10, #3
fmul d4, d2, d3
cmp x8, #1
add x10, x11, x12
add x9, x9, x12
ldr d5, [x10]
fmul d6, d1, d5
fadd d6, d4, d6
ldr d4, [x9]
fmul d4, d4, d6
str d6, [x10]
// d4 = lane 0 of the vector sum + this product; lane 1 is added at .LBB265_10.
fadd d4, d0, d4
b.eq .LBB265_10
// Second tail element: advance the carried value, then repeat the update.
fmul d3, d1, d3
cmp x8, #2
fsub d3, d5, d3
ldr d5, [x10, #8]
fmul d7, d1, d5
fmul d6, d2, d3
fadd d6, d6, d7
ldr d7, [x9, #8]
fmul d7, d7, d6
str d6, [x10, #8]
fadd d4, d4, d7
b.eq .LBB265_10
// Third tail element; d1 and d2 are overwritten here because nothing needs them afterwards.
fmul d3, d1, d3
fsub d3, d5, d3
ldr d5, [x10, #16]
fmul d1, d1, d5
fmul d2, d2, d3
fadd d1, d2, d1
ldr d2, [x9, #16]
fmul d2, d2, d1
str d1, [x10, #16]
fadd d4, d4, d2
.LBB265_10:
// Result = d4 (lane 0 + tail products) + lane 1 of the vector sum.
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
.LBB265_11:
.cfi_restore_state
.cfi_remember_state
// Fewer than four remaining elements: start with a zero sum and go straight to the tail.
movi v0.2d, #0000000000000000
cbnz x8, .LBB265_7
.LBB265_12:
// No tail: result = lane 0 + lane 1 of the vector sum.
fmov d4, d0
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
.LBB265_13:
.cfi_restore_state
// Panic path for x1 <= 1: x0 = address of an anonymous constant, w1 = 30, x2 = address of a
// second anonymous constant. NOTE(review): presumably message pointer, message length and
// source location -- confirm. No code follows the call, so it is presumably non-returning.
adrp x0, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.543
add x0, x0, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.543
adrp x2, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.544
add x2, x2, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.544
mov w1, #30
bl core::panicking::panic
.LBB265_14:
// Argument setup for core::slice::index::slice_index_fail: w0 = 2, x1 = first count (unchanged),
// x2 = second count, x3 = address of an anonymous constant. The call instruction itself
// follows this block.
adrp x8, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.545
add x8, x8, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.545
mov w0, #2
mov x2, x3
mov x3, x8
bl core::slice::index::slice_index_fail
Benchmark results:
Artifact: aarch64-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
// jbonsai::vocoder::mlsa::fir -- AArch64 compiler output ("aarch64-merge" artifact).
// Register usage as read from this listing:
//   x0 / x1 : pointer to and element count of an f64 buffer that is updated in place
//   x2 / x3 : pointer to and element count of a second f64 buffer that is only read
//   d0, d1  : two f64 scalar inputs; the result is returned in d0
// NOTE(review): presumably x0/x1 and x2/x3 are Rust slices -- confirm against the Rust source.
// Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
// rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
// matching elements of the second buffer are summed into the return value.
// This listing keeps two separately masked block counts (x10 and x12) and two loop counters
// (x13 and x14), one per buffer.
.p2align 2
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
// Frame record: push x29/x30 and point x29 at them.
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
// Fewer than two elements in the first buffer (unsigned x1 <= 1) -> panic path.
cmp x1, #1
b.ls .LBB266_13
// d4 = d1*d1, d2 = 1 - d1*d1 (after the fsub below), d3 = old element 1.
fmul d4, d1, d1
fmov d2, #1.00000000
ldr d3, [x0, #8]
// The flags set here are consumed by the b.hi after the stores.
cmp x1, x3
fmul d6, d1, d3
fsub d2, d2, d4
fmul d5, d0, d2
fmul d0, d0, d1
fadd d5, d5, d6
// element 0 = in0*d1, element 1 = in0*(1 - d1*d1) + d1*(old element 1), in0 = d0 on entry.
stp d0, d5, [x0]
// x1 > x3 (unsigned): the first count exceeds the second -> slice index failure path.
b.hi .LBB266_14
// d3 = old element 1 - new element 0; d3 is the value carried from element to element.
fsub d3, d3, d0
// x13 = count - 2 = elements still to process; x9 / x11 = address of element 2 in each buffer.
sub x13, x1, #2
add x9, x2, #16
// x10 = x13 with only the low two bits cleared; it is used for the second buffer.
ands x10, x13, #0xfffffffffffffffc
// x8 = x13 mod 4 (elements left for the scalar tail).
and x8, x13, #0x3
add x11, x0, #16
// x12 = x13 with the low two and top four bits cleared; it is used for the first buffer.
and x12, x13, #0xffffffffffffffc
// The flags still come from the ands above: x10 == 0 means no complete group of four.
b.eq .LBB266_11
// Loop constants: d5 = d1^4, d6 = -d1. v0 is a two-lane running sum of products.
fmul d5, d4, d4
fneg d6, d1
movi v0.2d, #0000000000000000
// x13 = -x10 and x14 = -x12; both are stepped by +4 per iteration.
// x15 / x16 are the moving cursors into the second / first buffer.
and x13, x13, #0xfffffffffffffffc
neg x14, x12
mov x15, x9
neg x13, x13
mov x16, x11
.LBB266_4:
// Each iteration applies four steps of the recurrence used by the scalar tail,
//   new[i] = d2*s + d1*old[i];   s = old[i] - d1*s     (s lives in d3)
// expanded so that the four outputs and the next s are computed from s directly.
// The top-of-loop exit tests x14; the bottom-of-loop branch tests the flags from x13.
cbz x14, .LBB266_6
// q7 = old elements (i, i+1); q17 = old elements (i+1, i+2), loaded unaligned at +8.
ldr q7, [x16]
ldur q17, [x16, #8]
mov v18.16b, v3.16b
adds x13, x13, #4
add x14, x14, #4
fmadd d16, d6, d3, d7
fmla v17.2d, v7.2d, v6.d[0]
fmul v7.2d, v7.2d, v1.d[0]
// v18 = (s, s after one step).
mov v18.d[1], v16.d[0]
mov v19.16b, v17.16b
// q16 = old elements (i+2, i+3).
ldr q16, [x16, #16]
fmul v20.2d, v16.2d, v1.d[0]
mov d16, v16.d[1]
fmla v19.2d, v18.2d, v4.d[0]
// v7 = new elements (i, i+1).
fmla v7.2d, v18.2d, v2.d[0]
ldr d18, [x16, #16]
fmadd d16, d6, d18, d16
// v20 = new elements (i+2, i+3).
fmla v20.2d, v19.2d, v2.d[0]
// Four elements of the second buffer; x15 is post-incremented by 32 bytes.
ldp q19, q21, [x15], #32
fmadd d16, d4, d17, d16
fmul v18.2d, v7.2d, v19.2d
fmul v19.2d, v20.2d, v21.2d
// Write the four updated elements back; x16 is post-incremented by 32 bytes.
stp q7, q20, [x16], #32
// d3 = carried value after these four elements.
fmadd d3, d5, d3, d16
// Accumulate the four products into the two lanes of v0.
fadd v17.2d, v18.2d, v19.2d
fadd v0.2d, v0.2d, v17.2d
b.ne .LBB266_4
.LBB266_6:
// Vector loop done; return directly if no tail elements remain.
cbz x8, .LBB266_12
.LBB266_7:
// Scalar tail: x8 (1..3) elements remain. The first-buffer cursor is offset by x12 elements
// and the second-buffer cursor by x10 elements.
add x11, x11, x12, lsl #3
fmul d4, d2, d3
add x9, x9, x10, lsl #3
cmp x8, #1
ldr d5, [x11]
fmul d6, d1, d5
fadd d6, d4, d6
ldr d4, [x9]
fmul d4, d4, d6
str d6, [x11]
// d4 = lane 0 of the vector sum + this product; lane 1 is added at .LBB266_10.
fadd d4, d0, d4
b.eq .LBB266_10
// Second tail element: advance the carried value, then repeat the update.
fmul d3, d1, d3
cmp x8, #2
fsub d3, d5, d3
ldr d5, [x11, #8]
fmul d7, d1, d5
fmul d6, d2, d3
fadd d6, d6, d7
ldr d7, [x9, #8]
fmul d7, d7, d6
str d6, [x11, #8]
fadd d4, d4, d7
b.eq .LBB266_10
// Third tail element; d1 and d2 are overwritten here because nothing needs them afterwards.
fmul d3, d1, d3
fsub d3, d5, d3
ldr d5, [x11, #16]
fmul d1, d1, d5
fmul d2, d2, d3
fadd d1, d2, d1
ldr d2, [x9, #16]
fmul d2, d2, d1
str d1, [x11, #16]
fadd d4, d4, d2
.LBB266_10:
// Result = d4 (lane 0 + tail products) + lane 1 of the vector sum.
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
.LBB266_11:
.cfi_restore_state
.cfi_remember_state
// Fewer than four remaining elements: start with a zero sum and go straight to the tail.
movi v0.2d, #0000000000000000
cbnz x8, .LBB266_7
.LBB266_12:
// No tail: result = lane 0 + lane 1 of the vector sum.
fmov d4, d0
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
.LBB266_13:
.cfi_restore_state
// Panic path for x1 <= 1: x0 = address of an anonymous constant, w1 = 30, x2 = address of a
// second anonymous constant. NOTE(review): presumably message pointer, message length and
// source location -- confirm. No code follows the call, so it is presumably non-returning.
adrp x0, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.526
add x0, x0, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.526
adrp x2, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.527
add x2, x2, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.527
mov w1, #30
bl core::panicking::panic
.LBB266_14:
// Argument setup for core::slice::index::slice_index_fail: w0 = 2, x1 = first count (unchanged),
// x2 = second count, x3 = address of an anonymous constant. The call instruction itself
// follows this block.
adrp x8, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.528
add x8, x8, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.528
mov w0, #2
mov x2, x3
mov x3, x8
bl core::slice::index::slice_index_fail
Benchmark results:
Artifact: x86_64+avx2+fma-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir -- x86-64 compiler output, Intel syntax ("x86_64+avx2+fma-base").
# Register usage as read from this listing:
#   rdi / rsi : pointer to and element count of an f64 buffer that is updated in place
#   rdx / rcx : pointer to and element count of a second f64 buffer that is only read
#   xmm0, xmm1: two f64 scalar inputs; the result is returned in xmm0
# NOTE(review): presumably rdi/rsi and rdx/rcx are Rust slices -- confirm against the Rust source.
# Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
# rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
# matching elements of the second buffer are summed into the return value.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# Moves rsp down by 8; the stored value is never read (pop rax on the return paths undoes it).
# Presumably emitted to keep rsp 16-byte aligned at the call sites below.
push rax
.cfi_def_cfa_offset 16
# Fewer than two elements in the first buffer (unsigned rsi <= 1) -> panic path.
cmp rsi, 1
jbe .LBB267_13
# xmm4 = xmm1*xmm1; xmm2 = [.LCPI267_0] - xmm4.
# NOTE(review): .LCPI267_0 is presumably 1.0 -- the constant pool is not shown.
vmulsd xmm4, xmm1, xmm1
vmovsd xmm2, qword ptr [rip + .LCPI267_0]
vsubsd xmm2, xmm2, xmm4
# xmm3 = old element 1.
vmovsd xmm3, qword ptr [rdi + 8]
# element 0 = xmm0*xmm1.
vmulsd xmm5, xmm0, xmm1
vmovsd qword ptr [rdi], xmm5
# element 1 = xmm0*xmm2 + xmm1*(old element 1).
vmulsd xmm0, xmm0, xmm2
vmulsd xmm6, xmm1, xmm3
vaddsd xmm0, xmm0, xmm6
vmovsd qword ptr [rdi + 8], xmm0
# rsi > rcx (unsigned): the first count exceeds the second -> slice index failure path.
cmp rsi, rcx
ja .LBB267_14
# xmm3 = old element 1 - new element 0; xmm3 is the value carried from element to element.
vsubsd xmm3, xmm3, xmm5
# rsi = elements still to process; ecx = rsi mod 4 (scalar tail length).
add rsi, -2
mov ecx, esi
and ecx, 3
# rax = rsi & 0x0FFFFFFFFFFFFFFC: number of elements handled four at a time.
movabs rax, 1152921504606846972
and rax, rsi
je .LBB267_3
# Loop constants: xmm5 = xmm4*xmm4; xmm6 = xmm1 XOR [.LCPI267_1].
# NOTE(review): .LCPI267_1 is presumably a sign-bit mask, making xmm6 = -xmm1 -- confirm.
vmulsd xmm5, xmm4, xmm4
vxorpd xmm6, xmm1, xmmword ptr [rip + .LCPI267_1]
# Broadcast xmm1, xmm2, xmm6 and xmm4 to both lanes of xmm7, xmm8, xmm9 and xmm10.
vmovddup xmm7, xmm1
vmovddup xmm8, xmm2
vmovddup xmm9, xmm6
vmovddup xmm10, xmm4
# r8 = (rsi*8) & 0x7FFFFFFFFFFFFFE0: byte length of the four-element groups.
shl rsi, 3
movabs r8, 9223372036854775776
and r8, rsi
# xmm0 = two-lane running sum of products; rsi = byte offset into both buffers.
vxorpd xmm0, xmm0, xmm0
xor esi, esi
.p2align 4
.LBB267_5:
# One iteration updates four elements at byte offsets rsi+16 .. rsi+47 of the first buffer
# and advances the carried value xmm3 across all four.
cmp r8, rsi
je .LBB267_7
vmovupd xmm11, xmmword ptr [rdi + rsi + 16]
vmovapd xmm12, xmm3
vfmadd213sd xmm12, xmm6, xmm11
vmulpd xmm13, xmm11, xmm7
# xmm12 = (carried value, carried value after one step).
vunpcklpd xmm12, xmm3, xmm12
# xmm13 = first two updated elements.
vfmadd231pd xmm13, xmm8, xmm12
vmovsd xmm14, qword ptr [rdi + rsi + 32]
vfmadd213pd xmm11, xmm9, xmmword ptr [rdi + rsi + 24]
vmulpd xmm15, xmm7, xmmword ptr [rdi + rsi + 32]
vfmadd213pd xmm12, xmm10, xmm11
# xmm15 = last two updated elements.
vfmadd231pd xmm15, xmm8, xmm12
vfmadd213sd xmm14, xmm6, qword ptr [rdi + rsi + 40]
vfmadd231sd xmm14, xmm4, xmm11
# xmm3 = carried value after these four elements.
vfmadd213sd xmm3, xmm5, xmm14
# Write the four updated elements back.
vmovupd xmmword ptr [rdi + rsi + 16], xmm13
vmovupd xmmword ptr [rdi + rsi + 32], xmm15
# Multiply by the matching elements of the second buffer and accumulate.
vmulpd xmm11, xmm13, xmmword ptr [rdx + rsi + 16]
vmulpd xmm12, xmm15, xmmword ptr [rdx + rsi + 32]
vaddpd xmm11, xmm11, xmm12
vaddpd xmm0, xmm11, xmm0
add rsi, 32
cmp r8, rsi
jne .LBB267_5
.LBB267_7:
# Vector loop done; return directly if no tail elements remain.
test rcx, rcx
je .LBB267_8
.LBB267_9:
# Scalar tail: ecx (1..3) elements remain, at element index rax + 2 in both buffers.
# new = xmm2*carry + xmm1*old.
vmulsd xmm4, xmm2, xmm3
vmovsd xmm5, qword ptr [rdi + 8*rax + 16]
vmulsd xmm6, xmm1, xmm5
vaddsd xmm4, xmm4, xmm6
vmovsd qword ptr [rdi + 8*rax + 16], xmm4
vmulsd xmm4, xmm4, qword ptr [rdx + 8*rax + 16]
# xmm4 = lane 0 of the vector sum + this product; lane 1 is added at .LBB267_12.
vaddsd xmm4, xmm0, xmm4
cmp ecx, 1
je .LBB267_12
# Second tail element: carry = old - xmm1*carry, then the same update.
vmulsd xmm3, xmm1, xmm3
vsubsd xmm3, xmm5, xmm3
vmulsd xmm6, xmm2, xmm3
vmovsd xmm5, qword ptr [rdi + 8*rax + 24]
vmulsd xmm7, xmm1, xmm5
vaddsd xmm6, xmm6, xmm7
vmovsd qword ptr [rdi + 8*rax + 24], xmm6
vmulsd xmm6, xmm6, qword ptr [rdx + 8*rax + 24]
vaddsd xmm4, xmm4, xmm6
cmp ecx, 2
je .LBB267_12
# Third tail element; xmm1 and xmm2 are overwritten because nothing needs them afterwards.
vmulsd xmm3, xmm1, xmm3
vsubsd xmm3, xmm5, xmm3
vmulsd xmm2, xmm2, xmm3
vmulsd xmm1, xmm1, qword ptr [rdi + 8*rax + 32]
vaddsd xmm1, xmm2, xmm1
vmovsd qword ptr [rdi + 8*rax + 32], xmm1
vmulsd xmm1, xmm1, qword ptr [rdx + 8*rax + 32]
vaddsd xmm4, xmm4, xmm1
.LBB267_12:
# Result = xmm4 (lane 0 + tail products) + lane 1 of the vector sum.
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm4, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
.LBB267_3:
.cfi_def_cfa_offset 16
# Fewer than four remaining elements: start with a zero sum and go straight to the tail.
vxorpd xmm0, xmm0, xmm0
test rcx, rcx
jne .LBB267_9
.LBB267_8:
# No tail: result = lane 0 + lane 1 of the vector sum.
vmovapd xmm4, xmm0
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm4, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
.LBB267_13:
.cfi_def_cfa_offset 16
# Panic path for rsi <= 1: rdi = address of an anonymous constant, esi = 30, rdx = address of
# a second anonymous constant. NOTE(review): presumably message pointer, message length and
# source location -- confirm. No code follows the call, so it is presumably non-returning.
lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.543]
lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.544]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB267_14:
# Argument setup for core::slice::index::slice_index_fail: edi = 2, rsi = first count
# (unchanged), rdx = second count, rcx = address of an anonymous constant. The call
# instruction itself follows this block.
lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.545]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64+avx2+fma-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir -- x86-64 compiler output, Intel syntax ("x86_64+avx2+fma-merge").
# Register usage as read from this listing:
#   rdi / rsi : pointer to and element count of an f64 buffer that is updated in place
#   rdx / rcx : pointer to and element count of a second f64 buffer that is only read
#   xmm0, xmm1: two f64 scalar inputs; the result is returned in xmm0
# NOTE(review): presumably rdi/rsi and rdx/rcx are Rust slices -- confirm against the Rust source.
# Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
# rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
# matching elements of the second buffer are summed into the return value.
# This listing indexes the loop by element (r8, scaled by 8) and keeps two block counts:
# rax for the first buffer and rsi for the second.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# Moves rsp down by 8; the stored value is never read (pop rax on the return paths undoes it).
# Presumably emitted to keep rsp 16-byte aligned at the call sites below.
push rax
.cfi_def_cfa_offset 16
# Fewer than two elements in the first buffer (unsigned rsi <= 1) -> panic path.
cmp rsi, 1
jbe .LBB268_13
# xmm4 = xmm1*xmm1; xmm2 = [.LCPI268_0] - xmm4.
# NOTE(review): .LCPI268_0 is presumably 1.0 -- the constant pool is not shown.
vmulsd xmm4, xmm1, xmm1
vmovsd xmm2, qword ptr [rip + .LCPI268_0]
vsubsd xmm2, xmm2, xmm4
# xmm3 = old element 1.
vmovsd xmm3, qword ptr [rdi + 8]
# element 0 = xmm0*xmm1.
vmulsd xmm5, xmm0, xmm1
vmovsd qword ptr [rdi], xmm5
# element 1 = xmm0*xmm2 + xmm1*(old element 1).
vmulsd xmm0, xmm0, xmm2
vmulsd xmm6, xmm1, xmm3
vaddsd xmm0, xmm0, xmm6
vmovsd qword ptr [rdi + 8], xmm0
# rsi > rcx (unsigned): the first count exceeds the second -> slice index failure path.
cmp rsi, rcx
ja .LBB268_14
# xmm3 = old element 1 - new element 0; xmm3 is the value carried from element to element.
vsubsd xmm3, xmm3, xmm5
# rsi = elements still to process; ecx = rsi mod 4 (scalar tail length).
add rsi, -2
mov ecx, esi
and ecx, 3
# rax = rsi & 0x0FFFFFFFFFFFFFFC (first-buffer block count).
movabs rax, 1152921504606846972
and rax, rsi
# rsi = rsi & -4 (second-buffer block count); its zero flag decides the branch below.
and rsi, -4
je .LBB268_3
# Loop constants: xmm5 = xmm4*xmm4; xmm6 = xmm1 XOR [.LCPI268_1].
# NOTE(review): .LCPI268_1 is presumably a sign-bit mask, making xmm6 = -xmm1 -- confirm.
vmulsd xmm5, xmm4, xmm4
vxorpd xmm6, xmm1, xmmword ptr [rip + .LCPI268_1]
# Broadcast xmm1, xmm2, xmm6 and xmm4 to both lanes of xmm7, xmm8, xmm9 and xmm10.
vmovddup xmm7, xmm1
vmovddup xmm8, xmm2
vmovddup xmm9, xmm6
vmovddup xmm10, xmm4
# xmm0 = two-lane running sum of products; r8 = element index into both buffers.
vxorpd xmm0, xmm0, xmm0
xor r8d, r8d
.p2align 4
.LBB268_5:
# One iteration updates elements r8+2 .. r8+5 of the first buffer and advances the carried
# value xmm3 across all four. The top test compares r8 with rax, the bottom test with rsi.
cmp rax, r8
je .LBB268_7
vmovupd xmm11, xmmword ptr [rdi + 8*r8 + 16]
vmovapd xmm12, xmm3
vfmadd213sd xmm12, xmm6, xmm11
vmulpd xmm13, xmm11, xmm7
# xmm12 = (carried value, carried value after one step).
vunpcklpd xmm12, xmm3, xmm12
# xmm13 = first two updated elements.
vfmadd231pd xmm13, xmm8, xmm12
vmovsd xmm14, qword ptr [rdi + 8*r8 + 32]
vfmadd213pd xmm11, xmm9, xmmword ptr [rdi + 8*r8 + 24]
vmulpd xmm15, xmm7, xmmword ptr [rdi + 8*r8 + 32]
vfmadd213pd xmm12, xmm10, xmm11
# xmm15 = last two updated elements.
vfmadd231pd xmm15, xmm8, xmm12
vfmadd213sd xmm14, xmm6, qword ptr [rdi + 8*r8 + 40]
vfmadd231sd xmm14, xmm4, xmm11
# xmm3 = carried value after these four elements.
vfmadd213sd xmm3, xmm5, xmm14
# Write the four updated elements back.
vmovupd xmmword ptr [rdi + 8*r8 + 16], xmm13
vmovupd xmmword ptr [rdi + 8*r8 + 32], xmm15
# Multiply by the matching elements of the second buffer and accumulate.
vmulpd xmm11, xmm13, xmmword ptr [rdx + 8*r8 + 16]
vmulpd xmm12, xmm15, xmmword ptr [rdx + 8*r8 + 32]
vaddpd xmm11, xmm11, xmm12
vaddpd xmm0, xmm11, xmm0
add r8, 4
cmp rsi, r8
jne .LBB268_5
.LBB268_7:
# Vector loop done; return directly if no tail elements remain.
test rcx, rcx
je .LBB268_8
.LBB268_9:
# Scalar tail: ecx (1..3) elements remain. The first buffer is indexed with rax and the
# second with rsi. new = xmm2*carry + xmm1*old.
vmulsd xmm4, xmm2, xmm3
vmovsd xmm5, qword ptr [rdi + 8*rax + 16]
vmulsd xmm6, xmm1, xmm5
vaddsd xmm4, xmm4, xmm6
vmovsd qword ptr [rdi + 8*rax + 16], xmm4
vmulsd xmm4, xmm4, qword ptr [rdx + 8*rsi + 16]
# xmm4 = lane 0 of the vector sum + this product; lane 1 is added at .LBB268_12.
vaddsd xmm4, xmm0, xmm4
cmp ecx, 1
je .LBB268_12
# Second tail element: carry = old - xmm1*carry, then the same update.
vmulsd xmm3, xmm1, xmm3
vsubsd xmm3, xmm5, xmm3
vmulsd xmm6, xmm2, xmm3
vmovsd xmm5, qword ptr [rdi + 8*rax + 24]
vmulsd xmm7, xmm1, xmm5
vaddsd xmm6, xmm6, xmm7
vmovsd qword ptr [rdi + 8*rax + 24], xmm6
vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
vaddsd xmm4, xmm4, xmm6
cmp ecx, 2
je .LBB268_12
# Third tail element; xmm1 and xmm2 are overwritten because nothing needs them afterwards.
vmulsd xmm3, xmm1, xmm3
vsubsd xmm3, xmm5, xmm3
vmulsd xmm2, xmm2, xmm3
vmulsd xmm1, xmm1, qword ptr [rdi + 8*rax + 32]
vaddsd xmm1, xmm2, xmm1
vmovsd qword ptr [rdi + 8*rax + 32], xmm1
vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
vaddsd xmm4, xmm4, xmm1
.LBB268_12:
# Result = xmm4 (lane 0 + tail products) + lane 1 of the vector sum.
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm4, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
.LBB268_3:
.cfi_def_cfa_offset 16
# Fewer than four remaining elements: start with a zero sum and go straight to the tail.
vxorpd xmm0, xmm0, xmm0
test rcx, rcx
jne .LBB268_9
.LBB268_8:
# No tail: result = lane 0 + lane 1 of the vector sum.
vmovapd xmm4, xmm0
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm4, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
.LBB268_13:
.cfi_def_cfa_offset 16
# Panic path for rsi <= 1: rdi = address of an anonymous constant, esi = 30, rdx = address of
# a second anonymous constant. NOTE(review): presumably message pointer, message length and
# source location -- confirm. No code follows the call, so it is presumably non-returning.
lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.526]
lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.527]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB268_14:
# Argument setup for core::slice::index::slice_index_fail: edi = 2, rsi = first count
# (unchanged), rdx = second count, rcx = address of an anonymous constant. The call
# instruction itself follows this block.
lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.528]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir -- x86-64 compiler output, Intel syntax ("x86_64-base").
# Only legacy SSE/SSE2 encodings appear here (no VEX or FMA instructions).
# Register usage as read from this listing:
#   rdi / rsi : pointer to and element count of an f64 buffer that is updated in place
#   rdx / rcx : pointer to and element count of a second f64 buffer that is only read
#   xmm0, xmm1: two f64 scalar inputs; the result is returned in xmm0
# NOTE(review): presumably rdi/rsi and rdx/rcx are Rust slices -- confirm against the Rust source.
# Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
# rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
# matching elements of the second buffer are summed into the return value.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# Moves rsp down by 8; the stored value is never read (pop rax on the return paths undoes it).
# Presumably emitted to keep rsp 16-byte aligned at the call sites below.
push rax
.cfi_def_cfa_offset 16
# Fewer than two elements in the first buffer (unsigned rsi <= 1) -> panic path.
cmp rsi, 1
jbe .LBB267_15
# xmm4 = xmm1*xmm1; xmm2 = [.LCPI267_0] - xmm4.
# NOTE(review): .LCPI267_0 is presumably 1.0 -- the constant pool is not shown.
movapd xmm4, xmm1
mulsd xmm4, xmm1
movsd xmm2, qword ptr [rip + .LCPI267_0]
subsd xmm2, xmm4
# xmm3 = old element 1.
movsd xmm3, qword ptr [rdi + 8]
# element 0 = xmm0*xmm1.
movapd xmm5, xmm0
mulsd xmm5, xmm1
movsd qword ptr [rdi], xmm5
# element 1 = xmm0*xmm2 + xmm1*(old element 1).
mulsd xmm0, xmm2
movapd xmm6, xmm1
mulsd xmm6, xmm3
addsd xmm6, xmm0
movsd qword ptr [rdi + 8], xmm6
# rsi > rcx (unsigned): the first count exceeds the second -> slice index failure path.
cmp rsi, rcx
ja .LBB267_7
# xmm3 = old element 1 - new element 0; this is the value carried from element to element.
subsd xmm3, xmm5
# rsi = elements still to process; ecx = rsi mod 4 (scalar tail length).
add rsi, -2
mov ecx, esi
and ecx, 3
# rax = rsi & 0x0FFFFFFFFFFFFFFC: number of elements handled four at a time.
movabs rax, 1152921504606846972
and rax, rsi
je .LBB267_3
# Loop constants: xmm5 = xmm4*xmm4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 copied to both lanes.
movapd xmm5, xmm4
mulsd xmm5, xmm4
movapd xmm6, xmm2
unpcklpd xmm6, xmm2
movapd xmm7, xmm1
unpcklpd xmm7, xmm1
movapd xmm8, xmm4
unpcklpd xmm8, xmm4
# r8 = (rsi*8) & 0x7FFFFFFFFFFFFFE0: byte length of the four-element groups.
shl rsi, 3
movabs r8, 9223372036854775776
and r8, rsi
# xmm0 = two-lane running sum of products; rsi = byte offset; xmm9 = carried value in the loop.
xorpd xmm0, xmm0
xor esi, esi
movapd xmm9, xmm3
.p2align 4
.LBB267_5:
# One iteration updates four elements at byte offsets rsi+16 .. rsi+47 of the first buffer.
# The carried value enters in xmm9, and the value for the next iteration is built in xmm3.
cmp r8, rsi
je .LBB267_6
movapd xmm3, xmm1
mulsd xmm3, xmm9
movapd xmm10, xmm5
mulsd xmm10, xmm9
# Three overlapping two-element loads at element offsets +0, +1 and +2.
movupd xmm11, xmmword ptr [rdi + rsi + 16]
movupd xmm12, xmmword ptr [rdi + rsi + 24]
movupd xmm13, xmmword ptr [rdi + rsi + 32]
movapd xmm14, xmm7
mulpd xmm14, xmm11
subsd xmm11, xmm3
# xmm9 = (carried value, carried value after one step).
unpcklpd xmm9, xmm11
# xmm11 = first two updated elements.
movapd xmm11, xmm6
mulpd xmm11, xmm9
addpd xmm11, xmm14
movsd xmm3, qword ptr [rdi + rsi + 40]
subpd xmm12, xmm14
mulpd xmm9, xmm8
addpd xmm9, xmm12
# xmm9 = last two updated elements (after the next two arithmetic steps).
mulpd xmm9, xmm6
mulpd xmm13, xmm7
addpd xmm9, xmm13
# xmm3 = carried value after these four elements (after the next four steps).
mulsd xmm12, xmm4
subsd xmm3, xmm13
addsd xmm3, xmm12
addsd xmm3, xmm10
# Write the four updated elements back.
movupd xmmword ptr [rdi + rsi + 16], xmm11
movupd xmmword ptr [rdi + rsi + 32], xmm9
# Multiply by the matching elements of the second buffer and accumulate.
movupd xmm10, xmmword ptr [rdx + rsi + 16]
movupd xmm12, xmmword ptr [rdx + rsi + 32]
mulpd xmm10, xmm11
mulpd xmm12, xmm9
addpd xmm12, xmm10
addpd xmm0, xmm12
add rsi, 32
movapd xmm9, xmm3
cmp r8, rsi
jne .LBB267_5
jmp .LBB267_9
.LBB267_3:
# Fewer than four remaining elements: start with a zero sum.
xorpd xmm0, xmm0
.LBB267_9:
# Return directly if no tail elements remain.
test rcx, rcx
je .LBB267_10
.LBB267_11:
# Scalar tail: ecx (1..3) elements remain, at element index rax + 2 in both buffers.
# new = xmm2*carry + xmm1*old, with the carry in xmm3.
movapd xmm6, xmm2
mulsd xmm6, xmm3
movsd xmm5, qword ptr [rdi + 8*rax + 16]
movapd xmm4, xmm1
mulsd xmm4, xmm5
addsd xmm4, xmm6
movsd qword ptr [rdi + 8*rax + 16], xmm4
mulsd xmm4, qword ptr [rdx + 8*rax + 16]
# xmm4 = this product + lane 0 of the vector sum; lane 1 is added at .LBB267_14.
addsd xmm4, xmm0
cmp ecx, 1
je .LBB267_14
# Second tail element: the carry moves to xmm5 (= old - xmm1*carry).
mulsd xmm3, xmm1
subsd xmm5, xmm3
movapd xmm6, xmm2
mulsd xmm6, xmm5
movsd xmm3, qword ptr [rdi + 8*rax + 24]
movapd xmm7, xmm1
mulsd xmm7, xmm3
addsd xmm7, xmm6
movsd qword ptr [rdi + 8*rax + 24], xmm7
mulsd xmm7, qword ptr [rdx + 8*rax + 24]
addsd xmm4, xmm7
cmp ecx, 2
je .LBB267_14
# Third tail element: the carry moves back to xmm3. xmm1 and xmm2 are overwritten because
# nothing needs them afterwards.
mulsd xmm5, xmm1
subsd xmm3, xmm5
mulsd xmm2, xmm3
mulsd xmm1, qword ptr [rdi + 8*rax + 32]
addsd xmm1, xmm2
movsd qword ptr [rdi + 8*rax + 32], xmm1
mulsd xmm1, qword ptr [rdx + 8*rax + 32]
addsd xmm4, xmm1
.LBB267_14:
# Result = lane 1 of the vector sum + xmm4 (lane 0 + tail products).
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
.LBB267_6:
.cfi_def_cfa_offset 16
# Exit taken from the top-of-loop test: copy the carried value into xmm3 for the tail.
movapd xmm3, xmm9
test rcx, rcx
jne .LBB267_11
.LBB267_10:
# No tail: result = lane 1 + lane 0 of the vector sum.
movapd xmm4, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
.LBB267_15:
.cfi_def_cfa_offset 16
# Panic path for rsi <= 1: rdi = address of an anonymous constant, esi = 30, rdx = address of
# a second anonymous constant. NOTE(review): presumably message pointer, message length and
# source location -- confirm. No code follows the call, so it is presumably non-returning.
lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.543]
lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.544]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB267_7:
# Argument setup for core::slice::index::slice_index_fail: edi = 2, rsi = first count
# (unchanged), rdx = second count, rcx = address of an anonymous constant. The call
# instruction itself follows this block.
lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.545]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir -- x86-64 compiler output, Intel syntax ("x86_64-merge").
# Only legacy SSE/SSE2 encodings appear here (no VEX or FMA instructions).
# Register usage as read from this listing:
#   rdi / rsi : pointer to and element count of an f64 buffer that is updated in place
#   rdx / rcx : pointer to and element count of a second f64 buffer that is only read
#   xmm0, xmm1: two f64 scalar inputs; the result is returned in xmm0
# NOTE(review): presumably rdi/rsi and rdx/rcx are Rust slices -- confirm against the Rust source.
# Elements 0 and 1 of the first buffer are rewritten up front. Elements 2.. are then
# rewritten four at a time (plus a 0-3 element scalar tail) while their products with the
# matching elements of the second buffer are summed into the return value.
# This listing indexes the loop by element (r8, scaled by 8) and keeps two block counts:
# rax for the first buffer and rsi for the second.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# Moves rsp down by 8; the stored value is never read (pop rax on the return paths undoes it).
# Presumably emitted to keep rsp 16-byte aligned at the call sites below.
push rax
.cfi_def_cfa_offset 16
# Fewer than two elements in the first buffer (unsigned rsi <= 1) -> panic path.
cmp rsi, 1
jbe .LBB268_15
# xmm4 = xmm1*xmm1; xmm2 = [.LCPI268_0] - xmm4.
# NOTE(review): .LCPI268_0 is presumably 1.0 -- the constant pool is not shown.
movapd xmm4, xmm1
mulsd xmm4, xmm1
movsd xmm2, qword ptr [rip + .LCPI268_0]
subsd xmm2, xmm4
# xmm3 = old element 1.
movsd xmm3, qword ptr [rdi + 8]
# element 0 = xmm0*xmm1.
movapd xmm5, xmm0
mulsd xmm5, xmm1
movsd qword ptr [rdi], xmm5
# element 1 = xmm0*xmm2 + xmm1*(old element 1).
mulsd xmm0, xmm2
movapd xmm6, xmm1
mulsd xmm6, xmm3
addsd xmm6, xmm0
movsd qword ptr [rdi + 8], xmm6
# rsi > rcx (unsigned): the first count exceeds the second -> slice index failure path.
cmp rsi, rcx
ja .LBB268_7
# xmm3 = old element 1 - new element 0; this is the value carried from element to element.
subsd xmm3, xmm5
# rsi = elements still to process; ecx = rsi mod 4 (scalar tail length).
add rsi, -2
mov ecx, esi
and ecx, 3
# rax = rsi & 0x0FFFFFFFFFFFFFFC (first-buffer block count).
movabs rax, 1152921504606846972
and rax, rsi
# rsi = rsi & -4 (second-buffer block count); its zero flag decides the branch below.
and rsi, -4
je .LBB268_3
# Loop constants: xmm5 = xmm4*xmm4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 copied to both lanes.
movapd xmm5, xmm4
mulsd xmm5, xmm4
movapd xmm6, xmm2
unpcklpd xmm6, xmm2
movapd xmm7, xmm1
unpcklpd xmm7, xmm1
movapd xmm8, xmm4
unpcklpd xmm8, xmm4
# xmm0 = two-lane running sum of products; r8 = element index; xmm9 = carried value in the loop.
xorpd xmm0, xmm0
xor r8d, r8d
movapd xmm9, xmm3
.p2align 4
.LBB268_5:
# One iteration updates elements r8+2 .. r8+5 of the first buffer. The carried value enters
# in xmm9, and the value for the next iteration is built in xmm3. The top test compares r8
# with rax, the bottom test with rsi.
cmp rax, r8
je .LBB268_6
movapd xmm3, xmm1
mulsd xmm3, xmm9
movapd xmm10, xmm5
mulsd xmm10, xmm9
# Three overlapping two-element loads at element offsets +0, +1 and +2.
movupd xmm11, xmmword ptr [rdi + 8*r8 + 16]
movupd xmm12, xmmword ptr [rdi + 8*r8 + 24]
movupd xmm13, xmmword ptr [rdi + 8*r8 + 32]
movapd xmm14, xmm7
mulpd xmm14, xmm11
subsd xmm11, xmm3
# xmm9 = (carried value, carried value after one step).
unpcklpd xmm9, xmm11
# xmm11 = first two updated elements.
movapd xmm11, xmm6
mulpd xmm11, xmm9
addpd xmm11, xmm14
movsd xmm3, qword ptr [rdi + 8*r8 + 40]
subpd xmm12, xmm14
mulpd xmm9, xmm8
addpd xmm9, xmm12
# xmm9 = last two updated elements (after the next two arithmetic steps).
mulpd xmm9, xmm6
mulpd xmm13, xmm7
addpd xmm9, xmm13
# xmm3 = carried value after these four elements (after the next four steps).
mulsd xmm12, xmm4
subsd xmm3, xmm13
addsd xmm3, xmm12
addsd xmm3, xmm10
# Write the four updated elements back.
movupd xmmword ptr [rdi + 8*r8 + 16], xmm11
movupd xmmword ptr [rdi + 8*r8 + 32], xmm9
# Multiply by the matching elements of the second buffer and accumulate.
movupd xmm10, xmmword ptr [rdx + 8*r8 + 16]
movupd xmm12, xmmword ptr [rdx + 8*r8 + 32]
mulpd xmm10, xmm11
mulpd xmm12, xmm9
addpd xmm12, xmm10
addpd xmm0, xmm12
add r8, 4
movapd xmm9, xmm3
cmp rsi, r8
jne .LBB268_5
jmp .LBB268_9
.LBB268_3:
# Fewer than four remaining elements: start with a zero sum.
xorpd xmm0, xmm0
.LBB268_9:
# Return directly if no tail elements remain.
test rcx, rcx
je .LBB268_10
.LBB268_11:
# Scalar tail: ecx (1..3) elements remain. The first buffer is indexed with rax and the
# second with rsi. new = xmm2*carry + xmm1*old, with the carry in xmm3.
movapd xmm6, xmm2
mulsd xmm6, xmm3
movsd xmm5, qword ptr [rdi + 8*rax + 16]
movapd xmm4, xmm1
mulsd xmm4, xmm5
addsd xmm4, xmm6
movsd qword ptr [rdi + 8*rax + 16], xmm4
mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
# xmm4 = this product + lane 0 of the vector sum; lane 1 is added at .LBB268_14.
addsd xmm4, xmm0
cmp ecx, 1
je .LBB268_14
# Second tail element: the carry moves to xmm5 (= old - xmm1*carry).
mulsd xmm3, xmm1
subsd xmm5, xmm3
movapd xmm6, xmm2
mulsd xmm6, xmm5
movsd xmm3, qword ptr [rdi + 8*rax + 24]
movapd xmm7, xmm1
mulsd xmm7, xmm3
addsd xmm7, xmm6
movsd qword ptr [rdi + 8*rax + 24], xmm7
mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
addsd xmm4, xmm7
cmp ecx, 2
je .LBB268_14
# Third tail element: the carry moves back to xmm3. xmm1 and xmm2 are overwritten because
# nothing needs them afterwards.
mulsd xmm5, xmm1
subsd xmm3, xmm5
mulsd xmm2, xmm3
mulsd xmm1, qword ptr [rdi + 8*rax + 32]
addsd xmm1, xmm2
movsd qword ptr [rdi + 8*rax + 32], xmm1
mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
addsd xmm4, xmm1
.LBB268_14:
# Result = lane 1 of the vector sum + xmm4 (lane 0 + tail products).
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
.LBB268_6:
.cfi_def_cfa_offset 16
# Exit taken from the top-of-loop test: copy the carried value into xmm3 for the tail.
movapd xmm3, xmm9
test rcx, rcx
jne .LBB268_11
.LBB268_10:
# No tail: result = lane 1 + lane 0 of the vector sum.
movapd xmm4, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
.LBB268_15:
.cfi_def_cfa_offset 16
# Panic path for rsi <= 1: rdi = address of an anonymous constant, esi = 30, rdx = address of
# a second anonymous constant. NOTE(review): presumably message pointer, message length and
# source location -- confirm. No code follows the call, so it is presumably non-returning.
lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.526]
lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.527]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB268_7:
# Argument setup for core::slice::index::slice_index_fail: edi = 2, rsi = first count
# (unchanged), rdx = second count, rcx = address of an anonymous constant. The call
# instruction itself follows this block.
lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.528]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results: |
No description provided.