Skip to content

Conversation

@cm-ayf
Copy link
Contributor

@cm-ayf cm-ayf commented Nov 18, 2025

No description provided.

@cm-ayf cm-ayf self-assigned this Nov 18, 2025
@codecov
Copy link

codecov bot commented Nov 18, 2025

Codecov Report

❌ Patch coverage is 96.49123% with 4 lines in your changes missing coverage. Please review.
✅ Project coverage is 71.28%. Comparing base (55b2214) to head (ca9223a).

Files with missing lines Patch % Lines
src/mlpg_adjust/mlpg.rs 96.47% 3 Missing ⚠️
src/mlpg_adjust/mod.rs 94.73% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main     #116      +/-   ##
==========================================
- Coverage   71.46%   71.28%   -0.18%     
==========================================
  Files          37       37              
  Lines        1675     1689      +14     
==========================================
+ Hits         1197     1204       +7     
- Misses        478      485       +7     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@github-actions
Copy link

mlsafir diagnostics

ref: ca9223acd372ea6a4b4901938f56647cea23fd61

Artifact: aarch64-base

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	2
.type	jbonsai::vocoder::mlsa::fir,@function
// jbonsai::vocoder::mlsa::fir -- compiler-generated AArch64 listing
// (artifact "aarch64-base").
//
// Register roles as they are used below. The Rust signature is not visible
// in this listing, so the slice/length interpretation is an inference
// (TODO confirm against the Rust source):
//   x0 = base of an f64 buffer that is read and rewritten in place
//   x1 = presumably the element count of that buffer (x1 <= 1 panics)
//   x2 = base of a second f64 array that is only read
//   x3 = presumably the element count of the second array (x1 > x3 fails)
//   d0 = scalar input, d1 = scalar factor
// Result (d0): sum over i in 2..x1 of (updated buffer[i]) * (x2[i]).
// Elements 0 and 1 of the buffer are rewritten but do not enter the sum.
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	// Frame record: push x29/x30 and point x29 at it.
	stp x29, x30, [sp, #-16]!
	.cfi_def_cfa_offset 16
	mov x29, sp
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_remember_state
	// Guard: x1 <= 1 (unsigned) goes to the panic stub.
	cmp x1, #1
	b.ls .LBB265_13
	// Prologue arithmetic. d4 = d1*d1, d2 = 1 - d1*d1, d3 = old buffer[1].
	// The "cmp x1, x3" flags are consumed by the b.hi after the stp; only
	// floating-point instructions and a store sit in between.
	fmul d4, d1, d1
	fmov d2, #1.00000000
	ldr d3, [x0, #8]
	cmp x1, x3
	fmul d6, d1, d3
	fsub d2, d2, d4
	fmul d5, d0, d2
	fmul d0, d0, d1
	fadd d5, d5, d6
	// buffer[0] = d0_in * d1 ; buffer[1] = d0_in * (1 - d1^2) + d1 * old buffer[1]
	// Note: this store happens before the x1 > x3 failure branch is taken.
	stp d0, d5, [x0]
	b.hi .LBB265_14
	// d3 = old buffer[1] - new buffer[0]; it is the scalar carried from one
	// element to the next for the rest of the function.
	fsub d3, d3, d0
	// Remaining work covers x1 - 2 elements starting at index 2:
	//   x10 = count rounded down to a multiple of 4 (vector-loop elements)
	//   x8  = count & 3 (scalar tail elements)
	//   x11 = &buffer[2], x9 = &x2[2]
	sub x12, x1, #2
	add x9, x2, #16
	ands x10, x12, #0xffffffffffffffc
	and x8, x12, #0x3
	add x11, x0, #16
	b.eq .LBB265_11
	// Loop setup: d5 = d1^4, d6 = -d1, v0 = {0, 0} two-lane accumulator.
	// x12 becomes the negated rounded count and is stepped up by 4 to zero.
	fmul d5, d4, d4
	fneg d6, d1
	movi v0.2d, #0000000000000000
	and x12, x12, #0xffffffffffffffc
	mov x13, x9
	mov x14, x11
	neg x12, x12
.LBB265_4:
	// Vector loop: four elements per iteration. x14 walks the buffer and
	// x13 walks the second array, both post-incremented by 32 bytes.
	// The counter is tested here and again by the b.ne at the bottom.
	cbz x12, .LBB265_6
	// q7 = buffer[i..i+2], q17 = buffer[i+1..i+3] (overlapping, unaligned).
	ldr q7, [x14]
	ldur q17, [x14, #8]
	mov v18.16b, v3.16b
	// Flags from this adds feed the b.ne at the end of the loop body; no
	// instruction in between writes the condition flags.
	adds x12, x12, #4
	fmadd d16, d6, d3, d7
	fmla v17.2d, v7.2d, v6.d[0]
	fmul v7.2d, v7.2d, v1.d[0]
	mov v18.d[1], v16.d[0]
	mov v19.16b, v17.16b
	ldr q16, [x14, #16]
	fmul v20.2d, v16.2d, v1.d[0]
	mov d16, v16.d[1]
	fmla v19.2d, v18.2d, v4.d[0]
	fmla v7.2d, v18.2d, v2.d[0]
	ldr d18, [x14, #16]
	fmadd d16, d6, d18, d16
	fmla v20.2d, v19.2d, v2.d[0]
	ldp q19, q21, [x13], #32
	fmadd d16, d4, d17, d16
	fmul v18.2d, v7.2d, v19.2d
	fmul v19.2d, v20.2d, v21.2d
	// Write back the four updated buffer values and advance x14.
	stp q7, q20, [x14], #32
	// New carried scalar for the next group of four.
	fmadd d3, d5, d3, d16
	// Accumulate the four products pairwise into the two lanes of v0.
	fadd v17.2d, v18.2d, v19.2d
	fadd v0.2d, v0.2d, v17.2d
	b.ne .LBB265_4
.LBB265_6:
	cbz x8, .LBB265_12
.LBB265_7:
	// Scalar tail for the last x8 (1..3) elements. x10/x9 are moved past the
	// elements handled by the vector loop (x10 elements * 8 bytes).
	// Per element: new = d2*carry + d1*old; sum += new * x2[i];
	//              carry = old - d1*carry.
	// d4 starts as lane 0 of the vector accumulator plus the first product.
	lsl x12, x10, #3
	fmul d4, d2, d3
	cmp x8, #1
	add x10, x11, x12
	add x9, x9, x12
	ldr d5, [x10]
	fmul d6, d1, d5
	fadd d6, d4, d6
	ldr d4, [x9]
	fmul d4, d4, d6
	str d6, [x10]
	fadd d4, d0, d4
	b.eq .LBB265_10
	// Second tail element.
	fmul d3, d1, d3
	cmp x8, #2
	fsub d3, d5, d3
	ldr d5, [x10, #8]
	fmul d7, d1, d5
	fmul d6, d2, d3
	fadd d6, d6, d7
	ldr d7, [x9, #8]
	fmul d7, d7, d6
	str d6, [x10, #8]
	fadd d4, d4, d7
	b.eq .LBB265_10
	// Third tail element (d1 and d2 are no longer needed and get reused).
	fmul d3, d1, d3
	fsub d3, d5, d3
	ldr d5, [x10, #16]
	fmul d1, d1, d5
	fmul d2, d2, d3
	fadd d1, d2, d1
	ldr d2, [x9, #16]
	fmul d2, d2, d1
	str d1, [x10, #16]
	fadd d4, d4, d2
.LBB265_10:
	// Result = (lane 0 + tail products, held in d4) + lane 1 of v0.
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB265_11:
	// Fewer than four remaining elements: zero the accumulator and go
	// straight to the tail if there is one.
	.cfi_restore_state
	.cfi_remember_state
	movi v0.2d, #0000000000000000
	cbnz x8, .LBB265_7
.LBB265_12:
	// No tail: result = lane 0 + lane 1 of the vector accumulator.
	fmov d4, d0
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB265_13:
	// Failure stub for x1 <= 1: core::panicking::panic with x0 = message
	// address, w1 = 30 (presumably the message length), x2 = a second
	// anonymous constant (presumably the source location). No code follows
	// the bl, so the callee is not expected to return.
	.cfi_restore_state
	adrp x0, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.543
	add x0, x0, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.543
	adrp x2, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.544
	add x2, x2, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.544
	mov w1, #30
	bl core::panicking::panic
.LBB265_14:
	// Failure stub for x1 > x3: slice_index_fail(2, x1, x3, <anon constant>).
	// x1 is passed through unchanged from function entry.
	adrp x8, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.545
	add x8, x8, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.545
	mov w0, #2
	mov x2, x3
	mov x3, x8
	bl core::slice::index::slice_index_fail

Benchmark results:


running 3 tests
test bonsai        ... bench:  16,545,317.00 ns/iter (+/- 63,406.78)
test bonsai_letter ... bench:  45,380,850.10 ns/iter (+/- 86,757.78)
test is_bonsai     ... bench:  25,368,750.50 ns/iter (+/- 79,254.50)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 26.29s

Artifact: aarch64-merge

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	2
.type	jbonsai::vocoder::mlsa::fir,@function
// jbonsai::vocoder::mlsa::fir -- compiler-generated AArch64 listing
// (artifact "aarch64-merge").
//
// Register roles as they are used below. The Rust signature is not visible
// in this listing, so the slice/length interpretation is an inference
// (TODO confirm against the Rust source):
//   x0 = base of an f64 buffer that is read and rewritten in place
//   x1 = presumably the element count of that buffer (x1 <= 1 panics)
//   x2 = base of a second f64 array that is only read
//   x3 = presumably the element count of the second array (x1 > x3 fails)
//   d0 = scalar input, d1 = scalar factor
// Result (d0): sum over i in 2..x1 of (updated buffer[i]) * (x2[i]).
// In this build the rounded-down trip count is derived twice with two
// different masks (x10 and x12), and the loop keeps two counters (x13, x14).
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	// Frame record: push x29/x30 and point x29 at it.
	stp x29, x30, [sp, #-16]!
	.cfi_def_cfa_offset 16
	mov x29, sp
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_remember_state
	// Guard: x1 <= 1 (unsigned) goes to the panic stub.
	cmp x1, #1
	b.ls .LBB266_13
	// Prologue arithmetic. d4 = d1*d1, d2 = 1 - d1*d1, d3 = old buffer[1].
	// The "cmp x1, x3" flags are consumed by the b.hi after the stp.
	fmul d4, d1, d1
	fmov d2, #1.00000000
	ldr d3, [x0, #8]
	cmp x1, x3
	fmul d6, d1, d3
	fsub d2, d2, d4
	fmul d5, d0, d2
	fmul d0, d0, d1
	fadd d5, d5, d6
	// buffer[0] = d0_in * d1 ; buffer[1] = d0_in * (1 - d1^2) + d1 * old buffer[1]
	// Note: this store happens before the x1 > x3 failure branch is taken.
	stp d0, d5, [x0]
	b.hi .LBB266_14
	// d3 = old buffer[1] - new buffer[0]; carried scalar for what follows.
	fsub d3, d3, d0
	// Remaining work covers x1 - 2 elements starting at index 2:
	//   x10 = count with the low two bits cleared (used for the x2 side)
	//   x12 = same, with a mask that also clears the top four bits
	//         (used for the buffer side)
	//   x8  = count & 3 (scalar tail elements)
	//   x11 = &buffer[2], x9 = &x2[2]
	// The b.eq below tests the flags from the ands (x10 == 0); the plain
	// "and" in between does not write flags.
	sub x13, x1, #2
	add x9, x2, #16
	ands x10, x13, #0xfffffffffffffffc
	and x8, x13, #0x3
	add x11, x0, #16
	and x12, x13, #0xffffffffffffffc
	b.eq .LBB266_11
	// Loop setup: d5 = d1^4, d6 = -d1, v0 = {0, 0} two-lane accumulator.
	// x14 = -x12 and x13 = -x10 are both stepped up by 4 towards zero.
	fmul d5, d4, d4
	fneg d6, d1
	movi v0.2d, #0000000000000000
	and x13, x13, #0xfffffffffffffffc
	neg x14, x12
	mov x15, x9
	neg x13, x13
	mov x16, x11
.LBB266_4:
	// Vector loop: four elements per iteration. x16 walks the buffer and
	// x15 walks the second array, both post-incremented by 32 bytes.
	// x14 is tested at the top, x13 (via adds/b.ne) at the bottom.
	cbz x14, .LBB266_6
	// q7 = buffer[i..i+2], q17 = buffer[i+1..i+3] (overlapping, unaligned).
	ldr q7, [x16]
	ldur q17, [x16, #8]
	mov v18.16b, v3.16b
	// Flags from this adds feed the b.ne at the end of the loop body; the
	// following "add" and the FP/SIMD/memory instructions do not write flags.
	adds x13, x13, #4
	add x14, x14, #4
	fmadd d16, d6, d3, d7
	fmla v17.2d, v7.2d, v6.d[0]
	fmul v7.2d, v7.2d, v1.d[0]
	mov v18.d[1], v16.d[0]
	mov v19.16b, v17.16b
	ldr q16, [x16, #16]
	fmul v20.2d, v16.2d, v1.d[0]
	mov d16, v16.d[1]
	fmla v19.2d, v18.2d, v4.d[0]
	fmla v7.2d, v18.2d, v2.d[0]
	ldr d18, [x16, #16]
	fmadd d16, d6, d18, d16
	fmla v20.2d, v19.2d, v2.d[0]
	ldp q19, q21, [x15], #32
	fmadd d16, d4, d17, d16
	fmul v18.2d, v7.2d, v19.2d
	fmul v19.2d, v20.2d, v21.2d
	// Write back the four updated buffer values and advance x16.
	stp q7, q20, [x16], #32
	// New carried scalar for the next group of four.
	fmadd d3, d5, d3, d16
	// Accumulate the four products pairwise into the two lanes of v0.
	fadd v17.2d, v18.2d, v19.2d
	fadd v0.2d, v0.2d, v17.2d
	b.ne .LBB266_4
.LBB266_6:
	cbz x8, .LBB266_12
.LBB266_7:
	// Scalar tail for the last x8 (1..3) elements. The buffer pointer is
	// advanced by x12 elements and the second-array pointer by x10 elements.
	// Per element: new = d2*carry + d1*old; sum += new * x2[i];
	//              carry = old - d1*carry.
	// d4 starts as lane 0 of the vector accumulator plus the first product.
	add x11, x11, x12, lsl #3
	fmul d4, d2, d3
	add x9, x9, x10, lsl #3
	cmp x8, #1
	ldr d5, [x11]
	fmul d6, d1, d5
	fadd d6, d4, d6
	ldr d4, [x9]
	fmul d4, d4, d6
	str d6, [x11]
	fadd d4, d0, d4
	b.eq .LBB266_10
	// Second tail element.
	fmul d3, d1, d3
	cmp x8, #2
	fsub d3, d5, d3
	ldr d5, [x11, #8]
	fmul d7, d1, d5
	fmul d6, d2, d3
	fadd d6, d6, d7
	ldr d7, [x9, #8]
	fmul d7, d7, d6
	str d6, [x11, #8]
	fadd d4, d4, d7
	b.eq .LBB266_10
	// Third tail element (d1 and d2 are no longer needed and get reused).
	fmul d3, d1, d3
	fsub d3, d5, d3
	ldr d5, [x11, #16]
	fmul d1, d1, d5
	fmul d2, d2, d3
	fadd d1, d2, d1
	ldr d2, [x9, #16]
	fmul d2, d2, d1
	str d1, [x11, #16]
	fadd d4, d4, d2
.LBB266_10:
	// Result = (lane 0 + tail products, held in d4) + lane 1 of v0.
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB266_11:
	// Fewer than four remaining elements: zero the accumulator and go
	// straight to the tail if there is one.
	.cfi_restore_state
	.cfi_remember_state
	movi v0.2d, #0000000000000000
	cbnz x8, .LBB266_7
.LBB266_12:
	// No tail: result = lane 0 + lane 1 of the vector accumulator.
	fmov d4, d0
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB266_13:
	// Failure stub for x1 <= 1: core::panicking::panic with x0 = message
	// address, w1 = 30 (presumably the message length), x2 = a second
	// anonymous constant (presumably the source location). No code follows
	// the bl, so the callee is not expected to return.
	.cfi_restore_state
	adrp x0, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.526
	add x0, x0, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.526
	adrp x2, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.527
	add x2, x2, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.527
	mov w1, #30
	bl core::panicking::panic
.LBB266_14:
	// Failure stub for x1 > x3: slice_index_fail(2, x1, x3, <anon constant>).
	// x1 is passed through unchanged from function entry.
	adrp x8, .Lanon.1c920b96ead1ce3e28b70eae9644e04a.528
	add x8, x8, :lo12:.Lanon.1c920b96ead1ce3e28b70eae9644e04a.528
	mov w0, #2
	mov x2, x3
	mov x3, x8
	bl core::slice::index::slice_index_fail

Benchmark results:


running 3 tests
test bonsai        ... bench:  15,351,624.90 ns/iter (+/- 74,412.57)
test bonsai_letter ... bench:  42,289,737.40 ns/iter (+/- 148,118.27)
test is_bonsai     ... bench:  23,580,990.90 ns/iter (+/- 76,256.62)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 24.47s

Artifact: x86_64+avx2+fma-base

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
# jbonsai::vocoder::mlsa::fir -- compiler-generated x86-64 listing, Intel
# syntax, VEX/FMA encodings (artifact "x86_64+avx2+fma-base").
#
# Register roles as they are used below. The Rust signature is not visible
# in this listing, so the slice/length interpretation is an inference
# (TODO confirm against the Rust source):
#   rdi  = base of an f64 buffer that is read and rewritten in place
#   rsi  = presumably the element count of that buffer (rsi <= 1 panics)
#   rdx  = base of a second f64 array that is only read
#   rcx  = presumably the element count of the second array (rsi > rcx fails)
#   xmm0 = scalar input, xmm1 = scalar factor
# Result (xmm0): sum over i in 2..rsi of (updated buffer[i]) * (rdx[i]).
# .LCPI267_0 / .LCPI267_1 are constant-pool entries not shown in this listing.
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# Moves rsp down by 8; the pushed value is never read back for its
	# content (presumably stack alignment for the calls in the failure stubs).
	push rax
	.cfi_def_cfa_offset 16
	# Guard: rsi <= 1 (unsigned) goes to the panic stub.
	cmp rsi, 1
	jbe .LBB267_13
	# xmm4 = xmm1^2, xmm2 = LCPI267_0 - xmm1^2 (LCPI267_0 is presumably 1.0),
	# xmm3 = old buffer[1].
	vmulsd xmm4, xmm1, xmm1
	vmovsd xmm2, qword ptr [rip + .LCPI267_0]
	vsubsd xmm2, xmm2, xmm4
	vmovsd xmm3, qword ptr [rdi + 8]
	# buffer[0] = xmm0_in * xmm1
	vmulsd xmm5, xmm0, xmm1
	vmovsd qword ptr [rdi], xmm5
	# buffer[1] = xmm0_in * xmm2 + xmm1 * old buffer[1]
	vmulsd xmm0, xmm0, xmm2
	vmulsd xmm6, xmm1, xmm3
	vaddsd xmm0, xmm0, xmm6
	vmovsd qword ptr [rdi + 8], xmm0
	# Both stores above happen before this length check can fail.
	cmp rsi, rcx
	ja .LBB267_14
	# xmm3 = old buffer[1] - new buffer[0]; carried scalar for what follows.
	vsubsd xmm3, xmm3, xmm5
	# Remaining work covers rsi - 2 elements starting at index 2:
	#   ecx = count & 3 (scalar tail elements)
	#   rax = count & (2^60 - 4): elements handled by the vector loop
	add rsi, -2
	mov ecx, esi
	and ecx, 3
	movabs rax, 1152921504606846972
	and rax, rsi
	je .LBB267_3
	# Loop setup: xmm5 = xmm1^4; xmm6 = xmm1 XOR LCPI267_1 (presumably a
	# sign-bit mask, i.e. -xmm1); xmm7..xmm10 hold the low lanes of xmm1,
	# xmm2, xmm6 and xmm4 duplicated into both lanes.
	vmulsd xmm5, xmm4, xmm4
	vxorpd xmm6, xmm1, xmmword ptr [rip + .LCPI267_1]
	vmovddup xmm7, xmm1
	vmovddup xmm8, xmm2
	vmovddup xmm9, xmm6
	vmovddup xmm10, xmm4
	# r8 = (count * 8) & (2^63 - 32): byte length covered by the vector loop.
	# rsi is then reused as the running byte offset, starting at 0.
	shl rsi, 3
	movabs r8, 9223372036854775776
	and r8, rsi
	# xmm0 = {0, 0}: two-lane accumulator.
	vxorpd xmm0, xmm0, xmm0
	xor esi, esi
	.p2align	4
.LBB267_5:
	# Vector loop: four elements (32 bytes) per iteration. The end offset
	# is compared here and again at the bottom of the loop.
	cmp r8, rsi
	je .LBB267_7
	# xmm11 = buffer[i..i+2]; later memory operands at +24 and +32 read the
	# overlapping pairs buffer[i+1..i+3] and buffer[i+2..i+4].
	vmovupd xmm11, xmmword ptr [rdi + rsi + 16]
	vmovapd xmm12, xmm3
	vfmadd213sd xmm12, xmm6, xmm11
	vmulpd xmm13, xmm11, xmm7
	vunpcklpd xmm12, xmm3, xmm12
	vfmadd231pd xmm13, xmm8, xmm12
	vmovsd xmm14, qword ptr [rdi + rsi + 32]
	vfmadd213pd xmm11, xmm9, xmmword ptr [rdi + rsi + 24]
	vmulpd xmm15, xmm7, xmmword ptr [rdi + rsi + 32]
	vfmadd213pd xmm12, xmm10, xmm11
	vfmadd231pd xmm15, xmm8, xmm12
	vfmadd213sd xmm14, xmm6, qword ptr [rdi + rsi + 40]
	vfmadd231sd xmm14, xmm4, xmm11
	# New carried scalar for the next group of four.
	vfmadd213sd xmm3, xmm5, xmm14
	# Write back the four updated buffer values (all loads for this group
	# were issued above).
	vmovupd xmmword ptr [rdi + rsi + 16], xmm13
	vmovupd xmmword ptr [rdi + rsi + 32], xmm15
	# Multiply by the matching four values of the second array and
	# accumulate pairwise into the two lanes of xmm0.
	vmulpd xmm11, xmm13, xmmword ptr [rdx + rsi + 16]
	vmulpd xmm12, xmm15, xmmword ptr [rdx + rsi + 32]
	vaddpd xmm11, xmm11, xmm12
	vaddpd xmm0, xmm11, xmm0
	add rsi, 32
	cmp r8, rsi
	jne .LBB267_5
.LBB267_7:
	test rcx, rcx
	je .LBB267_8
.LBB267_9:
	# Scalar tail for the last ecx (1..3) elements, indexed by rax (the
	# number of elements already handled by the vector loop).
	# Per element: new = xmm2*carry + xmm1*old; sum += new * rdx[i];
	#              carry = old - xmm1*carry.
	# xmm4 starts as lane 0 of the accumulator plus the first product.
	vmulsd xmm4, xmm2, xmm3
	vmovsd xmm5, qword ptr [rdi + 8*rax + 16]
	vmulsd xmm6, xmm1, xmm5
	vaddsd xmm4, xmm4, xmm6
	vmovsd qword ptr [rdi + 8*rax + 16], xmm4
	vmulsd xmm4, xmm4, qword ptr [rdx + 8*rax + 16]
	vaddsd xmm4, xmm0, xmm4
	cmp ecx, 1
	je .LBB267_12
	# Second tail element.
	vmulsd xmm3, xmm1, xmm3
	vsubsd xmm3, xmm5, xmm3
	vmulsd xmm6, xmm2, xmm3
	vmovsd xmm5, qword ptr [rdi + 8*rax + 24]
	vmulsd xmm7, xmm1, xmm5
	vaddsd xmm6, xmm6, xmm7
	vmovsd qword ptr [rdi + 8*rax + 24], xmm6
	vmulsd xmm6, xmm6, qword ptr [rdx + 8*rax + 24]
	vaddsd xmm4, xmm4, xmm6
	cmp ecx, 2
	je .LBB267_12
	# Third tail element (xmm1 and xmm2 are no longer needed and get reused).
	vmulsd xmm3, xmm1, xmm3
	vsubsd xmm3, xmm5, xmm3
	vmulsd xmm2, xmm2, xmm3
	vmulsd xmm1, xmm1, qword ptr [rdi + 8*rax + 32]
	vaddsd xmm1, xmm2, xmm1
	vmovsd qword ptr [rdi + 8*rax + 32], xmm1
	vmulsd xmm1, xmm1, qword ptr [rdx + 8*rax + 32]
	vaddsd xmm4, xmm4, xmm1
.LBB267_12:
	# Result = (lane 0 + tail products, held in xmm4) + lane 1 of xmm0.
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm4, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB267_3:
	# Fewer than four remaining elements: zero the accumulator and go
	# straight to the tail if there is one.
	.cfi_def_cfa_offset 16
	vxorpd xmm0, xmm0, xmm0
	test rcx, rcx
	jne .LBB267_9
.LBB267_8:
	# No tail: result = lane 0 + lane 1 of the accumulator.
	vmovapd xmm4, xmm0
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm4, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB267_13:
	# Failure stub for rsi <= 1: core::panicking::panic with rdi = message
	# address, esi = 30 (presumably the message length), rdx = a second
	# anonymous constant (presumably the source location). No code follows
	# the call, so the callee is not expected to return.
	.cfi_def_cfa_offset 16
	lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.543]
	lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.544]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB267_14:
	# Failure stub for rsi > rcx: slice_index_fail(2, rsi, rcx_in, <anon
	# constant>). rsi still holds its value from function entry here.
	lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.545]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  13,364,649.80 ns/iter (+/- 102,220.30)
test bonsai_letter ... bench:  36,645,569.00 ns/iter (+/- 1,029,065.64)
test is_bonsai     ... bench:  20,492,604.50 ns/iter (+/- 182,538.63)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 21.30s

Artifact: x86_64+avx2+fma-merge

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
# jbonsai::vocoder::mlsa::fir -- compiler-generated x86-64 listing, Intel
# syntax, VEX/FMA encodings (artifact "x86_64+avx2+fma-merge").
#
# Register roles as they are used below. The Rust signature is not visible
# in this listing, so the slice/length interpretation is an inference
# (TODO confirm against the Rust source):
#   rdi  = base of an f64 buffer that is read and rewritten in place
#   rsi  = presumably the element count of that buffer (rsi <= 1 panics)
#   rdx  = base of a second f64 array that is only read
#   rcx  = presumably the element count of the second array (rsi > rcx fails)
#   xmm0 = scalar input, xmm1 = scalar factor
# Result (xmm0): sum over i in 2..rsi of (updated buffer[i]) * (rdx[i]).
# In this build the rounded-down trip count is kept twice (rax and rsi) and
# the loop index r8 counts elements rather than bytes.
# .LCPI268_0 / .LCPI268_1 are constant-pool entries not shown in this listing.
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# Moves rsp down by 8; the pushed value is never read back for its
	# content (presumably stack alignment for the calls in the failure stubs).
	push rax
	.cfi_def_cfa_offset 16
	# Guard: rsi <= 1 (unsigned) goes to the panic stub.
	cmp rsi, 1
	jbe .LBB268_13
	# xmm4 = xmm1^2, xmm2 = LCPI268_0 - xmm1^2 (LCPI268_0 is presumably 1.0),
	# xmm3 = old buffer[1].
	vmulsd xmm4, xmm1, xmm1
	vmovsd xmm2, qword ptr [rip + .LCPI268_0]
	vsubsd xmm2, xmm2, xmm4
	vmovsd xmm3, qword ptr [rdi + 8]
	# buffer[0] = xmm0_in * xmm1
	vmulsd xmm5, xmm0, xmm1
	vmovsd qword ptr [rdi], xmm5
	# buffer[1] = xmm0_in * xmm2 + xmm1 * old buffer[1]
	vmulsd xmm0, xmm0, xmm2
	vmulsd xmm6, xmm1, xmm3
	vaddsd xmm0, xmm0, xmm6
	vmovsd qword ptr [rdi + 8], xmm0
	# Both stores above happen before this length check can fail.
	cmp rsi, rcx
	ja .LBB268_14
	# xmm3 = old buffer[1] - new buffer[0]; carried scalar for what follows.
	vsubsd xmm3, xmm3, xmm5
	# Remaining work covers rsi - 2 elements starting at index 2:
	#   ecx = count & 3 (scalar tail elements)
	#   rax = count & (2^60 - 4)  (indexes the buffer side)
	#   rsi = count & -4          (indexes the second-array side)
	# The je tests the result of "and rsi, -4".
	add rsi, -2
	mov ecx, esi
	and ecx, 3
	movabs rax, 1152921504606846972
	and rax, rsi
	and rsi, -4
	je .LBB268_3
	# Loop setup: xmm5 = xmm1^4; xmm6 = xmm1 XOR LCPI268_1 (presumably a
	# sign-bit mask, i.e. -xmm1); xmm7..xmm10 hold the low lanes of xmm1,
	# xmm2, xmm6 and xmm4 duplicated into both lanes.
	vmulsd xmm5, xmm4, xmm4
	vxorpd xmm6, xmm1, xmmword ptr [rip + .LCPI268_1]
	vmovddup xmm7, xmm1
	vmovddup xmm8, xmm2
	vmovddup xmm9, xmm6
	vmovddup xmm10, xmm4
	# xmm0 = {0, 0}: two-lane accumulator; r8 = element index, from 0.
	vxorpd xmm0, xmm0, xmm0
	xor r8d, r8d
	.p2align	4
.LBB268_5:
	# Vector loop: four elements per iteration. r8 is compared against rax
	# here and against rsi at the bottom of the loop.
	cmp rax, r8
	je .LBB268_7
	# xmm11 = buffer[i..i+2]; later memory operands at +24 and +32 read the
	# overlapping pairs buffer[i+1..i+3] and buffer[i+2..i+4].
	vmovupd xmm11, xmmword ptr [rdi + 8*r8 + 16]
	vmovapd xmm12, xmm3
	vfmadd213sd xmm12, xmm6, xmm11
	vmulpd xmm13, xmm11, xmm7
	vunpcklpd xmm12, xmm3, xmm12
	vfmadd231pd xmm13, xmm8, xmm12
	vmovsd xmm14, qword ptr [rdi + 8*r8 + 32]
	vfmadd213pd xmm11, xmm9, xmmword ptr [rdi + 8*r8 + 24]
	vmulpd xmm15, xmm7, xmmword ptr [rdi + 8*r8 + 32]
	vfmadd213pd xmm12, xmm10, xmm11
	vfmadd231pd xmm15, xmm8, xmm12
	vfmadd213sd xmm14, xmm6, qword ptr [rdi + 8*r8 + 40]
	vfmadd231sd xmm14, xmm4, xmm11
	# New carried scalar for the next group of four.
	vfmadd213sd xmm3, xmm5, xmm14
	# Write back the four updated buffer values (all loads for this group
	# were issued above).
	vmovupd xmmword ptr [rdi + 8*r8 + 16], xmm13
	vmovupd xmmword ptr [rdi + 8*r8 + 32], xmm15
	# Multiply by the matching four values of the second array and
	# accumulate pairwise into the two lanes of xmm0.
	vmulpd xmm11, xmm13, xmmword ptr [rdx + 8*r8 + 16]
	vmulpd xmm12, xmm15, xmmword ptr [rdx + 8*r8 + 32]
	vaddpd xmm11, xmm11, xmm12
	vaddpd xmm0, xmm11, xmm0
	add r8, 4
	cmp rsi, r8
	jne .LBB268_5
.LBB268_7:
	test rcx, rcx
	je .LBB268_8
.LBB268_9:
	# Scalar tail for the last ecx (1..3) elements. Buffer accesses are
	# indexed by rax, second-array accesses by rsi.
	# Per element: new = xmm2*carry + xmm1*old; sum += new * rdx[i];
	#              carry = old - xmm1*carry.
	# xmm4 starts as lane 0 of the accumulator plus the first product.
	vmulsd xmm4, xmm2, xmm3
	vmovsd xmm5, qword ptr [rdi + 8*rax + 16]
	vmulsd xmm6, xmm1, xmm5
	vaddsd xmm4, xmm4, xmm6
	vmovsd qword ptr [rdi + 8*rax + 16], xmm4
	vmulsd xmm4, xmm4, qword ptr [rdx + 8*rsi + 16]
	vaddsd xmm4, xmm0, xmm4
	cmp ecx, 1
	je .LBB268_12
	# Second tail element.
	vmulsd xmm3, xmm1, xmm3
	vsubsd xmm3, xmm5, xmm3
	vmulsd xmm6, xmm2, xmm3
	vmovsd xmm5, qword ptr [rdi + 8*rax + 24]
	vmulsd xmm7, xmm1, xmm5
	vaddsd xmm6, xmm6, xmm7
	vmovsd qword ptr [rdi + 8*rax + 24], xmm6
	vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
	vaddsd xmm4, xmm4, xmm6
	cmp ecx, 2
	je .LBB268_12
	# Third tail element (xmm1 and xmm2 are no longer needed and get reused).
	vmulsd xmm3, xmm1, xmm3
	vsubsd xmm3, xmm5, xmm3
	vmulsd xmm2, xmm2, xmm3
	vmulsd xmm1, xmm1, qword ptr [rdi + 8*rax + 32]
	vaddsd xmm1, xmm2, xmm1
	vmovsd qword ptr [rdi + 8*rax + 32], xmm1
	vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
	vaddsd xmm4, xmm4, xmm1
.LBB268_12:
	# Result = (lane 0 + tail products, held in xmm4) + lane 1 of xmm0.
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm4, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB268_3:
	# Fewer than four remaining elements: zero the accumulator and go
	# straight to the tail if there is one.
	.cfi_def_cfa_offset 16
	vxorpd xmm0, xmm0, xmm0
	test rcx, rcx
	jne .LBB268_9
.LBB268_8:
	# No tail: result = lane 0 + lane 1 of the accumulator.
	vmovapd xmm4, xmm0
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm4, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB268_13:
	# Failure stub for rsi <= 1: core::panicking::panic with rdi = message
	# address, esi = 30 (presumably the message length), rdx = a second
	# anonymous constant (presumably the source location). No code follows
	# the call, so the callee is not expected to return.
	.cfi_def_cfa_offset 16
	lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.526]
	lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.527]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB268_14:
	# Failure stub for rsi > rcx: slice_index_fail(2, rsi, rcx_in, <anon
	# constant>). rsi still holds its value from function entry here.
	lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.528]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  11,869,957.60 ns/iter (+/- 123,454.15)
test bonsai_letter ... bench:  31,982,824.50 ns/iter (+/- 1,112,849.16)
test is_bonsai     ... bench:  17,875,838.60 ns/iter (+/- 452,810.44)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 18.68s

Artifact: x86_64-base

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
# jbonsai::vocoder::mlsa::fir -- compiler-generated x86-64 listing, Intel
# syntax, legacy two-operand SSE2 encodings (artifact "x86_64-base").
#
# Register roles as they are used below. The Rust signature is not visible
# in this listing, so the slice/length interpretation is an inference
# (TODO confirm against the Rust source):
#   rdi  = base of an f64 buffer that is read and rewritten in place
#   rsi  = presumably the element count of that buffer (rsi <= 1 panics)
#   rdx  = base of a second f64 array that is only read
#   rcx  = presumably the element count of the second array (rsi > rcx fails)
#   xmm0 = scalar input, xmm1 = scalar factor
# Result (xmm0): sum over i in 2..rsi of (updated buffer[i]) * (rdx[i]).
# SSE instructions overwrite their first operand, hence the many movapd
# copies. .LCPI267_0 is a constant-pool entry not shown in this listing.
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# Moves rsp down by 8; the pushed value is never read back for its
	# content (presumably stack alignment for the calls in the failure stubs).
	push rax
	.cfi_def_cfa_offset 16
	# Guard: rsi <= 1 (unsigned) goes to the panic stub.
	cmp rsi, 1
	jbe .LBB267_15
	# xmm4 = xmm1^2, xmm2 = LCPI267_0 - xmm1^2 (LCPI267_0 is presumably 1.0),
	# xmm3 = old buffer[1].
	movapd xmm4, xmm1
	mulsd xmm4, xmm1
	movsd xmm2, qword ptr [rip + .LCPI267_0]
	subsd xmm2, xmm4
	movsd xmm3, qword ptr [rdi + 8]
	# buffer[0] = xmm0_in * xmm1
	movapd xmm5, xmm0
	mulsd xmm5, xmm1
	movsd qword ptr [rdi], xmm5
	# buffer[1] = xmm1 * old buffer[1] + xmm0_in * xmm2
	mulsd xmm0, xmm2
	movapd xmm6, xmm1
	mulsd xmm6, xmm3
	addsd xmm6, xmm0
	movsd qword ptr [rdi + 8], xmm6
	# Both stores above happen before this length check can fail.
	cmp rsi, rcx
	ja .LBB267_7
	# xmm3 = old buffer[1] - new buffer[0]; carried scalar for what follows.
	subsd xmm3, xmm5
	# Remaining work covers rsi - 2 elements starting at index 2:
	#   ecx = count & 3 (scalar tail elements)
	#   rax = count & (2^60 - 4): elements handled by the vector loop
	add rsi, -2
	mov ecx, esi
	and ecx, 3
	movabs rax, 1152921504606846972
	and rax, rsi
	je .LBB267_3
	# Loop setup: xmm5 = xmm1^4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 with the
	# low lane duplicated into both lanes.
	movapd xmm5, xmm4
	mulsd xmm5, xmm4
	movapd xmm6, xmm2
	unpcklpd xmm6, xmm2
	movapd xmm7, xmm1
	unpcklpd xmm7, xmm1
	movapd xmm8, xmm4
	unpcklpd xmm8, xmm4
	# r8 = (count * 8) & (2^63 - 32): byte length covered by the vector loop.
	# rsi is then reused as the running byte offset, starting at 0.
	shl rsi, 3
	movabs r8, 9223372036854775776
	and r8, rsi
	# xmm0 = {0, 0}: two-lane accumulator; xmm9 = carried scalar in the loop.
	xorpd xmm0, xmm0
	xor esi, esi
	movapd xmm9, xmm3
	.p2align	4
.LBB267_5:
	# Vector loop: four elements (32 bytes) per iteration. The end offset
	# is compared here and again at the bottom of the loop. On entry to each
	# iteration the carried scalar lives in xmm9; xmm3 is scratch and ends
	# the iteration holding the next carried value.
	cmp r8, rsi
	je .LBB267_6
	movapd xmm3, xmm1
	mulsd xmm3, xmm9
	movapd xmm10, xmm5
	mulsd xmm10, xmm9
	# Three overlapping unaligned pair loads: buffer[i..i+2], [i+1..i+3],
	# [i+2..i+4].
	movupd xmm11, xmmword ptr [rdi + rsi + 16]
	movupd xmm12, xmmword ptr [rdi + rsi + 24]
	movupd xmm13, xmmword ptr [rdi + rsi + 32]
	movapd xmm14, xmm7
	mulpd xmm14, xmm11
	subsd xmm11, xmm3
	unpcklpd xmm9, xmm11
	movapd xmm11, xmm6
	mulpd xmm11, xmm9
	addpd xmm11, xmm14
	movsd xmm3, qword ptr [rdi + rsi + 40]
	subpd xmm12, xmm14
	mulpd xmm9, xmm8
	addpd xmm9, xmm12
	mulpd xmm9, xmm6
	mulpd xmm13, xmm7
	addpd xmm9, xmm13
	mulsd xmm12, xmm4
	subsd xmm3, xmm13
	addsd xmm3, xmm12
	addsd xmm3, xmm10
	# Write back the four updated buffer values (all loads for this group
	# were issued above).
	movupd xmmword ptr [rdi + rsi + 16], xmm11
	movupd xmmword ptr [rdi + rsi + 32], xmm9
	# Multiply by the matching four values of the second array and
	# accumulate pairwise into the two lanes of xmm0.
	movupd xmm10, xmmword ptr [rdx + rsi + 16]
	movupd xmm12, xmmword ptr [rdx + rsi + 32]
	mulpd xmm10, xmm11
	mulpd xmm12, xmm9
	addpd xmm12, xmm10
	addpd xmm0, xmm12
	add rsi, 32
	# Hand the new carried scalar back to xmm9 for the next iteration.
	movapd xmm9, xmm3
	cmp r8, rsi
	jne .LBB267_5
	jmp .LBB267_9
.LBB267_3:
	# Fewer than four remaining elements: only the accumulator needs zeroing;
	# xmm3 already holds the carried scalar.
	xorpd xmm0, xmm0
.LBB267_9:
	test rcx, rcx
	je .LBB267_10
.LBB267_11:
	# Scalar tail for the last ecx (1..3) elements, indexed by rax (the
	# number of elements already handled by the vector loop).
	# Per element: new = xmm1*old + xmm2*carry; sum += new * rdx[i];
	#              carry = old - xmm1*carry.
	# The carry alternates between xmm3 and xmm5 from step to step.
	# xmm4 starts as the first product plus lane 0 of the accumulator.
	movapd xmm6, xmm2
	mulsd xmm6, xmm3
	movsd xmm5, qword ptr [rdi + 8*rax + 16]
	movapd xmm4, xmm1
	mulsd xmm4, xmm5
	addsd xmm4, xmm6
	movsd qword ptr [rdi + 8*rax + 16], xmm4
	mulsd xmm4, qword ptr [rdx + 8*rax + 16]
	addsd xmm4, xmm0
	cmp ecx, 1
	je .LBB267_14
	# Second tail element (carry now in xmm5).
	mulsd xmm3, xmm1
	subsd xmm5, xmm3
	movapd xmm6, xmm2
	mulsd xmm6, xmm5
	movsd xmm3, qword ptr [rdi + 8*rax + 24]
	movapd xmm7, xmm1
	mulsd xmm7, xmm3
	addsd xmm7, xmm6
	movsd qword ptr [rdi + 8*rax + 24], xmm7
	mulsd xmm7, qword ptr [rdx + 8*rax + 24]
	addsd xmm4, xmm7
	cmp ecx, 2
	je .LBB267_14
	# Third tail element (carry back in xmm3; xmm1 and xmm2 get reused).
	mulsd xmm5, xmm1
	subsd xmm3, xmm5
	mulsd xmm2, xmm3
	mulsd xmm1, qword ptr [rdi + 8*rax + 32]
	addsd xmm1, xmm2
	movsd qword ptr [rdi + 8*rax + 32], xmm1
	mulsd xmm1, qword ptr [rdx + 8*rax + 32]
	addsd xmm4, xmm1
.LBB267_14:
	# Result = lane 1 of xmm0 + (lane 0 + tail products, held in xmm4).
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB267_6:
	# Exit taken from the check at the top of the loop: the carried scalar
	# is still in xmm9, so copy it to xmm3 where the tail expects it.
	.cfi_def_cfa_offset 16
	movapd xmm3, xmm9
	test rcx, rcx
	jne .LBB267_11
.LBB267_10:
	# No tail: result = lane 1 + lane 0 of the accumulator.
	movapd xmm4, xmm0
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB267_15:
	# Failure stub for rsi <= 1: core::panicking::panic with rdi = message
	# address, esi = 30 (presumably the message length), rdx = a second
	# anonymous constant (presumably the source location). No code follows
	# the call, so the callee is not expected to return.
	.cfi_def_cfa_offset 16
	lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.543]
	lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.544]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB267_7:
	# Failure stub for rsi > rcx: slice_index_fail(2, rsi, rcx_in, <anon
	# constant>). rsi still holds its value from function entry here.
	lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.545]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  16,092,417.90 ns/iter (+/- 312,023.34)
test bonsai_letter ... bench:  44,077,186.00 ns/iter (+/- 982,797.01)
test is_bonsai     ... bench:  24,683,830.80 ns/iter (+/- 210,632.07)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 25.63s

Artifact: x86_64-merge

Assembly:

Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
# jbonsai::vocoder::mlsa::fir -- compiler-generated x86-64 listing, Intel
# syntax, legacy two-operand SSE2 encodings (artifact "x86_64-merge").
#
# Register roles as they are used below. The Rust signature is not visible
# in this listing, so the slice/length interpretation is an inference
# (TODO confirm against the Rust source):
#   rdi  = base of an f64 buffer that is read and rewritten in place
#   rsi  = presumably the element count of that buffer (rsi <= 1 panics)
#   rdx  = base of a second f64 array that is only read
#   rcx  = presumably the element count of the second array (rsi > rcx fails)
#   xmm0 = scalar input, xmm1 = scalar factor
# Result (xmm0): sum over i in 2..rsi of (updated buffer[i]) * (rdx[i]).
# In this build the rounded-down trip count is kept twice (rax and rsi) and
# the loop index r8 counts elements rather than bytes.
# .LCPI268_0 is a constant-pool entry not shown in this listing.
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# Moves rsp down by 8; the pushed value is never read back for its
	# content (presumably stack alignment for the calls in the failure stubs).
	push rax
	.cfi_def_cfa_offset 16
	# Guard: rsi <= 1 (unsigned) goes to the panic stub.
	cmp rsi, 1
	jbe .LBB268_15
	# xmm4 = xmm1^2, xmm2 = LCPI268_0 - xmm1^2 (LCPI268_0 is presumably 1.0),
	# xmm3 = old buffer[1].
	movapd xmm4, xmm1
	mulsd xmm4, xmm1
	movsd xmm2, qword ptr [rip + .LCPI268_0]
	subsd xmm2, xmm4
	movsd xmm3, qword ptr [rdi + 8]
	# buffer[0] = xmm0_in * xmm1
	movapd xmm5, xmm0
	mulsd xmm5, xmm1
	movsd qword ptr [rdi], xmm5
	# buffer[1] = xmm1 * old buffer[1] + xmm0_in * xmm2
	mulsd xmm0, xmm2
	movapd xmm6, xmm1
	mulsd xmm6, xmm3
	addsd xmm6, xmm0
	movsd qword ptr [rdi + 8], xmm6
	# Both stores above happen before this length check can fail.
	cmp rsi, rcx
	ja .LBB268_7
	# xmm3 = old buffer[1] - new buffer[0]; carried scalar for what follows.
	subsd xmm3, xmm5
	# Remaining work covers rsi - 2 elements starting at index 2:
	#   ecx = count & 3 (scalar tail elements)
	#   rax = count & (2^60 - 4)  (indexes the buffer side)
	#   rsi = count & -4          (indexes the second-array side)
	# The je tests the result of "and rsi, -4".
	add rsi, -2
	mov ecx, esi
	and ecx, 3
	movabs rax, 1152921504606846972
	and rax, rsi
	and rsi, -4
	je .LBB268_3
	# Loop setup: xmm5 = xmm1^4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 with the
	# low lane duplicated into both lanes.
	movapd xmm5, xmm4
	mulsd xmm5, xmm4
	movapd xmm6, xmm2
	unpcklpd xmm6, xmm2
	movapd xmm7, xmm1
	unpcklpd xmm7, xmm1
	movapd xmm8, xmm4
	unpcklpd xmm8, xmm4
	# xmm0 = {0, 0}: two-lane accumulator; r8 = element index, from 0;
	# xmm9 = carried scalar in the loop.
	xorpd xmm0, xmm0
	xor r8d, r8d
	movapd xmm9, xmm3
	.p2align	4
.LBB268_5:
	# Vector loop: four elements per iteration. r8 is compared against rax
	# here and against rsi at the bottom of the loop. On entry to each
	# iteration the carried scalar lives in xmm9; xmm3 is scratch and ends
	# the iteration holding the next carried value.
	cmp rax, r8
	je .LBB268_6
	movapd xmm3, xmm1
	mulsd xmm3, xmm9
	movapd xmm10, xmm5
	mulsd xmm10, xmm9
	# Three overlapping unaligned pair loads: buffer[i..i+2], [i+1..i+3],
	# [i+2..i+4].
	movupd xmm11, xmmword ptr [rdi + 8*r8 + 16]
	movupd xmm12, xmmword ptr [rdi + 8*r8 + 24]
	movupd xmm13, xmmword ptr [rdi + 8*r8 + 32]
	movapd xmm14, xmm7
	mulpd xmm14, xmm11
	subsd xmm11, xmm3
	unpcklpd xmm9, xmm11
	movapd xmm11, xmm6
	mulpd xmm11, xmm9
	addpd xmm11, xmm14
	movsd xmm3, qword ptr [rdi + 8*r8 + 40]
	subpd xmm12, xmm14
	mulpd xmm9, xmm8
	addpd xmm9, xmm12
	mulpd xmm9, xmm6
	mulpd xmm13, xmm7
	addpd xmm9, xmm13
	mulsd xmm12, xmm4
	subsd xmm3, xmm13
	addsd xmm3, xmm12
	addsd xmm3, xmm10
	# Write back the four updated buffer values (all loads for this group
	# were issued above).
	movupd xmmword ptr [rdi + 8*r8 + 16], xmm11
	movupd xmmword ptr [rdi + 8*r8 + 32], xmm9
	# Multiply by the matching four values of the second array and
	# accumulate pairwise into the two lanes of xmm0.
	movupd xmm10, xmmword ptr [rdx + 8*r8 + 16]
	movupd xmm12, xmmword ptr [rdx + 8*r8 + 32]
	mulpd xmm10, xmm11
	mulpd xmm12, xmm9
	addpd xmm12, xmm10
	addpd xmm0, xmm12
	add r8, 4
	# Hand the new carried scalar back to xmm9 for the next iteration.
	movapd xmm9, xmm3
	cmp rsi, r8
	jne .LBB268_5
	jmp .LBB268_9
.LBB268_3:
	# Fewer than four remaining elements: only the accumulator needs zeroing;
	# xmm3 already holds the carried scalar.
	xorpd xmm0, xmm0
.LBB268_9:
	test rcx, rcx
	je .LBB268_10
.LBB268_11:
	# Scalar tail for the last ecx (1..3) elements. Buffer accesses are
	# indexed by rax, second-array accesses by rsi.
	# Per element: new = xmm1*old + xmm2*carry; sum += new * rdx[i];
	#              carry = old - xmm1*carry.
	# The carry alternates between xmm3 and xmm5 from step to step.
	# xmm4 starts as the first product plus lane 0 of the accumulator.
	movapd xmm6, xmm2
	mulsd xmm6, xmm3
	movsd xmm5, qword ptr [rdi + 8*rax + 16]
	movapd xmm4, xmm1
	mulsd xmm4, xmm5
	addsd xmm4, xmm6
	movsd qword ptr [rdi + 8*rax + 16], xmm4
	mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
	addsd xmm4, xmm0
	cmp ecx, 1
	je .LBB268_14
	# Second tail element (carry now in xmm5).
	mulsd xmm3, xmm1
	subsd xmm5, xmm3
	movapd xmm6, xmm2
	mulsd xmm6, xmm5
	movsd xmm3, qword ptr [rdi + 8*rax + 24]
	movapd xmm7, xmm1
	mulsd xmm7, xmm3
	addsd xmm7, xmm6
	movsd qword ptr [rdi + 8*rax + 24], xmm7
	mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
	addsd xmm4, xmm7
	cmp ecx, 2
	je .LBB268_14
	# Third tail element (carry back in xmm3; xmm1 and xmm2 get reused).
	mulsd xmm5, xmm1
	subsd xmm3, xmm5
	mulsd xmm2, xmm3
	mulsd xmm1, qword ptr [rdi + 8*rax + 32]
	addsd xmm1, xmm2
	movsd qword ptr [rdi + 8*rax + 32], xmm1
	mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
	addsd xmm4, xmm1
.LBB268_14:
	# Result = lane 1 of xmm0 + (lane 0 + tail products, held in xmm4).
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB268_6:
	# Exit taken from the check at the top of the loop: the carried scalar
	# is still in xmm9, so copy it to xmm3 where the tail expects it.
	.cfi_def_cfa_offset 16
	movapd xmm3, xmm9
	test rcx, rcx
	jne .LBB268_11
.LBB268_10:
	# No tail: result = lane 1 + lane 0 of the accumulator.
	movapd xmm4, xmm0
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB268_15:
	# Failure stub for rsi <= 1: core::panicking::panic with rdi = message
	# address, esi = 30 (presumably the message length), rdx = a second
	# anonymous constant (presumably the source location). No code follows
	# the call, so the callee is not expected to return.
	.cfi_def_cfa_offset 16
	lea rdi, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.526]
	lea rdx, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.527]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB268_7:
	# Failure stub for rsi > rcx: slice_index_fail(2, rsi, rcx_in, <anon
	# constant>). rsi still holds its value from function entry here.
	lea rax, [rip + .Lanon.267bd5b1072c0486f439c9d3a79604fd.528]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  14,584,525.20 ns/iter (+/- 646,837.23)
test bonsai_letter ... bench:  39,581,129.80 ns/iter (+/- 1,062,756.07)
test is_bonsai     ... bench:  22,230,101.50 ns/iter (+/- 240,700.65)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 23.05s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants