From 83e8321c36bbd838c5916ff9b2b15d2fb1eb4582 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 29 Jul 2025 13:03:36 -0700 Subject: [PATCH 1/2] expmt: boxed index --- rust/timsseek/src/scoring/scorer.rs | 8 ++--- rust/timsseek/tests/test_scorer.rs | 6 ++++ rust/timsseek_cli/src/config.rs | 54 +++++++++++++++++++++++++++++ rust/timsseek_cli/src/main.rs | 12 +++++-- rust/timsseek_cli/src/processing.rs | 6 ++-- rust/timsseek_rts/src/index.rs | 2 +- 6 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 rust/timsseek/tests/test_scorer.rs diff --git a/rust/timsseek/src/scoring/scorer.rs b/rust/timsseek/src/scoring/scorer.rs index b24b3e9..bba4c3c 100644 --- a/rust/timsseek/src/scoring/scorer.rs +++ b/rust/timsseek/src/scoring/scorer.rs @@ -69,9 +69,9 @@ impl SecondaryQuery { } } -pub struct Scorer> { +pub struct Scorer + ?Sized> { pub index_cycle_rt_ms: Arc<[u32]>, - pub index: I, + pub index: Box, pub tolerance: Tolerance, // The secondsty tolerance is used for ... // the secondary query and is meant to be @@ -81,7 +81,7 @@ pub struct Scorer> { pub fragmented_range: IncludedRange, } -impl> Scorer { +impl + ?Sized> Scorer { // does inlining do anything here? #[inline] fn _build_prescore(&self, item: &QueryItemToScore) -> PreScore { @@ -224,7 +224,7 @@ impl FromParallelIterator<(Option, ScoreTimings)> for IonSearc } } -impl> Scorer { +impl + ?Sized> Scorer { /// Scores a single query item by orchestrating the internal steps. /// Useful for testing or single-item processing scenarios. pub fn buffered_score( diff --git a/rust/timsseek/tests/test_scorer.rs b/rust/timsseek/tests/test_scorer.rs new file mode 100644 index 0000000..c5c179a --- /dev/null +++ b/rust/timsseek/tests/test_scorer.rs @@ -0,0 +1,6 @@ +// use adder::add_two; + +#[test] +fn it_adds_two() { + assert_eq!(1, 1); +} diff --git a/rust/timsseek_cli/src/config.rs b/rust/timsseek_cli/src/config.rs index a28e7ae..cd79b08 100644 --- a/rust/timsseek_cli/src/config.rs +++ b/rust/timsseek_cli/src/config.rs @@ -2,17 +2,71 @@ use serde::{ Deserialize, Serialize, }; +use timsseek::IonAnnot; use std::path::PathBuf; use timsquery::Tolerance; +use timsquery::models::indices::{ + ExpandedRawFrameIndex, + QuadSplittedTransposedIndex, +}; +use timsquery::GenerallyQueriable; +use std::sync::Arc; +use timsquery::IncludedRange; use crate::cli::Cli; use crate::errors; +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub enum IndexType { + #[default] + Transposed, + Expanded, +} + +pub struct IndexElements { + pub index: Box>, + pub index_cycle_rt_ms: Arc<[u32]>, + pub fragmented_range: IncludedRange, + +} + +impl IndexType { + pub fn build_index( + &self, + raw_file_path: &str, + ) -> IndexElements { + match self { + IndexType::Expanded => { + let tmp = ExpandedRawFrameIndex::from_path(raw_file_path).unwrap(); + let rts = tmp.cycle_rt_ms.clone(); + let fragmented_range = tmp.fragmented_range(); + IndexElements { + index: Box::new(tmp), + index_cycle_rt_ms: rts, + fragmented_range, + } + } + IndexType::Transposed => { + let tmp = QuadSplittedTransposedIndex::from_path(raw_file_path).unwrap(); + let rts = tmp.cycle_rt_ms.clone(); + let fragmented_range = tmp.fragmented_range(); + IndexElements { + index: Box::new(tmp), + index_cycle_rt_ms: rts, + fragmented_range, + } + } + } + } +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Config { pub input: Option, pub analysis: AnalysisConfig, pub output: Option, + #[serde(default)] + pub index_type: IndexType, } #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/rust/timsseek_cli/src/main.rs b/rust/timsseek_cli/src/main.rs index a0447da..bbe054f 100644 --- a/rust/timsseek_cli/src/main.rs +++ b/rust/timsseek_cli/src/main.rs @@ -106,6 +106,11 @@ fn main() -> std::result::Result<(), errors::CliError> { .unwrap(); let fragmented_range = index.fragmented_range(); + let index_elems = config.index_type.build_index(dotd_file_location + .clone() + .unwrap() // TODO: Error handling + .to_str() + .expect("Path is not convertable to string")); // Process based on input type match config.input { @@ -118,14 +123,14 @@ fn main() -> std::result::Result<(), errors::CliError> { // } Some(InputConfig::Speclib { path }) => { let scorer = Scorer { - index_cycle_rt_ms: index.cycle_rt_ms.clone(), - index, + index_cycle_rt_ms: index_elems.index_cycle_rt_ms, + index: index_elems.index, tolerance: config.analysis.tolerance.clone(), secondary_tolerance: config .analysis .tolerance .with_rt_tolerance(RtTolerance::Minutes((0.5, 0.5))), - fragmented_range, + fragmented_range: index_elems.fragmented_range, }; processing::process_speclib(path, &scorer, config.analysis.chunk_size, &output_config) .unwrap(); @@ -139,3 +144,4 @@ fn main() -> std::result::Result<(), errors::CliError> { Ok(()) } + diff --git a/rust/timsseek_cli/src/processing.rs b/rust/timsseek_cli/src/processing.rs index 164eaef..77c09ab 100644 --- a/rust/timsseek_cli/src/processing.rs +++ b/rust/timsseek_cli/src/processing.rs @@ -20,7 +20,7 @@ use tracing::{ info, }; -pub fn main_loop>( +pub fn main_loop + ?Sized>( // query_iterator: impl ExactSizeIterator, // # I would like this to be streaming query_iterator: Speclib, @@ -70,9 +70,9 @@ pub fn main_loop>( Ok(()) } -pub fn process_speclib( +pub fn process_speclib + ?Sized>( path: PathBuf, - scorer: &Scorer, + scorer: &Scorer, chunk_size: usize, output: &OutputConfig, ) -> std::result::Result<(), TimsSeekError> { diff --git a/rust/timsseek_rts/src/index.rs b/rust/timsseek_rts/src/index.rs index ec97385..d1effca 100644 --- a/rust/timsseek_rts/src/index.rs +++ b/rust/timsseek_rts/src/index.rs @@ -35,7 +35,7 @@ pub fn new_index( Ok(Scorer { index_cycle_rt_ms: ref_time_ms, - index, + index: Box::new(index), tolerance: tolerance.clone(), secondary_tolerance: tolerance.with_rt_tolerance( timsquery::models::tolerance::RtTolerance::Minutes((0.5, 0.5)), From b25dddc8538d909be774bd4207f6d964de22aa70 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 29 Jul 2025 13:40:33 -0700 Subject: [PATCH 2/2] fix: make default transposed index centroided --- rust/timsseek_cli/src/config.rs | 5 ++++- rust/timsseek_cli/src/main.rs | 10 ---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/rust/timsseek_cli/src/config.rs b/rust/timsseek_cli/src/config.rs index cd79b08..49e8dbe 100644 --- a/rust/timsseek_cli/src/config.rs +++ b/rust/timsseek_cli/src/config.rs @@ -37,6 +37,9 @@ impl IndexType { ) -> IndexElements { match self { IndexType::Expanded => { + // Throughput seems to be ~ 30% better if I use the centroided version + // But I like the idea of having the full resolution data available + + // for the high throughput use case, we have the transposed index. let tmp = ExpandedRawFrameIndex::from_path(raw_file_path).unwrap(); let rts = tmp.cycle_rt_ms.clone(); let fragmented_range = tmp.fragmented_range(); @@ -47,7 +50,7 @@ impl IndexType { } } IndexType::Transposed => { - let tmp = QuadSplittedTransposedIndex::from_path(raw_file_path).unwrap(); + let tmp = QuadSplittedTransposedIndex::from_path_centroided(raw_file_path).unwrap(); let rts = tmp.cycle_rt_ms.clone(); let fragmented_range = tmp.fragmented_range(); IndexElements { diff --git a/rust/timsseek_cli/src/main.rs b/rust/timsseek_cli/src/main.rs index bbe054f..f3d599c 100644 --- a/rust/timsseek_cli/src/main.rs +++ b/rust/timsseek_cli/src/main.rs @@ -95,17 +95,7 @@ fn main() -> std::result::Result<(), errors::CliError> { }; let dotd_file_location = &config.analysis.dotd_file; - let index = QuadSplittedTransposedIndex::from_path_centroided( - // let index = QuadSplittedTransposedIndex::from_path( - dotd_file_location - .clone() - .unwrap() // TODO: Error handling - .to_str() - .expect("Path is not convertable to string"), - ) - .unwrap(); - let fragmented_range = index.fragmented_range(); let index_elems = config.index_type.build_index(dotd_file_location .clone() .unwrap() // TODO: Error handling