From 16ce86dc42ad4b0b1d3c61c510532aec2bb352f3 Mon Sep 17 00:00:00 2001 From: Josh Date: Mon, 22 Dec 2025 15:27:37 -0500 Subject: [PATCH] metrics: track sled physical cpu usage. On sled-agent startup, begin polling physical cpu usage via the `cpu:*:sys:cpu_nsec_*` kstat. We expose this as `sled_cpu:cpu_nsec`, and label samples by cpu micro-state (user, idle, etc.). Part of #9559. --- oximeter/instruments/Cargo.toml | 3 +- oximeter/instruments/src/kstat/cpu.rs | 280 +++++++++++++++++++++++++ oximeter/instruments/src/kstat/mod.rs | 2 + oximeter/oximeter/schema/sled-cpu.toml | 46 ++++ sled-agent/src/metrics.rs | 95 ++++++++- sled-agent/src/sled_agent.rs | 12 ++ 6 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 oximeter/instruments/src/kstat/cpu.rs create mode 100644 oximeter/oximeter/schema/sled-cpu.toml diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index b0d5b35410c..5ad68d26314 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -26,7 +26,7 @@ uuid = { workspace = true, optional = true } omicron-workspace-hack.workspace = true [features] -default = ["http-instruments", "datalink"] +default = ["http-instruments", "cpu", "datalink"] http-instruments = [ "dep:chrono", "dep:dropshot", @@ -51,6 +51,7 @@ kstat = [ "dep:thiserror", "dep:uuid" ] +cpu = ["kstat"] datalink = ["kstat"] [dev-dependencies] diff --git a/oximeter/instruments/src/kstat/cpu.rs b/oximeter/instruments/src/kstat/cpu.rs new file mode 100644 index 00000000000..ca1da1b4b89 --- /dev/null +++ b/oximeter/instruments/src/kstat/cpu.rs @@ -0,0 +1,280 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Report metrics about CPU cores on the host system + +use crate::kstat::ConvertNamedData; +use crate::kstat::Error; +use crate::kstat::KstatList; +use crate::kstat::KstatTarget; +use crate::kstat::hrtime_to_utc; +use kstat_rs::Data; +use kstat_rs::Kstat; +use kstat_rs::Named; +use oximeter::FieldType; +use oximeter::FieldValue; +use oximeter::Sample; +use oximeter::Target; +use oximeter::types::Cumulative; +use uuid::Uuid; + +oximeter::use_timeseries!("sled-cpu.toml"); +pub use self::sled_cpu::SledCpu as SledCpuTarget; + +/// The prefix for CPU microstate kstat fields. +const CPU_NSEC_PREFIX: &str = "cpu_nsec_"; + +/// The CPU microstates we track from kstats. +/// +/// These correspond to the `cpu_nsec_*` fields in the `cpu::sys` kstat. +const CPU_MICROSTATES: &[&str] = &["idle", "user", "kernel", "dtrace", "intr"]; + +/// CPU metrics for a sled, tracking microstate statistics across all cores. +#[derive(Clone, Debug)] +pub struct SledCpu { + /// The target for this sled's CPUs. + pub target: SledCpuTarget, + /// Flag indicating whether the sled is synced with NTP. + pub time_synced: bool, +} + +impl SledCpu { + /// Create a new `SledCpu` with the given target and synchronization flag. + pub fn new(target: SledCpuTarget, time_synced: bool) -> Self { + Self { target, time_synced } + } + + /// Return the sled ID. 
+ pub fn sled_id(&self) -> Uuid { + self.target.sled_id + } +} + +impl KstatTarget for SledCpu { + fn interested(&self, kstat: &Kstat<'_>) -> bool { + self.time_synced && kstat.ks_module == "cpu" && kstat.ks_name == "sys" + } + + fn to_samples( + &self, + kstats: KstatList<'_, '_>, + ) -> Result<Vec<Sample>, Error> { + let mut samples = Vec::new(); + + for (creation_time, kstat, data) in kstats.iter() { + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; + let cpu_id = u32::try_from(kstat.ks_instance) + .expect("CPU instance ID should fit in u32"); + + let Data::Named(named) = data else { + return Err(Error::ExpectedNamedKstat); + }; + + for named_data in named.iter() { + let Named { name, value } = named_data; + + // Check if this is a cpu_nsec_* field we care about + let Some(state) = name.strip_prefix(CPU_NSEC_PREFIX) else { + continue; + }; + + // Only process states we know about + if !CPU_MICROSTATES.contains(&state) { + continue; + } + + let datum = value.as_u64()?; + let metric = sled_cpu::CpuNsec { + cpu_id, + state: state.to_string().into(), + datum: Cumulative::with_start_time(*creation_time, datum), + }; + let sample = Sample::new_with_timestamp( + snapshot_time, + &self.target, + &metric, + ) + .map_err(Error::Sample)?; + samples.push(sample); + } + } + + Ok(samples) + } +} + +// NOTE: Delegate to the inner target type for this implementation. 
+impl Target for SledCpu { + fn name(&self) -> &'static str { + self.target.name() + } + + fn field_names(&self) -> &'static [&'static str] { + self.target.field_names() + } + + fn field_types(&self) -> Vec<FieldType> { + self.target.field_types() + } + + fn field_values(&self) -> Vec<FieldValue> { + self.target.field_values() + } +} + +#[cfg(all(test, target_os = "illumos"))] +mod tests { + use super::*; + use crate::kstat::CollectionDetails; + use crate::kstat::KstatSampler; + use crate::kstat::TargetStatus; + use kstat_rs::Ctl; + use oximeter::Producer; + use slog::Drain; + use slog::Logger; + use std::time::Duration; + use tokio::time::Instant; + use uuid::Uuid; + use uuid::uuid; + + fn test_logger() -> Logger { + let dec = + slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter); + let drain = slog_term::FullFormat::new(dec).build().fuse(); + let log = + Logger::root(drain, slog::o!("component" => "fake-cleanup-task")); + log + } + + const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29"); + const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a"); + const SLED_MODEL: &str = "fake-gimlet"; + const SLED_REVISION: u32 = 1; + const SLED_SERIAL: &str = "fake-serial"; + + #[test] + fn test_kstat_interested() { + let target = SledCpuTarget { + rack_id: RACK_ID, + sled_id: SLED_ID, + sled_serial: SLED_SERIAL.into(), + sled_model: SLED_MODEL.into(), + sled_revision: SLED_REVISION, + }; + let mut cpu = SledCpu::new(target, false); + + let ctl = Ctl::new().unwrap(); + let ctl = ctl.update().unwrap(); + let kstat = + ctl.filter(Some("cpu"), Some(0), Some("sys")).next().unwrap(); + + // Not interested when not time synced + assert!(!cpu.interested(&kstat)); + + // Interested when time synced + cpu.time_synced = true; + assert!(cpu.interested(&kstat)); + + // Not interested in other cpu kstats (e.g., cpu:0:vm) + if let Some(vm_kstat) = + ctl.filter(Some("cpu"), Some(0), Some("vm")).next() + { + assert!(!cpu.interested(&vm_kstat)); + } + + // Not interested in 
non-cpu kstats + if let Some(other_kstat) = ctl.filter(Some("link"), None, None).next() { + assert!(!cpu.interested(&other_kstat)); + } + } + + #[test] + fn test_sled_cpu_samples() { + let target = SledCpuTarget { + rack_id: RACK_ID, + sled_id: SLED_ID, + sled_serial: SLED_SERIAL.into(), + sled_model: SLED_MODEL.into(), + sled_revision: SLED_REVISION, + }; + let cpu = SledCpu::new(target, true); + let ctl = Ctl::new().unwrap(); + let ctl = ctl.update().unwrap(); + + // Collect kstats for CPU 0 + let mut kstat = + ctl.filter(Some("cpu"), Some(0), Some("sys")).next().unwrap(); + let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap(); + let data = ctl.read(&mut kstat).unwrap(); + let samples = cpu.to_samples(&[(creation_time, kstat, data)]).unwrap(); + println!("{samples:#?}"); + + // Extract the state from each sample + let mut states: Vec<_> = samples + .iter() + .filter_map(|s| { + s.sorted_metric_fields().get("state").and_then(|f| { + match &f.value { + oximeter::FieldValue::String(s) => { + Some(s.as_ref().to_string()) + } + _ => None, + } + }) + }) + .collect(); + states.sort(); + + // Verify we got exactly one sample for each expected microstate + let mut expected: Vec<_> = + CPU_MICROSTATES.iter().map(|s| s.to_string()).collect(); + expected.sort(); + assert_eq!(states, expected); + } + + #[tokio::test] + async fn test_kstat_sampler() { + let mut sampler = KstatSampler::new(&test_logger()).unwrap(); + let target = SledCpuTarget { + rack_id: RACK_ID, + sled_id: SLED_ID, + sled_serial: SLED_SERIAL.into(), + sled_model: SLED_MODEL.into(), + sled_revision: SLED_REVISION, + }; + let cpu = SledCpu::new(target, true); + let details = CollectionDetails::never(Duration::from_secs(1)); + let id = sampler.add_target(cpu, details).await.unwrap(); + let samples: Vec<_> = sampler.produce().unwrap().collect(); + assert!(samples.is_empty()); + + // Pause time, and advance until we're notified of new samples. 
+ tokio::time::pause(); + const MAX_DURATION: Duration = Duration::from_secs(3); + const STEP_DURATION: Duration = Duration::from_secs(1); + let now = Instant::now(); + let expected_counts = loop { + tokio::time::advance(STEP_DURATION).await; + if now.elapsed() > MAX_DURATION { + panic!("Waited too long for samples"); + } + if let Some(counts) = sampler.sample_counts() { + break counts; + } + }; + let samples: Vec<_> = sampler.produce().unwrap().collect(); + println!("{samples:#?}"); + assert_eq!(samples.len(), expected_counts.total); + assert_eq!(expected_counts.overflow, 0); + + // Test status and remove behavior. + tokio::time::resume(); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { .. }, + )); + sampler.remove_target(id).await.unwrap(); + assert!(sampler.target_status(id).await.is_err()); + } +} diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index e010545329e..f1106c45c85 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -87,6 +87,8 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::time::Duration; +#[cfg(any(feature = "cpu", test))] +pub mod cpu; #[cfg(any(feature = "datalink", test))] pub mod link; mod sampler; diff --git a/oximeter/oximeter/schema/sled-cpu.toml b/oximeter/oximeter/schema/sled-cpu.toml new file mode 100644 index 00000000000..f2251feff24 --- /dev/null +++ b/oximeter/oximeter/schema/sled-cpu.toml @@ -0,0 +1,46 @@ +format_version = 1 + +[target] +name = "sled_cpu" +description = "CPU metrics for a compute sled" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ "rack_id", "sled_id", "sled_model", "sled_revision", "sled_serial" ] }, +] + +[fields.cpu_id] +type = "u32" +description = "The CPU core identifier" + +[fields.rack_id] +type = "uuid" +description = "ID for the CPU's rack" + +[fields.sled_id] +type = "uuid" +description = "ID for the CPU's sled" + +[fields.sled_model] +type = 
"string" +description = "Model number of the sled" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled" + +[fields.state] +type = "string" +description = "The CPU microstate (idle, user, kernel, dtrace, intr)" + +[[metrics]] +name = "cpu_nsec" +description = "Cumulative nanoseconds spent in a CPU microstate" +units = "nanoseconds" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "cpu_id", "state" ] } +] diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index e6342b27bec..04d5046a776 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -12,6 +12,8 @@ use oximeter_instruments::kstat::CollectionDetails; use oximeter_instruments::kstat::Error as KstatError; use oximeter_instruments::kstat::KstatSampler; use oximeter_instruments::kstat::TargetId; +use oximeter_instruments::kstat::cpu::SledCpu; +use oximeter_instruments::kstat::cpu::SledCpuTarget; use oximeter_instruments::kstat::link::SledDataLink; use oximeter_instruments::kstat::link::SledDataLinkTarget; use oximeter_producer::LogConfig; @@ -81,8 +83,12 @@ pub enum Error { pub enum Message { /// Start tracking the named physical link. /// - /// This is only use on startup, to track the underlays. + /// This is only used on startup, to track the underlays. TrackPhysical { zone_name: String, name: String }, + /// Start tracking CPU metrics for this sled. + /// + /// This is only used on startup. + TrackCpu, /// Track the named VNIC. TrackVnic { zone_name: String, name: String }, /// Stop tracking the named VNIC. 
@@ -134,6 +140,7 @@ async fn metrics_task( mut rx: mpsc::Receiver<Message>, ) { let mut tracked_links: TrackedLinks = HashMap::new(); + let mut tracked_cpu: Option<SledCpu> = None; let mut sled_time_synced: bool = false; // Main polling loop, waiting for messages from other pieces of the code to @@ -161,6 +168,16 @@ async fn metrics_task( add_datalink(&log, &mut tracked_links, &kstat_sampler, link) .await; } + Message::TrackCpu => { + add_sled_cpu( + &log, + &sled_identifiers, + &mut tracked_cpu, + &kstat_sampler, + sled_time_synced, + ) + .await; + } Message::TrackVnic { zone_name, name } => { let target = SledDataLinkTarget { kind: LinkKind::VNIC.into(), @@ -211,7 +228,8 @@ async fn metrics_task( &mut tracked_links, &kstat_sampler, ) - .await + .await; + sync_sled_cpu(&log, &mut tracked_cpu, &kstat_sampler).await; } } } @@ -339,6 +357,71 @@ fn is_transient_link(kind: &str) -> bool { kind == LinkKind::VNIC || kind == LinkKind::OPTE } +/// Start tracking CPU metrics for the sled. +async fn add_sled_cpu( + log: &Logger, + sled_identifiers: &SledIdentifiers, + tracked_cpu: &mut Option<SledCpu>, + kstat_sampler: &KstatSampler, + time_synced: bool, +) { + if tracked_cpu.is_some() { + debug!(log, "CPU metrics already being tracked"); + return; + } + + let target = SledCpuTarget { + rack_id: sled_identifiers.rack_id, + sled_id: sled_identifiers.sled_id, + sled_model: sled_identifiers.model.clone().into(), + sled_revision: sled_identifiers.revision, + sled_serial: sled_identifiers.serial.clone().into(), + }; + let cpu = SledCpu::new(target, time_synced); + + // CPUs are permanent, so we never expire them. 
+ let details = CollectionDetails::never(LINK_SAMPLE_INTERVAL); + match kstat_sampler.add_target(cpu.clone(), details).await { + Ok(_id) => { + debug!(log, "added CPU metrics to kstat sampler"); + *tracked_cpu = Some(cpu); + } + Err(err) => { + error!( + log, + "failed to add CPU metrics to kstat sampler"; + "error" => ?err, + ); + } + } +} + +/// Update CPU tracking when the sled is synced with NTP. +async fn sync_sled_cpu( + log: &Logger, + tracked_cpu: &mut Option<SledCpu>, + kstat_sampler: &KstatSampler, +) { + let Some(cpu) = tracked_cpu.as_mut() else { + return; + }; + + cpu.time_synced = true; + let details = CollectionDetails::never(LINK_SAMPLE_INTERVAL); + match kstat_sampler.update_target(cpu.clone(), details).await { + Ok(_) => { + debug!(log, "updated CPU metrics after time sync"); + } + Err(err) => { + error!( + log, + "failed to update CPU metrics after time sync"; + "error" => ?err, + ); + } + } +} + /// Manages sled-based metrics reported to Oximeter. /// /// This object is used to sample kernel statistics and produce other Oximeter @@ -421,6 +504,14 @@ impl MetricsRequestQueue { .map_err(|e| Error::SendFailed(e)) } + /// Ask the task to start tracking CPU metrics for this sled. + /// + /// This is non-blocking, and returns an error if the task is currently + /// unavailable. + pub fn track_cpu(&self) -> Result<(), Error> { + self.0.try_send(Message::TrackCpu).map_err(|e| Error::SendFailed(e)) + } + /// Ask the task to start tracking the named VNIC. /// /// This is non-blocking, and returns an error if the task is currently diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index c2b369469a5..3ac77cadec2 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -517,6 +517,18 @@ impl SledAgent { } } + // Start tracking CPU metrics. 
+ match metrics_manager.request_queue().track_cpu() { + Ok(_) => { + debug!(log, "started tracking CPU metrics") + } + Err(e) => error!( + log, + "failed to track CPU metrics"; + "error" => slog_error_chain::InlineErrorChain::new(&e), + ), + } + // Create the PortManager to manage all the OPTE ports on the sled. let port_manager = PortManager::new( parent_log.new(o!("component" => "PortManager")),