diff options
author | Aria Shrimpton <me@aria.rip> | 2024-01-31 17:32:10 +0000 |
---|---|---|
committer | Aria Shrimpton <me@aria.rip> | 2024-01-31 17:32:10 +0000 |
commit | eafe2080e9e825649bd84edba9647df0e811af99 (patch) | |
tree | 47307bb0b8ae282578c48741e87c204c440c7082 /src | |
parent | 4c96b7fd7a6ec2f6a0d1d0b61c005c0e1b08c2a0 (diff) |
add partitioning of collection lifetimes
Diffstat (limited to 'src')
-rw-r--r-- | src/crates/candelabra/src/profiler.rs | 356 | ||||
-rw-r--r-- | src/crates/cli/src/display.rs (renamed from src/crates/cli/src/util.rs) | 15 | ||||
-rw-r--r-- | src/crates/cli/src/estimate.rs | 2 | ||||
-rw-r--r-- | src/crates/cli/src/main.rs | 2 | ||||
-rw-r--r-- | src/crates/cli/src/profile.rs | 12 | ||||
-rw-r--r-- | src/crates/library/src/profiler.rs | 10 |
6 files changed, 327 insertions, 70 deletions
diff --git a/src/crates/candelabra/src/profiler.rs b/src/crates/candelabra/src/profiler.rs index 8291b33..4677bbc 100644 --- a/src/crates/candelabra/src/profiler.rs +++ b/src/crates/candelabra/src/profiler.rs @@ -1,6 +1,6 @@ //! Profiling applications for info about container usage -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{anyhow, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use log::{debug, log_enabled, trace, warn, Level}; use primrose::ContainerSelector; @@ -17,8 +17,8 @@ use tempfile::tempdir; use crate::cache::{gen_tree_hash, FileCache}; use crate::candidates::ConTypeName; -use crate::cost::benchmark::OpName; -use crate::cost::{Cost, CostModel}; +use crate::cost::benchmark::{tee_output, OpName}; +use crate::cost::{Cost, CostModel, Estimator}; use crate::project::Project; use crate::{Paths, State}; @@ -30,64 +30,77 @@ pub(crate) struct CacheEntry { } /// The information we get from profiling. +/// Rather than keeping all results, we split them into 'similar enough' partitions, +/// with the idea that each partition will probably have the same best implementation. #[derive(Clone, Debug, Default, Serialize, Deserialize)] -pub struct ProfilerInfo(pub Vec<CollectionLifetime>); +pub struct ProfilerInfo(pub Vec<ProfilerPartition>); + +/// A vector of container lifetimes which have similar characteristics +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct ProfilerPartition { + pub occurences: f64, + pub avg_n: f64, + pub avg_op_counts: HashMap<OpName, f64>, +} /// Breakdown of a cost value pub type CostBreakdown<'a> = HashMap<&'a OpName, Cost>; -/// Profiler info collected from the lifetime of a single collection instance -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CollectionLifetime { - pub n: usize, - pub op_counts: HashMap<OpName, usize>, -} - impl ProfilerInfo { pub fn estimate_cost(&self, cost_model: &CostModel) -> f64 { - let sum: f64 = self.0.iter().map(|cl| cl.estimate_cost(cost_model)).sum(); - - sum / self.0.len() as f64 + self.0 + .iter() + .map(|cl| cl.estimate_cost(cost_model)) + .sum::<f64>() } pub fn cost_breakdown<'a>(&self, cost_model: &'a CostModel) -> CostBreakdown<'a> { cost_model .by_op .iter() - .map(|(op_name, estimator)| { + .map(|(op, estimator)| { ( - op_name, + op, self.0 .iter() - .map(|cl| estimator.estimatef(cl.avg_n()) * cl.op_count(op_name) as f64) - .sum::<f64>() - / self.0.len() as f64, + .map(|cl| cl.op_cost(op, estimator)) + .sum::<f64>(), ) }) .collect() } } -impl CollectionLifetime { - pub fn avg_n(&self) -> f64 { - self.n as f64 / (self.op_counts.values().sum::<usize>() as f64) - } - - pub fn op_count(&self, op: &str) -> usize { +impl ProfilerPartition { + pub fn avg_op_count(&self, op: &str) -> f64 { *self - .op_counts + .avg_op_counts .get(op) .expect("invalid op passed to op_count") } pub fn estimate_cost(&self, cost_model: &CostModel) -> f64 { - let avg_n = self.avg_n(); - let mut acc = 0.0; - for (op, estimator) in cost_model.by_op.iter() { - acc += estimator.estimatef(avg_n) * self.op_count(op) as f64; - } + cost_model + .by_op + .iter() + .map(|(op, estimator)| self.op_cost(op, estimator)) + .sum::<f64>() + } + + pub fn op_cost(&self, op: &str, estimator: &Estimator) -> f64 { + estimator.estimatef(self.avg_n) * self.avg_op_count(op) * self.occurences + } - acc + fn add_lifetime(&mut self, (n, ops): (f64, HashMap<String, usize>)) { + self.avg_n = self.avg_n + (n - self.avg_n) / (self.occurences + 1.0); + for (op, count) in ops { + let count = count as f64; + self.avg_op_counts + .entry(op) + .and_modify(|avg| *avg = *avg + (count - *avg) / (self.occurences + 1.0)) + .or_insert(count); + } + self.occurences += 1.0; } } @@ -217,25 +230,24 @@ impl State { name, profiler_out_dir ); - let mut command = Command::new("cargo"); - command + let child = Command::new("cargo") .current_dir(&project.source_dir) .args(["bench", "--bench", name]) - .env("PROFILER_OUT_DIR", profiler_out_dir.as_ref()); // Where profiler info gets outputted - - if log_enabled!(Level::Debug) { - command.stderr(Stdio::inherit()).stdout(Stdio::inherit()); - } else { - command.stderr(Stdio::null()).stdout(Stdio::null()); - }; - let output = command.output()?; + .env("PROFILER_OUT_DIR", profiler_out_dir.as_ref()) // Where profiler info gets outputted + .stdout(Stdio::piped()) + .stderr(if log_enabled!(Level::Debug) { + Stdio::inherit() + } else { + Stdio::null() + }) + .spawn() + .context("Error running bench command")?; - if !output.status.success() { - bail!("Error running benchmark"); - } + tee_output(child)?; let mut con_type_results = HashMap::new(); for dir in read_dir(&profiler_out_dir)? { + // each directory has an index, corresponding to the container type name let dir = dir?; let con_type: String = con_types[dir .file_name() @@ -244,24 +256,63 @@ impl State { .parse::<usize>() .unwrap()] .to_string(); - let mut acc = Vec::default(); - for file in read_dir(dir.path())? { - let file = file?; - let mut contents = String::new(); - File::open(file.path())?.read_to_string(&mut contents)?; - - acc.push(parse_output(&contents)?); - } - con_type_results.insert(con_type, ProfilerInfo(acc)); + let partitions = read_dir(dir.path())? + .map(|f| -> Result<String> { + // read file contents + let mut contents = String::new(); + File::open(f?.path())?.read_to_string(&mut contents)?; + Ok(contents) + }) + .map(|contents| parse_output(&contents?)) + .fold(Ok(vec![]), partition_costs)?; + + con_type_results.insert(con_type, ProfilerInfo(partitions)); } Ok(con_type_results) } } +type CollectionLifetime = (f64, HashMap<OpName, usize>); + +/// Attempt to compress an iterator of collection lifetimes into as few partitions as possible +fn partition_costs( + acc: Result<Vec<ProfilerPartition>>, + cl: Result<CollectionLifetime>, +) -> Result<Vec<ProfilerPartition>> { + // error short circuiting + let (mut acc, (n, ops)) = (acc?, cl?); + + // attempt to find a partition with a close enough n value + let (closest_idx, closest_delta) = + acc.iter() + .enumerate() + .fold((0, f64::MAX), |acc @ (_, val), (idx, partition)| { + let delta = (partition.avg_n - n).abs(); + if delta < val { + (idx, delta) + } else { + acc + } + }); + + if closest_delta < 100.0 { + acc[closest_idx].add_lifetime((n, ops)); + } else { + // add a new partition + acc.push(ProfilerPartition { + occurences: 1.0, + avg_n: n, + avg_op_counts: ops.into_iter().map(|(k, v)| (k, v as f64)).collect(), + }) + } + + Ok(acc) +} + /// Parse the output of the profiler -fn parse_output(contents: &str) -> Result<CollectionLifetime> { +fn parse_output(contents: &str) -> Result<(f64, HashMap<OpName, usize>)> { let mut lines = contents.lines().map(usize::from_str); let missing_line_err = || anyhow!("wrong number of lines in "); let n = lines.next().ok_or_else(missing_line_err)??; @@ -307,5 +358,198 @@ fn parse_output(contents: &str) -> Result<CollectionLifetime> { lines.next().ok_or_else(missing_line_err)??, ); - Ok(CollectionLifetime { n, op_counts }) + Ok((n as f64, op_counts)) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use crate::{ + cost::{CostModel, Estimator}, + profiler::partition_costs, + }; + + use super::{ProfilerInfo, ProfilerPartition}; + + const EPSILON: f64 = 1e-5; + fn assert_feq(left: f64, right: f64, msg: &'static str) { + assert!((left - right).abs() < EPSILON, "{}", msg); + } + + fn linear_estimator() -> Estimator { + Estimator { + coeffs: [0.0, 1.0, 0.0, 0.0], + transform_x: (0.0, 1.0), + transform_y: (0.0, 1.0), + } + } + + #[test] + fn test_cost_single_partition() { + let info = ProfilerInfo(vec![ProfilerPartition { + occurences: 1.0, + avg_n: 100.0, + avg_op_counts: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 100.0); + map + }, + }]); + + let model = CostModel { + by_op: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), linear_estimator()); + map + }, + }; + + let cost = dbg!(info.estimate_cost(&model)); + assert_feq(cost, 10_000.0, "per op = 100 * 100 ops"); + } + + #[test] + fn test_cost_multi_partitions_sums() { + let info = ProfilerInfo(vec![ + ProfilerPartition { + occurences: 1.0, + avg_n: 100.0, + avg_op_counts: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 100.0); + map + }, + }, + ProfilerPartition { + occurences: 1.0, + avg_n: 10.0, + avg_op_counts: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 10.0); + map + }, + }, + ]); + + let model = CostModel { + by_op: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), linear_estimator()); + map + }, + }; + + let cost = dbg!(info.estimate_cost(&model)); + assert_feq(cost, 10_100.0, "100ns/op * 100 ops + 10ns/op * 10 ops"); + } + + #[test] + fn test_cost_multi_partitions_sums_weighted() { + let info = ProfilerInfo(vec![ + ProfilerPartition { + occurences: 2.0, + avg_n: 100.0, + avg_op_counts: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 100.0); + map + }, + }, + ProfilerPartition { + occurences: 1.0, + avg_n: 10.0, + avg_op_counts: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 10.0); + map + }, + }, + ]); + + let model = CostModel { + by_op: { + let mut map = HashMap::new(); + map.insert("insert".to_string(), linear_estimator()); + map + }, + }; + + let cost = dbg!(info.estimate_cost(&model)); + assert_feq(cost, 20_100.0, "100ns/op * 100 ops * 2 + 10ns/op * 10 ops"); + } + + #[test] + fn test_partition_costs_merges_duplicates() { + let cl = (100.0, { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 10); + map + }); + let outp = vec![Ok(cl.clone()), Ok(cl)] + .into_iter() + .fold(Ok(vec![]), partition_costs) + .unwrap(); + + assert_eq!(outp.len(), 1, "merged duplicates"); + assert_eq!(outp[0].occurences, 2.0, "weight updated"); + assert_feq(outp[0].avg_n, 100.0, "average n correct"); + assert_feq( + *outp[0].avg_op_counts.get("insert").unwrap(), + 10.0, + "average n correct", + ); + } + + #[test] + fn test_partition_costs_merges_close() { + let outp = vec![ + Ok((100.0, { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 50); + map + })), + Ok((110.0, { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 100); + map + })), + ] + .into_iter() + .fold(Ok(vec![]), partition_costs) + .unwrap(); + + assert_eq!(outp.len(), 1, "merged duplicates"); + assert_eq!(outp[0].occurences, 2.0, "weight updated"); + assert_feq(outp[0].avg_n, 105.0, "average n correct"); + assert_feq( + *outp[0].avg_op_counts.get("insert").unwrap(), + 75.0, + "average n correct", + ); + } + #[test] + fn test_partition_costs_keeps_separate() { + let outp = vec![ + Ok((100.0, { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 10); + map + })), + Ok((999999.0, { + let mut map = HashMap::new(); + map.insert("insert".to_string(), 10); + map + })), + ] + .into_iter() + .fold(Ok(vec![]), partition_costs) + .unwrap(); + + assert_eq!( + outp.len(), + 2, + "large difference in n values causes partition" + ); + } } diff --git a/src/crates/cli/src/util.rs b/src/crates/cli/src/display.rs index 499f4ba..2ce9039 100644 --- a/src/crates/cli/src/util.rs +++ b/src/crates/cli/src/display.rs @@ -1,5 +1,6 @@ use std::{collections::HashSet, fmt::Display, hash::Hash, iter::once}; +use candelabra::profiler::ProfilerInfo; use tabled::{builder::Builder, settings::Style}; // Print the given 2D map as a table, where the first key is the left-most column, and the second key the column index @@ -40,3 +41,17 @@ where println!("{}", builder.build().with(Style::sharp())); } + +pub fn display_profiler_info(profile_info: ProfilerInfo) { + print_table(profile_info.0.into_iter().enumerate().map(|(i, p)| { + ( + i, + [ + ("n".to_string(), p.avg_n), + ("occurences".to_string(), p.occurences), + ] + .into_iter() + .chain(p.avg_op_counts.into_iter()), + ) + })) +} diff --git a/src/crates/cli/src/estimate.rs b/src/crates/cli/src/estimate.rs index 6915afd..6f4716f 100644 --- a/src/crates/cli/src/estimate.rs +++ b/src/crates/cli/src/estimate.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result}; use argh::FromArgs; use log::info; -use crate::{util::print_table, State}; +use crate::{display::print_table, State}; /// Estimate the cost of a given set of assignments for a specific project, and detail how that was reached. #[derive(FromArgs)] diff --git a/src/crates/cli/src/main.rs b/src/crates/cli/src/main.rs index 81d484d..04bcc96 100644 --- a/src/crates/cli/src/main.rs +++ b/src/crates/cli/src/main.rs @@ -4,12 +4,12 @@ use candelabra::{Paths, Project}; use log::info; mod candidates; +mod display; mod estimate; mod library; mod model; mod profile; mod select; -mod util; #[derive(FromArgs)] /// Find the best performing container type using primrose diff --git a/src/crates/cli/src/profile.rs b/src/crates/cli/src/profile.rs index 0971771..efb1571 100644 --- a/src/crates/cli/src/profile.rs +++ b/src/crates/cli/src/profile.rs @@ -2,7 +2,7 @@ use anyhow::Result; use argh::FromArgs; use log::info; -use crate::{util::print_table, State}; +use crate::{display::display_profiler_info, State}; /// Profile the selected projects and print the results #[derive(FromArgs)] @@ -16,12 +16,10 @@ impl State { let all_info = self.inner.profiler_info(proj)?; - print_table(all_info.iter().map(|(con_type_name, profile_info)| { - ( - con_type_name, - profile_info.0.iter().flat_map(|cl| cl.op_counts.iter()), - ) - })); + for (con_type_name, profile_info) in all_info { + info!("{}:", con_type_name); + display_profiler_info(profile_info); + } } Ok(()) } diff --git a/src/crates/library/src/profiler.rs b/src/crates/library/src/profiler.rs index ba8c357..585d745 100644 --- a/src/crates/library/src/profiler.rs +++ b/src/crates/library/src/profiler.rs @@ -10,7 +10,7 @@ use crate::traits::{Container, Indexable, Mapping, Stack}; pub struct ProfilerWrapper<const ID: usize, T, E> { inner: T, - sum_ns: usize, + max_n: usize, n_contains: usize, n_insert: usize, n_clear: usize, @@ -28,7 +28,7 @@ impl<const ID: usize, T: Default, E> Default for ProfilerWrapper<ID, T, E> { fn default() -> Self { Self { inner: T::default(), - sum_ns: 0, + max_n: 0, n_contains: 0, n_insert: 0, n_clear: 0, @@ -46,13 +46,13 @@ impl<const ID: usize, T: Default, E> Default for ProfilerWrapper<ID, T, E> { impl<const ID: usize, T: Container<E>, E> ProfilerWrapper<ID, T, E> { fn add_n(&mut self) { - self.sum_ns += self.inner.len(); + self.max_n = self.max_n.max(self.inner.len()); } } impl<const ID: usize, T: Mapping<K, V>, K, V> ProfilerWrapper<ID, T, (K, V)> { fn add_n_map(&mut self) { - self.sum_ns += self.inner.len(); + self.max_n = self.max_n.max(self.inner.len()); } } @@ -175,7 +175,7 @@ impl<const ID: usize, T, E> Drop for ProfilerWrapper<ID, T, E> { } let mut f = File::create(format!("{}/{}", dir, unix_time.as_nanos())).unwrap(); - writeln!(f, "{}", self.sum_ns).unwrap(); + writeln!(f, "{}", self.max_n).unwrap(); writeln!(f, "{}", self.n_contains).unwrap(); writeln!(f, "{}", self.n_insert).unwrap(); writeln!(f, "{}", self.n_clear).unwrap(); |