add partitioning of collection lifetimes

author: Aria Shrimpton <me@aria.rip> 2024-01-31 17:32:10 +0000
committer: Aria Shrimpton <me@aria.rip> 2024-01-31 17:32:10 +0000
commit: eafe2080e9e825649bd84edba9647df0e811af99 (patch)
tree: 47307bb0b8ae282578c48741e87c204c440c7082 /src
parent: 4c96b7fd7a6ec2f6a0d1d0b61c005c0e1b08c2a0 (diff)
6 files changed, 327 insertions, 70 deletions
diff --git a/src/crates/candelabra/src/profiler.rs b/src/crates/candelabra/src/profiler.rs
index 8291b33..4677bbc 100644
--- a/src/crates/candelabra/src/profiler.rs
+++ b/src/crates/candelabra/src/profiler.rs
@@ -1,6 +1,6 @@
 //! Profiling applications for info about container usage
 
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{anyhow, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use log::{debug, log_enabled, trace, warn, Level};
 use primrose::ContainerSelector;
@@ -17,8 +17,8 @@ use tempfile::tempdir;
 
 use crate::cache::{gen_tree_hash, FileCache};
 use crate::candidates::ConTypeName;
-use crate::cost::benchmark::OpName;
-use crate::cost::{Cost, CostModel};
+use crate::cost::benchmark::{tee_output, OpName};
+use crate::cost::{Cost, CostModel, Estimator};
 use crate::project::Project;
 use crate::{Paths, State};
 
@@ -30,64 +30,77 @@ pub(crate) struct CacheEntry {
 }
 
 /// The information we get from profiling.
+/// Rather than keeping all results, we split them into 'similar enough' partitions,
+/// with the idea that each partition will probably have the same best implementation.
 #[derive(Clone, Debug, Default, Serialize, Deserialize)]
-pub struct ProfilerInfo(pub Vec<CollectionLifetime>);
+pub struct ProfilerInfo(pub Vec<ProfilerPartition>);
+
+/// Aggregate of container lifetimes with similar `n`, stored as running averages
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct ProfilerPartition {
+    pub occurences: f64, // how many lifetimes have been merged into this partition
+    pub avg_n: f64, // mean of the `n` values of the merged lifetimes
+    pub avg_op_counts: HashMap<OpName, f64>, // mean count per operation name
+}
 
 /// Breakdown of a cost value
 pub type CostBreakdown<'a> = HashMap<&'a OpName, Cost>;
 
-/// Profiler info collected from the lifetime of a single collection instance
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct CollectionLifetime {
-    pub n: usize,
-    pub op_counts: HashMap<OpName, usize>,
-}
-
 impl ProfilerInfo {
     pub fn estimate_cost(&self, cost_model: &CostModel) -> f64 {
-        let sum: f64 = self.0.iter().map(|cl| cl.estimate_cost(cost_model)).sum();
-
-        sum / self.0.len() as f64
+        self.0
+            .iter()
+            .map(|cl| cl.estimate_cost(cost_model))
+            .sum::<f64>()
     }
 
     pub fn cost_breakdown<'a>(&self, cost_model: &'a CostModel) -> CostBreakdown<'a> {
         cost_model
             .by_op
             .iter()
-            .map(|(op_name, estimator)| {
+            .map(|(op, estimator)| {
                 (
-                    op_name,
+                    op,
                     self.0
                         .iter()
-                        .map(|cl| estimator.estimatef(cl.avg_n()) * cl.op_count(op_name) as f64)
-                        .sum::<f64>()
-                        / self.0.len() as f64,
+                        .map(|cl| cl.op_cost(op, estimator))
+                        .sum::<f64>(),
                 )
             })
             .collect()
     }
 }
 
-impl CollectionLifetime {
-    pub fn avg_n(&self) -> f64 {
-        self.n as f64 / (self.op_counts.values().sum::<usize>() as f64)
-    }
-
-    pub fn op_count(&self, op: &str) -> usize {
+impl ProfilerPartition {
+    pub fn avg_op_count(&self, op: &str) -> f64 {
         *self
-            .op_counts
+            .avg_op_counts
             .get(op)
             .expect("invalid op passed to op_count")
     }
 
     pub fn estimate_cost(&self, cost_model: &CostModel) -> f64 {
-        let avg_n = self.avg_n();
-        let mut acc = 0.0;
-        for (op, estimator) in cost_model.by_op.iter() {
-            acc += estimator.estimatef(avg_n) * self.op_count(op) as f64;
-        }
+        cost_model
+            .by_op
+            .iter()
+            .map(|(op, estimator)| self.op_cost(op, estimator))
+            .sum::<f64>()
+    }
+
+    pub fn op_cost(&self, op: &str, estimator: &Estimator) -> f64 {
+        estimator.estimatef(self.avg_n) * self.avg_op_count(op) * self.occurences
+    }
 
-        acc
+    /// Fold one more lifetime into the running means; an op absent on either side counts as 0.
+    fn add_lifetime(&mut self, (n, ops): (f64, HashMap<String, usize>)) {
+        let total = self.occurences + 1.0;
+        self.avg_n += (n - self.avg_n) / total;
+        // rescale every stored mean to the new weight, then add this lifetime's share
+        self.avg_op_counts.values_mut().for_each(|avg| *avg *= self.occurences / total);
+        for (op, count) in ops {
+            *self.avg_op_counts.entry(op).or_insert(0.0) += count as f64 / total;
+        }
+        self.occurences = total;
     }
 }
 
@@ -217,25 +230,24 @@ impl State {
             name, profiler_out_dir
         );
 
-        let mut command = Command::new("cargo");
-        command
+        let child = Command::new("cargo")
             .current_dir(&project.source_dir)
             .args(["bench", "--bench", name])
-            .env("PROFILER_OUT_DIR", profiler_out_dir.as_ref()); // Where profiler info gets outputted
-
-        if log_enabled!(Level::Debug) {
-            command.stderr(Stdio::inherit()).stdout(Stdio::inherit());
-        } else {
-            command.stderr(Stdio::null()).stdout(Stdio::null());
-        };
-        let output = command.output()?;
+            .env("PROFILER_OUT_DIR", profiler_out_dir.as_ref()) // Where profiler info gets outputted
+            .stdout(Stdio::piped())
+            .stderr(if log_enabled!(Level::Debug) {
+                Stdio::inherit()
+            } else {
+                Stdio::null()
+            })
+            .spawn()
+            .context("Error running bench command")?;
 
-        if !output.status.success() {
-            bail!("Error running benchmark");
-        }
+        tee_output(child)?;
 
         let mut con_type_results = HashMap::new();
         for dir in read_dir(&profiler_out_dir)? {
+            // each directory has an index, corresponding to the container type name
             let dir = dir?;
             let con_type: String = con_types[dir
                 .file_name()
@@ -244,24 +256,63 @@ impl State {
                 .parse::<usize>()
                 .unwrap()]
             .to_string();
-            let mut acc = Vec::default();
-            for file in read_dir(dir.path())? {
-                let file = file?;
-                let mut contents = String::new();
-                File::open(file.path())?.read_to_string(&mut contents)?;
-
-                acc.push(parse_output(&contents)?);
-            }
 
-            con_type_results.insert(con_type, ProfilerInfo(acc));
+            let partitions = read_dir(dir.path())?
+                .map(|f| -> Result<String> {
+                    // read file contents
+                    let mut contents = String::new();
+                    File::open(f?.path())?.read_to_string(&mut contents)?;
+                    Ok(contents)
+                })
+                .map(|contents| parse_output(&contents?))
+                .fold(Ok(vec![]), partition_costs)?;
+
+            con_type_results.insert(con_type, ProfilerInfo(partitions));
         }
 
         Ok(con_type_results)
     }
 }
 
+type CollectionLifetime = (f64, HashMap<OpName, usize>);
+
+/// Fold step: merge one parsed collection lifetime into the list of partitions.
+///
+/// An `Err` in either argument is returned as-is. Otherwise the lifetime joins the
+/// partition whose `avg_n` is nearest its `n` when that gap is under `MAX_N_DELTA`,
+/// and becomes a new single-occurrence partition when no partition is that close.
+fn partition_costs(
+    acc: Result<Vec<ProfilerPartition>>,
+    cl: Result<CollectionLifetime>,
+) -> Result<Vec<ProfilerPartition>> {
+    /// Gap in `n` below which a lifetime is merged into an existing partition.
+    const MAX_N_DELTA: f64 = 100.0;
+
+    let (mut partitions, (n, ops)) = (acc?, cl?);
+
+    // index and distance of the nearest partition; ties keep the earliest one
+    let nearest = partitions
+        .iter()
+        .enumerate()
+        .map(|(idx, partition)| (idx, (partition.avg_n - n).abs()))
+        .min_by(|(_, a), (_, b)| a.total_cmp(b))
+        .filter(|&(_, delta)| delta < MAX_N_DELTA);
+
+    if let Some((idx, _)) = nearest {
+        partitions[idx].add_lifetime((n, ops));
+    } else {
+        partitions.push(ProfilerPartition {
+            occurences: 1.0,
+            avg_n: n,
+            avg_op_counts: ops.into_iter().map(|(k, v)| (k, v as f64)).collect(),
+        });
+    }
+
+    Ok(partitions)
+}
+
 /// Parse the output of the profiler
-fn parse_output(contents: &str) -> Result<CollectionLifetime> {
+fn parse_output(contents: &str) -> Result<(f64, HashMap<OpName, usize>)> {
     let mut lines = contents.lines().map(usize::from_str);
     let missing_line_err = || anyhow!("wrong number of lines in ");
     let n = lines.next().ok_or_else(missing_line_err)??;
@@ -307,5 +358,198 @@ fn parse_output(contents: &str) -> Result<CollectionLifetime> {
         lines.next().ok_or_else(missing_line_err)??,
     );
 
-    Ok(CollectionLifetime { n, op_counts })
+    Ok((n as f64, op_counts))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use crate::{
+        cost::{CostModel, Estimator},
+        profiler::partition_costs,
+    };
+
+    use super::{ProfilerInfo, ProfilerPartition};
+
+    const EPSILON: f64 = 1e-5; // absolute tolerance: assert_feq fails when |left - right| >= EPSILON
+    fn assert_feq(left: f64, right: f64, msg: &'static str) {
+        assert!((left - right).abs() < EPSILON, "{}: {} vs {}", msg, left, right);
+    }
+
+    fn linear_estimator() -> Estimator {
+        Estimator {
+            coeffs: [0.0, 1.0, 0.0, 0.0], // NOTE(review): presumably only the linear term — confirm
+            transform_x: (0.0, 1.0),
+            transform_y: (0.0, 1.0),
+        }
+    }
+
+    #[test]
+    fn test_cost_single_partition() {
+        let model = CostModel {
+            by_op: {
+                let mut estimators = HashMap::new();
+                estimators.insert("insert".to_string(), linear_estimator());
+                estimators
+            },
+        };
+
+        let info = ProfilerInfo(vec![ProfilerPartition {
+            occurences: 1.0,
+            avg_n: 100.0,
+            avg_op_counts: {
+                let mut counts = HashMap::new();
+                counts.insert("insert".to_string(), 100.0);
+                counts
+            },
+        }]);
+        // one partition seen once: the total is that partition's cost on its own
+        let cost = dbg!(info.estimate_cost(&model));
+        assert_feq(cost, 10_000.0, "per op = 100 * 100 ops");
+    }
+
+    #[test]
+    fn test_cost_multi_partitions_sums() {
+        let model = CostModel {
+            by_op: {
+                let mut estimators = HashMap::new();
+                estimators.insert("insert".to_string(), linear_estimator());
+                estimators
+            },
+        };
+
+        let info = ProfilerInfo(vec![
+            ProfilerPartition {
+                occurences: 1.0,
+                avg_n: 100.0,
+                avg_op_counts: {
+                    let mut counts = HashMap::new();
+                    counts.insert("insert".to_string(), 100.0);
+                    counts
+                },
+            },
+            ProfilerPartition {
+                occurences: 1.0,
+                avg_n: 10.0,
+                avg_op_counts: {
+                    let mut counts = HashMap::new();
+                    counts.insert("insert".to_string(), 10.0);
+                    counts
+                },
+            },
+        ]);
+        // both partitions were seen once, so their costs are simply added together
+        let cost = dbg!(info.estimate_cost(&model));
+        assert_feq(cost, 10_100.0, "100ns/op * 100 ops + 10ns/op * 10 ops");
+    }
+
+    #[test]
+    fn test_cost_multi_partitions_sums_weighted() {
+        let model = CostModel {
+            by_op: {
+                let mut estimators = HashMap::new();
+                estimators.insert("insert".to_string(), linear_estimator());
+                estimators
+            },
+        };
+
+        let info = ProfilerInfo(vec![
+            ProfilerPartition {
+                occurences: 2.0,
+                avg_n: 100.0,
+                avg_op_counts: {
+                    let mut counts = HashMap::new();
+                    counts.insert("insert".to_string(), 100.0);
+                    counts
+                },
+            },
+            ProfilerPartition {
+                occurences: 1.0,
+                avg_n: 10.0,
+                avg_op_counts: {
+                    let mut counts = HashMap::new();
+                    counts.insert("insert".to_string(), 10.0);
+                    counts
+                },
+            },
+        ]);
+        // the first partition was seen twice, so its cost is weighted by 2 in the sum
+        let cost = dbg!(info.estimate_cost(&model));
+        assert_feq(cost, 20_100.0, "100ns/op * 100 ops * 2 + 10ns/op * 10 ops");
+    }
+
+    /// Two identical lifetimes must collapse into a single partition of weight 2
+    /// whose mean `n` and mean op count equal the shared values.
+    #[test]
+    fn test_partition_costs_merges_duplicates() {
+        let cl = (100.0, {
+            let mut map = HashMap::new();
+            map.insert("insert".to_string(), 10);
+            map
+        });
+        let outp = vec![Ok(cl.clone()), Ok(cl)]
+            .into_iter()
+            .fold(Ok(vec![]), partition_costs)
+            .unwrap();
+
+        assert_eq!(outp.len(), 1, "merged duplicates");
+        assert_eq!(outp[0].occurences, 2.0, "weight updated");
+        assert_feq(outp[0].avg_n, 100.0, "average n correct");
+        // both lifetimes recorded 10 inserts, so the mean stays at 10
+        let avg_inserts = outp[0].avg_op_counts["insert"];
+        assert_feq(avg_inserts, 10.0, "average op count correct");
+    }
+
+    /// Lifetimes whose `n` differ by less than the merge threshold share a partition,
+    /// and the partition's means are the averages of the merged values.
+    #[test]
+    fn test_partition_costs_merges_close() {
+        let outp = vec![
+            Ok((100.0, {
+                let mut map = HashMap::new();
+                map.insert("insert".to_string(), 50);
+                map
+            })),
+            Ok((110.0, {
+                let mut map = HashMap::new();
+                map.insert("insert".to_string(), 100);
+                map
+            })),
+        ]
+        .into_iter()
+        .fold(Ok(vec![]), partition_costs)
+        .unwrap();
+
+        assert_eq!(outp.len(), 1, "merged close lifetimes");
+        assert_eq!(outp[0].occurences, 2.0, "weight updated");
+        assert_feq(outp[0].avg_n, 105.0, "average n correct");
+        // (50 + 100) / 2
+        let avg_inserts = outp[0].avg_op_counts["insert"];
+        assert_feq(avg_inserts, 75.0, "average op count correct");
+    }
+    /// Lifetimes whose `n` values are far apart (here 100 and 999999) must stay in
+    /// separate partitions, even when their op counts are identical.
+    #[test]
+    fn test_partition_costs_keeps_separate() {
+        let lifetimes = vec![
+            Ok((100.0, {
+                let mut ops = HashMap::new();
+                ops.insert("insert".to_string(), 10);
+                ops
+            })),
+            Ok((999999.0, {
+                let mut ops = HashMap::new();
+                ops.insert("insert".to_string(), 10);
+                ops
+            })),
+        ];
+        let outp = lifetimes
+            .into_iter()
+            .fold(Ok(vec![]), partition_costs)
+            .unwrap();
+
+        // the gap in n is far above the merge threshold, so nothing is merged
+        assert_eq!(outp.len(), 2, "large difference in n values causes partition");
+    }
 }
diff --git a/src/crates/cli/src/util.rs b/src/crates/cli/src/display.rs
index 499f4ba..2ce9039 100644
--- a/src/crates/cli/src/util.rs
+++ b/src/crates/cli/src/display.rs
@@ -1,5 +1,6 @@
 use std::{collections::HashSet, fmt::Display, hash::Hash, iter::once};
 
+use candelabra::profiler::ProfilerInfo;
 use tabled::{builder::Builder, settings::Style};
 
 // Print the given 2D map as a table, where the first key is the left-most column, and the second key the column index
@@ -40,3 +41,17 @@ where
 
     println!("{}", builder.build().with(Style::sharp()));
 }
+
+/// Print one table row per partition: its average `n`, its occurrence count,
+/// and then every per-operation average, with the partition index as the row key.
+pub fn display_profiler_info(profile_info: ProfilerInfo) {
+    let rows = profile_info.0.into_iter().enumerate().map(|(idx, part)| {
+        let summary = [
+            ("n".to_string(), part.avg_n),
+            ("occurences".to_string(), part.occurences),
+        ];
+        (idx, summary.into_iter().chain(part.avg_op_counts))
+    });
+
+    print_table(rows)
+}
diff --git a/src/crates/cli/src/estimate.rs b/src/crates/cli/src/estimate.rs
index 6915afd..6f4716f 100644
--- a/src/crates/cli/src/estimate.rs
+++ b/src/crates/cli/src/estimate.rs
@@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result};
 use argh::FromArgs;
 use log::info;
 
-use crate::{util::print_table, State};
+use crate::{display::print_table, State};
 
 /// Estimate the cost of a given set of assignments for a specific project, and detail how that was reached.
 #[derive(FromArgs)]
diff --git a/src/crates/cli/src/main.rs b/src/crates/cli/src/main.rs
index 81d484d..04bcc96 100644
--- a/src/crates/cli/src/main.rs
+++ b/src/crates/cli/src/main.rs
@@ -4,12 +4,12 @@ use candelabra::{Paths, Project};
 use log::info;
 
 mod candidates;
+mod display;
 mod estimate;
 mod library;
 mod model;
 mod profile;
 mod select;
-mod util;
 
 #[derive(FromArgs)]
 /// Find the best performing container type using primrose
diff --git a/src/crates/cli/src/profile.rs b/src/crates/cli/src/profile.rs
index 0971771..efb1571 100644
--- a/src/crates/cli/src/profile.rs
+++ b/src/crates/cli/src/profile.rs
@@ -2,7 +2,7 @@ use anyhow::Result;
 use argh::FromArgs;
 use log::info;
 
-use crate::{util::print_table, State};
+use crate::{display::display_profiler_info, State};
 
 /// Profile the selected projects and print the results
 #[derive(FromArgs)]
@@ -16,12 +16,10 @@ impl State {
 
             let all_info = self.inner.profiler_info(proj)?;
 
-            print_table(all_info.iter().map(|(con_type_name, profile_info)| {
-                (
-                    con_type_name,
-                    profile_info.0.iter().flat_map(|cl| cl.op_counts.iter()),
-                )
-            }));
+            for (con_type_name, profile_info) in all_info {
+                info!("{}:", con_type_name);
+                display_profiler_info(profile_info);
+            }
         }
         Ok(())
     }
diff --git a/src/crates/library/src/profiler.rs b/src/crates/library/src/profiler.rs
index ba8c357..585d745 100644
--- a/src/crates/library/src/profiler.rs
+++ b/src/crates/library/src/profiler.rs
@@ -10,7 +10,7 @@ use crate::traits::{Container, Indexable, Mapping, Stack};
 
 pub struct ProfilerWrapper<const ID: usize, T, E> {
     inner: T,
-    sum_ns: usize,
+    max_n: usize,
     n_contains: usize,
     n_insert: usize,
     n_clear: usize,
@@ -28,7 +28,7 @@ impl<const ID: usize, T: Default, E> Default for ProfilerWrapper<ID, T, E> {
     fn default() -> Self {
         Self {
             inner: T::default(),
-            sum_ns: 0,
+            max_n: 0,
             n_contains: 0,
             n_insert: 0,
             n_clear: 0,
@@ -46,13 +46,13 @@ impl<const ID: usize, T: Default, E> Default for ProfilerWrapper<ID, T, E> {
 
 impl<const ID: usize, T: Container<E>, E> ProfilerWrapper<ID, T, E> {
     fn add_n(&mut self) {
-        self.sum_ns += self.inner.len();
+        self.max_n = self.max_n.max(self.inner.len());
     }
 }
 
 impl<const ID: usize, T: Mapping<K, V>, K, V> ProfilerWrapper<ID, T, (K, V)> {
     fn add_n_map(&mut self) {
-        self.sum_ns += self.inner.len();
+        self.max_n = self.max_n.max(self.inner.len());
     }
 }
 
@@ -175,7 +175,7 @@ impl<const ID: usize, T, E> Drop for ProfilerWrapper<ID, T, E> {
         }
         let mut f = File::create(format!("{}/{}", dir, unix_time.as_nanos())).unwrap();
 
-        writeln!(f, "{}", self.sum_ns).unwrap();
+        writeln!(f, "{}", self.max_n).unwrap();
         writeln!(f, "{}", self.n_contains).unwrap();
         writeln!(f, "{}", self.n_insert).unwrap();
         writeln!(f, "{}", self.n_clear).unwrap();
author	Aria Shrimpton <me@aria.rip>	2024-01-31 17:32:10 +0000
committer	Aria Shrimpton <me@aria.rip>	2024-01-31 17:32:10 +0000
commit	eafe2080e9e825649bd84edba9647df0e811af99 (patch)
tree	47307bb0b8ae282578c48741e87c204c440c7082 /src
parent	4c96b7fd7a6ec2f6a0d1d0b61c005c0e1b08c2a0 (diff)