update data & analysis

author: Aria Shrimpton <me@aria.rip> 2024-03-25 18:54:08 +0000
committer: Aria Shrimpton <me@aria.rip> 2024-03-25 18:54:08 +0000
commit: e2d0806b2d944b87eb2a00bb46243a1d50487a3e (patch)
tree: 1096059e340ba64d6186fde7dc8810f9e7c2cb26 /analysis/vis.livemd
parent: 57c3c48f6660f905fb974cff7ec58f746a1a6970 (diff)
1 file changed, 163 insertions, 73 deletions
diff --git a/analysis/vis.livemd b/analysis/vis.livemd
index 4913e76..48b8e59 100644
--- a/analysis/vis.livemd
+++ b/analysis/vis.livemd
@@ -18,6 +18,7 @@ Mix.install([
 ```elixir
 require Explorer.DataFrame
 require Explorer.Series
+require VegaLite
 alias Explorer.DataFrame, as: DF
 alias Explorer.Series, as: SE
 job_id = "current"
@@ -86,21 +87,31 @@ cost_model_points =
     |> DF.new()
   end)
   |> DF.concat_rows()
+  |> DF.mutate(t: cast(t, {:duration, :nanosecond}))
 ```
 
 ## Cost model exploratory plots
 
-<!-- livebook:{"reevaluate_automatically":true} -->
-
 ```elixir
-set_impls = ["BTreeSet", "HashSet", "VecSet", "SortedVecSet"]
-mapping_impls = ["HashMap", "BTreeMap", "VecMap", "SortedVecMap"]
-other_impls = ["Vec", "LinkedList", "SortedVec"]
-
-impls = other_impls
-
 defmodule CostModel do
   @defaults %{y_domain: nil, ns: 1..60_000//100, draw_points: true}
+  @all_impls Enum.sort([
+               "SortedVec",
+               "SortedVecSet",
+               "SortedVecMap",
+               "Vec",
+               "VecSet",
+               "VecMap",
+               "BTreeSet",
+               "BTreeMap",
+               "HashSet",
+               "HashMap",
+               "LinkedList"
+             ])
+
+  def friendly_impl_name(impl) do
+    String.split(impl, "::") |> List.last()
+  end
 
   def points_for(cost_models, ns, impl, op) do
     %{"coeffs" => [coeffs]} =
@@ -116,7 +127,7 @@ defmodule CostModel do
          |> Enum.sum()) + Enum.at(coeffs, 3) * Math.log2(n)
 
       %{
-        impl: String.split(impl, "::") |> List.last(),
+        impl: friendly_impl_name(impl),
         op: op,
         n: n,
         t: max(t, 0)
@@ -146,7 +157,7 @@ defmodule CostModel do
                 cost_model_points
                 |> DF.filter(op == ^op and impl in ^impls)
                 |> DF.group_by(["impl", "n"])
-                |> DF.summarise(t: mean(t)),
+                |> DF.summarise(t: mean(cast(t, :f32))),
                 "n",
                 "t",
                 color_by: "impl",
@@ -169,11 +180,61 @@ defmodule CostModel do
       _ -> plot
     end
   end
+
+  def split_plot(cost_models, cost_model_points, impl_splits, op) do
+    @all_impls = List.flatten(impl_splits) |> Enum.sort()
+
+    Enum.map(impl_splits, &plot(cost_models, cost_model_points, &1, op))
+    |> Tucan.vconcat()
+  end
 end
 ```
 
+<!-- livebook:{"reevaluate_automatically":true} -->
+
+```elixir
+graph =
+  CostModel.split_plot(
+    cost_models,
+    cost_model_points,
+    [
+      ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet", "VecMap"],
+      [
+        "Vec",
+        "LinkedList"
+      ],
+      ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+    ],
+    "insert"
+  )
+  |> VegaLite.resolve(:scale, color: :independent)
+
+VegaLite.Export.save!(graph, "../thesis/assets/insert.json")
+
+graph
+```
+
 ```elixir
-CostModel.plot(cost_models, cost_model_points, other_impls, "remove")
+graph =
+  CostModel.split_plot(
+    cost_models,
+    cost_model_points,
+    [
+      ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet"],
+      [
+        "Vec",
+        "LinkedList",
+        "VecMap"
+      ],
+      ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+    ],
+    "contains"
+  )
+  |> VegaLite.resolve(:scale, color: :independent)
+
+VegaLite.Export.save!(graph, "../thesis/assets/contains.json")
+
+graph
 ```
 
 ## Read benchmark data
@@ -203,14 +264,29 @@ raw_benchmarks =
         using:
           Regex.scan(~r/\"(\w*)\", ([^)]*)/, Path.basename(dir))
           |> Enum.map(fn [_, ctn, impl] -> %{ctn: ctn, impl: impl} end),
-        mean: raw_results["mean"]["point_estimate"] / 10 ** 9,
-        hi_95th: raw_results["mean"]["confidence_interval"]["upper_bound"] / 10 ** 9,
-        lo_95th: raw_results["mean"]["confidence_interval"]["lower_bound"] / 10 ** 9
+        mean: raw_results["mean"]["point_estimate"],
+        stderr: raw_results["mean"]["standard_error"]
       }
     end)
   end)
   |> List.flatten()
   |> DF.new()
+  |> DF.mutate(
+    mean: cast(mean, {:duration, :nanosecond}),
+    stderr: cast(stderr, {:duration, :nanosecond})
+  )
+```
+
+```elixir
+# `using` is a list of structs, but we mostly don't need its structure, and we
+# want to be able to group by that column, so add a new column that is just a
+# readable string representation.
+# Also parse out the n value, which all of our benchmarks have.
+display_using = fn using ->
+  using
+  |> Enum.map(fn %{"ctn" => ctn, "impl" => impl} -> ctn <> "=" <> impl end)
+  |> Enum.join(", ")
+end
 ```
 
 ```elixir
@@ -238,7 +314,7 @@ benchmarks =
   )
   # Get the total benchmark time for each project and assignment
   |> DF.group_by(["proj", "using_idx"])
-  |> DF.summarise(time: sum(mean))
+  |> DF.summarise(time: sum(cast(mean, :f32)))
   # Convert using_idx back to original using values
   |> DF.to_rows()
   |> Enum.map(fn row = %{"using_idx" => using_idx} ->
@@ -292,12 +368,12 @@ cost_estimates =
 estimate_impls = SE.distinct(cost_estimates["impl"])
 
 true =
-  (benchmarks
+  (raw_benchmarks
    |> DF.explode("using")
    |> DF.unnest("using"))["impl"]
   |> SE.distinct()
   |> SE.to_list()
-  |> Enum.all?(fn impl -> SE.equal(estimate_impls, impl) |> SE.any?() end)
+  |> Enum.all?(&SE.any?(SE.equal(estimate_impls, &1)))
 ```
 
 ```elixir
@@ -352,54 +428,75 @@ DF.n_rows(singular_benchmarks)
 ```
 
 ```elixir
-# Compare each assignments position in the estimates to its position in the results
-sorted_singular_estimates =
-  singular_estimated_costs
-  |> DF.group_by(["proj"])
-  |> DF.sort_by(estimated_cost)
-
-sorted_singular_results =
+# Best and predicted best implementation for each container type
+selection_comparison =
   singular_benchmarks
+  |> DF.explode("using")
+  |> DF.unnest("using")
   |> DF.group_by(["proj"])
-  |> DF.sort_by(time)
-
-singular_position_comparison =
-  sorted_singular_estimates
-  |> DF.to_rows_stream()
-  |> Enum.map(fn %{"proj" => proj, "using" => using} ->
-    %{
-      proj: proj,
-      using: using,
-      pos_estimate:
-        DF.filter(sorted_singular_estimates, proj == ^proj)["using"]
-        |> SE.to_list()
-        |> Enum.find_index(fn u -> u == using end),
-      pos_results:
-        DF.filter(sorted_singular_results, proj == ^proj)["using"]
-        |> SE.to_list()
-        |> Enum.find_index(fn u -> u == using end)
-    }
-  end)
-  |> DF.new()
+  |> DF.filter(time == min(time))
+  |> DF.join(
+    cost_estimates
+    |> DF.filter(not contains(impl, "until"))
+    |> DF.group_by(["proj", "ctn"])
+    |> DF.filter(cost == min(cost))
+    |> DF.rename(%{"impl" => "predicted_impl"})
+  )
+  |> DF.select(["proj", "ctn", "impl", "predicted_impl"])
+  |> DF.rename(%{"impl" => "best_impl"})
 ```
 
 ```elixir
-# Everywhere we predicted wrong.
-singular_position_comparison
-|> DF.filter(pos_estimate == 0 and pos_estimate != pos_results)
-|> DF.collect()
-```
+# Tools for printing out LaTeX
+defmodule Latex do
+  def escape_latex(str) do
+    String.replace(str, ~r/(\\|{|}|_|\^|#|&|\$|%|~)/, "\\\\\\1")
+  end
 
-```elixir
-singular_estimated_costs
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(estimated_cost)
-```
+  def table(df) do
+    cols = DF.names(df)
+
+    "\\begin{tabular}{|" <>
+      String.duplicate("c|", length(cols)) <>
+      "}\n" <>
+      Enum.join(Enum.map(cols, &escape_latex/1), " & ") <>
+      " \\\\\n\\hline\n" <>
+      (DF.to_rows(df)
+       |> Enum.map(fn row ->
+         cols
+         |> Enum.map(&escape_latex(Kernel.to_string(row[&1])))
+         |> Enum.join(" & ")
+       end)
+       |> Enum.join(" \\\\\n")) <>
+      " \\\\\n\\end{tabular}"
+  end
+end
 
-```elixir
-singular_benchmarks
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(time)
+Latex.table(selection_comparison)
+
+selection_comparison
+|> DF.put(
+  "best_impl",
+  SE.transform(selection_comparison["best_impl"], &CostModel.friendly_impl_name/1)
+)
+|> DF.put(
+  "predicted_impl",
+  SE.transform(selection_comparison["predicted_impl"], &CostModel.friendly_impl_name/1)
+)
+|> DF.put(
+  "mark",
+  SE.not_equal(selection_comparison["best_impl"], selection_comparison["predicted_impl"])
+  |> SE.transform(&if &1, do: "*", else: "")
+)
+|> DF.rename(%{
+  "mark" => " ",
+  "proj" => "Project",
+  "ctn" => "Container Type",
+  "best_impl" => "Best implementation",
+  "predicted_impl" => "Predicted best"
+})
+|> Latex.table()
+|> IO.puts()
 ```
 
 ## Adaptive Containers
@@ -449,31 +546,24 @@ adaptive_raw_benchmarks =
 ```elixir
 best_usings =
   adaptive_raw_benchmarks
+  # get best set of assignments for each project
   |> DF.group_by(["proj", "using"])
   |> DF.filter(not contains(using, "until"))
-  |> DF.summarise(total: sum(mean))
+  |> DF.summarise(total: sum(cast(mean, :f32)))
   |> DF.group_by(["proj"])
   |> DF.filter(total == min(total))
   |> DF.discard("total")
   |> DF.rename(%{"using" => "best_using"})
+  # select adaptive container and the best assignment for each project
   |> DF.join(adaptive_raw_benchmarks)
   |> DF.filter(using == best_using or contains(using, "until"))
-  |> DF.pivot_longer(["hi_95th", "lo_95th"])
+  # summary data point
+  |> DF.mutate(value: cast(mean, :string) <> " +/- " <> cast(stderr, :string))
   |> DF.select(["proj", "using", "n", "value"])
 ```
 
 ```elixir
-Tucan.errorbar(
-  best_usings
-  |> DF.filter(proj == "example_mapping"),
-  "value",
-  orient: :vertical,
-  ticks: true,
-  points: true,
-  group_by: "n"
-  # color_by: "using"
-)
-|> Tucan.Legend.set_orientation(:color, "bottom")
-|> Tucan.Legend.put_options(:color, label_limit: 1000)
-|> Tucan.set_size(500, 500)
+best_usings
+|> DF.filter(proj == "aoc_2022_09")
+|> DF.pivot_wider("n", "value")
 ```
author	Aria Shrimpton <me@aria.rip>	2024-03-25 18:54:08 +0000
committer	Aria Shrimpton <me@aria.rip>	2024-03-25 18:54:08 +0000
commit	e2d0806b2d944b87eb2a00bb46243a1d50487a3e (patch)
tree	1096059e340ba64d6186fde7dc8810f9e7c2cb26 /analysis/vis.livemd
parent	57c3c48f6660f905fb974cff7ec58f746a1a6970 (diff)