aboutsummaryrefslogtreecommitdiff
path: root/analysis/vis.livemd
diff options
context:
space:
mode:
authorAria Shrimpton <me@aria.rip>2024-03-25 18:54:08 +0000
committerAria Shrimpton <me@aria.rip>2024-03-25 18:54:08 +0000
commite2d0806b2d944b87eb2a00bb46243a1d50487a3e (patch)
tree1096059e340ba64d6186fde7dc8810f9e7c2cb26 /analysis/vis.livemd
parent57c3c48f6660f905fb974cff7ec58f746a1a6970 (diff)
update data & analysis
Diffstat (limited to 'analysis/vis.livemd')
-rw-r--r--analysis/vis.livemd236
1 files changed, 163 insertions, 73 deletions
diff --git a/analysis/vis.livemd b/analysis/vis.livemd
index 4913e76..48b8e59 100644
--- a/analysis/vis.livemd
+++ b/analysis/vis.livemd
@@ -18,6 +18,7 @@ Mix.install([
```elixir
require Explorer.DataFrame
require Explorer.Series
+require VegaLite
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: SE
job_id = "current"
@@ -86,21 +87,31 @@ cost_model_points =
|> DF.new()
end)
|> DF.concat_rows()
+ |> DF.mutate(t: cast(t, {:duration, :nanosecond}))
```
## Cost model exploratory plots
-<!-- livebook:{"reevaluate_automatically":true} -->
-
```elixir
-set_impls = ["BTreeSet", "HashSet", "VecSet", "SortedVecSet"]
-mapping_impls = ["HashMap", "BTreeMap", "VecMap", "SortedVecMap"]
-other_impls = ["Vec", "LinkedList", "SortedVec"]
-
-impls = other_impls
-
defmodule CostModel do
@defaults %{y_domain: nil, ns: 1..60_000//100, draw_points: true}
+ @all_impls Enum.sort([
+ "SortedVec",
+ "SortedVecSet",
+ "SortedVecMap",
+ "Vec",
+ "VecSet",
+ "VecMap",
+ "BTreeSet",
+ "BTreeMap",
+ "HashSet",
+ "HashMap",
+ "LinkedList"
+ ])
+
+ # Returns the last `::`-separated segment of an implementation name,
+ # e.g. "a::b::Vec" becomes "Vec". Names without "::" come back unchanged.
+ def friendly_impl_name(impl) do
+   impl
+   |> String.split("::")
+   |> List.last()
+ end
def points_for(cost_models, ns, impl, op) do
%{"coeffs" => [coeffs]} =
@@ -116,7 +127,7 @@ defmodule CostModel do
|> Enum.sum()) + Enum.at(coeffs, 3) * Math.log2(n)
%{
- impl: String.split(impl, "::") |> List.last(),
+ impl: friendly_impl_name(impl),
op: op,
n: n,
t: max(t, 0)
@@ -146,7 +157,7 @@ defmodule CostModel do
cost_model_points
|> DF.filter(op == ^op and impl in ^impls)
|> DF.group_by(["impl", "n"])
- |> DF.summarise(t: mean(t)),
+ |> DF.summarise(t: mean(cast(t, :f32))),
"n",
"t",
color_by: "impl",
@@ -169,11 +180,61 @@ defmodule CostModel do
_ -> plot
end
end
+
+ # Draws one `plot/4` chart per group of implementations in `impl_splits`
+ # for operation `op`, then combines the charts with `Tucan.vconcat`.
+ def split_plot(cost_models, cost_model_points, impl_splits, op) do
+   # Assertive match: inside a function `@all_impls` expands to its literal
+   # sorted list, so this raises MatchError unless the groups together hold
+   # exactly the known implementations, each listed once.
+   @all_impls = impl_splits |> List.flatten() |> Enum.sort()
+
+   impl_splits
+   |> Enum.map(fn impls -> plot(cost_models, cost_model_points, impls, op) end)
+   |> Tucan.vconcat()
+ end
end
```
+<!-- livebook:{"reevaluate_automatically":true} -->
+
+```elixir
+# Cost-model charts for the "insert" operation, one sub-chart per group of
+# implementations listed below (the groups must cover every known
+# implementation, see CostModel.split_plot/4).
+graph =
+ CostModel.split_plot(
+ cost_models,
+ cost_model_points,
+ [
+ ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet", "VecMap"],
+ [
+ "Vec",
+ "LinkedList"
+ ],
+ ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+ ],
+ "insert"
+ )
+ # Resolve the colour scale independently; presumably so that each
+ # sub-chart gets its own colour legend - TODO confirm against the output.
+ |> VegaLite.resolve(:scale, color: :independent)
+
+# Write the chart out for use by the thesis.
+VegaLite.Export.save!(graph, "../thesis/assets/insert.json")
+
+# Last expression of the cell, so the chart is also the cell's result.
+graph
+```
+
```elixir
-CostModel.plot(cost_models, cost_model_points, other_impls, "remove")
+# Cost-model charts for the "contains" operation, one sub-chart per group of
+# implementations listed below. Unlike the "insert" cell, "VecMap" sits in
+# the second group here.
+graph =
+ CostModel.split_plot(
+ cost_models,
+ cost_model_points,
+ [
+ ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet"],
+ [
+ "Vec",
+ "LinkedList",
+ "VecMap"
+ ],
+ ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+ ],
+ "contains"
+ )
+ # Resolve the colour scale independently; presumably so that each
+ # sub-chart gets its own colour legend - TODO confirm against the output.
+ |> VegaLite.resolve(:scale, color: :independent)
+
+# Write the chart out for use by the thesis.
+VegaLite.Export.save!(graph, "../thesis/assets/contains.json")
+
+# Last expression of the cell, so the chart is also the cell's result.
+graph
```
## Read benchmark data
@@ -203,14 +264,29 @@ raw_benchmarks =
using:
Regex.scan(~r/\"(\w*)\", ([^)]*)/, Path.basename(dir))
|> Enum.map(fn [_, ctn, impl] -> %{ctn: ctn, impl: impl} end),
- mean: raw_results["mean"]["point_estimate"] / 10 ** 9,
- hi_95th: raw_results["mean"]["confidence_interval"]["upper_bound"] / 10 ** 9,
- lo_95th: raw_results["mean"]["confidence_interval"]["lower_bound"] / 10 ** 9
+ mean: raw_results["mean"]["point_estimate"],
+ stderr: raw_results["mean"]["standard_error"]
}
end)
end)
|> List.flatten()
|> DF.new()
+ |> DF.mutate(
+ mean: cast(mean, {:duration, :nanosecond}),
+ stderr: cast(stderr, {:duration, :nanosecond})
+ )
+```
+
+```elixir
+# `using` is a list of structs, but we aren't gonna make use of this mostly
+# and we want to be able to group by that column, so add a new column that's just a nice
+# string representation
+# also parse out the n value, which all of our benchmarks have
+display_using = fn assignments ->
+  # Each entry renders as "ctn=impl"; entries are separated by ", ".
+  Enum.map_join(assignments, ", ", fn %{"ctn" => ctn, "impl" => impl} ->
+    ctn <> "=" <> impl
+  end)
+end
```
```elixir
@@ -238,7 +314,7 @@ benchmarks =
)
# Get the total benchmark time for each project and assignment
|> DF.group_by(["proj", "using_idx"])
- |> DF.summarise(time: sum(mean))
+ |> DF.summarise(time: sum(cast(mean, :f32)))
# Convert using_idx back to original using values
|> DF.to_rows()
|> Enum.map(fn row = %{"using_idx" => using_idx} ->
@@ -292,12 +368,12 @@ cost_estimates =
estimate_impls = SE.distinct(cost_estimates["impl"])
true =
- (benchmarks
+ (raw_benchmarks
|> DF.explode("using")
|> DF.unnest("using"))["impl"]
|> SE.distinct()
|> SE.to_list()
- |> Enum.all?(fn impl -> SE.equal(estimate_impls, impl) |> SE.any?() end)
+ |> Enum.all?(&SE.any?(SE.equal(estimate_impls, &1)))
```
```elixir
@@ -352,54 +428,75 @@ DF.n_rows(singular_benchmarks)
```
```elixir
-# Compare each assignments position in the estimates to its position in the results
-sorted_singular_estimates =
- singular_estimated_costs
- |> DF.group_by(["proj"])
- |> DF.sort_by(estimated_cost)
-
-sorted_singular_results =
+# Best and predicted best implementation for each container type
+selection_comparison =
singular_benchmarks
+ |> DF.explode("using")
+ |> DF.unnest("using")
|> DF.group_by(["proj"])
- |> DF.sort_by(time)
-
-singular_position_comparison =
- sorted_singular_estimates
- |> DF.to_rows_stream()
- |> Enum.map(fn %{"proj" => proj, "using" => using} ->
- %{
- proj: proj,
- using: using,
- pos_estimate:
- DF.filter(sorted_singular_estimates, proj == ^proj)["using"]
- |> SE.to_list()
- |> Enum.find_index(fn u -> u == using end),
- pos_results:
- DF.filter(sorted_singular_results, proj == ^proj)["using"]
- |> SE.to_list()
- |> Enum.find_index(fn u -> u == using end)
- }
- end)
- |> DF.new()
+ |> DF.filter(time == min(time))
+ |> DF.join(
+ cost_estimates
+ |> DF.filter(not contains(impl, "until"))
+ |> DF.group_by(["proj", "ctn"])
+ |> DF.filter(cost == min(cost))
+ |> DF.rename(%{"impl" => "predicted_impl"})
+ )
+ |> DF.select(["proj", "ctn", "impl", "predicted_impl"])
+ |> DF.rename(%{"impl" => "best_impl"})
```
```elixir
-# Everywhere we predicted wrong.
-singular_position_comparison
-|> DF.filter(pos_estimate == 0 and pos_estimate != pos_results)
-|> DF.collect()
-```
+# Tools for printing out latex
+defmodule Latex do
+ # Escapes LaTeX special characters in `str` so it typesets as plain text.
+ # `{ } _ # & $ %` are prefixed with a backslash. `\`, `^` and `~` need named
+ # commands instead: `\\` is a line break, and `\^` / `\~` are accent
+ # commands that would consume the following character.
+ def escape_latex(str) do
+   String.replace(str, ~r/[\\{}_^#&$%~]/, fn
+     "\\" -> "\\textbackslash{}"
+     "^" -> "\\textasciicircum{}"
+     "~" -> "\\textasciitilde{}"
+     char -> "\\" <> char
+   end)
+ end
-```elixir
-singular_estimated_costs
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(estimated_cost)
-```
+ # Renders `df` as a LaTeX `tabular` string: one centred, bar-separated
+ # column per dataframe column, a header row with the (escaped) column
+ # names followed by `\hline`, then one row per dataframe row with every
+ # cell converted to a string and escaped.
+ def table(df) do
+   cols = DF.names(df)
+   col_spec = String.duplicate("c|", length(cols))
+   header = Enum.map_join(cols, " & ", &escape_latex/1)
+
+   body =
+     df
+     |> DF.to_rows()
+     |> Enum.map_join(" \\\\\n", fn row ->
+       Enum.map_join(cols, " & ", fn col ->
+         escape_latex(Kernel.to_string(row[col]))
+       end)
+     end)
+
+   "\\begin{tabular}{|" <>
+     col_spec <>
+     "}\n" <>
+     header <>
+     " \\\\\n\\hline\n" <>
+     body <>
+     " \\\\\n\\end{tabular}"
+ end
+end
-```elixir
-singular_benchmarks
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(time)
+# NOTE(review): the result of this call is neither bound nor printed; only
+# the table built by the pipeline below reaches IO.puts.
+Latex.table(selection_comparison)
+
+# Presentation copy of selection_comparison, printed as a LaTeX tabular:
+# implementation names are shortened, rows where the measured best and the
+# predicted best differ are flagged with "*", and columns get readable
+# headings.
+selection_comparison
+|> DF.put(
+ "best_impl",
+ SE.transform(selection_comparison["best_impl"], &CostModel.friendly_impl_name/1)
+)
+|> DF.put(
+ "predicted_impl",
+ SE.transform(selection_comparison["predicted_impl"], &CostModel.friendly_impl_name/1)
+)
+# The comparison reads the columns of the original `selection_comparison`
+# variable, i.e. the names before they were shortened above.
+|> DF.put(
+ "mark",
+ SE.not_equal(selection_comparison["best_impl"], selection_comparison["predicted_impl"])
+ |> SE.transform(&if &1, do: "*", else: "")
+)
+# The marker column is given a single-space heading.
+|> DF.rename(%{
+ "mark" => " ",
+ "proj" => "Project",
+ "ctn" => "Container Type",
+ "best_impl" => "Best implementation",
+ "predicted_impl" => "Predicted best"
+})
+|> Latex.table()
+|> IO.puts()
```
## Adaptive Containers
@@ -449,31 +546,24 @@ adaptive_raw_benchmarks =
```elixir
best_usings =
adaptive_raw_benchmarks
+ # get best set of assignments for each project
|> DF.group_by(["proj", "using"])
|> DF.filter(not contains(using, "until"))
- |> DF.summarise(total: sum(mean))
+ |> DF.summarise(total: sum(cast(mean, :f32)))
|> DF.group_by(["proj"])
|> DF.filter(total == min(total))
|> DF.discard("total")
|> DF.rename(%{"using" => "best_using"})
+ # select adaptive container and the best assignment for each project
|> DF.join(adaptive_raw_benchmarks)
|> DF.filter(using == best_using or contains(using, "until"))
- |> DF.pivot_longer(["hi_95th", "lo_95th"])
+ # summary data point
+ |> DF.mutate(value: cast(mean, :string) <> " +/- " <> cast(stderr, :string))
|> DF.select(["proj", "using", "n", "value"])
```
```elixir
-Tucan.errorbar(
- best_usings
- |> DF.filter(proj == "example_mapping"),
- "value",
- orient: :vertical,
- ticks: true,
- points: true,
- group_by: "n"
- # color_by: "using"
-)
-|> Tucan.Legend.set_orientation(:color, "bottom")
-|> Tucan.Legend.put_options(:color, label_limit: 1000)
-|> Tucan.set_size(500, 500)
+best_usings
+|> DF.filter(proj == "aoc_2022_09")
+|> DF.pivot_wider("n", "value")
```