Diffstat (limited to 'analysis/vis.livemd')
-rw-r--r--  analysis/vis.livemd | 236
1 file changed, 163 insertions(+), 73 deletions(-)
diff --git a/analysis/vis.livemd b/analysis/vis.livemd
index 4913e76..48b8e59 100644
--- a/analysis/vis.livemd
+++ b/analysis/vis.livemd
@@ -18,6 +18,7 @@ Mix.install([
 ```elixir
 require Explorer.DataFrame
 require Explorer.Series
+require VegaLite
 alias Explorer.DataFrame, as: DF
 alias Explorer.Series, as: SE
 job_id = "current"
@@ -86,21 +87,31 @@ cost_model_points =
     |> DF.new()
   end)
   |> DF.concat_rows()
+  |> DF.mutate(t: cast(t, {:duration, :nanosecond}))
 ```

 ## Cost model exploratory plots

-<!-- livebook:{"reevaluate_automatically":true} -->
-
 ```elixir
-set_impls = ["BTreeSet", "HashSet", "VecSet", "SortedVecSet"]
-mapping_impls = ["HashMap", "BTreeMap", "VecMap", "SortedVecMap"]
-other_impls = ["Vec", "LinkedList", "SortedVec"]
-
-impls = other_impls
-
 defmodule CostModel do
   @defaults %{y_domain: nil, ns: 1..60_000//100, draw_points: true}
+  @all_impls Enum.sort([
+               "SortedVec",
+               "SortedVecSet",
+               "SortedVecMap",
+               "Vec",
+               "VecSet",
+               "VecMap",
+               "BTreeSet",
+               "BTreeMap",
+               "HashSet",
+               "HashMap",
+               "LinkedList"
+             ])
+
+  def friendly_impl_name(impl) do
+    String.split(impl, "::") |> List.last()
+  end

   def points_for(cost_models, ns, impl, op) do
     %{"coeffs" => [coeffs]} =
@@ -116,7 +127,7 @@ defmodule CostModel do
        |> Enum.sum()) + Enum.at(coeffs, 3) * Math.log2(n)

     %{
-      impl: String.split(impl, "::") |> List.last(),
+      impl: friendly_impl_name(impl),
       op: op,
       n: n,
       t: max(t, 0)
@@ -146,7 +157,7 @@ defmodule CostModel do
       cost_model_points
       |> DF.filter(op == ^op and impl in ^impls)
       |> DF.group_by(["impl", "n"])
-      |> DF.summarise(t: mean(t)),
+      |> DF.summarise(t: mean(cast(t, :f32))),
       "n",
       "t",
       color_by: "impl",
@@ -169,11 +180,61 @@ defmodule CostModel do
       _ -> plot
     end
   end
+
+  def split_plot(cost_models, cost_model_points, impl_splits, op) do
+    @all_impls = List.flatten(impl_splits) |> Enum.sort()
+
+    Enum.map(impl_splits, &plot(cost_models, cost_model_points, &1, op))
+    |> Tucan.vconcat()
+  end
 end
 ```

+<!-- livebook:{"reevaluate_automatically":true} -->
+
+```elixir
+graph =
+  CostModel.split_plot(
+    cost_models,
+    cost_model_points,
+    [
+      ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet", "VecMap"],
+      [
+        "Vec",
+        "LinkedList"
+      ],
+      ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+    ],
+    "insert"
+  )
+  |> VegaLite.resolve(:scale, color: :independent)
+
+VegaLite.Export.save!(graph, "../thesis/assets/insert.json")
+
+graph
+```
+
 ```elixir
-CostModel.plot(cost_models, cost_model_points, other_impls, "remove")
+graph =
+  CostModel.split_plot(
+    cost_models,
+    cost_model_points,
+    [
+      ["SortedVec", "SortedVecSet", "SortedVecMap", "VecSet"],
+      [
+        "Vec",
+        "LinkedList",
+        "VecMap"
+      ],
+      ["BTreeSet", "BTreeMap", "HashSet", "HashMap"]
+    ],
+    "contains"
+  )
+  |> VegaLite.resolve(:scale, color: :independent)
+
+VegaLite.Export.save!(graph, "../thesis/assets/contains.json")
+
+graph
+```

 ## Read benchmark data
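Two notes on the `CostModel` changes above. First, the bare `@all_impls = List.flatten(impl_splits) |> Enum.sort()` match in `split_plot` is not a reassignment: inside a function the attribute expands to its compile-time value, so the line acts as an assertion that the splits cover exactly the known implementations, raising `MatchError` otherwise. Second, `points_for` evaluates the fitted cost model as a polynomial in `n` plus a log term, clamped at zero. Below is a minimal standalone sketch of a plausible reading of that (truncated) pipeline, with made-up coefficients and Erlang's `:math.log2/1` standing in for the notebook's `Math.log2`:

```elixir
# Hypothetical fitted coefficients [c1, c2, c3, c_log]:
# t(n) = c1*n + c2*n^2 + c3*n^3 + c_log*log2(n), clamped at zero.
coeffs = [1.0e-3, 2.0e-7, 0.0, 5.0e-2]

cost = fn n ->
  poly =
    coeffs
    |> Enum.take(3)
    |> Enum.with_index()
    |> Enum.map(fn {c, i} -> c * n ** (i + 1) end)
    |> Enum.sum()

  # A negative predicted cost is meaningless, so clamp like points_for does.
  max(poly + Enum.at(coeffs, 3) * :math.log2(n), 0)
end

# Modelled per-operation time at container size 1_000.
cost.(1_000)
```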
@@ -203,14 +264,29 @@ raw_benchmarks =
         using:
           Regex.scan(~r/\"(\w*)\", ([^)]*)/, Path.basename(dir))
           |> Enum.map(fn [_, ctn, impl] -> %{ctn: ctn, impl: impl} end),
-        mean: raw_results["mean"]["point_estimate"] / 10 ** 9,
-        hi_95th: raw_results["mean"]["confidence_interval"]["upper_bound"] / 10 ** 9,
-        lo_95th: raw_results["mean"]["confidence_interval"]["lower_bound"] / 10 ** 9
+        mean: raw_results["mean"]["point_estimate"],
+        stderr: raw_results["mean"]["standard_error"]
       }
     end)
   end)
   |> List.flatten()
   |> DF.new()
+  |> DF.mutate(
+    mean: cast(mean, {:duration, :nanosecond}),
+    stderr: cast(stderr, {:duration, :nanosecond})
+  )
 ```

+```elixir
+# `using` is a list of structs, but we aren't going to make much use of it,
+# and we want to be able to group by that column, so add a new column that's
+# just a nice string representation.
+# Also parse out the n value, which all of our benchmarks have.
+display_using = fn using ->
+  using
+  |> Enum.map(fn %{"ctn" => ctn, "impl" => impl} -> ctn <> "=" <> impl end)
+  |> Enum.join(", ")
+end
+```
+
 ```elixir
@@ -238,7 +314,7 @@ benchmarks =
   )
   # Get the total benchmark time for each project and assignment
   |> DF.group_by(["proj", "using_idx"])
-  |> DF.summarise(time: sum(mean))
+  |> DF.summarise(time: sum(cast(mean, :f32)))
   # Convert using_idx back to original using values
   |> DF.to_rows()
   |> Enum.map(fn row = %{"using_idx" => using_idx} ->
@@ -292,12 +368,12 @@ cost_estimates =
 estimate_impls = SE.distinct(cost_estimates["impl"])

 true =
-  (benchmarks
+  (raw_benchmarks
    |> DF.explode("using")
    |> DF.unnest("using"))["impl"]
   |> SE.distinct()
   |> SE.to_list()
-  |> Enum.all?(fn impl -> SE.equal(estimate_impls, impl) |> SE.any?() end)
+  |> Enum.all?(&SE.any?(SE.equal(estimate_impls, &1)))
 ```

 ```elixir
@@ -352,54 +428,75 @@ DF.n_rows(singular_benchmarks)
 ```

 ```elixir
-# Compare each assignments position in the estimates to its position in the results
-sorted_singular_estimates =
-  singular_estimated_costs
-  |> DF.group_by(["proj"])
-  |> DF.sort_by(estimated_cost)
-
-sorted_singular_results =
+# Best and predicted best implementation for each container type
+selection_comparison =
   singular_benchmarks
+  |> DF.explode("using")
+  |> DF.unnest("using")
   |> DF.group_by(["proj"])
-  |> DF.sort_by(time)
-
-singular_position_comparison =
-  sorted_singular_estimates
-  |> DF.to_rows_stream()
-  |> Enum.map(fn %{"proj" => proj, "using" => using} ->
-    %{
-      proj: proj,
-      using: using,
-      pos_estimate:
-        DF.filter(sorted_singular_estimates, proj == ^proj)["using"]
-        |> SE.to_list()
-        |> Enum.find_index(fn u -> u == using end),
-      pos_results:
-        DF.filter(sorted_singular_results, proj == ^proj)["using"]
-        |> SE.to_list()
-        |> Enum.find_index(fn u -> u == using end)
-    }
-  end)
-  |> DF.new()
+  |> DF.filter(time == min(time))
+  |> DF.join(
+    cost_estimates
+    |> DF.filter(not contains(impl, "until"))
+    |> DF.group_by(["proj", "ctn"])
+    |> DF.filter(cost == min(cost))
+    |> DF.rename(%{"impl" => "predicted_impl"})
+  )
+  |> DF.select(["proj", "ctn", "impl", "predicted_impl"])
+  |> DF.rename(%{"impl" => "best_impl"})
 ```

 ```elixir
-# Everywhere we predicted wrong.
-singular_position_comparison
-|> DF.filter(pos_estimate == 0 and pos_estimate != pos_results)
-|> DF.collect()
-```
+# Tools for printing out latex
+defmodule Latex do
+  def escape_latex(str) do
+    String.replace(str, ~r/(\\|{|}|_|\^|#|&|\$|%|~)/, "\\\\\\1")
+  end

-```elixir
-singular_estimated_costs
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(estimated_cost)
-```
+  def table(df) do
+    cols = DF.names(df)
+
+    "\\begin{tabular}{|" <>
+      String.duplicate("c|", length(cols)) <>
+      "}\n" <>
+      Enum.join(Enum.map(cols, &escape_latex/1), " & ") <>
+      " \\\\\n\\hline\n" <>
+      (DF.to_rows(df)
+       |> Enum.map(fn row ->
+         cols
+         |> Enum.map(&escape_latex(Kernel.to_string(row[&1])))
+         |> Enum.join(" & ")
+       end)
+       |> Enum.join(" \\\\\n")) <>
+      " \\\\\n\\end{tabular}"
+  end
+end

-```elixir
-singular_benchmarks
-|> DF.filter(proj == "prime_sieve")
-|> DF.sort_by(time)
+Latex.table(selection_comparison)
+
+selection_comparison
+|> DF.put(
+  "best_impl",
+  SE.transform(selection_comparison["best_impl"], &CostModel.friendly_impl_name/1)
+)
+|> DF.put(
+  "predicted_impl",
+  SE.transform(selection_comparison["predicted_impl"], &CostModel.friendly_impl_name/1)
+)
+|> DF.put(
+  "mark",
+  SE.not_equal(selection_comparison["best_impl"], selection_comparison["predicted_impl"])
+  |> SE.transform(&if &1, do: "*", else: "")
+)
+|> DF.rename(%{
+  "mark" => " ",
+  "proj" => "Project",
+  "ctn" => "Container Type",
+  "best_impl" => "Best implementation",
+  "predicted_impl" => "Predicted best"
+})
+|> Latex.table()
+|> IO.puts()
 ```

 ## Adaptive Containers
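As a quick illustration of the `Latex.escape_latex/1` helper added above (hypothetical input): each LaTeX special character gains a leading backslash, so the `_`, `&`, and `%` characters in raw names survive `Latex.table/1` intact:

```elixir
# Backslash-escape every LaTeX special character in the input.
Latex.escape_latex("sorted_vec & 5% #1")
#=> "sorted\\_vec \\& 5\\% \\#1"
```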
@@ -449,31 +546,24 @@ adaptive_raw_benchmarks =
 ```elixir
 best_usings =
   adaptive_raw_benchmarks
+  # get best set of assignments for each project
   |> DF.group_by(["proj", "using"])
   |> DF.filter(not contains(using, "until"))
-  |> DF.summarise(total: sum(mean))
+  |> DF.summarise(total: sum(cast(mean, :f32)))
   |> DF.group_by(["proj"])
   |> DF.filter(total == min(total))
   |> DF.discard("total")
   |> DF.rename(%{"using" => "best_using"})
+  # select adaptive container and the best assignment for each project
   |> DF.join(adaptive_raw_benchmarks)
   |> DF.filter(using == best_using or contains(using, "until"))
-  |> DF.pivot_longer(["hi_95th", "lo_95th"])
+  # summary data point
+  |> DF.mutate(value: cast(mean, :string) <> " +/- " <> cast(stderr, :string))
   |> DF.select(["proj", "using", "n", "value"])
 ```

 ```elixir
-Tucan.errorbar(
-  best_usings
-  |> DF.filter(proj == "example_mapping"),
-  "value",
-  orient: :vertical,
-  ticks: true,
-  points: true,
-  group_by: "n"
-  # color_by: "using"
-)
-|> Tucan.Legend.set_orientation(:color, "bottom")
-|> Tucan.Legend.put_options(:color, label_limit: 1000)
-|> Tucan.set_size(500, 500)
+best_usings
+|> DF.filter(proj == "aoc_2022_09")
+|> DF.pivot_wider("n", "value")
 ```
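A pattern worth noting across this commit: timing columns are now stored as `{:duration, :nanosecond}`, and every aggregation gains a `cast(..., :f32)`, suggesting duration series need a numeric cast before `sum/1` or `mean/1`. A minimal sketch of the pattern with made-up data (assuming a recent Explorer; not code from the notebook):

```elixir
require Explorer.DataFrame, as: DF

# Three fake timings, stored as nanosecond durations like the benchmark data.
df =
  DF.new(proj: ["a", "a", "b"], mean: [1_000, 2_000, 3_000])
  |> DF.mutate(mean: cast(mean, {:duration, :nanosecond}))

# Cast back to a float dtype at the aggregation site, as the diff does.
df
|> DF.group_by(["proj"])
|> DF.summarise(time: sum(cast(mean, :f32)))
```

Keeping the columns as durations makes printed tables human-readable, while the `:f32` cast stays confined to the aggregation sites.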