<!-- livebook:{"app_settings":{"slug":"asdf"}} -->

# Dissertation Visualisations

```elixir
Mix.install([
  {:tucan, "~> 0.3.0"},
  {:kino_vega_lite, "~> 0.1.8"},
  {:json, "~> 1.4"},
  {:explorer, "~> 0.8.0"},
  {:kino_explorer, "~> 0.1.11"},
  {:math, "~> 0.7.0"}
])
```

## Variables

```elixir
require Explorer.DataFrame
require Explorer.Series
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: SE
# Name of the job to visualise; it is also the name of the directory holding its results.
job_id = "current"
# Build the relative path as a binary: appending a binary to a charlist creates an
# improper list. `Path.expand/1` already returns an absolute path, so no further
# `Path.absname/1` call is needed.
job_dir = Path.expand("./" <> job_id)
sections_dir = Path.join(job_dir, "sections")
cm_dir = Path.join([job_dir, "candelabra", "benchmark_results"])
criterion_dir = Path.join(job_dir, "criterion")
```

## Read cost model data

```elixir
# List the cost model directory; the match fails if the directory cannot be read.
{:ok, cost_model_files} = File.ls(cm_dir)

# Turn every bare entry name into an absolute path below `cm_dir`.
cost_model_files =
  for entry <- cost_model_files do
    cm_dir
    |> Path.join(entry)
    |> Path.absname()
  end

cost_model_files
```

<!-- livebook:{"reevaluate_automatically":true} -->

```elixir
# Parse the fitted cost models: one row per (implementation, operation) pair,
# carrying that operation's coefficient list.
cost_models =
  cost_model_files
  |> Enum.map(fn path ->
    # The implementation name is the file name with every "_" swapped for ":".
    impl_name =
      path
      |> Path.basename()
      |> String.replace("_", ":")

    decoded =
      path
      |> File.read!()
      |> JSON.decode!()

    rows =
      Enum.map(decoded["model"]["by_op"], fn {op_name, %{"coeffs" => op_coeffs}} ->
        %{op: op_name, impl: impl_name, coeffs: op_coeffs}
      end)

    DF.new(rows)
  end)
  |> DF.concat_rows()
```

```elixir
# Extract the raw [n, cost] result points stored per operation in each cost model file
cost_model_points =
  cost_model_files
  |> Enum.map(fn path ->
    full_name =
      path
      |> Path.basename()
      |> String.replace("_", ":")

    # Only the last "::"-separated segment of the name is kept for these rows.
    short_name =
      full_name
      |> String.split("::")
      |> List.last()

    decoded =
      path
      |> File.read!()
      |> JSON.decode!()

    decoded["results"]["by_op"]
    |> Enum.flat_map(fn {op_name, observations} ->
      Enum.map(observations, fn [size, cost] ->
        %{op: op_name, impl: short_name, n: size, t: cost}
      end)
    end)
    |> DF.new()
  end)
  |> DF.concat_rows()
```

## Cost model exploratory plots

<!-- livebook:{"reevaluate_automatically":true} -->

```elixir
# Implementation names, split into three groups.
set_impls = ~w(BTreeSet HashSet VecSet SortedVecSet)
mapping_impls = ~w(HashMap BTreeMap VecMap SortedVecMap)
other_impls = ~w(Vec LinkedList SortedVec)

# Group selected by default.
impls = other_impls

defmodule CostModel do
  @moduledoc """
  Helpers for evaluating fitted cost models and plotting them, optionally together with
  the raw observations.
  """

  # Options understood by `plot/5` and their default values.
  @defaults %{y_domain: nil, ns: 1..60_000//100, draw_points: true}

  @doc """
  Evaluates the cost model of `impl` for operation `op` at every size in `ns`.

  `cost_models` must hold exactly one row for the `impl`/`op` pair, otherwise the match on
  the coefficient column fails. The estimate for a size `n` is
  `c0 + c1 * n + c2 * n^2 + c3 * log2(n)`, and negative estimates are clamped to zero.

  Returns a dataframe with the columns `impl` (the last `::`-separated segment of the
  name), `op`, `n` and `t`.
  """
  def points_for(cost_models, ns, impl, op) do
    %{"coeffs" => [coeffs]} =
      cost_models
      |> DF.filter(impl == ^impl and op == ^op)
      |> DF.to_columns()

    ns
    |> Enum.map(fn n ->
      %{
        impl: short_impl_name(impl),
        op: op,
        n: n,
        t: max(estimate(coeffs, n), 0)
      }
    end)
    |> DF.new()
  end

  @doc """
  Plots the estimated cost of `op` against container size for the given `impls`.

  `impls` holds short implementation names, i.e. the last `::`-separated segment.

  ## Options

    * `:y_domain` - a `[lo, hi]` list fixing the y axis domain; any other value leaves the
      axis untouched (default `nil`)
    * `:ns` - the sizes at which the models are evaluated (default `1..60_000//100`)
    * `:draw_points` - whether to add the mean of the raw observations as a scatter layer
      (default `true`)
  """
  def plot(cost_models, cost_model_points, impls, op, opts \\ []) do
    settings = Enum.into(opts, @defaults)

    curves = model_layer(cost_models, settings.ns, impls, op)

    extra_layers =
      if settings.draw_points do
        [points_layer(cost_model_points, impls, op)]
      else
        []
      end

    [curves | extra_layers]
    |> Tucan.layers()
    |> Tucan.Axes.set_y_title("Estimated cost")
    |> Tucan.Axes.set_x_title("Size of container (n)")
    |> Tucan.set_size(500, 250)
    |> Tucan.Legend.set_title(:color, "Implementation")
    |> apply_y_domain(settings.y_domain)
  end

  # Line layer: one model curve per implementation that has a model for `op`.
  defp model_layer(cost_models, ns, impls, op) do
    cost_models
    |> DF.filter(op == ^op)
    |> DF.distinct(["impl"])
    |> DF.to_rows()
    |> Enum.map(fn %{"impl" => impl} -> points_for(cost_models, ns, impl, op) end)
    |> DF.concat_rows()
    # `points_for/4` emits short names, so the selection happens after evaluation.
    |> DF.filter(impl in ^impls)
    |> Tucan.lineplot("n", "t", color_by: "impl", clip: true)
  end

  # Scatter layer: mean observed cost per implementation and size.
  defp points_layer(cost_model_points, impls, op) do
    cost_model_points
    |> DF.filter(op == ^op and impl in ^impls)
    |> DF.group_by(["impl", "n"])
    |> DF.summarise(t: mean(t))
    |> Tucan.scatter("n", "t", color_by: "impl", clip: true)
  end

  # Polynomial in `n` built from the first three coefficients, plus the fourth
  # coefficient times log2(n).
  defp estimate(coeffs, n) do
    polynomial =
      coeffs
      |> Enum.take(3)
      |> Enum.with_index(fn coeff, power -> coeff * n ** power end)
      |> Enum.sum()

    polynomial + Enum.at(coeffs, 3) * Math.log2(n)
  end

  # Last "::"-separated segment of an implementation name.
  defp short_impl_name(impl) do
    impl
    |> String.split("::")
    |> List.last()
  end

  # Only a two-element list is treated as a y domain.
  defp apply_y_domain(plot, [lo, hi]), do: Tucan.Scale.set_y_domain(plot, lo, hi)
  defp apply_y_domain(plot, _no_domain), do: plot
end
```

```elixir
# Plot the cost models of the "remove" operation for `other_impls`, using the default
# options (which include the scatter layer of raw points).
CostModel.plot(cost_models, cost_model_points, other_impls, "remove")
```

## Read benchmark data

```elixir
# Read in the results of every individual criterion benchmark.
# The layout walked here is criterion_dir/<bench>/<subbench>/<run>/estimates.json; only
# <run> directories whose name contains "Mapping2D" are read.
raw_benchmarks =
  DF.new(
    for bench <- File.ls!(criterion_dir),
        subbench <- File.ls!(Path.join(criterion_dir, bench)),
        run <- File.ls!(Path.join([criterion_dir, bench, subbench])),
        String.contains?(run, "Mapping2D") do
      run_dir = Path.join([criterion_dir, bench, subbench, run])

      estimates =
        run_dir
        |> Path.join("estimates.json")
        |> File.read!()
        |> JSON.decode!()

      mean_stats = estimates["mean"]
      interval = mean_stats["confidence_interval"]

      # Every `"<ctn>", <impl>` pair found in the run directory's name.
      assignment =
        ~r/\"(\w*)\", ([^)]*)/
        |> Regex.scan(Path.basename(run_dir))
        |> Enum.map(fn [_, ctn, impl] -> %{ctn: ctn, impl: impl} end)

      %{
        bench_id: bench <> "/" <> subbench,
        # The project name is the part of the benchmark name before the first "-".
        proj: bench |> String.split("-") |> hd(),
        using: assignment,
        # All three estimates are divided by 10 ** 9.
        mean: mean_stats["point_estimate"] / 10 ** 9,
        hi_95th: interval["upper_bound"] / 10 ** 9,
        lo_95th: interval["lower_bound"] / 10 ** 9
      }
    end
  )
```

```elixir
# Aggregate benchmark results by project, since we can only do assignments by project.
# Unfortunately we can't group by lists, so each distinct `using` value is swapped for an
# integer index while grouping and swapped back afterwards.
# This is basically equivalent to:
# benchmarks = raw_benchmarks
# |> DF.group_by(["proj", "using"])
# |> DF.summarise(time: sum(mean))

# Build list of using values to index into
usings =
  raw_benchmarks["using"]
  |> SE.to_list()
  |> Enum.uniq()

# Lookup tables in both directions, so each conversion is a map access rather than a
# scan over `usings`.
using_to_idx = usings |> Enum.with_index() |> Map.new()
idx_to_using = Map.new(using_to_idx, fn {using, idx} -> {idx, using} end)

benchmarks =
  raw_benchmarks
  # Make a column corresponding to using that isn't a list
  |> DF.put(
    "using_idx",
    raw_benchmarks["using"]
    |> SE.to_list()
    |> Enum.map(&Map.fetch!(using_to_idx, &1))
  )
  # Get the total benchmark time for each project and assignment
  |> DF.group_by(["proj", "using_idx"])
  |> DF.summarise(time: sum(mean))
  # Convert using_idx back to original using values
  |> DF.to_rows()
  |> Enum.map(fn row = %{"using_idx" => using_idx} ->
    Map.put(row, "using", Map.fetch!(idx_to_using, using_idx))
  end)
  |> DF.new()
  |> DF.select(["proj", "time", "using"])
```

## Read cost estimate data

```elixir
# Cost estimates by project, ctn, and implementation.
# Each project's estimates are read from the LaTeX table in sections_dir/compare-<proj>.
projs = SE.distinct(benchmarks["proj"])

cost_estimates =
  projs
  |> SE.transform(fn proj_name ->
    latex =
      sections_dir
      |> Path.join("compare-" <> proj_name)
      |> File.read!()

    # The table rows start right after the header line that ends in "& file \\" + \hline ...
    [_, table | _] = String.split(latex, "& file \\\\\n\\hline\n")
    # ... and stop just before \end{tabular}.
    [table_rows | _] = String.split(table, "\n\\end{tabular}")

    table_rows
    |> String.split("\n")
    |> Enum.map(&String.split(&1, " & "))
    |> Enum.map(fn [ctn, impl, cost | _] ->
      parsed_cost =
        if String.contains?(cost, ".") do
          String.to_float(cost)
        else
          String.to_integer(cost)
        end

      %{
        proj: proj_name,
        ctn: ctn,
        # Undo the LaTeX escaping of underscores
        impl: String.replace(impl, "\\_", "_"),
        cost: parsed_cost
      }
    end)
  end)
  |> SE.to_list()
  |> List.flatten()
  |> DF.new()
```

```elixir
# Double-check that we have all of the cost estimates for everything mentioned in the assignments
estimate_impls = SE.distinct(cost_estimates["impl"])

# The match fails unless every benchmarked implementation also appears in the estimates.
true =
  benchmarks
  |> DF.explode("using")
  |> DF.unnest("using")
  |> DF.pull("impl")
  |> SE.distinct()
  |> SE.to_list()
  |> Enum.all?(fn impl ->
    estimate_impls
    |> SE.equal(impl)
    |> SE.any?()
  end)
```

```elixir
# Gets the cost of assignment from cost estimates: the sum, over every ctn/impl pair in
# the assignment, of the first matching estimate for the project.
cost_of_assignment = fn proj, assignment ->
  Enum.reduce(assignment, 0, fn %{"ctn" => ctn, "impl" => impl}, total ->
    matching = DF.filter(cost_estimates, proj == ^proj and ctn == ^ctn and impl == ^impl)
    total + matching["cost"][0]
  end)
end

cost_of_assignment.("example_stack", [%{"ctn" => "StackCon", "impl" => "std::vec::Vec"}])
```

```elixir
# For each benchmarked assignment, estimate the cost.
estimated_costs =
  benchmarks
  |> DF.to_rows_stream()
  |> Enum.map(fn %{"proj" => proj, "using" => assignment} ->
    estimate = cost_of_assignment.(proj, assignment)
    %{proj: proj, using: assignment, estimated_cost: estimate}
  end)
  |> DF.new()
```

## Estimates vs results (ignoring adaptive containers)

```elixir
# Don't worry about adaptive containers for now: drop every assignment in which some
# implementation name contains "until".
singular_estimated_costs =
  estimated_costs
  |> DF.to_rows_stream()
  |> Enum.reject(fn %{"using" => using} ->
    Enum.any?(using, fn %{"impl" => impl} -> String.contains?(impl, "until") end)
  end)
  |> DF.new()

# Same selection, applied to the measured results.
singular_benchmarks =
  benchmarks
  |> DF.to_rows_stream()
  |> Enum.reject(fn %{"using" => using} ->
    Enum.any?(using, fn %{"impl" => impl} -> String.contains?(impl, "until") end)
  end)
  |> DF.new()

DF.n_rows(singular_benchmarks)
```

```elixir
# Compare each assignment's position in the estimates to its position in the results
sorted_singular_estimates =
  singular_estimated_costs
  |> DF.group_by(["proj"])
  |> DF.sort_by(estimated_cost)

sorted_singular_results =
  singular_benchmarks
  |> DF.group_by(["proj"])
  |> DF.sort_by(time)

# 0-based position of `using` among the rows of `sorted` that belong to `proj`
# (nil when it is not present).
position_of = fn sorted, proj, using ->
  sorted
  |> DF.filter(proj == ^proj)
  |> DF.pull("using")
  |> SE.to_list()
  |> Enum.find_index(&(&1 == using))
end

singular_position_comparison =
  sorted_singular_estimates
  |> DF.to_rows_stream()
  |> Enum.map(fn %{"proj" => proj, "using" => using} ->
    %{
      proj: proj,
      using: using,
      pos_estimate: position_of.(sorted_singular_estimates, proj, using),
      pos_results: position_of.(sorted_singular_results, proj, using)
    }
  end)
  |> DF.new()
```

```elixir
# Everywhere we predicted wrong: assignments that the estimates put in position 0 but
# whose position in the results is different.
singular_position_comparison
|> DF.filter(pos_estimate == 0 and pos_estimate != pos_results)
|> DF.collect()
```

```elixir
# Estimated costs of the non-adaptive assignments for prime_sieve, sorted by estimated cost
singular_estimated_costs
|> DF.filter(proj == "prime_sieve")
|> DF.sort_by(estimated_cost)
```

```elixir
# Benchmark results of the non-adaptive assignments for prime_sieve, sorted by time
singular_benchmarks
|> DF.filter(proj == "prime_sieve")
|> DF.sort_by(time)
```

## Adaptive Containers

```elixir
# Projects where an adaptive container was suggested, i.e. projects with at least one
# assignment in which an implementation name contains "until".
adaptive_projs =
  estimated_costs
  |> DF.to_rows()
  |> Enum.filter(fn %{"using" => using} ->
    Enum.any?(using, fn %{"impl" => impl} -> String.contains?(impl, "until") end)
  end)
  |> DF.new()
  |> DF.distinct(["proj"])
  |> DF.pull("proj")
```

```elixir
# Restrict the estimates and the raw benchmarks to the projects in `adaptive_projs`
adaptive_estimated_costs = DF.filter(estimated_costs, proj in ^adaptive_projs)
adaptive_raw_benchmarks = DF.filter(raw_benchmarks, proj in ^adaptive_projs)

# Renders an assignment as "ctn=impl, ctn=impl, ..."
display_using = fn using ->
  Enum.map_join(using, ", ", fn %{"ctn" => ctn, "impl" => impl} -> ctn <> "=" <> impl end)
end

# Second "/"-separated component of every bench_id
n_column =
  adaptive_raw_benchmarks["bench_id"]
  |> SE.split("/")
  |> SE.transform(&Enum.at(&1, 1))

# Human-readable form of every assignment
using_column = SE.transform(adaptive_raw_benchmarks["using"], display_using)

adaptive_raw_benchmarks =
  adaptive_raw_benchmarks
  |> DF.put("n", n_column)
  |> DF.put("using", using_column)
```

```elixir
# For every project, pick the assignment without "until" in its description that has the
# smallest total mean time, then keep that assignment's benchmark rows together with the
# rows of every assignment that does contain "until".
best_usings =
  adaptive_raw_benchmarks
  |> DF.group_by(["proj", "using"])
  # Leave out the assignments whose description contains "until"
  |> DF.filter(not contains(using, "until"))
  # Total of the mean times per project and assignment
  |> DF.summarise(total: sum(mean))
  |> DF.group_by(["proj"])
  # Keep the assignment with the smallest total
  |> DF.filter(total == min(total))
  |> DF.discard("total")
  # Rename so the column does not clash with "using" in the joined rows
  |> DF.rename(%{"using" => "best_using"})
  # Bring the individual benchmark rows back in
  |> DF.join(adaptive_raw_benchmarks)
  |> DF.filter(using == best_using or contains(using, "until"))
  # Stack the two confidence bounds into a single column (selected as "value" below)
  |> DF.pivot_longer(["hi_95th", "lo_95th"])
  |> DF.select(["proj", "using", "n", "value"])
```

```elixir
# Error bar plot of the "value" column (the stacked confidence bounds) for the
# example_mapping project, grouped by n.
Tucan.errorbar(
  best_usings
  |> DF.filter(proj == "example_mapping"),
  "value",
  orient: :vertical,
  ticks: true,
  points: true,
  group_by: "n"
  # color_by: "using"
)
# NOTE(review): these legend settings target the :color channel while `color_by` above is
# commented out — confirm the plot still defines a color encoding.
|> Tucan.Legend.set_orientation(:color, "bottom")
|> Tucan.Legend.put_options(:color, label_limit: 1000)
|> Tucan.set_size(500, 500)
```