From 47941ae2594c8eb3cea07d40352a96a7243b8cee Mon Sep 17 00:00:00 2001 From: Aria Shrimpton Date: Mon, 1 Apr 2024 15:20:39 +0100 Subject: redraft #2 --- Tasks.org | 2 +- thesis/biblio.bib | 40 +++++++++++--------- thesis/main.tex | 1 + thesis/parts/abstract.tex | 10 ++--- thesis/parts/acknowledgements.tex | 2 +- thesis/parts/background.tex | 80 +++++++++++++++++++++------------------ thesis/parts/conclusion.tex | 5 ++- thesis/parts/design.tex | 27 ++++++------- thesis/parts/implementation.tex | 15 ++++---- thesis/parts/introduction.tex | 12 +++--- thesis/parts/results.tex | 53 ++++++++++++++------------ 11 files changed, 134 insertions(+), 113 deletions(-) diff --git a/Tasks.org b/Tasks.org index e6274f8..c86beb0 100644 --- a/Tasks.org +++ b/Tasks.org @@ -2,6 +2,6 @@ #+TODO: TODO(t) DOING(w) | DONE(d) BLOCKED(b) CANCELED(c) #+FILETAGS: :@school: -* TODO Redraft (#2) +* TODO Redraft (#3) * TODO Get more feedback diff --git a/thesis/biblio.bib b/thesis/biblio.bib index 5237df9..63dac32 100644 --- a/thesis/biblio.bib +++ b/thesis/biblio.bib @@ -7,7 +7,7 @@ booktitle = {Proceedings of the 32nd {ACM} {SIGPLAN} Conference on Programming Language Design and Implementation}, publisher = {Association for Computing Machinery}, author = {Jung, Changhee and Rus, Silvius and Railing, Brian P. and Clark, Nathan and Pande, Santosh}, - date = {2011}, + year = {2011}, } @inproceedings{thomas_framework_2005, @@ -19,7 +19,7 @@ booktitle = {Proceedings of the Tenth {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming}, publisher = {Association for Computing Machinery}, author = {Thomas, Nathan and Tanase, Gabriel and Tkachyshyn, Olga and Perdue, Jack and Amato, Nancy M. and Rauchwerger, Lawrence}, - date = {2005}, + year = {2005}, } @inproceedings{osterlund_dynamically_2013, @@ -28,7 +28,7 @@ pages = {410--420}, booktitle = {2013 28th {IEEE}/{ACM} International Conference on Automated Software Engineering ({ASE})}, author = {Österlund, Erik and Löwe, Welf}, - date = {2013}, + year = {2013}, } @inproceedings{franke_collection_2022, @@ -40,7 +40,7 @@ booktitle = {Proceedings of the 15th {ACM} {SIGPLAN} International Conference on Software Language Engineering}, publisher = {Association for Computing Machinery}, author = {Franke, Björn and Li, Zhibo and Morton, Magnus and Steuwer, Michel}, - date = {2022}, + year = {2022}, } @article{qin_primrose_2023, @@ -52,15 +52,12 @@ number = {3}, journaltitle = {The Art, Science, and Engineering of Programming}, author = {Qin, Xueying and O'Connor, Liam and Steuwer, Michel}, - urldate = {2023-09-25}, date = {2023-02-15}, - eprinttype = {arxiv}, - eprint = {2205.09655 [cs]}, + year = {2023} } @inproceedings{costa_collectionswitch_2018, title = {{CollectionSwitch}: a framework for efficient and dynamic collection selection}, - isbn = {978-1-4503-5617-6}, url = {https://dl.acm.org/doi/10.1145/3168825}, doi = {10.1145/3168825}, shorttitle = {{CollectionSwitch}}, @@ -69,8 +66,8 @@ booktitle = {Proceedings of the 2018 International Symposium on Code Generation and Optimization}, publisher = {{ACM}}, author = {Costa, Diego and Andrzejak, Artur}, - urldate = {2023-09-21}, date = {2018-02-24}, + year = {2018} } @inproceedings{shacham_chameleon_2009, @@ -83,8 +80,8 @@ booktitle = {Proceedings of the 30th {ACM} {SIGPLAN} Conference on Programming Language Design and Implementation}, publisher = {{ACM}}, author = {Shacham, Ohad and Vechev, Martin and Yahav, Eran}, - urldate = {2023-09-21}, date = {2009-06-15}, + year = {2009} } 
@incollection{hutchison_coco_2013, @@ -96,8 +93,8 @@ booktitle = {{ECOOP} 2013 – Object-Oriented Programming}, publisher = {Springer Berlin Heidelberg}, author = {Xu, Guoqing}, - urldate = {2023-10-17}, - date = {2013}, + date = {2023-10-17}, + year = {2013}, doi = "10.1007/978-3-642-39038-8_1", } @@ -109,6 +106,7 @@ booktitle = {2009 International Symposium on Code Generation and Optimization}, author = {{L. Liu} and {S. Rus}}, date = {2009-03-22}, + year = {2009} } @article{jung_brainy_2011-1, @@ -122,8 +120,8 @@ journaltitle = {{ACM} {SIGPLAN} Notices}, shortjournal = {{SIGPLAN} Notices}, author = {Jung, Changhee and Rus, Silvius and Railing, Brian P. and Clark, Nathan and Pande, Santosh}, - urldate = {2023-09-21}, date = {2011-06-04}, + year = {2011} } @inproceedings{costa_empirical_2017, @@ -135,7 +133,7 @@ booktitle = {Proceedings of the 8th {ACM}/{SPEC} on International Conference on Performance Engineering}, publisher = {Association for Computing Machinery}, author = {Costa, Diego and Andrzejak, Artur and Seboek, Janos and Lo, David}, - date = {2017}, + year = {2017}, } @online{wastl_advent_2015, @@ -143,7 +141,7 @@ url = {https://adventofcode.com/2022/about}, author = {Wastl, Eric}, urldate = {2024-03-08}, - date = {2015}, + year = {2015}, } @misc{rust_documentation_team_btreemap_2024, @@ -151,7 +149,15 @@ url = {https://doc.rust-lang.org/stable/std/collections/struct.BTreeMap.html}, author = {{Rust Documentation Team}}, urldate = {2024-03-08}, - date = {2024}, + year = {2024}, +} + +@misc{rust_rfc_allocators, + title = {RFC 1398: Allocators}, + url = {https://github.com/rust-lang/rfcs/blob/master/text/1398-kinds-of-allocators.md}, + author = {{Rust allocator working group}}, + urldate = {2016-04-08}, + year = {2016}, } @inproceedings{bayer_organization_1970, @@ -165,5 +171,5 @@ publisher = {{ACM} Press}, author = {Bayer, R. and {McCreight}, E.}, urldate = {2024-03-08}, - date = {1970}, + year = {1970}, } diff --git a/thesis/main.tex b/thesis/main.tex index 0376f11..9b51754 100644 --- a/thesis/main.tex +++ b/thesis/main.tex @@ -15,6 +15,7 @@ \usepackage{natbib} \usepackage{hyperref} \bibliographystyle{unsrtnat} +\setcitestyle{authoryear,open={(},close={)}} %% Convenience macros \newcommand{\code}[1]{\lstinline$#1$} diff --git a/thesis/parts/abstract.tex b/thesis/parts/abstract.tex index 9f65c30..ae86901 100644 --- a/thesis/parts/abstract.tex +++ b/thesis/parts/abstract.tex @@ -1,8 +1,8 @@ -Almost every program makes extensive use of container data structures -- structures that hold a collection of values together. -Despite many programming languages offering a variety of data structures, most programmers stick to one or two, potentially leaving large performance improvements on the table. +Almost every program makes extensive use of container types -- structures that hold a collection of values together. +Despite many programming languages offering a variety of implementations, most programmers stick to one or two, potentially leaving large performance improvements on the table. -We present Candelabra, a system for selecting the best implementation of a container type based on an individual program's requirements. -Using the DSL proposed in \cite{qin_primrose_2023}, developers can specify the way a container must behave and what operations it must be able to perform. +We present Candelabra, a system for selecting the best implementation of a container type based on the individual program's requirements. 
+Using the DSL proposed in \cite{qin_primrose_2023}, developers specify the way a container must behave and what operations it must be able to perform. Once they have done this, we are able to select implementations that meet those requirements, and suggest which will be the fastest based on the usage patterns of the user's program. Our system is designed with flexibility in mind, meaning it is easy to add new container implementations, and operations. @@ -11,4 +11,4 @@ It is also able to scale up to larger programs, without suffering the exponentia Our approach is able to suggest the fastest implementation in most of our tests, although further testing is required on a wider range of workloads. We also investigate the feasibility of adaptive containers, which switch implementation once the size reaches a certain threshold. -In doing so, we identify several key concerns that future work could address. +In doing so, we identify several key concerns that future work should address. diff --git a/thesis/parts/acknowledgements.tex b/thesis/parts/acknowledgements.tex index 0337f02..4a22bf5 100644 --- a/thesis/parts/acknowledgements.tex +++ b/thesis/parts/acknowledgements.tex @@ -1,3 +1,3 @@ -Firstly, I'd like to express my deepest gratitude to my supervisor, Liam O' Connor, for his help. +I'd like to express my deepest gratitude to my supervisor, Liam O' Connor, for his help. I'd also like to thank the Tardis Project for the compute resources used for benchmarking, and the members of CompSoc for their advice. diff --git a/thesis/parts/background.tex b/thesis/parts/background.tex index e65afad..ea30f36 100644 --- a/thesis/parts/background.tex +++ b/thesis/parts/background.tex @@ -1,10 +1,10 @@ -In this chapter, we provide an overview of the problem of container selection and its effect on program correctness and performance. +In this chapter, we explain the problem of container selection and its effect on program correctness and performance. We then provide an overview of approaches taken by modern programming languages and existing literature. Finally, we explain how our system is novel, and the weaknesses in existing literature it addresses. \section{Container Selection} -A container data structure is simply a structure which holds a collection of related values. +A container data type is simply a structure which holds a collection of related values. This could include a list (growable or otherwise), a set (with no duplicate elements), or something more complex (like a min heap). In many languages, the standard library provides implementations of various container types, with users able to choose which is best for their program. @@ -12,8 +12,7 @@ This saves users a lot of time, however selecting the best type is not always st Consider a program which needs to store and query a set of numbers, and doesn't care about ordering or duplicates. If the number of items ($n$) is small enough, it might be fastest to use a dynamically-sized array (known as a vector in many languages), and scan through each time we want to check if a number is inside. -On the other hand, if the set we deal with is much larger, we might need to use a more complex method to keep things fast. -A common example would be a hash set, which provides roughly the same lookup speed regardless of size, at the cost of being slower overall. +On the other hand, if the set we deal with is much larger, we might instead use a hash set, which provides roughly the same lookup speed regardless of size at the cost of being slower overall. 
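+As a rough illustration, the sketch below contrasts the two approaches for a set of integers; it is illustrative only, and not taken from any of our test programs.
+
+\begin{lstlisting}
+use std::collections::HashSet;
+
+// Linear scan: cheap for small collections, but the cost of each
+// query grows with the number of elements stored.
+fn seen_before_vec(seen: &Vec<u32>, x: u32) -> bool {
+    seen.iter().any(|&v| v == x)
+}
+
+// Hash lookup: roughly the same cost regardless of how many
+// elements are stored, but slower per lookup at small sizes.
+fn seen_before_set(seen: &HashSet<u32>, x: u32) -> bool {
+    seen.contains(&x)
+}
+\end{lstlisting}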
In this case, there are two factors driving our decision. Our \emph{functional requirements} -- that we don't care about ordering or duplicates -- and our \emph{non-functional requirements} -- that we want our program to use resources efficiently. @@ -24,7 +23,7 @@ Functional requirements tell us how the container must behave in order for the p Continuing with our previous example, we'll compare Rust's \code{Vec} implementation (a dynamic array), with its \code{HashSet} implementation (a hash table). To start with, we can see that the two types have different methods. -\code{Vec} has a \code{.get(index)} method, while \code{HashSet} does not; \code{HashSet}s aren't ordered so this wouldn't make sense. +\code{Vec} has a \code{.get()} method, while \code{HashSet} does not. If we were building a program that needed an ordered collection, replacing \code{Vec} with \code{HashSet} would likely cause the compiler to raise an error. We will call the operations that a container implementation provides the \emph{syntactic properties} of the implementation. @@ -35,9 +34,9 @@ Suppose our program only requires a type with the \code{.contains(value)} and \c Both \code{Vec} and \code{HashSet} satisfy these requirements, but our program might also rely on the count returned from \code{.len()} including duplicates. In this case, \code{HashSet} would give us different behaviour, causing our program to behave incorrectly. -Therefore, we also say that a container implementation has \code{semantic properties}. -Intuitively we can think of this as what conditions are upheld. -For a \code{HashSet}, this would include that there are never any duplicates. +Therefore, we also say that a container implementation has \emph{semantic properties}. +Intuitively we can think of this as the conditions upheld by the container. +A \code{HashSet} would have the property that there are never any duplicates. A \code{Vec} would not have this property, but would have the property that insertion order is preserved. To select a correct container implementation, we then need to ensure we meet some syntactic and semantic requirements specific to our program. @@ -48,18 +47,17 @@ So long as we specify our requirements correctly, and use an implementation whic While meeting our program's functional requirements should ensure that it runs correctly, this doesn't say anything about our program's efficiency. We also want to choose the most efficient implementation available, striking a balance between runtime and memory usage. -Prior work has demonstrated that the right container implementation can give substantial performance improvements. -Perflint\citep{l_liu_perflint_2009} found and suggested fixes for ``hundreds of suboptimal patterns in a set of large C++ benchmarks,'' with one such case improving performance by 17\%. -Similarly, Brainy\citep{jung_brainy_2011} demonstrates an average increase in speed of 27-33\% on real-world applications and libraries using a similar approach. +Prior work has demonstrated that changing container implementation can give substantial performance improvements. +Perflint \citep{l_liu_perflint_2009} found and suggested fixes for ``hundreds of suboptimal patterns in a set of large C++ benchmarks,'' with one such case improving performance by 17\%. +Similarly, Brainy \citep{jung_brainy_2011} found a 27-33\% speedup of real-world applications and libraries using a similar approach.
-If we can find a set of implementations that satisfy our functional requirements, then an obvious solution is to benchmark the program with each of these implementations in place. -This will obviously work, as long as our benchmarks are roughly representative of the real world. +If we can find a set of implementations that satisfy our functional requirements, then one obvious solution is to benchmark the program with each of these implementations in place. +This will obviously work, so long as our benchmarks are roughly representative of the real world. Unfortunately, this technique scales poorly for larger applications. As the number of types we must select increases, the number of combinations we have to try increases exponentially. -This quickly becomes unfeasible, so we must explore other selection methods. -\section{Prior literature} +\section{Prior art} In this section we outline existing methods for container selection, in both current programming languages and literature. @@ -68,60 +66,70 @@ In this section we outline existing methods for container selection, in both cur Modern programming languages broadly take one of two approaches to container selection. Some languages, usually higher-level ones, recommend built-in structures as the default, using implementations that perform well enough for the vast majority of use-cases. -One popular examples is Python, which uses dynamic arrays as its built-in list implementation. +A popular example is Python, which uses dynamic arrays as its built-in list implementation. + This approach prioritises developer ergonomics: programmers do not need to think about how these are implemented. Often other implementations are possible, but are used only when needed and come at the cost of code readability. In other languages, collections are given as part of a standard library or must be written by the user. Java comes with growable lists as part of its standard library, as does Rust. -In both cases, the standard library implementation is not special --- users can implement their own and use them in the same ways. +In both cases, the standard library implementation is not special -- users can implement their own and use them in the same ways. -Interfaces, or their closest equivalent, are often used to abstract over 'similar' collections. +Interfaces, or their closest equivalent, are often used to abstract over similar collections. In Java, ordered collections implement the interface \code{List}, with similar interfaces for \code{Set}, \code{Queue}, etc. +This allows most code to be implementation-agnostic, with functional requirements specified by the interface used. + +Whilst this provides some flexibility, it still requires the developer to choose a concrete implementation at some point. +In most cases, developers will simply choose the most common implementation and assume it will be fast enough. -This allows most code to be implementation-agnostic, but still requires the developer to choose a concrete implementation at some point. -This means that developers are forced to guess based on their knowledge of the underlying implementations, or to simply choose the most common implementation. +Otherwise, developers are forced to guess based on their knowledge of specific implementations and their program's behaviour. +For more complex programs or data structures, it can be difficult or impossible to reason about an implementation's performance. 
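+As a brief sketch of this pattern (using an illustrative \code{Collection} trait rather than any real library interface), code can be written against an abstraction, but a concrete type must still be named somewhere:
+
+\begin{lstlisting}
+// An illustrative interface over "things you can add values to".
+trait Collection {
+    fn add(&mut self, value: u32);
+}
+
+impl Collection for Vec<u32> {
+    fn add(&mut self, value: u32) { self.push(value); }
+}
+
+// This function works with any implementation of the interface...
+fn fill(c: &mut impl Collection, n: u32) {
+    for i in 0..n { c.add(i); }
+}
+
+// ...but the caller must still commit to one concrete implementation.
+fn main() {
+    let mut items: Vec<u32> = Vec::new();
+    fill(&mut items, 100);
+}
+\end{lstlisting}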
\subsection{Rules-based approaches} -One approach to this problem is to allow the developer to make the choice initially, but use some tool to detect poor choices. -Chameleon\citep{shacham_chameleon_2009} uses this approach. +One way to address this is to allow the developer to make the choice initially, but attempt to detect cases where the wrong choice was made. +Chameleon \citep{shacham_chameleon_2009} is one system which uses this approach. First, it collects statistics from program benchmarks using a ``semantic profiler''. This includes the space used by collections over time and the counts of each operation performed. These statistics are tracked per individual collection allocated and then aggregated by 'allocation context' --- the call stack at the point where the allocation occurred. -These aggregated statistics are passed to a rules engine, which uses a set of rules to suggest different container types which might have better performance. +These aggregated statistics are passed to a rules engine, which uses a set of rules to identify cases where a different container implementation might perform better. This results in a flexible engine for providing suggestions which can be extended with new rules and types as necessary. A similar approach is used by \cite{l_liu_perflint_2009} for the C++ standard library. -However, adding new implementations requires the developer to write new suggestion rules. -This can be difficult, as it requires the developer to understand all of the existing implementations' performance characteristics. +By using the developer's selection as a baseline, both of these tools function similarly to a linter, which the developer can use to catch mistakes and suggest improvements. +This makes it easy to integrate into existing projects and workflows. -To satisfy functional requirements, Chameleon only suggests new types that behave identically to the existing type. -This results in selection rules being more restricted than they otherwise could be. -For instance, a rule cannot suggest a \code{HashSet} instead of a \code{LinkedList} as the two are not semantically identical. -Chameleon has no way of knowing if doing so will break the program's functionality and so it does not make the suggestion. +However, the use of suggestion rules means that adding a new container implementation requires writing new suggestion rules. +This requires the developer to understand all of the existing implementations' performance characteristics, and how they relate to the new implementation. +In effect, the difficulty of selecting an implementation is offloaded to whoever writes the suggestion rules. -CoCo\citep{hutchison_coco_2013} and \cite{osterlund_dynamically_2013} use similar techniques, but work as the program runs. +To ensure that functional requirements are satisfied, both systems will only suggest implementations that behave identically to the existing one. +This results in selection rules being more restricted than necessary. +For instance, a rule could not suggest a \code{HashSet} instead of a \code{Vec}, as the two are not semantically identical. + +CoCo \citep{hutchison_coco_2013} and \cite{osterlund_dynamically_2013} use similar techniques, but work as the program runs. This was shown to work well for programs with different phases of execution, such as loading and then working on data. However, the overhead from profiling and from checking rules may not be worth the improvements in other programs, where access patterns are roughly the same throughout.
\subsection{ML-based approaches} -Brainy\citep{jung_brainy_2011} gathers similar statistics, but uses machine learning for selection instead of programmed rules. +Brainy \citep{jung_brainy_2011} gathers similar statistics, but uses machine learning for selection instead of programmed rules. ML has the advantage of being able to detect patterns a human may not be aware of. For example, Brainy takes into account statistics from hardware counters, which are difficult for a human to reason about. This also makes it easier to add new collection implementations, as rules do not need to be written by hand. +Whilst this offers increased flexibility, it comes at the cost of requiring a more lengthy model training process when implementations are changed. + \subsection{Estimate-based approaches} -CollectionSwitch\citep{costa_collectionswitch_2018} is another solution, which attempts to estimate the performance characteristics of each implementation individually. +CollectionSwitch \citep{costa_collectionswitch_2018} also avoids forcing developers to write rules, by estimating the performance characteristics of each implementation individually. First, a performance model is built for each container implementation. -This gives an estimate of some cost for each operation at a given collection size. -This cost might be a measurement of memory usage, or execution time. +This gives an estimate of some cost dimensions for each operation at a given collection size. +The originally proposed cost dimensions were memory usage and execution time. The system then collects data on how the program uses containers as it runs, and combines this with the built cost models to estimate the performance impact for each collection type. It may then decide to switch between container types if the potential change in cost seems high enough. @@ -152,7 +160,7 @@ As we note above, this scales poorly. \section{Contribution} -Of the tools presented, none are able to deal with both functional and non-functional requirements properly. +Of the tools presented, none are designed to deal with both functional and non-functional requirements well. Our contribution is a system for container selection that addresses both of these aspects. Users are able to specify their functional requirements in a way that is expressive enough for most usecases, and easy to integrate with existing projects. @@ -160,4 +168,4 @@ We then find which implementations in our library satisfy these requirements, an We also aim to make it easy to add new container implementations, and for our system to scale up to large projects without selection time becoming an issue. -Whilst the bulk of our system is focused on offline selection (done before the program is compiled), we also attempt to detect when changing implementation at runtime is desirable. +Whilst the bulk of our system is focused on offline selection (done before the program is compiled), we also attempt to detect when changing implementation at runtime is desirable, a technique which has largely only been applied to higher-level languages. 
diff --git a/thesis/parts/conclusion.tex b/thesis/parts/conclusion.tex index cb0f9a4..693988f 100644 --- a/thesis/parts/conclusion.tex +++ b/thesis/parts/conclusion.tex @@ -10,6 +10,7 @@ We prove that this approach has merit, although our testing had notable limitati We also found that while linear regression is powerful enough for many cases, more research is required on how best to gather and preprocess data in order to best capture an implementation's performance characteristics. %% Researched feasibility of adaptive containers, found issues with overhead and threshold detection -We test the effectiveness of switching container implementation as the n value changes, and in doing so find several important factors to consider. +We test the effectiveness of adaptive containers, in which the underlying implementation changes as the container grows. +We find significant challenges in implementing this technique, suggesting that the overhead incurred is more important in lower-level compiled languages such as Rust. %% Future work should focus on minimising overhead and finding the ideal threshold -Future work should focus on minimising the overhead applied to every operation, as well as on finding the correct threshold at which to switch implementation. +Future work should focus on minimising this overhead, as well as on finding the correct threshold at which to switch implementation. diff --git a/thesis/parts/design.tex b/thesis/parts/design.tex index 01cd858..796549e 100644 --- a/thesis/parts/design.tex +++ b/thesis/parts/design.tex @@ -7,7 +7,7 @@ We leave detailed discussion of implementation for chapter \ref{chap:implementat \section{Aims \& Usage} -As mentioned previously, we aim to create an all-in-one solution for container selection that can select based on both functional and non-functional requirements. +As mentioned previously, we aim to create an all-in-one solution for container selection that takes into account both functional and non-functional requirements. Flexibility is a high priority: It should be easy to add new container implementations, and to integrate our system into existing applications. Our system should also be able to scale to larger programs, and remain convenient for developers to use. @@ -47,8 +47,8 @@ The first must implement the \code{Container} and \code{Stack} traits, and must The second container type, \code{Primes}, must implement the \code{Container} trait, and must satisfy the \code{ascending} property. This property requires that for all consecutive \code{x, y} pairs in the container, \code{x <= y}. -Once we've specified our functional requirements and provided a benchmark (\code{src/tests/prime_sieve/benches/main.rs}), we can simply run Candelabra to select a container: \code{candelabra-cli -p prime_sieve select}. -This command outputs something like table \ref{table:selection_output}, and saves the best combination of container types to be used the next time the program is run. +Once we have specified our functional requirements and provided a benchmark, we can simply run Candelabra to select a container: \code{candelabra-cli -p prime_sieve select}. +This command outputs the information in table \ref{table:selection_output} and saves the best combination of container types to be used the next time the program is run. Here, the code generated uses \code{Vec} as the implementation for \code{Sieve}, and \code{HashSet} as the implementation for \code{Primes}. 
\begin{table}[h] @@ -66,18 +66,19 @@ Here, the code generated uses \code{Vec} as the implementation for \code{Sieve}, \label{table:selection_output} \end{table} +\newpage \section{Overview of process} Our tool integrates with Rust's packaging system (Cargo) to discover the information it needs about our project. -It then runs Primrose to find a list of implementations satsifying our functional requirements from a pre-built library of container implementations. +It then runs a modified version of Primrose \citep{qin_primrose_2023} to find a list of implementations satisfying our functional requirements from a pre-built library of container implementations. -Once we have this list, we build a 'cost model' for each candidate type. This allows us to get an upper bound for the runtime cost of an operation at any given n. -We choose to focus only on CPU time, and disregard memory usage due to the difficulty of accurately measuring memory footprint.\footnote{As Rust is not interpreted, we would need to hook into calls to the OS' memory allocator. This is very platform-specific, although the currently work in progress allocator API may make this easier in future.} +Once we have this list, we build a \emph{cost model} for each candidate type. This allows us to get an upper bound for the runtime cost of an operation at any given $n$. +We choose to focus only on CPU time, and disregard memory usage due to the difficulty of accurately measuring memory footprint.\footnote{As Rust is not interpreted, we would need to hook into calls to the OS' memory allocator. This is very platform-specific, although the work-in-progress allocator API \citep{rust_rfc_allocators} may make this easier in future.} We then run the user-provided benchmarks, using a wrapper around any of the valid candidates to track how many times each operation is performed, and the maximum size the container reaches. We combine this information with our cost models to estimate a total cost for each candidate, which is an upper bound on the total time taken for all container operations. -At this point, we also check if an 'adaptive' container would be better, by checking if one implementation is better performing at a lower n, and another at a higher n. +At this point, we also check if an adaptive container would be better, by checking if one implementation is better performing at a lower $n$, and another at a higher $n$. Finally, we pick the implementation with the minimum cost, and generate code which allows the program to use that implementation. @@ -88,10 +89,10 @@ We now go into more detail on how each step works, although we leave some specif \section{Functional requirements} %% Explain role in entire process -As described in Chapter \ref{chap:background}, any implementation we pick must satisfy the program's functional requirements. +As described in chapter \ref{chap:background}, any implementation we pick must satisfy the program's functional requirements. To do this, we integrate Primrose \citep{qin_primrose_2023} as a first step. -Primrose allows users to specify both the traits they require in an implementation (essentially the API and methods available), and what properties must be satisfied. +Primrose allows users to specify both the traits they require (syntactic properties), and the semantic properties that must be satisfied. Each container type that we want to select an implementation for is bound by a list of traits and a list of properties (lines 11 and 12 in Listing \ref{lst:selection_example}).
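+In Rust terms, this means that code using the container depends only on these trait bounds, and the selected implementation can be substituted without further changes. The sketch below illustrates the idea only; the trait definition shown is a simplified stand-in, not the actual \code{Container} trait from our library.
+
+\begin{lstlisting}
+// Simplified, illustrative stand-in for a container trait.
+trait Container<T> {
+    fn insert(&mut self, value: T);
+    fn contains(&self, value: &T) -> bool;
+}
+
+// User code depends only on the bound, so whichever implementation
+// is selected can be used here without modification.
+fn record<C: Container<u32>>(primes: &mut C, n: u32) {
+    if !primes.contains(&n) {
+        primes.insert(n);
+    }
+}
+\end{lstlisting}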
@@ -128,11 +129,11 @@ Although we use primrose in our implementation, the rest of our system isn't dep \section{Cost Models} -Now that we have a list of correct implementations for each container type, we need a way to understand the performance characteristics of each of them in isolation. -We use an approach similar to CollectionSwitch\citep{costa_collectionswitch_2018}, which assumes that the main factor in how long an operation takes is the current size of the collection. +Now that we have a list of correct implementations for each container type, we need a way to understand the performance characteristics of each of them. +We use an approach similar to CollectionSwitch \citep{costa_collectionswitch_2018}, which assumes that the main factor in how long an operation takes is the current size of the collection. %% Benchmarks -An implementation has a seperate cost model for each operation, which we obtain by executing the operation repeatedly on collections of various sizes. +Implementations have a separate cost model for each operation, which we obtain by executing that operation repeatedly at various collection sizes. For example, to build a cost model for \code{Vec::contains}, we would create several \code{Vec}s of varying sizes, and find the average execution time $t$ of \code{contains} at each. @@ -233,7 +234,7 @@ But when the size of the container grows, the cost of doing \code{contains} may Adaptive containers attempt to address this need, by starting off with one implementation (referred to as the low or before implementation), and switching to a new implementation (the high or after implementation) once the size of the container passes a certain threshold. -This is similar to systems such as CoCo\citep{hutchison_coco_2013} and \cite{osterlund_dynamically_2013}. +This is similar to systems such as CoCo \citep{hutchison_coco_2013} and \cite{osterlund_dynamically_2013}. However, we decide when to switch container implementation before the program is run, rather than as it is running. We also do so in a way that requires no knowledge of the implementation internals. diff --git a/thesis/parts/implementation.tex b/thesis/parts/implementation.tex index 478280c..bc2802c 100644 --- a/thesis/parts/implementation.tex +++ b/thesis/parts/implementation.tex @@ -33,7 +33,7 @@ The library source can be found in \code{src/crates/library}. \code{VecMap} & A Vec of (K, V) tuples sorted by key, used as a Mapping \\ \code{HashMap} & Hash map with quadratic probing \\ \code{HashSet} & Hash map with empty values \\ - \code{BTreeMap} & B-Tree\citep{bayer_organization_1970} map with linear search. \\ + \code{BTreeMap} & B-Tree \citep{bayer_organization_1970} map with linear search. \\ \code{BTreeSet} & B-Tree map with empty values \\ \end{tabular} \caption{Implementations in our library} @@ -46,7 +46,7 @@ We also added new syntax to Primrose's domain-specific language to support defin While performing integration testing, we found and fixed several other issues with the existing code: \begin{enumerate} -\item Only push and pop operations could be modelled in properties. Ohter operations would raise an error during type-checking. +\item Only push and pop operations could be modelled in properties. Other operations would raise an error during type-checking. \item The Rosette code generated for properties using other operations was incorrect. \item Some trait methods used mutable borrows unnecessarily, making it difficult or impossible to write safe Rust using them.
\item The generated code would perform an unnecessary heap allocation for every created container, which could affect performance. @@ -65,7 +65,7 @@ As Rust's generics are monomorphised, our generic code is compiled as if we were Each benchmark is run in a 'warmup' loop for a fixed amount of time (currently 500ms), then runs for a fixed number of iterations (currently 50). This is important because we are using least squares fitting - if there are less data points at higher $n$ values then our resulting model may not fit those points as well. -We repeat each benchmark at a range of $n$ values: $10, 50, 100, 250, 500, 1,000, 6,000, 12,000, 24,000, 36,000, 48,000, 60,000$. +We repeat each benchmark at a range of $n$ values: $10, 50, 100, 250, 500, 1000, 6000, 12000, 24000, 36000, 48000, 60000$. Each benchmark we run corresponds to one container operation. For most operations, we insert $n$ random values to a new container, then run the operation once per iteration. @@ -73,14 +73,15 @@ For certain operations which are commonly amortized (\code{insert}, \code{push}, As discussed previously, we discard all points that are outwith one standard deviation of the mean for each $n$ value. We use the least squares method to fit a polynomial of form $x_0 + x_1 n + x_2 n^2 + x_3 \log_2 n$. -As most operations on common data structures are polynomial or logarithmic complexity, we believe that least squares fitting is good enough to capture the cost of most operations. + +As most operations on common data structures are polynomial or logarithmic complexity, we believe that this function is good enough to capture the cost of most operations. We originally experimented with coefficients up to $x^3$, but found that this led to overfitting. \section{Profiling} -We implement profiling using a \code{ProfilerWrapper} type (\code{src/crates/library/src/profiler.rs}), which takes as type parameters the inner container implementation and an index, used later to identify what container type the output corresponds to. +We implement profiling using the \code{ProfilerWrapper} type (\code{src/crates/library/src/profiler.rs}), which takes as type parameters the inner container implementation and an index, used later to identify what container type the output corresponds to. We then implement any primrose traits that the inner container implements, counting the number of times each operation is called. -We also check the length of the container after each insertion operation, and track the maximum. +We also check the length of the container after each insert operation, and track the maximum. Tracking is done per-instance, and recorded when the container goes out of scope and its \code{Drop} implementation is called. We write the counts of each operation and maximum size of the collection to a location specified by an environment variable. @@ -109,7 +110,7 @@ In order to try and suggest an adaptive container, we use the following algorith \item Calculate the cost for each candidate in each partition individually. \item For each partition, find the best candidate and store it in the array \code{best}. Note that we don't sum across all partitions this time. \item Find the lowest index \code{i} where \code{best[i] != best[0]} -\item Check that \code{i} splits the list properly: For all \code{j < i}, \code{best[j] == best[0]} and for all \code{j>=i}, \code{best[j] == best[i]}. 
+\item Check that \code{i} splits the list properly: For all \code{j < i}, we require \code{best[j] == best[0]} and for all \code{j>=i}, we require \code{best[j] == best[i]}. \item Let \code{before} be the name of the candidate in \code{best[0]}, \code{after} be the name of the candidate in \code{best[i]}, and \code{threshold} be halfway between the maximum n values of partition \code{i} and partition \code{i-1}. \item Calculate the cost of switching as: $$ diff --git a/thesis/parts/introduction.tex b/thesis/parts/introduction.tex index e24abea..c9661e9 100644 --- a/thesis/parts/introduction.tex +++ b/thesis/parts/introduction.tex @@ -3,16 +3,16 @@ %% **** Container types common in programs -Almost every program makes extensive use of container data structures -- structures which hold a collection of values. -Often, programmers will have some requirements they want to impose on this collection, such as not storing duplicate elements, or storing the items in sorted order. +Almost every program makes extensive use of container data types -- structures which hold a collection of values. +Often, programmers will also have some requirements they want to impose on this collection, such as not storing duplicate elements, or storing the items in sorted order. %% **** Functionally identical implementations -However, implementing these collection types manually wastes time, and can be hard to do right for more complicated structures. -Most programmers will simply use one or two of the collection types provided by their language. -Some languages, such as Python, go a step further, providing built-in implementations of growable lists and associative maps, with special syntax for both. +However, implementing these collection types wastes time, and can be hard to do right for more complicated structures. +Most programmers will instead use one or two of the collection types provided by their language. +Some languages, such as Python, go a step further, providing built-in implementations of common data structures, with special syntax and handling. %% **** Large difference in performance -Unfortunately, the underlying implementation of container types which function the same can have a drastic effect on performance (\cite{l_liu_perflint_2009}, \cite{jung_brainy_2011}). +However, the underlying implementation of container types which function the same can have a drastic effect on performance (\cite{l_liu_perflint_2009}, \cite{jung_brainy_2011}). By largely ignoring the performance characteristics of their implementation, programmers may be missing out on large performance gains. %% *** Motivate w/ effectiveness claims diff --git a/thesis/parts/results.tex b/thesis/parts/results.tex index 76247a4..4db297d 100644 --- a/thesis/parts/results.tex +++ b/thesis/parts/results.tex @@ -1,4 +1,4 @@ -In this chapter, we present the methodology used for benchmarking our system, and comment on the results we got. +In this chapter, we present the methodology used for benchmarking our system, our results, and analysis. We examine the produced cost models of certain operations in detail, with reference to the expected asymptotics of each operation. We then compare the selections made by our system to the actual optimal selections (obtained by brute force) for a variety of test cases. This includes examining when adaptive containers are suggested, and their effectiveness. @@ -24,11 +24,11 @@ The most important software versions are listed below. 
\section{Cost models} -We start by examining some of our generated cost models, and comparing them both to the observations they are based on, and what we expect from asymptotic analysis. +We start by examining some of our generated cost models, comparing them both to the observations they are based on, and what we expect from asymptotic analysis. As we build a total of 77 cost models from our library, we will not examine them all in detail. We look at models of the most common operations, grouped by containers that are commonly selected together. -\subsection{Insertion operations} +\subsection{Insert operations} Starting with the \code{insert} operation, Figure \ref{fig:cm_insert} shows how the estimated cost changes with the size of the container. The lines correspond to our fitted curves, while the points indicate the raw observations we drew from. @@ -46,7 +46,7 @@ The former appears to be in line with the observations, and is likely due to the The latter appears to diverge from the observations, and may indicate poor fitting. \code{LinkedList} has a significantly slower insertion. -This is likely because it requires a syscall for heap allocation for every item inserted, no matter the current size. +This is likely because it requires a heap allocation system call for every item inserted, no matter the current size. This would also explain why data points appear spread out more, as system calls have more unpredictable latency, even on systems with few other processes running. Notably, insertion appears to start to get cheaper past $n=24,000$, although this is only weakly suggested by observations. @@ -66,8 +66,9 @@ This is what we expect for hash-based collections, with the slight growth likely \code{BTreeSet} has similar behaviour, but settles at a larger value overall. \code{BTreeMap} appears to grow more rapidly, and cost more overall. + It's important to note that Rust's \code{BTreeSet} is not based on binary tree search, but instead a more general tree search originally proposed by \cite{bayer_organization_1970}, where each node contains $B-1$ to $2B-1$ elements in an unsorted array. -The standard library documentation\citep{rust_documentation_team_btreemap_2024} states that search is expected to take $O(B\lg n)$ comparisons. +The standard library documentation~\citep{rust_documentation_team_btreemap_2024} states that search is expected to take $O(B\lg n)$ comparisons. Since both of these implementations require searching the collection before inserting, the close-to-logarithmic growth seems to makes sense. \subsubsection{Small n values} @@ -105,9 +106,9 @@ The observations in these graphs have a much wider spread than our \code{insert} This is probably because we attempt to get a different random element in our container every time, so our observations show the best and worst case of our data structures. This is desirable assuming that \code{contains} operations are actually randomly distributed in the real world, which seems likely. -For the \code{SortedVec} family, we would expect to see roughly logarithmic growth, as contains is based on binary search. +For the \code{SortedVec} family, we would expect to see roughly logarithmic growth, as we are performing a binary search. This is the case for \code{SortedVecMap}, however \code{SortedVec} and \code{SortedVecSet} both show exponential growth with a 'dip' around $n=25,000$. -It's unclear why this happened, although it could be due to how the elements we query are randomly distributed throughout the list. 
+It's unclear why this is; one reason could be that the elements we query are randomly distributed throughout the list, and this distribution may not be fair for all benchmarks. A possible improvement would be to run contains with a known distribution of values, including low, high, and not present values in equal parts. The \code{Vec} family exhibits roughly linear growth, which is expected, since this implementation scans through the whole array each time. @@ -127,6 +128,7 @@ It's unclear why this is, however it could be related to the larger spread in ob Overall, our cost models appear to be a good representation of each implementation's performance impact. Future improvements should focus on improving accuracy at lower $n$ values, such as by employing a more complex fitting procedure, or on ensuring operations have their best and worst cases tested fairly. +\newpage %% * Predictions \section{Selections} We now proceed with end-to-end testing of the system, selecting containers for a \subsection{Benchmarks} %% ** Chosen benchmarks -Our test programs broadly fall into two categories: Examples, which repeat a few operations many times, and real-life programs, which are implementations of common algorithms and solutions to programming puzles. +Our test programs broadly fall into two categories: example programs, which repeat a few operations many times, and real-life programs, which are implementations of common algorithms and solutions to programming puzzles. We expect the results from our example programs to be relatively obvious, while our real programs are more complex and harder to predict. -Most of our real programs are solutions to puzzles from Advent of Code\citep{wastl_advent_2015}, a popular collection of programming puzzles. +Most of our real programs are solutions to puzzles from Advent of Code~\citep{wastl_advent_2015}, a popular collection of programming puzzles. Table \ref{table:test_cases} lists and briefly describes our test programs. \begin{table}[h!]
\centering - \begin{tabular}{|c|c|c|c|c|} - Project & Container Type & Best implementation & Predicted best & \\ + \begin{tabular}{c|c|c|c|c|} + & Project & Container Type & Best implementation & Predicted best \\ \hline - aoc\_2021\_09 & Map & HashMap & HashMap & \\ - aoc\_2021\_09 & Set & HashSet & HashSet & \\ - aoc\_2022\_08 & Map & HashMap & HashMap & \\ - aoc\_2022\_09 & Set & HashSet & HashSet & \\ - aoc\_2022\_14 & Set & HashSet & HashSet & \\ - aoc\_2022\_14 & List & Vec & LinkedList & * \\ - example\_mapping & Map & HashMap & HashMap & \\ - example\_sets & Set & HashSet & HashSet & \\ - example\_stack & StackCon & Vec & Vec & \\ - prime\_sieve & Primes & BTreeSet & BTreeSet & \\ - prime\_sieve & Sieve & Vec & LinkedList & * \\ + & aoc\_2021\_09 & Map & HashMap & HashMap \\ + & aoc\_2021\_09 & Set & HashSet & HashSet \\ + & aoc\_2022\_08 & Map & HashMap & HashMap \\ + & aoc\_2022\_09 & Set & HashSet & HashSet \\ + & aoc\_2022\_14 & Set & HashSet & HashSet \\ + * & aoc\_2022\_14 & List & Vec & LinkedList \\ + & example\_mapping & Map & HashMap & HashMap \\ + & example\_sets & Set & HashSet & HashSet \\ + & example\_stack & StackCon & Vec & Vec \\ + & prime\_sieve & Primes & BTreeSet & BTreeSet \\ + * & prime\_sieve & Sieve & Vec & LinkedList \\ \end{tabular} \caption{Actual best vs predicted best implementations} \label{table:predicted_actual} @@ -218,7 +220,7 @@ From looking at detailed profiling information, it seems that both of these cont Therefore this is likely caused by our cost models being inaccurate at small $n$ values, as mentioned in section \ref{section:cm_small_n}. Overall, our results suggest that our system is effective, at least for large enough $n$ values. -Unfortunately, these tests are somewhat limited, as the best container is almost always predictable: \code{Vec} where uniqueness is not important, and \code{Hash*} otherwise. +Unfortunately, these tests are somewhat limited, as the best container seems easy to predict for most cases: \code{Vec} where uniqueness is not important, and \code{Hash*} otherwise. Therefore, more thorough testing is needed to fully establish the system's effectiveness. \subsection{Adaptive containers} @@ -304,7 +306,7 @@ This shows that adaptive containers as we have implemented them are not effectiv Even in cases where we never reach the size threshold, the presence of adaptive containers has an overhead which slows down the program 3x in the worst case (\code{example_mapping}, size = 150). One explanation for this could be that every operation now requires checking which inner implementation we are using, resulting in an additional check for each operation. -More work could be done to minimise this overhead, although it's unclear exactly how much this could be minimised. +More work could be done to minimise this overhead, although it's unclear how. It is also unclear if the threshold values that we suggest are the optimal ones. Currently, we decide our threshold by picking a value between two partitions with different best containers. @@ -312,9 +314,10 @@ Future work could take a more complex approach that finds the best threshold val \subsection{Evaluation} -Overall, we find that the main part of our container selection system appears to have merit. +Overall, we find that the main part of our container selection system has merit. Whilst our testing has limitations, it shows that we can correctly identify the best container even in complex programs. 
More work is needed on improving our system's performance for very small containers, and on testing with a wider range of programs. Our proposed technique for identifying adaptive containers appears ineffective. The primary challenges appear to be in the overhead introduced to each operation, and in finding the correct point at which to switch implementations. +This could also suggest that adaptive containers are less effective in lower-level compiled languages, as previous literature focused mostly on higher-level languages such as Java \citep{hutchison_coco_2013,costa_collectionswitch_2018}. -- cgit v1.2.3