diff options
author | eldritch horrors <pennae@lix.systems> | 2024-06-16 23:10:09 +0200 |
---|---|---|
committer | eldritch horrors <pennae@lix.systems> | 2024-06-25 12:24:58 +0000 |
commit | e6cd67591b44b4902bac73febcab3c4d96724aea (patch) | |
tree | 94c8ad90b8e756c5b00b8d68b2adf13c0f2febd9 /src/libexpr/parser | |
parent | c097ebe66bf474da886ffa20d2f31bdb1d2196a8 (diff) |
libexpr: rewrite the parser with pegtl instead of flex/bison
this gives about 20% performance improvements on pure parsing. obviously
it will be less on full eval, but depending on how much parsing is to be
done (e.g. including hackage-packages.nix or not) it's more like 4%-10%.
this has been tested (with thousands of core hours of fuzzing) to ensure
that the ASTs produced by the new parser are exactly the same as the old
one would have produced. error messages will change (sometimes by a lot)
and are not yet perfect, but we would rather leave this as is for later.
test results for running only the parser (excluding the variable binding
code) in a tight loop with inputs and parameters as given are promising:
- 40% faster on lix's package.nix at 10000 iterations
- 1.3% faster on nixpkgs all-packages.nix at 1000 iterations
- equivalent on all of nixpkgs concatenated at 100 iterations
(excluding invalid files, each file surrounded with parens)
more realistic benchmarks are somewhere in between the extremes, parsing
once again getting the largest uplift. other realistic workloads improve
by a few percentage points as well, notably system builds are 4% faster.
Benchmarks summary (from ./bench/summarize.jq bench/bench-*.json)
old/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
mean: 0.408s ± 0.025s
user: 0.355s | system: 0.033s
median: 0.389s
range: 0.388s ... 0.442s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
mean: 0.332s ± 0.024s
user: 0.279s | system: 0.033s
median: 0.314s
range: 0.313s ... 0.361s
relative: 0.814
---
old/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 6.133s ± 0.022s
user: 5.395s | system: 0.437s
median: 6.128s
range: 6.099s ... 6.183s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 5.925s ± 0.025s
user: 5.176s | system: 0.456s
median: 5.934s
range: 5.861s ... 5.943s
relative: 0.966
---
GC_INITIAL_HEAP_SIZE=10g old/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 4.503s ± 0.027s
user: 3.731s | system: 0.547s
median: 4.499s
range: 4.478s ... 4.541s
relative: 1
GC_INITIAL_HEAP_SIZE=10g new/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 4.285s ± 0.031s
user: 3.504s | system: 0.571s
median: 4.281s
range: 4.221s ... 4.328s
relative: 0.951
---
old/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
mean: 16.475s ± 0.07s
user: 14.088s | system: 1.572s
median: 16.495s
range: 16.351s ... 16.536s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
mean: 15.973s ± 0.013s
user: 13.558s | system: 1.615s
median: 15.973s
range: 15.946s ... 15.99s
relative: 0.97
---
Change-Id: Ie66ec2d045dec964632c6541e25f8f0797319ee2
Diffstat (limited to 'src/libexpr/parser')
-rw-r--r-- | src/libexpr/parser/change_head.hh | 66 | ||||
-rw-r--r-- | src/libexpr/parser/grammar.hh | 707 | ||||
-rw-r--r-- | src/libexpr/parser/parser.cc | 862 | ||||
-rw-r--r-- | src/libexpr/parser/state.hh | 259 |
4 files changed, 1894 insertions, 0 deletions
diff --git a/src/libexpr/parser/change_head.hh b/src/libexpr/parser/change_head.hh new file mode 100644 index 000000000..aab315553 --- /dev/null +++ b/src/libexpr/parser/change_head.hh @@ -0,0 +1,66 @@ +#pragma once +///@file + +#include <tao/pegtl.hpp> + +namespace nix::parser { + +// modified copy of change_state, as the manual suggest for more involved +// state manipulation. we want to change only the first state parameter, +// and we care about the *initial* position of a rule application (not the +// past-the-end position as pegtl change_state provides) +template<typename NewState> +struct change_head : tao::pegtl::maybe_nothing +{ + template< + typename Rule, + tao::pegtl::apply_mode A, + tao::pegtl::rewind_mode M, + template<typename...> class Action, + template<typename...> class Control, + typename ParseInput, + typename State, + typename... States + > + [[nodiscard]] static bool match(ParseInput & in, State && st, States &&... sts) + { + const auto begin = in.iterator(); + + if constexpr (std::is_constructible_v<NewState, State, States...>) { + NewState s(st, sts...); + if (tao::pegtl::match<Rule, A, M, Action, Control>(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success<Action<Rule>>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else if constexpr (std::is_default_constructible_v<NewState>) { + NewState s; + if (tao::pegtl::match<Rule, A, M, Action, Control>(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success<Action<Rule>>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else { + static_assert(decltype(sizeof(NewState))(), "unable to instantiate new state"); + } + } + + template<typename Target, typename ParseInput, typename... S> + static void _success(void *, auto & begin, ParseInput & in, S & ... sts) + { + const typename ParseInput::action_t at(begin, in); + Target::success(at, sts...); + } + + template<typename Target, typename... S> + static void _success(decltype(Target::success0(std::declval<S &>()...), 0), auto &, auto &, S & ... sts) + { + Target::success0(sts...); + } +}; + +} diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh new file mode 100644 index 000000000..82df63bc5 --- /dev/null +++ b/src/libexpr/parser/grammar.hh @@ -0,0 +1,707 @@ +#pragma once +///@file + +#include "tao/pegtl.hpp" +#include <type_traits> +#include <variant> + +#include <boost/container/small_vector.hpp> + +// NOTE +// nix line endings are \n, \r\n, \r. the grammar does not use eol or +// eolf rules in favor of reproducing the old flex lexer as faithfully as +// possible, and deferring calculation of positions to downstream users. + +namespace nix::parser::grammar { + +using namespace tao::pegtl; +namespace p = tao::pegtl; + +// character classes +namespace c { + +struct path : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'.', '_', '-', '+'> +> {}; +struct path_sep : one<'/'> {}; + +struct id_first : ranges<'a', 'z', 'A', 'Z', '_'> {}; +struct id_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'_', '\'', '-'> +> {}; + +struct uri_scheme_first : ranges<'a', 'z', 'A', 'Z'> {}; +struct uri_scheme_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'+', '-', '.'> +> {}; +struct uri_sep : one<':'> {}; +struct uri_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'%', '/', '?', ':', '@', '&', '=', '+', '$', ',', '-', '_', '.', '!', '~', '*', '\''> +> {}; + +} + +// "tokens". PEGs don't really care about tokens, we merely use them as a convenient +// way of writing down keywords and a couple complicated syntax rules. +namespace t { + +struct _extend_as_path : seq< + star<c::path>, + not_at<TAO_PEGTL_STRING("/*")>, + not_at<TAO_PEGTL_STRING("//")>, + c::path_sep, + sor<c::path, TAO_PEGTL_STRING("${")> +> {}; +struct _extend_as_uri : seq< + star<c::uri_scheme_rest>, + c::uri_sep, + c::uri_rest +> {}; + +// keywords might be extended to identifiers, paths, or uris. +// NOTE this assumes that keywords are a-zA-Z only, otherwise uri schemes would never +// match correctly. +// NOTE not a simple seq<...> because this would report incorrect positions for +// keywords used inside must<> if a prefix of the keyword matches. +template<typename S> +struct _keyword : sor< + seq< + S, + not_at<c::id_rest>, + not_at<_extend_as_path>, + not_at<_extend_as_uri> + >, + failure +> {}; + +struct kw_if : _keyword<TAO_PEGTL_STRING("if")> {}; +struct kw_then : _keyword<TAO_PEGTL_STRING("then")> {}; +struct kw_else : _keyword<TAO_PEGTL_STRING("else")> {}; +struct kw_assert : _keyword<TAO_PEGTL_STRING("assert")> {}; +struct kw_with : _keyword<TAO_PEGTL_STRING("with")> {}; +struct kw_let : _keyword<TAO_PEGTL_STRING("let")> {}; +struct kw_in : _keyword<TAO_PEGTL_STRING("in")> {}; +struct kw_rec : _keyword<TAO_PEGTL_STRING("rec")> {}; +struct kw_inherit : _keyword<TAO_PEGTL_STRING("inherit")> {}; +struct kw_or : _keyword<TAO_PEGTL_STRING("or")> {}; + +// `-` can be a unary prefix op, a binary infix op, or the first character +// of a path or -> (ex 1->1--1) +// `/` can be a path leader or an operator (ex a?a /a) +struct op_minus : seq<one<'-'>, not_at<one<'>'>>, not_at<_extend_as_path>> {}; +struct op_div : seq<one<'/'>, not_at<c::path>> {}; + +// match a rule, making sure we are not matching it where a keyword would match. +// using minus like this is a lot faster than flipping the order and using seq. +template<typename... Rules> +struct _not_at_any_keyword : minus< + seq<Rules...>, + sor< + TAO_PEGTL_STRING("inherit"), + TAO_PEGTL_STRING("assert"), + TAO_PEGTL_STRING("else"), + TAO_PEGTL_STRING("then"), + TAO_PEGTL_STRING("with"), + TAO_PEGTL_STRING("let"), + TAO_PEGTL_STRING("rec"), + TAO_PEGTL_STRING("if"), + TAO_PEGTL_STRING("in"), + TAO_PEGTL_STRING("or") + > +> {}; + +// identifiers are kind of horrid: +// +// - uri_scheme_first ⊂ id_first +// - uri_scheme_first ⊂ uri_scheme_rest ⊂ path +// - id_first ⊂ id_rest ∖ { ' } ⊂ path +// - id_first ∩ (path ∖ uri_scheme_first) = { _ } +// - uri_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest, path } +// - path_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest } +// +// and we want, without reading the input more than once, a string that +// matches (id_first id_rest*) and is not followed by any number of +// characters such that the extended string matches path or uri rules. +// +// since the first character must be either _ or a uri scheme character +// we can ignore path-like bits at the beginning. uri_sep cannot appear anywhere +// in an identifier, so it's only needed in lookahead checks at the uri-like +// prefix. likewise path_sep cannot appear anywhere in the idenfier, so it's +// only needed in lookahead checks in the path-like prefix. +// +// in total that gives us a decomposition of +// +// (uri-scheme-like? (?! continues-as-uri) | _) +// (path-segment-like? (?! continues-as-path)) +// id_rest* +struct identifier : _not_at_any_keyword< + // we don't use (at<id_rest>, ...) matches here because identifiers are + // a really hot path and rewinding as needed by at<> isn't entirely free. + sor< + seq< + c::uri_scheme_first, + star<ranges<'a', 'z', 'A', 'Z', '0', '9', '-'>>, + not_at<_extend_as_uri> + >, + one<'_'> + >, + star<sor<ranges<'a', 'z', 'A', 'Z', '0', '9'>, one<'_', '-'>>>, + not_at<_extend_as_path>, + star<c::id_rest> +> {}; + +// floats may extend ints, thus these rules are very similar. +struct integer : seq< + sor< + seq<range<'1', '9'>, star<digit>, not_at<one<'.'>>>, + seq<one<'0'>, not_at<one<'.'>, digit>, star<digit>> + >, + not_at<_extend_as_path> +> {}; + +struct floating : seq< + sor< + seq<range<'1', '9'>, star<digit>, one<'.'>, star<digit>>, + seq<opt<one<'0'>>, one<'.'>, plus<digit>> + >, + opt<one<'E', 'e'>, opt<one<'+', '-'>>, plus<digit>>, + not_at<_extend_as_path> +> {}; + +struct uri : seq< + c::uri_scheme_first, + star<c::uri_scheme_rest>, + c::uri_sep, + plus<c::uri_rest> +> {}; + +struct sep : sor< + plus<one<' ', '\t', '\r', '\n'>>, + seq<one<'#'>, star<not_one<'\r', '\n'>>>, + seq<string<'/', '*'>, until<string<'*', '/'>>> +> {}; + +} + + + +using seps = star<t::sep>; + + +// marker for semantic rules. not handling one of these in an action that cares about +// semantics is probably an error. +struct semantic {}; + + +struct expr; + +struct _string { + template<typename... Inner> + struct literal : semantic, seq<Inner...> {}; + struct cr_lf : semantic, seq<one<'\r'>, opt<one<'\n'>>> {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must<expr>, seps, + must<one<'}'>> + > {}; + struct escape : semantic, must<any> {}; +}; +struct string : _string, seq< + one<'"'>, + star< + sor< + _string::literal<plus<not_one<'$', '"', '\\', '\r'>>>, + _string::cr_lf, + _string::interpolation, + _string::literal<one<'$'>, opt<one<'$'>>>, + seq<one<'\\'>, _string::escape> + > + >, + must<one<'"'>> +> {}; + +struct _ind_string { + template<bool Indented, typename... Inner> + struct literal : semantic, seq<Inner...> {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must<expr>, seps, + must<one<'}'>> + > {}; + struct escape : semantic, must<any> {}; +}; +struct ind_string : _ind_string, seq< + TAO_PEGTL_STRING("''"), + opt<star<one<' '>>, one<'\n'>>, + star< + sor< + _ind_string::literal< + true, + plus< + sor< + not_one<'$', '\''>, + seq<one<'$'>, not_one<'{', '\''>>, + seq<one<'\''>, not_one<'\'', '$'>> + > + > + >, + _ind_string::interpolation, + _ind_string::literal<false, one<'$'>>, + _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>, + seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>, + seq< + p::string<'\'', '\''>, + sor< + _ind_string::literal<false, one<'$'>>, + seq<one<'\\'>, _ind_string::escape> + > + > + > + >, + must<TAO_PEGTL_STRING("''")> +> {}; + +struct _path { + // legacy lexer rules. extra l_ to avoid reserved c++ identifiers. + struct _l_PATH : seq<star<c::path>, plus<c::path_sep, plus<c::path>>, opt<c::path_sep>> {}; + struct _l_PATH_SEG : seq<star<c::path>, c::path_sep> {}; + struct _l_HPATH : seq<one<'~'>, plus<c::path_sep, plus<c::path>>, opt<c::path_sep>> {}; + struct _l_HPATH_START : TAO_PEGTL_STRING("~/") {}; + struct _path_str : sor<_l_PATH, _l_PATH_SEG, plus<c::path>> {}; + // modern rules + template<typename... Inner> + struct literal : semantic, seq<Inner...> {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must<expr>, seps, + must<one<'}'>> + > {}; + struct anchor : semantic, sor< + _l_PATH, + seq<_l_PATH_SEG, at<TAO_PEGTL_STRING("${")>> + > {}; + struct home_anchor : semantic, sor< + _l_HPATH, + seq<_l_HPATH_START, at<TAO_PEGTL_STRING("${")>> + > {}; + struct searched_path : semantic, list<plus<c::path>, c::path_sep> {}; + struct forbid_prefix_triple_slash : sor<not_at<c::path_sep>, failure> {}; + struct forbid_prefix_double_slash_no_interp : sor< + not_at<c::path_sep, star<c::path>, not_at<TAO_PEGTL_STRING("${")>>, + failure + > {}; + // legacy parser rules + struct _str_rest : seq< + must<forbid_prefix_double_slash_no_interp>, + opt<literal<_path_str>>, + must<forbid_prefix_triple_slash>, + star< + sor< + literal<_path_str>, + interpolation + > + > + > {}; +}; +struct path : _path, sor< + seq< + sor<_path::anchor, _path::home_anchor>, + _path::_str_rest + >, + seq<one<'<'>, _path::searched_path, one<'>'>> +> {}; + +struct _formal { + struct name : semantic, t::identifier {}; + struct default_value : semantic, must<expr> {}; +}; +struct formal : semantic, _formal, seq< + _formal::name, + opt<seps, one<'?'>, seps, _formal::default_value> +> {}; + +struct _formals { + struct ellipsis : semantic, p::ellipsis {}; +}; +struct formals : semantic, _formals, seq< + one<'{'>, seps, + // formals and attrsets share a two-token head sequence ('{' <id>). + // this rule unrolls the formals list a bit to provide better error messages than + // "expected '='" at the first ',' if formals are incorrect. + sor< + one<'}'>, + seq<_formals::ellipsis, seps, must<one<'}'>>>, + seq< + formal, seps, + if_then_else< + at<one<','>>, + seq< + star<one<','>, seps, formal, seps>, + opt<one<','>, seps, opt<_formals::ellipsis, seps>>, + must<one<'}'>> + >, + one<'}'> + > + > + > +> {}; + +struct _attr { + struct simple : semantic, sor<t::identifier, t::kw_or> {}; + struct string : semantic, seq<grammar::string> {}; + struct expr : semantic, seq< + TAO_PEGTL_STRING("${"), seps, + must<grammar::expr>, seps, + must<one<'}'>> + > {}; +}; +struct attr : _attr, sor< + _attr::simple, + _attr::string, + _attr::expr +> {}; + +struct attrpath : list<attr, one<'.'>, t::sep> {}; + +struct _inherit { + struct from : semantic, must<expr> {}; + struct attrs : list<attr, seps> {}; +}; +struct inherit : _inherit, seq< + t::kw_inherit, seps, + opt<one<'('>, seps, _inherit::from, seps, must<one<')'>>, seps>, + opt<_inherit::attrs, seps>, + must<one<';'>> +> {}; + +struct _binding { + struct path : semantic, attrpath {}; + struct equal : one<'='> {}; + struct value : semantic, must<expr> {}; +}; +struct binding : _binding, seq< + _binding::path, seps, + must<_binding::equal>, seps, + _binding::value, seps, + must<one<';'>> +> {}; + +struct bindings : opt<list<sor<inherit, binding>, seps>> {}; + +struct op { + enum class kind { + // NOTE non-associativity is *NOT* handled in the grammar structure. + // handling it in the grammar itself instead of in semantic actions + // slows down the parser significantly and makes the rules *much* + // harder to read. maybe this will be different at some point when + // ! does not sit between two binary precedence levels. + nonAssoc, + leftAssoc, + rightAssoc, + unary, + }; + template<typename Rule, unsigned Precedence, kind Kind = kind::leftAssoc> + struct _op : Rule { + static constexpr unsigned precedence = Precedence; + static constexpr op::kind kind = Kind; + }; + + struct unary_minus : _op<t::op_minus, 3, kind::unary> {}; + + // treating this like a unary postfix operator is sketchy, but that's + // the most reasonable way to implement the operator precedence set forth + // by the language way back. it'd be much better if `.` and `?` had the same + // precedence, but alas. + struct has_attr : _op<seq<one<'?'>, seps, must<attrpath>>, 4> {}; + + struct concat : _op<TAO_PEGTL_STRING("++"), 5, kind::rightAssoc> {}; + struct mul : _op<one<'*'>, 6> {}; + struct div : _op<t::op_div, 6> {}; + struct plus : _op<one<'+'>, 7> {}; + struct minus : _op<t::op_minus, 7> {}; + struct not_ : _op<one<'!'>, 8, kind::unary> {}; + struct update : _op<TAO_PEGTL_STRING("//"), 9, kind::rightAssoc> {}; + struct less_eq : _op<TAO_PEGTL_STRING("<="), 10, kind::nonAssoc> {}; + struct greater_eq : _op<TAO_PEGTL_STRING(">="), 10, kind::nonAssoc> {}; + struct less : _op<one<'<'>, 10, kind::nonAssoc> {}; + struct greater : _op<one<'>'>, 10, kind::nonAssoc> {}; + struct equals : _op<TAO_PEGTL_STRING("=="), 11, kind::nonAssoc> {}; + struct not_equals : _op<TAO_PEGTL_STRING("!="), 11, kind::nonAssoc> {}; + struct and_ : _op<TAO_PEGTL_STRING("&&"), 12> {}; + struct or_ : _op<TAO_PEGTL_STRING("||"), 13> {}; + struct implies : _op<TAO_PEGTL_STRING("->"), 14, kind::rightAssoc> {}; +}; + +struct _expr { + template<template<typename...> class OpenMod = seq, typename... Init> + struct _attrset : seq< + Init..., + OpenMod<one<'{'>>, seps, + bindings, seps, + must<one<'}'>> + > {}; + + struct select; + + struct id : semantic, t::identifier {}; + struct int_ : semantic, t::integer {}; + struct float_ : semantic, t::floating {}; + struct string : semantic, seq<grammar::string> {}; + struct ind_string : semantic, seq<grammar::ind_string> {}; + struct path : semantic, seq<grammar::path> {}; + struct uri : semantic, t::uri {}; + struct ancient_let : semantic, _attrset<must, t::kw_let, seps> {}; + struct rec_set : semantic, _attrset<must, t::kw_rec, seps> {}; + struct set : semantic, _attrset<> {}; + + struct _list { + struct entry : semantic, seq<select> {}; + }; + struct list : semantic, _list, seq< + one<'['>, seps, + opt<p::list<_list::entry, seps>, seps>, + must<one<']'>> + > {}; + + struct _simple : sor< + id, + int_, + float_, + string, + ind_string, + path, + uri, + seq<one<'('>, seps, must<expr>, seps, must<one<')'>>>, + ancient_let, + rec_set, + set, + list + > {}; + + struct _select { + struct head : _simple {}; + struct attr : semantic, seq<attrpath> {}; + struct attr_or : semantic, must<select> {}; + struct as_app_or : semantic, t::kw_or {}; + }; + struct _app { + struct first_arg : semantic, seq<select> {}; + struct another_arg : semantic, seq<select> {}; + // can be used to stash a position of the application head node + struct select_or_fn : seq<select> {}; + }; + + struct select : _select, seq< + _select::head, seps, + opt< + sor< + seq< + one<'.'>, seps, _select::attr, + opt<seps, t::kw_or, seps, _select::attr_or> + >, + _select::as_app_or + > + > + > {}; + + struct app : _app, seq< + _app::select_or_fn, + opt<seps, _app::first_arg, star<seps, _app::another_arg>> + > {}; + + template<typename Op> + struct operator_ : semantic, Op {}; + + struct unary : seq< + star<sor<operator_<op::not_>, operator_<op::unary_minus>>, seps>, + app + > {}; + + struct _binary_operator : sor< + operator_<op::implies>, + operator_<op::update>, + operator_<op::concat>, + operator_<op::plus>, + operator_<op::minus>, + operator_<op::mul>, + operator_<op::div>, + operator_<op::less_eq>, + operator_<op::greater_eq>, + operator_<op::less>, + operator_<op::greater>, + operator_<op::equals>, + operator_<op::not_equals>, + operator_<op::or_>, + operator_<op::and_> + > {}; + + struct _binop : seq< + unary, + star< + seps, + sor< + seq<_binary_operator, seps, must<unary>>, + operator_<op::has_attr> + > + > + > {}; + + struct _lambda { + struct arg : semantic, t::identifier {}; + }; + struct lambda : semantic, _lambda, sor< + seq< + _lambda::arg, seps, + sor< + seq<one<':'>, seps, must<expr>>, + seq<one<'@'>, seps, must<formals, seps, one<':'>, seps, expr>> + > + >, + seq< + formals, seps, + sor< + seq<one<':'>, seps, must<expr>>, + seq<one<'@'>, seps, must<_lambda::arg, seps, one<':'>, seps, expr>> + > + > + > {}; + + struct assert_ : semantic, seq< + t::kw_assert, seps, + must<expr>, seps, + must<one<';'>>, seps, + must<expr> + > {}; + struct with : semantic, seq< + t::kw_with, seps, + must<expr>, seps, + must<one<';'>>, seps, + must<expr> + > {}; + struct let : seq< + t::kw_let, seps, + not_at<one<'{'>>, // exclude ancient_let so we can must<kw_in> + bindings, seps, + must<t::kw_in>, seps, + must<expr> + > {}; + struct if_ : semantic, seq< + t::kw_if, seps, + must<expr>, seps, + must<t::kw_then>, seps, + must<expr>, seps, + must<t::kw_else>, seps, + must<expr> + > {}; +}; +struct expr : semantic, _expr, sor< + _expr::lambda, + _expr::assert_, + _expr::with, + _expr::let, + _expr::if_, + _expr::_binop +> {}; + +// legacy support: \0 terminates input if passed from flex to bison as a token +struct eof : sor<p::eof, one<0>> {}; + +struct root : must<seps, expr, seps, eof> {}; + + + +template<typename Rule> +struct nothing : p::nothing<Rule> { + static_assert(!std::is_base_of_v<semantic, Rule>); +}; + + + +template<typename Self, typename OpCtx, typename AttrPathT, typename ExprT> +struct operator_semantics { + struct has_attr : grammar::op::has_attr { + AttrPathT path; + }; + + struct OpEntry { + OpCtx ctx; + uint8_t prec; + grammar::op::kind assoc; + std::variant< + grammar::op::not_, + grammar::op::unary_minus, + grammar::op::implies, + grammar::op::or_, + grammar::op::and_, + grammar::op::equals, + grammar::op::not_equals, + grammar::op::less_eq, + grammar::op::greater_eq, + grammar::op::update, + grammar::op::concat, + grammar::op::less, + grammar::op::greater, + grammar::op::plus, + grammar::op::minus, + grammar::op::mul, + grammar::op::div, + has_attr + > op; + }; + + // statistics here are taken from nixpkgs commit de502c4d0ba96261e5de803e4d1d1925afd3e22f. + // over 99.9% of contexts in nixpkgs need at most 4 slots, ~85% need only 1 + boost::container::small_vector<ExprT, 4> exprs; + // over 99.9% of contexts in nixpkgs need at most 2 slots, ~85% need only 1 + boost::container::small_vector<OpEntry, 2> ops; + + // derived class is expected to define members: + // + // ExprT applyOp(OpCtx & pos, auto & op, auto &... args); + // [[noreturn]] static void badOperator(OpCtx & pos, auto &... args); + + void reduce(uint8_t toPrecedence, auto &... args) { + while (!ops.empty()) { + auto & [ctx, precedence, kind, op] = ops.back(); + // NOTE this relies on associativity not being mixed within a precedence level. + if ((precedence > toPrecedence) + || (kind != grammar::op::kind::leftAssoc && precedence == toPrecedence)) + break; + std::visit([&, ctx=std::move(ctx)] (auto & op) { + exprs.push_back(static_cast<Self &>(*this).applyOp(ctx, op, args...)); + }, op); + ops.pop_back(); + } + } + + ExprT popExpr() + { + auto r = std::move(exprs.back()); + exprs.pop_back(); + return r; + } + + void pushOp(OpCtx ctx, auto o, auto &... args) + { + if (o.kind != grammar::op::kind::unary) + reduce(o.precedence, args...); + if (!ops.empty() && o.kind == grammar::op::kind::nonAssoc) { + auto & [_pos, _prec, _kind, _o] = ops.back(); + if (_kind == o.kind && _prec == o.precedence) + Self::badOperator(ctx, args...); + } + ops.emplace_back(ctx, o.precedence, o.kind, std::move(o)); + } + + ExprT finish(auto &... args) + { + reduce(255, args...); + return popExpr(); + } +}; + +} diff --git a/src/libexpr/parser/parser.cc b/src/libexpr/parser/parser.cc new file mode 100644 index 000000000..850f1276e --- /dev/null +++ b/src/libexpr/parser/parser.cc @@ -0,0 +1,862 @@ +#include "attr-set.hh" +#include "error.hh" +#include "eval-settings.hh" +#include "eval.hh" +#include "finally.hh" +#include "nixexpr.hh" +#include "symbol-table.hh" +#include "users.hh" + +#include "change_head.hh" +#include "grammar.hh" +#include "state.hh" + +#include <charconv> +#include <clocale> +#include <memory> + +// flip this define when doing parser development to enable some g checks. +#if 0 +#include <tao/pegtl/contrib/analyze.hpp> +#define ANALYZE_GRAMMAR \ + ([] { \ + const std::size_t issues = tao::pegtl::analyze<grammar::root>(); \ + assert(issues == 0); \ + })() +#else +#define ANALYZE_GRAMMAR ((void) 0) +#endif + +namespace p = tao::pegtl; + +namespace nix::parser { +namespace { + +template<typename> +inline constexpr const char * error_message = nullptr; + +#define error_message_for(...) \ + template<> inline constexpr auto error_message<__VA_ARGS__> + +error_message_for(p::one<'{'>) = "expecting '{'"; +error_message_for(p::one<'}'>) = "expecting '}'"; +error_message_for(p::one<'"'>) = "expecting '\"'"; +error_message_for(p::one<';'>) = "expecting ';'"; +error_message_for(p::one<')'>) = "expecting ')'"; +error_message_for(p::one<'='>) = "expecting '='"; +error_message_for(p::one<']'>) = "expecting ']'"; +error_message_for(p::one<':'>) = "expecting ':'"; +error_message_for(p::string<'\'', '\''>) = "expecting \"''\""; +error_message_for(p::any) = "expecting any character"; +error_message_for(grammar::eof) = "expecting end of file"; +error_message_for(grammar::seps) = "expecting separators"; +error_message_for(grammar::path::forbid_prefix_triple_slash) = "too many slashes in path"; +error_message_for(grammar::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash"; +error_message_for(grammar::expr) = "expecting expression"; +error_message_for(grammar::expr::unary) = "expecting expression"; +error_message_for(grammar::binding::equal) = "expecting '='"; +error_message_for(grammar::expr::lambda::arg) = "expecting identifier"; +error_message_for(grammar::formals) = "expecting formals"; +error_message_for(grammar::attrpath) = "expecting attribute path"; +error_message_for(grammar::expr::select) = "expecting selection expression"; +error_message_for(grammar::t::kw_then) = "expecting 'then'"; +error_message_for(grammar::t::kw_else) = "expecting 'else'"; +error_message_for(grammar::t::kw_in) = "expecting 'in'"; + +struct SyntaxErrors +{ + template<typename Rule> + static constexpr auto message = error_message<Rule>; + + template<typename Rule> + static constexpr bool raise_on_failure = false; +}; + +template<typename Rule> +struct Control : p::must_if<SyntaxErrors>::control<Rule> +{ + template<typename ParseInput, typename... States> + [[noreturn]] static void raise(const ParseInput & in, States &&... st) + { + if (in.empty()) { + std::string expected; + if constexpr (constexpr auto msg = error_message<Rule>) + expected = fmt(", %s", msg); + throw p::parse_error("unexpected end of file" + expected, in); + } + p::must_if<SyntaxErrors>::control<Rule>::raise(in, st...); + } +}; + +struct ExprState + : grammar:: + operator_semantics<ExprState, PosIdx, AttrPath, std::pair<PosIdx, std::unique_ptr<Expr>>> +{ + std::unique_ptr<Expr> popExprOnly() { + return std::move(popExpr().second); + } + + template<typename Op, typename... Args> + std::unique_ptr<Expr> applyUnary(Args &&... args) { + return std::make_unique<Op>(popExprOnly(), std::forward<Args>(args)...); + } + + template<typename Op> + std::unique_ptr<Expr> applyBinary(PosIdx pos) { + auto right = popExprOnly(), left = popExprOnly(); + return std::make_unique<Op>(pos, std::move(left), std::move(right)); + } + + std::unique_ptr<Expr> call(PosIdx pos, Symbol fn, bool flip = false) + { + std::vector<std::unique_ptr<Expr>> args(2); + args[flip ? 0 : 1] = popExprOnly(); + args[flip ? 1 : 0] = popExprOnly(); + return std::make_unique<ExprCall>(pos, std::make_unique<ExprVar>(fn), std::move(args)); + } + + std::unique_ptr<Expr> order(PosIdx pos, bool less, State & state) + { + return call(pos, state.s.lessThan, !less); + } + + std::unique_ptr<Expr> concatStrings(PosIdx pos) + { + std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> args(2); + args[1] = popExpr(); + args[0] = popExpr(); + return std::make_unique<ExprConcatStrings>(pos, false, std::move(args)); + } + + std::unique_ptr<Expr> negate(PosIdx pos, State & state) + { + std::vector<std::unique_ptr<Expr>> args(2); + args[0] = std::make_unique<ExprInt>(0); + args[1] = popExprOnly(); + return std::make_unique<ExprCall>(pos, std::make_unique<ExprVar>(state.s.sub), std::move(args)); + } + + std::pair<PosIdx, std::unique_ptr<Expr>> applyOp(PosIdx pos, auto & op, State & state) { + using Op = grammar::op; + + auto not_ = [] (auto e) { + return std::make_unique<ExprOpNot>(std::move(e)); + }; + + return { + pos, + (overloaded { + [&] (Op::implies) { return applyBinary<ExprOpImpl>(pos); }, + [&] (Op::or_) { return applyBinary<ExprOpOr>(pos); }, + [&] (Op::and_) { return applyBinary<ExprOpAnd>(pos); }, + [&] (Op::equals) { return applyBinary<ExprOpEq>(pos); }, + [&] (Op::not_equals) { return applyBinary<ExprOpNEq>(pos); }, + [&] (Op::less) { return order(pos, true, state); }, + [&] (Op::greater_eq) { return not_(order(pos, true, state)); }, + [&] (Op::greater) { return order(pos, false, state); }, + [&] (Op::less_eq) { return not_(order(pos, false, state)); }, + [&] (Op::update) { return applyBinary<ExprOpUpdate>(pos); }, + [&] (Op::not_) { return applyUnary<ExprOpNot>(); }, + [&] (Op::plus) { return concatStrings(pos); }, + [&] (Op::minus) { return call(pos, state.s.sub); }, + [&] (Op::mul) { return call(pos, state.s.mul); }, + [&] (Op::div) { return call(pos, state.s.div); }, + [&] (Op::concat) { return applyBinary<ExprOpConcatLists>(pos); }, + [&] (has_attr & a) { return applyUnary<ExprOpHasAttr>(std::move(a.path)); }, + [&] (Op::unary_minus) { return negate(pos, state); }, + })(op) + }; + } + + // always_inline is needed, otherwise pushOp slows down considerably + [[noreturn, gnu::always_inline]] + static void badOperator(PosIdx pos, State & state) + { + throw ParseError({ + .msg = HintFmt("syntax error, unexpected operator"), + .pos = state.positions[pos] + }); + } + + template<typename Expr, typename... Args> + Expr & pushExpr(PosIdx pos, Args && ... args) + { + auto p = std::make_unique<Expr>(std::forward<Args>(args)...); + auto & result = *p; + exprs.emplace_back(pos, std::move(p)); + return result; + } +}; + +struct SubexprState { +private: + ExprState * up; + +public: + explicit SubexprState(ExprState & up, auto &...) : up(&up) {} + operator ExprState &() { return *up; } + ExprState * operator->() { return up; } +}; + + + +template<typename Rule> +struct BuildAST : grammar::nothing<Rule> {}; + +struct LambdaState : SubexprState { + using SubexprState::SubexprState; + + Symbol arg; + std::unique_ptr<Formals> formals; +}; + +struct FormalsState : SubexprState { + using SubexprState::SubexprState; + + Formals formals{}; + Formal formal{}; +}; + +template<> struct BuildAST<grammar::formal::name> { + static void apply(const auto & in, FormalsState & s, State & ps) { + s.formal = { + .pos = ps.at(in), + .name = ps.symbols.create(in.string_view()), + }; + } +}; + +template<> struct BuildAST<grammar::formal> { + static void apply0(FormalsState & s, State &) { + s.formals.formals.emplace_back(std::move(s.formal)); + } +}; + +template<> struct BuildAST<grammar::formal::default_value> { + static void apply0(FormalsState & s, State & ps) { + s.formal.def = s->popExprOnly(); + } +}; + +template<> struct BuildAST<grammar::formals::ellipsis> { + static void apply0(FormalsState & s, State &) { + s.formals.ellipsis = true; + } +}; + +template<> struct BuildAST<grammar::formals> : change_head<FormalsState> { + static void success0(FormalsState & f, LambdaState & s, State &) { + s.formals = std::make_unique<Formals>(std::move(f.formals)); + } +}; + +struct AttrState : SubexprState { + using SubexprState::SubexprState; + + std::vector<AttrName> attrs; + + void pushAttr(auto && attr, PosIdx) { attrs.emplace_back(std::move(attr)); } +}; + +template<> struct BuildAST<grammar::attr::simple> { + static void apply(const auto & in, auto & s, State & ps) { + s.pushAttr(ps.symbols.create(in.string_view()), ps.at(in)); + } +}; + +template<> struct BuildAST<grammar::attr::string> { + static void apply(const auto & in, auto & s, State & ps) { + auto e = s->popExprOnly(); + if (auto str = dynamic_cast<ExprString *>(e.get())) + s.pushAttr(ps.symbols.create(str->s), ps.at(in)); + else + s.pushAttr(std::move(e), ps.at(in)); + } +}; + +template<> struct BuildAST<grammar::attr::expr> : BuildAST<grammar::attr::string> {}; + +struct BindingsState : SubexprState { + using SubexprState::SubexprState; + + ExprAttrs attrs; + AttrPath path; + std::unique_ptr<Expr> value; +}; + +struct InheritState : SubexprState { + using SubexprState::SubexprState; + + std::vector<std::pair<AttrName, PosIdx>> attrs; + std::unique_ptr<Expr> from; + PosIdx fromPos; + + void pushAttr(auto && attr, PosIdx pos) { attrs.emplace_back(std::move(attr), pos); } +}; + +template<> struct BuildAST<grammar::inherit::from> { + static void apply(const auto & in, InheritState & s, State & ps) { + s.from = s->popExprOnly(); + s.fromPos = ps.at(in); + } +}; + +template<> struct BuildAST<grammar::inherit> : change_head<InheritState> { + static void success0(InheritState & s, BindingsState & b, State & ps) { + auto & attrs = b.attrs.attrs; + // TODO this should not reuse generic attrpath rules. + for (auto & [i, iPos] : s.attrs) { + if (i.symbol) + continue; + if (auto str = dynamic_cast<ExprString *>(i.expr.get())) + i = AttrName(ps.symbols.create(str->s)); + else { + throw ParseError({ + .msg = HintFmt("dynamic attributes not allowed in inherit"), + .pos = ps.positions[iPos] + }); + } + } + if (auto fromE = std::move(s.from)) { + if (!b.attrs.inheritFromExprs) + b.attrs.inheritFromExprs = std::make_unique<std::vector<std::unique_ptr<Expr>>>(); + b.attrs.inheritFromExprs->push_back(std::move(fromE)); + for (auto & [i, iPos] : s.attrs) { + if (attrs.find(i.symbol) != attrs.end()) + ps.dupAttr(i.symbol, iPos, attrs[i.symbol].pos); + auto from = std::make_unique<ExprInheritFrom>(s.fromPos, b.attrs.inheritFromExprs->size() - 1); + attrs.emplace( + i.symbol, + ExprAttrs::AttrDef( + std::make_unique<ExprSelect>(iPos, std::move(from), i.symbol), + iPos, + ExprAttrs::AttrDef::Kind::InheritedFrom)); + } + } else { + for (auto & [i, iPos] : s.attrs) { + if (attrs.find(i.symbol) != attrs.end()) + ps.dupAttr(i.symbol, iPos, attrs[i.symbol].pos); + attrs.emplace( + i.symbol, + ExprAttrs::AttrDef( + std::make_unique<ExprVar>(iPos, i.symbol), + iPos, + ExprAttrs::AttrDef::Kind::Inherited)); + } + } + } +}; + +template<> struct BuildAST<grammar::binding::path> : change_head<AttrState> { + static void success0(AttrState & a, BindingsState & s, State & ps) { + s.path = std::move(a.attrs); + } +}; + +template<> struct BuildAST<grammar::binding::value> { + static void apply0(BindingsState & s, State & ps) { + s.value = s->popExprOnly(); + } +}; + +template<> struct BuildAST<grammar::binding> { + static void apply(const auto & in, BindingsState & s, State & ps) { + ps.addAttr(&s.attrs, std::move(s.path), std::move(s.value), ps.at(in)); + } +}; + +template<> struct BuildAST<grammar::expr::id> { + static void apply(const auto & in, ExprState & s, State & ps) { + if (in.string_view() == "__curPos") + s.pushExpr<ExprPos>(ps.at(in), ps.at(in)); + else + s.pushExpr<ExprVar>(ps.at(in), ps.at(in), ps.symbols.create(in.string_view())); + } +}; + +template<> struct BuildAST<grammar::expr::int_> { + static void apply(const auto & in, ExprState & s, State & ps) { + int64_t v; + if (std::from_chars(in.begin(), in.end(), v).ec != std::errc{}) { + throw ParseError({ + .msg = HintFmt("invalid integer '%1%'", in.string_view()), + .pos = ps.positions[ps.at(in)], + }); + } + s.pushExpr<ExprInt>(noPos, v); + } +}; + +template<> struct BuildAST<grammar::expr::float_> { + static void apply(const auto & in, ExprState & s, State & ps) { + // copy the input into a temporary string so we can call stod. + // can't use from_chars because libc++ (thus darwin) does not have it, + // and floats are not performance-sensitive anyway. if they were you'd + // be in much bigger trouble than this. + // + // we also get to do a locale-save dance because stod is locale-aware and + // something (a plugin?) may have called setlocale or uselocale. + static struct locale_hack { + locale_t posix; + locale_hack(): posix(newlocale(LC_ALL_MASK, "POSIX", 0)) + { + if (posix == 0) + throw SysError("could not get POSIX locale"); + } + } locale; + + auto tmp = in.string(); + double v = [&] { + auto oldLocale = uselocale(locale.posix); + Finally resetLocale([=] { uselocale(oldLocale); }); + try { + return std::stod(tmp); + } catch (...) { + throw ParseError({ + .msg = HintFmt("invalid float '%1%'", in.string_view()), + .pos = ps.positions[ps.at(in)], + }); + } + }(); + s.pushExpr<ExprFloat>(noPos, v); + } +}; + +struct StringState : SubexprState { + using SubexprState::SubexprState; + + std::string currentLiteral; + PosIdx currentPos; + std::vector<std::pair<nix::PosIdx, std::unique_ptr<Expr>>> parts; + + void append(PosIdx pos, std::string_view s) + { + if (currentLiteral.empty()) + currentPos = pos; + currentLiteral += s; + } + + // FIXME this truncates strings on NUL for compat with the old parser. ideally + // we should use the decomposition the g gives us instead of iterating over + // the entire string again. + static void unescapeStr(std::string & str) + { + char * s = str.data(); + char * t = s; + char c; + while ((c = *s++)) { + if (c == '\\') { + c = *s++; + if (c == 'n') *t = '\n'; + else if (c == 'r') *t = '\r'; + else if (c == 't') *t = '\t'; + else *t = c; + } + else if (c == '\r') { + /* Normalise CR and CR/LF into LF. */ + *t = '\n'; + if (*s == '\n') s++; /* cr/lf */ + } + else *t = c; + t++; + } + str.resize(t - str.data()); + } + + void endLiteral() + { + if (!currentLiteral.empty()) { + unescapeStr(currentLiteral); + parts.emplace_back(currentPos, std::make_unique<ExprString>(std::move(currentLiteral))); + } + } + + std::unique_ptr<Expr> finish() + { + if (parts.empty()) { + unescapeStr(currentLiteral); + return std::make_unique<ExprString>(std::move(currentLiteral)); + } else { + endLiteral(); + auto pos = parts[0].first; + return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts)); + } + } +}; + +template<typename... Content> struct BuildAST<grammar::string::literal<Content...>> { + static void apply(const auto & in, StringState & s, State & ps) { + s.append(ps.at(in), in.string_view()); + } +}; + +template<> struct BuildAST<grammar::string::cr_lf> { + static void apply(const auto & in, StringState & s, State & ps) { + s.append(ps.at(in), in.string_view()); // FIXME compat with old parser + } +}; + +template<> struct BuildAST<grammar::string::interpolation> { + static void apply(const auto & in, StringState & s, State & ps) { + s.endLiteral(); + s.parts.emplace_back(ps.at(in), s->popExprOnly()); + } +}; + +template<> struct BuildAST<grammar::string::escape> { + static void apply(const auto & in, StringState & s, State & ps) { + s.append(ps.at(in), "\\"); // FIXME compat with old parser + s.append(ps.at(in), in.string_view()); + } +}; + +template<> struct BuildAST<grammar::string> : change_head<StringState> { + static void success0(StringState & s, ExprState & e, State &) { + e.exprs.emplace_back(noPos, s.finish()); + } +}; + +struct IndStringState : SubexprState { + using SubexprState::SubexprState; + + std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts; +}; + +template<bool Indented, typename... Content> +struct BuildAST<grammar::ind_string::literal<Indented, Content...>> { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented}); + } +}; + +template<> struct BuildAST<grammar::ind_string::interpolation> { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.parts.emplace_back(ps.at(in), s->popExprOnly()); + } +}; + +template<> struct BuildAST<grammar::ind_string::escape> { + static void apply(const auto & in, IndStringState & s, State & ps) { + switch (*in.begin()) { + case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break; + case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break; + case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break; + default: s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; + } + } +}; + +template<> struct BuildAST<grammar::ind_string> : change_head<IndStringState> { + static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) { + e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts))); + } +}; + +template<typename... Content> struct BuildAST<grammar::path::literal<Content...>> { + static void apply(const auto & in, StringState & s, State & ps) { + s.append(ps.at(in), in.string_view()); + s.endLiteral(); + } +}; + +template<> struct BuildAST<grammar::path::interpolation> : BuildAST<grammar::string::interpolation> {}; + +template<> struct BuildAST<grammar::path::anchor> { + static void apply(const auto & in, StringState & s, State & ps) { + Path path(absPath(in.string(), ps.basePath.path.abs())); + /* add back in the trailing '/' to the first segment */ + if (in.string_view().ends_with('/') && in.size() > 1) + path += "/"; + s.parts.emplace_back(ps.at(in), new ExprPath(std::move(path))); + } +}; + +template<> struct BuildAST<grammar::path::home_anchor> { + static void apply(const auto & in, StringState & s, State & ps) { + if (evalSettings.pureEval) + throw Error("the path '%s' can not be resolved in pure mode", in.string_view()); + Path path(getHome() + in.string_view().substr(1)); + s.parts.emplace_back(ps.at(in), new ExprPath(std::move(path))); + } +}; + +template<> struct BuildAST<grammar::path::searched_path> { + static void apply(const auto & in, StringState & s, State & ps) { + std::vector<std::unique_ptr<Expr>> args{2}; + args[0] = std::make_unique<ExprVar>(ps.s.nixPath); + args[1] = std::make_unique<ExprString>(in.string()); + s.parts.emplace_back( + ps.at(in), + std::make_unique<ExprCall>( + ps.at(in), + std::make_unique<ExprVar>(ps.s.findFile), + std::move(args))); + } +}; + +template<> struct BuildAST<grammar::path> : change_head<StringState> { + template<typename E> + static void check_slash(PosIdx end, StringState & s, State & ps) { + auto e = dynamic_cast<E *>(s.parts.back().second.get()); + if (!e || !e->s.ends_with('/')) + return; + if (s.parts.size() > 1 || e->s != "/") + throw ParseError({ + .msg = HintFmt("path has a trailing slash"), + .pos = ps.positions[end], + }); + } + + static void success(const auto & in, StringState & s, ExprState & e, State & ps) { + s.endLiteral(); + check_slash<ExprPath>(ps.atEnd(in), s, ps); + check_slash<ExprString>(ps.atEnd(in), s, ps); + if (s.parts.size() == 1) { + e.exprs.emplace_back(noPos, std::move(s.parts.back().second)); + } else { + e.pushExpr<ExprConcatStrings>(ps.at(in), ps.at(in), false, std::move(s.parts)); + } + } +}; + +// strings and paths sare handled fully by the grammar-level rule for now +template<> struct BuildAST<grammar::expr::string> : p::maybe_nothing {}; +template<> struct BuildAST<grammar::expr::ind_string> : p::maybe_nothing {}; +template<> struct BuildAST<grammar::expr::path> : p::maybe_nothing {}; + +template<> struct BuildAST<grammar::expr::uri> { + static void apply(const auto & in, ExprState & s, State & ps) { + static bool noURLLiterals = experimentalFeatureSettings.isEnabled(Xp::NoUrlLiterals); + if (noURLLiterals) + throw ParseError({ + .msg = HintFmt("URL literals are disabled"), + .pos = ps.positions[ps.at(in)] + }); + s.pushExpr<ExprString>(ps.at(in), in.string()); + } +}; + +template<> struct BuildAST<grammar::expr::ancient_let> : change_head<BindingsState> { + static void success(const auto & in, BindingsState & b, ExprState & s, State & ps) { + b.attrs.pos = ps.at(in); + b.attrs.recursive = true; + s.pushExpr<ExprSelect>(b.attrs.pos, b.attrs.pos, std::make_unique<ExprAttrs>(std::move(b.attrs)), ps.s.body); + } +}; + +template<> struct BuildAST<grammar::expr::rec_set> : change_head<BindingsState> { + static void success(const auto & in, BindingsState & b, ExprState & s, State & ps) { + b.attrs.pos = ps.at(in); + b.attrs.recursive = true; + s.pushExpr<ExprAttrs>(b.attrs.pos, std::move(b.attrs)); + } +}; + +template<> struct BuildAST<grammar::expr::set> : change_head<BindingsState> { + static void success(const auto & in, BindingsState & b, ExprState & s, State & ps) { + b.attrs.pos = ps.at(in); + s.pushExpr<ExprAttrs>(b.attrs.pos, std::move(b.attrs)); + } +}; + +using ListState = std::vector<std::unique_ptr<Expr>>; + +template<> struct BuildAST<grammar::expr::list> : change_head<ListState> { + static void success(const auto & in, ListState & ls, ExprState & s, State & ps) { + auto e = std::make_unique<ExprList>(); + e->elems = std::move(ls); + s.exprs.emplace_back(ps.at(in), std::move(e)); + } +}; + +template<> struct BuildAST<grammar::expr::list::entry> : change_head<ExprState> { + static void success0(ExprState & e, ListState & s, State & ps) { + s.emplace_back(e.finish(ps).second); + } +}; + +struct SelectState : SubexprState { + using SubexprState::SubexprState; + + PosIdx pos; + ExprSelect * e = nullptr; +}; + +template<> struct BuildAST<grammar::expr::select::head> { + static void apply(const auto & in, SelectState & s, State & ps) { + s.pos = ps.at(in); + } +}; + +template<> struct BuildAST<grammar::expr::select::attr> : change_head<AttrState> { + static void success0(AttrState & a, SelectState & s, State &) { + s.e = &s->pushExpr<ExprSelect>(s.pos, s.pos, s->popExprOnly(), std::move(a.attrs), nullptr); + } +}; + +template<> struct BuildAST<grammar::expr::select::attr_or> { + static void apply0(SelectState & s, State &) { + s.e->def = s->popExprOnly(); + } +}; + +template<> struct BuildAST<grammar::expr::select::as_app_or> { + static void apply(const auto & in, SelectState & s, State & ps) { + std::vector<std::unique_ptr<Expr>> args(1); + args[0] = std::make_unique<ExprVar>(ps.at(in), ps.s.or_); + s->pushExpr<ExprCall>(s.pos, s.pos, s->popExprOnly(), std::move(args)); + } +}; + +template<> struct BuildAST<grammar::expr::select> : change_head<SelectState> { + static void success0(const auto &...) {} +}; + +struct AppState : SubexprState { + using SubexprState::SubexprState; + + PosIdx pos; + ExprCall * e = nullptr; +}; + +template<> struct BuildAST<grammar::expr::app::select_or_fn> { + static void apply(const auto & in, AppState & s, State & ps) { + s.pos = ps.at(in); + } +}; + +template<> struct BuildAST<grammar::expr::app::first_arg> { + static void apply(auto & in, AppState & s, State & ps) { + auto arg = s->popExprOnly(), fn = s->popExprOnly(); + if ((s.e = dynamic_cast<ExprCall *>(fn.get()))) { + // TODO remove. + // AST compat with old parser, semantics are the same. + // this can happen on occasions such as `<p> <p>` or `a or b or`, + // neither of which are super worth optimizing. + s.e->args.push_back(std::move(arg)); + s->exprs.emplace_back(noPos, std::move(fn)); + } else { + std::vector<std::unique_ptr<Expr>> args{1}; + args[0] = std::move(arg); + s.e = &s->pushExpr<ExprCall>(s.pos, s.pos, std::move(fn), std::move(args)); + } + } +}; + +template<> struct BuildAST<grammar::expr::app::another_arg> { + static void apply0(AppState & s, State & ps) { + s.e->args.push_back(s->popExprOnly()); + } +}; + +template<> struct BuildAST<grammar::expr::app> : change_head<AppState> { + static void success0(const auto &...) {} +}; + +template<typename Op> struct BuildAST<grammar::expr::operator_<Op>> { + static void apply(const auto & in, ExprState & s, State & ps) { + s.pushOp(ps.at(in), Op{}, ps); + } +}; +template<> struct BuildAST<grammar::expr::operator_<grammar::op::has_attr>> : change_head<AttrState> { + static void success(const auto & in, AttrState & a, ExprState & s, State & ps) { + s.pushOp(ps.at(in), ExprState::has_attr{{}, std::move(a.attrs)}, ps); + } +}; + +template<> struct BuildAST<grammar::expr::lambda::arg> { + static void apply(const auto & in, LambdaState & s, State & ps) { + s.arg = ps.symbols.create(in.string_view()); + } +}; + +template<> struct BuildAST<grammar::expr::lambda> : change_head<LambdaState> { + static void success(const auto & in, LambdaState & l, ExprState & s, State & ps) { + if (l.formals) + l.formals = ps.validateFormals(std::move(l.formals), ps.at(in), l.arg); + s.pushExpr<ExprLambda>(ps.at(in), ps.at(in), l.arg, std::move(l.formals), l->popExprOnly()); + } +}; + +template<> struct BuildAST<grammar::expr::assert_> { + static void apply(const auto & in, ExprState & s, State & ps) { + auto body = s.popExprOnly(), cond = s.popExprOnly(); + s.pushExpr<ExprAssert>(ps.at(in), ps.at(in), std::move(cond), std::move(body)); + } +}; + +template<> struct BuildAST<grammar::expr::with> { + static void apply(const auto & in, ExprState & s, State & ps) { + auto body = s.popExprOnly(), scope = s.popExprOnly(); + s.pushExpr<ExprWith>(ps.at(in), ps.at(in), std::move(scope), std::move(body)); + } +}; + +template<> struct BuildAST<grammar::expr::let> : change_head<BindingsState> { + static void success(const auto & in, BindingsState & b, ExprState & s, State & ps) { + if (!b.attrs.dynamicAttrs.empty()) + throw ParseError({ + .msg = HintFmt("dynamic attributes not allowed in let"), + .pos = ps.positions[ps.at(in)] + }); + + s.pushExpr<ExprLet>(ps.at(in), std::make_unique<ExprAttrs>(std::move(b.attrs)), b->popExprOnly()); + } +}; + +template<> struct BuildAST<grammar::expr::if_> { + static void apply(const auto & in, ExprState & s, State & ps) { + auto else_ = s.popExprOnly(), then = s.popExprOnly(), cond = s.popExprOnly(); + s.pushExpr<ExprIf>(ps.at(in), ps.at(in), std::move(cond), std::move(then), std::move(else_)); + } +}; + +template<> struct BuildAST<grammar::expr> : change_head<ExprState> { + static void success0(ExprState & inner, ExprState & outer, State & ps) { + outer.exprs.push_back(inner.finish(ps)); + } +}; + +} +} + +namespace nix { + +Expr * EvalState::parse( + char * text, + size_t length, + Pos::Origin origin, + const SourcePath & basePath, + std::shared_ptr<StaticEnv> & staticEnv) +{ + parser::State s = { + symbols, + positions, + basePath, + positions.addOrigin(origin, length), + exprSymbols, + }; + parser::ExprState x; + + assert(length >= 2); + assert(text[length - 1] == 0); + assert(text[length - 2] == 0); + length -= 2; + + p::string_input<p::tracking_mode::lazy> inp{std::string_view{text, length}, "input"}; + try { + p::parse<parser::grammar::root, parser::BuildAST, parser::Control>(inp, x, s); + } catch (p::parse_error & e) { + auto pos = e.positions().back(); + throw ParseError({ + .msg = HintFmt("syntax error, %s", e.message()), + .pos = positions[s.positions.add(s.origin, pos.byte)] + }); + } + + auto [_pos, result] = x.finish(s); + result->bindVars(*this, staticEnv); + return result.release(); +} + +} diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh new file mode 100644 index 000000000..f5a0428d7 --- /dev/null +++ b/src/libexpr/parser/state.hh @@ -0,0 +1,259 @@ +#pragma once +///@file + +#include "eval.hh" + +namespace nix::parser { + +struct StringToken +{ + std::string_view s; + bool hasIndentation; + operator std::string_view() const { return s; } +}; + +struct State +{ + SymbolTable & symbols; + PosTable & positions; + SourcePath basePath; + PosTable::Origin origin; + const Expr::AstSymbols & s; + + void dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos); + void dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos); + void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos); + std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {}); + std::unique_ptr<Expr> stripIndentation(const PosIdx pos, + std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es); + + // lazy positioning means we don't get byte offsets directly, in.position() would work + // but also requires line and column (which is expensive) + PosIdx at(const auto & in) + { + return positions.add(origin, in.begin() - in.input().begin()); + } + + PosIdx atEnd(const auto & in) + { + return positions.add(origin, in.end() - in.input().begin()); + } +}; + +inline void State::dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = HintFmt("attribute '%1%' already defined at %2%", + showAttrPath(symbols, attrPath), positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = HintFmt("attribute '%1%' already defined at %2%", symbols[attr], positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos) +{ + AttrPath::iterator i; + // All attrpaths have at least one attr + assert(!attrPath.empty()); + // Checking attrPath validity. + // =========================== + for (i = attrPath.begin(); i + 1 < attrPath.end(); i++) { + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + if (j->second.kind != ExprAttrs::AttrDef::Kind::Inherited) { + ExprAttrs * attrs2 = dynamic_cast<ExprAttrs *>(j->second.e.get()); + if (!attrs2) { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + attrs = attrs2; + } else { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + } else { + auto next = attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::make_unique<ExprAttrs>(), pos)); + attrs = static_cast<ExprAttrs *>(next.first->second.e.get()); + } + } else { + auto & next = attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::make_unique<ExprAttrs>(), pos); + attrs = static_cast<ExprAttrs *>(next.valueExpr.get()); + } + } + // Expr insertion. + // ========================== + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + // This attr path is already defined. However, if both + // e and the expr pointed by the attr path are two attribute sets, + // we want to merge them. + // Otherwise, throw an error. + auto * ae = dynamic_cast<ExprAttrs *>(e.get()); + auto * jAttrs = dynamic_cast<ExprAttrs *>(j->second.e.get()); + if (jAttrs && ae) { + if (ae->inheritFromExprs && !jAttrs->inheritFromExprs) + jAttrs->inheritFromExprs = std::make_unique<std::vector<std::unique_ptr<Expr>>>(); + for (auto & ad : ae->attrs) { + auto j2 = jAttrs->attrs.find(ad.first); + if (j2 != jAttrs->attrs.end()) // Attr already defined in iAttrs, error. + return dupAttr(ad.first, j2->second.pos, ad.second.pos); + if (ad.second.kind == ExprAttrs::AttrDef::Kind::InheritedFrom) { + auto & sel = dynamic_cast<ExprSelect &>(*ad.second.e); + auto & from = dynamic_cast<ExprInheritFrom &>(*sel.e); + from.displ += jAttrs->inheritFromExprs->size(); + } + jAttrs->attrs.emplace(ad.first, std::move(ad.second)); + } + std::ranges::move(ae->dynamicAttrs, std::back_inserter(jAttrs->dynamicAttrs)); + if (ae->inheritFromExprs) + std::ranges::move(*ae->inheritFromExprs, std::back_inserter(*jAttrs->inheritFromExprs)); + } else { + dupAttr(attrPath, pos, j->second.pos); + } + } else { + // This attr path is not defined. Let's create it. + e->setName(i->symbol); + attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::move(e), pos)); + } + } else { + attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::move(e), pos); + } +} + +inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals> formals, PosIdx pos, Symbol arg) +{ + std::sort(formals->formals.begin(), formals->formals.end(), + [] (const auto & a, const auto & b) { + return std::tie(a.name, a.pos) < std::tie(b.name, b.pos); + }); + + std::optional<std::pair<Symbol, PosIdx>> duplicate; + for (size_t i = 0; i + 1 < formals->formals.size(); i++) { + if (formals->formals[i].name != formals->formals[i + 1].name) + continue; + std::pair thisDup{formals->formals[i].name, formals->formals[i + 1].pos}; + duplicate = std::min(thisDup, duplicate.value_or(thisDup)); + } + if (duplicate) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[duplicate->first]), + .pos = positions[duplicate->second] + }); + + if (arg && formals->has(arg)) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[arg]), + .pos = positions[pos] + }); + + return formals; +} + +inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos, + std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es) +{ + if (es.empty()) return std::make_unique<ExprString>(""); + + /* Figure out the minimum indentation. Note that by design + whitespace-only final lines are not taken into account. (So + the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ + bool atStartOfLine = true; /* = seen only whitespace in the current line */ + size_t minIndent = 1000000; + size_t curIndent = 0; + for (auto & [i_pos, i] : es) { + auto * str = std::get_if<StringToken>(&i); + if (!str || !str->hasIndentation) { + /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ + if (atStartOfLine) { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + continue; + } + for (size_t j = 0; j < str->s.size(); ++j) { + if (atStartOfLine) { + if (str->s[j] == ' ') + curIndent++; + else if (str->s[j] == '\n') { + /* Empty line, doesn't influence minimum + indentation. */ + curIndent = 0; + } else { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + } else if (str->s[j] == '\n') { + atStartOfLine = true; + curIndent = 0; + } + } + } + + /* Strip spaces from each line. */ + std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2; + atStartOfLine = true; + size_t curDropped = 0; + size_t n = es.size(); + auto i = es.begin(); + const auto trimExpr = [&] (std::unique_ptr<Expr> e) { + atStartOfLine = false; + curDropped = 0; + es2.emplace_back(i->first, std::move(e)); + }; + const auto trimString = [&] (const StringToken t) { + std::string s2; + for (size_t j = 0; j < t.s.size(); ++j) { + if (atStartOfLine) { + if (t.s[j] == ' ') { + if (curDropped++ >= minIndent) + s2 += t.s[j]; + } + else if (t.s[j] == '\n') { + curDropped = 0; + s2 += t.s[j]; + } else { + atStartOfLine = false; + curDropped = 0; + s2 += t.s[j]; + } + } else { + s2 += t.s[j]; + if (t.s[j] == '\n') atStartOfLine = true; + } + } + + /* Remove the last line if it is empty and consists only of + spaces. */ + if (n == 1) { + std::string::size_type p = s2.find_last_of('\n'); + if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) + s2 = std::string(s2, 0, p + 1); + } + + es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2))); + }; + for (; i != es.end(); ++i, --n) { + std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + } + + /* If this is a single string, then don't do a concatenation. */ + if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) { + return std::move(es2[0].second); + } + return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2)); +} + +} |