#pragma once ///@file #include "tao/pegtl.hpp" #include #include #include // NOTE // nix line endings are \n, \r\n, \r. the grammar does not use eol or // eolf rules in favor of reproducing the old flex lexer as faithfully as // possible, and deferring calculation of positions to downstream users. namespace nix::parser::grammar::v1 { using namespace tao::pegtl; namespace p = tao::pegtl; // character classes namespace c { struct path : sor< ranges<'a', 'z', 'A', 'Z', '0', '9'>, one<'.', '_', '-', '+'> > {}; struct path_sep : one<'/'> {}; struct id_first : ranges<'a', 'z', 'A', 'Z', '_'> {}; struct id_rest : sor< ranges<'a', 'z', 'A', 'Z', '0', '9'>, one<'_', '\'', '-'> > {}; struct uri_scheme_first : ranges<'a', 'z', 'A', 'Z'> {}; struct uri_scheme_rest : sor< ranges<'a', 'z', 'A', 'Z', '0', '9'>, one<'+', '-', '.'> > {}; struct uri_sep : one<':'> {}; struct uri_rest : sor< ranges<'a', 'z', 'A', 'Z', '0', '9'>, one<'%', '/', '?', ':', '@', '&', '=', '+', '$', ',', '-', '_', '.', '!', '~', '*', '\''> > {}; } // "tokens". PEGs don't really care about tokens, we merely use them as a convenient // way of writing down keywords and a couple complicated syntax rules. namespace t { struct _extend_as_path : seq< star, not_at, not_at, c::path_sep, sor > {}; struct _extend_as_uri : seq< star, c::uri_sep, c::uri_rest > {}; // keywords might be extended to identifiers, paths, or uris. // NOTE this assumes that keywords are a-zA-Z only, otherwise uri schemes would never // match correctly. // NOTE not a simple seq<...> because this would report incorrect positions for // keywords used inside must<> if a prefix of the keyword matches. 
template struct _keyword : sor< seq< S, not_at, not_at<_extend_as_path>, not_at<_extend_as_uri> >, failure > {}; struct kw_if : _keyword {}; struct kw_then : _keyword {}; struct kw_else : _keyword {}; struct kw_assert : _keyword {}; struct kw_with : _keyword {}; struct kw_let : _keyword {}; struct kw_in : _keyword {}; struct kw_rec : _keyword {}; struct kw_inherit : _keyword {}; struct kw_or : _keyword {}; // `-` can be a unary prefix op, a binary infix op, or the first character // of a path or -> (ex 1->1--1) // `/` can be a path leader or an operator (ex a?a /a) struct op_minus : seq, not_at'>>, not_at<_extend_as_path>> {}; struct op_div : seq, not_at> {}; // match a rule, making sure we are not matching it where a keyword would match. // using minus like this is a lot faster than flipping the order and using seq. template struct _not_at_any_keyword : minus< seq, sor< TAO_PEGTL_STRING("inherit"), TAO_PEGTL_STRING("assert"), TAO_PEGTL_STRING("else"), TAO_PEGTL_STRING("then"), TAO_PEGTL_STRING("with"), TAO_PEGTL_STRING("let"), TAO_PEGTL_STRING("rec"), TAO_PEGTL_STRING("if"), TAO_PEGTL_STRING("in"), TAO_PEGTL_STRING("or") > > {}; // identifiers are kind of horrid: // // - uri_scheme_first ⊂ id_first // - uri_scheme_first ⊂ uri_scheme_rest ⊂ path // - id_first ⊂ id_rest ∖ { ' } ⊂ path // - id_first ∩ (path ∖ uri_scheme_first) = { _ } // - uri_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest, path } // - path_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest } // // and we want, without reading the input more than once, a string that // matches (id_first id_rest*) and is not followed by any number of // characters such that the extended string matches path or uri rules. // // since the first character must be either _ or a uri scheme character // we can ignore path-like bits at the beginning. uri_sep cannot appear anywhere // in an identifier, so it's only needed in lookahead checks at the uri-like // prefix. 
likewise path_sep cannot appear anywhere in the identifier, so it's // only needed in lookahead checks in the path-like prefix. // // in total that gives us a decomposition of // // (uri-scheme-like? (?! continues-as-uri) | _) // (path-segment-like? (?! continues-as-path)) // id_rest* struct identifier : _not_at_any_keyword< // we don't use (at, ...) matches here because identifiers are // a really hot path and rewinding as needed by at<> isn't entirely free. sor< seq< c::uri_scheme_first, star>, not_at<_extend_as_uri> >, one<'_'> >, star, one<'_', '-'>>>, not_at<_extend_as_path>, star > {}; // floats may extend ints, thus these rules are very similar. struct integer : seq< sor< seq, star, not_at>>, seq, not_at, digit>, star> >, not_at<_extend_as_path> > {}; struct floating : seq< sor< seq, star, one<'.'>, star>, seq>, one<'.'>, plus> >, opt, opt>, plus>, not_at<_extend_as_path> > {}; struct uri : seq< c::uri_scheme_first, star, c::uri_sep, plus > {}; struct sep : sor< plus>, seq, star>>, seq, until>> > {}; } using seps = star; // marker for semantic rules. not handling one of these in an action that cares about // semantics is probably an error. 
struct semantic {}; struct expr; struct _string { template struct literal : semantic, seq {}; struct cr_lf : semantic, seq, opt>> {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, must, seps, must> > {}; struct escape : semantic, must {}; }; struct string : _string, seq< one<'"'>, star< sor< _string::literal>>, _string::cr_lf, _string::interpolation, _string::literal, opt>>, seq, _string::escape> > >, must> > {}; struct _ind_string { struct line_start : semantic, star> {}; template struct literal : semantic, seq {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, must, seps, must> > {}; struct escape : semantic, must {}; /* Marker for non-empty lines */ struct has_content : semantic, seq<> {}; }; struct ind_string : _ind_string, seq< TAO_PEGTL_STRING("''"), // Strip first line completely if empty opt>, one<'\n'>>, list< seq< // Start a line with some indentation // (we always match even the empty string if no indentation, as this creates the line) _ind_string::line_start, // The actual line opt< plus< sor< _ind_string::literal< true, plus< sor< not_one<'$', '\'', '\n'>, // TODO probably factor this out like the others for performance seq, not_one<'{', '\'', '\n'>>, seq, at>>, seq, not_one<'\'', '$', '\n'>>, seq, at>> > > >, _ind_string::interpolation, _ind_string::literal>, _ind_string::literal, not_at>>, seq, _ind_string::literal>>, seq< p::string<'\'', '\''>, sor< _ind_string::literal>, seq, _ind_string::escape> > > >, _ind_string::has_content > > >, // End of line, LF. CR is just ignored and not treated as ending a line // (for the purpose of indentation stripping) _ind_string::literal> >, must > {}; struct _path { // legacy lexer rules. extra l_ to avoid reserved c++ identifiers. 
struct _l_PATH : seq, plus>, opt> {}; struct _l_PATH_SEG : seq, c::path_sep> {}; struct _l_HPATH : seq, plus>, opt> {}; struct _l_HPATH_START : TAO_PEGTL_STRING("~/") {}; struct _path_str : sor<_l_PATH, _l_PATH_SEG, plus> {}; // modern rules template struct literal : semantic, seq {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, must, seps, must> > {}; struct anchor : semantic, sor< _l_PATH, seq<_l_PATH_SEG, at> > {}; struct home_anchor : semantic, sor< _l_HPATH, seq<_l_HPATH_START, at> > {}; struct searched_path : semantic, list, c::path_sep> {}; struct forbid_prefix_triple_slash : sor, failure> {}; struct forbid_prefix_double_slash_no_interp : sor< not_at, not_at>, failure > {}; // legacy parser rules struct _str_rest : seq< must, opt>, must, star< sor< literal<_path_str>, interpolation > > > {}; }; struct path : _path, sor< seq< sor<_path::anchor, _path::home_anchor>, _path::_str_rest >, seq, _path::searched_path, one<'>'>> > {}; struct _formal { struct name : semantic, t::identifier {}; struct default_value : semantic, must {}; }; struct formal : semantic, _formal, seq< _formal::name, opt, seps, _formal::default_value> > {}; struct _formals { struct ellipsis : semantic, p::ellipsis {}; }; struct formals : semantic, _formals, seq< one<'{'>, seps, // formals and attrsets share a two-token head sequence ('{' ). // this rule unrolls the formals list a bit to provide better error messages than // "expected '='" at the first ',' if formals are incorrect. 
sor< one<'}'>, seq<_formals::ellipsis, seps, must>>, seq< formal, seps, if_then_else< at>, seq< star, seps, formal, seps>, opt, seps, opt<_formals::ellipsis, seps>>, must> >, one<'}'> > > > > {}; struct _attr { struct simple : semantic, sor {}; struct string : semantic, seq {}; struct expr : semantic, seq< TAO_PEGTL_STRING("${"), seps, must, seps, must> > {}; }; struct attr : _attr, sor< _attr::simple, _attr::string, _attr::expr > {}; struct attrpath : list, t::sep> {}; struct _inherit { struct from : semantic, must {}; struct attrs : list {}; }; struct inherit : _inherit, seq< t::kw_inherit, seps, opt, seps, _inherit::from, seps, must>, seps>, opt<_inherit::attrs, seps>, must> > {}; struct _binding { struct path : semantic, attrpath {}; struct equal : one<'='> {}; struct value : semantic, must {}; }; struct binding : _binding, seq< _binding::path, seps, must<_binding::equal>, seps, _binding::value, seps, must> > {}; struct bindings : opt, seps>> {}; struct op { enum class kind { // NOTE non-associativity is *NOT* handled in the grammar structure. // handling it in the grammar itself instead of in semantic actions // slows down the parser significantly and makes the rules *much* // harder to read. maybe this will be different at some point when // ! does not sit between two binary precedence levels. nonAssoc, leftAssoc, rightAssoc, unary, }; template struct _op : Rule { static constexpr unsigned precedence = Precedence; static constexpr op::kind kind = Kind; }; struct unary_minus : _op {}; // treating this like a unary postfix operator is sketchy, but that's // the most reasonable way to implement the operator precedence set forth // by the language way back. it'd be much better if `.` and `?` had the same // precedence, but alas. 
struct has_attr : _op, seps, must>, 4> {}; struct concat : _op {}; struct mul : _op, 6> {}; struct div : _op {}; struct plus : _op, 7> {}; struct minus : _op {}; struct not_ : _op, 8, kind::unary> {}; struct update : _op {}; struct less_eq : _op {}; struct greater_eq : _op="), 10, kind::nonAssoc> {}; struct less : _op, 10, kind::nonAssoc> {}; struct greater : _op'>, 10, kind::nonAssoc> {}; struct equals : _op {}; struct not_equals : _op {}; struct and_ : _op {}; struct or_ : _op {}; struct implies : _op"), 14, kind::rightAssoc> {}; struct pipe_right : _op"), 15> {}; struct pipe_left : _op {}; }; struct _expr { template class OpenMod = seq, typename... Init> struct _attrset : seq< Init..., OpenMod>, seps, bindings, seps, must> > {}; struct select; struct id : semantic, t::identifier {}; struct int_ : semantic, t::integer {}; struct float_ : semantic, t::floating {}; struct string : semantic, seq {}; struct ind_string : semantic, seq {}; struct path : semantic, seq {}; struct uri : semantic, t::uri {}; struct ancient_let : semantic, _attrset {}; struct rec_set : semantic, _attrset {}; struct set : semantic, _attrset<> {}; struct _list { struct entry : semantic, seq {}; struct as_app_or : semantic, t::kw_or {}; }; struct _app { struct first_arg : semantic, seq {}; // can be used to stash a position of the application head node struct select_or_fn : seq