From e6cd67591b44b4902bac73febcab3c4d96724aea Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Sun, 16 Jun 2024 23:10:09 +0200 Subject: libexpr: rewrite the parser with pegtl instead of flex/bison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this gives about 20% performance improvements on pure parsing. obviously it will be less on full eval, but depending on how much parsing is to be done (e.g. including hackage-packages.nix or not) it's more like 4%-10%. this has been tested (with thousands of core hours of fuzzing) to ensure that the ASTs produced by the new parser are exactly the same as the old one would have produced. error messages will change (sometimes by a lot) and are not yet perfect, but we would rather leave this as is for later. test results for running only the parser (excluding the variable binding code) in a tight loop with inputs and parameters as given are promising: - 40% faster on lix's package.nix at 10000 iterations - 1.3% faster on nixpkgs all-packages.nix at 1000 iterations - equivalent on all of nixpkgs concatenated at 100 iterations (excluding invalid files, each file surrounded with parens) more realistic benchmarks are somewhere in between the extremes, parsing once again getting the largest uplift. other realistic workloads improve by a few percentage points as well, notably system builds are 4% faster. Benchmarks summary (from ./bench/summarize.jq bench/bench-*.json) old/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix mean: 0.408s ± 0.025s user: 0.355s | system: 0.033s median: 0.389s range: 0.388s ... 0.442s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix mean: 0.332s ± 0.024s user: 0.279s | system: 0.033s median: 0.314s range: 0.313s ... 0.361s relative: 0.814 --- old/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import {}; system' mean: 6.133s ± 0.022s user: 5.395s | system: 0.437s median: 6.128s range: 6.099s ... 6.183s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import {}; system' mean: 5.925s ± 0.025s user: 5.176s | system: 0.456s median: 5.934s range: 5.861s ... 5.943s relative: 0.966 --- GC_INITIAL_HEAP_SIZE=10g old/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import {}; system' mean: 4.503s ± 0.027s user: 3.731s | system: 0.547s median: 4.499s range: 4.478s ... 4.541s relative: 1 GC_INITIAL_HEAP_SIZE=10g new/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import {}; system' mean: 4.285s ± 0.031s user: 3.504s | system: 0.571s median: 4.281s range: 4.221s ... 4.328s relative: 0.951 --- old/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello mean: 16.475s ± 0.07s user: 14.088s | system: 1.572s median: 16.495s range: 16.351s ... 16.536s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello mean: 15.973s ± 0.013s user: 13.558s | system: 1.615s median: 15.973s range: 15.946s ... 15.99s relative: 0.97 --- Change-Id: Ie66ec2d045dec964632c6541e25f8f0797319ee2 --- flake.nix | 2 + meson.build | 10 +- meson/cleanup-install.bash | 50 -- misc/pegtl.nix | 23 + package.nix | 6 +- src/libexpr/eval.cc | 16 - src/libexpr/lexer.l | 302 -------- src/libexpr/meson.build | 58 +- src/libexpr/parser-state.hh | 282 ------- src/libexpr/parser.y | 503 ------------ src/libexpr/parser/change_head.hh | 66 ++ src/libexpr/parser/grammar.hh | 707 +++++++++++++++++ src/libexpr/parser/parser.cc | 862 +++++++++++++++++++++ src/libexpr/parser/state.hh | 259 +++++++ tests/functional/lang/parse-fail-eof-pos.err.exp | 2 +- .../functional/lang/parse-fail-undef-var-2.err.exp | 2 +- tests/functional/lang/parse-fail-utf8.err.exp | 2 +- 17 files changed, 1936 insertions(+), 1216 deletions(-) delete mode 100755 meson/cleanup-install.bash create mode 100644 misc/pegtl.nix delete mode 100644 src/libexpr/lexer.l delete mode 100644 src/libexpr/parser-state.hh delete mode 100644 src/libexpr/parser.y create mode 100644 src/libexpr/parser/change_head.hh create mode 100644 src/libexpr/parser/grammar.hh create mode 100644 src/libexpr/parser/parser.cc create mode 100644 src/libexpr/parser/state.hh diff --git a/flake.nix b/flake.nix index 372983f6d..5c764d73e 100644 --- a/flake.nix +++ b/flake.nix @@ -195,6 +195,8 @@ busybox-sandbox-shell = final.busybox-sandbox-shell or final.default-busybox-sandbox-shell; }; + pegtl = final.callPackage ./misc/pegtl.nix { }; + # Export the patched version of boehmgc that Lix uses into the overlay # for consumers of this flake. boehmgc-nix = final.nix.boehmgc-nix; diff --git a/meson.build b/meson.build index e6151e0a2..0cb2030e7 100644 --- a/meson.build +++ b/meson.build @@ -287,6 +287,14 @@ gtest = [ toml11 = dependency('toml11', version : '>=3.7.0', required : true, method : 'cmake') +pegtl = dependency( + 'pegtl', + version : '>=3.2.7', + required : true, + method : 'cmake', + modules : [ 'taocpp::pegtl' ], +) + nlohmann_json = dependency('nlohmann_json', required : true) # lix-doc is a Rust project provided via buildInputs and unfortunately doesn't have any way to be detected. @@ -335,8 +343,6 @@ endif # that busybox sh won't run busybox applets as builtins (which would break our sandbox). lsof = find_program('lsof', native : true) -bison = find_program('bison', native : true) -flex = find_program('flex', native : true) # This is how Nix does generated headers... # other instances of header generation use a very similar command. diff --git a/meson/cleanup-install.bash b/meson/cleanup-install.bash deleted file mode 100755 index 928edc74a..000000000 --- a/meson/cleanup-install.bash +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# Meson will call this with an absolute path to Bash. -# The shebang is just for convenience. - -# The parser and lexer tab are generated via custom Meson targets in src/libexpr/meson.build, -# but Meson doesn't support marking only part of a target for install. The generation creates -# both headers (parser-tab.hh, lexer-tab.hh) and source files (parser-tab.cc, lexer-tab.cc), -# and we definitely want the former installed, but not the latter. This script is added to -# Meson's install steps to correct this, as the logic for it is just complex enough to -# warrant separate and careful handling, because both Meson's configured include directory -# may or may not be an absolute path, and DESTDIR may or may not be set at all, but can't be -# manipulated in Meson logic. - -set -euo pipefail - -echo "cleanup-install: removing Meson-placed C++ sources from dest includedir" - -if [[ "${1/--help/}" != "$1" ]]; then - echo "cleanup-install: this script should only be called from the Meson build system" - exit 1 -fi - -# Ensure the includedir was passed as the first argument -# (set -u will make this fail otherwise). -includedir="$1" -# And then ensure that first argument is a directory that exists. -if ! [[ -d "$1" ]]; then - echo "cleanup-install: this script should only be called from the Meson build system" - echo "argv[1] (${1@Q}) is not a directory" - exit 2 -fi - -# If DESTDIR environment variable is set, prepend it to the include dir. -# Unfortunately, we cannot do this on the Meson side. We do have an environment variable -# `MESON_INSTALL_DESTDIR_PREFIX`, but that will not refer to the include directory if -# includedir has been set separately, which Lix's split-output derivation does. -# We also cannot simply do an inline bash conditional like "${DESTDIR:=}" or similar, -# because we need to specifically *join* DESTDIR and includedir with a slash, and *not* -# have a slash if DESTDIR isn't set at all, since $includedir could be a relative directory. -# Finally, DESTDIR is only available to us as an environment variable in these install scripts, -# not in Meson logic. -# Therefore, our best option is to have Meson pass this script the configured includedir, -# and perform this dance with it and $DESTDIR. -if [[ -n "${DESTDIR:-}" ]]; then - includedir="$DESTDIR/$includedir" -fi - -# Intentionally not using -f. -# If these files don't exist then our assumptions have been violated and we should fail. -rm -v "$includedir/lix/libexpr/parser-tab.cc" "$includedir/lix/libexpr/lexer-tab.cc" diff --git a/misc/pegtl.nix b/misc/pegtl.nix new file mode 100644 index 000000000..3fd999d9d --- /dev/null +++ b/misc/pegtl.nix @@ -0,0 +1,23 @@ +{ + stdenv, + cmake, + ninja, + fetchFromGitHub, +}: + +stdenv.mkDerivation { + pname = "pegtl"; + version = "3.2.7"; + + src = fetchFromGitHub { + repo = "PEGTL"; + owner = "taocpp"; + rev = "refs/tags/3.2.7"; + hash = "sha256-IV5YNGE4EWVrmg2Sia/rcU8jCuiBynQGJM6n3DCWTQU="; + }; + + nativeBuildInputs = [ + cmake + ninja + ]; +} diff --git a/package.nix b/package.nix index 988379618..0f194796f 100644 --- a/package.nix +++ b/package.nix @@ -10,7 +10,6 @@ boehmgc-nix ? __forDefaults.boehmgc-nix, boehmgc, nlohmann_json, - bison, build-release-notes ? __forDefaults.build-release-notes, boost, brotli, @@ -20,7 +19,6 @@ doxygen, editline-lix ? __forDefaults.editline-lix, editline, - flex, git, gtest, jq, @@ -36,6 +34,7 @@ meson, ninja, openssl, + pegtl, pkg-config, python3, rapidcheck, @@ -210,8 +209,6 @@ stdenv.mkDerivation (finalAttrs: { nativeBuildInputs = [ - bison - flex python3 meson ninja @@ -250,6 +247,7 @@ stdenv.mkDerivation (finalAttrs: { libsodium toml11 lix-doc + pegtl ] ++ lib.optionals hostPlatform.isLinux [ libseccomp diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc index afee89420..a6a64a43c 100644 --- a/src/libexpr/eval.cc +++ b/src/libexpr/eval.cc @@ -18,7 +18,6 @@ #include "gc-small-vector.hh" #include "fetch-to-store.hh" #include "flake/flakeref.hh" -#include "parser-tab.hh" #include #include @@ -2958,21 +2957,6 @@ std::optional EvalState::resolveSearchPathPath(const SearchPath::Pa } -Expr * EvalState::parse( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - std::shared_ptr & staticEnv) -{ - auto result = parseExprFromBuf(text, length, origin, basePath, symbols, positions, exprSymbols); - - result->bindVars(*this, staticEnv); - - return result; -} - - std::string ExternalValueBase::coerceToString(EvalState & state, const PosIdx & pos, NixStringContext & context, bool copyMore, bool copyToStore) const { state.error( diff --git a/src/libexpr/lexer.l b/src/libexpr/lexer.l deleted file mode 100644 index 5bc815f00..000000000 --- a/src/libexpr/lexer.l +++ /dev/null @@ -1,302 +0,0 @@ -%option reentrant bison-bridge bison-locations -%option align -%option noyywrap -%option never-interactive -%option stack -%option nodefault -%option nounput noyy_top_state - - -%s DEFAULT -%x STRING -%x IND_STRING -%x INPATH -%x INPATH_SLASH -%x PATH_START - - -%{ -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wunneeded-internal-declaration" -#endif - -// yacc generates code that uses unannotated fallthrough. -#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wimplicit-fallthrough" -#endif - -#include "nixexpr.hh" -#include "parser-tab.hh" -#include "strings.hh" - -using namespace nix; - -#define THROW(...) \ - do { \ - state->error.reset(new auto(__VA_ARGS__)); \ - return YYerror; \ - } while (0) - -namespace nix { - -#define CUR_POS state->at(*yylloc) - -static void initLoc(YYLTYPE * loc) -{ - loc->first_line = loc->last_line = 0; - loc->first_column = loc->last_column = 0; -} - -static void adjustLoc(YYLTYPE * loc, const char * s, size_t len) -{ - loc->stash(); - - loc->first_column = loc->last_column; - loc->last_column += len; -} - - -// we make use of the fact that the parser receives a private copy of the input -// string and can munge around in it. -static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length) -{ - char * result = s; - char * t = s; - char c; - // the input string is terminated with *two* NULs, so we can safely take - // *one* character after the one being checked against. - while ((c = *s++)) { - if (c == '\\') { - c = *s++; - if (c == 'n') *t = '\n'; - else if (c == 'r') *t = '\r'; - else if (c == 't') *t = '\t'; - else *t = c; - } - else if (c == '\r') { - /* Normalise CR and CR/LF into LF. */ - *t = '\n'; - if (*s == '\n') s++; /* cr/lf */ - } - else *t = c; - t++; - } - return {result, size_t(t - result)}; -} - - -} - -#define YY_USER_INIT initLoc(yylloc) -#define YY_USER_ACTION adjustLoc(yylloc, yytext, yyleng); - -#define PUSH_STATE(state) yy_push_state(state, yyscanner) -#define POP_STATE() yy_pop_state(yyscanner) - -%} - - -ANY .|\n -ID [a-zA-Z\_][a-zA-Z0-9\_\'\-]* -INT [0-9]+ -FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)? -PATH_CHAR [a-zA-Z0-9\.\_\-\+] -PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/? -PATH_SEG {PATH_CHAR}*\/ -HPATH \~(\/{PATH_CHAR}+)+\/? -HPATH_START \~\/ -SPATH \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\> -URI [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+ - - -%% - - -if { return IF; } -then { return THEN; } -else { return ELSE; } -assert { return ASSERT; } -with { return WITH; } -let { return LET; } -in { return IN; } -rec { return REC; } -inherit { return INHERIT; } -or { return OR_KW; } -\.\.\. { return ELLIPSIS; } - -\=\= { return EQ; } -\!\= { return NEQ; } -\<\= { return LEQ; } -\>\= { return GEQ; } -\&\& { return AND; } -\|\| { return OR; } -\-\> { return IMPL; } -\/\/ { return UPDATE; } -\+\+ { return CONCAT; } - -{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; } -{INT} { errno = 0; - std::optional numMay = string2Int(yytext); - if (numMay.has_value()) { - yylval->n = *numMay; - } else { - THROW(ParseError(ErrorInfo{ - .msg = HintFmt("invalid integer '%1%'", yytext), - .pos = state->positions[CUR_POS], - })); - } - return INT; - } -{FLOAT} { errno = 0; - yylval->nf = strtod(yytext, 0); - if (errno != 0) - THROW(ParseError(ErrorInfo{ - .msg = HintFmt("invalid float '%1%'", yytext), - .pos = state->positions[CUR_POS], - })); - return FLOAT; - } - -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } - -\} { /* State INITIAL only exists at the bottom of the stack and is - used as a marker. DEFAULT replaces it everywhere else. - Popping when in INITIAL state causes an empty stack exception, - so don't */ - if (YYSTATE != INITIAL) - POP_STATE(); - return '}'; - } -\{ { PUSH_STATE(DEFAULT); return '{'; } - -\" { PUSH_STATE(STRING); return '"'; } -([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" | -([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ { - /* It is impossible to match strings ending with '$' with one - regex because trailing contexts are only valid at the end - of a rule. (A sane but undocumented limitation.) */ - yylval->str = unescapeStr(state->symbols, yytext, yyleng); - return STR; - } -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } -\" { POP_STATE(); return '"'; } -\$|\\|\$\\ { - /* This can only occur when we reach EOF, otherwise the above - (...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered. - This is technically invalid, but we leave the problem to the - parser who fails with exact location. */ - return EOF; - } - -\'\'(\ *\n)? { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; } -([^\$\']|\$[^\{\']|\'[^\'\$])+ { - yylval->str = {yytext, (size_t) yyleng, true}; - return IND_STR; - } -\'\'\$ | -\$ { - yylval->str = {"$", 1}; - return IND_STR; - } -\'\'\' { - yylval->str = {"''", 2}; - return IND_STR; - } -\'\'\\{ANY} { - yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2); - return IND_STR; - } -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } -\'\' { POP_STATE(); return IND_STRING_CLOSE; } -\' { - yylval->str = {"'", 1}; - return IND_STR; - } - -{PATH_SEG}\$\{ | -{HPATH_START}\$\{ { - PUSH_STATE(PATH_START); - yyless(0); - yylloc->unstash(); -} - -{PATH_SEG} { - POP_STATE(); - PUSH_STATE(INPATH_SLASH); - yylval->path = {yytext, (size_t) yyleng}; - return PATH; -} - -{HPATH_START} { - POP_STATE(); - PUSH_STATE(INPATH_SLASH); - yylval->path = {yytext, (size_t) yyleng}; - return HPATH; -} - -{PATH} { - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->path = {yytext, (size_t) yyleng}; - return PATH; -} -{HPATH} { - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->path = {yytext, (size_t) yyleng}; - return HPATH; -} - -\$\{ { - POP_STATE(); - PUSH_STATE(INPATH); - PUSH_STATE(DEFAULT); - return DOLLAR_CURLY; -} -{PATH}|{PATH_SEG}|{PATH_CHAR}+ { - POP_STATE(); - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->str = {yytext, (size_t) yyleng}; - return STR; -} -{ANY} | -<> { - /* if we encounter a non-path character we inform the parser that the path has - ended with a PATH_END token and re-parse this character in the default - context (it may be ')', ';', or something of that sort) */ - POP_STATE(); - yyless(0); - yylloc->unstash(); - return PATH_END; -} - -{ANY} | -<> { - THROW(ParseError(ErrorInfo{ - .msg = HintFmt("path has a trailing slash"), - .pos = state->positions[CUR_POS], - })); -} - -{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; } -{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; } - -[ \t\r\n]+ /* eat up whitespace */ -\#[^\r\n]* /* single-line comments */ -\/\*([^*]|\*+[^*/])*\*+\/ /* long comments */ - -{ANY} { - /* Don't return a negative number, as this will cause - Bison to stop parsing without an error. */ - return (unsigned char) yytext[0]; - } - -%% diff --git a/src/libexpr/meson.build b/src/libexpr/meson.build index 080fdb443..39493dadc 100644 --- a/src/libexpr/meson.build +++ b/src/libexpr/meson.build @@ -1,54 +1,3 @@ -parser_tab = custom_target( - input : 'parser.y', - output : [ - 'parser-tab.cc', - 'parser-tab.hh', - ], - command : [ - 'bison', - '-v', - '-o', - '@OUTPUT0@', - '@INPUT@', - '-d', - ], - # NOTE(Qyriad): Meson doesn't support installing only part of a custom target, so we add - # an install script below which removes parser-tab.cc. - install : true, - install_dir : includedir / 'lix/libexpr', -) - -lexer_tab = custom_target( - input : [ - 'lexer.l', - parser_tab, - ], - output : [ - 'lexer-tab.cc', - 'lexer-tab.hh', - ], - command : [ - 'flex', - '--outfile', - '@OUTPUT0@', - '--header-file=' + '@OUTPUT1@', - '@INPUT0@', - ], - # NOTE(Qyriad): Meson doesn't support installing only part of a custom target, so we add - # an install script below which removes lexer-tab.cc. - install : true, - install_dir : includedir / 'lix/libexpr', -) - -# TODO(Qyriad): When the parser and lexer are rewritten this should be removed. -# NOTE(Qyriad): We do this this way instead of an inline bash or rm command -# due to subtleties in Meson. Check the comments in cleanup-install.bash for details. -meson.add_install_script( - bash, - meson.project_source_root() / 'meson/cleanup-install.bash', - '@0@'.format(includedir), -) - libexpr_generated_headers = [ gen_header.process('primops/derivation.nix', preserve_path_from : meson.current_source_dir()), ] @@ -75,6 +24,7 @@ libexpr_sources = files( 'get-drvs.cc', 'json-to-value.cc', 'nixexpr.cc', + 'parser/parser.cc', 'paths.cc', 'primops.cc', 'print-ambiguous.cc', @@ -110,7 +60,9 @@ libexpr_headers = files( 'get-drvs.hh', 'json-to-value.hh', 'nixexpr.hh', - 'parser-state.hh', + 'parser/change_head.hh', + 'parser/grammar.hh', + 'parser/state.hh', 'pos-idx.hh', 'pos-table.hh', 'primops.hh', @@ -129,8 +81,6 @@ libexpr_headers = files( libexpr = library( 'lixexpr', libexpr_sources, - parser_tab, - lexer_tab, libexpr_generated_headers, dependencies : [ liblixutil, diff --git a/src/libexpr/parser-state.hh b/src/libexpr/parser-state.hh deleted file mode 100644 index cb1f12230..000000000 --- a/src/libexpr/parser-state.hh +++ /dev/null @@ -1,282 +0,0 @@ -#pragma once -///@file - -#include "eval.hh" - -namespace nix { - -/** - * @note Storing a C-style `char *` and `size_t` allows us to avoid - * having to define the special members that using string_view here - * would implicitly delete. - */ -struct StringToken -{ - const char * p; - size_t l; - bool hasIndentation; - operator std::string_view() const { return {p, l}; } -}; - -struct ParserLocation -{ - int first_line, first_column; - int last_line, last_column; - - // backup to recover from yyless(0) - int stashed_first_column, stashed_last_column; - - void stash() { - stashed_first_column = first_column; - stashed_last_column = last_column; - } - - void unstash() { - first_column = stashed_first_column; - last_column = stashed_last_column; - } -}; - -struct ParserState -{ - SymbolTable & symbols; - PosTable & positions; - Expr * result; - SourcePath basePath; - PosTable::Origin origin; - const Expr::AstSymbols & s; - std::unique_ptr error; - - [[nodiscard]] ParseError dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos); - [[nodiscard]] ParseError dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos); - [[nodiscard]] std::optional addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos); - [[nodiscard]] std::optional validateFormals(Formals * formals, PosIdx pos = noPos, Symbol arg = {}); - std::unique_ptr stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es); - PosIdx at(const ParserLocation & loc); -}; - -inline ParseError ParserState::dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos) -{ - return ParseError({ - .msg = HintFmt("attribute '%1%' already defined at %2%", - showAttrPath(symbols, attrPath), positions[prevPos]), - .pos = positions[pos] - }); -} - -inline ParseError ParserState::dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos) -{ - return ParseError({ - .msg = HintFmt("attribute '%1%' already defined at %2%", symbols[attr], positions[prevPos]), - .pos = positions[pos] - }); -} - -inline std::optional ParserState::addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos) -{ - AttrPath::iterator i; - // All attrpaths have at least one attr - assert(!attrPath.empty()); - // Checking attrPath validity. - // =========================== - for (i = attrPath.begin(); i + 1 < attrPath.end(); i++) { - if (i->symbol) { - ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); - if (j != attrs->attrs.end()) { - if (j->second.kind != ExprAttrs::AttrDef::Kind::Inherited) { - ExprAttrs * attrs2 = dynamic_cast(j->second.e.get()); - if (!attrs2) { - attrPath.erase(i + 1, attrPath.end()); - return dupAttr(attrPath, pos, j->second.pos); - } - attrs = attrs2; - } else { - attrPath.erase(i + 1, attrPath.end()); - return dupAttr(attrPath, pos, j->second.pos); - } - } else { - auto next = attrs->attrs.emplace(std::piecewise_construct, - std::tuple(i->symbol), - std::tuple(std::make_unique(), pos)); - attrs = static_cast(next.first->second.e.get()); - } - } else { - auto & next = attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::make_unique(), pos); - attrs = static_cast(next.valueExpr.get()); - } - } - // Expr insertion. - // ========================== - if (i->symbol) { - ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); - if (j != attrs->attrs.end()) { - // This attr path is already defined. However, if both - // e and the expr pointed by the attr path are two attribute sets, - // we want to merge them. - // Otherwise, throw an error. - auto * ae = dynamic_cast(e.get()); - auto * jAttrs = dynamic_cast(j->second.e.get()); - if (jAttrs && ae) { - if (ae->inheritFromExprs && !jAttrs->inheritFromExprs) - jAttrs->inheritFromExprs = std::make_unique>>(); - for (auto & ad : ae->attrs) { - auto j2 = jAttrs->attrs.find(ad.first); - if (j2 != jAttrs->attrs.end()) // Attr already defined in iAttrs, error. - return dupAttr(ad.first, j2->second.pos, ad.second.pos); - if (ad.second.kind == ExprAttrs::AttrDef::Kind::InheritedFrom) { - auto & sel = dynamic_cast(*ad.second.e); - auto & from = dynamic_cast(*sel.e); - from.displ += jAttrs->inheritFromExprs->size(); - } - jAttrs->attrs.emplace(ad.first, std::move(ad.second)); - } - std::ranges::move(ae->dynamicAttrs, std::back_inserter(jAttrs->dynamicAttrs)); - if (ae->inheritFromExprs) - std::ranges::move(*ae->inheritFromExprs, std::back_inserter(*jAttrs->inheritFromExprs)); - } else { - return dupAttr(attrPath, pos, j->second.pos); - } - } else { - // This attr path is not defined. Let's create it. - e->setName(i->symbol); - attrs->attrs.emplace(std::piecewise_construct, - std::tuple(i->symbol), - std::tuple(std::move(e), pos)); - } - } else { - attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::move(e), pos); - } - - return {}; -} - -inline std::optional ParserState::validateFormals(Formals * formals, PosIdx pos, Symbol arg) -{ - std::sort(formals->formals.begin(), formals->formals.end(), - [] (const auto & a, const auto & b) { - return std::tie(a.name, a.pos) < std::tie(b.name, b.pos); - }); - - std::optional> duplicate; - for (size_t i = 0; i + 1 < formals->formals.size(); i++) { - if (formals->formals[i].name != formals->formals[i + 1].name) - continue; - std::pair thisDup{formals->formals[i].name, formals->formals[i + 1].pos}; - duplicate = std::min(thisDup, duplicate.value_or(thisDup)); - } - if (duplicate) - return ParseError({ - .msg = HintFmt("duplicate formal function argument '%1%'", symbols[duplicate->first]), - .pos = positions[duplicate->second] - }); - - if (arg && formals->has(arg)) - return ParseError({ - .msg = HintFmt("duplicate formal function argument '%1%'", symbols[arg]), - .pos = positions[pos] - }); - - return {}; -} - -inline std::unique_ptr ParserState::stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es) -{ - if (es.empty()) return std::make_unique(""); - - /* Figure out the minimum indentation. Note that by design - whitespace-only final lines are not taken into account. (So - the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ - bool atStartOfLine = true; /* = seen only whitespace in the current line */ - size_t minIndent = 1000000; - size_t curIndent = 0; - for (auto & [i_pos, i] : es) { - auto * str = std::get_if(&i); - if (!str || !str->hasIndentation) { - /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ - if (atStartOfLine) { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - continue; - } - for (size_t j = 0; j < str->l; ++j) { - if (atStartOfLine) { - if (str->p[j] == ' ') - curIndent++; - else if (str->p[j] == '\n') { - /* Empty line, doesn't influence minimum - indentation. */ - curIndent = 0; - } else { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - } else if (str->p[j] == '\n') { - atStartOfLine = true; - curIndent = 0; - } - } - } - - /* Strip spaces from each line. */ - std::vector>> es2; - atStartOfLine = true; - size_t curDropped = 0; - size_t n = es.size(); - auto i = es.begin(); - const auto trimExpr = [&] (std::unique_ptr e) { - atStartOfLine = false; - curDropped = 0; - es2.emplace_back(i->first, std::move(e)); - }; - const auto trimString = [&] (const StringToken t) { - std::string s2; - for (size_t j = 0; j < t.l; ++j) { - if (atStartOfLine) { - if (t.p[j] == ' ') { - if (curDropped++ >= minIndent) - s2 += t.p[j]; - } - else if (t.p[j] == '\n') { - curDropped = 0; - s2 += t.p[j]; - } else { - atStartOfLine = false; - curDropped = 0; - s2 += t.p[j]; - } - } else { - s2 += t.p[j]; - if (t.p[j] == '\n') atStartOfLine = true; - } - } - - /* Remove the last line if it is empty and consists only of - spaces. */ - if (n == 1) { - std::string::size_type p = s2.find_last_of('\n'); - if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) - s2 = std::string(s2, 0, p + 1); - } - - es2.emplace_back(i->first, std::make_unique(std::move(s2))); - }; - for (; i != es.end(); ++i, --n) { - std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); - } - - /* If this is a single string, then don't do a concatenation. */ - if (es2.size() == 1 && dynamic_cast(es2[0].second.get())) { - return std::move(es2[0].second); - } - return std::make_unique(pos, true, std::move(es2)); -} - -inline PosIdx ParserState::at(const ParserLocation & loc) -{ - return positions.add(origin, loc.first_column); -} - -} diff --git a/src/libexpr/parser.y b/src/libexpr/parser.y deleted file mode 100644 index b825f2ed8..000000000 --- a/src/libexpr/parser.y +++ /dev/null @@ -1,503 +0,0 @@ -%glr-parser -%define api.pure -%locations -%define parse.error verbose -%defines -/* %no-lines */ -%parse-param { void * scanner } -%parse-param { nix::ParserState * state } -%lex-param { void * scanner } -%lex-param { nix::ParserState * state } -%expect 1 -%expect-rr 1 - -%code requires { - -#ifndef BISON_HEADER -#define BISON_HEADER - -#include - -#include "finally.hh" -#include "users.hh" - -#include "nixexpr.hh" -#include "eval.hh" -#include "eval-settings.hh" -#include "globals.hh" -#include "parser-state.hh" - -#define YYLTYPE ::nix::ParserLocation -#define YY_DECL int yylex \ - (YYSTYPE * yylval_param, YYLTYPE * yylloc_param, yyscan_t yyscanner, nix::ParserState * state) - -namespace nix { - -Expr * parseExprFromBuf( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - SymbolTable & symbols, - PosTable & positions, - const Expr::AstSymbols & astSymbols); - -} - -#endif - -} - -%{ - -#include "parser-tab.hh" -#include "lexer-tab.hh" - -YY_DECL; - -using namespace nix; - -#define CUR_POS state->at(*yylocp) - -// otherwise destructors cause compiler errors -#pragma GCC diagnostic ignored "-Wswitch-enum" - -#define THROW(err, ...) \ - do { \ - state->error.reset(new auto(err)); \ - [](auto... d) { (delete d, ...); }(__VA_ARGS__); \ - YYABORT; \ - } while (0) - -void yyerror(YYLTYPE * loc, yyscan_t scanner, ParserState * state, const char * error) -{ - if (std::string_view(error).starts_with("syntax error, unexpected end of file")) { - loc->first_column = loc->last_column; - loc->first_line = loc->last_line; - } - throw ParseError({ - .msg = HintFmt(error), - .pos = state->positions[state->at(*loc)] - }); -} - -template -static std::unique_ptr unp(T * e) -{ - return std::unique_ptr(e); -} - -template, typename... Args> -static std::vector vec(Args && ... args) -{ - std::vector result; - result.reserve(sizeof...(Args)); - (result.emplace_back(std::forward(args)), ...); - return result; -} - - -%} - -%union { - // !!! We're probably leaking stuff here. - nix::Expr * e; - nix::ExprList * list; - nix::ExprAttrs * attrs; - nix::Formals * formals; - nix::Formal * formal; - nix::NixInt n; - nix::NixFloat nf; - nix::StringToken id; // !!! -> Symbol - nix::StringToken path; - nix::StringToken uri; - nix::StringToken str; - std::vector * attrNames; - std::vector> * inheritAttrs; - std::vector>> * string_parts; - std::vector, nix::StringToken>>> * ind_string_parts; -} - -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } - -%type start -%type expr expr_function expr_if expr_op -%type expr_select expr_simple expr_app -%type expr_list -%type binds -%type formals -%type formal -%type attrpath -%type attrs -%type string_parts_interpolated -%type ind_string_parts -%type path_start string_parts string_attr -%type attr -%token ID -%token STR IND_STR -%token INT -%token FLOAT -%token PATH HPATH SPATH PATH_END -%token URI -%token IF THEN ELSE ASSERT WITH LET IN REC INHERIT EQ NEQ AND OR IMPL OR_KW -%token DOLLAR_CURLY /* == ${ */ -%token IND_STRING_OPEN IND_STRING_CLOSE -%token ELLIPSIS - -%right IMPL -%left OR -%left AND -%nonassoc EQ NEQ -%nonassoc '<' '>' LEQ GEQ -%right UPDATE -%left NOT -%left '+' '-' -%left '*' '/' -%right CONCAT -%nonassoc '?' -%nonassoc NEGATE - -%% - -start: expr { state->result = $1; $$ = 0; }; - -expr: expr_function; - -expr_function - : ID ':' expr_function - { $$ = new ExprLambda(CUR_POS, state->symbols.create($1), nullptr, unp($3)); } - | '{' formals '}' ':' expr_function - { if (auto e = state->validateFormals($2)) THROW(*e); - $$ = new ExprLambda(CUR_POS, unp($2), unp($5)); - } - | '{' formals '}' '@' ID ':' expr_function - { - auto arg = state->symbols.create($5); - if (auto e = state->validateFormals($2, CUR_POS, arg)) THROW(*e, $2, $7); - $$ = new ExprLambda(CUR_POS, arg, unp($2), unp($7)); - } - | ID '@' '{' formals '}' ':' expr_function - { - auto arg = state->symbols.create($1); - if (auto e = state->validateFormals($4, CUR_POS, arg)) THROW(*e, $4, $7); - $$ = new ExprLambda(CUR_POS, arg, unp($4), unp($7)); - } - | ASSERT expr ';' expr_function - { $$ = new ExprAssert(CUR_POS, unp($2), unp($4)); } - | WITH expr ';' expr_function - { $$ = new ExprWith(CUR_POS, unp($2), unp($4)); } - | LET binds IN expr_function - { if (!$2->dynamicAttrs.empty()) - THROW(ParseError({ - .msg = HintFmt("dynamic attributes not allowed in let"), - .pos = state->positions[CUR_POS] - }), $2, $4); - $$ = new ExprLet(unp($2), unp($4)); - } - | expr_if - ; - -expr_if - : IF expr THEN expr ELSE expr { $$ = new ExprIf(CUR_POS, unp($2), unp($4), unp($6)); } - | expr_op - ; - -expr_op - : '!' expr_op %prec NOT { $$ = new ExprOpNot(unp($2)); } - | '-' expr_op %prec NEGATE { $$ = new ExprCall(CUR_POS, std::make_unique(state->s.sub), vec(std::make_unique(0), unp($2))); } - | expr_op EQ expr_op { $$ = new ExprOpEq(unp($1), unp($3)); } - | expr_op NEQ expr_op { $$ = new ExprOpNEq(unp($1), unp($3)); } - | expr_op '<' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.lessThan), vec($1, $3)); } - | expr_op LEQ expr_op { $$ = new ExprOpNot(std::make_unique(state->at(@2), std::make_unique(state->s.lessThan), vec($3, $1))); } - | expr_op '>' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.lessThan), vec($3, $1)); } - | expr_op GEQ expr_op { $$ = new ExprOpNot(std::make_unique(state->at(@2), std::make_unique(state->s.lessThan), vec($1, $3))); } - | expr_op AND expr_op { $$ = new ExprOpAnd(state->at(@2), unp($1), unp($3)); } - | expr_op OR expr_op { $$ = new ExprOpOr(state->at(@2), unp($1), unp($3)); } - | expr_op IMPL expr_op { $$ = new ExprOpImpl(state->at(@2), unp($1), unp($3)); } - | expr_op UPDATE expr_op { $$ = new ExprOpUpdate(state->at(@2), unp($1), unp($3)); } - | expr_op '?' attrpath { $$ = new ExprOpHasAttr(unp($1), std::move(*$3)); delete $3; } - | expr_op '+' expr_op - { $$ = new ExprConcatStrings(state->at(@2), false, vec>>(std::pair(state->at(@1), unp($1)), std::pair(state->at(@3), unp($3)))); } - | expr_op '-' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.sub), vec($1, $3)); } - | expr_op '*' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.mul), vec($1, $3)); } - | expr_op '/' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.div), vec($1, $3)); } - | expr_op CONCAT expr_op { $$ = new ExprOpConcatLists(state->at(@2), unp($1), unp($3)); } - | expr_app - ; - -expr_app - : expr_app expr_select { - if (auto e2 = dynamic_cast($1)) { - e2->args.emplace_back($2); - $$ = $1; - } else - $$ = new ExprCall(CUR_POS, unp($1), vec(unp($2))); - } - | expr_select - ; - -expr_select - : expr_simple '.' attrpath - { $$ = new ExprSelect(CUR_POS, unp($1), std::move(*$3), nullptr); delete $3; } - | expr_simple '.' attrpath OR_KW expr_select - { $$ = new ExprSelect(CUR_POS, unp($1), std::move(*$3), unp($5)); delete $3; } - | /* Backwards compatibility: because Nixpkgs has a rarely used - function named ‘or’, allow stuff like ‘map or [...]’. */ - expr_simple OR_KW - { $$ = new ExprCall(CUR_POS, unp($1), vec(std::make_unique(CUR_POS, state->s.or_))); } - | expr_simple - ; - -expr_simple - : ID { - std::string_view s = "__curPos"; - if ($1.l == s.size() && strncmp($1.p, s.data(), s.size()) == 0) - $$ = new ExprPos(CUR_POS); - else - $$ = new ExprVar(CUR_POS, state->symbols.create($1)); - } - | INT { $$ = new ExprInt($1); } - | FLOAT { $$ = new ExprFloat($1); } - | '"' string_parts '"' { $$ = $2; } - | IND_STRING_OPEN ind_string_parts IND_STRING_CLOSE { - $$ = state->stripIndentation(CUR_POS, std::move(*$2)).release(); - delete $2; - } - | path_start PATH_END - | path_start string_parts_interpolated PATH_END { - $2->emplace($2->begin(), state->at(@1), $1); - $$ = new ExprConcatStrings(CUR_POS, false, std::move(*$2)); - delete $2; - } - | SPATH { - std::string path($1.p + 1, $1.l - 2); - $$ = new ExprCall(CUR_POS, - std::make_unique(state->s.findFile), - vec(std::make_unique(state->s.nixPath), - std::make_unique(std::move(path)))); - } - | URI { - static bool noURLLiterals = experimentalFeatureSettings.isEnabled(Xp::NoUrlLiterals); - if (noURLLiterals) - THROW(ParseError({ - .msg = HintFmt("URL literals are disabled"), - .pos = state->positions[CUR_POS] - })); - $$ = new ExprString(std::string($1)); - } - | '(' expr ')' { $$ = $2; } - /* Let expressions `let {..., body = ...}' are just desugared - into `(rec {..., body = ...}).body'. */ - | LET '{' binds '}' - { $3->recursive = true; $$ = new ExprSelect(noPos, unp($3), state->s.body); } - | REC '{' binds '}' - { $3->recursive = true; $$ = $3; } - | '{' binds '}' - { $$ = $2; } - | '[' expr_list ']' { $$ = $2; } - ; - -string_parts - : STR { $$ = new ExprString(std::string($1)); } - | string_parts_interpolated - { $$ = new ExprConcatStrings(CUR_POS, true, std::move(*$1)); - delete $1; - } - | { $$ = new ExprString(""); } - ; - -string_parts_interpolated - : string_parts_interpolated STR - { $$ = $1; $1->emplace_back(state->at(@2), new ExprString(std::string($2))); } - | string_parts_interpolated DOLLAR_CURLY expr '}' { $$ = $1; $1->emplace_back(state->at(@2), $3); } - | DOLLAR_CURLY expr '}' { $$ = new std::vector>>; $$->emplace_back(state->at(@1), $2); } - | STR DOLLAR_CURLY expr '}' { - $$ = new std::vector>>; - $$->emplace_back(state->at(@1), new ExprString(std::string($1))); - $$->emplace_back(state->at(@2), $3); - } - ; - -path_start - : PATH { - Path path(absPath({$1.p, $1.l}, state->basePath.path.abs())); - /* add back in the trailing '/' to the first segment */ - if ($1.p[$1.l-1] == '/' && $1.l > 1) - path += "/"; - $$ = new ExprPath(path); - } - | HPATH { - if (evalSettings.pureEval) { - THROW(Error( - "the path '%s' can not be resolved in pure mode", - std::string_view($1.p, $1.l) - )); - } - Path path(getHome() + std::string($1.p + 1, $1.l - 1)); - $$ = new ExprPath(path); - } - ; - -ind_string_parts - : ind_string_parts IND_STR { $$ = $1; $1->emplace_back(state->at(@2), $2); } - | ind_string_parts DOLLAR_CURLY expr '}' { $$ = $1; $1->emplace_back(state->at(@2), unp($3)); } - | { $$ = new std::vector, StringToken>>>; } - ; - -binds - : binds attrpath '=' expr ';' - { $$ = $1; - if (auto e = state->addAttr($$, std::move(*$2), unp($4), state->at(@2))) THROW(*e, $1, $2); - delete $2; - } - | binds INHERIT attrs ';' - { $$ = $1; - for (auto & [i, iPos] : *$3) { - if ($$->attrs.find(i.symbol) != $$->attrs.end()) - THROW(state->dupAttr(i.symbol, iPos, $$->attrs[i.symbol].pos), $1); - $$->attrs.emplace( - i.symbol, - ExprAttrs::AttrDef(std::make_unique(iPos, i.symbol), iPos, ExprAttrs::AttrDef::Kind::Inherited)); - } - delete $3; - } - | binds INHERIT '(' expr ')' attrs ';' - { $$ = $1; - if (!$$->inheritFromExprs) - $$->inheritFromExprs = std::make_unique>>(); - $$->inheritFromExprs->push_back(unp($4)); - for (auto & [i, iPos] : *$6) { - if ($$->attrs.find(i.symbol) != $$->attrs.end()) - THROW(state->dupAttr(i.symbol, iPos, $$->attrs[i.symbol].pos), $1); - auto from = std::make_unique(state->at(@4), $$->inheritFromExprs->size() - 1); - $$->attrs.emplace( - i.symbol, - ExprAttrs::AttrDef( - std::make_unique(iPos, std::move(from), i.symbol), - iPos, - ExprAttrs::AttrDef::Kind::InheritedFrom)); - } - delete $6; - } - | { $$ = new ExprAttrs(state->at(@0)); } - ; - -attrs - : attrs attr { $$ = $1; $1->emplace_back(AttrName(state->symbols.create($2)), state->at(@2)); } - | attrs string_attr - { $$ = $1; - ExprString * str = dynamic_cast($2); - if (str) { - $$->emplace_back(AttrName(state->symbols.create(str->s)), state->at(@2)); - delete str; - } else - THROW(ParseError({ - .msg = HintFmt("dynamic attributes not allowed in inherit"), - .pos = state->positions[state->at(@2)] - }), $1, $2); - } - | { $$ = new std::vector>; } - ; - -attrpath - : attrpath '.' attr { $$ = $1; $1->push_back(AttrName(state->symbols.create($3))); } - | attrpath '.' string_attr - { $$ = $1; - ExprString * str = dynamic_cast($3); - if (str) { - $$->push_back(AttrName(state->symbols.create(str->s))); - delete str; - } else - $$->emplace_back(unp($3)); - } - | attr { $$ = new std::vector; $$->push_back(AttrName(state->symbols.create($1))); } - | string_attr - { $$ = new std::vector; - ExprString *str = dynamic_cast($1); - if (str) { - $$->push_back(AttrName(state->symbols.create(str->s))); - delete str; - } else - $$->emplace_back(unp($1)); - } - ; - -attr - : ID - | OR_KW { $$ = {"or", 2}; } - ; - -string_attr - : '"' string_parts '"' { $$ = $2; } - | DOLLAR_CURLY expr '}' { $$ = $2; } - ; - -expr_list - : expr_list expr_select { $$ = $1; $1->elems.emplace_back($2); /* !!! dangerous */ } - | { $$ = new ExprList; } - ; - -formals - : formal ',' formals - { $$ = $3; $$->formals.emplace_back(std::move(*$1)); delete $1; } - | formal - { $$ = new Formals; $$->formals.emplace_back(std::move(*$1)); $$->ellipsis = false; delete $1; } - | - { $$ = new Formals; $$->ellipsis = false; } - | ELLIPSIS - { $$ = new Formals; $$->ellipsis = true; } - ; - -formal - : ID { $$ = new Formal{CUR_POS, state->symbols.create($1), nullptr}; } - | ID '?' expr { $$ = new Formal{CUR_POS, state->symbols.create($1), unp($3)}; } - ; - -%% - -#include "eval.hh" - - -namespace nix { - -Expr * parseExprFromBuf( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - SymbolTable & symbols, - PosTable & positions, - const Expr::AstSymbols & astSymbols) -{ - yyscan_t scanner; - ParserState state { - .symbols = symbols, - .positions = positions, - .basePath = basePath, - .origin = positions.addOrigin(origin, length), - .s = astSymbols, - }; - - yylex_init(&scanner); - Finally _destroy([&] { yylex_destroy(scanner); }); - - yy_scan_buffer(text, length, scanner); - yyparse(scanner, &state); - if (state.error) { - delete state.result; - throw *state.error; - } - - return state.result; -} - - -} diff --git a/src/libexpr/parser/change_head.hh b/src/libexpr/parser/change_head.hh new file mode 100644 index 000000000..aab315553 --- /dev/null +++ b/src/libexpr/parser/change_head.hh @@ -0,0 +1,66 @@ +#pragma once +///@file + +#include + +namespace nix::parser { + +// modified copy of change_state, as the manual suggest for more involved +// state manipulation. we want to change only the first state parameter, +// and we care about the *initial* position of a rule application (not the +// past-the-end position as pegtl change_state provides) +template +struct change_head : tao::pegtl::maybe_nothing +{ + template< + typename Rule, + tao::pegtl::apply_mode A, + tao::pegtl::rewind_mode M, + template class Action, + template class Control, + typename ParseInput, + typename State, + typename... States + > + [[nodiscard]] static bool match(ParseInput & in, State && st, States &&... sts) + { + const auto begin = in.iterator(); + + if constexpr (std::is_constructible_v) { + NewState s(st, sts...); + if (tao::pegtl::match(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else if constexpr (std::is_default_constructible_v) { + NewState s; + if (tao::pegtl::match(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else { + static_assert(decltype(sizeof(NewState))(), "unable to instantiate new state"); + } + } + + template + static void _success(void *, auto & begin, ParseInput & in, S & ... sts) + { + const typename ParseInput::action_t at(begin, in); + Target::success(at, sts...); + } + + template + static void _success(decltype(Target::success0(std::declval()...), 0), auto &, auto &, S & ... sts) + { + Target::success0(sts...); + } +}; + +} diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh new file mode 100644 index 000000000..82df63bc5 --- /dev/null +++ b/src/libexpr/parser/grammar.hh @@ -0,0 +1,707 @@ +#pragma once +///@file + +#include "tao/pegtl.hpp" +#include +#include + +#include + +// NOTE +// nix line endings are \n, \r\n, \r. the grammar does not use eol or +// eolf rules in favor of reproducing the old flex lexer as faithfully as +// possible, and deferring calculation of positions to downstream users. + +namespace nix::parser::grammar { + +using namespace tao::pegtl; +namespace p = tao::pegtl; + +// character classes +namespace c { + +struct path : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'.', '_', '-', '+'> +> {}; +struct path_sep : one<'/'> {}; + +struct id_first : ranges<'a', 'z', 'A', 'Z', '_'> {}; +struct id_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'_', '\'', '-'> +> {}; + +struct uri_scheme_first : ranges<'a', 'z', 'A', 'Z'> {}; +struct uri_scheme_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'+', '-', '.'> +> {}; +struct uri_sep : one<':'> {}; +struct uri_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'%', '/', '?', ':', '@', '&', '=', '+', '$', ',', '-', '_', '.', '!', '~', '*', '\''> +> {}; + +} + +// "tokens". PEGs don't really care about tokens, we merely use them as a convenient +// way of writing down keywords and a couple complicated syntax rules. +namespace t { + +struct _extend_as_path : seq< + star, + not_at, + not_at, + c::path_sep, + sor +> {}; +struct _extend_as_uri : seq< + star, + c::uri_sep, + c::uri_rest +> {}; + +// keywords might be extended to identifiers, paths, or uris. +// NOTE this assumes that keywords are a-zA-Z only, otherwise uri schemes would never +// match correctly. +// NOTE not a simple seq<...> because this would report incorrect positions for +// keywords used inside must<> if a prefix of the keyword matches. +template +struct _keyword : sor< + seq< + S, + not_at, + not_at<_extend_as_path>, + not_at<_extend_as_uri> + >, + failure +> {}; + +struct kw_if : _keyword {}; +struct kw_then : _keyword {}; +struct kw_else : _keyword {}; +struct kw_assert : _keyword {}; +struct kw_with : _keyword {}; +struct kw_let : _keyword {}; +struct kw_in : _keyword {}; +struct kw_rec : _keyword {}; +struct kw_inherit : _keyword {}; +struct kw_or : _keyword {}; + +// `-` can be a unary prefix op, a binary infix op, or the first character +// of a path or -> (ex 1->1--1) +// `/` can be a path leader or an operator (ex a?a /a) +struct op_minus : seq, not_at'>>, not_at<_extend_as_path>> {}; +struct op_div : seq, not_at> {}; + +// match a rule, making sure we are not matching it where a keyword would match. +// using minus like this is a lot faster than flipping the order and using seq. +template +struct _not_at_any_keyword : minus< + seq, + sor< + TAO_PEGTL_STRING("inherit"), + TAO_PEGTL_STRING("assert"), + TAO_PEGTL_STRING("else"), + TAO_PEGTL_STRING("then"), + TAO_PEGTL_STRING("with"), + TAO_PEGTL_STRING("let"), + TAO_PEGTL_STRING("rec"), + TAO_PEGTL_STRING("if"), + TAO_PEGTL_STRING("in"), + TAO_PEGTL_STRING("or") + > +> {}; + +// identifiers are kind of horrid: +// +// - uri_scheme_first ⊂ id_first +// - uri_scheme_first ⊂ uri_scheme_rest ⊂ path +// - id_first ⊂ id_rest ∖ { ' } ⊂ path +// - id_first ∩ (path ∖ uri_scheme_first) = { _ } +// - uri_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest, path } +// - path_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest } +// +// and we want, without reading the input more than once, a string that +// matches (id_first id_rest*) and is not followed by any number of +// characters such that the extended string matches path or uri rules. +// +// since the first character must be either _ or a uri scheme character +// we can ignore path-like bits at the beginning. uri_sep cannot appear anywhere +// in an identifier, so it's only needed in lookahead checks at the uri-like +// prefix. likewise path_sep cannot appear anywhere in the idenfier, so it's +// only needed in lookahead checks in the path-like prefix. +// +// in total that gives us a decomposition of +// +// (uri-scheme-like? (?! continues-as-uri) | _) +// (path-segment-like? (?! continues-as-path)) +// id_rest* +struct identifier : _not_at_any_keyword< + // we don't use (at, ...) matches here because identifiers are + // a really hot path and rewinding as needed by at<> isn't entirely free. + sor< + seq< + c::uri_scheme_first, + star>, + not_at<_extend_as_uri> + >, + one<'_'> + >, + star, one<'_', '-'>>>, + not_at<_extend_as_path>, + star +> {}; + +// floats may extend ints, thus these rules are very similar. +struct integer : seq< + sor< + seq, star, not_at>>, + seq, not_at, digit>, star> + >, + not_at<_extend_as_path> +> {}; + +struct floating : seq< + sor< + seq, star, one<'.'>, star>, + seq>, one<'.'>, plus> + >, + opt, opt>, plus>, + not_at<_extend_as_path> +> {}; + +struct uri : seq< + c::uri_scheme_first, + star, + c::uri_sep, + plus +> {}; + +struct sep : sor< + plus>, + seq, star>>, + seq, until>> +> {}; + +} + + + +using seps = star; + + +// marker for semantic rules. not handling one of these in an action that cares about +// semantics is probably an error. +struct semantic {}; + + +struct expr; + +struct _string { + template + struct literal : semantic, seq {}; + struct cr_lf : semantic, seq, opt>> {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct escape : semantic, must {}; +}; +struct string : _string, seq< + one<'"'>, + star< + sor< + _string::literal>>, + _string::cr_lf, + _string::interpolation, + _string::literal, opt>>, + seq, _string::escape> + > + >, + must> +> {}; + +struct _ind_string { + template + struct literal : semantic, seq {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct escape : semantic, must {}; +}; +struct ind_string : _ind_string, seq< + TAO_PEGTL_STRING("''"), + opt>, one<'\n'>>, + star< + sor< + _ind_string::literal< + true, + plus< + sor< + not_one<'$', '\''>, + seq, not_one<'{', '\''>>, + seq, not_one<'\'', '$'>> + > + > + >, + _ind_string::interpolation, + _ind_string::literal>, + _ind_string::literal, not_at>>, + seq, _ind_string::literal>>, + seq< + p::string<'\'', '\''>, + sor< + _ind_string::literal>, + seq, _ind_string::escape> + > + > + > + >, + must +> {}; + +struct _path { + // legacy lexer rules. extra l_ to avoid reserved c++ identifiers. + struct _l_PATH : seq, plus>, opt> {}; + struct _l_PATH_SEG : seq, c::path_sep> {}; + struct _l_HPATH : seq, plus>, opt> {}; + struct _l_HPATH_START : TAO_PEGTL_STRING("~/") {}; + struct _path_str : sor<_l_PATH, _l_PATH_SEG, plus> {}; + // modern rules + template + struct literal : semantic, seq {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct anchor : semantic, sor< + _l_PATH, + seq<_l_PATH_SEG, at> + > {}; + struct home_anchor : semantic, sor< + _l_HPATH, + seq<_l_HPATH_START, at> + > {}; + struct searched_path : semantic, list, c::path_sep> {}; + struct forbid_prefix_triple_slash : sor, failure> {}; + struct forbid_prefix_double_slash_no_interp : sor< + not_at, not_at>, + failure + > {}; + // legacy parser rules + struct _str_rest : seq< + must, + opt>, + must, + star< + sor< + literal<_path_str>, + interpolation + > + > + > {}; +}; +struct path : _path, sor< + seq< + sor<_path::anchor, _path::home_anchor>, + _path::_str_rest + >, + seq, _path::searched_path, one<'>'>> +> {}; + +struct _formal { + struct name : semantic, t::identifier {}; + struct default_value : semantic, must {}; +}; +struct formal : semantic, _formal, seq< + _formal::name, + opt, seps, _formal::default_value> +> {}; + +struct _formals { + struct ellipsis : semantic, p::ellipsis {}; +}; +struct formals : semantic, _formals, seq< + one<'{'>, seps, + // formals and attrsets share a two-token head sequence ('{' ). + // this rule unrolls the formals list a bit to provide better error messages than + // "expected '='" at the first ',' if formals are incorrect. + sor< + one<'}'>, + seq<_formals::ellipsis, seps, must>>, + seq< + formal, seps, + if_then_else< + at>, + seq< + star, seps, formal, seps>, + opt, seps, opt<_formals::ellipsis, seps>>, + must> + >, + one<'}'> + > + > + > +> {}; + +struct _attr { + struct simple : semantic, sor {}; + struct string : semantic, seq {}; + struct expr : semantic, seq< + TAO_PEGTL_STRING("${"), seps, + must, seps, + must> + > {}; +}; +struct attr : _attr, sor< + _attr::simple, + _attr::string, + _attr::expr +> {}; + +struct attrpath : list, t::sep> {}; + +struct _inherit { + struct from : semantic, must {}; + struct attrs : list {}; +}; +struct inherit : _inherit, seq< + t::kw_inherit, seps, + opt, seps, _inherit::from, seps, must>, seps>, + opt<_inherit::attrs, seps>, + must> +> {}; + +struct _binding { + struct path : semantic, attrpath {}; + struct equal : one<'='> {}; + struct value : semantic, must {}; +}; +struct binding : _binding, seq< + _binding::path, seps, + must<_binding::equal>, seps, + _binding::value, seps, + must> +> {}; + +struct bindings : opt, seps>> {}; + +struct op { + enum class kind { + // NOTE non-associativity is *NOT* handled in the grammar structure. + // handling it in the grammar itself instead of in semantic actions + // slows down the parser significantly and makes the rules *much* + // harder to read. maybe this will be different at some point when + // ! does not sit between two binary precedence levels. + nonAssoc, + leftAssoc, + rightAssoc, + unary, + }; + template + struct _op : Rule { + static constexpr unsigned precedence = Precedence; + static constexpr op::kind kind = Kind; + }; + + struct unary_minus : _op {}; + + // treating this like a unary postfix operator is sketchy, but that's + // the most reasonable way to implement the operator precedence set forth + // by the language way back. it'd be much better if `.` and `?` had the same + // precedence, but alas. + struct has_attr : _op, seps, must>, 4> {}; + + struct concat : _op {}; + struct mul : _op, 6> {}; + struct div : _op {}; + struct plus : _op, 7> {}; + struct minus : _op {}; + struct not_ : _op, 8, kind::unary> {}; + struct update : _op {}; + struct less_eq : _op {}; + struct greater_eq : _op="), 10, kind::nonAssoc> {}; + struct less : _op, 10, kind::nonAssoc> {}; + struct greater : _op'>, 10, kind::nonAssoc> {}; + struct equals : _op {}; + struct not_equals : _op {}; + struct and_ : _op {}; + struct or_ : _op {}; + struct implies : _op"), 14, kind::rightAssoc> {}; +}; + +struct _expr { + template class OpenMod = seq, typename... Init> + struct _attrset : seq< + Init..., + OpenMod>, seps, + bindings, seps, + must> + > {}; + + struct select; + + struct id : semantic, t::identifier {}; + struct int_ : semantic, t::integer {}; + struct float_ : semantic, t::floating {}; + struct string : semantic, seq {}; + struct ind_string : semantic, seq {}; + struct path : semantic, seq {}; + struct uri : semantic, t::uri {}; + struct ancient_let : semantic, _attrset {}; + struct rec_set : semantic, _attrset {}; + struct set : semantic, _attrset<> {}; + + struct _list { + struct entry : semantic, seq {}; + struct as_app_or : semantic, t::kw_or {}; + }; + struct _app { + struct first_arg : semantic, seq {}; + // can be used to stash a position of the application head node + struct select_or_fn : seq