From e6cd67591b44b4902bac73febcab3c4d96724aea Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Sun, 16 Jun 2024 23:10:09 +0200 Subject: libexpr: rewrite the parser with pegtl instead of flex/bison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this gives about 20% performance improvements on pure parsing. obviously it will be less on full eval, but depending on how much parsing is to be done (e.g. including hackage-packages.nix or not) it's more like 4%-10%. this has been tested (with thousands of core hours of fuzzing) to ensure that the ASTs produced by the new parser are exactly the same as the old one would have produced. error messages will change (sometimes by a lot) and are not yet perfect, but we would rather leave this as is for later. test results for running only the parser (excluding the variable binding code) in a tight loop with inputs and parameters as given are promising: - 40% faster on lix's package.nix at 10000 iterations - 1.3% faster on nixpkgs all-packages.nix at 1000 iterations - equivalent on all of nixpkgs concatenated at 100 iterations (excluding invalid files, each file surrounded with parens) more realistic benchmarks are somewhere in between the extremes, parsing once again getting the largest uplift. other realistic workloads improve by a few percentage points as well, notably system builds are 4% faster. Benchmarks summary (from ./bench/summarize.jq bench/bench-*.json) old/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix mean: 0.408s ± 0.025s user: 0.355s | system: 0.033s median: 0.389s range: 0.388s ... 0.442s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix mean: 0.332s ± 0.024s user: 0.279s | system: 0.033s median: 0.314s range: 0.313s ... 
0.361s relative: 0.814 --- old/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system' mean: 6.133s ± 0.022s user: 5.395s | system: 0.437s median: 6.128s range: 6.099s ... 6.183s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system' mean: 5.925s ± 0.025s user: 5.176s | system: 0.456s median: 5.934s range: 5.861s ... 5.943s relative: 0.966 --- GC_INITIAL_HEAP_SIZE=10g old/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system' mean: 4.503s ± 0.027s user: 3.731s | system: 0.547s median: 4.499s range: 4.478s ... 4.541s relative: 1 GC_INITIAL_HEAP_SIZE=10g new/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system' mean: 4.285s ± 0.031s user: 3.504s | system: 0.571s median: 4.281s range: 4.221s ... 4.328s relative: 0.951 --- old/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello mean: 16.475s ± 0.07s user: 14.088s | system: 1.572s median: 16.495s range: 16.351s ... 16.536s relative: 1 new/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello mean: 15.973s ± 0.013s user: 13.558s | system: 1.615s median: 15.973s range: 15.946s ... 
15.99s relative: 0.97 --- Change-Id: Ie66ec2d045dec964632c6541e25f8f0797319ee2 --- src/libexpr/parser/state.hh | 259 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 src/libexpr/parser/state.hh (limited to 'src/libexpr/parser/state.hh') diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh new file mode 100644 index 000000000..f5a0428d7 --- /dev/null +++ b/src/libexpr/parser/state.hh @@ -0,0 +1,259 @@ +#pragma once +///@file + +#include "eval.hh" + +namespace nix::parser { + +struct StringToken +{ + std::string_view s; + bool hasIndentation; + operator std::string_view() const { return s; } +}; + +struct State +{ + SymbolTable & symbols; + PosTable & positions; + SourcePath basePath; + PosTable::Origin origin; + const Expr::AstSymbols & s; + + void dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos); + void dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos); + void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos); + std::unique_ptr validateFormals(std::unique_ptr formals, PosIdx pos = noPos, Symbol arg = {}); + std::unique_ptr stripIndentation(const PosIdx pos, + std::vector, StringToken>>> && es); + + // lazy positioning means we don't get byte offsets directly, in.position() would work + // but also requires line and column (which is expensive) + PosIdx at(const auto & in) + { + return positions.add(origin, in.begin() - in.input().begin()); + } + + PosIdx atEnd(const auto & in) + { + return positions.add(origin, in.end() - in.input().begin()); + } +}; + +inline void State::dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = HintFmt("attribute '%1%' already defined at %2%", + showAttrPath(symbols, attrPath), positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = 
HintFmt("attribute '%1%' already defined at %2%", symbols[attr], positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos) +{ + AttrPath::iterator i; + // All attrpaths have at least one attr + assert(!attrPath.empty()); + // Checking attrPath validity. + // =========================== + for (i = attrPath.begin(); i + 1 < attrPath.end(); i++) { + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + if (j->second.kind != ExprAttrs::AttrDef::Kind::Inherited) { + ExprAttrs * attrs2 = dynamic_cast(j->second.e.get()); + if (!attrs2) { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + attrs = attrs2; + } else { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + } else { + auto next = attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::make_unique(), pos)); + attrs = static_cast(next.first->second.e.get()); + } + } else { + auto & next = attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::make_unique(), pos); + attrs = static_cast(next.valueExpr.get()); + } + } + // Expr insertion. + // ========================== + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + // This attr path is already defined. However, if both + // e and the expr pointed by the attr path are two attribute sets, + // we want to merge them. + // Otherwise, throw an error. + auto * ae = dynamic_cast(e.get()); + auto * jAttrs = dynamic_cast(j->second.e.get()); + if (jAttrs && ae) { + if (ae->inheritFromExprs && !jAttrs->inheritFromExprs) + jAttrs->inheritFromExprs = std::make_unique>>(); + for (auto & ad : ae->attrs) { + auto j2 = jAttrs->attrs.find(ad.first); + if (j2 != jAttrs->attrs.end()) // Attr already defined in iAttrs, error. 
+ return dupAttr(ad.first, j2->second.pos, ad.second.pos); + if (ad.second.kind == ExprAttrs::AttrDef::Kind::InheritedFrom) { + auto & sel = dynamic_cast(*ad.second.e); + auto & from = dynamic_cast(*sel.e); + from.displ += jAttrs->inheritFromExprs->size(); + } + jAttrs->attrs.emplace(ad.first, std::move(ad.second)); + } + std::ranges::move(ae->dynamicAttrs, std::back_inserter(jAttrs->dynamicAttrs)); + if (ae->inheritFromExprs) + std::ranges::move(*ae->inheritFromExprs, std::back_inserter(*jAttrs->inheritFromExprs)); + } else { + dupAttr(attrPath, pos, j->second.pos); + } + } else { + // This attr path is not defined. Let's create it. + e->setName(i->symbol); + attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::move(e), pos)); + } + } else { + attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::move(e), pos); + } +} + +inline std::unique_ptr State::validateFormals(std::unique_ptr formals, PosIdx pos, Symbol arg) +{ + std::sort(formals->formals.begin(), formals->formals.end(), + [] (const auto & a, const auto & b) { + return std::tie(a.name, a.pos) < std::tie(b.name, b.pos); + }); + + std::optional> duplicate; + for (size_t i = 0; i + 1 < formals->formals.size(); i++) { + if (formals->formals[i].name != formals->formals[i + 1].name) + continue; + std::pair thisDup{formals->formals[i].name, formals->formals[i + 1].pos}; + duplicate = std::min(thisDup, duplicate.value_or(thisDup)); + } + if (duplicate) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[duplicate->first]), + .pos = positions[duplicate->second] + }); + + if (arg && formals->has(arg)) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[arg]), + .pos = positions[pos] + }); + + return formals; +} + +inline std::unique_ptr State::stripIndentation(const PosIdx pos, + std::vector, StringToken>>> && es) +{ + if (es.empty()) return std::make_unique(""); + + /* Figure out the minimum 
indentation. Note that by design + whitespace-only final lines are not taken into account. (So + the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ + bool atStartOfLine = true; /* = seen only whitespace in the current line */ + size_t minIndent = 1000000; + size_t curIndent = 0; + for (auto & [i_pos, i] : es) { + auto * str = std::get_if(&i); + if (!str || !str->hasIndentation) { + /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ + if (atStartOfLine) { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + continue; + } + for (size_t j = 0; j < str->s.size(); ++j) { + if (atStartOfLine) { + if (str->s[j] == ' ') + curIndent++; + else if (str->s[j] == '\n') { + /* Empty line, doesn't influence minimum + indentation. */ + curIndent = 0; + } else { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + } else if (str->s[j] == '\n') { + atStartOfLine = true; + curIndent = 0; + } + } + } + + /* Strip spaces from each line. */ + std::vector>> es2; + atStartOfLine = true; + size_t curDropped = 0; + size_t n = es.size(); + auto i = es.begin(); + const auto trimExpr = [&] (std::unique_ptr e) { + atStartOfLine = false; + curDropped = 0; + es2.emplace_back(i->first, std::move(e)); + }; + const auto trimString = [&] (const StringToken t) { + std::string s2; + for (size_t j = 0; j < t.s.size(); ++j) { + if (atStartOfLine) { + if (t.s[j] == ' ') { + if (curDropped++ >= minIndent) + s2 += t.s[j]; + } + else if (t.s[j] == '\n') { + curDropped = 0; + s2 += t.s[j]; + } else { + atStartOfLine = false; + curDropped = 0; + s2 += t.s[j]; + } + } else { + s2 += t.s[j]; + if (t.s[j] == '\n') atStartOfLine = true; + } + } + + /* Remove the last line if it is empty and consists only of + spaces. 
*/ + if (n == 1) { + std::string::size_type p = s2.find_last_of('\n'); + if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) + s2 = std::string(s2, 0, p + 1); + } + + es2.emplace_back(i->first, std::make_unique(std::move(s2))); + }; + for (; i != es.end(); ++i, --n) { + std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + } + + /* If this is a single string, then don't do a concatenation. */ + if (es2.size() == 1 && dynamic_cast(es2[0].second.get())) { + return std::move(es2[0].second); + } + return std::make_unique(pos, true, std::move(es2)); +} + +} -- cgit v1.2.3