diff options
author | eldritch horrors <pennae@lix.systems> | 2024-06-16 23:10:09 +0200 |
---|---|---|
committer | eldritch horrors <pennae@lix.systems> | 2024-06-25 12:24:58 +0000 |
commit | e6cd67591b44b4902bac73febcab3c4d96724aea (patch) | |
tree | 94c8ad90b8e756c5b00b8d68b2adf13c0f2febd9 /src/libexpr/parser/state.hh | |
parent | c097ebe66bf474da886ffa20d2f31bdb1d2196a8 (diff) |
libexpr: rewrite the parser with pegtl instead of flex/bison
this gives about 20% performance improvements on pure parsing. obviously
it will be less on full eval, but depending on how much parsing is to be
done (e.g. including hackage-packages.nix or not) it's more like 4%-10%.
this has been tested (with thousands of core hours of fuzzing) to ensure
that the ASTs produced by the new parser are exactly the same as the old
one would have produced. error messages will change (sometimes by a lot)
and are not yet perfect, but we would rather leave this as is for later.
test results for running only the parser (excluding the variable binding
code) in a tight loop with inputs and parameters as given are promising:
- 40% faster on lix's package.nix at 10000 iterations
- 1.3% faster on nixpkgs all-packages.nix at 1000 iterations
- equivalent on all of nixpkgs concatenated at 100 iterations
(excluding invalid files, each file surrounded with parens)
more realistic benchmarks are somewhere in between the extremes, parsing
once again getting the largest uplift. other realistic workloads improve
by a few percentage points as well, notably system builds are 4% faster.
Benchmarks summary (from ./bench/summarize.jq bench/bench-*.json)
old/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
mean: 0.408s ± 0.025s
user: 0.355s | system: 0.033s
median: 0.389s
range: 0.388s ... 0.442s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
mean: 0.332s ± 0.024s
user: 0.279s | system: 0.033s
median: 0.314s
range: 0.313s ... 0.361s
relative: 0.814
---
old/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 6.133s ± 0.022s
user: 5.395s | system: 0.437s
median: 6.128s
range: 6.099s ... 6.183s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 5.925s ± 0.025s
user: 5.176s | system: 0.456s
median: 5.934s
range: 5.861s ... 5.943s
relative: 0.966
---
GC_INITIAL_HEAP_SIZE=10g old/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 4.503s ± 0.027s
user: 3.731s | system: 0.547s
median: 4.499s
range: 4.478s ... 4.541s
relative: 1
GC_INITIAL_HEAP_SIZE=10g new/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
mean: 4.285s ± 0.031s
user: 3.504s | system: 0.571s
median: 4.281s
range: 4.221s ... 4.328s
relative: 0.951
---
old/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
mean: 16.475s ± 0.07s
user: 14.088s | system: 1.572s
median: 16.495s
range: 16.351s ... 16.536s
relative: 1
new/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
mean: 15.973s ± 0.013s
user: 13.558s | system: 1.615s
median: 15.973s
range: 15.946s ... 15.99s
relative: 0.97
---
Change-Id: Ie66ec2d045dec964632c6541e25f8f0797319ee2
Diffstat (limited to 'src/libexpr/parser/state.hh')
-rw-r--r-- | src/libexpr/parser/state.hh | 259 |
1 files changed, 259 insertions, 0 deletions
diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh new file mode 100644 index 000000000..f5a0428d7 --- /dev/null +++ b/src/libexpr/parser/state.hh @@ -0,0 +1,259 @@ +#pragma once +///@file + +#include "eval.hh" + +namespace nix::parser { + +struct StringToken +{ + std::string_view s; + bool hasIndentation; + operator std::string_view() const { return s; } +}; + +struct State +{ + SymbolTable & symbols; + PosTable & positions; + SourcePath basePath; + PosTable::Origin origin; + const Expr::AstSymbols & s; + + void dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos); + void dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos); + void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos); + std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {}); + std::unique_ptr<Expr> stripIndentation(const PosIdx pos, + std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es); + + // lazy positioning means we don't get byte offsets directly, in.position() would work + // but also requires line and column (which is expensive) + PosIdx at(const auto & in) + { + return positions.add(origin, in.begin() - in.input().begin()); + } + + PosIdx atEnd(const auto & in) + { + return positions.add(origin, in.end() - in.input().begin()); + } +}; + +inline void State::dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = HintFmt("attribute '%1%' already defined at %2%", + showAttrPath(symbols, attrPath), positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos) +{ + throw ParseError({ + .msg = HintFmt("attribute '%1%' already defined at %2%", symbols[attr], positions[prevPos]), + .pos = positions[pos] + }); +} + +inline void State::addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos) +{ + AttrPath::iterator i; + // All attrpaths have at least one attr + assert(!attrPath.empty()); + // Checking attrPath validity. + // =========================== + for (i = attrPath.begin(); i + 1 < attrPath.end(); i++) { + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + if (j->second.kind != ExprAttrs::AttrDef::Kind::Inherited) { + ExprAttrs * attrs2 = dynamic_cast<ExprAttrs *>(j->second.e.get()); + if (!attrs2) { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + attrs = attrs2; + } else { + attrPath.erase(i + 1, attrPath.end()); + dupAttr(attrPath, pos, j->second.pos); + } + } else { + auto next = attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::make_unique<ExprAttrs>(), pos)); + attrs = static_cast<ExprAttrs *>(next.first->second.e.get()); + } + } else { + auto & next = attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::make_unique<ExprAttrs>(), pos); + attrs = static_cast<ExprAttrs *>(next.valueExpr.get()); + } + } + // Expr insertion. + // ========================== + if (i->symbol) { + ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol); + if (j != attrs->attrs.end()) { + // This attr path is already defined. However, if both + // e and the expr pointed by the attr path are two attribute sets, + // we want to merge them. + // Otherwise, throw an error. + auto * ae = dynamic_cast<ExprAttrs *>(e.get()); + auto * jAttrs = dynamic_cast<ExprAttrs *>(j->second.e.get()); + if (jAttrs && ae) { + if (ae->inheritFromExprs && !jAttrs->inheritFromExprs) + jAttrs->inheritFromExprs = std::make_unique<std::vector<std::unique_ptr<Expr>>>(); + for (auto & ad : ae->attrs) { + auto j2 = jAttrs->attrs.find(ad.first); + if (j2 != jAttrs->attrs.end()) // Attr already defined in iAttrs, error. + return dupAttr(ad.first, j2->second.pos, ad.second.pos); + if (ad.second.kind == ExprAttrs::AttrDef::Kind::InheritedFrom) { + auto & sel = dynamic_cast<ExprSelect &>(*ad.second.e); + auto & from = dynamic_cast<ExprInheritFrom &>(*sel.e); + from.displ += jAttrs->inheritFromExprs->size(); + } + jAttrs->attrs.emplace(ad.first, std::move(ad.second)); + } + std::ranges::move(ae->dynamicAttrs, std::back_inserter(jAttrs->dynamicAttrs)); + if (ae->inheritFromExprs) + std::ranges::move(*ae->inheritFromExprs, std::back_inserter(*jAttrs->inheritFromExprs)); + } else { + dupAttr(attrPath, pos, j->second.pos); + } + } else { + // This attr path is not defined. Let's create it. + e->setName(i->symbol); + attrs->attrs.emplace(std::piecewise_construct, + std::tuple(i->symbol), + std::tuple(std::move(e), pos)); + } + } else { + attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::move(e), pos); + } +} + +inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals> formals, PosIdx pos, Symbol arg) +{ + std::sort(formals->formals.begin(), formals->formals.end(), + [] (const auto & a, const auto & b) { + return std::tie(a.name, a.pos) < std::tie(b.name, b.pos); + }); + + std::optional<std::pair<Symbol, PosIdx>> duplicate; + for (size_t i = 0; i + 1 < formals->formals.size(); i++) { + if (formals->formals[i].name != formals->formals[i + 1].name) + continue; + std::pair thisDup{formals->formals[i].name, formals->formals[i + 1].pos}; + duplicate = std::min(thisDup, duplicate.value_or(thisDup)); + } + if (duplicate) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[duplicate->first]), + .pos = positions[duplicate->second] + }); + + if (arg && formals->has(arg)) + throw ParseError({ + .msg = HintFmt("duplicate formal function argument '%1%'", symbols[arg]), + .pos = positions[pos] + }); + + return formals; +} + +inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos, + std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es) +{ + if (es.empty()) return std::make_unique<ExprString>(""); + + /* Figure out the minimum indentation. Note that by design + whitespace-only final lines are not taken into account. (So + the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ + bool atStartOfLine = true; /* = seen only whitespace in the current line */ + size_t minIndent = 1000000; + size_t curIndent = 0; + for (auto & [i_pos, i] : es) { + auto * str = std::get_if<StringToken>(&i); + if (!str || !str->hasIndentation) { + /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ + if (atStartOfLine) { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + continue; + } + for (size_t j = 0; j < str->s.size(); ++j) { + if (atStartOfLine) { + if (str->s[j] == ' ') + curIndent++; + else if (str->s[j] == '\n') { + /* Empty line, doesn't influence minimum + indentation. */ + curIndent = 0; + } else { + atStartOfLine = false; + if (curIndent < minIndent) minIndent = curIndent; + } + } else if (str->s[j] == '\n') { + atStartOfLine = true; + curIndent = 0; + } + } + } + + /* Strip spaces from each line. */ + std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2; + atStartOfLine = true; + size_t curDropped = 0; + size_t n = es.size(); + auto i = es.begin(); + const auto trimExpr = [&] (std::unique_ptr<Expr> e) { + atStartOfLine = false; + curDropped = 0; + es2.emplace_back(i->first, std::move(e)); + }; + const auto trimString = [&] (const StringToken t) { + std::string s2; + for (size_t j = 0; j < t.s.size(); ++j) { + if (atStartOfLine) { + if (t.s[j] == ' ') { + if (curDropped++ >= minIndent) + s2 += t.s[j]; + } + else if (t.s[j] == '\n') { + curDropped = 0; + s2 += t.s[j]; + } else { + atStartOfLine = false; + curDropped = 0; + s2 += t.s[j]; + } + } else { + s2 += t.s[j]; + if (t.s[j] == '\n') atStartOfLine = true; + } + } + + /* Remove the last line if it is empty and consists only of + spaces. */ + if (n == 1) { + std::string::size_type p = s2.find_last_of('\n'); + if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) + s2 = std::string(s2, 0, p + 1); + } + + es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2))); + }; + for (; i != es.end(); ++i, --n) { + std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + } + + /* If this is a single string, then don't do a concatenation. */ + if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) { + return std::move(es2[0].second); + } + return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2)); +} + +} |