From e6cd67591b44b4902bac73febcab3c4d96724aea Mon Sep 17 00:00:00 2001
From: eldritch horrors <pennae@lix.systems>
Date: Sun, 16 Jun 2024 23:10:09 +0200
Subject: libexpr: rewrite the parser with pegtl instead of flex/bison
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

this gives about 20% performance improvements on pure parsing. obviously
it will be less on full eval, but depending on how much parsing is to be
done (e.g. including hackage-packages.nix or not) it's more like 4%-10%.

this has been tested (with thousands of core hours of fuzzing) to ensure
that the ASTs produced by the new parser are exactly the same as the old
one would have produced. error messages will change (sometimes by a lot)
and are not yet perfect, but we would rather leave this as is for later.

test results for running only the parser (excluding the variable binding
code) in a tight loop with inputs and parameters as given are promising:

  - 40% faster on lix's package.nix at 10000 iterations
  - 1.3% faster on nixpkgs all-packages.nix at 1000 iterations
  - equivalent on all of nixpkgs concatenated at 100 iterations
    (excluding invalid files, each file surrounded with parens)

more realistic benchmarks are somewhere in between the extremes, parsing
once again getting the largest uplift. other realistic workloads improve
by a few percentage points as well, notably system builds are 4% faster.

Benchmarks summary (from ./bench/summarize.jq bench/bench-*.json)
old/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
  mean:     0.408s ± 0.025s
            user: 0.355s | system: 0.033s
  median:   0.389s
  range:    0.388s ... 0.442s
  relative: 1

new/bin/nix --extra-experimental-features 'nix-command flakes' eval -f bench/nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix
  mean:     0.332s ± 0.024s
            user: 0.279s | system: 0.033s
  median:   0.314s
  range:    0.313s ... 0.361s
  relative: 0.814

---

old/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
  mean:     6.133s ± 0.022s
            user: 5.395s | system: 0.437s
  median:   6.128s
  range:    6.099s ... 6.183s
  relative: 1

new/bin/nix --extra-experimental-features 'nix-command flakes' eval --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
  mean:     5.925s ± 0.025s
            user: 5.176s | system: 0.456s
  median:   5.934s
  range:    5.861s ... 5.943s
  relative: 0.966

---

GC_INITIAL_HEAP_SIZE=10g old/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
  mean:     4.503s ± 0.027s
            user: 3.731s | system: 0.547s
  median:   4.499s
  range:    4.478s ... 4.541s
  relative: 1

GC_INITIAL_HEAP_SIZE=10g new/bin/nix eval --extra-experimental-features 'nix-command flakes' --raw --impure --expr 'with import <nixpkgs/nixos> {}; system'
  mean:     4.285s ± 0.031s
            user: 3.504s | system: 0.571s
  median:   4.281s
  range:    4.221s ... 4.328s
  relative: 0.951

---

old/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
  mean:     16.475s ± 0.07s
            user: 14.088s | system: 1.572s
  median:   16.495s
  range:    16.351s ... 16.536s
  relative: 1

new/bin/nix --extra-experimental-features 'nix-command flakes' search --no-eval-cache github:nixos/nixpkgs/e1fa12d4f6c6fe19ccb59cac54b5b3f25e160870 hello
  mean:     15.973s ± 0.013s
            user: 13.558s | system: 1.615s
  median:   15.973s
  range:    15.946s ... 15.99s
  relative: 0.97

---

Change-Id: Ie66ec2d045dec964632c6541e25f8f0797319ee2
---
 src/libexpr/parser/state.hh | 259 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 src/libexpr/parser/state.hh

(limited to 'src/libexpr/parser/state.hh')

diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh
new file mode 100644
index 000000000..f5a0428d7
--- /dev/null
+++ b/src/libexpr/parser/state.hh
@@ -0,0 +1,259 @@
+#pragma once
+///@file
+
+#include "eval.hh"
+
+namespace nix::parser {
+
+struct StringToken
+{
+    std::string_view s;
+    bool hasIndentation;
+    operator std::string_view() const { return s; }
+};
+
+struct State
+{
+    SymbolTable & symbols;
+    PosTable & positions;
+    SourcePath basePath;
+    PosTable::Origin origin;
+    const Expr::AstSymbols & s;
+
+    void dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos);
+    void dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos);
+    void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
+    std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
+    std::unique_ptr<Expr> stripIndentation(const PosIdx pos,
+        std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es);
+
+    // lazy positioning means we don't get byte offsets directly, in.position() would work
+    // but also requires line and column (which is expensive)
+    PosIdx at(const auto & in)
+    {
+        return positions.add(origin, in.begin() - in.input().begin());
+    }
+
+    PosIdx atEnd(const auto & in)
+    {
+        return positions.add(origin, in.end() - in.input().begin());
+    }
+};
+
+inline void State::dupAttr(const AttrPath & attrPath, const PosIdx pos, const PosIdx prevPos)
+{
+    throw ParseError({
+         .msg = HintFmt("attribute '%1%' already defined at %2%",
+             showAttrPath(symbols, attrPath), positions[prevPos]),
+         .pos = positions[pos]
+    });
+}
+
+inline void State::dupAttr(Symbol attr, const PosIdx pos, const PosIdx prevPos)
+{
+    throw ParseError({
+        .msg = HintFmt("attribute '%1%' already defined at %2%", symbols[attr], positions[prevPos]),
+        .pos = positions[pos]
+    });
+}
+
+inline void State::addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos)
+{
+    AttrPath::iterator i;
+    // All attrpaths have at least one attr
+    assert(!attrPath.empty());
+    // Checking attrPath validity.
+    // ===========================
+    for (i = attrPath.begin(); i + 1 < attrPath.end(); i++) {
+        if (i->symbol) {
+            ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol);
+            if (j != attrs->attrs.end()) {
+                if (j->second.kind != ExprAttrs::AttrDef::Kind::Inherited) {
+                    ExprAttrs * attrs2 = dynamic_cast<ExprAttrs *>(j->second.e.get());
+                    if (!attrs2) {
+                        attrPath.erase(i + 1, attrPath.end());
+                        dupAttr(attrPath, pos, j->second.pos);
+                    }
+                    attrs = attrs2;
+                } else {
+                    attrPath.erase(i + 1, attrPath.end());
+                    dupAttr(attrPath, pos, j->second.pos);
+                }
+            } else {
+                auto next = attrs->attrs.emplace(std::piecewise_construct,
+                    std::tuple(i->symbol),
+                    std::tuple(std::make_unique<ExprAttrs>(), pos));
+                attrs = static_cast<ExprAttrs *>(next.first->second.e.get());
+            }
+        } else {
+            auto & next = attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::make_unique<ExprAttrs>(), pos);
+            attrs = static_cast<ExprAttrs *>(next.valueExpr.get());
+        }
+    }
+    // Expr insertion.
+    // ==========================
+    if (i->symbol) {
+        ExprAttrs::AttrDefs::iterator j = attrs->attrs.find(i->symbol);
+        if (j != attrs->attrs.end()) {
+            // This attr path is already defined. However, if both
+            // e and the expr pointed by the attr path are two attribute sets,
+            // we want to merge them.
+            // Otherwise, throw an error.
+            auto * ae = dynamic_cast<ExprAttrs *>(e.get());
+            auto * jAttrs = dynamic_cast<ExprAttrs *>(j->second.e.get());
+            if (jAttrs && ae) {
+                if (ae->inheritFromExprs && !jAttrs->inheritFromExprs)
+                    jAttrs->inheritFromExprs = std::make_unique<std::vector<std::unique_ptr<Expr>>>();
+                for (auto & ad : ae->attrs) {
+                    auto j2 = jAttrs->attrs.find(ad.first);
+                    if (j2 != jAttrs->attrs.end()) // Attr already defined in iAttrs, error.
+                        return dupAttr(ad.first, j2->second.pos, ad.second.pos);
+                    if (ad.second.kind == ExprAttrs::AttrDef::Kind::InheritedFrom) {
+                        auto & sel = dynamic_cast<ExprSelect &>(*ad.second.e);
+                        auto & from = dynamic_cast<ExprInheritFrom &>(*sel.e);
+                        from.displ += jAttrs->inheritFromExprs->size();
+                    }
+                    jAttrs->attrs.emplace(ad.first, std::move(ad.second));
+                }
+                std::ranges::move(ae->dynamicAttrs, std::back_inserter(jAttrs->dynamicAttrs));
+                if (ae->inheritFromExprs)
+                    std::ranges::move(*ae->inheritFromExprs, std::back_inserter(*jAttrs->inheritFromExprs));
+            } else {
+                dupAttr(attrPath, pos, j->second.pos);
+            }
+        } else {
+            // This attr path is not defined. Let's create it.
+            e->setName(i->symbol);
+            attrs->attrs.emplace(std::piecewise_construct,
+                std::tuple(i->symbol),
+                std::tuple(std::move(e), pos));
+        }
+    } else {
+        attrs->dynamicAttrs.emplace_back(std::move(i->expr), std::move(e), pos);
+    }
+}
+
+inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals> formals, PosIdx pos, Symbol arg)
+{
+    std::sort(formals->formals.begin(), formals->formals.end(),
+        [] (const auto & a, const auto & b) {
+            return std::tie(a.name, a.pos) < std::tie(b.name, b.pos);
+        });
+
+    std::optional<std::pair<Symbol, PosIdx>> duplicate;
+    for (size_t i = 0; i + 1 < formals->formals.size(); i++) {
+        if (formals->formals[i].name != formals->formals[i + 1].name)
+            continue;
+        std::pair thisDup{formals->formals[i].name, formals->formals[i + 1].pos};
+        duplicate = std::min(thisDup, duplicate.value_or(thisDup));
+    }
+    if (duplicate)
+        throw ParseError({
+            .msg = HintFmt("duplicate formal function argument '%1%'", symbols[duplicate->first]),
+            .pos = positions[duplicate->second]
+        });
+
+    if (arg && formals->has(arg))
+        throw ParseError({
+            .msg = HintFmt("duplicate formal function argument '%1%'", symbols[arg]),
+            .pos = positions[pos]
+        });
+
+    return formals;
+}
+
+inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos,
+    std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es)
+{
+    if (es.empty()) return std::make_unique<ExprString>("");
+
+    /* Figure out the minimum indentation.  Note that by design
+       whitespace-only final lines are not taken into account.  (So
+       the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */
+    bool atStartOfLine = true; /* = seen only whitespace in the current line */
+    size_t minIndent = 1000000;
+    size_t curIndent = 0;
+    for (auto & [i_pos, i] : es) {
+        auto * str = std::get_if<StringToken>(&i);
+        if (!str || !str->hasIndentation) {
+            /* Anti-quotations and escaped characters end the current start-of-line whitespace. */
+            if (atStartOfLine) {
+                atStartOfLine = false;
+                if (curIndent < minIndent) minIndent = curIndent;
+            }
+            continue;
+        }
+        for (size_t j = 0; j < str->s.size(); ++j) {
+            if (atStartOfLine) {
+                if (str->s[j] == ' ')
+                    curIndent++;
+                else if (str->s[j] == '\n') {
+                    /* Empty line, doesn't influence minimum
+                       indentation. */
+                    curIndent = 0;
+                } else {
+                    atStartOfLine = false;
+                    if (curIndent < minIndent) minIndent = curIndent;
+                }
+            } else if (str->s[j] == '\n') {
+                atStartOfLine = true;
+                curIndent = 0;
+            }
+        }
+    }
+
+    /* Strip spaces from each line. */
+    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2;
+    atStartOfLine = true;
+    size_t curDropped = 0;
+    size_t n = es.size();
+    auto i = es.begin();
+    const auto trimExpr = [&] (std::unique_ptr<Expr> e) {
+        atStartOfLine = false;
+        curDropped = 0;
+        es2.emplace_back(i->first, std::move(e));
+    };
+    const auto trimString = [&] (const StringToken t) {
+        std::string s2;
+        for (size_t j = 0; j < t.s.size(); ++j) {
+            if (atStartOfLine) {
+                if (t.s[j] == ' ') {
+                    if (curDropped++ >= minIndent)
+                        s2 += t.s[j];
+                }
+                else if (t.s[j] == '\n') {
+                    curDropped = 0;
+                    s2 += t.s[j];
+                } else {
+                    atStartOfLine = false;
+                    curDropped = 0;
+                    s2 += t.s[j];
+                }
+            } else {
+                s2 += t.s[j];
+                if (t.s[j] == '\n') atStartOfLine = true;
+            }
+        }
+
+        /* Remove the last line if it is empty and consists only of
+           spaces. */
+        if (n == 1) {
+            std::string::size_type p = s2.find_last_of('\n');
+            if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos)
+                s2 = std::string(s2, 0, p + 1);
+        }
+
+        es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2)));
+    };
+    for (; i != es.end(); ++i, --n) {
+        std::visit(overloaded { trimExpr, trimString }, std::move(i->second));
+    }
+
+    /* If this is a single string, then don't do a concatenation. */
+    if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) {
+        return std::move(es2[0].second);
+    }
+    return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2));
+}
+
+}
-- 
cgit v1.2.3