libexpr: Rewrite stripIndentation for indented strings

This commit should faithfully reproduce the old behavior down to the bugs. The new code is a lot more readable, all quirks are well documented, and it is overall much more maintainable. Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade
author: piegames <git@piegames.de> 2024-10-01 18:41:54 +0200
committer: piegames <git@piegames.de> 2024-10-18 11:40:04 +0000
commit: c852ae60da16b890f77e9e1b274d31dced73ae66 (patch)
tree: a9764d48695f30606b2d775139b7f899ec00f596
parent: 765771a355cc61340a87d6fc361047c1d32d75b7 (diff)
3 files changed, 178 insertions, 111 deletions
diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh
index 6e42609c0..92721fa20 100644
--- a/src/libexpr/parser/grammar.hh
+++ b/src/libexpr/parser/grammar.hh
@@ -225,7 +225,8 @@ struct string : _string, seq<
 > {};
 
 struct _ind_string {
-    template<bool Indented, typename... Inner>
+    struct line_start : semantic, star<one<' '>> {};
+    template<bool CanMerge, typename... Inner>
     struct literal : semantic, seq<Inner...> {};
     struct interpolation : semantic, seq<
         p::string<'$', '{'>, seps,
@@ -233,34 +234,54 @@ struct _ind_string {
         must<one<'}'>>
     > {};
     struct escape : semantic, must<any> {};
+    /* Marker for non-empty lines */
+    struct has_content : semantic, seq<> {};
 };
 struct ind_string : _ind_string, seq<
     TAO_PEGTL_STRING("''"),
+    // Strip first line completely if empty
     opt<star<one<' '>>, one<'\n'>>,
-    star<
-        sor<
-            _ind_string::literal<
-                true,
+    list<
+        seq<
+            // Start a line with some indentation
+            // (we always match even the empty string if no indentation, as this creates the line)
+            _ind_string::line_start,
+            // The actual line
+            opt<
                 plus<
                     sor<
-                        not_one<'$', '\''>,
-                        seq<one<'$'>, not_one<'{', '\''>>,
-                        seq<one<'\''>, not_one<'\'', '$'>>
-                    >
-                >
-            >,
-            _ind_string::interpolation,
-            _ind_string::literal<false, one<'$'>>,
-            _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>,
-            seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>,
-            seq<
-                p::string<'\'', '\''>,
-                sor<
-                    _ind_string::literal<false, one<'$'>>,
-                    seq<one<'\\'>, _ind_string::escape>
+                        _ind_string::literal<
+                            true,
+                            plus<
+                                sor<
+                                    not_one<'$', '\'', '\n'>,
+                                    // TODO probably factor this out like the others for performance
+                                    seq<one<'$'>, not_one<'{', '\'', '\n'>>,
+                                    seq<one<'$'>, at<one<'\n'>>>,
+                                    seq<one<'\''>, not_one<'\'', '$', '\n'>>,
+                                    seq<one<'\''>, at<one<'\n'>>>
+                                >
+                            >
+                        >,
+                        _ind_string::interpolation,
+                        _ind_string::literal<false, one<'$'>>,
+                        _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>,
+                        seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>,
+                        seq<
+                            p::string<'\'', '\''>,
+                            sor<
+                                _ind_string::literal<false, one<'$'>>,
+                                seq<one<'\\'>, _ind_string::escape>
+                            >
+                        >
+                    >,
+                    _ind_string::has_content
                 >
             >
-        >
+        >,
+        // End of line, LF. CR is just ignored and not treated as ending a line
+        // (for the purpose of indentation stripping)
+        _ind_string::literal<true, one<'\n'>>
     >,
     must<TAO_PEGTL_STRING("''")>
 > {};
diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc
index 0d41324b3..c65fb3ddc 100644
--- a/src/libexpr/parser/parser-impl1.inc.cc
+++ b/src/libexpr/parser/parser-impl1.inc.cc
@@ -533,36 +533,48 @@ template<> struct BuildAST<grammar::v1::string> : change_head<StringState> {
 struct IndStringState : SubexprState {
     using SubexprState::SubexprState;
 
-    std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts;
+    std::vector<IndStringLine> lines;
 };
 
-template<bool Indented, typename... Content>
-struct BuildAST<grammar::v1::ind_string::literal<Indented, Content...>> {
+template<> struct BuildAST<grammar::v1::ind_string::line_start> {
     static void apply(const auto & in, IndStringState & s, State & ps) {
-        s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented});
+        s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) });
+    }
+};
+
+template<bool CanMerge, typename... Content>
+struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> {
+    static void apply(const auto & in, IndStringState & s, State & ps) {
+        s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge });
     }
 };
 
 template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
     static void apply(const auto & in, IndStringState & s, State & ps) {
-        s.parts.emplace_back(ps.at(in), s->popExprOnly());
+        s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly());
     }
 };
 
 template<> struct BuildAST<grammar::v1::ind_string::escape> {
     static void apply(const auto & in, IndStringState & s, State & ps) {
         switch (*in.begin()) {
-        case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
-        case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
-        case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
-        default:  s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
+        case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
+        case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
+        case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
+        default:  s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
         }
     }
 };
 
+template<> struct BuildAST<grammar::v1::ind_string::has_content> {
+    static void apply(const auto & in, IndStringState & s, State & ps) {
+        s.lines.back().hasContent = true;
+    }
+};
+
 template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> {
     static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
-        e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts)));
+        e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
     }
 };
 
diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh
index 3b9b90b94..62bf08ae7 100644
--- a/src/libexpr/parser/state.hh
+++ b/src/libexpr/parser/state.hh
@@ -9,10 +9,28 @@ namespace nix::parser {
 struct StringToken
 {
     std::string_view s;
-    bool hasIndentation = false;
+    // canMerge is only used to faithfully reproduce the quirks from the old code base.
+    bool canMerge = false;
     operator std::string_view() const { return s; }
 };
 
+struct IndStringLine {
+    // String containing only the leading whitespace of the line. May be empty.
+    std::string_view indentation;
+    // Position of the line start (before the indentation)
+    PosIdx pos;
+
+    // Whether the line contains anything besides indentation and line break
+    bool hasContent = false;
+
+    std::vector<
+        std::pair<
+            PosIdx,
+            std::variant<std::unique_ptr<Expr>, StringToken>
+        >
+    > parts = {};
+};
+
 struct State
 {
     SymbolTable & symbols;
@@ -27,8 +45,7 @@ struct State
     void overridesFound(const PosIdx pos);
     void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
     std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
-    std::unique_ptr<Expr> stripIndentation(const PosIdx pos,
-        std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es);
+    std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line);
 
     // lazy positioning means we don't get byte offsets directly, in.position() would work
     // but also requires line and column (which is expensive)
@@ -182,98 +199,115 @@ inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals>
     return formals;
 }
 
-inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos,
-    std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es)
+inline std::unique_ptr<Expr> State::stripIndentation(
+    const PosIdx pos,
+    std::vector<IndStringLine> && lines)
 {
-    if (es.empty()) return std::make_unique<ExprString>("");
+    /* If the only line is whitespace-only, directly return empty string.
+     * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString`
+     * later on relies on the string not being empty for working.
+     */
+    if (lines.size() == 1 && lines.front().parts.empty()) {
+        return std::make_unique<ExprString>("");
+    }
+
+    /* If the last line only contains whitespace, trim it to not cause excessive whitespace.
+     * (Other whitespace-only lines get stripped only of the common indentation, and excess
+     * whitespace becomes part of the string.)
+     */
+    if (lines.back().parts.empty()) {
+        lines.back().indentation = {};
+    }
+
+    /*
+     * Quirk compatibility:
+     *
+     * » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
+     * "foo"
+     * » nix-instantiate --parse -E $'\'\'    ${"foo"}\'\''
+     * ("" + "foo")
+     *
+     * Our code always produces the form with the additional "" +, so we'll manually
+     * strip it at the end if necessary.
+     */
+    const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
 
-    /* Figure out the minimum indentation.  Note that by design
-       whitespace-only final lines are not taken into account.  (So
-       the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */
-    bool atStartOfLine = true; /* = seen only whitespace in the current line */
+    /* Figure out the minimum indentation. Note that by design
+       whitespace-only lines are not taken into account. */
     size_t minIndent = 1000000;
-    size_t curIndent = 0;
-    for (auto & [i_pos, i] : es) {
-        auto * str = std::get_if<StringToken>(&i);
-        if (!str || !str->hasIndentation) {
-            /* Anti-quotations and escaped characters end the current start-of-line whitespace. */
-            if (atStartOfLine) {
-                atStartOfLine = false;
-                if (curIndent < minIndent) minIndent = curIndent;
-            }
-            continue;
-        }
-        for (size_t j = 0; j < str->s.size(); ++j) {
-            if (atStartOfLine) {
-                if (str->s[j] == ' ')
-                    curIndent++;
-                else if (str->s[j] == '\n') {
-                    /* Empty line, doesn't influence minimum
-                       indentation. */
-                    curIndent = 0;
-                } else {
-                    atStartOfLine = false;
-                    if (curIndent < minIndent) minIndent = curIndent;
-                }
-            } else if (str->s[j] == '\n') {
-                atStartOfLine = true;
-                curIndent = 0;
-            }
+    for (auto & line : lines) {
+        if (line.hasContent) {
+            minIndent = std::min(minIndent, line.indentation.size());
         }
     }
 
     /* Strip spaces from each line. */
-    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2;
-    atStartOfLine = true;
-    size_t curDropped = 0;
-    size_t n = es.size();
-    auto i = es.begin();
-    const auto trimExpr = [&] (std::unique_ptr<Expr> e) {
-        atStartOfLine = false;
-        curDropped = 0;
-        es2.emplace_back(i->first, std::move(e));
+    for (auto & line : lines) {
+        line.indentation.remove_prefix(std::min(minIndent, line.indentation.size()));
+    }
+
+    /* Concat the parts together again */
+
+    /* Note that we don't concat all adjacent string parts to fully reproduce the original code.
+     * This means that any escapes will result in string concatenation even if this is unnecessary.
+     */
+    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
+    /* Accumulator for merging intermediates */
+    PosIdx merged_pos;
+    std::string merged = "";
+    bool has_merged = false;
+
+    auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
+        merged += str;
+        if (!has_merged) {
+            has_merged = true;
+            merged_pos = i_pos;
+        }
     };
-    const auto trimString = [&] (const StringToken t) {
-        std::string s2;
-        for (size_t j = 0; j < t.s.size(); ++j) {
-            if (atStartOfLine) {
-                if (t.s[j] == ' ') {
-                    if (curDropped++ >= minIndent)
-                        s2 += t.s[j];
-                }
-                else if (t.s[j] == '\n') {
-                    curDropped = 0;
-                    s2 += t.s[j];
-                } else {
-                    atStartOfLine = false;
-                    curDropped = 0;
-                    s2 += t.s[j];
-                }
-            } else {
-                s2 += t.s[j];
-                if (t.s[j] == '\n') atStartOfLine = true;
-            }
+
+    auto flush_merged = [&] () {
+        if (has_merged) {
+            parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
+            merged.clear();
+            has_merged = false;
         }
+    };
 
-        /* Remove the last line if it is empty and consists only of
-           spaces. */
-        if (n == 1) {
-            std::string::size_type p = s2.find_last_of('\n');
-            if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos)
-                s2 = std::string(s2, 0, p + 1);
+    for (auto && [li, line] : enumerate(lines)) {
+        /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
+        if (!compatStripLeadingEmptyString || li != 0) {
+            push_merged(line.pos, line.indentation);
         }
 
-        es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2)));
-    };
-    for (; i != es.end(); ++i, --n) {
-        std::visit(overloaded { trimExpr, trimString }, std::move(i->second));
+        for (auto & val : line.parts) {
+            auto &[i_pos, item] = val;
+
+            std::visit(overloaded{
+                [&](StringToken str) {
+                    if (str.canMerge) {
+                        push_merged(i_pos, str.s);
+                    } else {
+                        flush_merged();
+                        parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
+                    }
+                },
+                [&](std::unique_ptr<Expr> expr) {
+                    flush_merged();
+                    parts.emplace_back(i_pos, std::move(expr));
+                },
+            }, std::move(item));
+        }
     }
 
-    /* If this is a single string, then don't do a concatenation. */
-    if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) {
-        return std::move(es2[0].second);
+    flush_merged();
+
+    /* If this is a single string, then don't do a concatenation.
+     * (If it's a single expression, still do the ConcatStrings to properly force it being a string.)
+     */
+    if (parts.size() == 1 && dynamic_cast<ExprString *>(parts[0].second.get())) {
+        return std::move(parts[0].second);
     }
-    return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2));
+    return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts));
 }
 
 }
author	piegames <git@piegames.de>	2024-10-01 18:41:54 +0200
committer	piegames <git@piegames.de>	2024-10-18 11:40:04 +0000
commit	c852ae60da16b890f77e9e1b274d31dced73ae66 (patch)
tree	a9764d48695f30606b2d775139b7f899ec00f596
parent	765771a355cc61340a87d6fc361047c1d32d75b7 (diff)