libexpr: Optimize complex indented strings

The old behavior results in lots of concatenations happening for no good reason and is an artifact of the technical limitations of the old parser (combined with some lack of care for such details). Change-Id: I0d78d6220ca6aeaa10bc437e48e08bf7922e0bb3
author: piegames <git@piegames.de> 2024-10-15 21:48:15 +0200
committer: piegames <git@piegames.de> 2024-10-18 19:37:23 +0200
commit: e5de1d13c493f80ff6cd21c51f77a4ed10088ea2 (patch)
tree: f96868291c386ef9b7ef8d3035bf58fa1821b78c /src/libexpr
parent: 878e18188215ad52784027c3489c7b4f6f0bcba3 (diff)
3 files changed, 21 insertions, 58 deletions
diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh
index 92721fa20..701b40505 100644
--- a/src/libexpr/parser/grammar.hh
+++ b/src/libexpr/parser/grammar.hh
@@ -226,7 +226,7 @@ struct string : _string, seq<
 
 struct _ind_string {
     struct line_start : semantic, star<one<' '>> {};
-    template<bool CanMerge, typename... Inner>
+    template<typename... Inner>
     struct literal : semantic, seq<Inner...> {};
     struct interpolation : semantic, seq<
         p::string<'$', '{'>, seps,
@@ -251,7 +251,6 @@ struct ind_string : _ind_string, seq<
                 plus<
                     sor<
                         _ind_string::literal<
-                            true,
                             plus<
                                 sor<
                                     not_one<'$', '\'', '\n'>,
@@ -264,13 +263,13 @@ struct ind_string : _ind_string, seq<
                             >
                         >,
                         _ind_string::interpolation,
-                        _ind_string::literal<false, one<'$'>>,
-                        _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>,
-                        seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>,
+                        _ind_string::literal<one<'$'>>,
+                        _ind_string::literal<one<'\''>, not_at<one<'\''>>>,
+                        seq<one<'\''>, _ind_string::literal<p::string<'\'', '\''>>>,
                         seq<
                             p::string<'\'', '\''>,
                             sor<
-                                _ind_string::literal<false, one<'$'>>,
+                                _ind_string::literal<one<'$'>>,
                                 seq<one<'\\'>, _ind_string::escape>
                             >
                         >
@@ -281,7 +280,7 @@ struct ind_string : _ind_string, seq<
         >,
         // End of line, LF. CR is just ignored and not treated as ending a line
         // (for the purpose of indentation stripping)
-        _ind_string::literal<true, one<'\n'>>
+        _ind_string::literal<one<'\n'>>
     >,
     must<TAO_PEGTL_STRING("''")>
 > {};
diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc
index c65fb3ddc..5836ab752 100644
--- a/src/libexpr/parser/parser-impl1.inc.cc
+++ b/src/libexpr/parser/parser-impl1.inc.cc
@@ -542,10 +542,10 @@ template<> struct BuildAST<grammar::v1::ind_string::line_start> {
     }
 };
 
-template<bool CanMerge, typename... Content>
-struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> {
+template<typename... Content>
+struct BuildAST<grammar::v1::ind_string::literal<Content...>> {
     static void apply(const auto & in, IndStringState & s, State & ps) {
-        s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge });
+        s.lines.back().parts.emplace_back(ps.at(in), in.string_view());
     }
 };
 
@@ -558,10 +558,10 @@ template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
 template<> struct BuildAST<grammar::v1::ind_string::escape> {
     static void apply(const auto & in, IndStringState & s, State & ps) {
         switch (*in.begin()) {
-        case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
-        case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
-        case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
-        default:  s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
+        case 'n': s.lines.back().parts.emplace_back(ps.at(in), "\n"); break;
+        case 'r': s.lines.back().parts.emplace_back(ps.at(in), "\r"); break;
+        case 't': s.lines.back().parts.emplace_back(ps.at(in), "\t"); break;
+        default:  s.lines.back().parts.emplace_back(ps.at(in), in.string_view()); break;
         }
     }
 };
diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh
index 62bf08ae7..b969f73e4 100644
--- a/src/libexpr/parser/state.hh
+++ b/src/libexpr/parser/state.hh
@@ -6,14 +6,6 @@
 
 namespace nix::parser {
 
-struct StringToken
-{
-    std::string_view s;
-    // canMerge is only used to faithfully reproduce the quirks from the old code base.
-    bool canMerge = false;
-    operator std::string_view() const { return s; }
-};
-
 struct IndStringLine {
     // String containing only the leading whitespace of the line. May be empty.
     std::string_view indentation;
@@ -26,7 +18,7 @@ struct IndStringLine {
     std::vector<
         std::pair<
             PosIdx,
-            std::variant<std::unique_ptr<Expr>, StringToken>
+            std::variant<std::unique_ptr<Expr>, std::string_view>
         >
     > parts = {};
 };
@@ -204,8 +196,7 @@ inline std::unique_ptr<Expr> State::stripIndentation(
     std::vector<IndStringLine> && lines)
 {
     /* If the only line is whitespace-only, directly return empty string.
-     * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString`
-     * later on relies on the string not being empty for working.
+     * The rest of the code relies on the final string not being empty.
      */
     if (lines.size() == 1 && lines.front().parts.empty()) {
         return std::make_unique<ExprString>("");
@@ -219,19 +210,6 @@ inline std::unique_ptr<Expr> State::stripIndentation(
         lines.back().indentation = {};
     }
 
-    /*
-     * Quirk compatibility:
-     *
-     * » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
-     * "foo"
-     * » nix-instantiate --parse -E $'\'\'    ${"foo"}\'\''
-     * ("" + "foo")
-     *
-     * Our code always produces the form with the additional "" +, so we'll manually
-     * strip it at the end if necessary.
-     */
-    const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
-
     /* Figure out the minimum indentation. Note that by design
        whitespace-only lines are not taken into account. */
     size_t minIndent = 1000000;
@@ -248,48 +226,34 @@ inline std::unique_ptr<Expr> State::stripIndentation(
 
     /* Concat the parts together again */
 
-    /* Note that we don't concat all adjacent string parts to fully reproduce the original code.
-     * This means that any escapes will result in string concatenation even if this is unnecessary.
-     */
     std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
     /* Accumulator for merging intermediates */
     PosIdx merged_pos;
     std::string merged = "";
-    bool has_merged = false;
 
     auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
-        merged += str;
-        if (!has_merged) {
-            has_merged = true;
+        if (merged.empty()) {
             merged_pos = i_pos;
         }
+        merged += str;
     };
 
     auto flush_merged = [&] () {
-        if (has_merged) {
+        if (!merged.empty()) {
             parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
             merged.clear();
-            has_merged = false;
         }
     };
 
     for (auto && [li, line] : enumerate(lines)) {
-        /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
-        if (!compatStripLeadingEmptyString || li != 0) {
-            push_merged(line.pos, line.indentation);
-        }
+        push_merged(line.pos, line.indentation);
 
         for (auto & val : line.parts) {
             auto &[i_pos, item] = val;
 
             std::visit(overloaded{
-                [&](StringToken str) {
-                    if (str.canMerge) {
-                        push_merged(i_pos, str.s);
-                    } else {
-                        flush_merged();
-                        parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
-                    }
+                [&](std::string_view str) {
+                    push_merged(i_pos, str);
                 },
                 [&](std::unique_ptr<Expr> expr) {
                     flush_merged();
author	piegames <git@piegames.de>	2024-10-15 21:48:15 +0200
committer	piegames <git@piegames.de>	2024-10-18 19:37:23 +0200
commit	e5de1d13c493f80ff6cd21c51f77a4ed10088ea2 (patch)
tree	f96868291c386ef9b7ef8d3035bf58fa1821b78c /src/libexpr
parent	878e18188215ad52784027c3489c7b4f6f0bcba3 (diff)