diff options
author | piegames <git@piegames.de> | 2024-10-01 18:41:54 +0200 |
---|---|---|
committer | piegames <git@piegames.de> | 2024-10-18 11:40:04 +0000 |
commit | c852ae60da16b890f77e9e1b274d31dced73ae66 (patch) | |
tree | a9764d48695f30606b2d775139b7f899ec00f596 | |
parent | 765771a355cc61340a87d6fc361047c1d32d75b7 (diff) |
libexpr: Rewrite stripIndentation for indented strings
This commit should faithfully reproduce the old behavior down to the
bugs. The new code is a lot more readable, all quirks are well
documented, and it is overall much more maintainable.
Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade
-rw-r--r-- | src/libexpr/parser/grammar.hh | 63 | ||||
-rw-r--r-- | src/libexpr/parser/parser-impl1.inc.cc | 32 | ||||
-rw-r--r-- | src/libexpr/parser/state.hh | 194 |
3 files changed, 178 insertions, 111 deletions
diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh index 6e42609c0..92721fa20 100644 --- a/src/libexpr/parser/grammar.hh +++ b/src/libexpr/parser/grammar.hh @@ -225,7 +225,8 @@ struct string : _string, seq< > {}; struct _ind_string { - template<bool Indented, typename... Inner> + struct line_start : semantic, star<one<' '>> {}; + template<bool CanMerge, typename... Inner> struct literal : semantic, seq<Inner...> {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, @@ -233,34 +234,54 @@ struct _ind_string { must<one<'}'>> > {}; struct escape : semantic, must<any> {}; + /* Marker for non-empty lines */ + struct has_content : semantic, seq<> {}; }; struct ind_string : _ind_string, seq< TAO_PEGTL_STRING("''"), + // Strip first line completely if empty opt<star<one<' '>>, one<'\n'>>, - star< - sor< - _ind_string::literal< - true, + list< + seq< + // Start a line with some indentation + // (we always match even the empty string if no indentation, as this creates the line) + _ind_string::line_start, + // The actual line + opt< plus< sor< - not_one<'$', '\''>, - seq<one<'$'>, not_one<'{', '\''>>, - seq<one<'\''>, not_one<'\'', '$'>> - > - > - >, - _ind_string::interpolation, - _ind_string::literal<false, one<'$'>>, - _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>, - seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>, - seq< - p::string<'\'', '\''>, - sor< - _ind_string::literal<false, one<'$'>>, - seq<one<'\\'>, _ind_string::escape> + _ind_string::literal< + true, + plus< + sor< + not_one<'$', '\'', '\n'>, + // TODO probably factor this out like the others for performance + seq<one<'$'>, not_one<'{', '\'', '\n'>>, + seq<one<'$'>, at<one<'\n'>>>, + seq<one<'\''>, not_one<'\'', '$', '\n'>>, + seq<one<'\''>, at<one<'\n'>>> + > + > + >, + _ind_string::interpolation, + _ind_string::literal<false, one<'$'>>, + _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>, + seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>, + seq< + p::string<'\'', '\''>, + sor< + _ind_string::literal<false, one<'$'>>, + seq<one<'\\'>, _ind_string::escape> + > + > + >, + _ind_string::has_content > > - > + >, + // End of line, LF. CR is just ignored and not treated as ending a line + // (for the purpose of indentation stripping) + _ind_string::literal<true, one<'\n'>> >, must<TAO_PEGTL_STRING("''")> > {}; diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc index 0d41324b3..c65fb3ddc 100644 --- a/src/libexpr/parser/parser-impl1.inc.cc +++ b/src/libexpr/parser/parser-impl1.inc.cc @@ -533,36 +533,48 @@ template<> struct BuildAST<grammar::v1::string> : change_head<StringState> { struct IndStringState : SubexprState { using SubexprState::SubexprState; - std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts; + std::vector<IndStringLine> lines; }; -template<bool Indented, typename... Content> -struct BuildAST<grammar::v1::ind_string::literal<Indented, Content...>> { +template<> struct BuildAST<grammar::v1::ind_string::line_start> { static void apply(const auto & in, IndStringState & s, State & ps) { - s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented}); + s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) }); + } +}; + +template<bool CanMerge, typename... Content> +struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge }); } }; template<> struct BuildAST<grammar::v1::ind_string::interpolation> { static void apply(const auto & in, IndStringState & s, State & ps) { - s.parts.emplace_back(ps.at(in), s->popExprOnly()); + s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly()); } }; template<> struct BuildAST<grammar::v1::ind_string::escape> { static void apply(const auto & in, IndStringState & s, State & ps) { switch (*in.begin()) { - case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break; - case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break; - case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break; - default: s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; + case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break; + case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break; + case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break; + default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; } } }; +template<> struct BuildAST<grammar::v1::ind_string::has_content> { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.lines.back().hasContent = true; + } +}; + template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> { static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) { - e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts))); + e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines))); } }; diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh index 3b9b90b94..62bf08ae7 100644 --- a/src/libexpr/parser/state.hh +++ b/src/libexpr/parser/state.hh @@ -9,10 +9,28 @@ namespace nix::parser { struct StringToken { std::string_view s; - bool hasIndentation = false; + // canMerge is only used to faithfully reproduce the quirks from the old code base. + bool canMerge = false; operator std::string_view() const { return s; } }; +struct IndStringLine { + // String containing only the leading whitespace of the line. May be empty. + std::string_view indentation; + // Position of the line start (before the indentation) + PosIdx pos; + + // Whether the line contains anything besides indentation and line break + bool hasContent = false; + + std::vector< + std::pair< + PosIdx, + std::variant<std::unique_ptr<Expr>, StringToken> + > + > parts = {}; +}; + struct State { SymbolTable & symbols; @@ -27,8 +45,7 @@ struct State void overridesFound(const PosIdx pos); void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos); std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {}); - std::unique_ptr<Expr> stripIndentation(const PosIdx pos, - std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es); + std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line); // lazy positioning means we don't get byte offsets directly, in.position() would work // but also requires line and column (which is expensive) @@ -182,98 +199,115 @@ inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals> return formals; } -inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos, - std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es) +inline std::unique_ptr<Expr> State::stripIndentation( + const PosIdx pos, + std::vector<IndStringLine> && lines) { - if (es.empty()) return std::make_unique<ExprString>(""); + /* If the only line is whitespace-only, directly return empty string. + * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString` + * later on relies on the string not being empty for working. + */ + if (lines.size() == 1 && lines.front().parts.empty()) { + return std::make_unique<ExprString>(""); + } + + /* If the last line only contains whitespace, trim it to not cause excessive whitespace. + * (Other whitespace-only lines get stripped only of the common indentation, and excess + * whitespace becomes part of the string.) + */ + if (lines.back().parts.empty()) { + lines.back().indentation = {}; + } + + /* + * Quirk compatibility: + * + * » nix-instantiate --parse -E $'\'\'${"foo"}\'\'' + * "foo" + * » nix-instantiate --parse -E $'\'\' ${"foo"}\'\'' + * ("" + "foo") + * + * Our code always produces the form with the additional "" +, so we'll manually + * strip it at the end if necessary. + */ + const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty(); - /* Figure out the minimum indentation. Note that by design - whitespace-only final lines are not taken into account. (So - the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ - bool atStartOfLine = true; /* = seen only whitespace in the current line */ + /* Figure out the minimum indentation. Note that by design + whitespace-only lines are not taken into account. */ size_t minIndent = 1000000; - size_t curIndent = 0; - for (auto & [i_pos, i] : es) { - auto * str = std::get_if<StringToken>(&i); - if (!str || !str->hasIndentation) { - /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ - if (atStartOfLine) { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - continue; - } - for (size_t j = 0; j < str->s.size(); ++j) { - if (atStartOfLine) { - if (str->s[j] == ' ') - curIndent++; - else if (str->s[j] == '\n') { - /* Empty line, doesn't influence minimum - indentation. */ - curIndent = 0; - } else { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - } else if (str->s[j] == '\n') { - atStartOfLine = true; - curIndent = 0; - } + for (auto & line : lines) { + if (line.hasContent) { + minIndent = std::min(minIndent, line.indentation.size()); } } /* Strip spaces from each line. */ - std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2; - atStartOfLine = true; - size_t curDropped = 0; - size_t n = es.size(); - auto i = es.begin(); - const auto trimExpr = [&] (std::unique_ptr<Expr> e) { - atStartOfLine = false; - curDropped = 0; - es2.emplace_back(i->first, std::move(e)); + for (auto & line : lines) { + line.indentation.remove_prefix(std::min(minIndent, line.indentation.size())); + } + + /* Concat the parts together again */ + + /* Note that we don't concat all adjacent string parts to fully reproduce the original code. + * This means that any escapes will result in string concatenation even if this is unnecessary. + */ + std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts; + /* Accumulator for merging intermediates */ + PosIdx merged_pos; + std::string merged = ""; + bool has_merged = false; + + auto push_merged = [&] (PosIdx i_pos, std::string_view str) { + merged += str; + if (!has_merged) { + has_merged = true; + merged_pos = i_pos; + } }; - const auto trimString = [&] (const StringToken t) { - std::string s2; - for (size_t j = 0; j < t.s.size(); ++j) { - if (atStartOfLine) { - if (t.s[j] == ' ') { - if (curDropped++ >= minIndent) - s2 += t.s[j]; - } - else if (t.s[j] == '\n') { - curDropped = 0; - s2 += t.s[j]; - } else { - atStartOfLine = false; - curDropped = 0; - s2 += t.s[j]; - } - } else { - s2 += t.s[j]; - if (t.s[j] == '\n') atStartOfLine = true; - } + + auto flush_merged = [&] () { + if (has_merged) { + parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged))); + merged.clear(); + has_merged = false; } + }; - /* Remove the last line if it is empty and consists only of - spaces. */ - if (n == 1) { - std::string::size_type p = s2.find_last_of('\n'); - if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) - s2 = std::string(s2, 0, p + 1); + for (auto && [li, line] : enumerate(lines)) { + /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */ + if (!compatStripLeadingEmptyString || li != 0) { + push_merged(line.pos, line.indentation); } - es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2))); - }; - for (; i != es.end(); ++i, --n) { - std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + for (auto & val : line.parts) { + auto &[i_pos, item] = val; + + std::visit(overloaded{ + [&](StringToken str) { + if (str.canMerge) { + push_merged(i_pos, str.s); + } else { + flush_merged(); + parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s))); + } + }, + [&](std::unique_ptr<Expr> expr) { + flush_merged(); + parts.emplace_back(i_pos, std::move(expr)); + }, + }, std::move(item)); + } } - /* If this is a single string, then don't do a concatenation. */ - if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) { - return std::move(es2[0].second); + flush_merged(); + + /* If this is a single string, then don't do a concatenation. + * (If it's a single expression, still do the ConcatStrings to properly force it being a string.) + */ + if (parts.size() == 1 && dynamic_cast<ExprString *>(parts[0].second.get())) { + return std::move(parts[0].second); } - return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2)); + return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts)); } } |