From c852ae60da16b890f77e9e1b274d31dced73ae66 Mon Sep 17 00:00:00 2001 From: piegames Date: Tue, 1 Oct 2024 18:41:54 +0200 Subject: libexpr: Rewrite stripIndentation for indented strings This commit should faithfully reproduce the old behavior down to the bugs. The new code is a lot more readable, all quirks are well documented, and it is overall much more maintainable. Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade --- src/libexpr/parser/state.hh | 194 ++++++++++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 80 deletions(-) (limited to 'src/libexpr/parser/state.hh') diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh index 3b9b90b94..62bf08ae7 100644 --- a/src/libexpr/parser/state.hh +++ b/src/libexpr/parser/state.hh @@ -9,10 +9,28 @@ namespace nix::parser { struct StringToken { std::string_view s; - bool hasIndentation = false; + // canMerge is only used to faithfully reproduce the quirks from the old code base. + bool canMerge = false; operator std::string_view() const { return s; } }; +struct IndStringLine { + // String containing only the leading whitespace of the line. May be empty. + std::string_view indentation; + // Position of the line start (before the indentation) + PosIdx pos; + + // Whether the line contains anything besides indentation and line break + bool hasContent = false; + + std::vector< + std::pair< + PosIdx, + std::variant, StringToken> + > + > parts = {}; +}; + struct State { SymbolTable & symbols; @@ -27,8 +45,7 @@ struct State void overridesFound(const PosIdx pos); void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos); std::unique_ptr validateFormals(std::unique_ptr formals, PosIdx pos = noPos, Symbol arg = {}); - std::unique_ptr stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es); + std::unique_ptr stripIndentation(const PosIdx pos, std::vector && line); // lazy positioning means we don't get byte offsets directly, in.position() would work // but also requires line and column (which is expensive) @@ -182,98 +199,115 @@ inline std::unique_ptr State::validateFormals(std::unique_ptr return formals; } -inline std::unique_ptr State::stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es) +inline std::unique_ptr State::stripIndentation( + const PosIdx pos, + std::vector && lines) { - if (es.empty()) return std::make_unique(""); + /* If the only line is whitespace-only, directly return empty string. + * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString` + * later on relies on the string not being empty for working. + */ + if (lines.size() == 1 && lines.front().parts.empty()) { + return std::make_unique(""); + } + + /* If the last line only contains whitespace, trim it to not cause excessive whitespace. + * (Other whitespace-only lines get stripped only of the common indentation, and excess + * whitespace becomes part of the string.) + */ + if (lines.back().parts.empty()) { + lines.back().indentation = {}; + } + + /* + * Quirk compatibility: + * + * » nix-instantiate --parse -E $'\'\'${"foo"}\'\'' + * "foo" + * » nix-instantiate --parse -E $'\'\' ${"foo"}\'\'' + * ("" + "foo") + * + * Our code always produces the form with the additional "" +, so we'll manually + * strip it at the end if necessary. + */ + const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty(); - /* Figure out the minimum indentation. Note that by design - whitespace-only final lines are not taken into account. (So - the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ - bool atStartOfLine = true; /* = seen only whitespace in the current line */ + /* Figure out the minimum indentation. Note that by design + whitespace-only lines are not taken into account. */ size_t minIndent = 1000000; - size_t curIndent = 0; - for (auto & [i_pos, i] : es) { - auto * str = std::get_if(&i); - if (!str || !str->hasIndentation) { - /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ - if (atStartOfLine) { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - continue; - } - for (size_t j = 0; j < str->s.size(); ++j) { - if (atStartOfLine) { - if (str->s[j] == ' ') - curIndent++; - else if (str->s[j] == '\n') { - /* Empty line, doesn't influence minimum - indentation. */ - curIndent = 0; - } else { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - } else if (str->s[j] == '\n') { - atStartOfLine = true; - curIndent = 0; - } + for (auto & line : lines) { + if (line.hasContent) { + minIndent = std::min(minIndent, line.indentation.size()); } } /* Strip spaces from each line. */ - std::vector>> es2; - atStartOfLine = true; - size_t curDropped = 0; - size_t n = es.size(); - auto i = es.begin(); - const auto trimExpr = [&] (std::unique_ptr e) { - atStartOfLine = false; - curDropped = 0; - es2.emplace_back(i->first, std::move(e)); + for (auto & line : lines) { + line.indentation.remove_prefix(std::min(minIndent, line.indentation.size())); + } + + /* Concat the parts together again */ + + /* Note that we don't concat all adjacent string parts to fully reproduce the original code. + * This means that any escapes will result in string concatenation even if this is unnecessary. + */ + std::vector>> parts; + /* Accumulator for merging intermediates */ + PosIdx merged_pos; + std::string merged = ""; + bool has_merged = false; + + auto push_merged = [&] (PosIdx i_pos, std::string_view str) { + merged += str; + if (!has_merged) { + has_merged = true; + merged_pos = i_pos; + } }; - const auto trimString = [&] (const StringToken t) { - std::string s2; - for (size_t j = 0; j < t.s.size(); ++j) { - if (atStartOfLine) { - if (t.s[j] == ' ') { - if (curDropped++ >= minIndent) - s2 += t.s[j]; - } - else if (t.s[j] == '\n') { - curDropped = 0; - s2 += t.s[j]; - } else { - atStartOfLine = false; - curDropped = 0; - s2 += t.s[j]; - } - } else { - s2 += t.s[j]; - if (t.s[j] == '\n') atStartOfLine = true; - } + + auto flush_merged = [&] () { + if (has_merged) { + parts.emplace_back(merged_pos, std::make_unique(std::string(merged))); + merged.clear(); + has_merged = false; } + }; - /* Remove the last line if it is empty and consists only of - spaces. */ - if (n == 1) { - std::string::size_type p = s2.find_last_of('\n'); - if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) - s2 = std::string(s2, 0, p + 1); + for (auto && [li, line] : enumerate(lines)) { + /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */ + if (!compatStripLeadingEmptyString || li != 0) { + push_merged(line.pos, line.indentation); } - es2.emplace_back(i->first, std::make_unique(std::move(s2))); - }; - for (; i != es.end(); ++i, --n) { - std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + for (auto & val : line.parts) { + auto &[i_pos, item] = val; + + std::visit(overloaded{ + [&](StringToken str) { + if (str.canMerge) { + push_merged(i_pos, str.s); + } else { + flush_merged(); + parts.emplace_back(i_pos, std::make_unique(std::string(str.s))); + } + }, + [&](std::unique_ptr expr) { + flush_merged(); + parts.emplace_back(i_pos, std::move(expr)); + }, + }, std::move(item)); + } } - /* If this is a single string, then don't do a concatenation. */ - if (es2.size() == 1 && dynamic_cast(es2[0].second.get())) { - return std::move(es2[0].second); + flush_merged(); + + /* If this is a single string, then don't do a concatenation. + * (If it's a single expression, still do the ConcatStrings to properly force it being a string.) + */ + if (parts.size() == 1 && dynamic_cast(parts[0].second.get())) { + return std::move(parts[0].second); } - return std::make_unique(pos, true, std::move(es2)); + return std::make_unique(pos, true, std::move(parts)); } } -- cgit v1.2.3