aboutsummaryrefslogtreecommitdiff
path: root/src/libexpr/parser
diff options
context:
space:
mode:
Diffstat (limited to 'src/libexpr/parser')
-rw-r--r--src/libexpr/parser/grammar.hh63
-rw-r--r--src/libexpr/parser/parser-impl1.inc.cc32
-rw-r--r--src/libexpr/parser/state.hh194
3 files changed, 178 insertions, 111 deletions
diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh
index 6e42609c0..92721fa20 100644
--- a/src/libexpr/parser/grammar.hh
+++ b/src/libexpr/parser/grammar.hh
@@ -225,7 +225,8 @@ struct string : _string, seq<
> {};
struct _ind_string {
- template<bool Indented, typename... Inner>
+ struct line_start : semantic, star<one<' '>> {};
+ template<bool CanMerge, typename... Inner>
struct literal : semantic, seq<Inner...> {};
struct interpolation : semantic, seq<
p::string<'$', '{'>, seps,
@@ -233,34 +234,54 @@ struct _ind_string {
must<one<'}'>>
> {};
struct escape : semantic, must<any> {};
+ /* Marker for non-empty lines */
+ struct has_content : semantic, seq<> {};
};
struct ind_string : _ind_string, seq<
TAO_PEGTL_STRING("''"),
+ // Strip first line completely if empty
opt<star<one<' '>>, one<'\n'>>,
- star<
- sor<
- _ind_string::literal<
- true,
+ list<
+ seq<
+ // Start a line with some indentation
+ // (we always match even the empty string if no indentation, as this creates the line)
+ _ind_string::line_start,
+ // The actual line
+ opt<
plus<
sor<
- not_one<'$', '\''>,
- seq<one<'$'>, not_one<'{', '\''>>,
- seq<one<'\''>, not_one<'\'', '$'>>
- >
- >
- >,
- _ind_string::interpolation,
- _ind_string::literal<false, one<'$'>>,
- _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>,
- seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>,
- seq<
- p::string<'\'', '\''>,
- sor<
- _ind_string::literal<false, one<'$'>>,
- seq<one<'\\'>, _ind_string::escape>
+ _ind_string::literal<
+ true,
+ plus<
+ sor<
+ not_one<'$', '\'', '\n'>,
+ // TODO probably factor this out like the others for performance
+ seq<one<'$'>, not_one<'{', '\'', '\n'>>,
+ seq<one<'$'>, at<one<'\n'>>>,
+ seq<one<'\''>, not_one<'\'', '$', '\n'>>,
+ seq<one<'\''>, at<one<'\n'>>>
+ >
+ >
+ >,
+ _ind_string::interpolation,
+ _ind_string::literal<false, one<'$'>>,
+ _ind_string::literal<false, one<'\''>, not_at<one<'\''>>>,
+ seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>,
+ seq<
+ p::string<'\'', '\''>,
+ sor<
+ _ind_string::literal<false, one<'$'>>,
+ seq<one<'\\'>, _ind_string::escape>
+ >
+ >
+ >,
+ _ind_string::has_content
>
>
- >
+ >,
+ // End of line, LF. CR is just ignored and not treated as ending a line
+ // (for the purpose of indentation stripping)
+ _ind_string::literal<true, one<'\n'>>
>,
must<TAO_PEGTL_STRING("''")>
> {};
diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc
index 0d41324b3..c65fb3ddc 100644
--- a/src/libexpr/parser/parser-impl1.inc.cc
+++ b/src/libexpr/parser/parser-impl1.inc.cc
@@ -533,36 +533,48 @@ template<> struct BuildAST<grammar::v1::string> : change_head<StringState> {
struct IndStringState : SubexprState {
using SubexprState::SubexprState;
- std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts;
+ std::vector<IndStringLine> lines;
};
-template<bool Indented, typename... Content>
-struct BuildAST<grammar::v1::ind_string::literal<Indented, Content...>> {
+template<> struct BuildAST<grammar::v1::ind_string::line_start> {
static void apply(const auto & in, IndStringState & s, State & ps) {
- s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented});
+ s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) });
+ }
+};
+
+template<bool CanMerge, typename... Content>
+struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> {
+ static void apply(const auto & in, IndStringState & s, State & ps) {
+ s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge });
}
};
template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
static void apply(const auto & in, IndStringState & s, State & ps) {
- s.parts.emplace_back(ps.at(in), s->popExprOnly());
+ s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly());
}
};
template<> struct BuildAST<grammar::v1::ind_string::escape> {
static void apply(const auto & in, IndStringState & s, State & ps) {
switch (*in.begin()) {
- case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
- case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
- case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
- default: s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
+ case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
+ case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
+ case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
+ default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
}
}
};
+template<> struct BuildAST<grammar::v1::ind_string::has_content> {
+ static void apply(const auto & in, IndStringState & s, State & ps) {
+ s.lines.back().hasContent = true;
+ }
+};
+
template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> {
static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
- e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts)));
+ e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
}
};
diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh
index 3b9b90b94..62bf08ae7 100644
--- a/src/libexpr/parser/state.hh
+++ b/src/libexpr/parser/state.hh
@@ -9,10 +9,28 @@ namespace nix::parser {
struct StringToken
{
std::string_view s;
- bool hasIndentation = false;
+ // canMerge is only used to faithfully reproduce the quirks from the old code base.
+ bool canMerge = false;
operator std::string_view() const { return s; }
};
+struct IndStringLine {
+ // String containing only the leading whitespace of the line. May be empty.
+ std::string_view indentation;
+ // Position of the line start (before the indentation)
+ PosIdx pos;
+
+ // Whether the line contains anything besides indentation and line break
+ bool hasContent = false;
+
+ std::vector<
+ std::pair<
+ PosIdx,
+ std::variant<std::unique_ptr<Expr>, StringToken>
+ >
+ > parts = {};
+};
+
struct State
{
SymbolTable & symbols;
@@ -27,8 +45,7 @@ struct State
void overridesFound(const PosIdx pos);
void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
- std::unique_ptr<Expr> stripIndentation(const PosIdx pos,
- std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es);
+ std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line);
// lazy positioning means we don't get byte offsets directly, in.position() would work
// but also requires line and column (which is expensive)
@@ -182,98 +199,115 @@ inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals>
return formals;
}
-inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos,
- std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es)
+inline std::unique_ptr<Expr> State::stripIndentation(
+ const PosIdx pos,
+ std::vector<IndStringLine> && lines)
{
- if (es.empty()) return std::make_unique<ExprString>("");
+ /* If the only line is whitespace-only, directly return empty string.
+ * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString`
+ * later on relies on the string not being empty for working.
+ */
+ if (lines.size() == 1 && lines.front().parts.empty()) {
+ return std::make_unique<ExprString>("");
+ }
+
+ /* If the last line only contains whitespace, trim it to not cause excessive whitespace.
+ * (Other whitespace-only lines get stripped only of the common indentation, and excess
+ * whitespace becomes part of the string.)
+ */
+ if (lines.back().parts.empty()) {
+ lines.back().indentation = {};
+ }
+
+ /*
+ * Quirk compatibility:
+ *
+ * » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
+ * "foo"
+ * » nix-instantiate --parse -E $'\'\' ${"foo"}\'\''
+ * ("" + "foo")
+ *
+ * Our code always produces the form with the additional "" +, so we'll manually
+ * strip it at the end if necessary.
+ */
+ const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
- /* Figure out the minimum indentation. Note that by design
- whitespace-only final lines are not taken into account. (So
- the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */
- bool atStartOfLine = true; /* = seen only whitespace in the current line */
+ /* Figure out the minimum indentation. Note that by design
+ whitespace-only lines are not taken into account. */
size_t minIndent = 1000000;
- size_t curIndent = 0;
- for (auto & [i_pos, i] : es) {
- auto * str = std::get_if<StringToken>(&i);
- if (!str || !str->hasIndentation) {
- /* Anti-quotations and escaped characters end the current start-of-line whitespace. */
- if (atStartOfLine) {
- atStartOfLine = false;
- if (curIndent < minIndent) minIndent = curIndent;
- }
- continue;
- }
- for (size_t j = 0; j < str->s.size(); ++j) {
- if (atStartOfLine) {
- if (str->s[j] == ' ')
- curIndent++;
- else if (str->s[j] == '\n') {
- /* Empty line, doesn't influence minimum
- indentation. */
- curIndent = 0;
- } else {
- atStartOfLine = false;
- if (curIndent < minIndent) minIndent = curIndent;
- }
- } else if (str->s[j] == '\n') {
- atStartOfLine = true;
- curIndent = 0;
- }
+ for (auto & line : lines) {
+ if (line.hasContent) {
+ minIndent = std::min(minIndent, line.indentation.size());
}
}
/* Strip spaces from each line. */
- std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2;
- atStartOfLine = true;
- size_t curDropped = 0;
- size_t n = es.size();
- auto i = es.begin();
- const auto trimExpr = [&] (std::unique_ptr<Expr> e) {
- atStartOfLine = false;
- curDropped = 0;
- es2.emplace_back(i->first, std::move(e));
+ for (auto & line : lines) {
+ line.indentation.remove_prefix(std::min(minIndent, line.indentation.size()));
+ }
+
+ /* Concat the parts together again */
+
+ /* Note that we don't concat all adjacent string parts to fully reproduce the original code.
+ * This means that any escapes will result in string concatenation even if this is unnecessary.
+ */
+ std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
+ /* Accumulator for merging intermediates */
+ PosIdx merged_pos;
+ std::string merged = "";
+ bool has_merged = false;
+
+ auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
+ merged += str;
+ if (!has_merged) {
+ has_merged = true;
+ merged_pos = i_pos;
+ }
};
- const auto trimString = [&] (const StringToken t) {
- std::string s2;
- for (size_t j = 0; j < t.s.size(); ++j) {
- if (atStartOfLine) {
- if (t.s[j] == ' ') {
- if (curDropped++ >= minIndent)
- s2 += t.s[j];
- }
- else if (t.s[j] == '\n') {
- curDropped = 0;
- s2 += t.s[j];
- } else {
- atStartOfLine = false;
- curDropped = 0;
- s2 += t.s[j];
- }
- } else {
- s2 += t.s[j];
- if (t.s[j] == '\n') atStartOfLine = true;
- }
+
+ auto flush_merged = [&] () {
+ if (has_merged) {
+ parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
+ merged.clear();
+ has_merged = false;
}
+ };
- /* Remove the last line if it is empty and consists only of
- spaces. */
- if (n == 1) {
- std::string::size_type p = s2.find_last_of('\n');
- if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos)
- s2 = std::string(s2, 0, p + 1);
+ for (auto && [li, line] : enumerate(lines)) {
+ /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
+ if (!compatStripLeadingEmptyString || li != 0) {
+ push_merged(line.pos, line.indentation);
}
- es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2)));
- };
- for (; i != es.end(); ++i, --n) {
- std::visit(overloaded { trimExpr, trimString }, std::move(i->second));
+ for (auto & val : line.parts) {
+ auto &[i_pos, item] = val;
+
+ std::visit(overloaded{
+ [&](StringToken str) {
+ if (str.canMerge) {
+ push_merged(i_pos, str.s);
+ } else {
+ flush_merged();
+ parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
+ }
+ },
+ [&](std::unique_ptr<Expr> expr) {
+ flush_merged();
+ parts.emplace_back(i_pos, std::move(expr));
+ },
+ }, std::move(item));
+ }
}
- /* If this is a single string, then don't do a concatenation. */
- if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) {
- return std::move(es2[0].second);
+ flush_merged();
+
+ /* If this is a single string, then don't do a concatenation.
+ * (If it's a single expression, still do the ConcatStrings to properly force it being a string.)
+ */
+ if (parts.size() == 1 && dynamic_cast<ExprString *>(parts[0].second.get())) {
+ return std::move(parts[0].second);
}
- return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2));
+ return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts));
}
}