aboutsummaryrefslogtreecommitdiff
path: root/src/libexpr/primops.cc
diff options
context:
space:
mode:
author sugar <sugar@sylveon.social> 2024-08-20 00:21:59 +0200
committer sugar <sugar@sylveon.social> 2024-08-22 03:17:55 +0200
commit 447212fa65a80180150b265411924cc638a2c52c (patch)
tree 75a6a2a1ab40580d3ae176a2a812f09fc54d6779 /src/libexpr/primops.cc
parent e727dbc3a3d59d7742a24a2b394b63a04ecb4d24 (diff)
libexpr: Replace regex engine with boost::regex
This avoids the C++ standard library's regexes, which behave differently across platforms and have many other issues, such as using so much stack that they overflow it when processing a lot of data.

To avoid backward- and forward-compatibility issues, regexes are passed through a function that converts libstdc++ regexes into Boost regexes, escaping characters that Boost needs to have escaped and rejecting features that Boost has and libstdc++ doesn't.

Related context:
- The original attempt to use `boost::regex` in CppNix failed because the Boost ICU dependency was large (disabling ICU is no longer necessary, because linking ICU requires using a different header file, `boost/regex/icu.hpp`): https://github.com/NixOS/nix/pull/3826
- An attempt to use PCRE was rejected because it provided less backwards compatibility with `std::regex` than `boost::regex`: https://github.com/NixOS/nix/pull/7336
- A second attempt to use `boost::regex` failed because the `}` regex failed to compile (dealt with here by writing a wrapper that parses a regular expression and escapes `}` characters): https://github.com/NixOS/nix/pull/7762

Closes #34. Closes #476.
Change-Id: Ieb0eb9e270a93e4c7eed412ba4f9f96cb00a5fa4
Diffstat (limited to 'src/libexpr/primops.cc')
-rw-r--r-- src/libexpr/primops.cc 210
1 files changed, 198 insertions, 12 deletions
diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc
index dab96d6d4..d6618df2a 100644
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@@ -17,6 +17,7 @@
#include "fetch-to-store.hh"
#include <boost/container/small_vector.hpp>
+#include <boost/regex.hpp>
#include <nlohmann/json.hpp>
#include <sys/types.h>
@@ -26,7 +27,6 @@
#include <algorithm>
#include <cstring>
#include <sstream>
-#include <regex>
#include <dlfcn.h>
#include <cmath>
@@ -3878,19 +3878,205 @@ static RegisterPrimOp primop_hashString({
.fun = prim_hashString,
});
+enum class RegexParseState { // States of the scanner in compile_regex; carets below mark the characters handled while in each state
+ // Default state: any position not covered by the states below
+ Regular,
+
+ // Inside a bounded repeat, after its `{`; `}` shouldn't be escaped here
+ //
+ // a{2,5}b
+ //   ^^^^
+ BoundedRepeat,
+
+ // Character right after a backslash, as C++ regexes only support escaping
+ // what needs to be escaped and nothing else
+ //
+ // a\nb
+ //   ^
+ Backslash,
+
+ // First character of a character set, as `[]]` is a regex for `]` character
+ //
+ // [abc] [^abc]
+ //  ^     ^
+ CharacterSetStart,
+
+ // First character after the `^` of a negated character set, as `[^]]` is
+ // a regex for anything but `]` character
+ //
+ // [^abc]
+ //   ^
+ NegatedCharacterSetStart,
+
+ // Character set after its first character
+ //
+ // [abc]
+ //   ^^
+ CharacterSetMiddle,
+
+ // State after seeing `[` inside a character set; the input is treated as
+ // a character set extension if the next character is `:`, `.`, or `=`
+ //
+ // [a[:alpha:]b]
+ //    ^
+ PossibleCharacterSetExtension,
+
+ // Within a character set extension, up to and including its closing `]`
+ //
+ // [a[:alpha:]b]
+ //     ^^^^^^^
+ CharacterSetExtension,
+
+ // First character of an equivalence class expression, right after `[=`
+ //
+ // [[=a=]]
+ //    ^
+ EquivalenceClassExpression,
+};
+
+static boost::regex compile_regex(std::string_view re) { // throws boost::regex_error for the escapes and equivalence classes rejected below
+ // Translates `re`, written for libstdc++'s std::regex in extended mode,
+ // into a pattern for boost::regex and compiles it. The goal is that Boost
+ // supports everything C++ regexes do and exposes no non-standard extensions.
+ //
+ // In particular, C++ regexes only support escaping regex metacharacters;
+ // other escape sequences like `\n` and `\d` are rejected with error_escape.
+ // Within character groups nothing can be escaped: backslash is a literal
+ // there, so `[\]` is a weird way to write `\\` and is emitted as `[\\]`.
+ std::string boost_re;
+ boost_re.reserve(re.size()); // output is at least as long as the input; added escapes may grow it
+ auto state = RegexParseState::Regular;
+ for (char c : re) {
+ switch (state) {
+ case RegexParseState::Regular:
+ switch (c) {
+ // Boost regex engine supports more escape sequences than C++ regexes,
+ // so the character following a backslash is validated in the Backslash
+ // state to ensure only escapes supported by C++ are allowed.
+ case '\\':
+ state = RegexParseState::Backslash;
+ break;
+ case '[':
+ state = RegexParseState::CharacterSetStart;
+ break;
+ case '{':
+ state = RegexParseState::BoundedRepeat;
+ break;
+ // Boost doesn't permit unescaped `}`, so a backslash is emitted in front
+ // of any `}` seen outside of bounded repeats.
+ case '}':
+ boost_re.push_back('\\');
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case RegexParseState::BoundedRepeat: // contents are copied unchanged until the closing }
+ if (c == '}') {
+ state = RegexParseState::Regular;
+ }
+ break;
+
+ case RegexParseState::Backslash: // only the metacharacters listed here may be escaped
+ switch (c) {
+ case '.': case '|': case '*': case '?': case '+': case '{':
+ case '^': case '$': case '[': case '(': case ')': case '\\':
+ state = RegexParseState::Regular;
+ break;
+ default:
+ throw boost::regex_error(
+ boost::regex_constants::error_type::error_escape
+ );
+ }
+ break;
+
+ case RegexParseState::CharacterSetStart:
+ if (c == '^') {
+ state = RegexParseState::NegatedCharacterSetStart;
+ break;
+ }
+ [[fallthrough]];
+
+ case RegexParseState::NegatedCharacterSetStart:
+ if (c == ']') { // a leading ] is a literal member, not the end of the set
+ state = RegexParseState::CharacterSetMiddle;
+ break;
+ }
+ [[fallthrough]];
+
+ case RegexParseState::CharacterSetMiddle:
+ middle: // also entered via goto from PossibleCharacterSetExtension
+ switch (c) {
+ case '[':
+ state = RegexParseState::PossibleCharacterSetExtension;
+ break;
+ case '\\':
+ // Backslash is a literal inside character groups for C++; escape it for Boost
+ boost_re.push_back('\\');
+ state = RegexParseState::CharacterSetMiddle;
+ break;
+ case ']':
+ state = RegexParseState::Regular;
+ break;
+ default:
+ state = RegexParseState::CharacterSetMiddle;
+ break;
+ }
+ break;
+
+ case RegexParseState::PossibleCharacterSetExtension:
+ switch (c) {
+ case ':': case '.':
+ state = RegexParseState::CharacterSetExtension;
+ break;
+ case '=':
+ state = RegexParseState::EquivalenceClassExpression;
+ break;
+ default:
+ goto middle; // not an extension: the previous [ was a literal, handle c as an ordinary set member
+ }
+ break;
+
+ case RegexParseState::CharacterSetExtension:
+ if (c == ']') { // the first ] seen ends the extension
+ state = RegexParseState::CharacterSetMiddle;
+ }
+ break;
+
+ case RegexParseState::EquivalenceClassExpression:
+ // C++'s regex parser only supports equivalence classes for
+ // alphabetic characters; anything else is rejected with error_brack
+ if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
+ throw boost::regex_error(
+ boost::regex_constants::error_type::error_brack
+ );
+ }
+ // After verifying the first character, the rest can be scanned as
+ // a regular character set extension; Boost will notice issues
+ // after that.
+ state = RegexParseState::CharacterSetExtension;
+ break;
+ }
+
+ boost_re.push_back(c); // every input character is copied through; the states above only prepend escapes
+ }
+ return boost::regex(boost_re, boost::regex::extended); // compile with Boost's POSIX-extended grammar
+}
+
struct RegexCache
{
// TODO use C++20 transparent comparison when available
- std::unordered_map<std::string_view, std::regex> cache;
+ std::unordered_map<std::string_view, boost::regex> cache; // compiled regexes keyed by pattern text; the string_view keys point into the strings stored in `keys`
std::list<std::string> keys;
- std::regex get(std::string_view re)
+ boost::regex get(std::string_view re) // returns the cached regex for `re` (by value), compiling and caching it on first use
{
auto it = cache.find(re);
if (it != cache.end())
return it->second;
keys.emplace_back(re);
- return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
+ return cache.emplace(keys.back(), compile_regex(re)).first->second; // NOTE(review): if compile_regex throws, the key appended on the previous line stays in `keys` with no cache entry
}
};
@@ -3910,8 +4096,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");
- std::cmatch match;
- if (!std::regex_match(str.begin(), str.end(), match, regex)) {
+ boost::cmatch match;
+ if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
v.mkNull();
return;
}
@@ -3926,8 +4112,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
(v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
}
- } catch (std::regex_error & e) {
- if (e.code() == std::regex_constants::error_space) {
+ } catch (boost::regex_error & e) {
+ if (e.code() == boost::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)
@@ -3988,8 +4174,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");
- auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
- auto end = std::cregex_iterator();
+ auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
+ auto end = boost::cregex_iterator();
// Any matches results are surrounded by non-matching results.
const size_t len = std::distance(begin, end);
@@ -4028,8 +4214,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
assert(idx == 2 * len + 1);
- } catch (std::regex_error & e) {
- if (e.code() == std::regex_constants::error_space) {
+ } catch (boost::regex_error & e) {
+ if (e.code() == boost::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)