aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjade <lix@jade.fyi>2024-08-22 22:34:10 +0000
committerGerrit Code Review <gerrit@localhost>2024-08-22 22:34:10 +0000
commit87fd6e00952f3597618a82a121052a9ad37b8f4a (patch)
treea504f0872caf718db17921b89e42973d040dab4b
parentf2e7f8bab875809e8b489e1e5a7aa8572bb4bc13 (diff)
parent9896d309cbf3e4c0888760981654c1da0b5983a9 (diff)
Merge "Revert "libexpr: Replace regex engine with boost::regex"" into main
-rw-r--r--doc/manual/change-authors.yml4
-rw-r--r--doc/manual/rl-next/boost-regex.md37
-rw-r--r--src/libexpr/primops.cc210
-rw-r--r--tests/unit/libexpr/primops.cc145
4 files changed, 12 insertions, 384 deletions
diff --git a/doc/manual/change-authors.yml b/doc/manual/change-authors.yml
index d9303a747..e18abada1 100644
--- a/doc/manual/change-authors.yml
+++ b/doc/manual/change-authors.yml
@@ -129,10 +129,6 @@ roberth:
display_name: Robert Hensing
github: roberth
-sugar:
- forgejo: sugar
- github: sugar700
-
thufschmitt:
display_name: Théophane Hufschmitt
github: thufschmitt
diff --git a/doc/manual/rl-next/boost-regex.md b/doc/manual/rl-next/boost-regex.md
deleted file mode 100644
index c541434d0..000000000
--- a/doc/manual/rl-next/boost-regex.md
+++ /dev/null
@@ -1,37 +0,0 @@
----
-synopsis: Replace regex engine with boost::regex
-issues: [fj#34, fj#476]
-cls: [1821]
-category: Fixes
-credits: [sugar]
----
-
-Previously, the C++ standard regex expression library was used, the
-behaviour of which varied depending on the platform. This has been
-replaced with the Boost regex library, which works identically across
-platforms.
-
-The visible behaviour of the regex functions doesn't change. While
-the new library has more features, Lix will reject regular expressions
-using them.
-
-This also fixes regex matching reporting stack overflow when matching
-on too much data.
-
-Before:
-
- nix-repl> builtins.match ".*" (
- builtins.concatStringsSep "" (
- builtins.genList (_: "a") 1000000
- )
- )
- error: stack overflow (possible infinite recursion)
-
-After:
-
- nix-repl> builtins.match ".*" (
- builtins.concatStringsSep "" (
- builtins.genList (_: "a") 1000000
- )
- )
- [ ]
diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc
index d6618df2a..dab96d6d4 100644
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@@ -17,7 +17,6 @@
#include "fetch-to-store.hh"
#include <boost/container/small_vector.hpp>
-#include <boost/regex.hpp>
#include <nlohmann/json.hpp>
#include <sys/types.h>
@@ -27,6 +26,7 @@
#include <algorithm>
#include <cstring>
#include <sstream>
+#include <regex>
#include <dlfcn.h>
#include <cmath>
@@ -3878,205 +3878,19 @@ static RegisterPrimOp primop_hashString({
.fun = prim_hashString,
});
-enum class RegexParseState {
- // Anything outside of those
- Regular,
-
- // Bounded repeats, `}` shouldn't be escaped in those
- //
- // a{2,5}b
- // ^^^^
- BoundedRepeat,
-
- // Backslashes, as C++ regexes only support escaping what needs to be
- // escaped and nothing else
- //
- // a\nb
- // ^
- Backslash,
-
- // Initial part of character set, as `[]]` is a regex for `]` character
- //
- // [abc] [^abc]
- // ^ ^
- CharacterSetStart,
-
- // Initial part of negated character set, as `[^]]` is a regex for
- // anything but `]` character
- //
- // [^abc]
- // ^
- NegatedCharacterSetStart,
-
- // Character set after its first character
- //
- // [abc]
- // ^^
- CharacterSetMiddle,
-
- // Parser state after seeing [, assumes the input is character extension
- // after seeing `:`, `.`, or `=`
- //
- // [a[:alpha:]b]
- // ^
- PossibleCharacterSetExtension,
-
- // Within character extension
- //
- // [a[:alpha:]b]
- // ^^^^^^^
- CharacterSetExtension,
-
- // Within equivalence class expression
- //
- // [[=a=]]
- // ^
- EquivalenceClassExpression,
-};
-
-static boost::regex compile_regex(std::string_view re) {
- // Make sure that Boost supports everything that C++ regexes do,
- // and no non-standard extensions are available.
- //
- // In particular, C++ regexes only support escaping regex metacharacters.
- // They don't support other escape sequences like `\n` and `\d`.
- // Additionally, within character groups, it's not possible to escape
- // anything, backslash is a literal character in those. `[\]` in regexes
- // is a weird way to write `\\`.
- std::string boost_re;
- boost_re.reserve(re.size());
- auto state = RegexParseState::Regular;
- for (char c : re) {
- switch (state) {
- case RegexParseState::Regular:
- switch (c) {
- // Boost regex engine supports more escape sequences than C++ regexes,
- // and as such it's necessary to ensure only escapes supported by C++
- // are allowed.
- case '\\':
- state = RegexParseState::Backslash;
- break;
- case '[':
- state = RegexParseState::CharacterSetStart;
- break;
- case '{':
- state = RegexParseState::BoundedRepeat;
- break;
- // Boost doesn't permit unescaped `}`, escape it outside of
- // bounded repeats.
- case '}':
- boost_re.push_back('\\');
- break;
- default:
- break;
- }
- break;
-
- case RegexParseState::BoundedRepeat:
- if (c == '}') {
- state = RegexParseState::Regular;
- }
- break;
-
- case RegexParseState::Backslash:
- switch (c) {
- case '.': case '|': case '*': case '?': case '+': case '{':
- case '^': case '$': case '[': case '(': case ')': case '\\':
- state = RegexParseState::Regular;
- break;
- default:
- throw boost::regex_error(
- boost::regex_constants::error_type::error_escape
- );
- }
- break;
-
- case RegexParseState::CharacterSetStart:
- if (c == '^') {
- state = RegexParseState::NegatedCharacterSetStart;
- break;
- }
- [[fallthrough]];
-
- case RegexParseState::NegatedCharacterSetStart:
- if (c == ']') {
- state = RegexParseState::CharacterSetMiddle;
- break;
- }
- [[fallthrough]];
-
- case RegexParseState::CharacterSetMiddle:
- middle:
- switch (c) {
- case '[':
- state = RegexParseState::PossibleCharacterSetExtension;
- break;
- case '\\':
- // Backslashes aren't supported in character groups, escape them
- boost_re.push_back('\\');
- state = RegexParseState::CharacterSetMiddle;
- break;
- case ']':
- state = RegexParseState::Regular;
- break;
- default:
- state = RegexParseState::CharacterSetMiddle;
- break;
- }
- break;
-
- case RegexParseState::PossibleCharacterSetExtension:
- switch (c) {
- case ':': case '.':
- state = RegexParseState::CharacterSetExtension;
- break;
- case '=':
- state = RegexParseState::EquivalenceClassExpression;
- break;
- default:
- goto middle;
- }
- break;
-
- case RegexParseState::CharacterSetExtension:
- if (c == ']') {
- state = RegexParseState::CharacterSetMiddle;
- }
- break;
-
- case RegexParseState::EquivalenceClassExpression:
- // C++'s regex parser only supports equivalence classes for
- // alphabetic characters
- if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
- throw boost::regex_error(
- boost::regex_constants::error_type::error_brack
- );
- }
- // After verifying first character, this can be parsed as
- // a regular character set extension, Boost will notice issues
- // after that.
- state = RegexParseState::CharacterSetExtension;
- break;
- }
-
- boost_re.push_back(c);
- }
- return boost::regex(boost_re, boost::regex::extended);
-}
-
struct RegexCache
{
// TODO use C++20 transparent comparison when available
- std::unordered_map<std::string_view, boost::regex> cache;
+ std::unordered_map<std::string_view, std::regex> cache;
std::list<std::string> keys;
- boost::regex get(std::string_view re)
+ std::regex get(std::string_view re)
{
auto it = cache.find(re);
if (it != cache.end())
return it->second;
keys.emplace_back(re);
- return cache.emplace(keys.back(), compile_regex(re)).first->second;
+ return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
}
};
@@ -4096,8 +3910,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");
- boost::cmatch match;
- if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
+ std::cmatch match;
+ if (!std::regex_match(str.begin(), str.end(), match, regex)) {
v.mkNull();
return;
}
@@ -4112,8 +3926,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
(v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
}
- } catch (boost::regex_error & e) {
- if (e.code() == boost::regex_constants::error_space) {
+ } catch (std::regex_error & e) {
+ if (e.code() == std::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)
@@ -4174,8 +3988,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");
- auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
- auto end = boost::cregex_iterator();
+ auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
+ auto end = std::cregex_iterator();
// Any matches results are surrounded by non-matching results.
const size_t len = std::distance(begin, end);
@@ -4214,8 +4028,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
assert(idx == 2 * len + 1);
- } catch (boost::regex_error & e) {
- if (e.code() == boost::regex_constants::error_space) {
+ } catch (std::regex_error & e) {
+ if (e.code() == std::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)
diff --git a/tests/unit/libexpr/primops.cc b/tests/unit/libexpr/primops.cc
index b73fdff72..bd174a6c0 100644
--- a/tests/unit/libexpr/primops.cc
+++ b/tests/unit/libexpr/primops.cc
@@ -816,151 +816,6 @@ namespace nix {
ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO"));
}
- TEST_F(PrimOpTest, match5) {
- auto v = eval("builtins.match ''}'' ''}''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match6) {
- auto v = eval("builtins.match '']'' '']''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match7) {
- auto v = eval("builtins.match ''[[]'' ''[''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match8) {
- auto v = eval("builtins.match ''[]]'' '']''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match9) {
- auto v = eval("builtins.match ''[[=a=]]'' ''A''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match10) {
- auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match11) {
- auto v = eval("builtins.match ''[[.tilde.]]'' ''~''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match12) {
- auto v = eval("builtins.match ''[\\n]'' ''\\''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match13) {
- auto v = eval("builtins.match ''[\\[]'' ''\\''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match14) {
- auto v = eval("builtins.match ''[\\]]'' ''\\]''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match15) {
- auto v = eval("builtins.match ''[\\-z]'' ''y''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match16) {
- auto v = eval("builtins.match ''[\\\\]'' ''\\''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match17) {
- auto v = eval("builtins.match ''[\\]'' ''\\''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match18) {
- auto v = eval("builtins.match ''[\\]]'' ''\\]''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match19) {
- auto v = eval("builtins.match ''.*[Ω].*'' ''β''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match20) {
- auto v = eval("builtins.match ''[^]]'' ''a''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match21) {
- auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''");
- ASSERT_THAT(v, IsListOfSize(0));
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax1) {
- ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax2) {
- ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax3) {
- ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax4) {
- ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax5) {
- ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax6) {
- ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax7) {
- ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax8) {
- ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax9) {
- ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax10) {
- ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax11) {
- ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax12) {
- ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax13) {
- ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax14) {
- ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError);
- }
-
- TEST_F(PrimOpTest, match_unsupported_syntax15) {
- ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError);
- }
-
TEST_F(PrimOpTest, attrNames) {
auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }");
ASSERT_THAT(v, IsListOfSize(4));