-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bug-Fix: Add negative tags for RegexMultiplicationAST with min=0.
#41
base: main
Are you sure you want to change the base?
Changes from all commits
a6274ec
186d239
4f122c6
c24f6e1
33582da
3338ec7
3cd3c0f
e05acbb
5e61e83
082090d
2c6d94e
4e02f24
bb3c543
54027ad
e58274f
1321871
c904755
ffe9a0f
913ed1a
7aa8a92
77e44a5
d1d87e7
f386a3b
0c600d7
bedad75
053d057
a822307
0b9603a
2ef84d1
83bd518
d3d815e
168adb0
20b3421
5231a4a
e4ac215
4b8b13e
e2d05fa
660eb9b
731b9fe
81b4ffa
a4d29a5
9379447
6c4d933
a2fdbf1
b0485f5
64da95c
e5bda43
0ac7c43
529dcb2
2e17bee
013765b
14cbe97
20864d6
72e61f6
aadb290
c9d5510
f138527
4cc5e2a
e08e345
ea5121a
2ff09b5
69cdd1f
f580cdd
c0dd3fe
df12855
be00753
2f47770
a15365b
239ff76
2826461
db42bd8
d5eeb5a
ce2fc76
9643138
550961e
74c46ad
29e0777
cde19a6
9837c10
dac2122
aa4a4e4
e00fead
63fd9da
a5eae39
7b73929
1c62d8c
033928f
1b6cd08
bc4444a
48cea41
547bef4
c73c72c
f364aac
e93ea4f
c2933f3
f195955
37dfdca
3e64e7c
69af073
14f9f69
e49421a
5ca7b4e
2c72410
f113c19
064379c
bf06d33
34f9e4f
af61ee1
9329cc8
ede2a16
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -30,6 +30,8 @@ using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< | |||||||
log_surgeon::finite_automata::RegexNFAByteState>; | ||||||||
using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture< | ||||||||
log_surgeon::finite_automata::RegexNFAByteState>; | ||||||||
using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty< | ||||||||
log_surgeon::finite_automata::RegexNFAByteState>; | ||||||||
|
||||||||
using std::make_unique; | ||||||||
using std::string; | ||||||||
|
@@ -196,8 +198,11 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr<ParserAST> { | |||||||
|
||||||||
static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr<ParserAST> { | ||||||||
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>(); | ||||||||
return unique_ptr<ParserAST>(new ParserValueRegex( | ||||||||
unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), 0, 0)) | ||||||||
|
||||||||
// To handle negative tags we treat `R*` (i.e., `R{0,inf}`) as `R{1,inf} | ∅`. | ||||||||
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>( | ||||||||
make_unique<RegexASTEmptyByte>(), | ||||||||
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, 0) | ||||||||
)); | ||||||||
} | ||||||||
|
||||||||
|
@@ -238,6 +243,14 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr<ParserAST> { | |||||||
max += r5_ptr->get_digit(i) * (uint32_t)pow(10, r5_size - i - 1); | ||||||||
} | ||||||||
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>(); | ||||||||
|
||||||||
if (min == 0) { | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
// To handle negative tags we treat `R{0,N}` as `R{1,N} | ∅`. | ||||||||
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>( | ||||||||
make_unique<RegexASTEmptyByte>(), | ||||||||
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, max) | ||||||||
)); | ||||||||
} | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
return unique_ptr<ParserAST>(new ParserValueRegex( | ||||||||
unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), min, max)) | ||||||||
)); | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,6 +124,40 @@ class RegexAST { | |
std::set<uint32_t> m_negative_tags; | ||
}; | ||
|
||
/** | ||
* Class for an empty AST node. This is used to simplify tagged-NFA creation when using regex | ||
* repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the | ||
* NFA handles the 0 repetition case using the logic in `RegexASTOr` (i.e., adding a negative | |
* transition for every capture group matched in `R{1,N}`). | ||
* @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing. | ||
*/ | ||
template <typename NFAStateType> | ||
class RegexASTEmpty : public RegexAST<NFAStateType> { | ||
public: | ||
RegexASTEmpty() = default; | ||
|
||
[[nodiscard]] auto clone() const -> gsl::owner<RegexASTEmpty*> override { | ||
return new RegexASTEmpty(*this); | ||
} | ||
|
||
// Do nothing as an empty node contains no utf8 characters. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think we don't need this comment. If you want to keep it, we should move it inside the function body since it's an inline comment. |
||
auto set_possible_inputs_to_true( | ||
[[maybe_unused]] std::array<bool, cSizeOfUnicode>& is_possible_input | ||
) const -> void override {} | ||
|
||
// Do nothing as an empty node contains no delimiters. | ||
auto remove_delimiters_from_wildcard([[maybe_unused]] std::vector<uint32_t>& delimiters | ||
) -> void override {} | ||
|
||
// Do nothing as adding an empty node to the NFA is a null operation. | ||
auto add_to_nfa( | ||
[[maybe_unused]] RegexNFA<NFAStateType>* nfa, | ||
[[maybe_unused]] NFAStateType* end_state | ||
) const -> void override {} | ||
|
||
[[nodiscard]] auto serialize() const -> std::u32string override; | ||
}; | ||
|
||
template <typename NFAStateType> | ||
class RegexASTLiteral : public RegexAST<NFAStateType> { | ||
public: | ||
|
@@ -233,7 +267,7 @@ class RegexASTGroup : public RegexAST<NFAStateType> { | |
public: | ||
using Range = std::pair<uint32_t, uint32_t>; | ||
|
||
RegexASTGroup(); | ||
RegexASTGroup() = default; | ||
|
||
explicit RegexASTGroup(RegexASTLiteral<NFAStateType> const* right); | ||
|
||
|
@@ -655,6 +689,11 @@ class RegexASTCapture : public RegexAST<NFAStateType> { | |
uint32_t m_tag; | ||
}; | ||
|
||
template <typename NFAStateType> | ||
[[nodiscard]] auto RegexASTEmpty<NFAStateType>::serialize() const -> std::u32string { | ||
return fmt::format(U"{}", RegexAST<NFAStateType>::serialize_negative_tags()); | ||
} | ||
|
||
template <typename NFAStateType> | ||
RegexASTLiteral<NFAStateType>::RegexASTLiteral(uint32_t character) : m_character(character) {} | ||
|
||
|
@@ -844,9 +883,6 @@ template <typename NFAStateType> | |
); | ||
} | ||
|
||
template <typename NFAStateType> | ||
RegexASTGroup<NFAStateType>::RegexASTGroup() = default; | ||
|
||
template <typename NFAStateType> | ||
RegexASTGroup<NFAStateType>::RegexASTGroup( | ||
RegexASTGroup const* left, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
#include <codecvt> | ||
#include <cstdint> | ||
#include <locale> | ||
#include <ranges> | ||
#include <string> | ||
#include <vector> | ||
|
||
|
@@ -10,6 +13,7 @@ | |
#include <log_surgeon/SchemaParser.hpp> | ||
|
||
using std::string; | ||
using std::u32string; | ||
using std::vector; | ||
|
||
using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< | ||
|
@@ -26,6 +30,33 @@ using RegexASTOrByte | |
= log_surgeon::finite_automata::RegexASTOr<log_surgeon::finite_automata::RegexNFAByteState>; | ||
using log_surgeon::SchemaVarAST; | ||
|
||
auto test_regex_ast(string const& regex, u32string const& expected_serialized_ast) -> void { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
log_surgeon::Schema schema; | ||
schema.add_variable("capture", regex, -1); | ||
auto const schema_ast = schema.release_schema_ast_ptr(); | ||
auto const* capture_rule_ast = dynamic_cast<SchemaVarAST*>(schema_ast->m_schema_vars[0].get()); | ||
REQUIRE(capture_rule_ast != nullptr); | ||
|
||
auto u32_to_u8 = [](char32_t const u32_char) -> std::string { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
[[nodiscard]] auto u32string_to_utf8(std::u32string const& u32_str) -> string {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.to_bytes(u32_str);
} |
||
std::u32string const u32_str{1, u32_char}; | ||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; | ||
return converter.to_bytes(u32_str.data(), u32_str.data() + u32_str.size()); | ||
}; | ||
|
||
auto const actual_u32string = capture_rule_ast->m_regex_ptr->serialize(); | ||
auto const actual_string = fmt::format( | ||
"{}", | ||
fmt::join(actual_u32string | std::ranges::views::transform(u32_to_u8), "") | ||
); | ||
|
||
auto const expected_string = fmt::format( | ||
"{}", | ||
fmt::join(expected_serialized_ast | std::ranges::views::transform(u32_to_u8), "") | ||
); | ||
|
||
REQUIRE(actual_string == expected_string); | ||
} | ||
|
||
TEST_CASE("Test the Schema class", "[Schema]") { | ||
SECTION("Add a number variable to schema") { | ||
log_surgeon::Schema schema; | ||
|
@@ -92,27 +123,16 @@ TEST_CASE("Test the Schema class", "[Schema]") { | |
// This test validates the serialization of a regex AST with named capture groups. The | ||
// serialized output includes tags (<n> for positive matches, <~n> for negative matches) to | ||
// indicate which capture groups are matched or unmatched at each node. | ||
|
||
log_surgeon::Schema schema; | ||
schema.add_variable( | ||
test_regex_ast( | ||
// clang-format off | ||
"capture", | ||
"Z|(" | ||
"A(?<letter>(" | ||
"(?<letter1>(a)|(b))|" | ||
"(?<letter2>(c)|(d))" | ||
"(?<letter1>(a)|(b))|" | ||
"(?<letter2>(c)|(d))" | ||
"))B(" | ||
"?<containerID>\\d+" | ||
")C" | ||
")", | ||
// clang-format on | ||
-1 | ||
); | ||
auto const schema_ast = schema.release_schema_ast_ptr(); | ||
auto& capture_rule_ast = dynamic_cast<SchemaVarAST&>(*schema_ast->m_schema_vars[0]); | ||
|
||
constexpr std::u32string_view cExpectedSerializedU32StringWithTags{ | ||
// clang-format off | ||
U"(Z<~0><~1><~2><~3>)|(" | ||
"A(" | ||
"(((a)|(b))<0><~1>)|" | ||
|
@@ -122,8 +142,23 @@ TEST_CASE("Test the Schema class", "[Schema]") { | |
")<3>C" | ||
")" | ||
// clang-format on | ||
}; | ||
REQUIRE(capture_rule_ast.m_regex_ptr->serialize() | ||
== std::u32string(cExpectedSerializedU32StringWithTags)); | ||
); | ||
} | ||
|
||
SECTION("Test repetition regex") { | ||
// Repetition without capture groups: the untagged and tagged ASTs are the same. | |
test_regex_ast("a{0,10}", U"()|(a{1,10})"); | ||
test_regex_ast("a{5,10}", U"a{5,10}"); | ||
test_regex_ast("a*", U"()|(a{1,inf})"); | ||
test_regex_ast("a+", U"a{1,inf}"); | ||
|
||
// Repetition with capture groups: the untagged and tagged ASTs are different. | |
test_regex_ast("(?<letter>a){0,10}", U"(<~0>)|((a)<0>{1,10})"); | ||
test_regex_ast("(?<letter>a){5,10}", U"(a)<0>{5,10}"); | ||
test_regex_ast("(?<letter>a)*", U"(<~0>)|((a)<0>{1,inf})"); | ||
test_regex_ast("(?<letter>a)+", U"(a)<0>{1,inf}"); | ||
|
||
// Capture group with repetition | ||
test_regex_ast("(?<letter>a{0,10})", U"(()|(a{1,10}))<0>"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Can we add a more complicated test case like this?
test_regex_ast(
"(((?<letterA>a)|(?<letterB>b))*)|(((?<letterC>c)|(?<letterD>d)){0,10})",
U"((<~0><~1>)|(((a)<0><~1>)|((b)<1><~0>){1,inf})<~2><~3>)|((<~2><~3>)|(((c)<2><~3>)"
U"|((d)<3><~2>){1,10})<~0><~1>)"
); |
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.