y-scope · SharafMohamed · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp
@@ -30,6 +30,8 @@ using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat<
         log_surgeon::finite_automata::RegexNFAByteState>;
 using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture<
         log_surgeon::finite_automata::RegexNFAByteState>;
+using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty<
+        log_surgeon::finite_automata::RegexNFAByteState>;
 
 using std::make_unique;
 using std::string;
@@ -196,8 +198,11 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
 
 static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
     auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
-    return unique_ptr<ParserAST>(new ParserValueRegex(
-            unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), 0, 0))
+
+    // To handle negative tags we treat `R*` as `R+ | ∅` (a max of 0 means unbounded).
+    return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
+            make_unique<RegexASTEmptyByte>(),
+            make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, 0)
     ));
 }
 
@@ -238,6 +243,14 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
         max += r5_ptr->get_digit(i) * (uint32_t)pow(10, r5_size - i - 1);
     }
     auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
+
+    if (0 == min) {
+        // To handle negative tags we treat `R{0,N}` as `R{1,N} | ∅`.
+        return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
+                make_unique<RegexASTEmptyByte>(),
+                make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, max)
+        ));
+    }
     return unique_ptr<ParserAST>(new ParserValueRegex(
             unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), min, max))
     ));

diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp
@@ -124,6 +124,40 @@ class RegexAST {
     std::set<uint32_t> m_negative_tags;
 };
 
+/**
+ * Class for an empty AST node. This is used to simplify tagged-NFA creation when using regex
+ * repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the
+ * NFA handles the 0 repetition case using the logic in `RegexASTOr` (i.e., adding a negative
+ * transition for every capture group matched in `R{1,N}`).
+ * @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing.
+ */
+template <typename NFAStateType>
+class RegexASTEmpty : public RegexAST<NFAStateType> {
+public:
+    RegexASTEmpty() = default;
+
+    [[nodiscard]] auto clone() const -> gsl::owner<RegexASTEmpty*> override {
+        return new RegexASTEmpty(*this);
+    }
+
+    // Intentionally a no-op: an empty node contains no UTF-8 characters.
+    auto set_possible_inputs_to_true(
+            [[maybe_unused]] std::array<bool, cSizeOfUnicode>& is_possible_input
+    ) const -> void override {}
+
+    // Intentionally a no-op: an empty node contains no delimiters.
+    auto remove_delimiters_from_wildcard([[maybe_unused]] std::vector<uint32_t>& delimiters
+    ) -> void override {}
+
+    // Intentionally a no-op: an empty node adds nothing to the NFA.
+    auto add_to_nfa(
+            [[maybe_unused]] RegexNFA<NFAStateType>* nfa,
+            [[maybe_unused]] NFAStateType* end_state
+    ) const -> void override {}
+
+    [[nodiscard]] auto serialize() const -> std::u32string override;
+};
+
 template <typename NFAStateType>
 class RegexASTLiteral : public RegexAST<NFAStateType> {
 public:
@@ -233,7 +267,7 @@ class RegexASTGroup : public RegexAST<NFAStateType> {
 public:
     using Range = std::pair<uint32_t, uint32_t>;
 
-    RegexASTGroup();
+    RegexASTGroup() = default;
 
     explicit RegexASTGroup(RegexASTLiteral<NFAStateType> const* right);
 
@@ -655,6 +689,11 @@ class RegexASTCapture : public RegexAST<NFAStateType> {
     uint32_t m_tag;
 };
 
+template <typename NFAStateType>  // An empty node serializes to serialize_negative_tags() alone.
+[[nodiscard]] auto RegexASTEmpty<NFAStateType>::serialize() const -> std::u32string {
+    return fmt::format(U"{}", RegexAST<NFAStateType>::serialize_negative_tags());
+}
+
 template <typename NFAStateType>
 RegexASTLiteral<NFAStateType>::RegexASTLiteral(uint32_t character) : m_character(character) {}
 
@@ -844,9 +883,6 @@ template <typename NFAStateType>
     );
 }
 
-template <typename NFAStateType>
-RegexASTGroup<NFAStateType>::RegexASTGroup() = default;
-
 template <typename NFAStateType>
 RegexASTGroup<NFAStateType>::RegexASTGroup(
         RegexASTGroup const* left,

diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp
@@ -1,4 +1,7 @@
+#include <codecvt>
 #include <cstdint>
+#include <locale>
+#include <ranges>
 #include <string>
 #include <vector>
 
@@ -10,6 +13,7 @@
 #include <log_surgeon/SchemaParser.hpp>
 
 using std::string;
+using std::u32string;
 using std::vector;
 
 using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat<
@@ -26,6 +30,33 @@ using RegexASTOrByte
         = log_surgeon::finite_automata::RegexASTOr<log_surgeon::finite_automata::RegexNFAByteState>;
 using log_surgeon::SchemaVarAST;
 
+/**
+ * Adds `regex` to a schema as the variable `capture` and requires that the serialization of the
+ * resulting regex AST matches the expected one. Both are compared as UTF-8 strings.
+ * @param regex The regex to add to the schema.
+ * @param expected_serialized_ast The expected serialization of the regex's AST.
+ */
+auto test_regex_ast(string const& regex, u32string const& expected_serialized_ast) -> void {
+    log_surgeon::Schema schema;
+    schema.add_variable("capture", regex, -1);
+    auto const schema_ast = schema.release_schema_ast_ptr();
+    auto const* capture_rule_ast = dynamic_cast<SchemaVarAST*>(schema_ast->m_schema_vars[0].get());
+    REQUIRE(capture_rule_ast != nullptr);
+
+    // Convert each string with a single call. Avoid building a one-character string with
+    // `std::u32string{1, c}`: braces select the initializer-list constructor, which yields the
+    // two characters U+0001 and `c`.
+    auto u32_to_u8 = [](u32string const& u32_str) -> std::string {
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+        return converter.to_bytes(u32_str.data(), u32_str.data() + u32_str.size());
+    };
+
+    auto const actual_string = u32_to_u8(capture_rule_ast->m_regex_ptr->serialize());
+    auto const expected_string = u32_to_u8(expected_serialized_ast);
+
+    REQUIRE(actual_string == expected_string);
+}
+
 TEST_CASE("Test the Schema class", "[Schema]") {
     SECTION("Add a number variable to schema") {
         log_surgeon::Schema schema;
@@ -92,27 +123,16 @@ TEST_CASE("Test the Schema class", "[Schema]") {
         // This test validates the serialization of a regex AST with named capture groups. The
         // serialized output includes tags (<n> for positive matches, <~n> for negative matches) to
         // indicate which capture groups are matched or unmatched at each node.
-
-        log_surgeon::Schema schema;
-        schema.add_variable(
+        test_regex_ast(
                 // clang-format off
-                "capture",
                 "Z|("
                     "A(?<letter>("
-                            "(?<letter1>(a)|(b))|"
-                            "(?<letter2>(c)|(d))"
+                        "(?<letter1>(a)|(b))|"
+                        "(?<letter2>(c)|(d))"
                     "))B("
                         "?<containerID>\\d+"
                     ")C"
                 ")",
-                // clang-format on
-                -1
-        );
-        auto const schema_ast = schema.release_schema_ast_ptr();
-        auto& capture_rule_ast = dynamic_cast<SchemaVarAST&>(*schema_ast->m_schema_vars[0]);
-
-        constexpr std::u32string_view cExpectedSerializedU32StringWithTags{
-                // clang-format off
                 U"(Z<~0><~1><~2><~3>)|("
                     "A("
                         "(((a)|(b))<0><~1>)|"
@@ -122,8 +142,23 @@ TEST_CASE("Test the Schema class", "[Schema]") {
                     ")<3>C"
                 ")"
                 // clang-format on
-        };
-        REQUIRE(capture_rule_ast.m_regex_ptr->serialize()
-                == std::u32string(cExpectedSerializedU32StringWithTags));
+        );
+    }
+
+    SECTION("Test repetition regex") {
+        // Repetition without capture groups: the untagged and tagged ASTs are the same.
+        test_regex_ast("a{0,10}", U"()|(a{1,10})");
+        test_regex_ast("a{5,10}", U"a{5,10}");
+        test_regex_ast("a*", U"()|(a{1,inf})");
+        test_regex_ast("a+", U"a{1,inf}");
+
+        // Repetition with capture groups: the untagged and tagged ASTs are different.
+        test_regex_ast("(?<letter>a){0,10}", U"(<~0>)|((a)<0>{1,10})");
+        test_regex_ast("(?<letter>a){5,10}", U"(a)<0>{5,10}");
+        test_regex_ast("(?<letter>a)*", U"(<~0>)|((a)<0>{1,inf})");
+        test_regex_ast("(?<letter>a)+", U"(a)<0>{1,inf}");
+
+        // Capture group with repetition
+        test_regex_ast("(?<letter>a{0,10})", U"(()|(a{1,10}))<0>");
     }
 }