Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into fea-pq-writer-refactor-default-rg-size
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 authored Sep 17, 2024
2 parents 135d565 + e98e109 commit 94fb993
Show file tree
Hide file tree
Showing 30 changed files with 934 additions and 72 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
Expand Down
77 changes: 77 additions & 0 deletions cpp/benchmarks/text/word_minhash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/minhash.hpp>

#include <rmm/device_buffer.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmark for nvtext::word_minhash and nvtext::word_minhash64
 *
 * Builds a lists column of random strings in which every list row holds
 * `row_width` strings, generates a column of `seed_count` random seeds, and
 * times one call to the 32-bit or 64-bit word-minhash API depending on the
 * `hash_type` axis (32 or 64).
 *
 * @param state nvbench state supplying the `num_rows`, `row_width`,
 *              `seed_count` and `hash_type` axis values
 */
static void bench_word_minhash(nvbench::state& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
  auto const base64     = state.get_int64("hash_type") == 64;

  // `num_rows` random strings form the child column of the lists input
  data_profile const strings_profile =
    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
  auto strings_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);

  // Offsets 0, row_width, 2*row_width, ... group the strings into fixed-width list rows
  auto const num_offsets = (num_rows / row_width) + 1;
  auto offsets           = cudf::sequence(num_offsets,
                                cudf::numeric_scalar<cudf::size_type>(0),
                                cudf::numeric_scalar<cudf::size_type>(row_width));

  auto source = cudf::make_lists_column(num_offsets - 1,
                                        std::move(offsets),
                                        std::move(strings_table->release().front()),
                                        0,
                                        rmm::device_buffer{});

  // The seed column type follows the hash width. The distribution is keyed on that
  // same type so the profile applies to the column that is actually generated
  // (previously it was always keyed on the 32-bit hash_value_type).
  auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
  data_profile const seeds_profile =
    data_profile_builder().no_validity().distribution(seed_type, distribution_id::NORMAL, 0, 256);
  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
  // Bind by reference: the table outlives every use, so no column copy is needed
  auto const& seeds = seeds_table->get_column(0);

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
  auto chars_size = input.chars_size(cudf::get_default_stream());
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
  // NOTE(review): this records `num_rows` 32-bit writes regardless of seed count or
  // hash width -- confirm whether that approximation is intended.
  state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
                         : nvtext::word_minhash(source->view(), seeds.view());
  });
}

// Registers the word_minhash benchmark. Axes read by bench_word_minhash:
//   num_rows   - number of random strings generated for the lists child column
//   row_width  - stride of the list offsets, i.e. strings per list row
//   seed_count - number of rows in the generated seeds column
//   hash_type  - 64 selects nvtext::word_minhash64, otherwise nvtext::word_minhash
NVBENCH_BENCH(bench_word_minhash)
.set_name("word_minhash")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
.add_int64_axis("row_width", {10, 100, 1000})
.add_int64_axis("seed_count", {2, 25})
.add_int64_axis("hash_type", {32, 64});
6 changes: 6 additions & 0 deletions cpp/doxygen/regex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.

**Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.

By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) flag extends the set of line break characters to include:
- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
- Next line (Unicode: `0085`, UTF-8: `C285`)
- Carriage return (Unicode: `000D`, UTF-8: `0D`)

**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
- Unmatched paired special characters like `()`, `[]`, and `{}`.
Expand Down
20 changes: 16 additions & 4 deletions cpp/include/cudf/strings/regex/flags.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ namespace strings {
* and to match the Python flag values.
*/
enum regex_flags : uint32_t {
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256 ///< use only ASCII when matching built-in character classes
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256, ///< use only ASCII when matching built-in character classes
EXT_NEWLINE = 512 ///< new-line matches extended characters
};

/**
Expand Down Expand Up @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
return (f & regex_flags::ASCII) == regex_flags::ASCII;
}

/**
 * @brief Checks whether the EXT_NEWLINE bit is set in the given flags
 *
 * @param f Regex flags to inspect
 * @return true when `f` has regex_flags::EXT_NEWLINE set, false otherwise
 */
constexpr bool is_ext_newline(regex_flags const f)
{
  // Isolate the EXT_NEWLINE bit, then compare it against the flag value itself
  auto const masked = f & regex_flags::EXT_NEWLINE;
  return masked == regex_flags::EXT_NEWLINE;
}

/**
* @brief Capture groups setting
*
Expand Down
11 changes: 8 additions & 3 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper

__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
if (byte_pos > 0)
while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
if (byte_pos > 0) {
if (byte_pos == char_pos) {
--byte_pos;
} else {
while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
}
}
--char_pos;
return *this;
}
Expand Down
61 changes: 59 additions & 2 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
Expand Down Expand Up @@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand Down Expand Up @@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand All @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each row matches
* the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each input row per seed
*/
std::unique_ptr<cudf::column> word_minhash(
cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each row matches
* the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm, though
* only the first 64 bits of the hash are used in computing the output.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each input row per seed
*/
std::unique_ptr<cudf::column> word_minhash64(
cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
21 changes: 16 additions & 5 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,15 +539,26 @@ class regex_parser {
: static_cast<int32_t>(LBRA);
case ')': return RBRA;
case '^': {
_chr = is_multiline(_flags) ? chr : '\n';
if (is_ext_newline(_flags)) {
_chr = is_multiline(_flags) ? 'S' : 'N';
} else {
_chr = is_multiline(_flags) ? chr : '\n';
}
return BOL;
}
case '$': {
_chr = is_multiline(_flags) ? chr : '\n';
if (is_ext_newline(_flags)) {
_chr = is_multiline(_flags) ? 'S' : 'N';
} else {
_chr = is_multiline(_flags) ? chr : '\n';
}
return EOL;
}
case '[': return build_cclass();
case '.': return dot_type;
case '.': {
_chr = is_ext_newline(_flags) ? 'N' : chr;
return dot_type;
}
}

if (std::find(quantifiers.begin(), quantifiers.end(), static_cast<char>(chr)) ==
Expand Down Expand Up @@ -959,7 +970,7 @@ class regex_compiler {
_prog.inst_at(inst_id).u1.cls_id = class_id;
} else if (token == CHAR) {
_prog.inst_at(inst_id).u1.c = yy;
} else if (token == BOL || token == EOL) {
} else if (token == BOL || token == EOL || token == ANY) {
_prog.inst_at(inst_id).u1.c = yy;
}
push_and(inst_id, inst_id);
Expand Down Expand Up @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags)
case STAR: printf(" STAR next=%d", inst.u2.next_id); break;
case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break;
case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break;
case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break;
case NOP: printf(" NOP next=%d", inst.u2.next_id); break;
case BOL: {
Expand Down
46 changes: 34 additions & 12 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist()
list2 = tmp;
}

/**
 * @brief Reports whether a character is one of the supported line-break characters
 *
 * The value is compared against these constants; the multi-byte entries are
 * the UTF-8 byte sequences packed into an integer, not Unicode code points:
 * - `\n`
 * - `\r`
 * - `0xC285`   (next line, U+0085)
 * - `0xE280A8` (line separator, U+2028)
 * - `0xE280A9` (paragraph separator, U+2029)
 *
 * @param ch Character value to test
 * @return true if `ch` equals one of the values listed above
 */
constexpr bool is_newline(char32_t const ch)
{
  switch (ch) {
    case '\n':
    case '\r':
    case 0x00c285:
    case 0x00e280a8:
    case 0x00e280a9: return true;
    default: return false;
  }
}

/**
* @brief Utility to check a specific character against this class instance.
*
Expand Down Expand Up @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
if (checkstart) {
auto startchar = static_cast<char_utf8>(jnk.startchar);
switch (jnk.starttype) {
case BOL:
if (pos == 0) break;
if (jnk.startchar != '^') { return cuda::std::nullopt; }
case BOL: {
if (pos == 0) { break; }
if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; }
if (startchar != '\n') { break; }
--itr;
startchar = static_cast<char_utf8>('\n');
[[fallthrough]];
}
case CHAR: {
auto const find_itr = find_char(startchar, dstr, itr);
if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; }
Expand Down Expand Up @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
id_activate = inst.u2.next_id;
expanded = true;
break;
case BOL:
if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) {
case BOL: {
auto titr = itr;
auto const prev_c = pos > 0 ? *(--titr) : 0;
if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) ||
((inst.u1.c == 'S') && (is_newline(prev_c)))) {
id_activate = inst.u2.next_id;
expanded = true;
}
break;
case EOL:
}
case EOL: {
// after the last character OR:
// - for MULTILINE, if current character is new-line
// - for non-MULTILINE, the very last character of the string can also be a new-line
bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n');
if (last_character ||
((c == '\n') && (inst.u1.c != 'Z') &&
((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) {
(nl && (inst.u1.c != 'Z') &&
((inst.u1.c == '$' || inst.u1.c == 'S') ||
(itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) {
id_activate = inst.u2.next_id;
expanded = true;
}
break;
}
case BOW:
case NBOW: {
auto const prev_c = pos > 0 ? dstr[pos - 1] : 0;
auto titr = itr;
auto const prev_c = pos > 0 ? *(--titr) : 0;
auto const word_class = reclass_device{CCLASS_W};
bool const curr_is_word = word_class.is_match(c, _codepoint_flags);
bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags);
Expand Down Expand Up @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
case CHAR:
if (inst.u1.c == c) id_activate = inst.u2.next_id;
break;
case ANY:
if (c != '\n') id_activate = inst.u2.next_id;
break;
case ANY: {
if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; }
[[fallthrough]];
}
case ANYNL: id_activate = inst.u2.next_id; break;
case NCCLASS:
case CCLASS: {
Expand Down
Loading

0 comments on commit 94fb993

Please sign in to comment.