diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 0319577f6b9..f3fd5cc5729 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); + auto const separator = cudf::string_scalar("_"); for (auto _ : state) { cuda_event_timer raii(state, true); switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 423fe667b05..b556a84c541 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state) auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); }); } else if (tokenize_type == "ngrams") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + auto const delimiter = cudf::string_scalar(""); + auto const separator = cudf::string_scalar("_"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator); + }); } else if (tokenize_type == "characters") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5d66401df9d..46f2c0e7bc9 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -47,19 
+47,19 @@ namespace nvtext { * @throw cudf::logic_error if `separator` is invalid * @throw cudf::logic_error if there are not enough strings to generate any ngrams * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column of tokens */ std::unique_ptr generate_ngrams( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Generates ngrams of characters within each string. @@ -79,15 +79,17 @@ std::unique_ptr generate_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. - * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column of tokens */ std::unique_ptr generate_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,14 +115,16 @@ std::unique_ptr generate_character_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values */ std::unique_ptr hash_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 17f20f7ea4c..9d76ef8689f 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -66,22 +66,22 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * An empty string will separate tokens using whitespace. + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column of tokens */ std::unique_ptr ngrams_tokenize( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& delimiter, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 938fd45246d..5f2f4d021a4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -150,10 +150,11 @@ std::unique_ptr
generate_ngrams(cudf::strings_column_view const& s std::unique_ptr generate_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr); + return detail::generate_ngrams(strings, ngrams, separator, stream, mr); } namespace detail { @@ -317,18 +318,20 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::generate_character_ngrams(strings, ngrams, stream, mr); } std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::hash_character_ngrams(strings, ngrams, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 5b55745c2c7..95324847ea0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -107,7 +107,7 @@ rmm::device_uvector compute_unique_counts(cudf::column_view con * * This is called with a warp per row */ -struct sorted_interset_fn { +struct sorted_intersect_fn { cudf::column_device_view const d_input1; cudf::column_device_view const d_input2; cudf::size_type* d_results; @@ -151,7 +151,7 @@ rmm::device_uvector compute_intersect_counts(cudf::column_view auto const d_input1 = cudf::column_device_view::create(input1, stream); auto const d_input2 = cudf::column_device_view::create(input2, stream); auto d_results = 
rmm::device_uvector(input1.size(), stream); - sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()}; + sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), input1.size() * cudf::detail::warp_size, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index fd1cbf99221..73d85513e95 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -265,11 +265,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize( - strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr); + return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d1e50442058..ba4921848d7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -632,6 +632,7 @@ ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing ) +ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp new file mode 100644 index 00000000000..bce0d2b680b --- /dev/null +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class TextNGramsTest : public cudf::test::BaseFixture {}; + +TEST_F(TextNGramsTest, GenerateNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::generate_ngrams( + cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, GenerateCharacterNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::generate_character_ngrams( + cudf::strings_column_view(input), 3, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, HashCharacterNgrams) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::hash_character_ngrams( + cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, NgramsTokenize) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::ngrams_tokenize( + cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 
323b3eed3e2..7b179588385 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams) { cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"}; cudf::strings_column_view strings_view(strings); + auto const separator = cudf::string_scalar("_"); { cudf::test::strings_column_wrapper expected{ "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view); + auto const results = nvtext::generate_ngrams(strings_view, 2, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const separator = cudf::string_scalar("_"); cudf::strings_column_view strings_view(strings); { - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); + auto const separator = cudf::string_scalar("_"); + + auto results = + nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, 
separator); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty) TEST_F(TextGenerateNgramsTest, Errors) { cudf::test::strings_column_wrapper strings{""}; + auto const separator = cudf::string_scalar("_"); // invalid parameter value - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); // not enough strings to generate ngrams - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); - std::vector h_strings{"", nullptr, "", nullptr}; - cudf::test::strings_column_wrapper strings_no_tokens( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)), + cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0}); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator), cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)), cudf::logic_error); diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp index 5879bec3e64..c6fb886f7e5 100644 --- a/cpp/tests/text/ngrams_tokenize_tests.cpp +++ b/cpp/tests/text/ngrams_tokenize_tests.cpp @@ 
-62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize) "mousé_ate", "ate_the", "the_cheese"}; - auto results = nvtext::ngrams_tokenize(strings_view); + auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram) { cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"}; cudf::strings_column_view strings_view(strings); + auto const empty = cudf::string_scalar(""); cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"}; - auto results = nvtext::ngrams_tokenize(strings_view, 1); + auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto results = nvtext::ngrams_tokenize(strings_view); + auto const empty = cudf::string_scalar(""); + auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty); EXPECT_EQ(results->size(), 0); EXPECT_EQ(results->has_nulls(), false); } @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest) { cudf::test::strings_column_wrapper strings{"this column intentionally left blank"}; cudf::strings_column_view strings_view(strings); - EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error); + auto const empty = cudf::string_scalar(""); + EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error); }