From e87d2fc1df6105d802b300bad19a9937f8155613 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 20 Sep 2023 21:18:31 +0100 Subject: [PATCH] Reduce memory usage of as_categorical_column (#14138) The main culprit is in the way the codes returned from _label_encoding were being ordered. We were generating an int64 column for the order, gathering through the left gather map, and then argsorting, before using that ordering as a gather map for the codes. We note that gather(y, with=argsort(x)) is equivalent to sort_by_key(y, with=x) so use that instead (avoiding an unnecessary gather). Furthermore we also note that gather([0..n), with=x) is just equivalent to x, so we can avoid a gather too. This reduces the peak memory footprint of categorifying a random column of 500_000_000 int32 values where there are 100 unique values from 24.75 GiB to 11.67 GiB. ### Test code ```python import cudf import cupy as cp K = 100 N = 500_000_000 rng = cp.random._generator.RandomState() column = cudf.core.column.as_column(rng.choice(cp.arange(K, dtype="int32"), size=(N,), replace=True)) column = column.astype("category", ordered=False) ``` ### Before ![Screenshot from 2023-09-20 14-49-27](https://github.com/rapidsai/cudf/assets/1126981/08782501-c233-4efd-b4d6-a378cea82a82) ### After ![Screenshot from 2023-09-20 14-49-42](https://github.com/rapidsai/cudf/assets/1126981/93193bfb-a93e-45bf-8e5a-24289efc77c4) Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14138 --- python/cudf/cudf/core/column/column.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d2e2f11a12e..0bc50a521e2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1390,20 +1390,19 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - codes = arange(len(cats), dtype=dtype) left_gather_map, right_gather_map = cpp_join( [self], [cats], how="left" ) - codes = codes.take( - right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel.value) - + codes = libcudf.copying.gather( + [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + ) + del right_gather_map # reorder `codes` so that its values correspond to the # values of `self`: - order = arange(len(self)) - order = order.take(left_gather_map, check_bounds=False).argsort() - codes = codes.take(order) - return codes + (codes,) = libcudf.sort.sort_by_key( + codes, [left_gather_map], [True], ["last"], stable=True + ) + return codes.fillna(na_sentinel.value) def column_empty_like(