Skip to content

Commit

Permalink
parameterize pandas datetime caching
Browse files Browse the repository at this point in the history
Summary:
Detection algorithms are memory intensive and recently started OOMing after onboarding ~6k Prophet detectors (D53922840).

Strobelight points to convert_listlike_datetimes using [11%](https://fburl.com/scuba/pyperf_alloc/on_demand/65bwy03b) of the heap now and [20%](https://fburl.com/scuba/pyperf_alloc/on_demand/rzwgukjx) during the period the tier was OOMing.

Pandas [caches](https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py#L1008) datetime conversions internally by default to avoid recomputation. This behaviour is parameterized. This diff turns it off, since DA is memory bound, so it is desirable to trade some compute for a larger memory buffer. There will be no performance implications for other users of Kats.

Differential Revision: D53941075

fbshipit-source-id: ec72922b5351937a24c6846f979a86a74509c7b0
  • Loading branch information
Rikin Shah authored and facebook-github-bot committed Feb 21, 2024
1 parent 5be3003 commit 6141d6c
Showing 1 changed file with 17 additions and 4 deletions.
21 changes: 17 additions & 4 deletions kats/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ class TimeSeriesData:
values (default "raise").
- categorical_var: A list of column names of categorical variables that are not required to be numerical. Default is None.
- drop_duplicate_time: A bool variable to indicate whether to drop the duplicate time stamps.
- cache_datetimes: A bool variable to indicate whether to use pandas cache to avoid recomputing datetime conversions
Raises:
ValueError: Invalid params passed when trying to create the
Expand Down Expand Up @@ -236,6 +237,7 @@ def __init__( # noqa C901
tz_nonexistent: str = "raise",
categorical_var: Optional[List[str]] = None,
drop_duplicate_time: bool = False,
cache_datetimes: bool = True,
) -> None:
"""Initializes :class:`TimeSeriesData` class with arguments provided."""
self.time_col_name = time_col_name
Expand Down Expand Up @@ -272,6 +274,7 @@ def __init__( # noqa C901
tz=tz,
tz_ambiguous=tz_ambiguous,
tz_nonexistent=tz_nonexistent,
cache_datetimes=cache_datetimes,
)
if drop_duplicate_time:
# drop duplicate time stamps
Expand Down Expand Up @@ -564,6 +567,7 @@ def _set_time_format(
tz: Optional[str] = None,
tz_ambiguous: Union[str, np.ndarray] = "raise",
tz_nonexistent: str = "raise",
cache_datetimes: bool = True,
) -> pd.core.series.Series:
"""Parses time format when initializing :class:`TimeSeriesData`."""

Expand All @@ -574,14 +578,19 @@ def _set_time_format(
if tz:
return (
pd.to_datetime(
series.values, unit=unix_time_units, utc=True
series.values,
unit=unix_time_units,
utc=True,
cache=cache_datetimes,
)
.tz_convert(tz)
.to_series()
.reset_index(drop=True)
)
else:
return pd.to_datetime(series, unit=unix_time_units)
return pd.to_datetime(
series, unit=unix_time_units, cache=cache_datetimes
)
except ValueError:
msg = (
"Failed to parse time column "
Expand All @@ -595,15 +604,19 @@ def _set_time_format(
try:
if tz:
return (
pd.to_datetime(series.values, format=date_format)
pd.to_datetime(
series.values, format=date_format, cache=cache_datetimes
)
.tz_localize(
tz, ambiguous=tz_ambiguous, nonexistent=tz_nonexistent
)
.to_series()
.reset_index(drop=True)
)
else:
return pd.to_datetime(series, format=date_format)
return pd.to_datetime(
series, format=date_format, cache=cache_datetimes
)
except ValueError:
msg = (
"Failed to parse time column "
Expand Down

0 comments on commit 6141d6c

Please sign in to comment.