Skip to content

Commit

Permalink
parameterize pandas datetime caching
Browse files Browse the repository at this point in the history
Summary:
Detection algorithms are memory intensive and recently started OOMing after onboarding ~6k Prophet detectors (D53922840).

Strobelight points to convert_listlike_datetimes using [11%](https://fburl.com/scuba/pyperf_alloc/on_demand/65bwy03b) of the heap now and [20%](https://fburl.com/scuba/pyperf_alloc/on_demand/rzwgukjx) during the period the tier was OOMing.

Pandas [caches](https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py#L1008) datetime conversions internally by default to avoid recomputation. This behaviour is parameterized. This diff turns it off, since DA is memory bound, so it is desirable to trade some compute for a larger memory buffer. There will be no performance implications for other users of Kats.

Differential Revision: D53941075

fbshipit-source-id: ec72922b5351937a24c6846f979a86a74509c7b0
  • Loading branch information
Rikin Shah authored and facebook-github-bot committed Feb 21, 2024
1 parent 5be3003 commit 6141d6c
Showing 1 changed file with 17 additions and 4 deletions.
21 changes: 17 additions & 4 deletions kats/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ class TimeSeriesData:
values (default "raise").
- categorical_var: A list of column names of categorical variables that are not required to be numerical. Default is None.
- drop_duplicate_time: A bool variable to indicate whether to drop the duplicate time stamps.
- cache_datetimes: A bool variable to indicate whether to use pandas cache to avoid recomputing datetime conversions
Raises:
ValueError: Invalid params passed when trying to create the
Expand Down Expand Up @@ -236,6 +237,7 @@ def __init__( # noqa C901
tz_nonexistent: str = "raise",
categorical_var: Optional[List[str]] = None,
drop_duplicate_time: bool = False,
cache_datetimes: bool = True,
) -> None:
"""Initializes :class:`TimeSeriesData` class with arguments provided."""
self.time_col_name = time_col_name
Expand Down Expand Up @@ -272,6 +274,7 @@ def __init__( # noqa C901
tz=tz,
tz_ambiguous=tz_ambiguous,
tz_nonexistent=tz_nonexistent,
cache_datetimes=cache_datetimes,
)
if drop_duplicate_time:
# drop duplicate time stamps
Expand Down Expand Up @@ -564,6 +567,7 @@ def _set_time_format(
tz: Optional[str] = None,
tz_ambiguous: Union[str, np.ndarray] = "raise",
tz_nonexistent: str = "raise",
cache_datetimes: bool = True,
) -> pd.core.series.Series:
"""Parses time format when initializing :class:`TimeSeriesData`."""

Expand All @@ -574,14 +578,19 @@ def _set_time_format(
if tz:
return (
pd.to_datetime(
series.values, unit=unix_time_units, utc=True
series.values,
unit=unix_time_units,
utc=True,
cache=cache_datetimes,
)
.tz_convert(tz)
.to_series()
.reset_index(drop=True)
)
else:
return pd.to_datetime(series, unit=unix_time_units)
return pd.to_datetime(
series, unit=unix_time_units, cache=cache_datetimes
)
except ValueError:
msg = (
"Failed to parse time column "
Expand All @@ -595,15 +604,19 @@ def _set_time_format(
try:
if tz:
return (
pd.to_datetime(series.values, format=date_format)
pd.to_datetime(
series.values, format=date_format, cache=cache_datetimes
)
.tz_localize(
tz, ambiguous=tz_ambiguous, nonexistent=tz_nonexistent
)
.to_series()
.reset_index(drop=True)
)
else:
return pd.to_datetime(series, format=date_format)
return pd.to_datetime(
series, format=date_format, cache=cache_datetimes
)
except ValueError:
msg = (
"Failed to parse time column "
Expand Down

0 comments on commit 6141d6c

Please sign in to comment.