Skip to content

Commit

Permalink
fix: don't loop trying to recover
Browse files (browse the repository at this point in the history)
  • Loading branch information
tazlin committed Mar 6, 2024
1 parent e211bcb commit c2a2fb6
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion horde_worker_regen/process_management/process_manager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2999,6 +2999,8 @@ def shutdown() -> None:

threading.Thread(target=shutdown).start()

_recently_recovered = False

def replace_hung_processes(self) -> bool:
"""
Replaces processes that haven't checked in since `process_timeout` seconds in bridgeData
Expand All @@ -3023,7 +3025,7 @@ def replace_hung_processes(self) -> bool:
seconds=self.bridge_data.process_timeout,
)
)
) and not self._last_pop_no_jobs_available:
) and not (self._last_pop_no_jobs_available or self._recently_recovered):
if self.bridge_data.exit_on_unhandled_faults:
logger.error("All processes have been unresponsive for too long, exiting.")

Expand Down Expand Up @@ -3063,6 +3065,16 @@ def replace_hung_processes(self) -> bool:
if len(self.jobs_being_safety_checked) > 0:
logger.error("Jobs are still being safety checked...")

self._recently_recovered = True

def timed_unset_recently_recovered() -> None:
time.sleep(60)
self._recently_recovered = False

import threading

threading.Thread(target=timed_unset_recently_recovered).start()

return True

any_replaced = False
Expand Down

0 comments on commit c2a2fb6

Please sign in to comment.