diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4688c95..e0b37ea0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: mypy args: [] additional_dependencies: - - pydantic==2.7.4 + - pydantic==2.9.2 - types-requests - types-pytz - types-setuptools @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.15.0 - - horde_sdk==0.14.3 + - horde_engine==2.15.1 + - horde_sdk==0.14.7 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index a1e59a2c..4d8cbdd4 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.15.0 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.7 horde_model_reference~=0.9.0 horde_engine~=2.15.1 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index c659033e..5a352831 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -51,7 +51,8 @@ from horde_sdk.ai_horde_api.fields import JobID from loguru import logger from pydantic import BaseModel, ConfigDict, RootModel, ValidationError -from typing_extensions import override +from typing import Literal, Union +from typing_extensions import override, TypeAlias import horde_worker_regen from horde_worker_regen.bridge_data.data_model import reGenBridgeData @@ -103,18 +104,15 @@ _async_client_exceptions = (asyncio.exceptions.TimeoutError, aiohttp.client_exceptions.ClientError, OSError) _excludes_for_job_dump = { - "job_image_results": ..., + "job_image_results": True, "sdk_api_job_info": { - "payload": { - "prompt", - "special", - }, - "skipped": ..., - "source_image": ..., - "source_mask": ..., - "extra_source_images": ..., - "r2_upload": ..., - "r2_uploads": ..., + "payload": {"prompt": True, "special": True}, + "skipped": True, + "source_image": True, + "source_mask": True, + "extra_source_images": True, + "r2_upload": True, + "r2_uploads": True, }, } @@ -1740,7 +1738,7 @@ def receive_and_handle_process_messages(self) -> None: ) logger.debug( - f"Job data: {message.sdk_api_job_info.model_dump(exclude=_excludes_for_job_dump)}", + f"Job data: {message.sdk_api_job_info.model_dump(exclude=_excludes_for_job_dump)}", # type: ignore ) self.completed_jobs.append(job_info) @@ -2789,7 +2787,7 @@ async def api_submit_job(self) -> None: ): model_dump = hji.model_dump( - exclude=_excludes_for_job_dump, + exclude=_excludes_for_job_dump, # type: ignore ) if ( self.stable_diffusion_reference is not None @@ -3158,6 +3156,9 @@ async def _get_source_images(self, job_pop_response: ImageGenerateJobPopResponse return job_pop_response _last_pop_no_jobs_available: bool = False + 
_too_many_consecutive_failed_jobs: bool = False + _too_many_consecutive_failed_jobs_time: float = 0.0 + _too_many_consecutive_failed_jobs_wait_time = 180 @logger.catch(reraise=True) async def api_job_pop(self) -> None: @@ -3165,6 +3166,19 @@ async def api_job_pop(self) -> None: if self._shutting_down: return + cur_time = time.time() + + if self._too_many_consecutive_failed_jobs: + if ( + cur_time - self._too_many_consecutive_failed_jobs_time + > self._too_many_consecutive_failed_jobs_wait_time + ): + self._consecutive_failed_jobs = 0 + self._too_many_consecutive_failed_jobs = False + self._too_many_consecutive_failed_jobs_time = 0.0 + logger.debug("Resuming job pops after too many consecutive failed jobs") + return + if self._consecutive_failed_jobs >= 3: logger.error( "Too many consecutive failed jobs, pausing job pops. " @@ -3174,9 +3188,8 @@ async def api_job_pop(self) -> None: if self.bridge_data.exit_on_unhandled_faults: logger.error("Exiting due to exit_on_unhandled_faults being enabled") self._abort() - await asyncio.sleep(180) - self._consecutive_failed_jobs = 0 - logger.info("Resuming job pops") + self._too_many_consecutive_failed_jobs = True + self._too_many_consecutive_failed_jobs_time = cur_time return max_jobs_in_queue = self.bridge_data.queue_size + 1 @@ -3937,6 +3950,17 @@ def print_status_method(self) -> None: "mode. Consider disabling `extra_slow_worker` in your config.", ) + if self._too_many_consecutive_failed_jobs: + time_since_failure = time.time() - self._too_many_consecutive_failed_jobs_time + logger.error( + "Too many consecutive failed jobs. This may be due to a misconfiguration or other issue. " + "Please check your logs and configuration.", + ) + logger.error( + f"Time since last job failure: {time_since_failure:.2f}s. 
" + f"{self._too_many_consecutive_failed_jobs_wait_time} seconds must pass before resuming.", + ) + self._last_status_message_time = time.time() _bridge_data_loop_interval = 1.0 diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index 716abf7a..af8abc71 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -64,21 +64,25 @@ def start_inference_process( if amd_gpu: extra_comfyui_args.append("--use-pytorch-cross-attention") - models_not_to_force_load = [] + models_not_to_force_load = ["flux"] if very_high_memory_mode: extra_comfyui_args.append("--gpu-only") elif high_memory_mode: extra_comfyui_args.append("--normalvram") - models_not_to_force_load = [ - "cascade", - ] + models_not_to_force_load.extend( + [ + "cascade", + ], + ) elif low_memory_mode: extra_comfyui_args.append("--novram") - models_not_to_force_load = [ - "sdxl", - "cascade", - ] + models_not_to_force_load.extend( + [ + "sdxl", + "cascade", + ], + ) with logger.catch(reraise=True): hordelib.initialise( diff --git a/requirements.dev.txt b/requirements.dev.txt index 976ab2bb..35c65242 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,9 +1,9 @@ -pytest==8.3.1 -mypy==1.11.0 -black==24.4.2 -ruff==0.5.4 -tox~=4.16.0 -pre-commit~=3.7.1 +pytest==8.3.3 +mypy==1.11.2 +black==24.8.0 +ruff==0.6.5 +tox~=4.18.1 +pre-commit~=3.8.0 build>=0.10.0 coverage>=7.2.7 diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 7f2d185c..91395bc6 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -1,9 +1,9 @@ numpy==1.26.4 torch==2.3.1+rocm6.0 -horde_sdk~=0.14.3 +horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.0 +horde_engine~=2.15.1 horde_model_reference~=0.9.0 python-dotenv @@ -13,7 +13,7 @@ wheel python-Levenshtein -pydantic>=2.7.4 +pydantic>=2.9.2 typing_extensions requests StrEnum diff --git 
a/requirements.txt b/requirements.txt index bb0bc0a1..18ad68a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ numpy==1.26.4 torch==2.3.1 -horde_sdk~=0.14.3 +horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.0 +horde_engine~=2.15.1 horde_model_reference>=0.9.0 python-dotenv @@ -12,7 +12,7 @@ semver python-Levenshtein -pydantic>=2.7.4 +pydantic>=2.9.2 typing_extensions requests StrEnum