Skip to content

Commit

Permalink
fix: restart when vllm crashes with OutOfMemory (#105)
Browse files Browse the repository at this point in the history
Co-authored-by: Avram Tudor <[email protected]>
  • Loading branch information
quitrk and Avram Tudor authored Oct 9, 2024
1 parent 91e18ba commit 63513e8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
11 changes: 0 additions & 11 deletions skynet/modules/ttt/openai_api/app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import subprocess

from skynet import http_client
Expand All @@ -13,7 +12,6 @@
vllm_server_path,
)
from skynet.logs import get_logger
from skynet.modules.monitoring import OPENAI_API_RESTART_COUNTER
from skynet.utils import get_device

proc = None
Expand Down Expand Up @@ -72,13 +70,4 @@ def destroy():
proc.kill()


def restart():
    """Force a full process restart of Skynet.

    Increments the restart metric, then terminates the interpreter
    immediately with exit code 1. An external supervisor (e.g. a
    process manager / container runtime) is expected to relaunch
    the service — nothing in-process survives this call.

    NOTE(review): os._exit bypasses atexit handlers, finally blocks,
    and buffered-stream flushing by design; confirm no cleanup is
    needed before the hard exit.
    """
    log.info('Restarting Skynet...')

    OPENAI_API_RESTART_COUNTER.inc()

    # rely on the supervisor to restart the process
    os._exit(1)


# Public API of this module.
__all__ = ['destroy', 'initialize', 'restart']
19 changes: 17 additions & 2 deletions skynet/modules/ttt/summaries/jobs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import os
import time
import uuid

Expand All @@ -7,13 +8,14 @@
from skynet.env import job_timeout, modules, redis_exp_seconds, summary_minimum_payload_length
from skynet.logs import get_logger
from skynet.modules.monitoring import (
OPENAI_API_RESTART_COUNTER,
SUMMARY_DURATION_METRIC,
SUMMARY_ERROR_COUNTER,
SUMMARY_INPUT_LENGTH_METRIC,
SUMMARY_QUEUE_SIZE_METRIC,
SUMMARY_TIME_IN_QUEUE_METRIC,
)
from skynet.modules.ttt.openai_api.app import is_ready as is_openai_api_ready, restart as restart_openai_api
from skynet.modules.ttt.openai_api.app import is_ready as is_openai_api_ready

from .persistence import db
from .processor import process, process_azure, process_open_ai
Expand All @@ -32,6 +34,15 @@
current_task = None


def restart():
    """Force a full process restart of Skynet.

    Increments the restart metric, then terminates the interpreter
    immediately with exit code 1. An external supervisor is expected
    to relaunch the service; os._exit skips atexit handlers and any
    pending cleanup by design.
    """
    log.info('Restarting Skynet...')

    OPENAI_API_RESTART_COUNTER.inc()

    # rely on the supervisor to restart the process
    os._exit(1)


def can_run_next_job() -> bool:
    """Return True when this process may pick up the next summary job.

    Requires the 'summaries:executor' module to be enabled, and either
    no job task to exist yet or the previous task to have finished.
    """
    if 'summaries:executor' not in modules:
        return False
    return current_task is None or current_task.done()

Expand Down Expand Up @@ -202,6 +213,10 @@ async def _run_job(job: Job) -> None:

await update_done_job(job, result, processor, has_failed)

# error returned from the api when vllm crashes with torch.OutOfMemoryError
if result == 'Error code: 500' and processor == Processors.LOCAL:
restart()


def create_run_job_task(job: Job) -> asyncio.Task:
global current_task
Expand Down Expand Up @@ -245,7 +260,7 @@ async def restart_on_timeout(job: Job) -> None:

await update_done_job(job, "Job timed out", Processors.LOCAL, has_failed=True)

restart_openai_api()
restart()


def start_monitoring_jobs() -> None:
Expand Down

0 comments on commit 63513e8

Please sign in to comment.