Skip to content

Commit

Permalink
HH-217736 restart whole app on worker oom
Browse files: browse the repository at this point in the history
  • Loading branch information
712u3 committed May 30, 2024
1 parent 1db6bc4 commit b3e24bb
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 5 deletions.
2 changes: 1 addition & 1 deletion frontik/integrations/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def initialize_app(self, app: FrontikApplication) -> Optional[Future]:

resource = Resource(
attributes={
ResourceAttributes.SERVICE_NAME: options.app, # type: ignore
ResourceAttributes.SERVICE_NAME: app.app_name, # type: ignore
ResourceAttributes.SERVICE_VERSION: app.application_version(), # type: ignore
ResourceAttributes.HOST_NAME: options.node_name,
ResourceAttributes.CLOUD_REGION: http_client_options.datacenter,
Expand Down
14 changes: 12 additions & 2 deletions frontik/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,12 @@ def master_sigterm_handler(signum, _frame):
time.sleep(0.1)
_master_function_wrapper(worker_state, master_function)
worker_state.master_done.value = True
_supervise_workers(worker_state, worker_function_wrapped)
_supervise_workers(worker_state, worker_function_wrapped, master_before_shutdown_action)


def _supervise_workers(worker_state: WorkerState, worker_function: Callable) -> None:
def _supervise_workers(
worker_state: WorkerState, worker_function: Callable, master_before_shutdown_action: Callable
) -> None:
while worker_state.children:
try:
pid, status = os.wait()
Expand All @@ -114,6 +116,14 @@ def _supervise_workers(worker_state: WorkerState, worker_function: Callable) ->

if os.WIFSIGNALED(status):
log.warning('child %d (pid %d) killed by signal %d, restarting', worker_id, pid, os.WTERMSIG(status))

# TODO remove this block # noqa
worker_state.terminating = True
master_before_shutdown_action()
for pid, worker_id in worker_state.children.items():
log.info('sending %s to child %d (pid %d)', signal.Signals(os.WTERMSIG(status)).name, worker_id, pid)
os.kill(pid, signal.SIGTERM)

elif os.WEXITSTATUS(status) != 0:
log.warning('child %d (pid %d) exited with status %d, restarting', worker_id, pid, os.WEXITSTATUS(status))
else:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ ignore = [
# should be ignored
'ANN101','ANN102','D102','D101','CPY001','D100','D107','D106','B008','D103','D104','D105','D202',
'RET505','RET506','RET504','RSE102','TCH003','TCH002',
'COM812', 'ISC001', 'PT015',
'COM812', 'ISC001', 'PT015', 'FIX002',
]

[tool.ruff.lint.isort]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def frontik_app(self) -> FrontikApplication:
return app

async def test_parent_span(self, frontik_app: FrontikApplication) -> None:
await self.fetch_json('/page_a')
await self.fetch('/page_a')
BATCH_SPAN_PROCESSOR[0].force_flush()
assert len(SPAN_STORAGE) == 4
client_a_span = find_span('http.request.cloud.region', 'externalRequest')
Expand Down

0 comments on commit b3e24bb

Please sign in to comment.