Skip to content

Commit

Permalink
Merge pull request #1283 from OCR-D/mets-server-kill-zombies
Browse files Browse the repository at this point in the history
add endpoint DELETE /kill_mets_server_zombies to kill -SIGTERM METS servers with ctime > 60mins ago
  • Loading branch information
kba authored Oct 2, 2024
2 parents 3882e7a + a8bfbe4 commit c5fd843
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 6 deletions.
10 changes: 8 additions & 2 deletions src/ocrd/mets_server.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""
# METS server functionality
"""
import os
import re
from os import _exit, chmod
import signal
from typing import Dict, Optional, Union, List, Tuple
from time import sleep
from pathlib import Path
Expand Down Expand Up @@ -428,8 +430,12 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int

@staticmethod
def kill_process(mets_server_pid: int):
    """
    Stop a METS server process by PID.

    Sends SIGINT first, waits three seconds, then sends SIGKILL in case
    the process is still around.

    Args:
        mets_server_pid: PID of the METS server process to stop.

    Returns:
        None. A process that does not exist (any more) is not an error.
    """
    try:
        os.kill(mets_server_pid, signal.SIGINT)
    except ProcessLookupError:
        # Process is already gone: nothing to stop, no need to wait.
        return
    # Grace period between the polite SIGINT and the forced SIGKILL.
    sleep(3)
    try:
        os.kill(mets_server_pid, signal.SIGKILL)
    except ProcessLookupError:
        # Process exited during the grace period.
        pass

def shutdown(self):
if self.is_uds:
Expand Down
13 changes: 13 additions & 0 deletions src/ocrd_network/processing_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
get_workflow_content,
get_from_database_workspace,
get_from_database_workflow_job,
kill_mets_server_zombies,
parse_workflow_tasks,
raise_http_exception,
request_processor_server_tool_json,
Expand Down Expand Up @@ -200,6 +201,14 @@ def add_api_routes_others(self):
tags=[ServerApiTags.WORKSPACE],
summary="Forward a TCP request to UDS mets server"
)
others_router.add_api_route(
path="/kill_mets_server_zombies",
endpoint=self.kill_mets_server_zombies,
methods=["DELETE"],
tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING],
status_code=status.HTTP_200_OK,
summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago."
)
self.include_router(others_router)

def add_api_routes_processing(self):
Expand Down Expand Up @@ -817,6 +826,10 @@ async def get_workflow_info(self, workflow_job_id) -> Dict:
response = self._produce_workflow_status_response(processing_jobs=jobs)
return response

async def kill_mets_server_zombies(self) -> List[int]:
    """
    Handler for the ``DELETE /kill_mets_server_zombies`` route.

    Delegates to the imported ``kill_mets_server_zombies`` helper with a
    fixed threshold of 60 minutes and hands back whatever it returns.
    """
    return kill_mets_server_zombies(minutes_ago=60)

async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]:
"""
Simplified version of the `get_workflow_info` that returns a single state for the entire workflow.
Expand Down
36 changes: 32 additions & 4 deletions src/ocrd_network/server_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import os
import re
import signal
from pathlib import Path
from json import dumps, loads
from urllib.parse import urljoin
from typing import Dict, List, Union
from time import time

from fastapi import HTTPException, status, UploadFile
from fastapi.responses import FileResponse
from httpx import AsyncClient, Timeout
from json import dumps, loads
from logging import Logger
from pathlib import Path
from requests import get as requests_get
from typing import Dict, List, Union
from urllib.parse import urljoin
from requests_unixsocket import sys

from ocrd.resolver import Resolver
from ocrd.task_sequence import ProcessorTask
Expand Down Expand Up @@ -241,3 +247,25 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s
if group not in available_groups:
message = f"Input file group '{group}' of the first processor not found: {input_file_grps}"
raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)


def kill_mets_server_zombies(minutes_ago=60) -> List[int]:
    """
    Send SIGTERM to METS server processes that are older than a threshold.

    Scans the numeric entries of ``/proc``. A process is signalled when the
    ctime of its ``/proc/<pid>`` directory lies at least ``minutes_ago``
    whole minutes in the past and its command line matches
    ``ocrd workspace -U ... server start``.

    Processes that vanish while scanning, or that cannot be signalled, are
    skipped instead of aborting the whole sweep.

    Args:
        minutes_ago: minimum age in minutes for a process to be signalled.

    Returns:
        PIDs (as integers) that were sent SIGTERM, oldest ctime first.
    """
    now = time()
    # The trailing space is the final NUL of /proc/<pid>/cmdline after the
    # NUL -> space replacement below.
    cmdline_pat = re.compile(r'.*ocrd workspace -U.*server start $')

    # Collect (ctime, pid) pairs first so processes are handled oldest first.
    candidates = []
    for procdir in Path('/proc').glob('*'):
        # Only numeric entries are processes (skips e.g. `self`, `thread-self`).
        if not procdir.name.isdigit():
            continue
        try:
            candidates.append((procdir.stat().st_ctime, int(procdir.name)))
        except OSError:
            # Process exited between listing and stat.
            continue

    ret = []
    for ctime, pid in sorted(candidates):
        ctime_ago = int((now - ctime) / 60)
        if ctime_ago < minutes_ago:
            continue
        try:
            # errors='replace': an undecodable command line of some unrelated
            # process must not break the sweep.
            raw_cmdline = Path('/proc', str(pid), 'cmdline').read_text(errors='replace')
        except OSError:
            # Process exited (or is unreadable) in the meantime.
            continue
        cmdline = raw_cmdline.replace('\x00', ' ')
        if not cmdline_pat.match(cmdline):
            continue
        print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline}")', file=sys.stderr)
        try:
            os.kill(pid, signal.SIGTERM)
        except (ProcessLookupError, PermissionError) as error:
            print(f'Could not send SIGTERM to METS Server with PID {pid}: {error}', file=sys.stderr)
            continue
        ret.append(pid)
    return ret

0 comments on commit c5fd843

Please sign in to comment.