Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prometheus metrics and retry updates #289

Merged
merged 31 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
edf2f9e
retries for proxyrack and push
Sep 11, 2024
2d7efba
initial commit
Sep 11, 2024
7de5b1c
add some labels
Sep 12, 2024
1d63793
add update_page_metrics
Sep 12, 2024
1d84e72
update setup.py: skip prometheus-async; update ytdlp
Sep 12, 2024
c82858a
make http_sd_registry optional
Sep 12, 2024
66827fb
more metrics
Sep 12, 2024
80ce6c0
register_prom_metrics working in dev env
Sep 12, 2024
7b6c306
setup registry_url, metrics_port, env vars and CLI args
Sep 13, 2024
6609645
tidy params for register_prom_metrics
Sep 13, 2024
1bc2135
better registry_url help
Sep 13, 2024
9d7b7c4
add @metrics.brozzler_in_progress_pages.track_in_progress()
Sep 13, 2024
8c20d1d
add (back) env param
Sep 13, 2024
1a6aeb4
track_inprogress (not _in_progress)
Sep 13, 2024
551d186
better order
Sep 13, 2024
4a60ff3
post-deploy bug fixes
Sep 14, 2024
cfafc56
Merge branch 'proxyrack' into metrics_plus_proxy_retries
Sep 16, 2024
bb1c343
updates for review of PR 287
Sep 18, 2024
8b2c254
brozzler_ydl_extract, not download
Sep 18, 2024
62b1243
mostly ydl.py updates for new proxyrack testing
Sep 18, 2024
27cb104
more extract (less download)
Sep 18, 2024
2aa1788
mostly black'd
Sep 18, 2024
f624e7e
bump qa-ish version
Sep 19, 2024
7fbee54
proxyrack only for youtube.com
Sep 19, 2024
229d53d
add brozzler_ydl_download_successes metric
Sep 19, 2024
c74d9ad
limit proxy logging
Sep 20, 2024
7de6483
skip brozzler_resources metrics
Sep 20, 2024
3e9030a
rm oddly merged try except block
Sep 23, 2024
fb43d3f
bump version
Sep 23, 2024
6a0b0b0
updates post-walkthru
Sep 24, 2024
9983f43
black'd
Sep 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,25 @@ def brozzle_page(argv=None):
action="store_true",
help="Try to avoid web bot detection",
)
arg_parser.add_argument(
"--metrics_port",
type=int,
dest="metrics_port",
default=8888,
galgeek marked this conversation as resolved.
Show resolved Hide resolved
help="Port for brozzler's Prometheus scrape endpoint",
)
arg_parser.add_argument(
"--registry_url",
dest="registry_url",
default=None,
help="http-sd-registry url, for Prometheus metrics discovery",
)
arg_parser.add_argument(
"--env",
dest="env",
default=None,
help="env for Prometheus target registry",
galgeek marked this conversation as resolved.
Show resolved Hide resolved
)
arg_parser.add_argument(
"--screenshot-full-page", dest="screenshot_full_page", action="store_true"
)
Expand Down Expand Up @@ -279,6 +298,9 @@ def brozzle_page(argv=None):
window_height=args.window_height,
window_width=args.window_width,
stealth=args.stealth,
metrics_port=int(args.metrics_port),
galgeek marked this conversation as resolved.
Show resolved Hide resolved
registry_url=args.registry_url,
env=args.env,
)

def on_screenshot(screenshot_jpeg):
Expand Down Expand Up @@ -517,6 +539,25 @@ def brozzler_worker(argv=None):
action="store_true",
help="Try to avoid web bot detection",
)
arg_parser.add_argument(
"--metrics_port",
type=int,
dest="metrics_port",
default=8888,
help="Port for brozzler's Prometheus scrape endpoint",
)
arg_parser.add_argument(
"--registry_url",
dest="registry_url",
default=None,
help="http-sd-registry url, for Prometheus metrics discovery",
)
arg_parser.add_argument(
"--env",
dest="env",
default=None,
help="env for Prometheus target registry",
)
add_common_options(arg_parser, argv)

args = arg_parser.parse_args(args=argv[1:])
Expand Down Expand Up @@ -573,6 +614,9 @@ def get_skip_av_seeds():
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
stealth=args.stealth,
metrics_port=int(args.metrics_port),
registry_url=args.registry_url,
env=args.env,
)

signal.signal(signal.SIGQUIT, dump_state)
Expand Down
58 changes: 58 additions & 0 deletions brozzler/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import Optional

try:
from http_sd_registry.client import (
Client,
Env,
Registration,
Scheme,
format_self_target,
)
from http_sd_registry.config import ClientConfig
except ImportError:
# for users without access to http_sd_registry
http_sd_registry = None


from prometheus_client import Counter, Gauge, Histogram, start_http_server

# Module-level Prometheus collectors. They are created once at import time and
# updated from the worker and yt-dlp code paths.

# --- page crawling ---------------------------------------------------------
brozzler_pages_crawled = Counter(
    "brozzler_pages_crawled",
    "number of pages visited by brozzler",
)
brozzler_page_processing_duration_seconds = Histogram(
    "brozzler_page_processing_duration_seconds",
    "time spent processing a page in brozzler",
)
brozzler_outlinks_found = Counter(
    "brozzler_outlinks_found",
    "number of outlinks found by brozzler",
)
brozzler_last_page_crawled_time = Gauge(
    "brozzler_last_page_crawled_time",
    "time of last page visit, in seconds since UNIX epoch",
)
brozzler_in_progress_pages = Gauge(
    "brozzler_in_progress_pages",
    "number of pages currently processing with brozzler",
)

# --- yt-dlp ----------------------------------------------------------------
brozzler_ydl_urls_checked = Counter(
    "brozzler_ydl_urls_checked",
    "count of urls checked by brozzler yt-dlp",
)
# The three counters below are labelled by "youtube_host".
brozzler_ydl_extract_attempts = Counter(
    "brozzler_ydl_extract_attempts",
    "count of extracts attempted by brozzler yt-dlp",
    labelnames=["youtube_host"],
)
brozzler_ydl_extract_successes = Counter(
    "brozzler_ydl_extract_successes",
    "count of extracts completed by brozzler yt-dlp",
    labelnames=["youtube_host"],
)
brozzler_ydl_download_successes = Counter(
    "brozzler_ydl_download_successes",
    "count of downloads completed by brozzler yt-dlp",
    labelnames=["youtube_host"],
)


def register_prom_metrics(
    metrics_port: Optional[int] = 8888,
    registry_url: Optional[str] = None,
    env: Optional[str] = None,
):
    """Start the Prometheus scrape endpoint and optionally register it.

    Args:
        metrics_port: TCP port for the scrape endpoint. ``None`` means
            metrics are disabled: a warning is logged and nothing is started
            (the worker's constructor defaults this value to ``None``).
        registry_url: Base URL of an http-sd-registry server. When ``None``
            the scrape endpoint is started but not registered anywhere.
        env: ``"prod"`` selects ``Env.prod``; any other value (including
            ``"qa"`` and ``None``) selects ``Env.qa``.

    Raises:
        ImportError: if ``registry_url`` is given but the optional
            ``http_sd_registry`` package is not installed.
    """
    import logging

    if metrics_port is None:
        logging.warning(
            "not starting prometheus scrape endpoint: metrics_port is undefined"
        )
        return

    if registry_url is not None:
        # http_sd_registry is an optional dependency. Resolve it here, before
        # anything is started, so a missing package produces a clear error
        # instead of a NameError after the scrape server is already running.
        try:
            from http_sd_registry.client import (
                Client,
                Env,
                Registration,
                Scheme,
                format_self_target,
            )
            from http_sd_registry.config import ClientConfig
        except ImportError as exc:
            raise ImportError(
                "registry_url was provided but the http_sd_registry package "
                "is not installed"
            ) from exc

    # Start metrics endpoint for scraping
    start_http_server(metrics_port)

    if registry_url is None:
        return

    # Anything other than an explicit "prod" is registered as qa.
    env_for_prom = Env.prod if env == "prod" else Env.qa

    config = ClientConfig(server_url_base=registry_url)
    client = Client(config)
    target = format_self_target(scrape_port=metrics_port)
    registration = Registration(
        target=target,
        env=env_for_prom,
        scheme=Scheme.http,
    )
    client.keep_registered_threaded(registration)
20 changes: 20 additions & 0 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import urlcanon
from requests.structures import CaseInsensitiveDict
import rethinkdb as rdb
from . import metrics
from . import ydl

r = rdb.RethinkDB()
Expand Down Expand Up @@ -71,6 +72,9 @@ def __init__(
stealth=False,
window_height=900,
window_width=1400,
metrics_port=None,
registry_url=None,
env=None,
):
self._frontier = frontier
self._service_registry = service_registry
Expand All @@ -93,6 +97,9 @@ def __init__(
self._window_height = window_height
self._window_width = window_width
self._stealth = stealth
self._metrics_port = metrics_port
self._registry_url = registry_url
self._env = env

self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
Expand All @@ -104,6 +111,9 @@ def __init__(
self._start_stop_lock = threading.Lock()
self._shutdown = threading.Event()

# Setup metrics
metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env)
galgeek marked this conversation as resolved.
Show resolved Hide resolved

def _choose_warcprox(self):
warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes:
Expand Down Expand Up @@ -266,6 +276,7 @@ def brozzle_page(
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
Expand Down Expand Up @@ -311,7 +322,15 @@ def _needs_browsing(self, page_headers):
return False
return True

@metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress()
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def update_page_metrics(page, outlinks):
    """Record Prometheus metrics for one browsed page.

    Sets the last-crawled gauge to the current time, counts one more page,
    and adds ``len(outlinks)`` to the outlinks counter. ``page`` is accepted
    but not read here.
    """
    metrics.brozzler_last_page_crawled_time.set_to_current_time()
    metrics.brozzler_pages_crawled.inc()
    metrics.brozzler_outlinks_found.inc(amount=len(outlinks))

def _on_screenshot(screenshot_jpeg):
if on_screenshot:
on_screenshot(screenshot_jpeg)
Expand Down Expand Up @@ -416,6 +435,7 @@ def _on_service_worker_version_updated(chrome_msg):
)
if final_page_url != page.url:
page.note_redirect(final_page_url)
update_page_metrics(page, outlinks)
return outlinks

def _fetch_url(self, site, url=None, page=None):
Expand Down
Loading