Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prometheus metrics for brozzler, plus yt-dlp #287

Closed
wants to merge 15 commits into from
42 changes: 42 additions & 0 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,24 @@ def brozzle_page(argv=None):
action="store_true",
help="Try to avoid web bot detection",
)
arg_parser.add_argument(
galgeek marked this conversation as resolved.
Show resolved Hide resolved
"--metrics_port",
dest="metrics_port",
default=8888,
help="Prometheus metrics port",
galgeek marked this conversation as resolved.
Show resolved Hide resolved
)
arg_parser.add_argument(
"--registry_url",
dest="registry_url",
default=None,
help="Prometheus scrape target registry URL",
galgeek marked this conversation as resolved.
Show resolved Hide resolved
)
arg_parser.add_argument(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider adding:

    choices=["qa", "prod"],

"--env",
dest="env",
default=None,
help="env for Prometheus target registry",
)
arg_parser.add_argument(
"--screenshot-full-page", dest="screenshot_full_page", action="store_true"
)
Expand Down Expand Up @@ -279,6 +297,9 @@ def brozzle_page(argv=None):
window_height=args.window_height,
window_width=args.window_width,
stealth=args.stealth,
metrics_port=int(args.metrics_port),
registry_url=args.registry_url,
env=args.env,
)

def on_screenshot(screenshot_jpeg):
Expand Down Expand Up @@ -517,6 +538,24 @@ def brozzler_worker(argv=None):
action="store_true",
help="Try to avoid web bot detection",
)
arg_parser.add_argument(
"--metrics_port",
dest="metrics_port",
default=8888,
help="Prometheus metrics port",
)
arg_parser.add_argument(
"--registry_url",
dest="registry_url",
default=None,
help="Prometheus scrape target registry URL",
)
arg_parser.add_argument(
"--env",
dest="env",
default=None,
help="env for Prometheus target registry",
)
add_common_options(arg_parser, argv)

args = arg_parser.parse_args(args=argv[1:])
Expand Down Expand Up @@ -573,6 +612,9 @@ def get_skip_av_seeds():
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
stealth=args.stealth,
metrics_port=int(args.metrics_port),
registry_url=args.registry_url,
env=args.env,
)

signal.signal(signal.SIGQUIT, dump_state)
Expand Down
59 changes: 59 additions & 0 deletions brozzler/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import Optional

try:
from http_sd_registry.client import (
Client,
Env,
Registration,
Scheme,
format_self_target,
)
from http_sd_registry.config import ClientConfig
except ImportError:
http_sd_registry = None
galgeek marked this conversation as resolved.
Show resolved Hide resolved


from prometheus_client import Counter, Gauge, Histogram, start_http_server

# fmt: off
brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler")
brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler")
galgeek marked this conversation as resolved.
Show resolved Hide resolved
brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler")
brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit")
galgeek marked this conversation as resolved.
Show resolved Hide resolved
brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler")
brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"])
galgeek marked this conversation as resolved.
Show resolved Hide resolved
brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"])
brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"])
brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"])
brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp")
brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"])
galgeek marked this conversation as resolved.
Show resolved Hide resolved
brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"])
# fmt: on


def _load_http_sd_registry():
    """Import the optional ``http_sd_registry`` client and config modules.

    Returns:
        A ``(client_module, config_module)`` tuple.

    Raises:
        ImportError: if ``http_sd_registry`` is not installed. The message
            names the missing package so the failure is actionable.
    """
    try:
        from http_sd_registry import client, config
    except ImportError as err:
        raise ImportError(
            "registry_url was given but the optional 'http_sd_registry' "
            "package is not installed; install it or omit registry_url"
        ) from err
    return client, config


def register_prom_metrics(
    metrics_port: Optional[int] = 8888,
    registry_url: Optional[str] = None,
    env: Optional[str] = None,
) -> None:
    """Expose brozzler's Prometheus metrics and optionally register the target.

    Args:
        metrics_port: Port for the Prometheus scrape endpoint. ``None``
            disables metrics entirely: no server is started and nothing is
            registered. ``Worker`` passes ``None`` when its caller did not
            supply a port, so this case must not reach
            ``start_http_server``.
        registry_url: Base URL of the scrape-target registry. When ``None``
            only the local endpoint is started.
        env: ``"qa"`` or ``"prod"``; mapped to the matching
            ``http_sd_registry`` ``Env`` member. Any other value leaves the
            registration's ``env`` as ``None``.

    Raises:
        ImportError: if ``registry_url`` is set but ``http_sd_registry`` is
            not installed.
    """
    if metrics_port is None:
        # Without a port there is no endpoint to scrape and therefore no
        # target to register.
        return

    # Resolve the optional dependency *before* starting the server so a
    # misconfiguration fails fast without leaving a listener behind.
    sd_modules = None
    if registry_url is not None:
        sd_modules = _load_http_sd_registry()

    # Start metrics endpoint for scraping
    start_http_server(metrics_port)

    if sd_modules is None:
        return
    sd_client, sd_config = sd_modules

    env_for_prom = None
    if env == "qa":
        env_for_prom = sd_client.Env.qa
    elif env == "prod":
        env_for_prom = sd_client.Env.prod

    config = sd_config.ClientConfig(server_url_base=registry_url)
    client = sd_client.Client(config)
    target = sd_client.format_self_target(scrape_port=metrics_port)
    registration = sd_client.Registration(
        target=target,
        env=env_for_prom,
        scheme=sd_client.Scheme.http,
    )
    # NOTE(review): presumably this keeps the registration refreshed from a
    # background thread -- confirm against the http_sd_registry client.
    client.keep_registered_threaded(registration)
20 changes: 20 additions & 0 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import urlcanon
from requests.structures import CaseInsensitiveDict
import rethinkdb as rdb
from . import metrics
from . import ydl

r = rdb.RethinkDB()
Expand Down Expand Up @@ -71,6 +72,9 @@ def __init__(
stealth=False,
window_height=900,
window_width=1400,
metrics_port=None,
registry_url=None,
env=None,
):
self._frontier = frontier
self._service_registry = service_registry
Expand All @@ -93,6 +97,9 @@ def __init__(
self._window_height = window_height
self._window_width = window_width
self._stealth = stealth
self._metrics_port = metrics_port
self._registry_url = registry_url
self._env = env

self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
Expand All @@ -104,6 +111,9 @@ def __init__(
self._start_stop_lock = threading.Lock()
self._shutdown = threading.Event()

# Setup metrics
metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env)

def _choose_warcprox(self):
warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes:
Expand Down Expand Up @@ -266,6 +276,7 @@ def brozzle_page(
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
Expand Down Expand Up @@ -311,7 +322,15 @@ def _needs_browsing(self, page_headers):
return False
return True

@metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress()
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def update_page_metrics(page, outlinks):
"""Update page-level Prometheus metrics."""
metrics.brozzler_last_page_crawled_time.set_to_current_time()
metrics.brozzler_pages_crawled.inc(1)
metrics.brozzler_outlinks_found.inc(len(outlinks))

def _on_screenshot(screenshot_jpeg):
if on_screenshot:
on_screenshot(screenshot_jpeg)
Expand Down Expand Up @@ -416,6 +435,7 @@ def _on_service_worker_version_updated(chrome_msg):
)
if final_page_url != page.url:
page.note_redirect(final_page_url)
update_page_metrics(page, outlinks)
return outlinks

def _fetch_url(self, site, url=None, page=None):
Expand Down
5 changes: 4 additions & 1 deletion brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import json
import doublethink
import datetime
from . import metrics
import threading

thread_local = threading.local()
Expand Down Expand Up @@ -296,9 +297,10 @@ def _remember_videos(page, pushed_videos=None):

def _try_youtube_dl(worker, ydl, site, page):
ytdlp_url = page.redirect_url if page.redirect_url else page.url
ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split("?")[0]
try:
logging.info("trying yt-dlp on %s", ytdlp_url)

metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1)
with brozzler.thread_accept_exceptions():
# we do whatwg canonicalization here to avoid "<urlopen error
# no host given>" resulting in ProxyError
Expand All @@ -307,6 +309,7 @@ def _try_youtube_dl(worker, ydl, site, page):
ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
)
metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1)
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,10 @@ def find_package_data(package):
"jinja2>=2.10",
"cryptography>=2.3",
"python-magic>=0.4.15",
"prometheus-client>=0.20.0",
],
extras_require={
"yt-dlp": ["yt-dlp==2024.7.25"],
"yt-dlp": ["yt-dlp>=2024.7.25"],
"dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
"easy": [
"warcprox>=2.4.31",
Expand Down