From 674231ed2225d740edd25409bd2726fbde129989 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Wed, 13 Nov 2024 11:38:24 -0500 Subject: [PATCH 1/4] Add undated AGU conference campaign for annual use --- framework/auth/campaigns.py | 9 +++++++ framework/auth/views.py | 2 +- tests/test_campaigns.py | 1 + tests/test_views.py | 6 ++--- website/mails/mails.py | 4 +++ .../emails/confirm_agu_conference.html.mako | 26 +++++++++++++++++++ website/util/metrics.py | 1 + 7 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 website/templates/emails/confirm_agu_conference.html.mako diff --git a/framework/auth/campaigns.py b/framework/auth/campaigns.py index 8a902245817..a47b3cf637b 100644 --- a/framework/auth/campaigns.py +++ b/framework/auth/campaigns.py @@ -100,6 +100,15 @@ def get_campaigns(): } }) + newest_campaigns.update({ + 'agu_conference': { + 'system_tag': CampaignSourceTags.AguConference.value, + 'redirect_url': furl(DOMAIN).add(path='dashboard/').url, + 'confirmation_email_template': mails.CONFIRM_EMAIL_AGU_CONFERENCE, + 'login_type': 'native', + } + }) + CAMPAIGNS = newest_campaigns CAMPAIGNS_LAST_REFRESHED = timezone.now() diff --git a/framework/auth/views.py b/framework/auth/views.py index e398a6db0a5..5f999aaaca6 100644 --- a/framework/auth/views.py +++ b/framework/auth/views.py @@ -944,7 +944,7 @@ def register_user(**kwargs): ) if settings.CONFIRM_REGISTRATIONS_BY_EMAIL: - send_confirm_email_async(user, email=user.username) + send_confirm_email(user, email=user.username) message = language.REGISTRATION_SUCCESS.format(email=user.username) return {'message': message} else: diff --git a/tests/test_campaigns.py b/tests/test_campaigns.py index 587aaaa82d8..1df6a32169a 100644 --- a/tests/test_campaigns.py +++ b/tests/test_campaigns.py @@ -46,6 +46,7 @@ def setUp(self): 'osf-registries', 'osf-registered-reports', 'agu_conference_2023', + 'agu_conference', ] self.refresh = timezone.now() campaigns.CAMPAIGNS = None # force campaign refresh now that preprint providers are populated diff --git a/tests/test_views.py b/tests/test_views.py index f1dbaa3285d..d78e7760c17 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -3438,8 +3438,8 @@ def test_register_after_being_invited_as_unreg_contributor(self, mock_update_sea assert new_user.check_password(password) assert new_user.fullname == real_name - @mock.patch('framework.auth.views.send_confirm_email_async') - def test_register_sends_user_registered_signal(self, mock_send_confirm_email_async): + @mock.patch('framework.auth.views.send_confirm_email') + def test_register_sends_user_registered_signal(self, mock_send_confirm_email): url = api_url_for('register_user') name, email, password = fake.name(), fake_email(), 'underpressure' with capture_signals() as mock_signals: @@ -3453,7 +3453,7 @@ def test_register_sends_user_registered_signal(self, mock_send_confirm_email_asy } ) assert mock_signals.signals_sent() == {auth.signals.user_registered, auth.signals.unconfirmed_user_created} - assert mock_send_confirm_email_async.called + assert mock_send_confirm_email.called @mock.patch('framework.auth.views.mails.send_mail') def test_resend_confirmation(self, send_mail: MagicMock): diff --git a/website/mails/mails.py b/website/mails/mails.py index da66ad8d083..afca9e78f03 100644 --- a/website/mails/mails.py +++ b/website/mails/mails.py @@ -191,6 +191,10 @@ def get_english_article(word): 'confirm_agu_conference_2023', subject='OSF Account Verification, from the American Geophysical Union Conference' ) +CONFIRM_EMAIL_AGU_CONFERENCE = Mail( + 'confirm_agu_conference', + subject='OSF Account Verification, from the American Geophysical Union Conference' +) CONFIRM_EMAIL_PREPRINTS = lambda name, provider: Mail( f'confirm_preprints_{name}', subject=f'OSF Account Verification, {provider}' diff --git a/website/templates/emails/confirm_agu_conference.html.mako b/website/templates/emails/confirm_agu_conference.html.mako new file mode 100644 index 00000000000..603e2c39e8d --- /dev/null +++ b/website/templates/emails/confirm_agu_conference.html.mako @@ -0,0 +1,26 @@ +<%inherit file="notify_base.mako" /> + +<%def name="content()"> + + + Hello ${user.fullname},
+
+ + Thank you for joining us at the AGU Open Science Pavilion, and welcome to the Open Science Framework (OSF). + + We are pleased to offer a special AGU attendees exclusive 1:1 consultation to continue our conversation and to help + you get oriented on the OSF. This is an opportunity for us to show you useful OSF features, talk about + open science in Earth and space sciences, and for you to ask any questions you may have. + You can sign up to participate by completing this form, and a member of our team will be in touch to + determine your availability: +
+ https://docs.google.com/forms/d/e/1FAIpQLSeJ23YPaEMdbLY1OqbcP85Tt6rhLpFoOtH0Yg4vY_wSKULRcw/viewform?usp=sf_link +

+ To confirm your OSF account, please verify your email address by visiting this link:
+
+ ${confirmation_url}
+
+ From the team at the Center for Open Science
+ + + diff --git a/website/util/metrics.py b/website/util/metrics.py index 7324a410138..c76adb89f5a 100644 --- a/website/util/metrics.py +++ b/website/util/metrics.py @@ -57,6 +57,7 @@ class CampaignSourceTags(Enum): OsfRegisteredReports = campaign_source_tag('osf_registered_reports') Osf4m = campaign_source_tag('osf4m') AguConference2023 = campaign_source_tag('agu_conference_2023') + AguConference = campaign_source_tag('agu_conference') class OsfClaimedTags(Enum): From cb0c07844eb87a9b7344777f1986c23861a6adb1 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 22 Nov 2024 06:54:43 -0500 Subject: [PATCH 2/4] [ENG-6590] Fix: Monthly Usage Data - update monthly reporters with `iter_report_kwargs` (mostly affects `PublicItemUsageReporter`, which was badly optimized to generate many reports at once) - add `schedule_monthly_reporter` task that schedules tasks from `iter_report_kwargs` results - change `MonthlyReporter.followup_task()` to run per-report --- admin/management/views.py | 8 +- osf/features.yaml | 5 + .../commands/monthly_reporters_go.py | 127 ++++-- osf/metrics/preprint_metrics.py | 4 +- osf/metrics/reporters/_base.py | 19 +- .../reporters/institution_summary_monthly.py | 13 +- osf/metrics/reporters/institutional_users.py | 33 +- osf/metrics/reporters/public_item_usage.py | 378 ++++++++++-------- osf/metrics/reporters/spam_count.py | 7 +- osf/metrics/utils.py | 8 + osf_tests/metrics/reporters/__init__.py | 0 osf_tests/metrics/reporters/_testutils.py | 10 + .../test_institutional_summary_reporter.py | 7 +- .../test_institutional_users_reporter.py | 15 +- .../test_public_item_usage_reporter.py | 148 ++++--- osf_tests/metrics/test_yearmonth.txt | 7 + 16 files changed, 485 insertions(+), 304 deletions(-) create mode 100644 osf_tests/metrics/reporters/__init__.py create mode 100644 osf_tests/metrics/reporters/_testutils.py diff --git a/admin/management/views.py b/admin/management/views.py index 88548a518d1..bb7065c1062 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -12,6 +12,7 @@ from scripts.find_spammy_content import manage_spammy_content from django.urls import reverse from django.shortcuts import redirect +from osf.metrics.utils import YearMonth from osf.models import Preprint, Node, Registration @@ -122,8 +123,11 @@ def post(self, request, *args, **kwargs): report_date = None errors = monthly_reporters_go( - report_month=getattr(report_date, 'month', None), - report_year=getattr(report_date, 'year', None) + yearmonth=( + str(YearMonth.from_date(report_date)) + if report_date is not None + else '' + ), ) if errors: diff --git a/osf/features.yaml b/osf/features.yaml index a3f0fcc1f14..1b41e4b2cdc 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -221,3 +221,8 @@ switches: - flag_name: ENABLE_INACTIVE_SCHEMAS name: enable_inactive_schemas note: This is no longer used + + - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024 + name: countedusage_unified_metrics_2024 + note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc + active: false diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index c467640cd15..7ab7b843434 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -1,68 +1,125 @@ +import datetime import logging from django.core.management.base import BaseCommand -from django.db.utils import OperationalError -from django.utils import timezone +from django.db import OperationalError as DjangoOperationalError +from elasticsearch.exceptions import ConnectionError as ElasticConnectionError +from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app +import framework.sentry from osf.metrics.reporters import AllMonthlyReporters from osf.metrics.utils import YearMonth -from website.app import init_app logger = logging.getLogger(__name__) -MAXMONTH = 12 - +_CONTINUE_AFTER_ERRORS = ( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, +) @celery_app.task(name='management.commands.monthly_reporters_go') -def monthly_reporters_go(report_year=None, report_month=None): - init_app() # OSF-specific setup - - if report_year and report_month: - report_yearmonth = YearMonth(report_year, report_month) - else: # default to last month if year and month not provided - today = timezone.now().date() - report_yearmonth = YearMonth( - year=today.year if today.month > 1 else today.year - 1, - month=today.month - 1 or MAXMONTH, - ) - for _reporter_key in AllMonthlyReporters.__members__.keys(): - monthly_reporter_go.apply_async(kwargs={ +def monthly_reporters_go(yearmonth: str = '', reporter_key: str = ''): + _yearmonth = ( + YearMonth.from_str(yearmonth) + if yearmonth + else YearMonth.from_date(datetime.date.today()).prior() # default last month + ) + _reporter_keys = ( + [reporter_key] + if reporter_key + else _enum_names(AllMonthlyReporters) + ) + for _reporter_key in _reporter_keys: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': str(_yearmonth), 'reporter_key': _reporter_key, - 'yearmonth': str(report_yearmonth), }) +@celery_app.task(name='management.commands.schedule_monthly_reporter') +def schedule_monthly_reporter( + yearmonth: str, + reporter_key: str, + continue_after: dict | None = None, +): + _reporter = _get_reporter(reporter_key, yearmonth) + _last_kwargs = None + try: + for _kwargs in _reporter.iter_report_kwargs(continue_after=continue_after): + monthly_reporter_do.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'report_kwargs': _kwargs, + }) + _last_kwargs = _kwargs + except _CONTINUE_AFTER_ERRORS as _error: + # let the celery task succeed but log the error + framework.sentry.log_exception(_error) + # schedule another task to continue scheduling + if _last_kwargs is not None: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'continue_after': _last_kwargs, + }) + + @celery_app.task( - name='management.commands.monthly_reporter_go', - autoretry_for=(OperationalError,), + name='management.commands.monthly_reporter_do', + autoretry_for=( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, + ), max_retries=5, retry_backoff=True, - bind=True, ) -def monthly_reporter_go(task, reporter_key: str, yearmonth: str): - _reporter_class = AllMonthlyReporters[reporter_key].value - _reporter = _reporter_class(YearMonth.from_str(yearmonth)) - _reporter.run_and_record_for_month() - _followup = _reporter.followup_task() - if _followup is not None: - _followup.apply_async() +def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict): + _reporter = _get_reporter(reporter_key, yearmonth) + _report = _reporter.report(**report_kwargs) + if _report is not None: + _report.report_yearmonth = _reporter.yearmonth + _report.save() + _followup_task = _reporter.followup_task(_report) + if _followup_task is not None: + _followup_task.apply_async() class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( 'yearmonth', - type=YearMonth.from_str, - default={'year': None, 'month': None}, + type=str, help='year and month (YYYY-MM)', ) + parser.add_argument( + '-r', '--reporter', + type=str, + choices={_name.lower() for _name in _enum_names(AllMonthlyReporters)}, + default='', + help='name of the reporter to run (default all)', + ) - def handle(self, *args, **options): + def handle(self, *args, **kwargs): monthly_reporters_go( - report_year=getattr(options.get('yearmonth'), 'year', None), - report_month=getattr(options.get('yearmonth'), 'month', None), + yearmonth=kwargs['yearmonth'], + reporter_key=kwargs['reporter'].upper(), ) - self.stdout.write(self.style.SUCCESS('reporter tasks scheduled.')) + self.stdout.write(self.style.SUCCESS( + f'scheduling tasks for monthly reporter "{kwargs['reporter']}"...' + if kwargs['reporter'] + else 'scheduling tasks for all monthly reporters...' + )) + + +def _get_reporter(reporter_key: str, yearmonth: str): + _reporter_class = AllMonthlyReporters[reporter_key].value + return _reporter_class(YearMonth.from_str(yearmonth)) + + +def _enum_names(enum_cls) -> list[str]: + return list(enum_cls.__members__.keys()) diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index 472cd01f698..4b64398a5c6 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -37,8 +37,8 @@ def record_for_preprint(cls, preprint, user=None, **kwargs): ) @classmethod - def get_count_for_preprint(cls, preprint, after=None, before=None, index=None): - search = cls.search(after=after, before=before, index=index).filter('match', preprint_id=preprint._id) + def get_count_for_preprint(cls, preprint, after=None, before=None, index=None) -> int: + search = cls.search(index=index).filter('term', preprint_id=preprint._id) timestamp = {} if after: timestamp['gte'] = after diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index 931afe23fd0..707e869522b 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -15,18 +15,17 @@ class MonthlyReporter: yearmonth: YearMonth - def report(self) -> abc.Iterable[MonthlyReport] | abc.Iterator[MonthlyReport]: + def iter_report_kwargs(self, continue_after: dict | None = None) -> abc.Iterator[dict]: + # override for multiple reports per month + if continue_after is None: + yield {} # by default, calls `.report()` once with no kwargs + + def report(self, **report_kwargs) -> MonthlyReport | None: """build a report for the given month """ - raise NotImplementedError(f'{self.__name__} must implement `report`') - - def run_and_record_for_month(self) -> None: - reports = self.report() - for report in reports: - report.report_yearmonth = self.yearmonth - report.save() + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') - def followup_task(self) -> celery.Signature | None: + def followup_task(self, report) -> celery.Signature | None: return None @@ -36,7 +35,7 @@ def report(self, report_date): return an iterable of DailyReport (unsaved) """ - raise NotImplementedError(f'{self.__name__} must implement `report`') + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') def run_and_record_for_date(self, report_date): reports = self.report(report_date) diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py index 998cc056298..4748860db32 100644 --- a/osf/metrics/reporters/institution_summary_monthly.py +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -11,9 +11,16 @@ class InstitutionalSummaryMonthlyReporter(MonthlyReporter): """Generate an InstitutionMonthlySummaryReport for each institution.""" - def report(self): - for institution in Institution.objects.all(): - yield self.generate_report(institution) + def iter_report_kwargs(self, continue_after: dict | None = None): + _inst_qs = Institution.objects.order_by('pk') + if continue_after: + _inst_qs = _inst_qs.filter(pk__gt=continue_after['institution_pk']) + for _pk in _inst_qs.values_list('pk', flat=True): + yield {'institution_pk': _pk} + + def report(self, **report_kwargs): + _institution = Institution.objects.get(pk=report_kwargs['institution_pk']) + return self.generate_report(_institution) def generate_report(self, institution): node_queryset = institution.nodes.filter( diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index e0f7f42a156..e34875d4b28 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -1,5 +1,4 @@ import dataclasses -import datetime from django.contrib.contenttypes.models import ContentType from django.db.models import Q, F, Sum @@ -12,9 +11,6 @@ from ._base import MonthlyReporter -_CHUNK_SIZE = 500 - - class InstitutionalUsersReporter(MonthlyReporter): '''build an InstitutionalUserReport for each institution-user affiliation @@ -22,13 +18,27 @@ class InstitutionalUsersReporter(MonthlyReporter): which offers institutional admins insight into how people at their institution are using osf, based on their explicitly-affiliated osf objects ''' - def report(self): + def iter_report_kwargs(self, continue_after: dict | None = None): _before_datetime = self.yearmonth.month_end() - for _institution in osfdb.Institution.objects.filter(created__lt=_before_datetime): + _inst_qs = ( + osfdb.Institution.objects + .filter(created__lt=_before_datetime) + .order_by('pk') + ) + if continue_after: + _inst_qs = _inst_qs.filter(pk__gte=continue_after['institution_pk']) + for _institution in _inst_qs: _user_qs = _institution.get_institution_users().filter(created__lt=_before_datetime) - for _user in _user_qs.iterator(chunk_size=_CHUNK_SIZE): - _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth, _before_datetime) - yield _helper.report + if continue_after and (_institution.pk == continue_after['institution_pk']): + _user_qs = _user_qs.filter(pk__gt=continue_after['user_pk']) + for _user_pk in _user_qs.values_list('pk', flat=True): + yield {'institution_pk': _institution.pk, 'user_pk': _user_pk} + + def report(self, **report_kwargs): + _institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk']) + _user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk']) + _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth) + return _helper.report # helper @@ -37,7 +47,6 @@ class _InstiUserReportHelper: institution: osfdb.Institution user: osfdb.OSFUser yearmonth: YearMonth - before_datetime: datetime.datetime report: InstitutionalUserReport = dataclasses.field(init=False) def __post_init__(self): @@ -64,6 +73,10 @@ def __post_init__(self): storage_byte_count=self._storage_byte_count(), ) + @property + def before_datetime(self): + return self.yearmonth.month_end() + def _node_queryset(self): _institution_node_qs = self.institution.nodes.filter( created__lt=self.before_datetime, diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index ecc34a5d9c7..cc401d50bd7 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -1,17 +1,24 @@ from __future__ import annotations +import datetime import typing -import celery +import waffle if typing.TYPE_CHECKING: import elasticsearch_dsl as edsl +import osf.features from osf.metadata.osf_gathering import OsfmapPartition from osf.metrics.counted_usage import ( CountedAuthUsage, get_item_type, get_provider_id, ) +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf import models as osfdb from website import settings as website_settings from ._base import MonthlyReporter @@ -31,80 +38,128 @@ class PublicItemUsageReporter(MonthlyReporter): includes projects, project components, registrations, registration components, and preprints ''' - - def report(self): - # use two composite aggregations in parallel to page thru every - # public item viewed or downloaded this month, counting: - # - views and downloads for each item (using `CountedAuthUsage.item_guid`) - # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) - for _exact_bucket, _contained_views_bucket in _zip_composite_aggs( - self._exact_item_search(), 'agg_osfid', - self._contained_item_views_search(), 'agg_surrounding_osfid', + def iter_report_kwargs(self, continue_after: dict | None = None): + _after_osfid = continue_after['osfid'] if continue_after else None + for _osfid in _zip_sorted( + self._countedusage_osfids(_after_osfid), + self._preprintview_osfids(_after_osfid), + self._preprintdownload_osfids(_after_osfid), ): - try: - _report = self._report_from_buckets(_exact_bucket, _contained_views_bucket) - yield _report - except _SkipItem: - pass + yield {'osfid': _osfid} + + def report(self, **report_kwargs): + _osfid = report_kwargs['osfid'] + # get usage metrics from several sources: + # - osf.metrics.counted_usage: + # - views and downloads for each item (using `CountedAuthUsage.item_guid`) + # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) + # - osf.metrics.preprint_metrics: + # - preprint views and downloads + # - PageCounter? (no) + try: + _guid = osfdb.Guid.load(_osfid) + if _guid is None or _guid.referent is None: + raise _SkipItem + _obj = _guid.referent + _report = self._init_report(_obj) + self._fill_report_counts(_report, _obj) + if not any(( + _report.view_count, + _report.view_session_count, + _report.download_count, + _report.download_session_count, + )): + raise _SkipItem + return _report + except _SkipItem: + return None + + def followup_task(self, report): + _is_last_month = report.report_yearmonth.next() == YearMonth.from_date(datetime.date.today()) + if _is_last_month: + from api.share.utils import task__update_share + return task__update_share.signature( + args=(report.item_osfid,), + kwargs={ + 'is_backfill': True, + 'osfmap_partition_name': OsfmapPartition.MONTHLY_SUPPLEMENT.name, + }, + countdown=30, # give index time to settle + ) - def followup_task(self): - return task__update_monthly_metadatas.signature( - args=[str(self.yearmonth)], - countdown=30, # give index time to settle + def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = self._base_usage_search() + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + size=_CHUNK_SIZE, ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _report_from_buckets(self, exact_bucket, contained_views_bucket): - # either exact_bucket or contained_views_bucket may be None, but not both - assert (exact_bucket is not None) or (contained_views_bucket is not None) - _report = ( - self._init_report_from_exact_bucket(exact_bucket) - if exact_bucket is not None - else self._init_report_from_osfid(contained_views_bucket.key.osfid) + def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintView.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, ) - # view counts include views on contained items (components, files) - _report.view_count, _report.view_session_count = self._get_view_counts(_report.item_osfid) - return _report + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _init_report_from_exact_bucket(self, exact_bucket) -> PublicItemUsageReport: - # in the (should-be common) case of an item that has been directly viewed in - # this month, the stored metrics already have the data required - _report = PublicItemUsageReport( - item_osfid=exact_bucket.key.osfid, - item_type=_agg_keys(exact_bucket.agg_item_type), - provider_id=_agg_keys(exact_bucket.agg_provider_id), - platform_iri=_agg_keys(exact_bucket.agg_platform_iri), - # default counts to zero, will be updated if non-zero - view_count=0, - view_session_count=0, - download_count=0, - download_session_count=0, + def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintDownload.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits ) - for _actionbucket in exact_bucket.agg_action: - # note: view counts computed separately to avoid double-counting - if _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value: - _report.download_count = _actionbucket.doc_count - _report.download_session_count = _actionbucket.agg_session_count.value - return _report + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, + ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _init_report_from_osfid(self, osfid: str) -> PublicItemUsageReport: - # for the (should-be unusual) case where the components/files contained by - # an item have views in this month, but the item itself does not -- - # load necessary info via django models, instead - _osfguid = osfdb.Guid.load(osfid) - if _osfguid is None or not getattr(_osfguid.referent, 'is_public', False): + def _init_report(self, osf_obj) -> PublicItemUsageReport: + if not _is_item_public(osf_obj): raise _SkipItem return PublicItemUsageReport( - item_osfid=osfid, - item_type=[get_item_type(_osfguid.referent)], - provider_id=[get_provider_id(_osfguid.referent)], + item_osfid=osf_obj._id, + item_type=[get_item_type(osf_obj)], + provider_id=[get_provider_id(osf_obj)], platform_iri=[website_settings.DOMAIN], - # default counts to zero, will be updated if non-zero - view_count=0, - view_session_count=0, - download_count=0, - download_session_count=0, + # leave counts null; will be set if there's data ) + def _fill_report_counts(self, report, osf_obj): + if ( + isinstance(osf_obj, osfdb.Preprint) + and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined] + ): + # note: no session-count info in preprint metrics + report.view_count = self._preprint_views(osf_obj) + report.download_count = self._preprint_downloads(osf_obj) + else: + ( + report.view_count, + report.view_session_count, + ) = self._countedusage_view_counts(osf_obj) + ( + report.download_count, + report.download_session_count, + ) = self._countedusage_download_counts(osf_obj) + def _base_usage_search(self): return ( CountedAuthUsage.search() @@ -113,59 +168,10 @@ def _base_usage_search(self): 'gte': self.yearmonth.month_start(), 'lt': self.yearmonth.month_end(), }) - .update_from_dict({'size': 0}) # only aggregations, no hits + .extra(size=0) # only aggregations, no hits ) - def _exact_item_search(self) -> edsl.Search: - '''aggregate views and downloads on each osfid (not including components/files)''' - _search = self._base_usage_search() - # the main agg: use a composite aggregation to page thru *every* item - _agg_osfid = _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], - size=_CHUNK_SIZE, - ) - # nested agg: for each item, get platform_iri values - _agg_osfid.bucket('agg_platform_iri', 'terms', field='platform_iri') - # nested agg: for each item, get provider_id values - _agg_osfid.bucket('agg_provider_id', 'terms', field='provider_id') - # nested agg: for each item, get item_type values - _agg_osfid.bucket('agg_item_type', 'terms', field='item_type') - # nested agg: for each item, get download count - _agg_action = _agg_osfid.bucket( - 'agg_action', - 'terms', - field='action_labels', - include=[ - CountedAuthUsage.ActionLabel.DOWNLOAD.value, - ], - ) - # nested nested agg: get download session count - _agg_action.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - return _search - - def _contained_item_views_search(self) -> edsl.Search: - '''iterate osfids with views on contained components and files''' - _search = ( - self._base_usage_search() - .filter('term', action_labels=CountedAuthUsage.ActionLabel.VIEW.value) - ) - # the main agg: use a composite aggregation to page thru *every* item - _search.aggs.bucket( - 'agg_surrounding_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'surrounding_guids'}}}], - size=_CHUNK_SIZE, - ) - return _search - - def _get_view_counts(self, osfid: str) -> tuple[int, int]: + def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]: '''compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` @@ -179,8 +185,8 @@ def _get_view_counts(self, osfid: str) -> tuple[int, int]: {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, ], should=[ - {'term': {'item_guid': osfid}}, - {'term': {'surrounding_guids': osfid}}, + {'term': {'item_guid': osf_obj._id}}, + {'term': {'surrounding_guids': osf_obj._id}}, ], minimum_should_match=1, ) @@ -193,86 +199,108 @@ def _get_view_counts(self, osfid: str) -> tuple[int, int]: ) _response = _search.execute() _view_count = _response.hits.total - _view_session_count = _response.aggregations.agg_session_count.value + _view_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) return (_view_count, _view_session_count) - -### -# followup celery task -@celery.shared_task -def task__update_monthly_metadatas(yearmonth: str): - from api.share.utils import task__update_share - _report_search = ( - PublicItemUsageReport.search() - .filter('term', report_yearmonth=yearmonth) - .source(['item_osfid']) # return only the 'item_osfid' field - ) - for _hit in _report_search.scan(): - task__update_share.delay( - _hit.item_osfid, - is_backfill=True, - osfmap_partition_name=OsfmapPartition.MONTHLY_SUPPLEMENT.name, + def _countedusage_download_counts(self, osf_obj) -> tuple[int, int]: + '''aggregate downloads on each osfid (not including components/files)''' + _search = ( + self._base_usage_search() + .filter('term', item_guid=osf_obj._id) + .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) + ) + # agg: get download session count + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _download_count = _response.hits.total + _download_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) + return (_download_count, _download_session_count) + + def _preprint_views(self, preprint: osfdb.Preprint) -> int: + '''aggregate views on each preprint''' + return PreprintView.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), ) + def _preprint_downloads(self, preprint: osfdb.Preprint) -> int: + '''aggregate downloads on each preprint''' + return PreprintDownload.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), + ) -### -# local helpers - -def _agg_keys(bucket_agg_result) -> list: - return [_bucket.key for _bucket in bucket_agg_result] +def _is_item_public(osfid_referent) -> bool: + if isinstance(osfid_referent, osfdb.Preprint): + return bool(osfid_referent.verified_publishable) # quacks like Preprint + return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode -def _zip_composite_aggs( - search_a: edsl.Search, - composite_agg_name_a: str, - search_b: edsl.Search, - composite_agg_name_b: str, -): - '''iterate thru two composite aggregations, yielding pairs of buckets matched by key - the composite aggregations must have matching names in `sources` so their keys can be compared +def _zip_sorted( + *iterators: typing.Iterator[str], +) -> typing.Iterator[str]: + '''loop thru multiple iterators on sorted (ascending) sequences of strings ''' - _iter_a = _iter_composite_buckets(search_a, composite_agg_name_a) - _iter_b = _iter_composite_buckets(search_b, composite_agg_name_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) + _nexts = { # holds the next value from each iterator, or None + _i: next(_iter, None) + for _i, _iter in enumerate(iterators) + } while True: - if _next_a is None and _next_b is None: - return # both done - elif _next_a is None or _next_b is None: - # one is done but not the other -- no matching needed - yield (_next_a, _next_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) - elif _next_a.key == _next_b.key: - # match -- yield and increment both - yield (_next_a, _next_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) - elif _orderable_key(_next_a) < _orderable_key(_next_b): - # mismatch -- yield and increment a (but not b) - yield (_next_a, None) - _next_a = next(_iter_a, None) - else: - # mismatch -- yield and increment b (but not a) - yield (None, _next_b) - _next_b = next(_iter_b, None) - - -def _iter_composite_buckets(search: edsl.Search, composite_agg_name: str): + _nonnull_nexts = [ + _next + for _next in _nexts.values() + if _next is not None + ] + if not _nonnull_nexts: + return # all done + _value = min(_nonnull_nexts) + yield _value + for _i, _iter in enumerate(iterators): + if _nexts[_i] == _value: + _nexts[_i] = next(_iter, None) + + +def _iter_composite_bucket_keys( + search: edsl.Search, + composite_agg_name: str, + composite_source_name: str, + after: str | None = None, +) -> typing.Iterator[str]: '''iterate thru *all* buckets of a composite aggregation, requesting new pages as needed assumes the given search has a composite aggregation of the given name + with a single value source of the given name updates the search in-place for subsequent pages ''' + if after is not None: + search.aggs[composite_agg_name].after = {composite_source_name: after} while True: _page_response = search.execute(ignore_cache=True) # reused search object has the previous page cached try: _agg_result = _page_response.aggregations[composite_agg_name] except KeyError: return # no data; all done - yield from _agg_result.buckets + for _bucket in _agg_result.buckets: + _key = _bucket.key.to_dict() + assert set(_key.keys()) == {composite_source_name}, f'expected only one key ("{composite_source_name}") in {_bucket.key}' + yield _key[composite_source_name] # update the search for the next page try: _next_after = _agg_result.after_key @@ -280,7 +308,3 @@ def _iter_composite_buckets(search: edsl.Search, composite_agg_name: str): return # all done else: search.aggs[composite_agg_name].after = _next_after - - -def _orderable_key(composite_bucket) -> list: - return sorted(composite_bucket.key.to_dict().items()) diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 94290f96203..cb1c3eeb641 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -8,11 +8,12 @@ class SpamCountReporter(MonthlyReporter): - def report(self): + def report(self, **report_kwargs): + assert not report_kwargs target_month = self.yearmonth.month_start() next_month = self.yearmonth.month_end() - report = SpamSummaryReport( + return SpamSummaryReport( # Node Log entries node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, @@ -79,5 +80,3 @@ def report(self): created__lt=next_month, ).count() ) - - return [report] diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 910b1f3104c..febfd24d6d2 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -58,6 +58,14 @@ def next(self) -> YearMonth: else YearMonth(self.year, self.month + 1) ) + def prior(self) -> YearMonth: + """get a new YearMonth for the month before this one""" + return ( + YearMonth(self.year - 1, int(calendar.DECEMBER)) + if self.month == calendar.JANUARY + else YearMonth(self.year, self.month - 1) + ) + def month_start(self) -> datetime.datetime: """get a datetime (in UTC timezone) when this YearMonth starts""" return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) diff --git a/osf_tests/metrics/reporters/__init__.py b/osf_tests/metrics/reporters/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/osf_tests/metrics/reporters/_testutils.py b/osf_tests/metrics/reporters/_testutils.py new file mode 100644 index 00000000000..0d18f3bcac9 --- /dev/null +++ b/osf_tests/metrics/reporters/_testutils.py @@ -0,0 +1,10 @@ +from osf.metrics.reporters._base import MonthlyReporter +from osf.metrics.reports import MonthlyReport + + +def list_monthly_reports(reporter: MonthlyReporter) -> list[MonthlyReport]: + _reports = ( + reporter.report(**_kwargs) + for _kwargs in reporter.iter_report_kwargs() + ) + return [_report for _report in _reports if (_report is not None)] diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py index 715a2cd1553..05baa4d38e7 100644 --- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -11,6 +11,7 @@ PreprintFactory, AuthUserFactory, ) +from ._testutils import list_monthly_reports class TestInstiSummaryMonthlyReporter(TestCase): @@ -78,7 +79,7 @@ def _create_active_user(cls, institution, date_confirmed): def test_report_generation(self): reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) self.assertEqual(len(reports), 1) report = reports[0] @@ -114,7 +115,7 @@ def test_report_generation_multiple_institutions(self): # Run the reporter for the current month (February 2018) reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 # Extract reports by institution @@ -263,7 +264,7 @@ def test_high_counts_multiple_institutions(self): if enable_benchmarking: reporter_start_time = time.time() reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) assert len(reports) == additional_institution_count + 1 if enable_benchmarking: diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py index 876fd08cf9b..275fcb1e8a1 100644 --- a/osf_tests/metrics/reporters/test_institutional_users_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -18,6 +18,7 @@ UserFactory, EmbargoFactory, ) +from ._testutils import list_monthly_reports def _patch_now(fakenow: datetime.datetime): @@ -67,24 +68,24 @@ def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _ self.assertEqual(report.published_preprint_count, setup.published_preprint_count) def test_no_users(self): - _actual_reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _actual_reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(_actual_reports, []) def test_one_user_with_nothing(self): self._user_setup_with_nothing.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing) def test_one_user_with_ones(self): self._user_setup_with_ones.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones) def test_one_user_with_stuff_and_no_files(self): self._user_setup_with_stuff.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff) self.assertEqual(_reports[0].public_file_count, 2) # preprint 2 files @@ -96,7 +97,7 @@ def test_one_user_with_stuff_and_a_file(self): _project = _user.nodes.first() with _patch_now(self._now): create_test_file(target=_project, user=_user, size=37) - (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self._assert_report_matches_setup(_report, self._user_setup_with_stuff) self.assertEqual(_report.public_file_count, 3) # 2 preprint files self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files @@ -113,7 +114,7 @@ def test_one_user_with_stuff_and_multiple_files(self): create_test_file(target=_component, user=_user, size=53, filename='bla') create_test_file(target=_component, user=_user, size=51, filename='blar') create_test_file(target=_component, user=_user, size=47, filename='blarg') - (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self._assert_report_matches_setup(_report, self._user_setup_with_stuff) self.assertEqual(_report.public_file_count, 7) # 2 preprint files self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 @@ -130,7 +131,7 @@ def test_several_users(self): _setup.user._id: _setup for _setup in _setups } - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), len(_setup_by_userid)) for _actual_report in _reports: _setup = _setup_by_userid[_actual_report.user_id] diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py index 454b8d6700d..b75c420b1a2 100644 --- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -1,27 +1,48 @@ -from datetime import timedelta +from datetime import datetime, timedelta from operator import attrgetter from unittest import mock import pytest from osf.metrics.counted_usage import CountedAuthUsage +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth +from osf import models as osfdb +from osf_tests import factories +from ._testutils import list_monthly_reports @pytest.mark.es_metrics +@pytest.mark.django_db class TestPublicItemUsageReporter: @pytest.fixture(autouse=True) - def _mocks(self): - with ( - # set a tiny page size to force aggregation pagination: - mock.patch('osf.metrics.reporters.public_item_usage._CHUNK_SIZE', 1), - # HACK: skip auto-filling fields from the database: - mock.patch('osf.models.base.Guid.load', return_value=None), - ): + def _patch_settings(self): + with mock.patch('website.settings.DOMAIN', 'http://osf.example'): yield + @pytest.fixture + def item0(self): + _item0 = factories.PreprintFactory(is_public=True) + _item0._id = 'item0' + return _item0 + + @pytest.fixture + def item1(self): + _item1 = factories.ProjectFactory(is_public=True) + _item1._id = 'item1' + return _item1 + + @pytest.fixture + def item2(self, item1): + _item2 = factories.ProjectFactory(is_public=True, parent=item1) + _item2._id = 'item2' + return _item2 + @pytest.fixture def ym_empty(self) -> YearMonth: return YearMonth(2012, 7) @@ -35,89 +56,87 @@ def ym_busy(self) -> YearMonth: return YearMonth(2023, 7) @pytest.fixture - def sparse_month_usage(self, ym_sparse): + def sparse_month_usage(self, ym_sparse, item0, item1, item2): # "sparse" month: # item0: 3 views, 0 downloads, 2 sessions # item1: 1 views, 1 download, 1 session (plus 1 view from child item2) # item2: 1 views, 0 downloads, 1 session _month_start = ym_sparse.month_start() _save_usage( + item0, timestamp=_month_start, - item_guid='item0', session_id='sesh0', action_labels=['view'], ) _save_usage( + item0, timestamp=_month_start + timedelta(minutes=2), - item_guid='item0', session_id='sesh0', action_labels=['view'], ) _save_usage( + item1, timestamp=_month_start + timedelta(minutes=3), - item_guid='item1', session_id='sesh0', action_labels=['download'], ) _save_usage( + item0, timestamp=_month_start + timedelta(days=17), - item_guid='item0', session_id='sesh1', action_labels=['view'], ) _save_usage( + item1, timestamp=_month_start + timedelta(days=17, minutes=3), - item_guid='item1', session_id='sesh1', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=17, minutes=5), - item_guid='item2', - surrounding_guids=['item1'], session_id='sesh1', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=17, minutes=11), - item_guid='item2', - surrounding_guids=['item1'], session_id='sesh1', action_labels=['download'], ) @pytest.fixture - def busy_month_item0(self, ym_busy): + def busy_month_item0(self, ym_busy, item0): # item0: 4 sessions, 4*7 views, 4*5 downloads _month_start = ym_busy.month_start() for _sesh in range(0, 4): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 7): _save_usage( + item0, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item0', session_id=f'sesh0{_sesh}', action_labels=['view'], ) for _minute in range(10, 15): _save_usage( + item0, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item0', session_id=f'sesh0{_sesh}', action_labels=['download'], ) @pytest.fixture - def busy_month_item1(self, ym_busy): - # item1: 10 sessions, 6*9 views, 5*7 downloads, 2 providers + def busy_month_item1(self, ym_busy, item1): + # item1: 10 sessions, 6*9 views, 5*7 downloads # (plus 11 views in 11 sessions from child item2) _month_start = ym_busy.month_start() for _sesh in range(0, 6): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 9): _save_usage( + item1, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item1', session_id=f'sesh1{_sesh}', action_labels=['view'], ) @@ -125,42 +144,39 @@ def busy_month_item1(self, ym_busy): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(10, 17): _save_usage( + item1, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item1', session_id=f'sesh1{_sesh}', action_labels=['download'], - provider_id='prov1', # additional provider_id ) @pytest.fixture - def busy_month_item2(self, ym_busy): + def busy_month_item2(self, ym_busy, item2): # item2: 11 sessions, 11 views, 11 downloads (child of item1) _month_start = ym_busy.month_start() for _sesh in range(1, 12): _save_usage( + item2, timestamp=_month_start + timedelta(days=_sesh), - item_guid='item2', - surrounding_guids=['item1'], session_id=f'sesh2{_sesh}', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), - item_guid='item2', - surrounding_guids=['item1'], session_id=f'sesh2{_sesh}', action_labels=['download'], ) def test_no_data(self, ym_empty): _reporter = PublicItemUsageReporter(ym_empty) - _empty = list(_reporter.report()) + _empty = list_monthly_reports(_reporter) assert _empty == [] - def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2): - _empty = list(PublicItemUsageReporter(ym_empty).report()) - _sparse = list(PublicItemUsageReporter(ym_sparse).report()) - _busy = list(PublicItemUsageReporter(ym_busy).report()) + def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): + _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) + _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) + _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) # empty month: assert _empty == [] @@ -171,16 +187,16 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # sparse-month item0 assert isinstance(_sparse_item0, PublicItemUsageReport) assert _sparse_item0.item_osfid == 'item0' - assert _sparse_item0.provider_id == ['prov0'] + assert _sparse_item0.provider_id == [item0.provider._id] assert _sparse_item0.platform_iri == ['http://osf.example'] assert _sparse_item0.view_count == 3 - assert _sparse_item0.view_session_count == 2 + assert _sparse_item0.view_session_count is None # no session count for preprints assert _sparse_item0.download_count == 0 - assert _sparse_item0.download_session_count == 0 + assert _sparse_item0.download_session_count is None # no session count for preprints # sparse-month item1 assert isinstance(_sparse_item1, PublicItemUsageReport) assert _sparse_item1.item_osfid == 'item1' - assert _sparse_item1.provider_id == ['prov0'] + assert _sparse_item1.provider_id == ['osf'] assert _sparse_item1.platform_iri == ['http://osf.example'] assert _sparse_item1.view_count == 2 # including item2 assert _sparse_item1.view_session_count == 1 # including item2 @@ -189,7 +205,7 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # sparse-month item2 assert isinstance(_sparse_item1, PublicItemUsageReport) assert _sparse_item2.item_osfid == 'item2' - assert _sparse_item2.provider_id == ['prov0'] + assert _sparse_item2.provider_id == ['osf'] assert _sparse_item2.platform_iri == ['http://osf.example'] assert _sparse_item2.view_count == 1 assert _sparse_item2.view_session_count == 1 @@ -202,16 +218,16 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # busy-month item0 assert isinstance(_busy_item0, PublicItemUsageReport) assert _busy_item0.item_osfid == 'item0' - assert _busy_item0.provider_id == ['prov0'] + assert _busy_item0.provider_id == [item0.provider._id] assert _busy_item0.platform_iri == ['http://osf.example'] assert _busy_item0.view_count == 4 * 7 - assert _busy_item0.view_session_count == 4 + assert _busy_item0.view_session_count is None # no session count for preprints assert _busy_item0.download_count == 4 * 5 - assert _busy_item0.download_session_count == 4 + assert _busy_item0.download_session_count is None # no session count for preprints # busy-month item1 assert isinstance(_busy_item1, PublicItemUsageReport) assert _busy_item1.item_osfid == 'item1' - assert _busy_item1.provider_id == ['prov0', 'prov1'] + assert _busy_item1.provider_id == ['osf'] assert _busy_item1.platform_iri == ['http://osf.example'] assert _busy_item1.view_count == 6 * 9 + 11 assert _busy_item1.view_session_count == 6 + 11 @@ -220,7 +236,7 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # busy-month item2 assert isinstance(_busy_item2, PublicItemUsageReport) assert _busy_item2.item_osfid == 'item2' - assert _busy_item2.provider_id == ['prov0'] + assert _busy_item2.provider_id == ['osf'] assert _busy_item2.platform_iri == ['http://osf.example'] assert _busy_item2.view_count == 11 assert _busy_item2.view_session_count == 11 @@ -228,11 +244,41 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m assert _busy_item2.download_session_count == 11 -def _save_usage(**kwargs): - _kwargs = { # overridable defaults: +def _save_usage( + item, + *, + timestamp: datetime, + action_labels: list[str], + **kwargs, +): + _countedusage_kwargs = { + 'timestamp': timestamp, + 'item_guid': item._id, + 'action_labels': action_labels, 'platform_iri': 'http://osf.example', - 'item_public': True, - 'provider_id': 'prov0', **kwargs, } - CountedAuthUsage(**_kwargs).save(refresh=True) + CountedAuthUsage(**_countedusage_kwargs).save(refresh=True) + if isinstance(item, osfdb.Preprint): + if 'view' in action_labels: + _save_preprint_view(item, timestamp) + if 'download' in action_labels: + _save_preprint_download(item, timestamp) + + +def _save_preprint_view(preprint, timestamp): + PreprintView( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) + + +def _save_preprint_download(preprint, timestamp): + PreprintDownload( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) diff --git a/osf_tests/metrics/test_yearmonth.txt b/osf_tests/metrics/test_yearmonth.txt index 646c73c42f9..fae6b990c36 100644 --- a/osf_tests/metrics/test_yearmonth.txt +++ b/osf_tests/metrics/test_yearmonth.txt @@ -35,6 +35,13 @@ YearMonth(year=1491, month=12) >>> ym.next().next() YearMonth(year=1492, month=1) +`prior` method gives the prior year-month: +>>> ym = YearMonth(1492, 2) +>>> ym.prior() +YearMonth(year=1492, month=1) +>>> ym.prior().prior() +YearMonth(year=1491, month=12) + `month_start` method: >>> YearMonth(3333, 3).month_start() datetime.datetime(3333, 3, 1, 0, 0, tzinfo=datetime.timezone.utc) From 0ec9101d1bbcc4df8e163fc7283ee9b5b1e7da2b Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 2 Dec 2024 15:45:02 -0500 Subject: [PATCH 3/4] Avoid Sequence Scans on BFN --- osf/metrics/reporters/institutional_users.py | 37 ++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index e34875d4b28..512472a3d96 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -68,7 +68,7 @@ def __post_init__(self): private_project_count=self._private_project_queryset().count(), public_registration_count=self._public_registration_queryset().count(), embargoed_registration_count=self._embargoed_registration_queryset().count(), - public_file_count=self._public_osfstorage_file_queryset().count(), + public_file_count=self._public_osfstorage_file_count(), published_preprint_count=self._published_preprint_queryset().count(), storage_byte_count=self._storage_byte_count(), ) @@ -127,7 +127,7 @@ def _published_preprint_queryset(self): .exclude(spam_status=SpamStatus.SPAM) ) - def _public_osfstorage_file_queryset(self): + def _public_osfstorage_file_querysets(self): _target_node_q = Q( # any public project, registration, project component, or registration component target_object_id__in=self._node_queryset().filter(is_public=True).values('pk'), @@ -137,23 +137,40 @@ def _public_osfstorage_file_queryset(self): target_object_id__in=self._published_preprint_queryset().values('pk'), target_content_type=ContentType.objects.get_for_model(osfdb.Preprint), ) - return ( + return ( # split into two queries to avoid a parallel sequence scan on BFN + OsfStorageFile.objects + .filter( + created__lt=self.before_datetime, + deleted__isnull=True, + purged__isnull=True, + ) + .filter(_target_node_q), OsfStorageFile.objects .filter( created__lt=self.before_datetime, deleted__isnull=True, purged__isnull=True, ) - .filter(_target_node_q | _target_preprint_q) + .filter(_target_preprint_q) + ) + + def _public_osfstorage_file_count(self): + return sum( + _target_queryset.count() for _target_queryset + in self._public_osfstorage_file_querysets() ) def _storage_byte_count(self): - return osfdb.FileVersion.objects.filter( - size__gt=0, - created__lt=self.before_datetime, - purged__isnull=True, - basefilenode__in=self._public_osfstorage_file_queryset(), - ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + return sum( + osfdb.FileVersion.objects.filter( + size__gt=0, + created__lt=self.before_datetime, + purged__isnull=True, + basefilenode__in=_target_queryset, + ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + for _target_queryset + in self._public_osfstorage_file_querysets() + ) def _get_last_active(self): end_date = self.yearmonth.month_end() From 0a510f5cf477a8018a2c1886cdda3cdbb5a1ccf6 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 2 Dec 2024 15:47:55 -0500 Subject: [PATCH 4/4] Use low queue for metric reporters - h/t @aaxelb --- website/settings/defaults.py | 1 + 1 file changed, 1 insertion(+) diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 0467ef3c166..91e3c1bacc6 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -446,6 +446,7 @@ class CeleryConfig: 'osf.management.commands.daily_reporters_go', 'osf.management.commands.monthly_reporters_go', 'osf.management.commands.ingest_cedar_metadata_templates', + 'osf.metrics.reporters', } med_pri_modules = {