diff --git a/.docker-compose.env b/.docker-compose.env index 80eebc8707b..a712f7ac23c 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -6,7 +6,6 @@ DOMAIN=http://localhost:5000/ INTERNAL_DOMAIN=http://192.168.168.167:5000/ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 -ELASTIC6_URI=192.168.168.167:9201 ELASTIC8_URI=http://192.168.168.167:9202 ELASTIC8_USERNAME=elastic OSF_DB_HOST=192.168.168.167 diff --git a/.github/actions/build-es6/action.yml b/.github/actions/build-es6/action.yml deleted file mode 100644 index 5eb71f2147f..00000000000 --- a/.github/actions/build-es6/action.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: 'Build ElasticSearch6' -description: 'Building and starting the ElasticSearch6 service' -inputs: - ELASTICSEARCH6_ARCHIVE: - description: 'Where ES6 archive is located' -runs: - using: "composite" - steps: - - id: ES6-Step-1 - shell: bash - run: | - cd ~/.cache/downloads - if [ ! -f "${{ inputs.ELASTICSEARCH6_ARCHIVE }}" ]; then - curl -SLO https://artifacts.elastic.co/downloads/elasticsearch/${{ inputs.ELASTICSEARCH6_ARCHIVE }} - fi - - mkdir -p /tmp/elasticsearch6 - tar -xzf ${{ inputs.ELASTICSEARCH6_ARCHIVE }} -C /tmp/elasticsearch6 --strip-components=1 - echo "ES6 started..." - - id: ES6-Step-2 - shell: bash - run: /tmp/elasticsearch6/bin/elasticsearch > /dev/null & export ELASTICSEARCH6_PID=$! - - id: ES6-Step-3 - shell: bash - run: | - echo "Waiting for ES6 health..." - sleep 5 - while [ ! $(curl -sf http://localhost:9201/_cluster/health?wait_for_status=yellow) ]; do - echo "trying again..."; - sleep 5; - done - echo "ES6 started successfully!" diff --git a/.github/actions/start-build/action.yml b/.github/actions/start-build/action.yml index 22d8deae0d5..b75362f56f5 100644 --- a/.github/actions/start-build/action.yml +++ b/.github/actions/start-build/action.yml @@ -12,9 +12,6 @@ runs: - uses: ./.github/actions/build-es with: ELASTICSEARCH_ARCHIVE: ${{ env.ELASTICSEARCH_ARCHIVE }} - - uses: ./.github/actions/build-es6 - with: - ELASTICSEARCH6_ARCHIVE: ${{ env.ELASTICSEARCH6_ARCHIVE }} - name: Set up Python 3.12 uses: actions/setup-python@v6 with: diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index f147941c5ff..ead75f5e21e 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -8,7 +8,6 @@ permissions: env: WHEELHOUSE: ~/.cache/wheelhouse ELASTICSEARCH_ARCHIVE: elasticsearch-2.4.5.tar.gz - ELASTICSEARCH6_ARCHIVE: elasticsearch-6.3.1.tar.gz OSF_DB_PORT: 5432 OSF_DB_PASSWORD: postgres GITHUB_ACTIONS: true @@ -140,6 +139,8 @@ jobs: - uses: ./.github/actions/start-build - name: Run tests run: poetry run python3 -m invoke test-ci-api3-and-osf --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report diff --git a/README-docker-compose.md b/README-docker-compose.md index 0956379333f..6cc0f0341b3 100644 --- a/README-docker-compose.md +++ b/README-docker-compose.md @@ -116,11 +116,7 @@ #### Special Instructions for Apple Chipset (M1, M2, etc.) and other ARM64 architecture - * _NOTE: The default `elasticsearch`, `elasticsearch6`, and `sharejs` containers are incompatible with ARM64._ - - - To run `elasticsearch6` on ARM64 architecture: - - - Copy `docker-compose-dist-arm64.override.yml` into your `docker-compose.override.yml` file + * _NOTE: The default `elasticsearch` and `sharejs` containers are incompatible with ARM64._ - Running containers with docker compose @@ -194,7 +190,7 @@ - Start all containers ```bash - alias dcsa="docker compose up -d assets admin_assets mfr wb fakecas sharejs worker elasticsearch elasticsearch6 web api admin preprints gv" + alias dcsa="docker compose up -d assets admin_assets mfr wb fakecas sharejs worker elasticsearch elasticsearch8 web api admin preprints gv" ``` - Shut down all containers diff --git a/addons/base/views.py b/addons/base/views.py index 12b78fb9957..04e620c4c54 100644 --- a/addons/base/views.py +++ b/addons/base/views.py @@ -14,7 +14,7 @@ import waffle from django.db import transaction from django.contrib.contenttypes.models import ContentType -from elasticsearch6 import exceptions as es_exceptions +from elasticsearch8 import exceptions as es_exceptions from rest_framework import status as http_status from api.caching.tasks import update_storage_usage_with_size @@ -34,7 +34,7 @@ from framework.flask import redirect from framework.sentry import log_exception from framework.transactions.handlers import no_auto_transaction -from osf.metrics.es8_metrics import OsfCountedUsageEvent +from osf.metrics.events import OsfCountedUsageEvent from website import settings from addons.base import signals as file_signals from addons.base.utils import format_last_known_metadata, get_mfr_url @@ -54,7 +54,6 @@ FileVersionUserMetadata, FileVersion, NotificationTypeEnum ) -from osf.metrics import PreprintView, PreprintDownload from osf.utils import permissions from osf.external.gravy_valet import request_helpers from website.profile.utils import get_profile_image_url @@ -686,12 +685,6 @@ def osfstoragefile_viewed_update_metrics(self, auth, fileversion, file_node): return if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint): try: - PreprintView.record_for_preprint( - preprint=resource, - user=auth.user, - version=fileversion.identifier, - path=file_node.path, - ) OsfCountedUsageEvent.record( user_id=getattr(user, '_id', None), item_osfid=resource._id, @@ -725,12 +718,6 @@ def osfstoragefile_downloaded_update_metrics(self, auth, fileversion, file_node) return if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint): try: - PreprintDownload.record_for_preprint( - preprint=resource, - user=auth.user, - version=fileversion.identifier, - path=file_node.path, - ) OsfCountedUsageEvent.record( user_id=getattr(user, '_id', None), item_osfid=resource._id, diff --git a/admin/management/urls.py b/admin/management/urls.py index 2e4cd7479a1..3d29a259483 100644 --- a/admin/management/urls.py +++ b/admin/management/urls.py @@ -1,4 +1,4 @@ -from django.urls import re_path, path +from django.urls import re_path from admin.management import views @@ -22,5 +22,4 @@ name='sync_notification_templates'), re_path(r'^remove_orcid_from_user_social', views.RemoveOrcidFromUserSocial.as_view(), name='remove_orcid_from_user_social'), - path('migrate_osfmetrics_6to8', views.MigrateOsfmetrics6to8.as_view(), name='migrate_osfmetrics_6to8'), ] diff --git a/admin/management/views.py b/admin/management/views.py index 04034bfaa08..4b6e6b4c080 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -1,5 +1,3 @@ -from io import StringIO - from dateutil.parser import isoparse from django.views.generic import TemplateView, View from django.contrib import messages @@ -205,22 +203,3 @@ def post(self, request): remove_orcid_from_user_social() messages.success(request, 'Orcid from user social have been successfully removed.') return redirect(reverse('management:commands')) - - -class MigrateOsfmetrics6to8(ManagementCommandPermissionView): - def post(self, request): - _command_kwargs = { - 'no_color': True, - 'no_counts': request.POST.get('no_counts'), - 'clear_state': request.POST.get('clear_state'), - 'clear_es8_data': request.POST.get('clear_es8_data'), - 'start': request.POST.get('start'), - 'unchanged': request.POST.get('unchanged'), - 'usage_reports': request.POST.get('usage_reports'), - 'usage_events': request.POST.get('usage_events'), - } - _out_io = StringIO() - call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io) - for _line in _out_io.getvalue().split('\n'): - messages.info(request, _line) - return redirect(reverse('management:commands')) diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html index 03be151ddbb..fd9ceec9c1b 100644 --- a/admin/templates/management/commands.html +++ b/admin/templates/management/commands.html @@ -190,31 +190,6 @@

Remove existing orcid info from user social

-
-

migrate osf-metrics 6to8

-

- view progress of the osf-metrics migration from elastic6 to elastic8 (or start it) -

-
- {% csrf_token %} - - - - -
- (narrow types: - - - - ) -
- -
-
{% endblock %} diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py index ecf2825d4e8..a8f4292c33e 100644 --- a/api/base/elasticsearch_dsl_views.py +++ b/api/base/elasticsearch_dsl_views.py @@ -3,8 +3,9 @@ import datetime import typing -import elasticsearch6_dsl as edsl +import elasticsearch8.dsl as esdsl from rest_framework import generics, exceptions as drf_exceptions +from rest_framework.serializers import Serializer from rest_framework.settings import api_settings as drf_settings from api.base.settings.defaults import REPORT_FILENAME_FORMAT @@ -23,7 +24,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): - '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue + '''abstract view class using `elasticsearch8.dsl.Search` as a queryset-analogue builds a `Search` based on `self.get_default_search()` and the request's query parameters for filtering, sorting, and pagination -- fetches only @@ -35,18 +36,18 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, ordering_fields: frozenset[str] = frozenset() # serializer field names @abc.abstractmethod - def get_default_search(self) -> edsl.Search | None: - '''the base `elasticsearch6_dsl.Search` for this list, based on url path + def get_default_search(self) -> esdsl.Search | None: + '''the base `elasticsearch8.dsl.Search` for this list, based on url path (common jsonapi query parameters will be considered automatically) ''' ... - FILE_RENDERER_CLASSES = { + FILE_RENDERER_CLASSES = ( MetricsReportsCsvRenderer, MetricsReportsTsvRenderer, MetricsReportsJsonRenderer, - } + ) def set_content_disposition(self, response, renderer: str): """Set the Content-Disposition header to prompt a file download with the appropriate filename. @@ -75,7 +76,7 @@ def finalize_response(self, request, response, *args, **kwargs): response = super().finalize_response(request, response, *args, **kwargs) # Check if this is a direct download request or file renderer classes, set to the Content-Disposition header # so filename and attachment for browser download - if isinstance(request.accepted_renderer, tuple(self.FILE_RENDERER_CLASSES)): + if isinstance(request.accepted_renderer, self.FILE_RENDERER_CLASSES): self.set_content_disposition(response, request.accepted_renderer) return response @@ -95,7 +96,7 @@ def finalize_response(self, request, response, *args, **kwargs): # (filtering handled in-view to reuse logic from FilterMixin) filter_backends = () - # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on, + # note: because elasticsearch8.dsl.Search supports slicing and gives results when iterated on, # it works fine with default pagination # override rest_framework.generics.GenericAPIView @@ -128,10 +129,17 @@ def get_queryset(self): ) return self.__add_sort(_search) + def get_serializer_context(self): + return ( + super().get_serializer_context() + if issubclass(self.get_serializer_class(), Serializer) + else {} # allow custom BaseSerializer-based serializer + ) + ### # private methods - def __add_sort(self, search: edsl.Search) -> edsl.Search: + def __add_sort(self, search: esdsl.Search) -> esdsl.Search: _elastic_sort = self.__get_elastic_sort() return (search if _elastic_sort is None else search.sort(_elastic_sort)) @@ -148,17 +156,20 @@ def __get_elastic_sort(self) -> str | None: raise drf_exceptions.ValidationError( f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})', ) - _serializer_field = self.get_serializer().fields[_sort_field] - _elastic_sort_field = _serializer_field.source + _elastic_sort_field = ( + self.get_serializer().fields[_sort_field].source + if issubclass(self.get_serializer_class(), Serializer) + else _sort_field # allow custom BaseSerializer-based serializer + ) return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}') def __add_search_filter( self, - search: edsl.Search, + search: esdsl.Search, elastic_field_name: str, operator: str, value: str, - ) -> edsl.Search: + ) -> esdsl.Search: match operator: # operators from FilterMixin case 'eq': if value == '': diff --git a/api/base/metrics.py b/api/base/metrics.py index d68f19a45b8..d5027403ca8 100644 --- a/api/base/metrics.py +++ b/api/base/metrics.py @@ -1,15 +1,14 @@ -import re -from datetime import timedelta - +import abc import waffle -from django.utils import timezone from api.base.exceptions import InvalidQueryStringError from osf import features -from website.settings import PREPRINT_METRICS_START_DATE +from osf.metrics.events import OsfCountedUsageEvent +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport +from osf.models.base import osfid_iri -class MetricsViewMixin: +class UsageMetricsViewMixin(abc.ABC): """Mixin for views that expose metrics via django-elasticsearch-metrics. Enables metrics to be requested with a query parameter, like so: :: @@ -18,110 +17,98 @@ class MetricsViewMixin: Any subclass of this mixin MUST do the following: * Use a serializer_class that subclasses MetricsSerializerMixin - * Define metric_map as a class variable. It should be dict mapping metric name - ("downloads") to a Metric class (PreprintDownload) - * For list views: implement `get_annotated_queryset_with_metrics` - * For detail views: implement `add_metric_to_object` + * Call add_metrics_to_object(obj) to get `views` and/or `downloads` + assigned on the obj (according to query params) """ - # Adapted from FilterMixin.QUERY_PATTERN - METRICS_QUERY_PATTERN = re.compile(r'^metrics\[(?P((?:,*\s*\w+)*))\]$') - TIMEDELTA_MAP = { - 'daily': timedelta(hours=24), - 'weekly': timedelta(days=7), - 'monthly': timedelta(days=30), - 'yearly': timedelta(days=365), + METRICS_QUERY_MAP = { + 'metrics[views]': OsfCountedUsageEvent.ActionLabel.VIEW, + 'metrics[downloads]': OsfCountedUsageEvent.ActionLabel.DOWNLOAD, + } + METRICS_ATTR_MAP = { + OsfCountedUsageEvent.ActionLabel.VIEW: 'views', + OsfCountedUsageEvent.ActionLabel.DOWNLOAD: 'downloads', + } + TIMESPAN_MAP = { + 'daily': 'now-1d/d', + 'weekly': 'now-1w/d', + 'monthly': 'now-1M/d', } VALID_METRIC_PERIODS = { 'daily', 'weekly', 'monthly', - 'yearly', 'total', } - @property - def metric_map(self): - raise NotImplementedError('MetricsViewMixin subclasses must define a metric_map class variable.') - - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - """Return a queryset annotated with metrics. Use for list endpoints that expose metrics.""" - raise NotImplementedError('MetricsViewMixin subclasses must define get_annotated_queryset_with_metrics().') - - def add_metric_to_object(self, obj, metric_class, metric_name, after): - """Set an attribute for a metric on obj. Use for detail endpoints that expose metrics. - Return the modified object. - """ - raise NotImplementedError('MetricsViewMixin subclasses must define add_metric_to_object().') - - @property - def metrics_default_after(self): - """Value to be used as the `after` in metrics queries if not otherwise specified. - Datetime or None. - """ - return None - @property def metrics_requested(self): return ( - waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and - bool(self.parse_metric_query_params(self.request.query_params)) + waffle.switch_is_active(features.ELASTICSEARCH_METRICS) + and any(_param in self.METRICS_QUERY_MAP for _param in self.request.query_params) ) - # Adapted from FilterMixin.parse_query_params - # TODO: Should we get rid of query_params argument and use self.request.query_params instead? - def parse_metric_query_params(self, query_params): + def get_item_iri(self, item): + return osfid_iri(item._id) + + def parse_metric_query_params(self): """Parses query parameters to a dict usable for fetching metrics. :param dict query_params: :return dict of the format { - : { - 'period': <[daily|weekly|monthly|yearly|total]>, - } + : <[daily|weekly|monthly|yearly|total]>, } """ query = {} - for key, value in query_params.items(): - match = self.METRICS_QUERY_PATTERN.match(key) - if match: - match_dict = match.groupdict() - metric_name = match_dict['metric_name'] - query[metric_name] = value + for key, value in self.request.query_params.items(): + _usage_label = self.METRICS_QUERY_MAP.get(key) + if _usage_label: + if value not in self.VALID_METRIC_PERIODS: + raise InvalidQueryStringError(f"Invalid period for metric: '{value}'", parameter='metrics') + query[_usage_label] = value return query - def _add_metrics(self, queryset_or_obj, method): - """Parse the ?metric[METRIC]=PERIOD query param, validate it, and - run ``method`` for each requested object. - - This is used to share code between add_metric_to_object and get_metrics_queryset. + def add_metrics_to_object(self, obj): + """Helper method used for detail views. """ - metrics_requested = self.parse_metric_query_params(self.request.query_params) - if metrics_requested: - metric_map = self.metric_map - for metric, period in metrics_requested.items(): - if metric not in metric_map: - raise InvalidQueryStringError(f"Invalid metric in query string: '{metric}'", parameter='metrics') - if period not in self.VALID_METRIC_PERIODS: - raise InvalidQueryStringError(f"Invalid period for metric: '{period}'", parameter='metrics') - metric_class = metric_map[metric] - if period == 'total': - after = self.metrics_default_after + for _action_label, _period in self.parse_metric_query_params().items(): + _count = self._get_usage_count(self.get_item_iri(obj), _action_label, _period) + setattr(obj, self.METRICS_ATTR_MAP[_action_label], _count) + + def _get_usage_count(self, item_iri, action_label, period): + _search = ( + OsfCountedUsageEvent.search() + .filter('term', item_iri=item_iri) + .filter('term', action_labels=action_label.value) + ) + _prior_count = 0 + if _timespan := self.TIMESPAN_MAP.get(period): + _search = _search.filter('range', timestamp={'gte': _timespan}) + else: # cumulative total + _latest_usage_report = self._get_latest_usage_report(item_iri) + if _latest_usage_report: + _search = _search.filter( + 'range', timestamp={ + 'gte': _latest_usage_report.report_yearmonth.month_end(), + }, + ) + if action_label == OsfCountedUsageEvent.ActionLabel.VIEW: + _prior_count = _latest_usage_report.cumulative_view_count + elif action_label == OsfCountedUsageEvent.ActionLabel.DOWNLOAD: + _prior_count = _latest_usage_report.cumulative_download_count else: - after = timezone.now() - self.TIMEDELTA_MAP[period] - queryset_or_obj = method(queryset_or_obj, metric_class, metric, after) - return queryset_or_obj - - def add_metrics_to_object(self, obj): - """Helper method used for detail views.""" - return self._add_metrics(obj, method=self.add_metric_to_object) - - def get_metrics_queryset(self, queryset): - """Helper method used for list views.""" - return self._add_metrics(queryset, method=self.get_annotated_queryset_with_metrics) + raise ValueError(f'unsupported action label {action_label!r}') + _response = _search[0:0].execute() + return _prior_count + _response.doc_count + + def _get_latest_usage_report(self, item_iri): + _search = ( + MonthlyPublicItemUsageReport.search() + .filter('term', item_iri=item_iri) + .sort('-cycle_coverage') + ) + _response = _search[0].execute() + return _response[0] if _response else None - # Override get_default_queryset for convenience - def get_default_queryset(self): - queryset = super().get_default_queryset() - return self.get_metrics_queryset(queryset) class MetricsSerializerMixin: @property @@ -138,9 +125,3 @@ def get_meta(self, obj): meta = meta or {'metrics': {}} meta['metrics'][metric] = getattr(obj, metric) return meta - - -class PreprintMetricsViewMixin(MetricsViewMixin): - @property - def metrics_default_after(self): - return PREPRINT_METRICS_START_DATE diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index ac9a9739f1b..b250b283ed8 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -321,12 +321,6 @@ # django-elasticsearch-metrics DJELME_BACKENDS = { - 'osfmetrics_es6': { - 'elasticsearch_metrics.imps.elastic6': { - 'hosts': osf_settings.ELASTIC6_URI, - 'retry_on_timeout': True, - }, - }, 'osfmetrics_es8': { 'elasticsearch_metrics.imps.elastic8': { # passthru kwargs to elasticsearch8 connection constructor diff --git a/api/institutions/views.py b/api/institutions/views.py index d653f5b4e77..159e303b0ef 100644 --- a/api/institutions/views.py +++ b/api/institutions/views.py @@ -10,8 +10,11 @@ from framework.auth.oauth_scopes import CoreScopes from osf.models import OSFUser, Node, Institution, Registration -from osf.metrics.reports import InstitutionalUserReport, InstitutionMonthlySummaryReport -from osf.metrics.utils import YearMonth +from osf.metrics.monthly_reports import ( + MonthlyInstitutionalUserReport, + MonthlyInstitutionSummaryReport, +) +from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth from osf.utils import permissions as osf_permissions from api.base import permissions as base_permissions @@ -27,11 +30,6 @@ ) from api.base.exceptions import RelationshipPostMakesNoChanges from api.metrics.permissions import IsInstitutionalMetricsUser -from api.metrics.renderers import ( - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, -) from api.nodes.serializers import NodeSerializer from api.nodes.filters import NodesFilterMixin from api.users.serializers import UserSerializer @@ -411,23 +409,21 @@ class InstitutionDepartmentList(InstitutionMixin, ElasticsearchListView): serializer_class = InstitutionDepartmentMetricsSerializer renderer_classes = ( *api_settings.DEFAULT_RENDERER_CLASSES, - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, + *ElasticsearchListView.FILE_RENDERER_CLASSES, ) pagination_class = JSONAPINoPagination def get_default_search(self): _base_search = ( - InstitutionalUserReport.search() + MonthlyInstitutionalUserReport.search() .filter('term', institution_id=self.get_institution()._id) ) - _yearmonth = InstitutionalUserReport.most_recent_yearmonth(base_search=_base_search) - if _yearmonth is None: + _most_recent_cycle = MonthlyInstitutionalUserReport.most_recent_cycle(_base_search) + if _most_recent_cycle is None: return None _search = ( _base_search - .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', cycle_coverage=_most_recent_cycle) .exclude('term', user_name='Deleted user') ) # add aggregation on department name @@ -468,9 +464,7 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): view_name = 'institution-user-metrics' renderer_classes = ( *api_settings.DEFAULT_RENDERER_CLASSES, - MetricsReportsCsvRenderer, - MetricsReportsTsvRenderer, - MetricsReportsJsonRenderer, + *ElasticsearchListView.FILE_RENDERER_CLASSES, ) serializer_class = InstitutionUserMetricsSerializer @@ -492,17 +486,16 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): )) def get_default_search(self): - base_search = InstitutionalUserReport.search().filter( - 'term', - institution_id=self.get_institution()._id, + _base_search = ( + MonthlyInstitutionalUserReport.search() + .filter('term', institution_id=self.get_institution()._id) ) - yearmonth = InstitutionalUserReport.most_recent_yearmonth(base_search=base_search) - if yearmonth is None: + _most_recent_cycle = MonthlyInstitutionalUserReport.most_recent_cycle(_base_search) + if _most_recent_cycle is None: return None - return ( - base_search - .filter('term', report_yearmonth=str(yearmonth)) + _base_search + .filter('term', cycle_coverage=_most_recent_cycle) .exclude('term', user_name='Deleted user') ) @@ -525,29 +518,33 @@ class InstitutionSummaryMetricsDetail(JSONAPIBaseView, generics.RetrieveAPIView, serializer_class = InstitutionSummaryMetricsSerializer def get_object(self): - institution = self.get_institution() - search_object = self.get_default_search() - if search_object: - object = search_object.execute()[0] - object.id = institution._id - return object + _institution = self.get_institution() + _search = self.get_default_search() + if _search: + _response = _search[0].execute() + if _response: + _report = _response[0] + _report.id = _institution._id + return _report + return None def get_default_search(self): - base_search = InstitutionMonthlySummaryReport.search().filter( - 'term', - institution_id=self.get_institution()._id, + _base_search = ( + MonthlyInstitutionSummaryReport.search() + .filter('term', institution_id=self.get_institution()._id) ) - yearmonth = InstitutionMonthlySummaryReport.most_recent_yearmonth(base_search=base_search) - if report_date_str := self.request.query_params.get('report_yearmonth'): + _cycle_coverage = None + if _yearmonth_str := self.request.query_params.get('report_yearmonth'): try: - yearmonth = YearMonth.from_str(report_date_str) + _yearmonth = YearMonth.from_str(_yearmonth_str) except ValueError: - pass - - if yearmonth is None: + raise exceptions.ValidationError( + 'report_yearmonth query param must be in YYYY-MM format', + ) + else: + _cycle_coverage = cycle_coverage_yearmonth(_yearmonth) + else: + _cycle_coverage = MonthlyInstitutionSummaryReport.most_recent_cycle(_base_search) + if _cycle_coverage is None: return None - - return base_search.filter( - 'term', - report_yearmonth=str(yearmonth), - ) + return _base_search.filter('term', cycle_coverage=_cycle_coverage) diff --git a/api/metrics/serializers.py b/api/metrics/serializers.py index 9e3f61f5b50..eba54f2bb5d 100644 --- a/api/metrics/serializers.py +++ b/api/metrics/serializers.py @@ -1,26 +1,12 @@ import logging -import datetime from rest_framework import serializers as ser -from api.base.serializers import BaseAPISerializer from api.base.utils import absolute_reverse -from osf.metrics.counted_usage import CountedAuthUsage, PageviewInfo -from osf.metrics.es8_metrics import ( - OsfCountedUsageEvent, - PageviewInfo as PageviewInfoEs8, -) -from website import settings as website_settings - -logger = logging.getLogger(__name__) +from osf.metrics.events import OsfCountedUsageEvent -class PreprintMetricSerializer(BaseAPISerializer): - - query = ser.DictField() - - class Meta: - type_ = 'preprint_metrics' +logger = logging.getLogger(__name__) class RawMetricsSerializer(): @@ -30,9 +16,9 @@ class RawMetricsSerializer(): def validate_action_label(label): try: - CountedAuthUsage.ActionLabel(label) + OsfCountedUsageEvent.ActionLabel(label) except ValueError: - valid_labels = ', '.join(label.value for label in CountedAuthUsage.ActionLabel) + valid_labels = ', '.join(label.value for label in OsfCountedUsageEvent.ActionLabel) raise ser.ValidationError( f'Invalid value in action_labels! Valid labels: {valid_labels}', ) @@ -67,31 +53,17 @@ def validate(self, data): return data def create(self, validated_data): - pageview_info = None - pageview_info_es8 = None - if pageview_info_data := validated_data.get('pageview_info'): - pageview_info = PageviewInfo(**pageview_info_data) - pageview_info_es8 = PageviewInfoEs8(**pageview_info_data) - OsfCountedUsageEvent.record( + return OsfCountedUsageEvent.record( item_osfid=validated_data['item_guid'], action_labels=validated_data.get('action_labels'), provider_id=validated_data.get('provider_id'), - pageview_info=pageview_info_es8, + pageview_info=validated_data.get('pageview_info'), # used to create a COUNTER session-hour id, not stored: client_session_id=validated_data.get('client_session_id'), user_id=self.context.get('user_id'), request_host=self.context.get('request_host'), request_useragent=self.context.get('request_useragent'), ) - return CountedAuthUsage.record( - platform_iri=website_settings.DOMAIN, - provider_id=validated_data.get('provider_id'), - item_guid=validated_data.get('item_guid'), - session_id=validated_data['session_id'], # must be provided by the view - user_is_authenticated=validated_data['user_is_authenticated'], # must be provided by the view - action_labels=validated_data.get('action_labels'), - pageview_info=pageview_info, - ) class ReportNameSerializer(ser.BaseSerializer): @@ -109,44 +81,19 @@ def to_representation(self, instance): } -class DailyReportSerializer(ser.BaseSerializer): - def to_representation(self, instance): - # TODO: detangle datamodel (osf.metrics.reports) from api serialization - # (don't use `to_dict` here) - report_as_dict = instance.to_dict() - report_name = self.context['report_name'] - report_date = report_as_dict['report_date'] - - if isinstance(report_date, datetime.datetime): - report_date = report_date.date() - if isinstance(report_date, datetime.date): - report_date = str(report_date) - - return { - 'id': instance.meta.id, - 'type': f'daily-report:{report_name}', - 'attributes': { - **report_as_dict, - 'report_date': report_date, - }, - } - - -class MonthlyReportSerializer(ser.BaseSerializer): +class CyclicReportSerializer(ser.BaseSerializer): def to_representation(self, instance): - # TODO: detangle datamodel (osf.metrics.reports) from api serialization - # (don't use `to_dict` here) - report_as_dict = instance.to_dict() - report_name = self.context['report_name'] - report_yearmonth = report_as_dict['report_yearmonth'] - + # TODO: detangle datamodel from api serialization (don't use `to_dict` here) + _report_attrs = instance.to_dict() + for _extra_attr in ('report_date', 'report_yearmonth'): + if (_extra_attr not in _report_attrs) and hasattr(instance, _extra_attr): + _report_attrs[_extra_attr] = getattr(instance, _extra_attr) + del _report_attrs['cycle_coverage'] + _report_name = self.context['report_name'] return { 'id': instance.meta.id, - 'type': f'monthly-report:{report_name}', - 'attributes': { - **report_as_dict, - 'report_month': report_yearmonth, - }, + 'type': f'cyclic-report:{_report_name}', + 'attributes': _report_attrs, } @@ -158,28 +105,28 @@ def to_representation(self, instance): 'path': bucket['key'], 'route': bucket['route-for-path'].buckets[0]['key'], 'title': bucket['title-for-path'].buckets[0]['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['popular-pages'].buckets ] unique_visits = [ { 'date': bucket['key'].date(), - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['unique-visits'].buckets ] time_of_day = [ { 'hour': bucket['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['time-of-day'].buckets ] referer_domain = [ { 'referer_domain': bucket['key'], - 'count': bucket['doc_count'], + 'count': bucket['unique-count'].value, } for bucket in aggs['referer-domain'].buckets ] diff --git a/api/metrics/urls.py b/api/metrics/urls.py index db63df3dd4c..d9bc0a92307 100644 --- a/api/metrics/urls.py +++ b/api/metrics/urls.py @@ -5,11 +5,8 @@ app_name = 'osf' urlpatterns = [ - re_path(r'^raw/(?P[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}), path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}), path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), - re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name), - re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name), re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name), re_path( @@ -17,8 +14,13 @@ views.ReportNameList.as_view(), name=views.ReportNameList.view_name, ), - re_path( - r'^reports/(?P[a-z0-9_]+)/recent/$', + path( + 'reports//', + views.ReportList.as_view(), + name=views.ReportList.view_name, + ), + path( + 'reports//recent/', views.RecentReportList.as_view(), name=views.RecentReportList.view_name, ), diff --git a/api/metrics/utils.py b/api/metrics/utils.py index 54af7531200..3ffc515e9f1 100644 --- a/api/metrics/utils.py +++ b/api/metrics/utils.py @@ -8,7 +8,7 @@ from rest_framework.exceptions import ValidationError from osf.models import AbstractNode, Guid -from osf.metrics.counted_usage import _get_immediate_wrapper +from osf.metrics.utils import get_immediate_wrapper DATETIME_FORMAT = '%Y-%m-%dT%H:%M' @@ -124,7 +124,7 @@ def _user_has_read_on_resolved_node(user, guid_referent): """True if ``user`` has READ on the node this referent belongs to.""" current = guid_referent while current is not None and not isinstance(current, AbstractNode): - current = _get_immediate_wrapper(current) + current = get_immediate_wrapper(current) if current is None or not isinstance(current, AbstractNode): return False return current.contributors_and_group_members.filter(guids___id=user._id).exists() diff --git a/api/metrics/views.py b/api/metrics/views.py index bd53bee296e..829426755ed 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -4,25 +4,21 @@ from enum import Enum from django.http import JsonResponse, HttpResponse, Http404 -from django.utils import timezone - -from elasticsearch6.exceptions import NotFoundError, RequestError -from elasticsearch6_dsl.connections import get_connection +from elasticsearch8.exceptions import ApiError as Es8ApiError from elasticsearch_metrics.registry import djelme_registry from framework.auth.oauth_scopes import CoreScopes -from rest_framework.exceptions import ValidationError from rest_framework import permissions as drf_permissions from rest_framework.response import Response from rest_framework.generics import GenericAPIView from rest_framework.settings import api_settings as drf_api_settings +from api.base.elasticsearch_dsl_views import ElasticsearchListView from api.base.views import JSONAPIBaseView from api.base.permissions import TokenHasScope from api.base.waffle_decorators import require_switch from api.metrics.permissions import ( - IsPreprintMetricsUser, IsRawMetricsUser, IsRegistriesModerationMetricsUser, ) @@ -31,10 +27,8 @@ MetricsReportsTsvRenderer, ) from api.metrics.serializers import ( - PreprintMetricSerializer, RawMetricsSerializer, - DailyReportSerializer, - MonthlyReportSerializer, + CyclicReportSerializer, ReportNameSerializer, NodeAnalyticsSerializer, UserVisitsSerializer, @@ -42,169 +36,51 @@ CountedAuthUsageSerializer, ) from api.metrics.utils import ( - parse_datetimes, parse_date_range, should_skip_counted_usage, ) from api.nodes.permissions import MustBePublic from osf.features import ENABLE_RAW_METRICS -from osf.metrics import ( - utils, - reports, - PreprintDownload, - PreprintView, - RegistriesModerationMetrics, - CountedAuthUsage, +from osf.metrics.events import ( + OsfCountedUsageEvent, + RegistriesModerationEvent, +) +from osf.metrics.daily_reports import ( + BaseDailyReport, + DailyDownloadCountReport, + DailyInstitutionSummaryReport, + DailyNodeSummaryReport, + DailyOsfstorageFileCountReport, + DailyPreprintSummaryReport, + DailyStorageAddonUsageReport, + DailyUserSummaryReport, + DailyNewUserDomainReport, +) +from osf.metrics.monthly_reports import ( + BaseMonthlyReport, + MonthlySpamSummaryReport, ) from osf.metrics.openapi import get_metrics_openapi_json_dict from osf.models import AbstractNode +from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates logger = logging.getLogger(__name__) -class PreprintMetricMixin(JSONAPIBaseView): - permission_classes = ( - drf_permissions.IsAuthenticated, - drf_permissions.IsAdminUser, - IsPreprintMetricsUser, - TokenHasScope, - ) - - required_read_scopes = [CoreScopes.METRICS_BASIC] - required_write_scopes = [CoreScopes.METRICS_RESTRICTED] - - serializer_class = PreprintMetricSerializer - - @property - def metric_type(self): - raise NotImplementedError - - @property - def metric(self): - raise NotImplementedError - - def add_search(self, search, query_params, **kwargs): - """ - get list of guids from the kwargs - use that in a query to narrow down metrics results - """ - preprint_guid_string = query_params.get('guids') - if not preprint_guid_string: - raise ValidationError( - 'To gather metrics for preprints, you must provide one or more preprint ' + - 'guids in the `guids` query parameter.', - ) - preprint_guids = preprint_guid_string.split(',') - - return search.filter('terms', preprint_id=preprint_guids) - - def format_response(self, response, query_params): - data = [] - if getattr(response, 'aggregations') and response.aggregations: - for result in response.aggregations.dates.buckets: - guid_results = {} - for preprint_result in result.preprints.buckets: - guid_results[preprint_result['key']] = preprint_result['total']['value'] - # return 0 for the guids with no results for consistent payloads - guids = query_params['guids'].split(',') - if guid_results.keys() != guids: - for guid in guids: - if not guid_results.get(guid): - guid_results[guid] = 0 - result_dict = {result.key_as_string: guid_results} - data.append(result_dict) - - return { - 'metric_type': self.metric_type, - 'data': data, - } - - def execute_search(self, search, query=None): - try: - # There's a bug in the ES python library the prevents us from updating the search object, so lets just make - # the raw query. If we have it. - if query: - es = get_connection(search._using) - response = search._response_class( - search, - es.search( - index=search._index, - body=query, - ), - ) - else: - response = search.execute() - except NotFoundError: - # _get_relevant_indices returned 1 or more indices - # that doesn't exist. Fall back to unoptimized query - search = search.index().index(self.metric._default_index()) - response = search.execute() - return response - - def get(self, *args, **kwargs): - query_params = getattr(self.request, 'query_params', self.request.GET) - - interval = query_params.get('interval', 'day') - - start_datetime, end_datetime = parse_datetimes(query_params) - - search = self.metric.search(after=start_datetime) - search = search.filter('range', timestamp={'gte': start_datetime, 'lt': end_datetime}) - search.aggs.bucket('dates', 'date_histogram', field='timestamp', interval=interval) \ - .bucket('preprints', 'terms', field='preprint_id') \ - .metric('total', 'sum', field='count') - search = self.add_search(search, query_params, **kwargs) - response = self.execute_search(search) - resp_dict = self.format_response(response, query_params) - - return JsonResponse(resp_dict) - - def post(self, request, *args, **kwargs): - """ - For a bit of future proofing, accept custom elasticsearch aggregation queries in JSON form. - Caution - this could be slow if a very large query is executed, so use with care! - """ - search = self.metric.search() - query = request.data.get('query') - - try: - results = self.execute_search(search, query) - except RequestError as e: - if e.args: - raise ValidationError(e.info['error']['root_cause'][0]['reason']) - raise ValidationError('Malformed elasticsearch query.') - - return JsonResponse(results.to_dict()) - - -class PreprintViewMetrics(PreprintMetricMixin): - - view_category = 'preprint-metrics' - view_name = 'preprint-view-metrics' - - @property - def metric_type(self): - return 'views' - - @property - def metric(self): - return PreprintView - - -class PreprintDownloadMetrics(PreprintMetricMixin): - - view_category = 'preprint-metrics' - view_name = 'preprint-download-metrics' - - @property - def metric_type(self): - return 'downloads' +VIEWABLE_REPORTS = { + 'download_count': DailyDownloadCountReport, + 'institution_summary': DailyInstitutionSummaryReport, + 'node_summary': DailyNodeSummaryReport, + 'osfstorage_file_count': DailyOsfstorageFileCountReport, + 'preprint_summary': DailyPreprintSummaryReport, + 'storage_addon_usage': DailyStorageAddonUsageReport, + 'user_summary': DailyUserSummaryReport, + 'spam_summary': MonthlySpamSummaryReport, + 'new_user_domains': DailyNewUserDomainReport, +} - @property - def metric(self): - return PreprintDownload class RawMetricsView(GenericAPIView): @@ -222,47 +98,59 @@ class RawMetricsView(GenericAPIView): serializer_class = RawMetricsSerializer - @require_switch(ENABLE_RAW_METRICS) - def delete(self, request, *args, **kwargs): - raise ValidationError('DELETE not supported. Use GET/POST/PUT') - @require_switch(ENABLE_RAW_METRICS) def get(self, request, *args, djelme_backend_name, url_path, **kwargs): - _response_body = self._do_es_request( + return self._do_es_request( + request, djelme_backend_name, method='GET', path=url_path, - qp=request.GET, ) - return JsonResponse(_response_body) @require_switch(ENABLE_RAW_METRICS) def post(self, request, *args, djelme_backend_name, url_path, **kwargs): - _response_body = self._do_es_request( + return self._do_es_request( + request, djelme_backend_name, method='POST', path=url_path, - qp=request.GET, - body=json.loads(request.body), ) - return JsonResponse(_response_body) @require_switch(ENABLE_RAW_METRICS) def put(self, request, *args, djelme_backend_name, url_path, **kwargs): - _response_body = self._do_es_request( + return self._do_es_request( + request, djelme_backend_name, method='PUT', path=url_path, - qp=request.GET, - body=json.loads(request.body), ) - return JsonResponse(_response_body) - def _do_es_request(self, djelme_backend_name, method, path, qp, body=None): + def _do_es_request(self, django_request, djelme_backend_name, method, path): _client = self._get_es_client(djelme_backend_name) - _perform_fn = getattr(_client, 'perform_request', None) or _client.transport.perform_request - _response = _perform_fn(method, f'/{path}', params=qp.dict(), body=body) - return _response if isinstance(_response, dict) else _response.body + _body = ( + json.loads(django_request.body) + if django_request.body else None + ) + _content_type = django_request.headers.get('Content-Type') + _headers = ( + {'Content-Type': _content_type, 'Accept': 'application/json'} + if _content_type else None + ) + try: + _response = _client.perform_request( + method, + f'/{path}', + params=django_request.GET.dict(), + body=_body, + headers=_headers, + ) + except Es8ApiError as _api_error: + return HttpResponse( + str(_api_error), + content_type='text/plain; charset=utf-8', + status=_api_error.status_code, + ) + return JsonResponse(_response.body) def _get_es_client(self, djelme_backend_name): try: @@ -287,21 +175,85 @@ class RegistriesModerationMetricsView(GenericAPIView): view_name = 'raw-metrics-view' def get(self, request, *args, **kwargs): - return JsonResponse(RegistriesModerationMetrics.get_registries_info()) - + _search = RegistriesModerationEvent.search().update_from_dict(self._build_es_query()) + _search_response = _search.execute() + _providers_agg_json = ( + _search_response.aggregations['providers'].to_dict() + if _search_response.aggregations + else {} + ) + return JsonResponse(_providers_agg_json) + + def _build_es_query(self): + _submit_trigger = RegistrationModerationTriggers.SUBMIT.db_name + _reject_trigger = RegistrationModerationTriggers.REJECT_SUBMISSION.db_name + _accept_withdrawal_trigger = RegistrationModerationTriggers.ACCEPT_WITHDRAWAL.db_name + _accepted_state = RegistrationModerationStates.ACCEPTED.db_name + _embargo_state = RegistrationModerationStates.EMBARGO.db_name + _rejected_state = RegistrationModerationStates.REJECTED.db_name + _withdrawn_state = RegistrationModerationStates.WITHDRAWN.db_name + return { + 'aggs': { + 'providers': { + 'terms': {'field': 'provider_id'}, + 'aggs': { + 'transitions_without_comments': { + 'missing': {'field': 'comment'}, + }, + 'transitions_with_comments': { + 'filter': {'exists': {'field': 'comment'}}, + }, + 'submissions': { + 'filter': {'term': {'trigger': _submit_trigger}}, + }, + 'accepted_with_embargo': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _embargo_state}}, + {'term': {'trigger': _submit_trigger}}, + ], + }, + }, + }, + 'accepted_without_embargo': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _accepted_state}}, + {'term': {'trigger': _submit_trigger}}, + ], + }, + }, + }, + 'rejected': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _rejected_state}}, + {'term': {'trigger': _reject_trigger}}, + ], + }, + }, + }, + 'withdrawn': { + 'filter': { + 'bool': { + 'must': [ + {'term': {'to_state': _withdrawn_state}}, + {'term': {'trigger': _accept_withdrawal_trigger}}, + ], + }, + }, + }, + }, + }, + }, + } -VIEWABLE_REPORTS = { - 'download_count': reports.DownloadCountReport, - 'institution_summary': reports.InstitutionSummaryReport, - 'node_summary': reports.NodeSummaryReport, - 'osfstorage_file_count': reports.OsfstorageFileCountReport, - 'preprint_summary': reports.PreprintSummaryReport, - 'storage_addon_usage': reports.StorageAddonUsage, - 'user_summary': reports.UserSummaryReport, - 'spam_summary': reports.SpamSummaryReport, - 'new_user_domains': reports.NewUserDomainReport, -} +### +# reports class ReportNameList(JSONAPIBaseView): permission_classes = ( @@ -325,6 +277,51 @@ def get(self, request, *args, **kwargs): return Response({'data': serializer.data}) +class ReportList(ElasticsearchListView): + view_category = 'metrics' + view_name = 'report-list' + + permission_classes = ( + TokenHasScope, + drf_permissions.IsAuthenticatedOrReadOnly, + ) + + required_read_scopes = [CoreScopes.ALWAYS_PUBLIC] + required_write_scopes = [CoreScopes.NULL] + + serializer_class = CyclicReportSerializer + renderer_classes = ( + *drf_api_settings.DEFAULT_RENDERER_CLASSES, + *ElasticsearchListView.FILE_RENDERER_CLASSES, + ) + + default_ordering = '-cycle_coverage' + ordering_fields = frozenset(( + 'cycle_coverage', + )) + + def get_default_search(self): + _report_name = self.kwargs['report_name'] + try: + _report_cls = VIEWABLE_REPORTS[_report_name] + except KeyError: + return Response( + { + 'errors': [{ + 'title': 'unknown report name', + 'detail': f'unknown report: "{_report_name}"', + }], + }, + status=404, + ) + return _report_cls.search() + + def get_serializer_context(self): + return { + **super().get_serializer_context(), + 'report_name': self.kwargs['report_name'], + } + class RecentReportList(JSONAPIBaseView): MAX_COUNT = 10000 DEFAULT_DAYS_BACK = 13 @@ -340,7 +337,7 @@ class RecentReportList(JSONAPIBaseView): view_category = 'metrics' view_name = 'recent-report-list' - serializer_class = DailyReportSerializer + serializer_class = CyclicReportSerializer renderer_classes = ( *drf_api_settings.DEFAULT_RENDERER_CLASSES, MetricsReportsCsvRenderer, @@ -360,23 +357,15 @@ def get(self, request, *args, report_name): }, status=404, ) - is_daily = issubclass(report_class, reports.DailyReport) + is_daily = issubclass(report_class, BaseDailyReport) days_back = request.GET.get('days_back', self.DEFAULT_DAYS_BACK if is_daily else None) - is_monthly = issubclass(report_class, reports.MonthlyReport) - - if is_daily: - serializer_class = DailyReportSerializer - range_field_name = 'report_date' - elif is_monthly: - serializer_class = MonthlyReportSerializer - range_field_name = 'report_yearmonth' - else: - raise ValueError(f'report class must subclass DailyReport or MonthlyReport: {report_class}') + is_monthly = issubclass(report_class, BaseMonthlyReport) + range_filter = parse_date_range(request.GET, is_monthly=is_monthly) search_recent = ( report_class.search() - .filter('range', **{range_field_name: range_filter}) - .sort(range_field_name) + .filter('range', cycle_coverage=range_filter) + .sort('-cycle_coverage') [:self.MAX_COUNT] ) if days_back: @@ -384,7 +373,7 @@ def get(self, request, *args, report_name): report_date_range = parse_date_range(request.GET) search_response = search_recent.execute() - serializer = serializer_class( + serializer = self.serializer_class( search_response, many=True, context={'report_name': report_name}, @@ -428,46 +417,9 @@ def post(self, request, *args, **kwargs): pageview_info=serializer.validated_data.get('pageview_info'), ): return HttpResponse(status=204) - session_id, user_is_authenticated = self._get_session_id( - request, - client_session_id=serializer.validated_data.get('client_session_id'), - ) - serializer.save(session_id=session_id, user_is_authenticated=user_is_authenticated) + serializer.save() return HttpResponse(status=201) - def _get_session_id(self, request, client_session_id=None): - # NOTE: to remove after osfmetrics 6to8 migration -- logic moved to djelme - - # get a session id as described in the COUNTER code of practice: - # https://cop5.projectcounter.org/en/5.0.2/07-processing/03-counting-unique-items.html - # -- different from the "login session" tracked by `osf.models.Session` (which - # lasts about a month), this session lasts at most a day and may time out after - # minutes or hours of inactivity - now = timezone.now() - current_date_str = now.date().isoformat() - - user_is_authenticated = request.user.is_authenticated - if client_session_id: - session_id_parts = [ - client_session_id, - current_date_str, - ] - elif user_is_authenticated: - session_id_parts = [ - request.user._id, - current_date_str, - now.hour, - ] - else: - session_id_parts = [ - request.get_host(), - request.META.get('HTTP_USER_AGENT', ''), - current_date_str, - now.hour, - ] - user_is_authenticated = False - return utils.stable_key(*session_id_parts), user_is_authenticated - class NodeAnalyticsQuery(JSONAPIBaseView): permission_classes = ( @@ -495,7 +447,7 @@ def get(self, request, *args, node_guid, timespan): except AbstractNode.DoesNotExist: raise Http404 self.check_object_permissions(request, node) - analytics_result = self._run_query(node_guid, timespan) + analytics_result = self._run_node_analytics_query(node.get_semantic_iri(), timespan) serializer = self.serializer_class( analytics_result, context={ @@ -505,22 +457,18 @@ def get(self, request, *args, node_guid, timespan): ) return Response({'data': serializer.data}) - def _run_query(self, node_guid, timespan): - query_dict = self._build_query_payload(node_guid, NodeAnalyticsQuery.Timespan(timespan)) - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) + def _run_node_analytics_query(self, item_iri, timespan): + query_dict = self._build_query_payload(item_iri, NodeAnalyticsQuery.Timespan(timespan)) + analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict) return analytics_search.execute() - def _build_query_payload(self, node_guid, timespan): + def _build_query_payload(self, item_iri, timespan): return { 'size': 0, # don't return hits, just the aggregations 'query': { 'bool': { - 'minimum_should_match': 1, - 'should': [ - {'term': {'item_guid': node_guid}}, - {'term': {'surrounding_guids': node_guid}}, - ], 'filter': [ + {'term': {'within_iris': item_iri}}, {'term': {'item_public': True}}, {'term': {'action_labels': 'view'}}, {'term': {'action_labels': 'web'}}, @@ -532,7 +480,12 @@ def _build_query_payload(self, node_guid, timespan): 'unique-visits': { 'date_histogram': { 'field': 'timestamp', - 'interval': 'day', + 'calendar_interval': 'day', + }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, }, }, 'time-of-day': { @@ -540,12 +493,22 @@ def _build_query_payload(self, node_guid, timespan): 'field': 'pageview_info.hour_of_day', 'size': 24, }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, + }, }, 'referer-domain': { 'terms': { 'field': 'pageview_info.referer_domain', 'size': 10, }, + 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, + }, }, 'popular-pages': { 'terms': { @@ -553,6 +516,9 @@ def _build_query_payload(self, node_guid, timespan): 'size': 10, }, 'aggs': { + 'unique-count': { + 'cardinality': {'field': 'sessionhour_id'}, + }, 'route-for-path': { 'terms': { 'field': 'pageview_info.route_name', @@ -627,7 +593,7 @@ def get(self, request, *args): pass # just fall back to days_back for now timespan = report_date - analytics_result = self._run_query(timespan) + analytics_result = self._run_user_visits_query(timespan) serializer = self.serializer_class( analytics_result, context={ @@ -636,9 +602,9 @@ def get(self, request, *args): ) return JsonResponse({'data': serializer.data}) - def _run_query(self, timespan): + def _run_user_visits_query(self, timespan): query_dict = self._build_query_payload(timespan) - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) + analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict) return analytics_search.execute() def _build_query_payload(self, timespan): @@ -655,13 +621,11 @@ def _build_query_payload(self, timespan): 'unique-visits': { 'date_histogram': { 'field': 'timestamp', - 'interval': 'day', + 'calendar_interval': 'day', }, 'aggs': { 'user-visits': { - 'cardinality': { - 'field': 'session_id', - }, + 'cardinality': {'field': 'sessionhour_id'}, }, }, }, diff --git a/api/preprints/views.py b/api/preprints/views.py index 7e087aaa858..3d02b8f704a 100644 --- a/api/preprints/views.py +++ b/api/preprints/views.py @@ -71,8 +71,7 @@ from api.requests.serializers import PreprintRequestSerializer, PreprintRequestCreateSerializer from api.requests.views import PreprintRequestMixin from api.subjects.views import BaseResourceSubjectsList, SubjectRelationshipBaseView -from api.base.metrics import PreprintMetricsViewMixin -from osf.metrics import PreprintDownload, PreprintView +from api.base.metrics import UsageMetricsViewMixin class PreprintOldVersionsImmutableMixin: @@ -172,7 +171,7 @@ def get_preprint(self, check_object_permissions=True, ignore_404=False): return preprint -class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): +class PreprintList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_list). """ # These permissions are not checked for the list of preprints, permissions handled by the query @@ -194,10 +193,6 @@ class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreat ordering_fields = ('created', 'date_last_transitioned') view_category = 'preprints' view_name = 'preprint-list' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } def get_serializer_class(self): if self.request.method == 'POST': @@ -208,38 +203,15 @@ def get_serializer_class(self): def get_default_queryset(self): auth = get_user_auth(self.request) auth_user = getattr(auth, 'user', None) - # Permissions on the list objects are handled by the query - public_only = self.metrics_requested - queryset = self.preprints_queryset(Preprint.objects.all(), auth_user, public_only=public_only) - # Use get_metrics_queryset to return a queryset with annotated metrics - # iff ?metrics query param is present - if self.metrics_requested: - return self.get_metrics_queryset(queryset) - else: - return queryset + return self.preprints_queryset(Preprint.objects.all(), auth_user) # overrides ListAPIView def get_queryset(self): return self.get_queryset_from_request() - # overrides PreprintMetricsViewMixin - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - return metric_class.get_top_by_count( - qs=queryset, - model_field='guids___id', - metric_field='preprint_id', - annotation=metric_name, - after=after, - # Limit the bucket size - # of the ES aggregation. Otherwise, - # the number of buckets == the number of total preprints, - # which is too many for ES to handle - size=200, - ) - -class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): +class PreprintVersionsList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin): """List existing versions of a preprint or create a new version. GET: Returns a collection of preprint resources representing all versions of the given preprint. @@ -265,10 +237,6 @@ class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.L ordering_fields = ('created', 'date_last_transitioned') view_category = 'preprints' view_name = 'preprint-versions' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } def get_serializer_class(self): if self.request.method == 'POST': @@ -288,8 +256,7 @@ def get_queryset(self): auth_user = getattr(auth, 'user', None) # Permissions on the list objects are handled by the query - public_only = self.metrics_requested - qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user, public_only=public_only)) + qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user)) return qs @@ -299,7 +266,7 @@ def create(self, request, *args, **kwargs): return super().create(request, *args, **kwargs) -class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin): +class PreprintDetail(PreprintOldVersionsImmutableMixin, UsageMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_read). Note: The resource now exposes a `versions` relationship pointing to @@ -324,15 +291,6 @@ class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin view_category = 'preprints' view_name = 'preprint-detail' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } - - def add_metric_to_object(self, obj, metric_class, metric_name, after): - count = metric_class.get_count_for_preprint(obj, after=after) - setattr(obj, metric_name, count) - return obj def get_object(self): preprint = self.get_preprint() @@ -355,6 +313,7 @@ def delete(self, request, *args, **kwargs): raise ValidationError('You cannot delete created preprint') + class PreprintNodeRelationship(PreprintOldVersionsImmutableMixin, JSONAPIBaseView, generics.RetrieveUpdateAPIView, PreprintMixin): permission_classes = ( drf_permissions.IsAuthenticatedOrReadOnly, diff --git a/api/providers/views.py b/api/providers/views.py index fbfa287d4a7..4a35706bb4d 100644 --- a/api/providers/views.py +++ b/api/providers/views.py @@ -16,7 +16,6 @@ InvalidFilterValue, ) from api.base.filters import ListFilterMixin, PreprintAsTargetFilterMixin, PreprintFilterMixin -from api.base.metrics import PreprintMetricsViewMixin from api.base.pagination import MaxSizePagination, IncreasedPageSizePagination from api.base.settings import BULK_SETTINGS from api.base.utils import get_object_or_error, get_user_auth, is_truthy @@ -61,7 +60,6 @@ from framework.auth.oauth_scopes import CoreScopes from framework.celery_tasks.handlers import enqueue_task from guardian.shortcuts import get_objects_for_user -from osf.metrics import PreprintDownload, PreprintView from osf.models import ( AbstractNode, CollectionProvider, @@ -148,7 +146,7 @@ class RegistrationProviderList(GenericProviderList): view_name = 'registration-providers-list' -class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList): +class PreprintProviderList(GenericProviderList): """See [documentation for this endpoint](https://developer.osf.io/#operation/preprint_provider_list). """ @@ -156,21 +154,6 @@ class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList): serializer_class = PreprintProviderSerializer view_category = 'preprint-providers' view_name = 'preprint-providers-list' - metric_map = { - 'downloads': PreprintDownload, - 'views': PreprintView, - } - - # overrides PreprintMetricsViewMixin - def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after): - return metric_class.get_top_by_count( - qs=queryset, - model_field='_id', - metric_field='provider_id', - annotation=metric_name, - after=after, - size=None, - ) def get_renderer_context(self): context = super().get_renderer_context() diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index 8b785504756..c5b53395d20 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -1,16 +1,17 @@ -import pytest import datetime +import pytest + from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE from osf_tests.factories import ( InstitutionFactory, AuthUserFactory, ) -from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport from osf.metrics.utils import YearMonth -@pytest.mark.es_metrics +@pytest.mark.djelme_elasticsearch_backends @pytest.mark.django_db class TestInstitutionDepartmentList: @@ -37,55 +38,55 @@ def user4(self): @pytest.fixture() def populate_counts(self, user, user2, user3, user4, admin, institution): # This represents a Department that had a user, but no longer has any users, so does not appear in results. - InstitutionalUserReport( + MonthlyInstitutionalUserReport( report_yearmonth=YearMonth(2017, 2), user_id=user._id, institution_id=institution._id, department_name='Old Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) _this_month = YearMonth.from_date(datetime.date.today()) # The user has left the department - InstitutionalUserReport( + MonthlyInstitutionalUserReport( report_yearmonth=_this_month, user_id=user._id, institution_id=institution._id, department_name='New Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A second user entered the department - InstitutionalUserReport( + MonthlyInstitutionalUserReport( report_yearmonth=_this_month, user_id=user2._id, institution_id=institution._id, department_name='New Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A new department with a single user to test sorting - InstitutionalUserReport( + MonthlyInstitutionalUserReport( report_yearmonth=_this_month, user_id=user3._id, institution_id=institution._id, department_name='Smaller Department', public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) # A user with no department - InstitutionalUserReport( + MonthlyInstitutionalUserReport( report_yearmonth=_this_month, user_id=user4._id, institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save() + ).save(validate=False) @pytest.fixture() def admin(self, institution): @@ -113,7 +114,7 @@ def test_auth(self, app, url, user, admin): assert resp.json['data'] == [] def test_get(self, app, url, admin, institution, populate_counts): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() resp = app.get(url, auth=admin.auth) assert resp.json['data'] == [{ diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index 6dd6c5bbda3..178bed6ce6b 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -5,12 +5,13 @@ InstitutionFactory, AuthUserFactory, ) -from osf.metrics.reports import InstitutionMonthlySummaryReport +from osf.metrics.monthly_reports import MonthlyInstitutionSummaryReport -@pytest.mark.es_metrics +@pytest.mark.djelme_elasticsearch_backends @pytest.mark.django_db class TestInstitutionSummaryMetricsList: + @pytest.fixture() def institution(self): return InstitutionFactory() @@ -30,10 +31,10 @@ def unshown_reports(self, institution): # Reports that should not be shown in the results # Report from another institution another_institution = InstitutionFactory() - _summary_report_factory('2024-08', another_institution) + _summary_report_factory('2024-08', another_institution, validate=False) # Old report from the same institution - _summary_report_factory('2024-07', institution) - _summary_report_factory('2018-02', institution) + _summary_report_factory('2024-07', institution, validate=False) + _summary_report_factory('2018-02', institution, validate=False) @pytest.fixture() def reports(self, institution): @@ -84,7 +85,7 @@ def test_get_empty(self, app, url, institutional_admin): assert resp.json['meta'] == {'version': '2.0'} def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReport.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -150,7 +151,7 @@ def test_get_report_with_multiple_months_and_institutions( monthly_logged_in_user_count=270, monthly_active_user_count=260, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReport.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -179,19 +180,21 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ '2024-08', institution, user_count=0, + validate=False, ) _summary_report_factory( '2024-09', institution, user_count=999, - + validate=False, ) _summary_report_factory( '2018-02', institution, user_count=4133, + validate=False, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReport.refresh() resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -205,39 +208,25 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ attributes = resp.json['data']['attributes'] assert attributes['user_count'] == 4133 - def test_get_with_invalid_report_date(self, app, url, institution, institutional_admin): - _summary_report_factory( - '2024-08', - institution, - user_count=0, - ) - _summary_report_factory( - '2024-09', - institution, - user_count=999, - ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) - - # Request with an invalid report_date format - resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) - assert resp.status_code == 200 - - # Verify it defaults to the most recent report data - attributes = resp.json['data']['attributes'] - assert attributes['user_count'] == 999 + def test_get_with_invalid_report_yearmonth(self, app, url, institution, institutional_admin): + # Request with an invalid report_yearmonth format + resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth, expect_errors=True) + assert resp.status_code == 400 def test_get_without_report_date_uses_most_recent(self, app, url, institution, institutional_admin): _summary_report_factory( '2024-08', institution, user_count=0, + validate=False, ) _summary_report_factory( '2024-09', institution, user_count=999, + validate=False, ) - InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) + MonthlyInstitutionSummaryReport.refresh() resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -246,11 +235,11 @@ def test_get_without_report_date_uses_most_recent(self, app, url, institution, i assert attributes['user_count'] == 999 -def _summary_report_factory(yearmonth, institution, **kwargs): - report = InstitutionMonthlySummaryReport( +def _summary_report_factory(yearmonth, institution, *, validate=True, **kwargs): + report = MonthlyInstitutionSummaryReport( report_yearmonth=yearmonth, institution_id=institution._id, **kwargs, ) - report.save() + report.save(validate=validate) return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index d2b99da435f..dc097c221c4 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -12,12 +12,12 @@ AuthUserFactory, ) -from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport from osf.models import UserMessage from tests.utils import capture_notifications -@pytest.mark.es_metrics +@pytest.mark.djelme_elasticsearch_backends @pytest.mark.django_db class TestInstitutionUserMetricList: @pytest.fixture() @@ -89,7 +89,7 @@ def test_get_empty(self, app, url, institutional_admin): assert _resp.json['data'] == [] def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() _resp = app.get(url, auth=institutional_admin.auth) assert _resp.status_code == 200 assert len(_resp.json['data']) == len(reports) @@ -101,7 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports, assert len(response_object['attributes']['contacts']) == 0 def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() for _query, _expected_user_ids in ( ({'filter[department]': 'nunavum'}, set()), ({'filter[department]': 'incidentally'}, set()), @@ -137,7 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report assert set(_user_ids(_resp)) == _expected_user_ids def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), @@ -147,7 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports, assert list(_user_ids(_resp)) == _expected_user_id_list def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), @@ -182,7 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu month_last_active='2018-02', month_last_login='2018-02', ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -286,7 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu str(736662999298 + i), f'Jalen Hurts #{i}', ]) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() # Make request for CSV format with page[size]=10 resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) @@ -352,7 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti month_last_active='2018-02', month_last_login='2018-02', ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -418,7 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin, department_name='a department, or so, that happens, incidentally, to have commas', storage_byte_count=736662999298, ) - InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) + MonthlyInstitutionalUserReport.refresh() receiver = user1 with capture_notifications(): @@ -480,10 +480,10 @@ def _user_ids(api_response): yield _datum['relationships']['user']['data']['id'] def _report_factory(yearmonth, institution, **kwargs): - _report = InstitutionalUserReport( + _report = MonthlyInstitutionalUserReport( report_yearmonth=yearmonth, institution_id=institution._id, **kwargs, ) - _report.save() + _report.save(validate=False) return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py deleted file mode 100644 index 016677c3a11..00000000000 --- a/api_tests/metrics/test_composite_query.py +++ /dev/null @@ -1,86 +0,0 @@ -import pytest -from datetime import datetime -from osf_tests.factories import ( - PreprintFactory, - AuthUserFactory -) - -from osf.metrics import PreprintDownload -from api.base.settings import API_PRIVATE_BASE as API_BASE - - -@pytest.fixture() -def preprint(): - return PreprintFactory() - - -@pytest.fixture() -def user(): - user = AuthUserFactory() - user.is_staff = True - user.add_system_tag('preprint_metrics') - user.save() - return user - - -@pytest.fixture -def base_url(): - return f'/{API_BASE}metrics/preprints/' - - -@pytest.mark.es_metrics -@pytest.mark.django_db -class TestElasticSearch: - - def test_elasticsearch_agg_query(self, app, user, base_url, preprint): - post_url = f'{base_url}downloads/' - - payload = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': { - 'aggs': { - 'preprints_by_year': { - 'composite': { - 'sources': [{ - 'date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'year' - } - } - }] - } - } - } - } - } - } - } - - resp = app.post_json_api(post_url, payload, auth=user.auth) - - assert resp.status_code == 200 - assert resp.json['hits']['hits'] == [] - - PreprintDownload.record_for_preprint( - preprint, - path=preprint.primary_file.path, - timestamp=datetime(year=2020, month=1, day=1), - ) - PreprintDownload.record_for_preprint( - preprint, - path=preprint.primary_file.path, - timestamp=datetime(year=2020, month=2, day=1) - ) - PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern) - - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert resp.status_code == 200 - assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 1 - - payload['data']['attributes']['query']['aggs']['preprints_by_year']['composite']['sources'][0]['date']['date_histogram']['interval'] = 'month' - - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 2 diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index e954248c15b..c319cc1690b 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -1,10 +1,13 @@ from datetime import datetime, timezone +from unittest import mock import pytest -from unittest import mock +from elasticsearch_metrics.util.anon_enough import opaque_key from framework.auth.core import Auth +from osf.utils.permissions import ADMIN, READ, WRITE +from api_tests.utils import create_test_file from osf_tests.factories import ( AuthUserFactory, NodeFactory, @@ -12,11 +15,7 @@ PrivateLinkFactory, ProjectFactory, RegistrationFactory, - # UserFactory, ) -from osf.utils.permissions import ADMIN, READ, WRITE -from api_tests.utils import create_test_file -from elasticsearch_metrics.tests.util import djelme_test_backends COUNTED_USAGE_URL = '/_/metrics/events/counted_usage/' @@ -30,23 +29,24 @@ def counted_usage_payload(**attributes): } -def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs): - assert mock_save.call_count == 1 - args, kwargs = mock_save.call_args - actual_instance = args[0] +def assert_saved_with(mock_es8, *, expected_doc_id=None, expected_attrs): + assert mock_es8.index.call_count == 1 + _args, _kwargs = mock_es8.index.call_args if expected_doc_id is not None: - assert actual_instance.meta.id == expected_doc_id - actual_attrs = actual_instance.to_dict() - for attr_name, expected_value in expected_attrs.items(): - actual_value = actual_attrs.get(attr_name, None) - assert actual_value == expected_value, repr(actual_value) + assert _kwargs['id'] == expected_doc_id + _actual_attrs = _kwargs['body'] + for _attr_name, _expected_value in expected_attrs.items(): + _actual_value = _actual_attrs.get(_attr_name, None) + assert (_actual_value == _expected_value), repr(_actual_value) @pytest.fixture -def mock_save(): - with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save +def mock_es8(): + with mock.patch('elasticsearch_metrics.imps.elastic8.TimeseriesRecord.check_djelme_setup'): + with mock.patch('elasticsearch_metrics.imps.elastic8.BaseDjelmeRecord._get_connection') as _mock_get_connection: + _mock_es8 = _mock_get_connection.return_value + _mock_es8.index.return_value = {'result': {}} + yield _mock_es8 @pytest.mark.django_db @@ -76,21 +76,19 @@ def test_required_attributes(self, app, attrs): @pytest.mark.django_db class TestComputedFields: - @pytest.fixture(autouse=True) - def _real_elastic(self): - with djelme_test_backends(): - yield - @pytest.fixture(autouse=True) def mock_domain(self): domain = 'http://example.foo/' - with mock.patch('api.metrics.serializers.website_settings.DOMAIN', new=domain): + with mock.patch('website.settings.DOMAIN', new=domain): yield domain @pytest.fixture(autouse=True) def mock_now(self): timestamp = datetime(1981, 1, 1, 0, 1, 31, tzinfo=timezone.utc) - with mock.patch('django.utils.timezone.now', return_value=timestamp): + with ( + mock.patch('django.utils.timezone.now', return_value=timestamp), + mock.patch('elasticsearch_metrics.imps.elastic8.utcnow', return_value=timestamp), + ): yield timestamp @pytest.fixture @@ -105,7 +103,7 @@ def user(self): with mock.patch('osf.models.base.generate_guid', return_value='guidy'): return AuthUserFactory() - def test_by_client_session_id(self, app, mock_save, user, preprint): + def test_by_client_session_id(self, app, mock_es8, user, preprint): payload = counted_usage_payload( client_session_id='hello', item_guid=preprint._id, @@ -115,18 +113,25 @@ def test_by_client_session_id(self, app, mock_save, user, preprint): headers = { 'User-Agent': 'haha', } - resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['hello', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest() - expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['api', 'view']", + 'http://example.foo/blahblah/blee', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'hello|1981-01-01').hexdigest() - 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', - 'action_labels': ['view', 'api'], + 'item_osfid': preprint._id, + 'item_type': 'Preprint', + 'sessionhour_id': _expected_sessionhour_id, + 'action_labels': ['api', 'view'], 'pageview_info': { 'page_url': 'http://example.foo/blahblah/blee', 'page_path': '/blahblah/blee', @@ -135,9 +140,9 @@ def test_by_client_session_id(self, app, mock_save, user, preprint): }, ) - def test_by_client_session_id_anon(self, app, mock_save, preprint): + def test_by_client_session_id_anon(self, app, mock_es8, preprint): payload = counted_usage_payload( - client_session_id='hello', + client_session_id='hihi', item_guid=preprint._id, action_labels=['view', 'web'], pageview_info={ @@ -150,15 +155,22 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['hihi', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest() - expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['view', 'web']", + 'http://example.foo/bliz/', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'hello|1981-01-01').hexdigest() - 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', + 'item_osfid': preprint._id, + 'item_type': 'Preprint', + 'sessionhour_id': _expected_sessionhour_id, 'action_labels': ['view', 'web'], 'pageview_info': { 'page_url': 'http://example.foo/bliz/', @@ -170,7 +182,7 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint): }, ) - def test_by_user_auth(self, app, mock_save, user, preprint): + def test_by_user_auth(self, app, mock_es8, user, preprint): payload = counted_usage_payload( item_guid=preprint._id, action_labels=['view', 'web'], @@ -184,15 +196,22 @@ def test_by_user_auth(self, app, mock_save, user, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['guidy', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest() - expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['view', 'web']", + 'http://osf.io/mst3k', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'guidy|1981-01-01|0').hexdigest() - 'session_id': 'ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a', + 'item_osfid': preprint._id, + 'item_type': 'Preprint', + 'sessionhour_id': _expected_sessionhour_id, 'action_labels': ['view', 'web'], 'pageview_info': { 'page_url': 'http://osf.io/mst3k', @@ -204,7 +223,7 @@ def test_by_user_auth(self, app, mock_save, user, preprint): }, ) - def test_by_useragent_header(self, app, mock_save, preprint): + def test_by_useragent_header(self, app, mock_es8, preprint): payload = counted_usage_payload( item_guid=preprint._id, action_labels=['view', 'api'], @@ -218,16 +237,23 @@ def test_by_useragent_header(self, app, mock_save, preprint): } resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers) assert resp.status_code == 201 + _expected_sessionhour_id = opaque_key(['localhost:80', 'haha', '1981-01-01', '0']) assert_saved_with( - mock_save, - # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest() - expected_doc_id='6d7549df6734bb955eb832c6316ffae46c2959c95b5817ab4fcb341dbc875c23', + mock_es8, + expected_doc_id=opaque_key([ + 'http://example.foo/', + _expected_sessionhour_id, + "['api', 'view']", + 'http://example.foo/bliz/', + '1981-01-01', + '3', + ]), expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': preprint._id, - # session_id: sha256(b'localhost:80|haha|1981-01-01|0').hexdigest() - 'session_id': '97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a', - 'action_labels': ['view', 'api'], + 'item_osfid': preprint._id, + 'item_type': 'Preprint', + 'sessionhour_id': opaque_key(['localhost:80', 'haha', '1981-01-01', '0']), + 'action_labels': ['api', 'view'], 'pageview_info': { 'page_url': 'http://example.foo/bliz/', 'page_path': '/bliz', @@ -244,9 +270,10 @@ def test_by_useragent_header(self, app, mock_save, preprint): class TestGuidFields: @pytest.fixture(autouse=True) - def _real_elastic(self): - with djelme_test_backends(): - yield + def mock_domain(self): + domain = 'http://example.foo/' + with mock.patch('website.settings.DOMAIN', new=domain): + yield domain @pytest.fixture def preprint(self, item_public): @@ -286,7 +313,7 @@ def child_reg_file(self, child_reg): def child_reg_file_guid(self, child_reg_file): return child_reg_file.get_guid(create=True)._id - def test_preprint_file(self, app, mock_save, preprint, item_public): + def test_preprint_file(self, app, mock_es8, preprint, item_public, mock_domain): # test_preprint_guid payload = counted_usage_payload( item_guid=preprint._id, @@ -295,16 +322,18 @@ def test_preprint_file(self, app, mock_save, preprint, item_public): resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ - 'item_guid': preprint._id, - 'item_type': 'preprint', + 'item_osfid': preprint._id, + 'item_iri': f'{mock_domain}{preprint._id}', + 'item_type': 'Preprint', 'item_public': item_public, 'provider_id': preprint.provider._id, - 'surrounding_guids': None, + 'database_iri': f'{mock_domain}preprints/{preprint.provider._id}', + 'within_iris': [f'{mock_domain}{preprint._id}'], }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_preprint_file_guid payload = counted_usage_payload( @@ -314,17 +343,22 @@ def test_preprint_file(self, app, mock_save, preprint, item_public): resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ - 'item_guid': preprint.primary_file.get_guid()._id, - 'item_type': 'osfstoragefile', + 'item_osfid': preprint.primary_file.get_guid()._id, + 'item_iri': preprint.primary_file.get_semantic_iri(), + 'item_type': 'File', 'item_public': item_public, 'provider_id': preprint.primary_file.provider, - 'surrounding_guids': [preprint._id], + 'database_iri': f'urn:files.osf.io:{preprint.primary_file.provider}', + 'within_iris': sorted([ + f'{mock_domain}{preprint._id}', + preprint.primary_file.get_semantic_iri(), + ]), }, ) - def test_child_registration_file(self, app, mock_save, child_reg_file_guid, child_reg, parent_reg, item_public): + def test_child_registration_file(self, app, mock_es8, child_reg_file_guid, child_reg_file, child_reg, parent_reg, item_public, mock_domain): # test_child_registration_file_guid payload = counted_usage_payload( item_guid=child_reg_file_guid, @@ -333,20 +367,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': child_reg_file_guid, - 'item_type': 'osfstoragefile', + 'item_osfid': child_reg_file_guid, + 'item_type': 'File', 'item_public': item_public, 'provider_id': 'osfstorage', - 'surrounding_guids': [ - child_reg._id, - parent_reg._id, - ], + 'database_iri': 'urn:files.osf.io:osfstorage', + 'within_iris': sorted([ + child_reg_file.get_semantic_iri(), + child_reg.get_semantic_iri(), + parent_reg.get_semantic_iri(), + ]), }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_child_registration_guid payload = counted_usage_payload( @@ -356,19 +392,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': child_reg._id, - 'item_type': 'registration', + 'item_osfid': child_reg._id, + 'item_type': 'RegistrationComponent', 'item_public': item_public, 'provider_id': 'osf', - 'surrounding_guids': [ - parent_reg._id, - ], + 'database_iri': f'{mock_domain}registries/osf', + 'item_iri': child_reg.get_semantic_iri(), + 'within_iris': sorted([ + child_reg.get_semantic_iri(), + parent_reg.get_semantic_iri(), + ]), }, ) - mock_save.reset_mock() + mock_es8.reset_mock() # test_parent_registration_guid payload = counted_usage_payload( @@ -378,13 +417,15 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( - mock_save, + mock_es8, expected_attrs={ 'action_labels': ['view', 'web'], - 'item_guid': parent_reg._id, + 'item_osfid': parent_reg._id, 'item_public': item_public, 'provider_id': 'osf', - 'surrounding_guids': None, + 'database_iri': f'{mock_domain}registries/osf', + 'item_iri': parent_reg.get_semantic_iri(), + 'within_iris': [parent_reg.get_semantic_iri()], }, ) @@ -392,7 +433,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil @pytest.mark.django_db class TestContributorExclusion: - def test_creator_pageview_not_recorded(self, app, mock_save): + def test_creator_pageview_not_recorded(self, app, mock_es8): user = AuthUserFactory() project = ProjectFactory(creator=user) payload = counted_usage_payload( @@ -402,14 +443,14 @@ def test_creator_pageview_not_recorded(self, app, mock_save): ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=user.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 @pytest.mark.parametrize( 'permissions', [READ, WRITE, ADMIN], ids=['read', 'write', 'admin'], ) - def test_contributor_pageview_not_recorded(self, app, mock_save, permissions): + def test_contributor_pageview_not_recorded(self, app, mock_es8, permissions): creator = AuthUserFactory() contributor = AuthUserFactory() project = ProjectFactory(creator=creator) @@ -421,9 +462,9 @@ def test_contributor_pageview_not_recorded(self, app, mock_save, permissions): ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 - def test_non_contributor_pageview_recorded(self, app, mock_save): + def test_non_contributor_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() visitor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=True) @@ -434,9 +475,9 @@ def test_non_contributor_pageview_recorded(self, app, mock_save): ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_save): + def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() child_owner = AuthUserFactory() parent_reader = AuthUserFactory() @@ -451,9 +492,9 @@ def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=parent_reader.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save): + def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) link = PrivateLinkFactory(anonymous=True, creator=creator) @@ -468,9 +509,9 @@ def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save ) resp = app.post_json_api(COUNTED_USAGE_URL, payload) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 - def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_save): + def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_es8): creator = AuthUserFactory() visitor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) @@ -485,14 +526,14 @@ def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, m ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth) assert resp.status_code == 201 - assert mock_save.call_count == 1 + assert mock_es8.index.call_count == 1 @pytest.mark.parametrize( 'permissions', [READ, WRITE, ADMIN], ids=['read', 'write', 'admin'], ) - def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_save, permissions): + def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_es8, permissions): creator = AuthUserFactory() contributor = AuthUserFactory() project = ProjectFactory(creator=creator, is_public=False) @@ -508,4 +549,4 @@ def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, m ) resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth) assert resp.status_code == 204 - assert mock_save.call_count == 0 + assert mock_es8.index.call_count == 0 diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py deleted file mode 100644 index cd9b8041c2d..00000000000 --- a/api_tests/metrics/test_preprint_metrics.py +++ /dev/null @@ -1,240 +0,0 @@ -import pytest -from unittest import mock -from datetime import datetime - -from website.app import setup_django - -setup_django() - -from django.utils import timezone -from waffle.testutils import override_switch -from elasticsearch6.exceptions import RequestError - -from osf import features -from api.base.settings import API_PRIVATE_BASE as API_BASE -from osf.metrics import PreprintDownload, PreprintView -from osf_tests.factories import AuthUserFactory, PreprintFactory, NodeFactory - -pytestmark = pytest.mark.django_db - - -@pytest.mark.django_db -class TestPreprintMetrics: - - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - @pytest.fixture - def user(self): - user = AuthUserFactory() - user.is_staff = True - user.add_system_tag('preprint_metrics') - user.save() - return user - - @pytest.fixture - def other_user(self): - return AuthUserFactory() - - @pytest.fixture - def other_admin_user(self): - user = AuthUserFactory() - user.is_staff = True - user.save() - return user - - @pytest.fixture - def other_non_admin_user(self): - user = AuthUserFactory() - user.add_system_tag('preprint_metrics') - user.save() - return user - - @pytest.fixture - def preprint(self, user): - preprint = PreprintFactory(creator=user) - return preprint - - @pytest.fixture - def preprint_two(self): - return PreprintFactory() - - @pytest.fixture - def preprint_three(self): - return PreprintFactory() - - @pytest.fixture - def preprint_no_results(self): - return PreprintFactory() - - @pytest.fixture - def project(self): - return NodeFactory() - - @pytest.fixture - def project_two(self): - return NodeFactory() - - @pytest.fixture - def metric_dates(self): - return ['2019-01-01', '2019-01-02', '2019-01-03'] - - def add_views_and_downloads(self, preprint_to_add, user_to_use, dates_to_use): - # create 3 timestamps for 3 days, 1 hour apart - times = ['T00:05', 'T01:05', 'T02:05'] - - metrics = [PreprintView, PreprintDownload] - for metric in metrics: - for date in dates_to_use: - for time in times: - metric.record_for_preprint( - preprint=preprint_to_add, - user=user_to_use, - path=preprint_to_add.primary_file.path, - timestamp=datetime.strptime(date + time, '%Y-%m-%dT%H:%M') - ) - - @pytest.fixture - def base_url(self): - return f'/{API_BASE}metrics/preprints/' - - @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search') - def test_custom_metric_malformed_query(self, mock_execute, app, user, base_url): - mock_execute.side_effect = RequestError() - post_url = f'{base_url}downloads/' - post_data = { - 'data': { - 'type': 'preprint_metric', - 'attributes': { - 'query': {'not_a_field': 'Yay!'} - } - } - } - res = app.post_json_api(post_url, post_data, auth=user.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Malformed elasticsearch query.' - - @pytest.mark.es_metrics - def test_agg_query(self, app, user, base_url): - - post_url = f'{base_url}downloads/' - - payload = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': { - 'aggs': { - 'preprints_by_year': { - 'composite': { - 'sources': [{ - 'date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'year' - } - } - }] - } - } - } - } - } - } - } - resp = app.post_json_api(post_url, payload, auth=user.auth) - assert resp.status_code == 200 - - @mock.patch('api.metrics.views.PreprintDownloadMetrics.format_response') - @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search') - def test_post_custom_metric(self, mock_execute, mock_format, app, user, base_url, preprint, other_user): - mock_return = {'good': 'job'} - mock_execute.return_value.to_dict.return_value = mock_return - mock_format.return_value = mock_return - post_url = f'{base_url}downloads/' - post_data = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': mock_return - } - } - } - res = app.post_json_api(post_url, post_data, auth=user.auth) - assert res.json == mock_return - - @pytest.mark.parametrize('metric_name', ['downloads', 'views']) - @mock.patch('api.metrics.utils.timezone.now') - def test_preprint_list_with_metrics_fails(self, mock_timezone, app, user, base_url, preprint, preprint_two, - preprint_three, metric_name, other_user, project, project_two, - other_admin_user, other_non_admin_user): - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - url = f'{base_url}{metric_name}/' - - one_preprint_url = f'{url}?guids={preprint._id}' - # test non-logged in cannot access - res = app.get(one_preprint_url, expect_errors=True) - assert res.status_code == 401 - - # test logged in non-metrics, non-admin user cannot access - res = app.get(one_preprint_url, auth=other_user.auth, expect_errors=True) - assert res.status_code == 403 - - # test logged in, non-metrics, admin user cannot access - res = app.get(one_preprint_url, auth=other_admin_user.auth, expect_errors=True) - assert res.status_code == 403 - - # test logged in, metrics, non-admin user cannot access - res = app.get(one_preprint_url, auth=other_non_admin_user.auth, expect_errors=True) - assert res.status_code == 403 - - @pytest.mark.skip('Return results will be entirely mocked so does not make a lot of sense to run on ci.') - @mock.patch('api.metrics.utils.timezone.now') - def test_preprint_with_metrics_succeeds(self, mock_timezone, app, user, base_url, preprint, other_user, - preprint_no_results, metric_dates): - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - self.add_views_and_downloads(preprint, other_user, metric_dates) - metric_name = 'downloads' - - mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc) - url = f'{base_url}{metric_name}/' - one_preprint_url = f'{url}?guids={preprint._id}' - - # base url should return all results - res = app.get(one_preprint_url, auth=user.auth) - assert res.json['metric_type'] == metric_name - assert len(res.json['data']) == 3 - - # starting a day later only returns 2 results - later_url = f'{one_preprint_url}&start_datetime=2019-01-02' - res = app.get(later_url, auth=user.auth) - assert len(res.json['data']) == 2 - datetimes = [result.keys()[0] for result in res.json['data']] - assert '2019-01-01T00:05:00.000Z' not in datetimes - - # filter between two specific datetimes - two_times_url = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00' - res = app.get(two_times_url, auth=user.auth) - assert len(res.json['data']) == 1 - datetimes = [result.keys()[0] for result in res.json['data']] - assert '2019-01-01T00:05:00.000Z' not in datetimes - assert '2019-01-01T03:05:00.000Z' not in datetimes - - # test two specific datetimes with minute interval - two_min_interval = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00&interval=1m' - res = app.get(two_min_interval, auth=user.auth) - assert len(res.json['data']) == 61 - first = res.json['data'][0] - last = res.json['data'][-1] - assert first.keys() == ['2019-01-02T00:05:00.000Z'] - assert first['2019-01-02T00:05:00.000Z'] == {preprint._id: 1} - assert last.keys() == ['2019-01-02T01:05:00.000Z'] - assert last['2019-01-02T01:05:00.000Z'] == {preprint._id: 1} - - # make sure requesting one preprint with no results is OK - non_preprint_url = f'{url}?guids={preprint_no_results._id}' - res = app.get(non_preprint_url, auth=user.auth) - assert res.status_code == 200 - assert res.json['data'] == [] diff --git a/api_tests/metrics/test_queries.py b/api_tests/metrics/test_queries.py index 8b19247f5b4..a7a2ef1e6d7 100644 --- a/api_tests/metrics/test_queries.py +++ b/api_tests/metrics/test_queries.py @@ -1,111 +1,259 @@ +import datetime from unittest import mock -import pytest +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase +from osf.metrics.events import OsfCountedUsageEvent from osf_tests.factories import NodeFactory, AuthUserFactory -@pytest.mark.django_db -class TestNodeAnalyticsQuery: - @pytest.fixture - def mock_search(self): - with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search: - yield mock_search - @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month']) - def test_private_node(self, app, mock_search, timespan): - node = NodeFactory(is_public=False) - guid = node._id - resp = app.get( - f'/_/metrics/query/node_analytics/{guid}/{timespan}/', - expect_errors=True, - ) - assert resp.status_code == 401 +class TestNodeAnalyticsQueryErrors: + def test_private_node_anon(self, app): + _node = NodeFactory(is_public=False) + with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search: + for timespan in ['week', 'fortnight', 'month']: + resp = app.get( + f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/', + expect_errors=True, + ) + assert resp.status_code == 401 + assert _mock_search.call_count == 0 + + def test_private_node_rando(self, app): + _node = NodeFactory(is_public=False) + _user = AuthUserFactory() + with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search: + for timespan in ['week', 'fortnight', 'month']: + resp = app.get( + f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/', + expect_errors=True, + auth=_user.auth, + ) + assert resp.status_code == 403 + assert _mock_search.call_count == 0 - user = AuthUserFactory() - resp = app.get( - f'/_/metrics/query/node_analytics/{guid}/{timespan}/', - auth=user.auth, - expect_errors=True, - ) - assert resp.status_code == 403 - assert mock_search.call_count == 0 +class TestNodeAnalyticsQuery(RealElasticTestCase, TestCase): + def setUp(self): + super().setUp() + self._node = NodeFactory(is_public=True) + self._osfid = self._node._id + self._today = datetime.date.today() + self._now = datetime.datetime( + self._today.year, + self._today.month, + self._today.day, + 12, + tzinfo=datetime.UTC, + ) + ### + # past week + OsfCountedUsageEvent.record( + sessionhour_id='s1', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/path', + 'route_name': 'page.route', + 'page_title': 'foo', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s2', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/path', + 'route_name': 'page.route', + 'page_title': 'foo', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s3', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1, hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s4', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=1, hours=2), + pageview_info={ + 'referer_url': 'http://elsewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s5', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=2, hours=1), + pageview_info={ + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + OsfCountedUsageEvent.record( + sessionhour_id='s6', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=2, hours=2), + pageview_info={ + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + ### + # past fortnight + OsfCountedUsageEvent.record( + sessionhour_id='s7', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=10, hours=1), + pageview_info={ + 'referer_url': 'http://elsewhere.example.com/there', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + ### + # past month + OsfCountedUsageEvent.record( + sessionhour_id='s8', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=20, hours=1), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/anothere', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + ### + # older than a month + OsfCountedUsageEvent.record( + sessionhour_id='s9', + item_osfid=self._osfid, + action_labels=['view', 'web'], + timestamp=self._now - datetime.timedelta(days=80, hours=7), + pageview_info={ + 'referer_url': 'http://somewhere.example.com/anothere', + 'page_url': 'http://osf.example/page/another', + 'route_name': 'page.another', + 'page_title': 'blaz', + } + ) + # refresh + OsfCountedUsageEvent.refresh() - @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month']) - def test_public_node(self, app, mock_search, timespan): - node = NodeFactory(is_public=True) - guid = node._id - mock_search.return_value = { - 'aggregations': { - 'popular-pages': { - 'buckets': [ - { - 'key': '/page/path', - 'doc_count': 17, - 'route-for-path': { - 'buckets': [{'key': 'page.route'}], - }, - 'title-for-path': { - 'buckets': [{'key': 'foo'}], - }, - }, - { - 'key': '/page/another', - 'doc_count': 7, - 'route-for-path': { - 'buckets': [{'key': 'page.another'}], - }, - 'title-for-path': { - 'buckets': [{'key': 'blaz'}], - }, - }, - ], - }, - 'unique-visits': { - 'buckets': [ - {'key': 1646265600000, 'key_as_string': '2022-03-03', 'doc_count': 8}, - {'key': 1646352000000, 'key_as_string': '2022-03-04', 'doc_count': 1}, - ], - }, - 'time-of-day': { - 'buckets': [ - {'key': 8, 'doc_count': 1}, - {'key': 9, 'doc_count': 2}, - {'key': 10, 'doc_count': 3}, - ], - }, - 'referer-domain': { - 'buckets': [ - {'key': 'somewhere.example.com', 'doc_count': 9}, - {'key': 'elsewhere.example.com', 'doc_count': 4}, - ], - }, + def test_public_node(self): + _week_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/week/') + assert _week_resp.json()['data'] == { + 'id': f'{self._osfid}:week', + 'type': 'node-analytics', + 'attributes': { + 'popular_pages': [ + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 4}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, + ], + 'unique_visits': [ + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, + ], + 'time_of_day': [ + {'hour': 11, 'count': 3}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, + ], + 'referer_domain': [ + {'referer_domain': 'somewhere.example.com', 'count': 3}, + {'referer_domain': 'elsewhere.example.com', 'count': 1}, + ], }, } - resp = app.get(f'/_/metrics/query/node_analytics/{guid}/{timespan}/') - assert resp.json['data'] == { - 'id': f'{guid}:{timespan}', + _fortnight_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/fortnight/') + assert _fortnight_resp.json()['data'] == { + 'id': f'{self._osfid}:fortnight', 'type': 'node-analytics', 'attributes': { 'popular_pages': [ - {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 17}, - {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 7}, + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 5}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, ], 'unique_visits': [ - {'date': '2022-03-03', 'count': 8}, - {'date': '2022-03-04', 'count': 1}, + {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(9, 2, -1) + ), + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, ], 'time_of_day': [ - {'hour': 8, 'count': 1}, - {'hour': 9, 'count': 2}, - {'hour': 10, 'count': 3}, + {'hour': 11, 'count': 4}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, ], 'referer_domain': [ - {'referer_domain': 'somewhere.example.com', 'count': 9}, - {'referer_domain': 'elsewhere.example.com', 'count': 4}, + {'referer_domain': 'somewhere.example.com', 'count': 3}, + {'referer_domain': 'elsewhere.example.com', 'count': 2}, ], }, } - assert mock_search.call_count == 1 + _month_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/month/') + assert _month_resp.json()['data'] == { + 'id': f'{self._osfid}:month', + 'type': 'node-analytics', + 'attributes': { + 'popular_pages': [ + {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 6}, + {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2}, + ], + 'unique_visits': [ + {'date': str(self._today - datetime.timedelta(days=20)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(19, 10, -1) + ), + {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1}, + *( + {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0} + for _n in range(9, 2, -1) + ), + {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2}, + {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3}, + {'date': str(self._today), 'count': 1}, + ], + 'time_of_day': [ + {'hour': 11, 'count': 5}, + {'hour': 10, 'count': 2}, + {'hour': 12, 'count': 1}, + ], + 'referer_domain': [ + {'referer_domain': 'somewhere.example.com', 'count': 4}, + {'referer_domain': 'elsewhere.example.com', 'count': 2}, + ], + }, + } diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py index e32936d9024..a30be5584e7 100644 --- a/api_tests/metrics/test_raw_metrics.py +++ b/api_tests/metrics/test_raw_metrics.py @@ -1,10 +1,8 @@ -import pytest - -from website.app import setup_django -setup_django() +from http import HTTPStatus +import pytest from waffle.testutils import override_switch -from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.dsl.connections import connections as es8_connections from osf import features from osf_tests.factories import AuthUserFactory @@ -14,7 +12,7 @@ pytestmark = pytest.mark.django_db -@pytest.mark.es_metrics +@pytest.mark.djelme_elasticsearch_backends class TestRawMetrics: @pytest.fixture(autouse=True) @@ -23,10 +21,11 @@ def enable_elasticsearch_metrics(self): yield @pytest.fixture(autouse=True) - def teardown_customer_index(self, es6_client): - es6_client.indices.delete(index='customer', ignore_unavailable=True) + def teardown_customer_index(self): + _es8_client = es8_connections.get_connection('osfmetrics_es8') + _es8_client.indices.delete(index='customer', ignore_unavailable=True) yield - es6_client.indices.delete(index='customer', ignore_unavailable=True) + _es8_client.indices.delete(index='customer', ignore_unavailable=True) @pytest.fixture def user(self): @@ -40,19 +39,17 @@ def user(self): def other_user(self): return AuthUserFactory() - @pytest.fixture(params=['raw', 'raw-osfmetrics_es6']) + @pytest.fixture def base_url(self, request): - return f'/{API_BASE}metrics/{request.param}/' + return f'/{API_BASE}metrics/raw-osfmetrics_es8/' def test_delete(self, app, user, base_url): res = app.delete_json_api(base_url, auth=user.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'DELETE not supported. Use GET/POST/PUT' + assert res.status_code == HTTPStatus.METHOD_NOT_ALLOWED def test_put(self, app, user, base_url): put_return = { '_index': 'customer', - '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', @@ -69,7 +66,7 @@ def test_put(self, app, user, base_url): put_data = { 'name': 'John Doe' } - res = app.put_json_api(put_url, put_data, auth=user.auth) + res = app.put_json_api(put_url, put_data, headers={'Content-Type': 'application/json'}, auth=user.auth) assert res.json == put_return def test_put_no_perms(self, app, other_user, base_url): @@ -77,14 +74,13 @@ def test_put_no_perms(self, app, other_user, base_url): put_data = { 'name': 'John Doe' } - res = app.put_json_api(put_url, put_data, auth=other_user.auth, expect_errors=True) + res = app.put_json_api(put_url, put_data, auth=other_user.auth, headers={'Content-Type': 'application/json'}, expect_errors=True) assert res.status_code == 403 assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' def test_post(self, app, user, base_url): post_return = { '_index': 'customer', - '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', @@ -101,7 +97,7 @@ def test_post(self, app, user, base_url): post_data = { 'name': 'Jane Doe' } - res = app.post_json_api(post_url, post_data, auth=user.auth) + res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=user.auth) assert res.json == post_return def test_post_no_perms(self, app, other_user, base_url): @@ -109,14 +105,13 @@ def test_post_no_perms(self, app, other_user, base_url): post_data = { 'name': 'John Doe' } - res = app.post_json_api(post_url, post_data, auth=other_user.auth, expect_errors=True) + res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=other_user.auth, expect_errors=True) assert res.status_code == 403 assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' def test_post_and_get(self, app, user, base_url): post_return = { '_index': 'customer', - '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', @@ -133,17 +128,17 @@ def test_post_and_get(self, app, user, base_url): post_data = { 'name': 'Beyonce' } - res = app.post_json_api(post_url, post_data, auth=user.auth) + res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=user.auth) assert res.json == post_return - es6_connections.get_connection('osfmetrics_es6').indices.refresh( + es8_connections.get_connection('osfmetrics_es8').indices.refresh( index='customer', ) get_url = f'{base_url}customer/_search?q=*' res = app.get(get_url, auth=user.auth) - assert res.json['hits']['total'] == 1 + assert res.json['hits']['total']['value'] == 1 assert res.json['hits']['hits'][0]['_source']['name'] == 'Beyonce' get_url = f'{base_url}customer/_doc/1/' diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index f5d3a047b10..feaf48b7de9 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -2,12 +2,11 @@ from osf_tests.factories import RegistrationFactory, AuthUserFactory from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers -from osf.metrics import RegistriesModerationMetrics +from osf.metrics.events import RegistriesModerationEvent from tests.utils import capture_notifications -pytestmark = pytest.mark.django_db - +@pytest.mark.djelme_elasticsearch_backends @pytest.mark.django_db class TestRegistrationModerationMetrics: @@ -15,7 +14,6 @@ class TestRegistrationModerationMetrics: def registration(self): return RegistrationFactory() - @pytest.mark.es_metrics def test_record_transitions(self, registration): with capture_notifications(): registration._write_registration_action( @@ -24,10 +22,10 @@ def test_record_transitions(self, registration): registration.creator, 'Metrics is easy' ) - RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) + RegistriesModerationEvent.refresh() - assert RegistriesModerationMetrics.search().count() == 1 - data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source'] + assert RegistriesModerationEvent.search().count() == 1 + data = RegistriesModerationEvent.search().execute()['hits']['hits'][0]['_source'] assert data['from_state'] == RegistrationModerationStates.INITIAL.db_name assert data['to_state'] == RegistrationModerationStates.PENDING.db_name @@ -36,6 +34,7 @@ def test_record_transitions(self, registration): assert data['comment'] == 'Metrics is easy' +@pytest.mark.djelme_elasticsearch_backends @pytest.mark.django_db class TestRegistrationModerationMetricsView: @@ -59,7 +58,6 @@ def other_user(self): def base_url(self): return '/_/metrics/registries_moderation/transitions/' - @pytest.mark.es_metrics def test_registries_moderation_view(self, app, user, base_url, registration): with capture_notifications(): registration._write_registration_action( @@ -68,7 +66,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration): registration.creator, 'Metrics is easy' ) - RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) + RegistriesModerationEvent.refresh() res = app.get(base_url, auth=user.auth, expect_errors=True) data = res.json diff --git a/api_tests/metrics/test_reports.py b/api_tests/metrics/test_reports.py index db748bdb05b..bebb42059b8 100644 --- a/api_tests/metrics/test_reports.py +++ b/api_tests/metrics/test_reports.py @@ -21,7 +21,7 @@ def mock_domain(self): @pytest.fixture def mock_search(self): - with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search: + with mock.patch('elasticsearch8.Elasticsearch.search', autospec=True) as mock_search: yield mock_search def test_report_names(self, app, mock_domain): @@ -44,11 +44,11 @@ def test_report_names(self, app, mock_domain): @pytest.mark.parametrize('report_name', expected_report_names) def test_recent_reports(self, app, mock_domain, mock_search, report_name): - mock_search.return_value = { + mock_search.return_value.body = { 'hits': { 'hits': [ - {'_id': 'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye'}}, - {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa'}}, + {'_id': 'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye', 'created': '1235-12-13T01:00:00Z'}}, + {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa', 'created': '1235-12-12T01:00:00Z'}}, ], }, } @@ -58,17 +58,19 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name): assert resp.json['data'] == [ { 'id': 'hi-by', - 'type': f'daily-report:{report_name}', + 'type': f'cyclic-report:{report_name}', 'attributes': { 'report_date': '1234-12-12', 'hello': 'goodbye', + 'created': '1235-12-13T01:00:00Z', }, }, { 'id': 'doof', - 'type': f'daily-report:{report_name}', + 'type': f'cyclic-report:{report_name}', 'attributes': { 'report_date': '1234-12-11', 'hello': 'upwa', + 'created': '1235-12-12T01:00:00Z', }, } ] @@ -84,12 +86,12 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name): assert resp.unicode_body == CSV_REPORTS -TSV_REPORTS = '''report_date hello -1234-12-12 goodbye -1234-12-11 upwa +TSV_REPORTS = '''report_date created hello +1234-12-12 1235-12-13 01:00:00+00:00 goodbye +1234-12-11 1235-12-12 01:00:00+00:00 upwa '''.replace('\n', '\r\n') -CSV_REPORTS = '''report_date,hello -1234-12-12,goodbye -1234-12-11,upwa +CSV_REPORTS = '''report_date,created,hello +1234-12-12,1235-12-13 01:00:00+00:00,goodbye +1234-12-11,1235-12-12 01:00:00+00:00,upwa '''.replace('\n', '\r\n') diff --git a/api_tests/preprints/views/test_preprint_detail_metrics.py b/api_tests/preprints/views/test_preprint_detail_metrics.py index f98777be678..9d945e8159f 100644 --- a/api_tests/preprints/views/test_preprint_detail_metrics.py +++ b/api_tests/preprints/views/test_preprint_detail_metrics.py @@ -17,17 +17,13 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ELASTICSEARCH_METRICS, active=True): yield - @pytest.mark.parametrize(('metric_name', 'metric_class_name'), - [ - ('downloads', 'PreprintDownload'), - ('views', 'PreprintView'), - ]) - def test_preprint_detail_with_downloads(self, app, settings, metric_name, metric_class_name): + @pytest.mark.parametrize('metric_name', ['downloads', 'views']) + def test_preprint_detail_with_downloads(self, app, settings, metric_name): preprint = PreprintFactory() url = f'/{API_BASE}preprints/{preprint._id}/?metrics[{metric_name}]=total' - with mock.patch(f'api.preprints.views.{metric_class_name}.get_count_for_preprint') as mock_get_count_for_preprint: - mock_get_count_for_preprint.return_value = 42 + with mock.patch('api.base.metrics.UsageMetricsViewMixin._get_usage_count') as mock_get_count: + mock_get_count.return_value = 42 res = app.get(url) assert res.status_code == 200 diff --git a/api_tests/preprints/views/test_preprint_list.py b/api_tests/preprints/views/test_preprint_list.py index 3208c397893..15d12079328 100644 --- a/api_tests/preprints/views/test_preprint_list.py +++ b/api_tests/preprints/views/test_preprint_list.py @@ -1,9 +1,8 @@ from unittest import mock -import datetime as dt import pytest from django.utils import timezone -from waffle.testutils import override_switch, override_flag +from waffle.testutils import override_flag from addons.github.models import GithubFile from api.base.settings.defaults import API_BASE @@ -1027,65 +1026,3 @@ def provider(self): @pytest.fixture() def url(self, project): return f'/{API_BASE}preprints/?version=2.2&' - - -@pytest.mark.django_db -class TestPreprintListWithMetrics: - - # enable the ELASTICSEARCH_METRICS switch for all tests - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - @pytest.mark.parametrize( - ('metric_name', 'metric_class_name'), - [ - ('downloads', 'PreprintDownload'), - ('views', 'PreprintView'), - ], - ) - def test_preprint_list_with_metrics(self, app, metric_name, metric_class_name): - url = f'/{API_BASE}preprints/?metrics[{metric_name}]=total' - preprint1 = PreprintFactory() - preprint1.downloads = 41 - preprint2 = PreprintFactory() - preprint2.downloads = 42 - - with mock.patch(f'api.preprints.views.{metric_class_name}.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [preprint2, preprint1] - res = app.get(url) - assert res.status_code == 200 - - preprint_2_data = res.json['data'][0] - assert preprint_2_data['meta']['metrics']['downloads'] == 42 - - preprint_1_data = res.json['data'][1] - assert preprint_1_data['meta']['metrics']['downloads'] == 41 - - @mock.patch('django.utils.timezone.now') - @pytest.mark.parametrize( - ('query_value', 'timedelta'), - [ - ('daily', dt.timedelta(days=1)), - ('weekly', dt.timedelta(days=7)), - ('yearly', dt.timedelta(days=365)), - ], - ) - def test_preprint_list_filter_metric_by_time_period(self, mock_timezone_now, app, settings, query_value, timedelta): - url = f'/{API_BASE}preprints/?metrics[views]={query_value}' - mock_now = dt.datetime.utcnow().replace(tzinfo=timezone.utc) - mock_timezone_now.return_value = mock_now - - preprint1 = PreprintFactory() - preprint1.views = 41 - preprint2 = PreprintFactory() - preprint2.views = 42 - - with mock.patch('api.preprints.views.PreprintView.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [preprint2, preprint1] - res = app.get(url) - - assert res.status_code == 200 - call_kwargs = mock_get_top_by_count.call_args[1] - assert call_kwargs['after'] == mock_now - timedelta diff --git a/api_tests/providers/preprints/views/test_preprint_provider_list.py b/api_tests/providers/preprints/views/test_preprint_provider_list.py index c1624fd58f9..21499744d77 100644 --- a/api_tests/providers/preprints/views/test_preprint_provider_list.py +++ b/api_tests/providers/preprints/views/test_preprint_provider_list.py @@ -1,8 +1,5 @@ -from unittest import mock import pytest -from waffle.testutils import override_switch -from osf import features from api.base.settings.defaults import API_BASE from osf_tests.factories import ( AuthUserFactory, @@ -65,28 +62,3 @@ def test_preprint_provider_list_filtering( url, filter_type, filter_value)) assert res.status_code == 200 assert len(res.json['data']) == 1 - - -@pytest.mark.django_db -class TestPreprintProviderListWithMetrics: - - # enable the ELASTICSEARCH_METRICS switch for all tests - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - - def test_preprint_provider_list_with_metrics(self, app, url, provider_one, provider_two): - provider_one.downloads = 41 - provider_two.downloads = 42 - with mock.patch('api.preprints.views.PreprintDownload.get_top_by_count') as mock_get_top_by_count: - mock_get_top_by_count.return_value = [provider_one, provider_two] - res = app.get(url + 'metrics[downloads]=total') - - assert res.status_code == 200 - - provider_2_data = res.json['data'][0] - provider_2_data['meta']['metrics']['downloads'] == 42 - - provider_1_data = res.json['data'][1] - provider_1_data['meta']['metrics']['downloads'] == 41 diff --git a/api_tests/share/test_share_node.py b/api_tests/share/test_share_node.py index 2fcc9cc48a8..9466f603fda 100644 --- a/api_tests/share/test_share_node.py +++ b/api_tests/share/test_share_node.py @@ -21,7 +21,10 @@ from framework.auth.core import Auth from api.share.utils import shtrove_ingest_url -from ._utils import expect_ingest_request +from ._utils import ( + expect_ingest_request, + mock_share_responses, +) @pytest.mark.django_db @@ -30,9 +33,12 @@ class TestNodeShare: @pytest.fixture(scope='class', autouse=True) def _patches(self): - with patch('osf.models.identifiers.IdentifierMixin.request_identifier_update'): - with patch.object(settings, 'USE_CELERY', False): - yield + with ( + patch.object(settings, 'USE_CELERY', False), + patch('osf.models.identifiers.IdentifierMixin.request_identifier_update'), + patch('osf.metadata.osf_gathering.MonthlyPublicItemUsageReport.from_last_month', return_value=()), + ): + yield @pytest.fixture() def user(self): @@ -98,15 +104,21 @@ def registration_outcome(self, registration): ) return o - def test_update_node_share(self, mock_share_responses, node, user): - with expect_ingest_request(mock_share_responses, node): + def test_update_node_share(self, node, user): + with ( + mock_share_responses() as _mock_share_responses, + expect_ingest_request(_mock_share_responses, node), + ): on_node_updated(node._id, user._id, False, {'is_public'}) - def test_update_registration_share(self, mock_share_responses, registration, user): - with expect_ingest_request(mock_share_responses, registration): + def test_update_registration_share(self, registration, user): + with ( + mock_share_responses() as _mock_share_responses, + expect_ingest_request(_mock_share_responses, registration), + ): on_node_updated(registration._id, user._id, False, {'is_public'}) - def test_update_share_correctly_for_projects(self, mock_share_responses, node, user): + def test_update_share_correctly_for_projects(self, node, user): cases = [{ 'is_deleted': False, 'attrs': {'is_public': True, 'is_deleted': False, 'spam_status': SpamStatus.HAM} @@ -121,14 +133,16 @@ def test_update_share_correctly_for_projects(self, mock_share_responses, node, u 'attrs': {'is_public': True, 'is_deleted': False, 'spam_status': SpamStatus.SPAM} }] - mock_share_responses._calls.reset() # reset after factory calls for i, case in enumerate(cases): for attr, value in case['attrs'].items(): setattr(node, attr, value) - with expect_ingest_request(mock_share_responses, node, delete=case['is_deleted']): + with ( + mock_share_responses() as _mock_share_responses, + expect_ingest_request(_mock_share_responses, node, delete=case['is_deleted']), + ): node.save() - def test_update_share_correctly_for_registrations(self, mock_share_responses, registration, user): + def test_update_share_correctly_for_registrations(self, registration, user): cases = [{ 'is_deleted': True, 'attrs': {'is_public': False, 'is_deleted': False} @@ -140,44 +154,50 @@ def test_update_share_correctly_for_registrations(self, mock_share_responses, re 'attrs': {'is_public': True, 'is_deleted': False} }] - mock_share_responses._calls.reset() # reset after factory calls for i, case in enumerate(cases): for attr, value in case['attrs'].items(): setattr(registration, attr, value) - with expect_ingest_request(mock_share_responses, registration, delete=case['is_deleted']): + with ( + mock_share_responses() as _mock_share_responses, + expect_ingest_request(_mock_share_responses, registration, delete=case['is_deleted']), + ): registration.save() assert registration.is_registration - def test_update_share_correctly_for_projects_with_qa_tags(self, mock_share_responses, node, user): - with expect_ingest_request(mock_share_responses, node, delete=True): - node.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user)) - with expect_ingest_request(mock_share_responses, node, delete=False): - node.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True) - - def test_update_share_correctly_for_registrations_with_qa_tags(self, mock_share_responses, registration, user): - with expect_ingest_request(mock_share_responses, registration, delete=True): - registration.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user)) - with expect_ingest_request(mock_share_responses, registration): - registration.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True) - - def test_update_share_correctly_for_projects_with_qa_titles(self, mock_share_responses, node, user): + def test_update_share_correctly_for_projects_with_qa_tags(self, node, user): + with mock_share_responses() as _mock_share_responses: + with expect_ingest_request(_mock_share_responses, node, delete=True): + node.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user)) + with expect_ingest_request(_mock_share_responses, node, delete=False): + node.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True) + + def test_update_share_correctly_for_registrations_with_qa_tags(self, registration, user): + with mock_share_responses() as _mock_share_responses: + with expect_ingest_request(_mock_share_responses, registration, delete=True): + registration.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user)) + with expect_ingest_request(_mock_share_responses, registration): + registration.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True) + + def test_update_share_correctly_for_projects_with_qa_titles(self, node, user): node.title = settings.DO_NOT_INDEX_LIST['titles'][0] + ' arbitrary text for test title.' node.save() - with expect_ingest_request(mock_share_responses, node, delete=True): - on_node_updated(node._id, user._id, False, {'is_public'}) - node.title = 'Not a qa title' - with expect_ingest_request(mock_share_responses, node): - node.save() - assert node.title not in settings.DO_NOT_INDEX_LIST['titles'] + with mock_share_responses() as _mock_share_responses: + with expect_ingest_request(_mock_share_responses, node, delete=True): + on_node_updated(node._id, user._id, False, {'is_public'}) + node.title = 'Not a qa title' + with expect_ingest_request(_mock_share_responses, node): + node.save() + assert node.title not in settings.DO_NOT_INDEX_LIST['titles'] - def test_update_share_correctly_for_registrations_with_qa_titles(self, mock_share_responses, registration, user): + def test_update_share_correctly_for_registrations_with_qa_titles(self, registration, user): registration.title = settings.DO_NOT_INDEX_LIST['titles'][0] + ' arbitrary text for test title.' - with expect_ingest_request(mock_share_responses, registration, delete=True): - registration.save() - registration.title = 'Not a qa title' - with expect_ingest_request(mock_share_responses, registration): - registration.save() - assert registration.title not in settings.DO_NOT_INDEX_LIST['titles'] + with mock_share_responses() as _mock_share_responses: + with expect_ingest_request(_mock_share_responses, registration, delete=True): + registration.save() + registration.title = 'Not a qa title' + with expect_ingest_request(_mock_share_responses, registration): + registration.save() + assert registration.title not in settings.DO_NOT_INDEX_LIST['titles'] @responses.activate def test_skips_no_settings(self, node, user): @@ -185,22 +205,25 @@ def test_skips_no_settings(self, node, user): assert len(responses.calls) == 0 @mark.skip('Synchronous retries not supported if celery >=5.0') - def test_call_async_update_on_500_retry(self, mock_share_responses, node, user): + def test_call_async_update_on_500_retry(self, node, user): """This is meant to simulate a temporary outage, so the retry mechanism should kick in and complete it.""" - mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) - mock_share_responses.add(responses.POST, shtrove_ingest_url(), status=200) - with expect_ingest_request(mock_share_responses, node, count=2): - on_node_updated(node._id, user._id, False, {'is_public'}) + with mock_share_responses() as _mock_share_responses: + _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) + _mock_share_responses.add(responses.POST, shtrove_ingest_url(), status=200) + with expect_ingest_request(_mock_share_responses, node, count=2): + on_node_updated(node._id, user._id, False, {'is_public'}) @mark.skip('Synchronous retries not supported if celery >=5.0') - def test_call_async_update_on_500_failure(self, mock_share_responses, node, user): + def test_call_async_update_on_500_failure(self, node, user): """This is meant to simulate a total outage, so the retry mechanism should try X number of times and quit.""" - mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) - with expect_ingest_request(mock_share_responses, node, count=5): # tries five times - on_node_updated(node._id, user._id, False, {'is_public'}) + with mock_share_responses() as _mock_share_responses: + _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) + with expect_ingest_request(_mock_share_responses, node, count=5): # tries five times + on_node_updated(node._id, user._id, False, {'is_public'}) @mark.skip('Synchronous retries not supported if celery >=5.0') - def test_no_call_async_update_on_400_failure(self, mock_share_responses, node, user): - mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) - with expect_ingest_request(mock_share_responses, node): - on_node_updated(node._id, user._id, False, {'is_public'}) + def test_no_call_async_update_on_400_failure(self, node, user): + with mock_share_responses() as _mock_share_responses: + _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) + with expect_ingest_request(_mock_share_responses, node): + on_node_updated(node._id, user._id, False, {'is_public'}) diff --git a/api_tests/share/test_share_preprint.py b/api_tests/share/test_share_preprint.py index 118abf3105b..ca58a868e47 100644 --- a/api_tests/share/test_share_preprint.py +++ b/api_tests/share/test_share_preprint.py @@ -17,8 +17,11 @@ ) from website import settings from website.preprints.tasks import on_preprint_updated -from ._utils import expect_preprint_ingest_request from tests.utils import capture_notifications +from ._utils import ( + expect_preprint_ingest_request, + mock_share_responses, +) @pytest.mark.django_db @@ -26,7 +29,10 @@ class TestPreprintShare: @pytest.fixture(scope='class', autouse=True) def _patches(self): - with mock.patch.object(settings, 'USE_CELERY', False): + with ( + mock.patch.object(settings, 'USE_CELERY', False), + mock.patch('osf.metadata.osf_gathering.MonthlyPublicItemUsageReport.from_last_month', return_value=()), + ): yield @pytest.fixture @@ -45,7 +51,7 @@ def provider(self): ) @pytest.fixture - def project(self, user, mock_share_responses): + def project(self, user): return ProjectFactory(creator=user, is_public=True) @pytest.fixture @@ -67,89 +73,121 @@ def preprint(self, project, user, provider, subject): is_published=False ) - def test_save_unpublished_not_called(self, mock_share_responses, preprint): + def test_save_unpublished_not_called(self, preprint): # expecting no ingest requests (delete or otherwise) - with expect_preprint_ingest_request(mock_share_responses, preprint, count=0): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint, count=0), + ): preprint.save() - def test_save_published_called(self, mock_share_responses, preprint, user, auth): - with capture_notifications(): - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.set_published(True, auth=auth, save=True) + def test_save_published_called(self, preprint, user, auth): + with ( + capture_notifications(), + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint), + ): + preprint.set_published(True, auth=auth, save=True) # This covers an edge case where a preprint is forced back to unpublished # that it sends the information back to share - def test_save_unpublished_called_forced(self, mock_share_responses, auth, preprint): - with capture_notifications(): - with expect_preprint_ingest_request(mock_share_responses, preprint): + def test_save_unpublished_called_forced(self, auth, preprint): + with ( + capture_notifications(), + mock_share_responses() as _mock_share_responses, + ): + with expect_preprint_ingest_request(_mock_share_responses, preprint): preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True): + with expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True): preprint.is_published = False preprint.save(**{'force_update': True}) - def test_save_published_subject_change_called(self, mock_share_responses, auth, preprint, subject, subject_two): + def test_save_published_subject_change_called(self, auth, preprint, subject, subject_two): with capture_notifications(): preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint), + ): preprint.set_subjects([[subject_two._id]], auth=auth) - def test_save_unpublished_subject_change_not_called(self, mock_share_responses, auth, preprint, subject_two): - with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True): + def test_save_unpublished_subject_change_not_called(self, auth, preprint, subject_two): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True), + ): preprint.set_subjects([[subject_two._id]], auth=auth) - def test_send_to_share_is_true(self, mock_share_responses, auth, preprint): + def test_send_to_share_is_true(self, auth, preprint): with capture_notifications(): preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint), + ): on_preprint_updated(preprint._id, saved_fields=['title']) - def test_preprint_contributor_changes_updates_preprints_share(self, mock_share_responses, user, auth): + def test_preprint_contributor_changes_updates_preprints_share(self, user, auth): with capture_notifications(): preprint = PreprintFactory(is_published=True, creator=user) preprint.set_published(True, auth=auth, save=True) user2 = AuthUserFactory() - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.add_contributor(contributor=user2, auth=auth, save=True) + with mock_share_responses() as _mock_share_responses: + with expect_preprint_ingest_request(_mock_share_responses, preprint): + preprint.add_contributor(contributor=user2, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.move_contributor(contributor=user, index=0, auth=auth, save=True) + with expect_preprint_ingest_request(_mock_share_responses, preprint): + preprint.move_contributor(contributor=user, index=0, auth=auth, save=True) - data = [{'id': user._id, 'permissions': ADMIN, 'visible': True}, - {'id': user2._id, 'permissions': WRITE, 'visible': False}] + data = [{'id': user._id, 'permissions': ADMIN, 'visible': True}, + {'id': user2._id, 'permissions': WRITE, 'visible': False}] - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.manage_contributors(data, auth=auth, save=True) + with expect_preprint_ingest_request(_mock_share_responses, preprint): + preprint.manage_contributors(data, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.update_contributor(user2, READ, True, auth=auth, save=True) + with expect_preprint_ingest_request(_mock_share_responses, preprint): + preprint.update_contributor(user2, READ, True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint): - preprint.remove_contributor(contributor=user2, auth=auth) + with expect_preprint_ingest_request(_mock_share_responses, preprint): + preprint.remove_contributor(contributor=user2, auth=auth) @pytest.mark.skip('Synchronous retries not supported if celery >=5.0') - def test_call_async_update_on_500_failure(self, mock_share_responses, preprint, auth): - mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) + def test_call_async_update_on_500_failure(self, preprint, auth): preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint, count=5): - preprint.update_search() + with mock_share_responses() as _mock_share_responses: + _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500) + with expect_preprint_ingest_request(_mock_share_responses, preprint, count=5): + preprint.update_search() - def test_no_call_async_update_on_400_failure(self, mock_share_responses, preprint, auth): + def test_no_call_async_update_on_400_failure(self, preprint, auth): with capture_notifications(): - mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint, count=1, error_response=True): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint, count=1, error_response=True), + ): + _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) preprint.update_search() - def test_delete_from_share(self, mock_share_responses): + def test_delete_from_share(self): preprint = PreprintFactory() - with expect_preprint_ingest_request(mock_share_responses, preprint): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint), + ): preprint.update_search() preprint.date_withdrawn = datetime.now() preprint.save() - with expect_preprint_ingest_request(mock_share_responses, preprint): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint), + ): preprint.update_search() preprint.spam_status = SpamStatus.SPAM preprint.save() - with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True): + with ( + mock_share_responses() as _mock_share_responses, + expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True), + ): preprint.update_search() diff --git a/conftest.py b/conftest.py index e80c4e5c566..165fae951ca 100644 --- a/conftest.py +++ b/conftest.py @@ -4,7 +4,6 @@ import re from django.db import transaction -from elasticsearch6_dsl.connections import connections from elasticsearch_metrics.tests.util import djelme_test_backends from faker import Factory import pytest @@ -129,22 +128,12 @@ def _test_speedups_disable(request, settings, _test_speedups): patcher.start() -@pytest.fixture(scope='session') -def setup_connections(): - connections.create_connection(hosts=[website_settings.ELASTIC6_URI]) - - -@pytest.fixture(scope='function') -def es6_client(setup_connections): - return connections.get_connection() - - @pytest.fixture(scope='function', autouse=True) -def _es_metrics_marker(request): +def _djelme_elasticsearch_backends_marker(request): """Clear out all indices and index templates before and after - tests marked with `es_metrics`. + tests marked with `djelme_elasticsearch_backends`. """ - marker = request.node.get_closest_marker('es_metrics') + marker = request.node.get_closest_marker('djelme_elasticsearch_backends') if not marker: yield diff --git a/docker-compose-dist-arm64.override.yml b/docker-compose-dist-arm64.override.yml deleted file mode 100644 index cffa4bd8982..00000000000 --- a/docker-compose-dist-arm64.override.yml +++ /dev/null @@ -1,11 +0,0 @@ -## Reference README-docker-compose.md for instructions. - -services: - - ####### - # OSF # - ####### - - elasticsearch6: - image: quay.io/centerforopenscience/elasticsearch:es6-arm-6.3.1 - platform: linux/arm64 diff --git a/docker-compose.yml b/docker-compose.yml index 42f7efc5ce7..c62541d6596 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,8 +11,6 @@ volumes: external: false elasticsearch_data_vol: external: false - elasticsearch6_data_vol: - external: false elasticsearch8_data_vol: external: false rabbitmq_vol: @@ -67,22 +65,6 @@ services: - elasticsearch_data_vol:/usr/share/elasticsearch/data stdin_open: true - # Temporary: Remove when we've upgraded to ES6 - elasticsearch6: - image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1 - environment: - - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage - ports: - - 9201:9200 - volumes: - - elasticsearch6_data_vol:/usr/share/elasticsearch/data - healthcheck: - start_period: 15s - test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' - interval: 10s - retries: 30 - stdin_open: true - elasticsearch8: image: elasticsearch:8.19.14 environment: diff --git a/osf/features.yaml b/osf/features.yaml index cce490a25a4..1da56e44f79 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -93,11 +93,6 @@ switches: name: enable_inactive_schemas note: This is no longer used - - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024 - name: countedusage_unified_metrics_2024 - note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc - active: false - - flag_name: ENABLE_MAILHOG name: enable_mailhog note: This is used to enable the MailHog email testing service, this will allow emails to be sent to the diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py index 53e13472e74..b2c36adce38 100644 --- a/osf/management/commands/fake_metrics_reports.py +++ b/osf/management/commands/fake_metrics_reports.py @@ -4,18 +4,19 @@ from django.conf import settings from django.core.management.base import BaseCommand -from osf.metrics import ( - UserSummaryReport, - PreprintSummaryReport, +from osf.metrics.daily_reports import ( + DailyUserSummaryReport, + DailyPreprintSummaryReport, ) -from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport from osf.metrics.utils import YearMonth +from osf.models.base import osfid_iri from osf.models import PreprintProvider def fake_user_counts(days_back): yesterday = date.today() - timedelta(days=1) - first_report = UserSummaryReport( + first_report = DailyUserSummaryReport( report_date=(yesterday - timedelta(days=days_back)), active=randint(0, 23), deactivated=randint(0, 2), @@ -29,7 +30,7 @@ def fake_user_counts(days_back): last_report = first_report while last_report.report_date < yesterday: new_user_count = randint(0, 500) - new_report = UserSummaryReport( + new_report = DailyUserSummaryReport( report_date=(last_report.report_date + timedelta(days=1)), active=(last_report.active + randint(0, new_user_count)), deactivated=(last_report.deactivated + randint(0, new_user_count)), @@ -48,7 +49,7 @@ def fake_preprint_counts(days_back): for day_delta in range(days_back): for provider_key in provider_keys: preprint_count = randint(100, 5000) * (days_back - day_delta) - PreprintSummaryReport( + DailyPreprintSummaryReport( report_date=yesterday - timedelta(days=day_delta), provider_key=provider_key, preprint_count=preprint_count, @@ -57,16 +58,29 @@ def fake_preprint_counts(days_back): def fake_usage_reports(osfid: str, count: int): _ym = YearMonth.from_date(date.today()).prior() + _prior_report = None for _months in range(count): - PublicItemUsageReport.record( + _report = MonthlyPublicItemUsageReport( item_osfid=osfid, + item_iri=osfid_iri(osfid), report_yearmonth=_ym, view_count=(_vc := randint(0, 500)), - view_session_count=randint(0, _vc), + view_session_count=(_vsc := randint(0, _vc)), + cumulative_view_count=_vc, + cumulative_view_session_count=_vsc, download_count=(_dc := randint(0, 300)), - download_session_count=randint(0, _dc), + download_session_count=(_dsc := randint(0, _dc)), + cumulative_download_count=_dc, + cumulative_download_session_count=_dsc, ) + if _prior_report: + _report.cumulative_view_count += _prior_report.cumulative_view_count + _report.cumulative_view_session_count += _prior_report.cumulative_view_session_count + _report.cumulative_download_count += _prior_report.cumulative_download_count + _report.cumulative_download_session_count += _prior_report.cumulative_download_session_count + _report.save() _ym = _ym.prior() + _prior_report = _report class Command(BaseCommand): diff --git a/osf/management/commands/make_dummy_pageviews_for_metrics.py b/osf/management/commands/make_dummy_pageviews_for_metrics.py deleted file mode 100644 index 09de34bf7a8..00000000000 --- a/osf/management/commands/make_dummy_pageviews_for_metrics.py +++ /dev/null @@ -1,118 +0,0 @@ -"""osf/management/commands/poke_metrics_timespan_queries.py -""" -import logging -import random -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import CountedAuthUsage - - -logger = logging.getLogger(__name__) - -TIME_FILTERS = ( - {'gte': 'now/d-150d'}, - {'gte': '2021-11-28T23:00:00.000Z', 'lte': '2023-01-16T00:00:00.000Z'}, -) - -PLATFORM_IRI = 'http://localhost:9201/' - -ITEM_GUID = 'foo' - - -class Command(BaseCommand): - - def add_arguments(self, parser): - parser.add_argument( - '--count', - type=int, - default=100, - help='number of fake pageviews to generate', - ) - parser.add_argument( - '--seconds_back', - type=int, - default=60 * 60 * 24 * 14, # up to two weeks back - help='max age in seconds of random event', - ) - - def handle(self, *args, **options): - self._generate_random_countedusage(options.get('count'), options.get('seconds_back')) - - results = [ - self._run_date_query(time_filter) - for time_filter in TIME_FILTERS - ] - - self._print_line( - (str(f) for f in TIME_FILTERS), - label='timefilter:', - ) - - date_keys = { - k - for r in results - for k in r - } - for date_key in sorted(date_keys): - self._print_line( - (r.get(date_key, 0) for r in results), - label=str(date_key), - ) - - def _print_line(self, lineitems, label=''): - print('\t'.join((label, *map(str, lineitems)))) - - def _generate_random_countedusage(self, n, max_age): - now = datetime.datetime.now(tz=datetime.UTC) - for _ in range(n): - seconds_back = random.randint(0, max_age) - timestamp_time = now - datetime.timedelta(seconds=seconds_back) - CountedAuthUsage.record( - platform_iri=PLATFORM_IRI, - timestamp=timestamp_time, - item_guid=ITEM_GUID, - session_id='freshen by key', - user_is_authenticated=bool(random.randint(0, 1)), - item_public=bool(random.randint(0, 1)), - action_labels=[['view', 'download'][random.randint(0, 1)]], - ) - - def _run_date_query(self, time_range_filter): - result = self._run_query({ - 'query': { - 'bool': { - 'filter': { - 'range': { - 'timestamp': time_range_filter, - }, - }, - }, - }, - 'aggs': { - 'by-date': { - 'date_histogram': { - 'field': 'timestamp', - 'interval': 'day', - }, - }, - 'max-timestamp': { - 'max': {'field': 'timestamp'}, - }, - 'min-timestamp': { - 'min': {'field': 'timestamp'}, - }, - }, - }) - return { - 'min': result.aggs['min-timestamp'].value, - 'max': result.aggs['max-timestamp'].value, - **{ - str(bucket.key.date()): bucket.doc_count - for bucket in result.aggs['by-date'] - }, - } - - def _run_query(self, query_dict): - analytics_search = CountedAuthUsage.search().update_from_dict(query_dict) - return analytics_search.execute() diff --git a/osf/management/commands/metrics_backfill_pageviews.py b/osf/management/commands/metrics_backfill_pageviews.py deleted file mode 100644 index 13898037923..00000000000 --- a/osf/management/commands/metrics_backfill_pageviews.py +++ /dev/null @@ -1,203 +0,0 @@ -"""osf/management/commands/metrics_backfill_pageviews.py - -Usage: - - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --dry # dry run - $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --resume-from 1264 # start from record 1264 - - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import CountedAuthUsage -from osf.models import Guid - -logger = logging.getLogger(__name__) - -def main(source, dry_run=False, resume_from=None): - if not source: - logger.info('No source file detected, exiting.') - return - - # keen.timestamp => _source.timestamp # "2023-01-19T04:06:45.675432+00:00", - # page.info.protocol + page.info.domain => _source.platform_iri # "http://localhost:5000/", - # visitor.session => _source.session_id # "fcae918a3b6a19641bd0087f84083f0d57982d8c93ab821c405561d1b5c7b305", - # user.id => _source.user_is_authenticated # true, - # page.url => _source.pageview_info.page_url # "http://localhost:5000/my-projects/", - # page.title => _source.pageview_info.page_title # "OSF | My Projects", - # referrer.url => _source.pageview_info.referer_url # "http://localhost:5000/csab4/analytics", - # page.meta.routeName => _source.pageview_info.route_name # "OsfWebRenderer.my_projects", - # time.utc.hour_of_day => _source.pageview_info.hour_of_day # 4, - # page.info.path => _source.pageview_info.page_path # "/my-projects", - # referrer.info.domain => _source.pageview_info.referer_domain # "localhost:5000" - # page.meta.public => _source.item_public # true, - # node.id => _source.item_guid # "ry7dn", - - # ??? => _source.provider_id # "osf", - # ??? => _source.item_type # "node" - # ??? => _source.surrounding_guids = # [parent_guids?] - # ??? => _source.action_labels # ["web"] - - count = 0 - reader = csv.DictReader(source) - for row in reader: - if not row['page.url'].startswith('https://staging.osf.io'): - continue - - count += 1 - if resume_from is not None and count < resume_from: - continue - - something_wonderful = { - 'timestamp': _timestamp_to_dt(row['keen.timestamp']), - 'platform_iri': row['page.info.protocol'] + '://' + row['page.info.domain'], - 'session_id': row['visitor.session'], - 'user_is_authenticated': row['user.id'] is not None, - 'item_guid': row['node.id'], - 'item_public': row['page.meta.public'] or row['page.meta.pubic'], # unfortunate misspelling - 'pageview_info': { - 'hour_of_day': row['time.utc.hour_of_day'], - 'page_path': row['page.info.path'], - 'page_title': row['page.title'], - 'page_url': row['page.url'], - 'referer_url': row['referrer.url'], - 'referer_domain': row['referrer.info.domain'], - 'route_name': row['page.meta.routeName'], - }, - } - - db_info = annotate_from_db(row) - if db_info: - something_wonderful.update(db_info) - populate_action_labels(something_wonderful, row) - - logger.info(f'*** {count}: something wonderful:({something_wonderful})') - - if not dry_run: - CountedAuthUsage.record(**something_wonderful) - -def populate_action_labels(something_wonderful, row): - labels = ['web'] - - if row['page.info.path']: - path_parts = row['page.info.path'].split('/') - if len(path_parts) == 1 and path_parts[0] not in ('my-projects', 'goodbye', 'login'): - labels.append('view') - elif path_parts[1] in ('wiki'): - labels.append('view') - - if row['page.meta.routeName']: - route_name = row['page.meta.routeName'] - if 'search' in route_name: - labels.append('search') - - something_wonderful['action_labels'] = labels - -guid_cache = {} -# this may be done by CountedAuthUsage._fill_osfguid_info -def annotate_from_db(row): - item_guid = row['node.id'] - if not item_guid: - return - - if not guid_cache.get(item_guid, None): - guid_info = {} - guid_instance = Guid.load(item_guid) - - if guid_instance and guid_instance.referent: - guid_info = _fill_osfguid_info(guid_instance.referent) - guid_cache[item_guid] = guid_info - - return guid_cache[item_guid] - -# from CountedAuthUsage -def _fill_osfguid_info(guid_referent): - guid_info = {} - guid_info['item_public'] = _get_ispublic(guid_referent) - guid_info['item_type'] = type(guid_referent).__name__.lower() - guid_info['surrounding_guids'] = _get_surrounding_guids(guid_referent) - guid_info['provider_id'] = _get_provider_id(guid_referent) - return guid_info - -def _get_ispublic(guid_referent): - # if it quacks like BaseFileNode, look at .target instead - maybe_public = getattr(guid_referent, 'target', None) or guid_referent - if hasattr(maybe_public, 'verified_publishable'): - return maybe_public.verified_publishable # quacks like Preprint - return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode - -def _get_provider_id(guid_referent): - provider = getattr(guid_referent, 'provider', None) - if isinstance(provider, str): - return provider # quacks like BaseFileNode - elif provider: - return provider._id # quacks like Registration, Preprint, Collection - return 'osf' # quacks like Node, Comment, WikiPage - -def _get_immediate_wrapper(guid_referent): - if hasattr(guid_referent, 'verified_publishable'): - return None # quacks like Preprint - return ( - getattr(guid_referent, 'parent_node', None) # quacks like AbstractNode - or getattr(guid_referent, 'node', None) # quacks like WikiPage, Comment - or getattr(guid_referent, 'target', None) # quacks like BaseFileNode - ) - -def _get_surrounding_guids(guid_referent): - """get all the parent/owner/surrounding guids for the given guid_referent - - @param guid_referent: instance of a model that has GuidMixin - @returns list of str - - For AbstractNode, goes up the node hierarchy up to the root. - For WikiPage or BaseFileNode, grab the node it belongs to and - follow the node hierarchy from there. - """ - surrounding_guids = [] - current_referent = guid_referent - while current_referent: - next_referent = _get_immediate_wrapper(current_referent) - if next_referent: - surrounding_guids.append(next_referent._id) - current_referent = next_referent - return surrounding_guids - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC) - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return str(dt_obj.date()) - - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=open, - help='source file (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - resume_from = options.get('resume_from', None) - main(source, dry_run, resume_from) diff --git a/osf/management/commands/metrics_backfill_summaries.py b/osf/management/commands/metrics_backfill_summaries.py deleted file mode 100644 index d259e9b2a52..00000000000 --- a/osf/management/commands/metrics_backfill_summaries.py +++ /dev/null @@ -1,435 +0,0 @@ -"""osf/management/commands/metrics_backfill_summaries.py - -usage: - - $ dc-manage metrics_backfill_summaries --which=$which_metric --source=$path_to_csv - -where ``$which_metric`` is one of: - - file_summary - download_count - preprint_summary - institution_summary - user_summary - node_summary - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import ( - DownloadCountReport, - InstitutionSummaryReport, - # NewUserDomainReport, - NodeSummaryReport, - OsfstorageFileCountReport, - PreprintSummaryReport, - # StorageAddonUsage, - UserSummaryReport, -) - - -logger = logging.getLogger(__name__) - - -def main(source, which, dry_run=False, resume_from=None): - if which not in SUMMARIES: - logger.info(f'No such summary, {which}, exiting.') - return - - if not source: - logger.info('No path to source data file, exiting.') - return - - summary_meta = SUMMARIES[which] - - logger.info('Kicking off...') - with open(source) as csvfile: - reader = csv.DictReader(csvfile) - - count = 0 - for row in reader: - count += 1 - if resume_from is not None and count < resume_from: - continue - - something_wonderful = summary_meta['mapper'](row) - logger.info(f'{count}: transformed:({something_wonderful})') - if not dry_run: - summary_meta['class'].record(**something_wonderful) - - logger.info('All done!') - if which == 'preprint_summary': - logger.error(f'Unrecognized provider names: ({bogus_preprints})') - - -def _map_download_count(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:58:38.041721+00:00" - # files.total => _source.daily_file_downloads # 0, - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'daily_file_downloads': int(row['files.total']), - } - -def _map_file_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:04.397056+00:00" - # osfstorage_files.private => _source.files.private # 12146, - # osfstorage_files.total_daily => _source.files.total_daily # 0, - # osfstorage_files.public_daily => _source.files.public_daily # 0, - # osfstorage_files.private_daily => _source.files.private_daily # 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'files': { - 'total': int(row['osfstorage_files.total']), - 'public': int(row['osfstorage_files.public']), - 'private': int(row['osfstorage_files.private']), - 'total_daily': int(row['osfstorage_files.total_daily']), - 'public_daily': int(row['osfstorage_files.public_daily']), - 'private_daily': int(row['osfstorage_files.private_daily']), - }, - } - - -def _map_institution_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created => _source.timestamp # "2023-01-02T14:59:01.706319+00:00" - # institution.id => _source.institution_id # "okstate", - # institution.name => _source.institution_name # "Oklahoma State University [Test]", - # ### => _source.users # {} - # users.total => _source.total # 0, - # users.total_daily => _source.total_daily # 0 - # ### => _source.nodes # {} - # nodes.total => _source.nodes.total": 0, - # nodes.public => _source.nodes.public": 0, - # nodes.private => _source.nodes.private": 0, - # nodes.total_daily => _source.nodes.total_daily": 0, - # nodes.public_daily => _source.nodes.public_daily": 0, - # nodes.private_daily => _source.nodes.private_daily": 0 - # ### => _source.projects # {} - # projects.total => _source.projects.total": 0, - # projects.public => _source.projects.public": 0, - # projects.private => _source.projects.private": 0, - # projects.total_daily => _source.projects.total_daily": 0, - # projects.public_daily => _source.projects.public_daily": 0, - # projects.private_daily => _source.projects.private_daily": 0 - # ### => _source.registered_nodes # {} - # registered_nodes.total => _source.registered_nodes.total": 0, - # registered_nodes.public => _source.registered_nodes.public": 0, - # registered_nodes.embargoed => _source.registered_nodes.embargoed": 0, - # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2": 0, - # registered_nodes.total_daily => _source.registered_nodes.total_daily": 0, - # registered_nodes.public_daily => _source.registered_nodes.public_daily": 0, - # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily": 0, - # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily": 0 - # ### => _source.registered_projects # {} - # registered_projects.total => _source.registered_projects.total": 0, - # registered_projects.public => _source.registered_projects.public": 0, - # registered_projects.embargoed => _source.registered_projects.embargoed": 0, - # registered_projects.embargoed_v2 => _source.registered_projects.embargoed_v2": 0, - # registered_projects.total_daily => _source.registered_projects.total_daily": 0, - # registered_projects.public_daily => _source.registered_projects.public_daily": 0, - # registered_projects.embargoed_daily => _source.registered_projects.embargoed_daily": 0, - # registered_projects.embargoed_v2_daily => _source.registered_projects.embargoed_v2_daily": 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'institution_id': row['institution.id'], - 'institution_name': row['institution.name'], - 'users': { - 'total': int(row['users.total']), - 'total_daily': int(row['users.total_daily'] or 0), - }, - 'nodes': { - 'total': int(row['nodes.total']), - 'public': int(row['nodes.public']), - 'private': int(row['nodes.private']), - 'total_daily': int(row['nodes.total_daily'] or 0), - 'public_daily': int(row['nodes.public_daily'] or 0), - 'private_daily': int(row['nodes.private_daily'] or 0), - }, - 'projects': { - 'total': int(row['projects.total']), - 'public': int(row['projects.public']), - 'private': int(row['projects.private']), - 'total_daily': int(row['projects.total_daily'] or 0), - 'public_daily': int(row['projects.public_daily'] or 0), - 'private_daily': int(row['projects.private_daily'] or 0), - }, - 'registered_nodes': { - 'total': int(row['registered_nodes.total']), - 'public': int(row['registered_nodes.public']), - 'embargoed': int(row['registered_nodes.embargoed']), - 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0), - 'total_daily': int(row['registered_nodes.total_daily'] or 0), - 'public_daily': int(row['registered_nodes.public_daily'] or 0), - 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0), - }, - 'registered_projects': { - 'total': int(row['registered_projects.total']), - 'public': int(row['registered_projects.public']), - 'embargoed': int(row['registered_projects.embargoed']), - 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0), - 'total_daily': int(row['registered_projects.total_daily'] or 0), - 'public_daily': int(row['registered_projects.public_daily'] or 0), - 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0), - }, - } - -def _map_node_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:03.886999+00:00" - # ### => _source.nodes # {} - # nodes.total => _source.nodes.total # 58, - # nodes.total_excluding_spam => _source.nodes.total_excluding_spam # 58, - # nodes.public => _source.nodes.public # 14, - # nodes.private => _source.nodes.private # 44, - # nodes.total_daily => _source.nodes.total_daily # 0, - # nodes.total_daily_excluding_spam => _source.nodes.total_daily_excluding_spam # 0, - # nodes.public_daily => _source.nodes.public_daily # 0, - # nodes.private_daily => _source.nodes.private_daily # 0 - # ### => _source.projects # {} - # projects.total => _source.projects.total # 53, - # projects.total_excluding_spam => _source.projects.total_excluding_spam # 53, - # projects.public => _source.projects.public # 14, - # projects.private => _source.projects.private # 39, - # projects.total_daily => _source.projects.total_daily # 0, - # projects.total_daily_excluding_spam => _source.projects.total_daily_excluding_spam # 0, - # projects.public_daily => _source.projects.public_daily # 0, - # projects.private_daily => _source.projects.private_daily # 0 - # ### => _source.registered_nodes # {} - # registered_nodes.total => _source.registered_nodes.total # 10, - # registered_nodes.public => _source.registered_nodes.public # 9, - # registered_nodes.embargoed => _source.registered_nodes.embargoed # 1, - # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2 # 0, - # registered_nodes.withdrawn => _source.registered_nodes.withdrawn # 0, - # registered_nodes.total_daily => _source.registered_nodes.total_daily # 0, - # registered_nodes.public_daily => _source.registered_nodes.public_daily # 0, - # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily # 0, - # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily # 0, - # registered_nodes.withdrawn_daily => _source.registered_nodes.withdrawn_daily # 0 - # ### => _source.registered_projects # {} - # registered_projects.total => _source.registered_projects."total # 10, - # registered_projects.public => _source.registered_projects."public # 9, - # registered_projects.embargoed => _source.registered_projects."embargoed # 1, - # registered_projects.embargoed_v2 => _source.registered_projects."embargoed_v2 # 0, - # registered_projects.withdrawn => _source.registered_projects."withdrawn # 0, - # registered_projects.total_daily => _source.registered_projects."total_daily # 0, - # registered_projects.public_daily => _source.registered_projects."public_daily # 0, - # registered_projects.embargoed_daily => _source.registered_projects."embargoed_daily # 0, - # registered_projects.embargoed_v2_daily => _source.registered_projects."embargoed_v2_daily # 0, - # registered_projects.withdrawn_daily => _source.registered_projects."withdrawn_daily # 0 - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'nodes': { - 'total': int(row['nodes.total'] or 0), - 'total_excluding_spam': int(row['nodes.total_excluding_spam'] or 0), - 'public': int(row['nodes.public'] or 0), - 'private': int(row['nodes.private'] or 0), - 'total_daily': int(row['nodes.total_daily'] or 0), - 'total_daily_excluding_spam': int(row['nodes.total_daily_excluding_spam'] or 0), - 'public_daily': int(row['nodes.public_daily'] or 0), - 'private_daily': int(row['nodes.private_daily'] or 0), - }, - 'projects': { - 'total': int(row['projects.total']), - 'total_excluding_spam': int(row['projects.total_excluding_spam'] or 0), - 'public': int(row['projects.public'] or 0), - 'private': int(row['projects.private'] or 0), - 'total_daily': int(row['projects.total_daily'] or 0), - 'total_daily_excluding_spam': int(row['projects.total_daily_excluding_spam'] or 0), - 'public_daily': int(row['projects.public_daily'] or 0), - 'private_daily': int(row['projects.private_daily'] or 0), - }, - 'registered_nodes': { - 'total': int(row['registered_nodes.total'] or 0), - 'public': int(row['registered_nodes.public'] or 0), - 'embargoed': int(row['registered_nodes.embargoed'] or 0), - 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0), - 'withdrawn': int(row['registered_nodes.withdrawn'] or 0), - 'total_daily': int(row['registered_nodes.total_daily'] or 0), - 'public_daily': int(row['registered_nodes.public_daily'] or 0), - 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0), - 'withdrawn_daily': int(row['registered_nodes.withdrawn_daily'] or 0), - }, - 'registered_projects': { - 'total': int(row['registered_projects.total'] or 0), - 'public': int(row['registered_projects.public'] or 0), - 'embargoed': int(row['registered_projects.embargoed'] or 0), - 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0), - 'withdrawn': int(row['registered_projects.withdrawn'] or 0), - 'total_daily': int(row['registered_projects.total_daily'] or 0), - 'public_daily': int(row['registered_projects.public_daily'] or 0), - 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0), - 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0), - 'withdrawn_daily': int(row['registered_projects.withdrawn_daily'] or 0), - }, - } - - -preprint_name_map = { - 'AfricArXiv': 'africarxiv', - 'AgriXiv': 'agrixiv', - 'Arabixiv': 'arabixiv', - 'BioHackrXiv': 'biohackrxiv', - 'BITSS': 'metaarxiv', - 'BodoArXiv': 'bodoarxiv', - 'coppreprints': 'coppreprints', - 'EarthArXiv': 'eartharxiv', - 'EcoEvoRxiv': 'ecoevorxiv', - 'ECSarXiv': 'ecsarxiv', - 'EdArXiv': 'edarxiv', - 'engrXiv': 'engrxiv', - 'FocUS Archive': 'focusarchive', - 'Frenxiv': 'frenxiv', - 'INA-Rxiv': 'inarxiv', - 'IndiaRxiv': 'indiarxiv', - 'LawArXiv': 'lawarxiv', - 'LIS Scholarship Archive': 'lissa', - 'LiveData': 'livedata', - 'Research AZ': 'livedata', - 'MarXiv': 'marxiv', - 'MedArXiv': 'medarxiv', - 'MediArXiv': 'mediarxiv', - 'MetaArXiv': 'metaarxiv', - 'MindRxiv': 'mindrxiv', - 'NutriXiv': 'nutrixiv', - 'Open Science Framework': 'osf', - 'PaleorXiv': 'paleorxiv', - 'PsyArXiv': 'psyarxiv', - 'SocArXiv': 'socarxiv', - 'SportRxiv': 'sportrxiv', - 'Thesis Commons': 'thesiscommons', - 'Vulnerability Assessment Testing': 'vulnerabilityassessmenttesting', -} -preprint_long_names = list(preprint_name_map.keys()) -preprint_short_names = list(preprint_name_map.values()) -bogus_preprints = {} -def _map_preprint_summary(row): - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00" - # provider.name => _source.provider_key # "psyarxiv", - # provider.total => _source.preprint_count # 0, - - # normalize provider names: we used to store the formal name, now we store the short name - provider_key = None - provider_name = row['provider.name'] - if provider_name in preprint_short_names: - provider_key = provider_name - elif provider_name in preprint_long_names: - provider_key = preprint_name_map[provider_name] - else: - logger.error(f'Unrecognized preprint provider name: ({provider_name})') - if provider_name not in bogus_preprints: - bogus_preprints[provider_name] = 0 - bogus_preprints[provider_name] += 1 - provider_key = provider_name # oh well - - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'provider_key': provider_key, - 'preprint_count': int(row['provider.total']), - } - -def _map_user_summary(row): - # date(keen.timestamp) => _source.report_date # "2023-01-03", - # keen.created_at => _source.timestamp # "2023-01-04T13:47:34.216419+00:00" - # status.active => _source.active # 7, - # status.deactivated => _source.deactivated # 0, - # status.merged => _source.merged # 0, - # status.new_users_daily => _source.new_users_daily # 0, - # status.new_users_with_institution_daily => _source.new_users_with_institution_daily # 0, - # status.unconfirmed => _source.unconfirmed # 0, - return { - 'report_date': _timestamp_to_date(row['keen.timestamp']), - 'timestamp': _timestamp_to_dt(row['keen.created_at']), - 'active': int(row['status.active']), - 'deactivated': int(row['status.deactivated'] or 0), - 'merged': int(row['status.merged'] or 0), - 'new_users_daily': int(row['status.new_users_daily'] or 0), - 'new_users_with_institution_daily': int(row['status.new_users_with_institution_daily'] or 0), - 'unconfirmed': int(row['status.unconfirmed'] or 0), - } - -SUMMARIES = { - 'download_count': { - 'mapper': _map_download_count, - 'class': DownloadCountReport, - }, - 'file_summary': { - 'mapper': _map_file_summary, - 'class': OsfstorageFileCountReport, - }, - 'institution_summary': { - 'mapper': _map_institution_summary, - 'class': InstitutionSummaryReport, - }, - 'node_summary': { - 'mapper': _map_node_summary, - 'class': NodeSummaryReport, - }, - 'preprint_summary': { - 'mapper': _map_preprint_summary, - 'class': PreprintSummaryReport, - }, - 'user_summary': { - 'mapper': _map_user_summary, - 'class': UserSummaryReport, - }, -} - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ') - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return dt_obj.date() - - -def _dt_to_date(dt): - dt_obj = datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S.%fZ') - return str(dt_obj.date()) - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=str, - help='source file path (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--which', - type=str, - help='which metric summary this data is for' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - which = options.get('which', None) - resume_from = options.get('resume_from', None) - main(source, which, dry_run, resume_from) diff --git a/osf/management/commands/metrics_backfill_user_domains.py b/osf/management/commands/metrics_backfill_user_domains.py deleted file mode 100644 index 685dd55243e..00000000000 --- a/osf/management/commands/metrics_backfill_user_domains.py +++ /dev/null @@ -1,130 +0,0 @@ -"""osf/management/commands/metrics_backfill_user_domains.py - -Usage: - - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --dry # dry run - $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --resume-from 1264 # start from record 1264 - - -""" -import csv -import logging -import datetime - -from django.core.management.base import BaseCommand -from osf.metrics import NewUserDomainReport - -logger = logging.getLogger(__name__) - -def main(source, dry_run=False, resume_from=None): - if not source: - logger.info('No source file detected, exiting.') - return - - # new user domains report is weird, b/c old data needs to be aggregated by date & domain - - count = 0 - reader = csv.DictReader(source) - tally = {} - this_year = None - for row in reader: - count += 1 - if resume_from is not None and count < resume_from: - continue - - logger.info(f'count:({count}) this_year:({this_year})') - - event_ts = _timestamp_to_dt(row['keen.timestamp']) - event_date = event_ts.date() - event_date_str = str(event_date) - - if this_year is None: - logger.info(' >>> setting new year') - this_year = event_date.year - - if this_year != event_date.year: - # we've built up a year of data; commit and clear - logger.info(' >>> year is up, committing data') - _upload_data_and_purge(tally, dry_run) - this_year = event_date.year - logger.info(' >>> data committed, new year is:({}) and tally should be ' - 'empty:({})'.format(this_year, tally)) - - if event_date_str not in tally: - tally[event_date_str] = { - 'timestamp': event_ts, - 'report_date': event_date, - 'domains': {}, - } - - domain = row['domain'] - if domain not in tally[event_date_str]['domains']: - tally[event_date_str]['domains'][domain] = 0 - tally[event_date_str]['domains'][domain] += 1 - - _upload_data_and_purge(tally, dry_run) - - -def _upload_data_and_purge(tally, dry_run): - for event_date_str, record in tally.items(): - for domain, count in record['domains'].items(): - - # date(keen.timestamp) => _source.report_date # "2022-12-30", - # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00" - # domain => _source.domain_name # metrics.Keyword() - # count_agg(domain) => _source.new_user_count # metrics.Integer() - - something_wonderful = { - 'timestamp': record['timestamp'], - 'report_date': record['report_date'], - 'domain_name': domain, - 'new_user_count': count, - } - - logger.info(f' *** {event_date_str}::{domain}::{count}') - logger.info(' *** {}::{}: something wonderful:({})'.format(event_date_str, domain, - something_wonderful)) - - if not dry_run: - NewUserDomainReport.record(**something_wonderful) - - # purge tally - tally.clear() - - -def _timestamp_to_dt(timestamp): - return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC) - -def _timestamp_to_date(timestamp): - dt_obj = _timestamp_to_dt(timestamp) - return str(dt_obj.date()) - - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--source', - type=open, - help='source file (csv format w/ header line)', - ) - parser.add_argument( - '--dry', - dest='dry', - action='store_true', - help='Dry run' - ) - parser.add_argument( - '--resume-from', - dest='resume_from', - type=int, - help='start from which record', - ) - - def handle(self, *args, **options): - dry_run = options.get('dry', None) - source = options.get('source', None) - resume_from = options.get('resume_from', None) - main(source, dry_run, resume_from) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py deleted file mode 100644 index 49396d36ba3..00000000000 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ /dev/null @@ -1,915 +0,0 @@ -import collections -import datetime -import functools -import heapq -import itertools -import logging - -from django.apps import apps -from django.core.management.base import BaseCommand -from django.db import OperationalError as DjangoOperationalError -from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError -from elasticsearch6 import helpers as es6_helpers -from elasticsearch6_dsl.connections import connections as es6_connections -from elasticsearch8.exceptions import TransportError as Elastic8TransportError -from elasticsearch8.helpers import BulkIndexError as Elastic8BulkIndexError -from elasticsearch_metrics.registry import djelme_registry -from elasticsearch_metrics.imps import elastic8 as djel8me -from psycopg2 import OperationalError as PostgresOperationalError - -from framework.celery_tasks import app as celery_app -from osf.metadata.rdfutils import OSF -from osf.metadata.osfmap_utils import is_osf_component -from osf.metrics.preprint_metrics import ( - PreprintView, - PreprintDownload, -) -from osf.metrics.counted_usage import ( - CountedAuthUsage as CountedUsageEs6, - get_provider_id, -) -from osf.metrics import reports as es6_reports -from osf.metrics import es8_metrics, RegistriesModerationMetrics -from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys -from osf.metrics.utils import ( - YearMonth, - get_database_iri, - get_item_type, - get_item_type_from_model, - get_item_type_from_iri, -) -from osf import models as osfdb -from osf.models.base import osfid_iri -from website import settings as website_settings - - -_logger = logging.getLogger(__name__) - -### -# constants - -_USAGE_DAYS_BACK = 99 - -_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control - -_COMPOSITE_CHUNK_SIZE = 500 - -_UNCHANGED_RECORDTYPES = { - # reports - es6_reports.StorageAddonUsage: es8_metrics.DailyStorageAddonUsageReportEs8, - es6_reports.DownloadCountReport: es8_metrics.DailyDownloadCountReportEs8, - es6_reports.InstitutionSummaryReport: es8_metrics.DailyInstitutionSummaryReportEs8, - es6_reports.NewUserDomainReport: es8_metrics.DailyNewUserDomainReportEs8, - es6_reports.NodeSummaryReport: es8_metrics.DailyNodeSummaryReportEs8, - es6_reports.OsfstorageFileCountReport: es8_metrics.DailyOsfstorageFileCountReportEs8, - es6_reports.PreprintSummaryReport: es8_metrics.DailyPreprintSummaryReportEs8, - es6_reports.UserSummaryReport: es8_metrics.DailyUserSummaryReportEs8, - es6_reports.SpamSummaryReport: es8_metrics.MonthlySpamSummaryReportEs8, - es6_reports.InstitutionalUserReport: es8_metrics.MonthlyInstitutionalUserReportEs8, - es6_reports.InstitutionMonthlySummaryReport: es8_metrics.MonthlyInstitutionSummaryReportEs8, - es6_reports.PrivateSpamMetricsReport: es8_metrics.MonthlyPrivateSpamMetricsReportEs8, - # events - RegistriesModerationMetrics: es8_metrics.RegistriesModerationEventEs8, -} - -_TASK_KWARGS = dict( - autoretry_for=( - DjangoOperationalError, - Elastic6ConnectionError, - Elastic8TransportError, - PostgresOperationalError, - ), - retry_backoff=True, # exponential backoff, with jitter - max_retries=20, -) - -### -# celery tasks - - -@celery_app.task(**_TASK_KWARGS) -def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): - _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name) - _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] - _convert_kwargs = ( - _convert_unchanged_cyclicrecord_kwargs - if issubclass(_es8_recordtype, djel8me.CyclicRecord) - else (lambda _kw: _kw) # no conversion needed for event record - ) - _each_new = ( - _es8_recordtype(**_convert_kwargs(_hit['_source'])) - for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) - ) - _es8_bulk_save(_es8_recordtype, _each_new) - - -@celery_app.task(**_TASK_KWARGS) -def migrate_counted_usages(from_when: str, until_when: str): - # CountedAuthUsage => OsfCountedUsageEvent - _each_new = ( - _convert_counted_usage(_hit['_source']) - for _hit in _es6_scan_range( - CountedUsageEs6, - from_when=from_when, - until_when=until_when, - addl_filter={'exists': {'field': 'item_guid'}}, - ) - ) - _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) - - -@celery_app.task(**_TASK_KWARGS) -def migrate_preprint_views(from_when: str, until_when: str): - # PreprintView => OsfCountedUsageEvent - _action_labels = ['view', 'web'] - _each_new = ( - _convert_preprint_metric(_hit, _action_labels) - for _hit in _es6_scan_range( - PreprintView, from_when=from_when, until_when=until_when - ) - ) - _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) - - -@celery_app.task(**_TASK_KWARGS) -def migrate_preprint_downloads(from_when: str, until_when: str): - # PreprintDownload => OsfCountedUsageEvent - _action_labels = ['download'] - _each_new = ( - _convert_preprint_metric(_hit, _action_labels) - for _hit in _es6_scan_range( - PreprintDownload, from_when=from_when, until_when=until_when - ) - ) - _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) - - -@celery_app.task(**_TASK_KWARGS) -def schedule_migrate_usage_reports(until_when: str): - for _osfid in _merge_sorted_osfids( - _each_usage_report_osfid(until_when=until_when), - _each_countedusage_osfid(until_when=until_when), - _each_preprintview_osfid(until_when=until_when), - _each_preprintdownload_osfid(until_when=until_when), - ): - migrate_usage_reports.delay(_osfid, until_when) - - -@celery_app.task(**_TASK_KWARGS) -def migrate_usage_reports(osfid: str, until_when: str): - # from PublicItemUsageReport to MonthlyPublicItemUsageReportEs8 - _osfobj, _ = osfdb.Guid.load_referent(osfid) - _item_is_component = is_osf_component(_osfobj) if _osfobj else False - - def _each_new(): - _each_hit = _es6_scan_range( - es6_reports.PublicItemUsageReport, - until_when=until_when, - addl_filter={'terms': {'item_osfid': _synonymous_osfids(osfid)}}, - ) - # (only a few dozen of these per item; should be fine to load all at once) - _hits = list(_each_hit) - if _osfobj and not _hits: - # this item has usages, but only before the monthly usage reparts started - # -- create one for cumulative counts (if the object still exists) - yield _backfill_old_usage_report(_osfobj, _item_is_component, until_when) - else: - for _hit in _hits: - yield _convert_public_usage_report( - _hit['_source'], - item_is_component=_item_is_component, - ) - - _es8_bulk_save(es8_metrics.MonthlyPublicItemUsageReportEs8, _each_new()) - - -### -# various helper functions - - -def _es6_connection(): - return es6_connections.get_connection('osfmetrics_es6') - - -def _es8_bulk_save(es8_recordtype, each_new_record): - try: - es8_recordtype.bulk(each_new_record, stats_only=True) - except Elastic8BulkIndexError as _bulk_error: - # so actual errors show in celery task result - raise Exception(_bulk_error.errors) from _bulk_error - - -def _date_range( - range_start: datetime.date, - range_end: datetime.date, - step: datetime.timedelta = datetime.timedelta(days=1), -) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]: - _from_date = range_start - _until_date = range_start + step - while _from_date < range_end: - yield (_from_date, _until_date) - (_from_date, _until_date) = (_until_date, _until_date + step) - - -def _es6_scan_range( - es6_recordtype, - *, - from_when: str = '', - until_when: str, - addl_filter=None, -): - _timestamp_range = {'lt': until_when} - if from_when: - _timestamp_range['gte'] = from_when - _filters = [ - {'range': {'timestamp': _timestamp_range}}, - ] - if addl_filter: - _filters.append(addl_filter) - _query_body = {'query': {'bool': {'filter': _filters}}} - return es6_helpers.scan( - _es6_connection(), - index=es6_recordtype._template_pattern, - query=_query_body, - ) - - -def _es6_usage_report_counts() -> tuple[int, int]: - _search = es6_reports.PublicItemUsageReport.search() - _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfid', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _total_count = _response.hits.total - _item_count = ( - _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations - else 0 - ) - return (_total_count, _item_count) - - -def _es8_usage_report_counts() -> tuple[int, int]: - _search = es8_metrics.MonthlyPublicItemUsageReportEs8.search() - _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfids', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _total_count = _response.hits.total.value - _item_count = ( - _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations - else 0 - ) - return (_total_count, _item_count) - - -def _get_es6_field_names(es6_recordtype): - ''' - adapted from DocumentBase._get_field_names in elasticsearch8.dsl - ''' - for _field_name in es6_recordtype._doc_type.mapping: - _field = es6_recordtype._doc_type.mapping[_field_name] - if hasattr(_field, '_doc_class'): - for _sub_field in _get_es6_field_names(_field._doc_class): - yield f'{_field_name}.{_sub_field}' - else: - yield _field_name - - -def _assert_field_unchangedness(es6_recordtype, es8_recordtype): - _es6_fields = set(_get_es6_field_names(es6_recordtype)) - _es8_fields = set(es8_recordtype._get_field_names()) - - # remove fields intentionally removed in migration - if issubclass(es6_recordtype, es6_reports.DailyReport): - assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove('timestamp') - _es6_fields.remove('report_date') - elif issubclass(es6_recordtype, es6_reports.MonthlyReport): - assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove('timestamp') - _es6_fields.remove('report_yearmonth') - else: - assert issubclass(es8_recordtype, djel8me.EventRecord) - - # remove fields intentionally added in migration - _es8_fields.remove('timeseries_timeparts') - if issubclass(es8_recordtype, djel8me.CyclicRecord): - _es8_fields.remove('created') - _es8_fields.remove('cycle_coverage') - - # all remaining fields should match - assert _es6_fields == _es8_fields - - -def _semverish_from_yearmonth(given_yearmonth): - _ym = YearMonth.from_any(given_yearmonth) - return f'{_ym.year}.{_ym.month}' - - -def _semverish_from_date(given_date: str): - _d = datetime.date.fromisoformat(given_date) - return f'{_d.year}.{_d.month}.{_d.day}' - - -def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: - def _each_kwarg(): - for _key, _val in es6_source.items(): - if _key == 'report_yearmonth': - # report_yearmonth converts to cycle_coverage Y.M - yield ('cycle_coverage', _semverish_from_yearmonth(_val)) - elif _key == 'report_date': - # report_date converts to cycle_coverage Y.M.D - yield ('cycle_coverage', _semverish_from_date(_val)) - elif _key != 'timestamp': - # skipping timestamp; on daily/monthly reports just copied from yearmonth/date - yield (_key, _val) - - return dict(_each_kwarg()) - - -def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageEvent: - return es8_metrics.OsfCountedUsageEvent( - # fields from djelme.CountedUsageRecord: - timestamp=source['timestamp'], - sessionhour_id=source['session_id'], - platform_iri=source.get('platform_iri') or website_settings.DOMAIN, - database_iri=_convert_database_iri( - provider_id=source.get('provider_id'), - osf_model_name=source.get('item_type'), - ), - within_iris=[ - osfid_iri(_within_osfid) - for _within_osfid in source.get('surrounding_guids', ()) - ], - # fields from OsfCountedUsageEvent: - item_osfid=source['item_guid'], - item_type=_convert_item_type( - source.get('item_type'), - has_surrounding_items=bool(source.get('surrounding_guids')), - ), - item_public=source.get('item_public', True), - provider_id=source.get('provider_id', 'osf'), - user_is_authenticated=source.get('user_is_authenticated', False), - action_labels=source.get('action_labels'), - pageview_info=source.get('pageview_info'), - ) - - -def _convert_preprint_metric( - hit: dict, action_labels: list[str] -) -> es8_metrics.OsfCountedUsageEvent: - _source = hit['_source'] - _doc_id = hit['_id'] - return es8_metrics.OsfCountedUsageEvent.record( - using=False, # don't save yet; will save in bulk - # fields used to compute a sessionhour_id: - timestamp=datetime.datetime.fromisoformat(_source['timestamp']), - user_id=_source.get('user_id'), - client_session_id=_doc_id, # unique session per event (best can do) - # fields from djelme.CountedUsageRecord: - platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri( - provider_id=_source.get('provider_id'), - osf_model_name='preprint', - ), - # fields from OsfCountedUsageEvent: - item_osfid=_source['preprint_id'], - item_type='Preprint', - item_public=True, - provider_id=_source.get('provider_id'), - user_is_authenticated=bool(_source.get('user_id')), - action_labels=action_labels, - ) - - -def _convert_public_usage_report( - source: dict, - item_is_component: bool, -) -> es8_metrics.MonthlyPublicItemUsageReportEs8: - _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=source['item_osfid'], - until_when=YearMonth.from_str(source['report_yearmonth']).month_end(), - is_preprint=('preprint' in source.get('item_type', ())), - ) - return es8_metrics.MonthlyPublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), - item_iri=osfid_iri(source['item_osfid']), - item_osfids=[source['item_osfid']], - item_types=_convert_item_type_list( - source.get('item_type', []), - has_surrounding_items=item_is_component, - ), - database_iris=_convert_database_iri_list( - provider_ids=source.get('provider_id', []), - osf_model_names=source.get('item_type', []), - ), - provider_ids=source.get('provider_id'), - platform_iris=source.get('platform_iri') or [website_settings.DOMAIN], - view_count=source.get('view_count', 0), - view_session_count=source.get('view_session_count') or source.get('view_count', 0), - cumulative_view_count=_c_views, - cumulative_view_session_count=_c_view_sess or _c_views, - download_count=source.get('download_count', 0), - download_session_count=source.get('download_session_count') or source.get('download_count', 0), - cumulative_download_count=_c_downloads, - cumulative_download_session_count=_c_download_sess or _c_downloads, - ) - - -def _backfill_old_usage_report(osf_obj, is_component: bool, until_when: str): - # add a "last month" report with cumulative counts up to that point - _last_month = YearMonth.from_date(datetime.datetime.fromisoformat(until_when)).prior() - _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=osf_obj._id, - until_when=_last_month.month_end().isoformat(), - is_preprint=isinstance(osf_obj, osfdb.Preprint), - ) - return es8_metrics.MonthlyPublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(_last_month), - item_iri=osfid_iri(osf_obj._id), - item_osfids=[osf_obj._id], - item_types=[get_item_type(osf_obj)], - provider_ids=[get_provider_id(osf_obj)], - database_iris=[get_database_iri(osf_obj)], - platform_iris=[website_settings.DOMAIN], - view_count=0, - view_session_count=0, - cumulative_view_count=_c_views, - cumulative_view_session_count=_c_view_sess or _c_views, - download_count=0, - download_session_count=0, - cumulative_download_count=_c_downloads, - cumulative_download_session_count=_c_download_sess or _c_downloads, - ) - - -def _get_cumulative_usage(osfid: str, until_when, *, is_preprint: bool): - if is_preprint: - _views = _cumulative_preprint_count(PreprintView, osfid, until_when) - _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) - _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) - else: - _views, _view_sess = _cumulative_countedusage_views(osfid, until_when) - _downloads, _download_sess = _cumulative_countedusage_downloads( - osfid, until_when - ) - return (_views, _view_sess, _downloads, _download_sess) - - -def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: - '''compute view_session_count separately to avoid double-counting - - (the same session may be represented in both the composite agg on `item_guid` - and that on `surrounding_guids`) - ''' - # copied/adapted from osf.metrics.reporters.public_item_usage - _search = ( - CountedUsageEs6.search() - .filter('term', item_public=True) - .filter('range', timestamp={'lt': until_when}) - .filter('term', action_labels='view') - .filter( - 'bool', - should=[ - {'term': {'item_guid': osfid}}, - {'term': {'surrounding_guids': osfid}}, - ], - minimum_should_match=1, - ) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _view_count = _response.hits.total - _view_session_count = ( - _response.aggregations.agg_session_count.value - if 'agg_session_count' in _response.aggregations - else 0 - ) - return (_view_count, _view_session_count) - - -def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]: - '''aggregate downloads on each osfid (not including components/files)''' - # copied/adapted from osf.metrics.reporters.public_item_usage - _search = ( - CountedUsageEs6.search() - .filter('term', item_public=True) - .filter('range', timestamp={'lt': until_when}) - .filter('term', action_labels='download') - .filter('term', item_guid=osfid) - ) - _search.aggs.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _download_count = _response.hits.total - _download_session_count = ( - _response.aggregations.agg_session_count.value - if 'agg_session_count' in _response.aggregations - else 0 - ) - return (_download_count, _download_session_count) - - -def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int: - '''aggregate counts on given preprint''' - # copied/adapted from osf.metrics.preprint_metrics - _search = ( - preprint_metric_cls.search() - .filter('terms', preprint_id=_synonymous_osfids(osfid)) - .filter('range', timestamp={'lt': until_when}) - .extra(size=0) # no hits; only aggs - ) - _search.aggs.metric('agg_count', 'sum', field='count') - _response = _search.execute() - return ( - int(_response.aggregations.agg_count.value) - if hasattr(_response.aggregations, 'agg_count') - else 0 - ) - - -def _synonymous_osfids(osfid: str) -> list[str]: - _synonyms = [osfid] - if osfid.endswith('_v1'): - # include pre-versioned-guid counts for v1 - _synonyms.append(osfid.removesuffix('_v1')) - elif '_' not in osfid: - # include v1 (if it exists) with unversioned guid - _synonyms.append(f'{osfid}_v1') - return _synonyms - - -def _convert_item_type_list(osf_model_names: list[str] | str, has_surrounding_items: bool): - if isinstance(osf_model_names, str): - osf_model_names = [osf_model_names] - return [ - _convert_item_type(_model_name, has_surrounding_items) - for _model_name in osf_model_names - ] - - -def _convert_item_type(osf_model_name: str | None, has_surrounding_items: bool): - if osf_model_name: - try: - return get_item_type_from_model( - apps.get_model('osf', osf_model_name), - is_component=has_surrounding_items, - ) - except LookupError: - pass - return get_item_type_from_iri(OSF.Object) # fallback abstract osf:Object - - -def _convert_database_iri_list(provider_ids: list[str], osf_model_names: list[str]): - return [ - _convert_database_iri(_id, _model_name) - for _id in provider_ids - for _model_name in osf_model_names - ] - - -def _convert_database_iri(provider_id: str | None, osf_model_name: str): - if not provider_id: - return website_settings.DOMAIN # osf is a provider, sure why not - - match osf_model_name: # lower-cased osf.models class names - case 'node' | 'osfuser': # implicit untyped 'osf' provider - return website_settings.DOMAIN - case 'preprint': # match PreprintProvider.get_semantic_iri - return f'{website_settings.DOMAIN}preprints/{provider_id}' - case 'registration': # match RegistrationProvider.get_semantic_iri - return f'{website_settings.DOMAIN}registries/{provider_id}' - case _ if 'file' in osf_model_name: - # file providers are a different thing that don't really have an iri, just an id - return f'urn:files.osf.io:{provider_id}' - case _: # give up gracefully - _logger.error( - f'unknown model {osf_model_name!r} with provider {provider_id!r}' - ) - return f'urn:osf.io:{provider_id}' - - -def _each_usage_report_osfid(until_when, after_osfid=None): - _search = ( - es6_reports.PublicItemUsageReport.search() - .filter('range', timestamp={'lt': until_when}) - .extra(size=0) - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}], - size=500, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - -def _each_countedusage_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]: - _search = ( - CountedUsageEs6.search() - .filter('term', item_public=True) - .filter('terms', action_labels=['view', 'download']) - .filter('range', timestamp={'lt': until_when}) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], - size=_COMPOSITE_CHUNK_SIZE, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - -def _each_preprintview_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]: - _search = ( - PreprintView.search() - .filter('range', timestamp={'lt': until_when}) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_COMPOSITE_CHUNK_SIZE, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - -def _each_preprintdownload_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]: - _search = ( - PreprintDownload.search() - .filter('range', timestamp={'lt': until_when}) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_COMPOSITE_CHUNK_SIZE, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - -def _merge_sorted_osfids(*osfid_iterables): - def _osfids_group_key(osfid: str): - return ( # v1 same as unversioned - osfid.removesuffix('_v1') - if osfid.endswith('_v1') - else osfid - ) - for _k, _g in itertools.groupby( - heapq.merge(*osfid_iterables), - key=_osfids_group_key, - ): - yield _k - - -### -# the command itself - -class Command(BaseCommand): - def add_arguments(self, parser): - parser.add_argument( - '--no-counts', - action='store_true', - ) - parser.add_argument( - '--clear-state', - action='store_true', - ) - parser.add_argument( - '--clear-es8-data', - action='store_true', - ) - parser.add_argument( - '--start', - action='store_true', - ) - parser.add_argument( - '--unchanged', - action='store_true', - ) - parser.add_argument( - '--usage-events', - action='store_true', - ) - parser.add_argument( - '--usage-reports', - action='store_true', - ) - - @functools.cached_property - def _migration_started_at(self): - return es8_metrics.Elastic6To8State.get_started_at() - - def handle( - self, - *, - no_counts, - clear_state, - clear_es8_data, - start, - unchanged, - usage_events, - usage_reports, - **kwargs, - ): - self._quiet_chatty_loggers() - if clear_state: - self._clear_state() - if clear_es8_data: - self._clear_es8_data(unchanged, usage_events, usage_reports) - self._check_started_at(start_now=start) - _default_all = not any((unchanged, usage_events, usage_reports)) - if usage_reports or _default_all: - self._handle_usage_reports(start=start, no_counts=no_counts) - if usage_events or _default_all: - self._handle_usage_events(start=start, no_counts=no_counts) - if unchanged or _default_all: - self._handle_unchanged(start=start, no_counts=no_counts) - if not no_counts: - self.stdout.write('(counts may be approximate)') - - def _handle_unchanged(self, *, start: bool, no_counts: bool): - # for each (unchanged) report/event: - for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): - _assert_field_unchangedness(_es6_cls, _es8_cls) - if not no_counts: - # display counts - _es6_count = _es6_cls.search().count() - _es8_count = _es8_cls.search().count() - self._write_tabbed('es6', _es6_cls, _es6_count) - self._write_tabbed( - 'es8', - _es8_cls, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) - if start: # schedule task - self.stdout.write( - f'starting {_es6_cls.__name__} => {_es8_cls.__name__}' - ) - migrate_unchanged_recordtype.delay( - _es6_cls.__name__, self._migration_started_at.isoformat() - ) - - def _handle_usage_events(self, *, start: bool, no_counts: bool): - # for counted-usage events: - _started = self._migration_started_at or datetime.datetime.now() - _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() - _range_end = _started.date() + datetime.timedelta(days=1) - if not no_counts: - # display counts for each view/download event type - _range_q = { - 'range': { - 'timestamp': { - 'gte': _range_start.isoformat(), - 'lt': _range_end.isoformat(), - } - } - } - _es6_usage_count_q = { - 'bool': { - 'filter': [_range_q, {'exists': {'field': 'item_guid'}}], - }, - } - _es6_pview_count = PreprintView.search().filter(_range_q).count() - _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() - _es6_usage_event_count = CountedUsageEs6.search().filter(_es6_usage_count_q).count() - _es6_count = ( - _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count - ) - _es8_count = es8_metrics.OsfCountedUsageEvent.search().filter(_range_q).count() - self._write_tabbed('es6', PreprintView, _es6_pview_count) - self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count) - self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed( - 'es6', f'(total between {_range_start} and {_range_end})', _es6_count - ) - self._write_tabbed( - 'es8', - es8_metrics.OsfCountedUsageEvent, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) - if start: # schedule (per-day?) tasks (if --start) - self.stdout.write( - f'starting usages => {es8_metrics.OsfCountedUsageEvent.__name__}' - ) - for _from_date, _until_date in _date_range(_range_start, _range_end): - _from_str = _from_date.isoformat() - _until_str = _until_date.isoformat() - migrate_counted_usages.delay(_from_str, _until_str) - migrate_preprint_views.delay(_from_str, _until_str) - migrate_preprint_downloads.delay(_from_str, _until_str) - - def _handle_usage_reports(self, *, start: bool, no_counts: bool): - if not no_counts: - # display counts of reports and distinct items - _es6_count, _es6_item_count = _es6_usage_report_counts() - _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) - self._write_tabbed( - 'es8', - es8_metrics.MonthlyPublicItemUsageReportEs8, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) - self._write_tabbed( - 'es6', - es6_reports.PublicItemUsageReport, - 'osfid count:', - _es6_item_count, - ) - self._write_tabbed( - 'es8', - es8_metrics.MonthlyPublicItemUsageReportEs8, - 'osfid count:', - _es8_item_count, - style=self._eq_style(_es8_item_count, _es6_item_count), - ) - # (if --start) schedule task per item (by composite agg on es6 usage reports and events) - # each item-task iter thru reports oldest to newest, adding cumulative counts - if start: - self.stdout.write( - f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.MonthlyPublicItemUsageReportEs8.__name__}' - ) - schedule_migrate_usage_reports.delay(self._migration_started_at.isoformat()) - - def _check_started_at(self, start_now): - _started_at = self._migration_started_at - if _started_at: - self.stdout.write( - f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' - ) - elif start_now: - _started_at = es8_metrics.Elastic6To8State.set_started_at_now() - del self._migration_started_at # clear cache - self.stdout.write( - f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' - ) - else: - self.stdout.write( - 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' - ) - - def _clear_state(self): - self.stdout.write( - 'clearing all migration state (start time, etc)', self.style.NOTICE - ) - es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete() - es8_metrics.Elastic6To8State.refresh() - - def _clear_es8_data(self, unchanged, usage_events, usage_reports): - _default_all = not any((unchanged, usage_events, usage_reports)) - _to_clear = [] - if _default_all or unchanged: - _to_clear.extend(_UNCHANGED_RECORDTYPES.values()) - if _default_all or usage_events: - _to_clear.append(es8_metrics.OsfCountedUsageEvent) - if _default_all or usage_reports: - _to_clear.append(es8_metrics.MonthlyPublicItemUsageReportEs8) - for _es8_recordtype in _to_clear: - self.stdout.write( - f'clearing {_es8_recordtype.__name__}', self.style.NOTICE - ) - _es8_recordtype.do_teardown(keep_templates=True) - - def _eq_style(self, num: int, should_be: int): - return self.style.SUCCESS if (num == should_be) else self.style.WARNING - - def _write_tabbed(self, *strables, style=None): - def _to_str(strable): - if isinstance(strable, type): - return strable.__name__ - return str(strable) - - self.stdout.write('\t'.join(map(_to_str, strables)), style) - - def _quiet_chatty_loggers(self): - _chatty_loggers = [ - 'elasticsearch', - 'elastic_transport', - 'elasticsearch_metrics', - ] - for logger_name in _chatty_loggers: - logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index cfcb22bfc7f..9f6d57bc5db 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -3,14 +3,12 @@ from django.core.management.base import BaseCommand from django.db import OperationalError as DjangoOperationalError -from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app import framework.sentry from osf.metrics.reporters import AllMonthlyReporters -from osf.metrics.reports import MonthlyReport from osf.metrics.utils import YearMonth @@ -19,7 +17,6 @@ _CONTINUE_AFTER_ERRORS = ( DjangoOperationalError, - Elastic6ConnectionError, Elastic8ConnectionError, PostgresOperationalError, ) @@ -86,8 +83,6 @@ def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict): _reports = _reporter.report(**report_kwargs) for _report in _reports: - if isinstance(_report, MonthlyReport) and (_report.report_yearmonth is None): - _report.report_yearmonth = _reporter.yearmonth _report.save() _followup_task = _reporter.followup_task(_report) if _followup_task is not None: diff --git a/osf/management/commands/populate_impact_preprint_metrics.py b/osf/management/commands/populate_impact_preprint_metrics.py deleted file mode 100644 index f5fc60cd8e1..00000000000 --- a/osf/management/commands/populate_impact_preprint_metrics.py +++ /dev/null @@ -1,117 +0,0 @@ -import datetime as dt -from random import random -from django.core.management.base import BaseCommand - -from osf.metrics import ( - PreprintView, - PreprintDownload, -) - -from osf.models import Preprint - - -""" -This management command can be run to populate impact with fake -preprints metrics data. - -All flags are optional with the script defaulting to 3 preprints from -your local database with metrics for the past 7 days and an average -count of 25 for preprint views/downloads per day. - ---preprints: Specify preprint guids ---num_preprints: Specify the number of preprint to use from the database (if -preprint guids aren't specified) ---days: Specify the number of days to write metrics data for ---group_counts: Indicates that metric counts should be grouped -in a single record per preprint per day ---avg_counts: The average number of view/download counts to write -for each preprint per day - -Example: docker-compose run --rm web python3 manage.py populate_impact_preprint_metrics --num_preprints 1 --days 5 --group_counts --avg_counts 50 -""" - - -def populate_preprint_metrics(preprints, dates, avg_counts, group_counts=False): - for date in dates: - for preprint in preprints: - preprint_view_count = int((avg_counts * 2) * random()) - preprint_download_count = int((avg_counts * 2) * random()) - - if group_counts: - PreprintView.record_for_preprint( - preprint=preprint, - path=preprint.primary_file.path, - timestamp=date, - count=preprint_view_count - ) - - PreprintDownload.record_for_preprint( - preprint=preprint, - path=preprint.primary_file.path, - timestamp=date, - count=preprint_download_count - ) - else: - for count in range(preprint_view_count): - PreprintView.record_for_preprint( - preprint=preprint, - path=preprint.primary_file.path, - timestamp=date - ) - - for count in range(preprint_download_count): - PreprintDownload.record_for_preprint( - preprint=preprint, - path=preprint.primary_file.path, - timestamp=date - ) - - -class Command(BaseCommand): - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--preprints', - nargs='*', - help='Specify preprints guids' - ) - parser.add_argument( - '--num_preprints', - type=int, - default=3, - help='Specify number of preprints to use if not specifying preprints' - ) - parser.add_argument( - '--days', - type=int, - default=7, - help='Specify number of past days to write metrics data for' - ) - parser.add_argument( - '--group_counts', - action='store_true', - help='Group counts in metric records for fewer ES requests' - ) - parser.add_argument( - '--avg_counts', - type=int, - default=25, - help='Average number of counts to write per day per preprint' - ) - - def handle(self, *args, **options): - days = options.get('days') - num_preprints = options.get('num_preprints') - group_counts = options.get('group_counts') - avg_counts = options.get('avg_counts') - - if options.get('preprints'): - preprints = Preprint.objects.filter(guids___id__in=options.get('preprints')) - else: - preprints = Preprint.objects.all()[:num_preprints] - - today = dt.datetime.today() - last_x_days = [(today - dt.timedelta(days=num_days)) for num_days in range(0, days)] - - populate_preprint_metrics(preprints, last_x_days, avg_counts, group_counts) diff --git a/osf/management/commands/reindex_es6.py b/osf/management/commands/reindex_es6.py deleted file mode 100644 index 8961ea6fff1..00000000000 --- a/osf/management/commands/reindex_es6.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Reindex data to use current mapping for ES metrics classes -""" -import logging - -from django.core.management.base import BaseCommand -from elasticsearch6_dsl import connections -from elasticsearch_metrics.registry import registry - -logger = logging.getLogger(__name__) - - -def get_metric_class(index_name: str) -> type: - app_label, model_name = index_name.split('_')[:2] - return registry.all_metrics[app_label][model_name] - - -def increment_index_versions(client, old_indices: list): - """ - Increment versions numbers for new indices, these kind don't matter because they should always be aliased to - the original format of {app_label}_{cls.__name__.lower()}_{year}. - - :param old_indices: indices to be updated - :return: indices names that are going to be reindexed into. - """ - new_indices = [] - for index in old_indices: - index_name = list(client.indices.get(index).keys())[0] # in case we've already aliased this index - if '_v' in index_name and index_name[-1].isdigit(): - name, version_num = index_name.split('_v') - new_index = f'{name}_v{int(version_num) + 1}' - else: - new_index = f'{index}_v2' - new_indices.append(new_index) - - return new_indices - - -def reindex_and_alias(old_indices: list, dry_run: bool = False): - """ - To migrate data in ES with new mappings is a 4 step process: - 1) Create an index with new mappings - 2) Reindex data from old to new - 3) Delete the old index - 4) Alias the new index so it references the old. - - :param old_indices: indices with data that has old mappings - :return: None - """ - if dry_run: - logger.info('[DRY RUN] THIS IS A DRY RUN.') - client = connections.get_connection() - new_indices = increment_index_versions(client, old_indices) - - for old_index, new_index in zip(old_indices, new_indices): - metric_class = get_metric_class(old_index) - if dry_run: - logger.info(f'[DRY RUN] Would reindex {old_index} to {new_index} for {metric_class}') - continue - client.indices.create(new_index, body=metric_class._index.to_dict(), params={'wait_for_active_shards': 1}) - logger.info(f'Created index {new_index}') - body = { - 'source': { - 'index': old_index - }, - 'dest': { - 'index': new_index - } - } - logger.info(f'Created reindexing {old_index} to {new_index}') - client.reindex(body, params={'wait_for_completion': 'true'}) - logger.info('Reindexing complete') - old_index_name = list(client.indices.get(old_index).keys())[0] # in case we've already aliased this index - - if old_index_name == old_index: # True if not aliased - client.indices.delete(old_index) - logger.info(f'{old_index} deleted') - client.indices.put_alias(new_index, old_index) - else: - client.indices.put_alias(new_index, old_index) - client.indices.delete(old_index_name) - logger.info(f'{old_index_name} deleted') - - -class Command(BaseCommand): - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - '--indices', - type=str, - nargs='+', - help='List of indices to be reindexed and remapped' - ) - parser.add_argument( - '--dry', - action='store_true', - dest='dry_run', - help='Run migration and roll back changes to db', - ) - - def handle(self, *args, **options): - indices = options.get('indices', []) - dry_run = options.get('dry_run', True) - reindex_and_alias(indices, dry_run) diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index a72a799402d..14c637955aa 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -11,12 +11,12 @@ from api.caching.tasks import get_storage_usage_total from osf import models as osfdb +from osf.models.base import osfid_iri from osf.metadata import gather from osf.metadata.definitions.datacite import DATACITE_RESOURCE_TYPES_GENERAL from osf.metadata.osfmap_utils import ( osfmap_type, is_osf_component, - osfid_from_iri, ) from osf.metadata.rdfutils import ( DATACITE, @@ -37,7 +37,7 @@ format_dcterms_extent, smells_like_iri, ) -from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport from osf.metrics.utils import YearMonth from osf.utils import ( workflows as osfworkflows, @@ -1085,22 +1085,33 @@ def gather_cedar_templates(focus): @gather.er(OSF.usage) def gather_last_month_usage(focus): - _usage_report = PublicItemUsageReport.for_last_month( - item_osfid=osfid_from_iri(focus.iri), - ) - if _usage_report is not None: + _item_iris = [focus.iri] + # items with versioned osfids may have a separate usage report for each version, + # but this metadata is gathered for the unversioned osfid -- add counts together + if hasattr(focus.dbmodel, 'versioned_guids'): + _item_iris.extend( + osfid_iri(_vg.versioned_osfid()) + for _vg in focus.dbmodel.versioned_guids.all() + ) + _usage_reports = MonthlyPublicItemUsageReport.from_last_month(_item_iris) + if _usage_reports: + def _sum_usage(report_attr_name): + return sum( + getattr(_usage_report, report_attr_name) + for _usage_report in _usage_reports + ) _usage_report_ref = rdflib.BNode() yield (OSF.usage, _usage_report_ref) yield (_usage_report_ref, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))) yield (_usage_report_ref, FOAF.primaryTopic, focus.iri) yield (_usage_report_ref, DCTERMS.temporal, rdflib.Literal( - str(_usage_report.report_yearmonth), + str(_usage_reports[0].report_yearmonth), datatype=rdflib.XSD.gYearMonth, )) - yield (_usage_report_ref, OSF.viewCount, _usage_report.view_count) - yield (_usage_report_ref, OSF.viewSessionCount, _usage_report.view_session_count) - yield (_usage_report_ref, OSF.downloadCount, _usage_report.download_count) - yield (_usage_report_ref, OSF.downloadSessionCount, _usage_report.download_session_count) + yield (_usage_report_ref, OSF.viewCount, _sum_usage('view_count')) + yield (_usage_report_ref, OSF.viewSessionCount, _sum_usage('view_session_count')) + yield (_usage_report_ref, OSF.downloadCount, _sum_usage('download_count')) + yield (_usage_report_ref, OSF.downloadSessionCount, _sum_usage('download_session_count')) @gather.er(OSF.hasOsfAddon) diff --git a/osf/metadata/serializers/linkset.py b/osf/metadata/serializers/linkset.py index 3ee907d0532..499a82fcdb4 100644 --- a/osf/metadata/serializers/linkset.py +++ b/osf/metadata/serializers/linkset.py @@ -16,7 +16,7 @@ import rdflib from ._base import MetadataSerializer -from osf.metadata.osf_gathering import osfid_from_iri +from osf.metadata.osfmap_utils import osfid_from_iri from osf.metadata.rdfutils import (DOI, DATACITE, DCTERMS, OWL, RDF, OSF, DCAT, SCHEMA, DATACITE_SCHEMA_RESOURCE_TYPE_GENERAL_MAPPING, map_resource_type_general_datacite_to_scheme) from website.settings import DOMAIN from website.util import web_url_for diff --git a/osf/metrics/README.md b/osf/metrics/README.md index b5dad732e29..4f81049a0c9 100644 --- a/osf/metrics/README.md +++ b/osf/metrics/README.md @@ -8,14 +8,15 @@ but note that the COUNTER_SUSHI api has not yet been implemented atop. ## data model usage data and periodic reports are both stored in elasticsearch using -`elasticsearch-dsl`-based data models. +`django-elasticsearch-metrics` and `elasticsearch8.dsl`-based data models. -each "usage" is represented as `CountedAuthUsage` -- see `osf.metrics.counted_usage` +each "usage" is represented as `OsfCountedUsageEvent` -- see `osf.metrics.events` for field definitions with comments mapping fields to concepts in the COUNTER spec. -each periodic report is represented as a subclass of `DailyReport` or `MonthlyReport` -(see `osf.metrics.reports`) and has a "reporter" (see `osf.metrics.reporters`) that -is invoked periodically to report. +each periodic report is a subclass of `osf.metrics.monthly_reports.BaseMonthlyReport` +or `osf.metrics.daily_reports.BaseDailyReport` (themselves subclasses of +`elasticsearch_metrics.imps.elastic8.CyclicRecord`) and has a "reporter" +(see `osf.metrics.reporters`) that is invoked periodically to report. ## api note: the `osf.metrics` api is subject to change, is supported only for use within OSF @@ -29,12 +30,13 @@ endpoints of interest for new development (all starting with `/_/metrics/`): - `events/counted_usage/`: POST-only, for recording a usage - `reports/`: GET list of available report types - `reports//recent`: GET list of recent reports + - `reports//`: GET list of reports (filterable, sortable) - `query/`: namespace for views that query usage data on demand (only for statically defined, cheap queries) ## how to ### add a new monthly report -- add a `MonthlyReport` subclass (in `osf.metrics.reports`) with the fields you want +- add a `BaseMonthlyReport` subclass (in `osf.metrics.monthly_reports`) with the fields you want - add a `MonthlyReporter` subclass (in a module under `osf.metrics.reporters`) that knows how to build your report - to have your reporter run automatically, add it to `osf.metrics.reporters.MONTHLY_REPORTERS` diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py index 6056e6d92f3..7d124c501b7 100644 --- a/osf/metrics/__init__.py +++ b/osf/metrics/__init__.py @@ -1,42 +1,12 @@ -from .counted_usage import CountedAuthUsage - -from .preprint_metrics import ( - PreprintView, - PreprintDownload, -) - -from .registry_metrics import RegistriesModerationMetrics - -from .reports import ( - DownloadCountReport, - InstitutionSummaryReport, - NewUserDomainReport, - NodeSummaryReport, - OsfstorageFileCountReport, - PreprintSummaryReport, - StorageAddonUsage, - UserSummaryReport, -) -from . import es8_metrics - - -DAILY_REPORTS = ( - DownloadCountReport, - InstitutionSummaryReport, - NewUserDomainReport, - NodeSummaryReport, - OsfstorageFileCountReport, - PreprintSummaryReport, - StorageAddonUsage, - UserSummaryReport, +from . import ( + events, + daily_reports, + monthly_reports, ) __all__ = ( - 'CountedAuthUsage', - 'DAILY_REPORTS', - 'PreprintView', - 'PreprintDownload', - 'RegistriesModerationMetrics', - 'es8_metrics', + 'events', + 'daily_reports', + 'monthly_reports', ) diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py deleted file mode 100644 index 41ea012fda5..00000000000 --- a/osf/metrics/counted_usage.py +++ /dev/null @@ -1,196 +0,0 @@ -from datetime import datetime -import enum -import logging -from urllib.parse import urlsplit - -from elasticsearch6_dsl import InnerDoc, analyzer, tokenizer -import elasticsearch_metrics.imps.elastic6 as metrics -from elasticsearch_metrics.signals import pre_save -from django.dispatch import receiver -import pytz - -from osf.metrics.utils import stable_key - - -logger = logging.getLogger(__name__) - -route_prefix_analyzer = analyzer( - 'route_prefix_analyzer', - tokenizer=tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'), -) - -class PageviewInfo(InnerDoc): - """PageviewInfo - - for CountedAuthUsage generated by viewing a web page - """ - # fields that should be provided - referer_url = metrics.Keyword() - page_url = metrics.Keyword() - page_title = metrics.Keyword() - route_name = metrics.Keyword( - fields={ - 'by_prefix': metrics.Text(analyzer=route_prefix_analyzer), - }, - ) - - # fields autofilled from the above (see `_autofill_fields`) - page_path = metrics.Keyword() - referer_domain = metrics.Keyword() - hour_of_day = metrics.Integer() - - -class CountedAuthUsage(metrics.Metric): - """CountedAuthUsage - - Something was used! Let's quickly take note of that and - move on, then come back later to query/analyze/investigate. - - Aim to support a COUNTER-style reporting api - (see https://cop5.projectcounter.org/en/5.0.2/) - """ - - # where noted, fields correspond to defined terms from COUNTER - # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html - platform_iri = metrics.Keyword() # counter:Platform - provider_id = metrics.Keyword() # counter:Database(?) - session_id = metrics.Keyword() # counter:Session - item_guid = metrics.Keyword() # counter:Item - item_type = metrics.Keyword() # counter:Data-Type - surrounding_guids = metrics.Keyword(multi=True) # counter:Title - item_public = metrics.Boolean() # counter:Access-Type(?) - user_is_authenticated = metrics.Boolean() - - action_labels = metrics.Keyword(multi=True) - class ActionLabel(enum.Enum): - SEARCH = 'search' # counter:Search - VIEW = 'view' # counter:Investigation - DOWNLOAD = 'download' # counter:Request - WEB = 'web' # counter:Regular (aka "pageview") - API = 'api' # counter:TDM (aka "non-web api usage") - # TODO: count api usage, distinguish between web and non-web api requests - - # pageviews get additional info to support the "node analytics" view - # (see `api.metrics.views.NodeAnalyticsQuery`) - pageview_info = metrics.Object(PageviewInfo) - - class Meta: - dynamic = metrics.MetaField('strict') - source = metrics.MetaField(enabled=True) - - -@receiver(pre_save, sender=CountedAuthUsage) -def _autofill_fields(sender, instance, **kwargs): - pageview = getattr(instance, 'pageview_info', None) - if pageview: - _fill_pageview_info(instance) - item_guid = getattr(instance, 'item_guid', None) - if item_guid: - from osf.models import Guid - guid_instance = Guid.load(item_guid) - if guid_instance and guid_instance.referent: - _fill_osfguid_info(instance, guid_instance.referent) - _fill_document_id(instance) - - -def _fill_pageview_info(counted_usage): - pageview = counted_usage.pageview_info - pageview_dict = pageview.to_dict() - pageview.hour_of_day = counted_usage.timestamp.hour - pageview.page_path = urlsplit(pageview_dict['page_url']).path.rstrip('/') - if referer := pageview_dict.get('referer_url'): - pageview.referer_domain = urlsplit(referer).netloc - - -def _fill_osfguid_info(counted_usage, guid_referent): - counted_usage.item_public = _get_ispublic(guid_referent) - counted_usage.item_type = get_item_type(guid_referent) - counted_usage.surrounding_guids = _get_surrounding_guids(guid_referent) - if not counted_usage.provider_id: - counted_usage.provider_id = get_provider_id(guid_referent) - - -def _fill_document_id(counted_usage): - # set the document id to a hash of "unique together" - # values to get "ON CONFLICT UPDATE" behavior -- if - # a matching document already exists, it will be updated, - # not duplicated. - - # cannot detect/avoid conflicts this way, but that's ok - # because we want to approximate `counter:Double-Click Filtering` - - if counted_usage.pageview_info is not None and counted_usage.pageview_info.page_url is not None: - target_identifier = counted_usage.pageview_info.page_url - else: - target_identifier = counted_usage.item_guid - - # slice the day into an array of 30-second windows, - # find this timestamp's windowslice index - day_start = datetime( - counted_usage.timestamp.year, - counted_usage.timestamp.month, - counted_usage.timestamp.day, - tzinfo=pytz.utc, - ) - time_in_seconds = (counted_usage.timestamp - day_start).total_seconds() - time_window = int(time_in_seconds / 30) - - counted_usage.meta.id = stable_key( - # unique-together values: - counted_usage.platform_iri, - target_identifier, - counted_usage.session_id, - counted_usage.timestamp.date(), - time_window, - ','.join(sorted(counted_usage.action_labels)), - ) - - -def _get_ispublic(guid_referent): - # if it quacks like BaseFileNode, look at .target instead - maybe_public = getattr(guid_referent, 'target', None) or guid_referent - if hasattr(maybe_public, 'verified_publishable'): - return maybe_public.verified_publishable # quacks like Preprint - return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode - - -def get_provider_id(guid_referent): - provider = getattr(guid_referent, 'provider', None) - if isinstance(provider, str): - return provider # quacks like BaseFileNode - elif provider: - return provider._id # quacks like Registration, Preprint, Collection - return 'osf' # quacks like Node, Comment, WikiPage - - -def get_item_type(guid_referent): - return type(guid_referent).__name__.lower() - - -def _get_immediate_wrapper(guid_referent): - if hasattr(guid_referent, 'verified_publishable'): - return None # quacks like Preprint - return ( - getattr(guid_referent, 'parent_node', None) # quacks like AbstractNode - or getattr(guid_referent, 'node', None) # quacks like WikiPage, Comment - or getattr(guid_referent, 'target', None) # quacks like BaseFileNode - ) - -def _get_surrounding_guids(guid_referent): - """get all the parent/owner/surrounding guids for the given guid_referent - - @param guid_referent: instance of a model that has GuidMixin - @returns list of str - - For AbstractNode, goes up the node hierarchy up to the root. - For WikiPage or BaseFileNode, grab the node it belongs to and - follow the node hierarchy from there. - """ - surrounding_guids = [] - current_referent = guid_referent - while current_referent: - next_referent = _get_immediate_wrapper(current_referent) - if next_referent: - surrounding_guids.append(next_referent._id) - current_referent = next_referent - return surrounding_guids diff --git a/osf/metrics/daily_reports.py b/osf/metrics/daily_reports.py new file mode 100644 index 00000000000..40eb5073236 --- /dev/null +++ b/osf/metrics/daily_reports.py @@ -0,0 +1,178 @@ +import datetime + +import elasticsearch8.dsl as esdsl +from elasticsearch_metrics import DAILY, YEARLY +import elasticsearch_metrics.imps.elastic8 as djelme + +from osf.metrics.utils import cycle_coverage_date + +__all__ = ( + 'BaseDailyReport', + 'DailyDownloadCountReport', + 'DailyInstitutionSummaryReport', + 'DailyNewUserDomainReport', + 'DailyNodeSummaryReport', + 'DailyOsfstorageFileCountReport', + 'DailyPreprintSummaryReport', + 'DailyStorageAddonUsageReport', + 'DailyUserSummaryReport', +) + + +### +# base class + +class BaseDailyReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + + class Meta: + abstract = True + + def __init__(self, *, report_date=None, **kwargs): + super().__init__(**kwargs) + # separate out report_date, so the property setter gets used + if report_date is not None: + self.report_date = report_date + + @property + def report_date(self): + _year, _month, _day = map(int, self.cycle_coverage.split('.')) + return datetime.date(_year, _month, _day) + + @report_date.setter + def report_date(self, d: str | datetime.date): + self.cycle_coverage = cycle_coverage_date( + datetime.date.fromisoformat(d) if isinstance(d, str) else d + ) + + +### +# reusable inner objects + +class RunningTotal(esdsl.InnerDoc): + total: int + total_daily: int | None + + +class FileRunningTotals(esdsl.InnerDoc): + total: int + public: int + private: int + total_daily: int + public_daily: int + private_daily: int + + +class NodeRunningTotals(esdsl.InnerDoc): + total: int + total_excluding_spam: int | None + public: int + private: int + total_daily: int + total_daily_excluding_spam: int | None + public_daily: int + private_daily: int + + +class RegistrationRunningTotals(esdsl.InnerDoc): + total: int + public: int + embargoed: int + embargoed_v2: int + withdrawn: int | None + total_daily: int + public_daily: int + embargoed_daily: int + embargoed_v2_daily: int + withdrawn_daily: int | None + + +class UsageByStorageAddon(esdsl.InnerDoc): + addon_shortname: str + enabled_usersettings: RunningTotal + linked_usersettings: RunningTotal + deleted_usersettings: RunningTotal + usersetting_links: RunningTotal + connected_nodesettings: RunningTotal + disconnected_nodesettings: RunningTotal + deleted_nodesettings: RunningTotal + + +### +# daily reports + +class DailyStorageAddonUsageReport(BaseDailyReport): + usage_by_addon: list[UsageByStorageAddon] + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyDownloadCountReport(BaseDailyReport): + daily_file_downloads: int + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyInstitutionSummaryReport(BaseDailyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) + + institution_id: str + institution_name: str + users: RunningTotal + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyNewUserDomainReport(BaseDailyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) + + domain_name: str + new_user_count: int + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyNodeSummaryReport(BaseDailyReport): + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyOsfstorageFileCountReport(BaseDailyReport): + files: FileRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyPreprintSummaryReport(BaseDailyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) + provider_key: str + preprint_count: int + + class Meta: + timeseries_index_timedepth = YEARLY + + +class DailyUserSummaryReport(BaseDailyReport): + active: int + deactivated: int + merged: int + new_users_daily: int + new_users_with_institution_daily: int + unconfirmed: int + + class Meta: + timeseries_index_timedepth = YEARLY diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py deleted file mode 100644 index be68883a648..00000000000 --- a/osf/metrics/es8_metrics.py +++ /dev/null @@ -1,576 +0,0 @@ -import datetime -import enum -import functools -from urllib.parse import urlsplit - -import elasticsearch8.dsl as esdsl -from elasticsearch_metrics import DAILY, MONTHLY, YEARLY -import elasticsearch_metrics.imps.elastic8 as djelme - -from osf.metadata.osfmap_utils import osfid_from_iri -from osf.metrics.counted_usage import _get_surrounding_guids -from osf.metrics.utils import ( - YearMonth, - get_database_iri, - get_item_type, -) -from osf import models as osfdb -from osf.models.base import osfid_iri -from website import settings as website_settings - - -### -# custom dsl fields - -class YearmonthField(esdsl.Date): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, format='strict_year_month') - - def deserialize(self, data): - if isinstance(data, int): - # elasticsearch stores dates in milliseconds since the unix epoch - _as_datetime = datetime.datetime.fromtimestamp(data // 1000) - return YearMonth.from_date(_as_datetime) - elif data is None: - return None - try: - return YearMonth.from_any(data) - except ValueError: - raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') - - def serialize(self, data, skip_empty=True): - if isinstance(data, str): - return data - elif isinstance(data, YearMonth): - return str(data) - elif isinstance(data, (datetime.datetime, datetime.date)): - return str(YearMonth.from_date(data)) - elif data is None: - return None - else: - raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') - - -### -# inner objects for events - -route_prefix_analyzer = esdsl.analyzer( - 'route_prefix_analyzer', - tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'), -) - - -class PageviewInfo(esdsl.InnerDoc): - """PageviewInfo - - for CountedAuthUsage generated by viewing a web page - """ - - # fields that should be provided - referer_url: str | None - page_url: str | None - page_title: str | None - route_name: str | None = esdsl.mapped_field(esdsl.Keyword( - fields={ - 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer), - }, - )) - - # fields auto-filled - page_path: str | None - referer_domain: str | None - hour_of_day: int | None - - -### -# Event records - -class OsfCountedUsageEvent(djelme.CountedUsageRecord): - ''' - Aim to support a COUNTER-style reporting api - https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html - https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html - ''' - UNIQUE_TOGETHER_FIELDS = ( - 'platform_iri', - 'sessionhour_id', - 'action_labels', - # include some non-field properties for more complex logic to - # slightly better approximate `counter:Double-Click Filtering` - # and allow for multiple pages describing the same item_iri - '_page_url_or_osfid', # non-field property - '_timestamp_date', # non-field property - '_timestamp_30sec_window', # non-field property - ) - - # inherited fields: - # timestamp: datetime.datetime - # platform_iri: str - # database_iri: str - # item_iri: str - # sessionhour_id: str - # within_iris: list[str] - - # osf-specific fields: - item_osfid: str - item_type: str - item_public: bool - provider_id: str | None - user_is_authenticated: bool - action_labels: list[str] - pageview_info: PageviewInfo | None - - class Meta: - timeseries_index_timedepth = MONTHLY - - class ActionLabel(enum.Enum): - SEARCH = 'search' # counter:Search - VIEW = 'view' # counter:Investigation - DOWNLOAD = 'download' # counter:Request - WEB = 'web' # counter:Regular (aka "pageview") - API = 'api' # counter:TDM (aka "non-web api usage") - - @classmethod - def record(cls, **kwargs): - # autofill `user_is_authenticated` before `user_id` discarded (couldn't in `clean`) - if 'user_is_authenticated' not in kwargs: - kwargs['user_is_authenticated'] = bool(kwargs.get('user_id')) - return super().record(**kwargs) - - @property - def _page_url_or_osfid(self): - # for UNIQUE_TOGETHER_FIELDS - return ( - self.pageview_info.page_url - if self.pageview_info is not None and self.pageview_info.page_url is not None - else self.item_osfid - ) - - @property - def _timestamp_date(self): - # for UNIQUE_TOGETHER_FIELDS - return self.timestamp.date() - - @property - def _timestamp_30sec_window(self): - # for UNIQUE_TOGETHER_FIELDS - # slice the day into an array of 30-second windows, - # find this timestamp's windowslice index - _day_start = datetime.datetime( - self.timestamp.year, - self.timestamp.month, - self.timestamp.day, - tzinfo=self.timestamp.tzinfo, - ) - _time_in_seconds = (self.timestamp - _day_start).total_seconds() - return int(_time_in_seconds / 30) # 30-second windows - - @functools.cached_property - def _osfid_referent(self): - # for use by autofill methods, if needed - _osfguid = osfdb.Guid.load(self.item_osfid) - return _osfguid.referent if _osfguid else None - - def clean(self): - self._autofill_platform_iri() - self._autofill_item_iri_and_osfid() - self._autofill_item_public() - self._autofill_item_type() - self._autofill_provider_id() - self._autofill_within_iris() - self._autofill_pageview() - self._autofill_database_iri() - self._clean_action_labels() - super().clean() - - def _autofill_platform_iri(self): - if self.platform_iri is None: - self.platform_iri = website_settings.DOMAIN - - def _autofill_item_iri_and_osfid(self): - if self.item_osfid and not self.item_iri: - self.item_iri = osfid_iri(self.item_osfid) - elif self.item_iri and not self.item_osfid: - try: - self.item_osfid = osfid_from_iri(self.item_iri) - except ValueError: - pass - - def _autofill_item_public(self): - if self.item_osfid and (self.item_public is None): - _item = self._osfid_referent - # if it quacks like BaseFileNode, look at .target instead - _item = getattr(_item, 'target', None) or _item - self.item_public = ( - _item.verified_publishable # quacks like Preprint - if hasattr(_item, 'verified_publishable') - else getattr(_item, 'is_public', False) # quacks like AbstractNode - ) - - def _autofill_item_type(self): - if self.item_osfid and not self.item_type: - self.item_type = get_item_type(self._osfid_referent) - - def _autofill_provider_id(self): - if self.item_osfid and not self.provider_id: - _provider = getattr(self._osfid_referent, 'provider', None) - if _provider is None: - self.provider_id = 'osf' # quacks like Node, Comment, WikiPage - elif isinstance(_provider, str): - self.provider_id = _provider # quacks like BaseFileNode - else: - self.provider_id = _provider._id # quacks like Registration, Preprint, Collection - - def _autofill_within_iris(self): - if self.item_osfid and (self.within_iris is None) and self._osfid_referent: - self.within_iris = [ - osfid_iri(_osfid) - for _osfid in _get_surrounding_guids(self._osfid_referent) - ] - # ensure inclusive "within" - if not self.within_iris: - self.within_iris = [self.item_iri] - if self.item_iri not in self.within_iris: - self.within_iris = [self.item_iri, *self.within_iris] - - def _autofill_pageview(self): - # autofill pageview_info fields from other fields - if self.pageview_info: - self.pageview_info.hour_of_day = self.timestamp.hour - _url = self.pageview_info.page_url - if _url: - self.pageview_info.page_path = urlsplit(_url).path.rstrip('/') - _ref_url = self.pageview_info.referer_url - if _ref_url: - self.pageview_info.referer_domain = urlsplit(_ref_url).netloc - - def _autofill_database_iri(self): - if self.item_osfid and not self.database_iri: - self.database_iri = get_database_iri(self._osfid_referent) - - def _clean_action_labels(self): - if self.action_labels: - self.action_labels = sorted(self.action_labels) - - -class RegistriesModerationEventEs8(djelme.EventRecord): - UNIQUE_TOGETHER_FIELDS = ( - 'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id' - ) - - registration_id: str - provider_id: str - trigger: str - from_state: str - to_state: str - user_id: str - comment: str | None - - class Meta: - timeseries_recordtype_name = 'RegistriesModerationEvent' - timeseries_index_timedepth = MONTHLY - - -### -# Reusable inner objects for reports - -class RunningTotal(esdsl.InnerDoc): - total: int - total_daily: int | None - - -class FileRunningTotals(esdsl.InnerDoc): - total: int - public: int - private: int - total_daily: int - public_daily: int - private_daily: int - - -class NodeRunningTotals(esdsl.InnerDoc): - total: int - total_excluding_spam: int | None - public: int - private: int - total_daily: int - total_daily_excluding_spam: int | None - public_daily: int - private_daily: int - - -class RegistrationRunningTotals(esdsl.InnerDoc): - total: int - public: int - embargoed: int - embargoed_v2: int - withdrawn: int | None - total_daily: int - public_daily: int - embargoed_daily: int - embargoed_v2_daily: int - withdrawn_daily: int | None - - -class UsageByStorageAddon(esdsl.InnerDoc): - addon_shortname: str - enabled_usersettings: RunningTotal - linked_usersettings: RunningTotal - deleted_usersettings: RunningTotal - usersetting_links: RunningTotal - connected_nodesettings: RunningTotal - disconnected_nodesettings: RunningTotal - deleted_nodesettings: RunningTotal - - -### -# Cyclic reports - - -class DailyStorageAddonUsageReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - usage_by_addon: list[UsageByStorageAddon] - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyStorageAddonUsageReport' - - -class DailyDownloadCountReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - daily_file_downloads: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyDownloadCountReport' - - -class DailyInstitutionSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) - - institution_id: str - institution_name: str - users: RunningTotal - nodes: NodeRunningTotals - projects: NodeRunningTotals - registered_nodes: RegistrationRunningTotals - registered_projects: RegistrationRunningTotals - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyInstitutionSummaryReport' - - -class DailyNewUserDomainReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) - - domain_name: str - new_user_count: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyNewUserDomainReport' - - -class DailyNodeSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - nodes: NodeRunningTotals - projects: NodeRunningTotals - registered_nodes: RegistrationRunningTotals - registered_projects: RegistrationRunningTotals - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyNodeSummaryReport' - - -class DailyOsfstorageFileCountReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - files: FileRunningTotals - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyOsfstorageFileCountReport' - - -class DailyPreprintSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) - provider_key: str - preprint_count: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyPreprintSummaryReport' - - -class DailyUserSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = DAILY - - active: int - deactivated: int - merged: int - new_users_daily: int - new_users_with_institution_daily: int - unconfirmed: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'DailyUserSummaryReport' - - -class MonthlySpamSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - - node_confirmed_spam: int - node_confirmed_ham: int - node_flagged: int - registration_confirmed_spam: int - registration_confirmed_ham: int - registration_flagged: int - preprint_confirmed_spam: int - preprint_confirmed_ham: int - preprint_flagged: int - user_marked_as_spam: int - user_marked_as_ham: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'MonthlySpamSummaryReport' - - -class MonthlyInstitutionalUserReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) - - institution_id: str - # user info: - user_id: str - user_name: str - department_name: str | None - month_last_login = YearmonthField() - month_last_active = YearmonthField() - account_creation_date = YearmonthField() - orcid_id: str | None - # counts: - public_project_count: int - private_project_count: int - public_registration_count: int - embargoed_registration_count: int - published_preprint_count: int - public_file_count: int = esdsl.mapped_field(esdsl.Long()) - storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'MonthlyInstitutionalUserReport' - - -class MonthlyInstitutionSummaryReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) - - institution_id: str - user_count: int - public_project_count: int - private_project_count: int - public_registration_count: int - embargoed_registration_count: int - published_preprint_count: int - storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) - public_file_count: int = esdsl.mapped_field(esdsl.Long()) - monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) - monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'MonthlyInstitutionSummaryReport' - - -class MonthlyPublicItemUsageReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_iri') - - # where noted, fields are meant to correspond to defined terms from COUNTER - # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html - # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html - item_iri: str - item_osfids: list[str] - # fields built from aggregations -- more than one value unlikely, but possible - item_types: list[str] # counter:Data-Type - platform_iris: list[str] # counter:Platform - database_iris: list[str] # counter:Database - provider_ids: list[str] # osf-specific (usually corresponds to database_iri) - - # view counts include views on components or files contained by this item - view_count: int | None = esdsl.mapped_field(esdsl.Long()) - view_session_count: int | None = esdsl.mapped_field(esdsl.Long()) - cumulative_view_count: int | None = esdsl.mapped_field(esdsl.Long()) - cumulative_view_session_count: int | None = esdsl.mapped_field(esdsl.Long()) - - # download counts of this item only (not including contained components or files) - download_count: int | None = esdsl.mapped_field(esdsl.Long()) - download_session_count: int | None = esdsl.mapped_field(esdsl.Long()) - cumulative_download_count: int | None = esdsl.mapped_field(esdsl.Long()) - cumulative_download_session_count: int | None = esdsl.mapped_field(esdsl.Long()) - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'MonthlyPublicItemUsageReport' - - -class MonthlyPrivateSpamMetricsReportEs8(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = MONTHLY - - node_oopspam_flagged: int - node_oopspam_hammed: int - node_akismet_flagged: int - node_akismet_hammed: int - preprint_oopspam_flagged: int - preprint_oopspam_hammed: int - preprint_akismet_flagged: int - preprint_akismet_hammed: int - - class Meta: - timeseries_index_timedepth = YEARLY - timeseries_recordtype_name = 'MonthlyPrivateSpamMetricsReport' - - -### -# data migration state - -class Elastic6To8State(djelme.SimpleRecord): - """index for storing values helpful for keeping track of the elastic 6->8 data migration""" - UNIQUE_TOGETHER_FIELDS = ('key',) - key: str - value: str | None - timestamp: datetime.datetime = esdsl.mapped_field( - default_factory=lambda: datetime.datetime.now(datetime.UTC), - ) - - @classmethod - def get_by_key(cls, key: str): - _response = cls.search().query({'term': {'key': key}})[0].execute() - return _response[0] if _response else None - - @classmethod - def get_timestamp(cls, key: str) -> datetime.datetime | None: - _record = cls.get_by_key(key) - return _record.timestamp if _record else None - - @classmethod - def get_started_at(cls): - return cls.get_timestamp('started_at') - - @classmethod - def set_started_at_now(cls): - _record = cls.record(key='started_at') - cls.refresh() - return _record.timestamp diff --git a/osf/metrics/events.py b/osf/metrics/events.py new file mode 100644 index 00000000000..e827581d2ef --- /dev/null +++ b/osf/metrics/events.py @@ -0,0 +1,242 @@ +import datetime +import enum +import functools +from urllib.parse import urlsplit + +import elasticsearch8.dsl as esdsl +from elasticsearch_metrics import MONTHLY +import elasticsearch_metrics.imps.elastic8 as djelme + +from osf.metadata.osfmap_utils import osfid_from_iri +from osf.metrics.utils import ( + get_database_iri, + get_item_type, + get_surrounding_osfids, +) +from osf import models as osfdb +from osf.models.base import osfid_iri +from website import settings as website_settings + +__all__ = ( + 'OsfCountedUsageEvent', + 'RegistriesModerationEvent', +) + + +### +# inner objects for events + +route_prefix_analyzer = esdsl.analyzer( + 'route_prefix_analyzer', + tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'), +) + + +class PageviewInfo(esdsl.InnerDoc): + """PageviewInfo + + for OsfCountedUsageEvent generated by viewing a web page + """ + + # fields that should be provided + referer_url: str | None + page_url: str | None + page_title: str | None + route_name: str | None = esdsl.mapped_field(esdsl.Keyword( + fields={ + 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer), + }, + )) + + # fields auto-filled + page_path: str | None + referer_domain: str | None + hour_of_day: int | None + + +### +# Event records + +class OsfCountedUsageEvent(djelme.CountedUsageRecord): + ''' + Aim to support a COUNTER-style reporting api + https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html + ''' + UNIQUE_TOGETHER_FIELDS = ( + 'platform_iri', + 'sessionhour_id', + 'action_labels', + # include some non-field properties for more complex logic to + # slightly better approximate `counter:Double-Click Filtering` + # and allow for multiple pages describing the same item_iri + '_page_url_or_osfid', # non-field property + '_timestamp_date', # non-field property + '_timestamp_30sec_window', # non-field property + ) + + # inherited fields: + # timestamp: datetime.datetime + # platform_iri: str + # database_iri: str + # item_iri: str + # sessionhour_id: str + # within_iris: list[str] + + # osf-specific fields: + item_osfid: str + item_type: str + item_public: bool + provider_id: str | None + user_is_authenticated: bool + action_labels: list[str] + pageview_info: PageviewInfo | None + + class Meta: + timeseries_index_timedepth = MONTHLY + + class ActionLabel(enum.Enum): + SEARCH = 'search' # counter:Search + VIEW = 'view' # counter:Investigation + DOWNLOAD = 'download' # counter:Request + WEB = 'web' # counter:Regular (aka "pageview") + API = 'api' # counter:TDM (aka "non-web api usage") + + @classmethod + def record(cls, **kwargs): + # autofill `user_is_authenticated` before `user_id` discarded (couldn't in `clean`) + if 'user_is_authenticated' not in kwargs: + kwargs['user_is_authenticated'] = bool(kwargs.get('user_id')) + return super().record(**kwargs) + + @property + def _page_url_or_osfid(self): + # for UNIQUE_TOGETHER_FIELDS + return ( + self.pageview_info.page_url + if self.pageview_info is not None and self.pageview_info.page_url is not None + else self.item_osfid + ) + + @property + def _timestamp_date(self): + # for UNIQUE_TOGETHER_FIELDS + return self.timestamp.date() + + @property + def _timestamp_30sec_window(self): + # for UNIQUE_TOGETHER_FIELDS + # slice the day into an array of 30-second windows, + # find this timestamp's windowslice index + _day_start = datetime.datetime( + self.timestamp.year, + self.timestamp.month, + self.timestamp.day, + tzinfo=self.timestamp.tzinfo, + ) + _time_in_seconds = (self.timestamp - _day_start).total_seconds() + return int(_time_in_seconds / 30) # 30-second windows + + @functools.cached_property + def _osfid_referent(self): + # for use by autofill methods, if needed + _osfguid = osfdb.Guid.load(self.item_osfid) + return _osfguid.referent if _osfguid else None + + def clean(self): + self._autofill_platform_iri() + self._autofill_item_iri_and_osfid() + self._autofill_item_public() + self._autofill_item_type() + self._autofill_provider_id() + self._autofill_within_iris() + self._autofill_pageview() + self._autofill_database_iri() + self._clean_action_labels() + super().clean() + + def _autofill_platform_iri(self): + if self.platform_iri is None: + self.platform_iri = website_settings.DOMAIN + + def _autofill_item_iri_and_osfid(self): + if self.item_osfid and not self.item_iri: + self.item_iri = osfid_iri(self.item_osfid) + elif self.item_iri and not self.item_osfid: + try: + self.item_osfid = osfid_from_iri(self.item_iri) + except ValueError: + pass + + def _autofill_item_public(self): + if self.item_osfid and (self.item_public is None): + _item = self._osfid_referent + # if it quacks like BaseFileNode, look at .target instead + _item = getattr(_item, 'target', None) or _item + self.item_public = ( + _item.verified_publishable # quacks like Preprint + if hasattr(_item, 'verified_publishable') + else getattr(_item, 'is_public', False) # quacks like AbstractNode + ) + + def _autofill_item_type(self): + if self.item_osfid and not self.item_type: + self.item_type = get_item_type(self._osfid_referent) + + def _autofill_provider_id(self): + if self.item_osfid and not self.provider_id: + _provider = getattr(self._osfid_referent, 'provider', None) + if _provider is None: + self.provider_id = 'osf' # quacks like Node, Comment, WikiPage + elif isinstance(_provider, str): + self.provider_id = _provider # quacks like BaseFileNode + else: + self.provider_id = _provider._id # quacks like Registration, Preprint, Collection + + def _autofill_within_iris(self): + if self.item_osfid and (not self.within_iris) and self._osfid_referent: + self.within_iris = [ + osfid_iri(_osfid) + for _osfid in get_surrounding_osfids(self._osfid_referent) + ] + # ensure inclusive "within" + if self.item_iri not in self.within_iris: + self.within_iris = [self.item_iri, *self.within_iris] + self.within_iris = sorted(self.within_iris) + + def _autofill_pageview(self): + # autofill pageview_info fields from other fields + if self.pageview_info: + self.pageview_info.hour_of_day = self.timestamp.hour + _url = self.pageview_info.page_url + if _url: + self.pageview_info.page_path = urlsplit(_url).path.rstrip('/') + _ref_url = self.pageview_info.referer_url + if _ref_url: + self.pageview_info.referer_domain = urlsplit(_ref_url).netloc + + def _autofill_database_iri(self): + if self.item_osfid and not self.database_iri: + self.database_iri = get_database_iri(self._osfid_referent) + + def _clean_action_labels(self): + if self.action_labels: + self.action_labels = sorted(self.action_labels) + + +class RegistriesModerationEvent(djelme.EventRecord): + UNIQUE_TOGETHER_FIELDS = ( + 'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id' + ) + + registration_id: str + provider_id: str + trigger: str + from_state: str + to_state: str + user_id: str + comment: str | None + + class Meta: + timeseries_recordtype_name = 'RegistriesModerationEvent' + timeseries_index_timedepth = MONTHLY diff --git a/osf/metrics/fields.py b/osf/metrics/fields.py new file mode 100644 index 00000000000..a91ad40eeea --- /dev/null +++ b/osf/metrics/fields.py @@ -0,0 +1,37 @@ +import datetime + +import elasticsearch8.dsl as esdsl + +from osf.metrics.utils import YearMonth + + +### +# custom elasticsearch dsl fields + +class YearmonthField(esdsl.Date): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs, format='strict_year_month') + + def deserialize(self, data): + if isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) + elif data is None: + return None + try: + return YearMonth.from_any(data) + except ValueError: + raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') + + def serialize(self, data, skip_empty=True): + if isinstance(data, str): + return data + elif isinstance(data, YearMonth): + return str(data) + elif isinstance(data, (datetime.datetime, datetime.date)): + return str(YearMonth.from_date(data)) + elif data is None: + return None + else: + raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py deleted file mode 100644 index df87d5123b1..00000000000 --- a/osf/metrics/metric_mixin.py +++ /dev/null @@ -1,144 +0,0 @@ -from datetime import datetime - -from django.db import models -from django.utils import timezone -from elasticsearch6.exceptions import NotFoundError -import pytz - - -class MetricMixin: - - @classmethod - def _get_all_indices(cls): - all_aliases = cls._index.get_alias() - indices = set() - for index, aliases in all_aliases.items(): - indices.add(index) - if aliases['aliases']: - for alias in aliases['aliases'].keys(): - indices.add(alias) - return indices - - @classmethod - def _get_relevant_indices(cls, after, before): - # NOTE: This will only work for yearly indices. This logic - # will need to be updated if we change to monthly or daily indices - if before and after: - year_range = range(after.year, before.year + 1) - elif after: - year_range = range(after.year, timezone.now().year + 1) - else: - # No metric data from before 2013 - year_range = range(2013, before.year + 1) - all_indices = cls._get_all_indices() - relevant_indices = [ - # get_index_name takes a datetime, so get Jan 1 for each relevant year - cls.get_index_name(datetime(year, 1, 1, tzinfo=pytz.utc)) - for year in year_range - ] - return [index for index in relevant_indices if index in all_indices] - - @classmethod - def _get_id_to_count(cls, size, metric_field, count_field, after=None, before=None): - """Performs the elasticsearch aggregation for get_top_by_count. Return a - dict mapping ids to summed counts. If there's no data in the ES index, return None. - """ - search = cls.search(after=after, before=before) - timestamp = {} - if after: - timestamp['gte'] = after - if before: - timestamp['lt'] = before - if timestamp: - search = search.filter('range', timestamp=timestamp) - search.aggs.\ - bucket('by_id', 'terms', field=metric_field, size=size, order={'sum_count': 'desc'}).\ - metric('sum_count', 'sum', field=count_field) - # Optimization: set size to 0 so that hits aren't returned (we only care about the aggregation) - search = search.extra(size=0) - try: - response = search.execute() - except NotFoundError: - # _get_relevant_indices returned 1 or more indices - # that doesn't exist. Fall back to unoptimized query - search = search.index().index(cls._default_index()) - response = search.execute() - # No indexed data - if not hasattr(response.aggregations, 'by_id'): - return None - buckets = response.aggregations.by_id.buckets - # Map _id => count - return { - bucket.key: int(bucket.sum_count.value) - for bucket in buckets - } - - # Overrides Document.search to only search relevant - # indices, determined from `after` - @classmethod - def search(cls, using=None, index=None, after=None, before=None, *args, **kwargs): - if not index and (before or after): - indices = cls._get_relevant_indices(after, before) - index = ','.join(indices) - return super().search(using=using, index=index, *args, **kwargs) - - @classmethod - def get_top_by_count(cls, qs, model_field, metric_field, - size, order_by=None, - count_field='count', - annotation='metric_count', - after=None, before=None): - """Return a queryset annotated with the metric counts for each item. - - Example: :: - - # Get the top 10 PreprintProviders by download count - top_providers = PreprintDownload.get_top_by_count( - qs=PreprintProvider.objects.all(), - model_field='_id', - metric_field='provider_id', - annotation='download_count', - size=10 - ) - - for each in top_providers: - print('{}: {}'.format(each._id, each.download_count)) - - ``size`` determines the number of buckets returned by the aggregation. - If ``size=None``, the size of the queryset is used. - WARNING: Be careful when using size=None when using a large queryset. - - :param QuerySet qs: The initial queryset to annotate - :param str model_field: Model field that corresponds to ``metric_field``. - :param str metric_field: Metric field that corresponds to ``model_field``. - :param int size: Size of the aggregation. Also determines the size of the final - queryset. - :param str order_by: Field to order queryset by. If `None`, orders by - the metric, descending. - :param datetime after: Minimum datetime to narrow the search (inclusive). - :param datetime before: Maximum datetime to narrow the search (exclusive). - :param str count_field: Name of the field where count values are stored. - :param str annotation: Name of the annotation. - """ - id_to_count = cls._get_id_to_count( - size=size or qs.count(), - metric_field=metric_field, - count_field=count_field, - after=after, - before=before - ) - if id_to_count is None: - return qs.annotate(**{annotation: models.Value(0, models.IntegerField())}) - # Annotate the queryset with the counts for each id - # https://stackoverflow.com/a/48187723/1157536 - whens = [ - models.When(**{ - model_field: k, - 'then': v, - }) for k, v in id_to_count.items() - ] - # By default order by annotation, desc - order_by = order_by or f'-{annotation}' - return qs.annotate(**{ - annotation: models.Case(*whens, default=0, output_field=models.IntegerField()) - }).order_by(order_by) diff --git a/osf/metrics/monthly_reports.py b/osf/metrics/monthly_reports.py new file mode 100644 index 00000000000..b0c3f4a3895 --- /dev/null +++ b/osf/metrics/monthly_reports.py @@ -0,0 +1,200 @@ +import collections.abc + +import elasticsearch8.dsl as esdsl +from elasticsearch_metrics import MONTHLY, YEARLY +import elasticsearch_metrics.imps.elastic8 as djelme + +from osf.metrics.fields import YearmonthField +from osf.metrics.utils import ( + YearMonth, + cycle_coverage_yearmonth, +) + +__all__ = ( + 'BaseMonthlyReport', + 'MonthlyInstitutionSummaryReport', + 'MonthlyInstitutionalUserReport', + 'MonthlyPrivateSpamMetricsReport', + 'MonthlyPublicItemUsageReport', + 'MonthlySpamSummaryReport', +) + + +### +# base class + +class BaseMonthlyReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + + class Meta: + abstract = True + + @classmethod + def most_recent_cycle(cls, base_search=None) -> str | None: + _search = base_search or cls.search() + _search = _search[0:0] # omit hits + _search.aggs.bucket( + 'agg_most_recent_cycle', + 'terms', + field='cycle_coverage', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + _buckets = _response.aggregations.agg_most_recent_cycle.buckets + if not _buckets: + return None + return _buckets[0].key + + def __init__(self, *, report_yearmonth=None, **kwargs): + super().__init__(**kwargs) + # separate out report_yearmonth, so the property setter gets used + if report_yearmonth is not None: + self.report_yearmonth = report_yearmonth + + @property + def report_yearmonth(self): + _year, _month = map(int, self.cycle_coverage.split('.')) + return YearMonth(_year, _month) + + @report_yearmonth.setter + def report_yearmonth(self, ym): + self.cycle_coverage = cycle_coverage_yearmonth(YearMonth.from_any(ym)) + + +### +# monthly reports + +class MonthlySpamSummaryReport(BaseMonthlyReport): + node_confirmed_spam: int + node_confirmed_ham: int + node_flagged: int + registration_confirmed_spam: int + registration_confirmed_ham: int + registration_flagged: int + preprint_confirmed_spam: int + preprint_confirmed_ham: int + preprint_flagged: int + user_marked_as_spam: int + user_marked_as_ham: int + + class Meta: + timeseries_index_timedepth = YEARLY + + +class MonthlyInstitutionalUserReport(BaseMonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) + + institution_id: str + # user info: + user_id: str + user_name: str + department_name: str | None + month_last_login = YearmonthField() + month_last_active = YearmonthField() + account_creation_date = YearmonthField() + orcid_id: str | None + # counts: + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + + +class MonthlyInstitutionSummaryReport(BaseMonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) + + institution_id: str + user_count: int + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + + +class MonthlyPublicItemUsageReport(BaseMonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_iri') + + # where noted, fields are meant to correspond to defined terms from COUNTER + # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html + item_iri: str + item_osfids: list[str] + # fields built from aggregations -- more than one value unlikely, but possible + item_types: list[str] # counter:Data-Type + platform_iris: list[str] # counter:Platform + database_iris: list[str] # counter:Database + provider_ids: list[str] # osf-specific (usually corresponds to database_iri) + + # view counts include views on components or files contained by this item + view_count: int | None = esdsl.mapped_field(esdsl.Long()) + view_session_count: int | None = esdsl.mapped_field(esdsl.Long()) + cumulative_view_count: int | None = esdsl.mapped_field(esdsl.Long()) + cumulative_view_session_count: int | None = esdsl.mapped_field(esdsl.Long()) + + # download counts of this item only (not including contained components or files) + download_count: int | None = esdsl.mapped_field(esdsl.Long()) + download_session_count: int | None = esdsl.mapped_field(esdsl.Long()) + cumulative_download_count: int | None = esdsl.mapped_field(esdsl.Long()) + cumulative_download_session_count: int | None = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + + @classmethod + def from_last_month( + cls, + item_iris: collections.abc.Collection[str], + ) -> list['MonthlyPublicItemUsageReport']: + _last_month = YearMonth.from_today().prior() + _from_last_month = list(cls.each_from_month(item_iris, _last_month)) + if item_iris and not _from_last_month: + # monthly reporters may not run immediately at the beginning of the month, + # but this could -- if none exist, try the month prior + _from_last_month = list(cls.each_from_month(item_iris, _last_month.prior())) + return _from_last_month + + @classmethod + def each_from_month( + cls, + item_iris: collections.abc.Collection[str], + yearmonth: YearMonth, + ) -> collections.abc.Collection['MonthlyPublicItemUsageReport']: + if item_iris: + _search = ( + cls.search() + .filter('term', cycle_coverage=cycle_coverage_yearmonth(yearmonth)) + .filter('terms', item_iri=item_iris) + [:len(item_iris)] + ) + yield from _search.execute() + + +class MonthlyPrivateSpamMetricsReport(BaseMonthlyReport): + node_oopspam_flagged: int + node_oopspam_hammed: int + node_akismet_flagged: int + node_akismet_hammed: int + preprint_oopspam_flagged: int + preprint_oopspam_hammed: int + preprint_akismet_flagged: int + preprint_akismet_hammed: int + + class Meta: + timeseries_index_timedepth = YEARLY diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py deleted file mode 100644 index d284d80827e..00000000000 --- a/osf/metrics/preprint_metrics.py +++ /dev/null @@ -1,73 +0,0 @@ -from elasticsearch6.exceptions import NotFoundError -import elasticsearch_metrics.imps.elastic6 as metrics - -from .metric_mixin import MetricMixin - - -class BasePreprintMetric(MetricMixin, metrics.Metric): - count = metrics.Integer(doc_values=True, index=True, required=True) - provider_id = metrics.Keyword(index=True, doc_values=True, required=True) - user_id = metrics.Keyword(index=True, doc_values=True, required=False) - preprint_id = metrics.Keyword(index=True, doc_values=True, required=True) - version = metrics.Keyword(index=True, doc_values=True) - path = metrics.Text(index=True) - - # TODO: locale - - class Index: - settings = { - 'number_of_shards': 1, - 'number_of_replicas': 1, - 'refresh_interval': '1s', - } - - class Meta: - abstract = True - source = metrics.MetaField(enabled=True) - - @classmethod - def record_for_preprint(cls, preprint, user=None, **kwargs): - count = kwargs.pop('count', 1) - return cls.record( - count=count, - preprint_id=preprint._id, - user_id=getattr(user, '_id', None), - provider_id=preprint.provider._id, - **kwargs - ) - - @classmethod - def get_count_for_preprint(cls, preprint, after=None, before=None, index=None) -> int: - if preprint.version == 1: - search = cls.search(index=index).filter('terms', preprint_id=[preprint.get_guid()._id, preprint._id]) - else: - search = cls.search(index=index).filter('term', preprint_id=preprint._id) - timestamp = {} - if after: - timestamp['gte'] = after - if before: - timestamp['lt'] = before - if timestamp: - search = search.filter('range', timestamp=timestamp) - search.aggs.metric('sum_count', 'sum', field='count') - # Optimization: set size to 0 so that hits aren't returned (we only care about the aggregation) - search = search.extra(size=0) - try: - response = search.execute() - except NotFoundError: - # _get_relevant_indices returned 1 or more indices - # that doesn't exist. Fall back to unoptimized query - search = search.index().index(cls._default_index()) - response = search.execute() - # No indexed data - if not hasattr(response.aggregations, 'sum_count'): - return 0 - return int(response.aggregations.sum_count.value) - - -class PreprintView(BasePreprintMetric): - pass - - -class PreprintDownload(BasePreprintMetric): - pass diff --git a/osf/metrics/registry_metrics.py b/osf/metrics/registry_metrics.py deleted file mode 100644 index 9c779fe8c0b..00000000000 --- a/osf/metrics/registry_metrics.py +++ /dev/null @@ -1,176 +0,0 @@ -import elasticsearch_metrics.imps.elastic6 as metrics - -from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates -from .metric_mixin import MetricMixin - - -class RegistriesModerationMetrics(MetricMixin, metrics.Metric): - registration_id = metrics.Keyword(index=True, doc_values=True, required=True) - provider_id = metrics.Keyword(index=True, doc_values=True, required=True) - trigger = metrics.Keyword(index=True, doc_values=True, required=True) - from_state = metrics.Keyword(index=True, doc_values=True, required=True) - to_state = metrics.Keyword(index=True, doc_values=True, required=True) - user_id = metrics.Keyword(index=True, doc_values=True, required=True) - comment = metrics.Keyword(index=True) - - class Index: - settings = { - 'number_of_shards': 1, - 'number_of_replicas': 1, - 'refresh_interval': '1s', - } - - class Meta: - source = metrics.MetaField(enabled=True) - - @classmethod - def record_transitions(cls, action): - return cls.record( - registration_id=action.target._id, - provider_id=action.target.provider._id, - from_state=action.from_state, - to_state=action.to_state, - trigger=action.trigger, - user_id=action.creator._id, - comment=action.comment, - ) - - @classmethod - def get_registries_info(cls) -> dict: - """ - Gets metrics info for each registry - expected output: - { - 'doc_count_error_upper_bound': 0, - 'sum_other_doc_count': 0, - 'buckets': [{ - 'key': 'osf', - 'doc_count': 6, - 'rejected': {'doc_count': 0}, - 'submissions': {'doc_count': 3}, - 'not_embargoed_but_accepted': {'doc_count': 0}, - 'withdrawn': {'doc_count': 0}, - 'transitions_without_comments': {'doc_count': 1}, - 'embargoed': {'doc_count': 0}, - 'transitions_with_comments': {'doc_count': 5} - }, - { - 'key': 'provider2', - 'doc_count': 4, - 'rejected': {'doc_count': 1}, - 'submissions': {'doc_count': 1}, - 'not_embargoed_but_accepted': {'doc_count': 1}, - 'withdrawn': {'doc_count': 0}, - 'transitions_without_comments': {'doc_count': 0}, - 'embargoed': {'doc_count': 0}, - 'transitions_with_comments': {'doc_count': 4} - }] - } - :return: dict - """ - search = cls.search() - - return search.update_from_dict({ - 'aggs': { - 'providers': { - 'terms': { - 'field': 'provider_id' - }, - 'aggs': { - 'transitions_without_comments': { - 'missing': { - 'field': 'comment' - } - }, - 'transitions_with_comments': { - 'filter': { - 'exists': { - 'field': 'comment' - } - } - }, - 'submissions': { - 'filter': { - 'match': { - 'trigger': { - 'query': RegistrationModerationTriggers.SUBMIT.db_name - } - } - } - }, - 'accepted_with_embargo': { - 'filter': { - 'bool': { - 'must': [ - { - 'match': { - 'to_state': RegistrationModerationStates.EMBARGO.db_name - } - }, - { - 'match': { - 'trigger': RegistrationModerationTriggers.SUBMIT.db_name - } - } - ] - } - } - }, - 'accepted_without_embargo': { - 'filter': { - 'bool': { - 'must': [ - { - 'match': { - 'to_state': RegistrationModerationStates.ACCEPTED.db_name - } - }, - { - 'match': { - 'trigger': RegistrationModerationTriggers.SUBMIT.db_name - } - } - ] - } - } - }, - 'rejected': { - 'filter': { - 'bool': { - 'must': [ - { - 'match': { - 'to_state': RegistrationModerationStates.REJECTED.db_name - } - }, - { - 'match': { - 'trigger': RegistrationModerationTriggers.REJECT_SUBMISSION.db_name - } - } - ] - } - } - }, - 'withdrawn': { - 'filter': { - 'bool': { - 'must': [ - { - 'match': { - 'to_state': RegistrationModerationStates.WITHDRAWN.db_name - } - }, - { - 'match': { - 'trigger': RegistrationModerationTriggers.ACCEPT_WITHDRAWAL.db_name - } - } - ] - } - } - }, - } - } - } - }).execute().aggregations['providers'].to_dict() diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index 707e869522b..6f1d183ee6e 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -1,10 +1,11 @@ -from collections import abc +import collections import dataclasses import logging import celery -from osf.metrics.reports import MonthlyReport +from osf.metrics.daily_reports import BaseDailyReport +from osf.metrics.monthly_reports import BaseMonthlyReport from osf.metrics.utils import YearMonth @@ -15,22 +16,28 @@ class MonthlyReporter: yearmonth: YearMonth - def iter_report_kwargs(self, continue_after: dict | None = None) -> abc.Iterator[dict]: - # override for multiple reports per month + def iter_report_kwargs(self, continue_after: dict | None = None) -> collections.abc.Iterator[dict]: + """yield kwargs that can be passed to `report` (in separate async tasks) + + by default, `report` is called once with empty kwargs + (override for multiple reports per month) + """ if continue_after is None: - yield {} # by default, calls `.report()` once with no kwargs + yield {} - def report(self, **report_kwargs) -> MonthlyReport | None: - """build a report for the given month + def report(self, **report_kwargs) -> collections.abc.Iterator[BaseMonthlyReport]: + """yield reports for the given month and kwargs (from iter_report_kwargs) """ raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') def followup_task(self, report) -> celery.Signature | None: + """return a task signature that will be enqueued after the report is saved + """ return None class DailyReporter: - def report(self, report_date): + def report(self, report_date) -> collections.abc.Iterator[BaseDailyReport]: """build reports for the given date return an iterable of DailyReport (unsaved) @@ -38,9 +45,7 @@ def report(self, report_date): raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') def run_and_record_for_date(self, report_date): - reports = self.report(report_date) - # expecting each reporter to spit out only a handful of reports per day; - # not bothering with bulk-create - for report in reports: + # not bothering with bulk-create (this allows multiple types of reports) + for report in self.report(report_date): report.save() diff --git a/osf/metrics/reporters/download_count.py b/osf/metrics/reporters/download_count.py index 4350c1440a1..7e98d24a326 100644 --- a/osf/metrics/reporters/download_count.py +++ b/osf/metrics/reporters/download_count.py @@ -1,22 +1,12 @@ from osf.models import PageCounter -from osf.metrics.reports import DownloadCountReport -from osf.metrics.es8_metrics import DailyDownloadCountReportEs8 -from osf.metrics.utils import cycle_coverage_date +from osf.metrics.daily_reports import DailyDownloadCountReport from ._base import DailyReporter class DownloadCountReporter(DailyReporter): def report(self, date): download_count = int(PageCounter.get_all_downloads_on_date(date) or 0) - reports = [] - report_es8 = DailyDownloadCountReportEs8( - cycle_coverage=cycle_coverage_date(date), - daily_file_downloads=download_count, - ) - reports.append(report_es8) - report = DownloadCountReport( - daily_file_downloads=report_es8.daily_file_downloads, + yield DailyDownloadCountReport( report_date=date, + daily_file_downloads=download_count, ) - reports.append(report) - return reports diff --git a/osf/metrics/reporters/institution_summary.py b/osf/metrics/reporters/institution_summary.py index 1148f2456e5..b34e22c1e1f 100644 --- a/osf/metrics/reporters/institution_summary.py +++ b/osf/metrics/reporters/institution_summary.py @@ -2,19 +2,8 @@ from django.db.models import Q -from osf.metrics.reports import ( - InstitutionSummaryReport, - RunningTotal, - NodeRunningTotals, - RegistrationRunningTotals, -) from osf.models import Institution -from osf.metrics.es8_metrics import ( - DailyInstitutionSummaryReportEs8, - RunningTotal as RunningTotalEs8, - NodeRunningTotals as NodeRunningTotalsEs8, - RegistrationRunningTotals as RegistrationRunningTotalsEs8 -) +from osf.metrics.daily_reports import DailyInstitutionSummaryReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -25,7 +14,6 @@ class InstitutionSummaryReporter(DailyReporter): def report(self, date): institutions = Institution.objects.all() - reports = [] daily_query = Q(created__date=date) public_query = Q(is_public=True) @@ -45,15 +33,15 @@ def report(self, date): created__date__lte=date, type='osf.registration', ) - report_es8 = DailyInstitutionSummaryReportEs8( + yield DailyInstitutionSummaryReport( cycle_coverage=cycle_coverage_date(date), institution_id=institution._id, institution_name=institution.name, - users=RunningTotalEs8( + users=dict( total=institution.get_institution_users().filter(is_active=True).count(), total_daily=institution.get_institution_users().filter(date_confirmed__date=date).count(), ), - nodes=NodeRunningTotalsEs8( + nodes=dict( total=node_qs.count(), public=node_qs.filter(public_query).count(), private=node_qs.filter(private_query).count(), @@ -63,7 +51,7 @@ def report(self, date): private_daily=node_qs.filter(private_query & daily_query).count(), ), # Projects use get_roots to remove children - projects=NodeRunningTotalsEs8( + projects=dict( total=node_qs.get_roots().count(), public=node_qs.filter(public_query).get_roots().count(), private=node_qs.filter(private_query).get_roots().count(), @@ -72,7 +60,7 @@ def report(self, date): public_daily=node_qs.filter(public_query & daily_query).get_roots().count(), private_daily=node_qs.filter(private_query & daily_query).get_roots().count(), ), - registered_nodes=RegistrationRunningTotalsEs8( + registered_nodes=dict( total=registration_qs.count(), public=registration_qs.filter(public_query).count(), embargoed=registration_qs.filter(private_query).count(), @@ -83,7 +71,7 @@ def report(self, date): embargoed_daily=registration_qs.filter(private_query & daily_query).count(), embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).count(), ), - registered_projects=RegistrationRunningTotalsEs8( + registered_projects=dict( total=registration_qs.get_roots().count(), public=registration_qs.filter(public_query).get_roots().count(), embargoed=registration_qs.filter(private_query).get_roots().count(), @@ -96,58 +84,3 @@ def report(self, date): private_query & daily_query & embargo_v2_query).get_roots().count(), ), ) - reports.append(report_es8) - - report = InstitutionSummaryReport( - report_date=date, - institution_id=institution._id, - institution_name=institution.name, - users=RunningTotal( - total=report_es8.users.total, - total_daily=report_es8.users.total_daily, - ), - nodes=NodeRunningTotals( - total=report_es8.nodes.total, - public=report_es8.nodes.public, - private=report_es8.nodes.private, - - total_daily=report_es8.nodes.total_daily, - public_daily=report_es8.nodes.public_daily, - private_daily=report_es8.nodes.private_daily, - ), - # Projects use get_roots to remove children - projects=NodeRunningTotals( - total=report_es8.projects.total, - public=report_es8.projects.public, - private=report_es8.projects.private, - - total_daily=report_es8.projects.total_daily, - public_daily=report_es8.projects.public_daily, - private_daily=report_es8.projects.private_daily, - ), - registered_nodes=RegistrationRunningTotals( - total=report_es8.registered_nodes.total, - public=report_es8.registered_nodes.public, - embargoed=report_es8.registered_nodes.embargoed, - embargoed_v2=report_es8.registered_nodes.embargoed_v2, - - total_daily=report_es8.registered_nodes.total_daily, - public_daily=report_es8.registered_nodes.public_daily, - embargoed_daily=report_es8.registered_nodes.embargoed_daily, - embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily, - ), - registered_projects=RegistrationRunningTotals( - total=report_es8.registered_projects.total, - public=report_es8.registered_projects.public, - embargoed=report_es8.registered_projects.embargoed, - embargoed_v2=report_es8.registered_projects.embargoed_v2, - - total_daily=report_es8.registered_projects.total_daily, - public_daily=report_es8.registered_projects.public_daily, - embargoed_daily=report_es8.registered_projects.embargoed_daily, - embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily, - ), - ) - - reports.append(report) - return reports diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py index 88d8e1fb891..559a2c2ae1a 100644 --- a/osf/metrics/reporters/institution_summary_monthly.py +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -4,13 +4,11 @@ from osf.models import Institution, Preprint, AbstractNode, FileVersion, NodeLog, PreprintLog from osf.models.spam import SpamStatus from addons.osfstorage.models import OsfStorageFile -from osf.metrics.reports import InstitutionMonthlySummaryReport -from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth +from osf.metrics.monthly_reports import MonthlyInstitutionSummaryReport from ._base import MonthlyReporter class InstitutionalSummaryMonthlyReporter(MonthlyReporter): - """Generate an InstitutionMonthlySummaryReport for each institution.""" + """Generate a MonthlyInstitutionSummaryReport for each institution.""" def iter_report_kwargs(self, continue_after: dict | None = None): _inst_qs = Institution.objects.order_by('pk') @@ -20,22 +18,16 @@ def iter_report_kwargs(self, continue_after: dict | None = None): yield {'institution_pk': _pk} def report(self, **report_kwargs): - _institution = Institution.objects.get(pk=report_kwargs['institution_pk']) - reports = self.generate_report(_institution) - return reports - - def generate_report(self, institution): + institution = Institution.objects.get(pk=report_kwargs['institution_pk']) node_queryset = institution.nodes.filter( deleted__isnull=True, created__lt=self.yearmonth.month_end() ).exclude( spam_status=SpamStatus.SPAM, ) - preprint_queryset = self.get_published_preprints(institution, self.yearmonth) - reports = [] - report_es8 = MonthlyInstitutionSummaryReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + yield MonthlyInstitutionSummaryReport( + report_yearmonth=self.yearmonth, institution_id=institution._id, user_count=institution.get_institution_users().count(), private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False), @@ -48,23 +40,6 @@ def generate_report(self, institution): monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth), monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth), ) - reports.append(report_es8) - - report = InstitutionMonthlySummaryReport( - institution_id=report_es8.institution_id, - user_count=report_es8.user_count, - private_project_count=report_es8.private_project_count, - public_project_count=report_es8.public_project_count, - public_registration_count=report_es8.public_registration_count, - embargoed_registration_count=report_es8.embargoed_registration_count, - published_preprint_count=report_es8.published_preprint_count, - storage_byte_count=report_es8.storage_byte_count, - public_file_count=report_es8.public_file_count, - monthly_logged_in_user_count=report_es8.monthly_logged_in_user_count, - monthly_active_user_count=report_es8.monthly_active_user_count, - ) - reports.append(report) - return reports def _get_count(self, node_queryset, node_type, is_public): return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count() diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index a9fba3adfcb..f7a745be6ce 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -6,14 +6,13 @@ from osf import models as osfdb from osf.models.spam import SpamStatus from addons.osfstorage.models import OsfStorageFile -from osf.metrics.reports import InstitutionalUserReport -from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth -from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8 +from osf.metrics.utils import YearMonth +from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport from ._base import MonthlyReporter class InstitutionalUsersReporter(MonthlyReporter): - '''build an InstitutionalUserReport for each institution-user affiliation + '''build a MonthlyInstitutionalUserReport for each institution-user affiliation built for the institution dashboard at ://osf.example/institutions//dashboard/, which offers institutional admins insight into how people at their institution are @@ -39,7 +38,7 @@ def report(self, **report_kwargs): _institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk']) _user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk']) _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth) - return _helper.build_reports() + yield _helper.build_report() # helper @@ -49,10 +48,10 @@ class _InstiUserReportHelper: user: osfdb.OSFUser yearmonth: YearMonth - def build_reports(self): + def build_report(self): _affiliation = self.user.get_institution_affiliation(self.institution._id) - report_es8 = MonthlyInstitutionalUserReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + return MonthlyInstitutionalUserReport( + report_yearmonth=self.yearmonth, institution_id=self.institution._id, user_id=self.user._id, user_name=self.user.fullname, @@ -73,24 +72,6 @@ def build_reports(self): published_preprint_count=self._published_preprint_queryset().count(), storage_byte_count=self._storage_byte_count(), ) - report_es6 = InstitutionalUserReport( - institution_id=report_es8.institution_id, - user_id=report_es8.user_id, - user_name=report_es8.user_name, - department_name=report_es8.department_name, - month_last_login=report_es8.month_last_login, - month_last_active=report_es8.month_last_active, - account_creation_date=report_es8.account_creation_date, - orcid_id=report_es8.orcid_id, - public_project_count=report_es8.public_project_count, - private_project_count=report_es8.private_project_count, - public_registration_count=report_es8.public_registration_count, - embargoed_registration_count=report_es8.embargoed_registration_count, - public_file_count=report_es8.public_file_count, - published_preprint_count=report_es8.published_preprint_count, - storage_byte_count=report_es8.storage_byte_count, - ) - return [report_es8, report_es6] @property def before_datetime(self): diff --git a/osf/metrics/reporters/new_user_domain.py b/osf/metrics/reporters/new_user_domain.py index 125e02754d7..8219d3e0ef3 100644 --- a/osf/metrics/reporters/new_user_domain.py +++ b/osf/metrics/reporters/new_user_domain.py @@ -2,8 +2,7 @@ from collections import Counter from osf.models import OSFUser -from osf.metrics.reports import NewUserDomainReport -from osf.metrics.es8_metrics import DailyNewUserDomainReportEs8 +from osf.metrics.daily_reports import DailyNewUserDomainReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -22,19 +21,9 @@ def report(self, date): email.split('@')[-1] for email in new_user_emails ) - reports = [] for domain_name, count in domain_names.items(): - report_es8 = DailyNewUserDomainReportEs8( + yield DailyNewUserDomainReport( cycle_coverage=cycle_coverage_date(date), domain_name=domain_name, new_user_count=count, ) - reports.append(report_es8) - - report = NewUserDomainReport( - report_date=date, - domain_name=report_es8.domain_name, - new_user_count=report_es8.new_user_count, - ) - reports.append(report) - return reports diff --git a/osf/metrics/reporters/node_count.py b/osf/metrics/reporters/node_count.py index 23f4c9bb78c..48831fdc87e 100644 --- a/osf/metrics/reporters/node_count.py +++ b/osf/metrics/reporters/node_count.py @@ -2,16 +2,7 @@ from django.db.models import Q -from osf.metrics.reports import ( - NodeSummaryReport, - NodeRunningTotals, - RegistrationRunningTotals, -) -from osf.metrics.es8_metrics import ( - DailyNodeSummaryReportEs8, - NodeRunningTotals as NodeRunningTotalsEs8, - RegistrationRunningTotals as RegistrationRunningTotalsEs8 -) +from osf.metrics.daily_reports import DailyNodeSummaryReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -40,11 +31,10 @@ def report(self, date): embargo_v2_query = Q(root__embargo__end_date__date__gt=date) exclude_spam = ~Q(spam_status__in=[SpamStatus.SPAM, SpamStatus.FLAGGED]) - reports = [] - report_es8 = DailyNodeSummaryReportEs8( + yield DailyNodeSummaryReport( cycle_coverage=cycle_coverage_date(date), # Nodes - the number of projects and components - nodes=NodeRunningTotalsEs8( + nodes=dict( total=node_qs.count(), total_excluding_spam=node_qs.filter(exclude_spam).count(), public=node_qs.filter(public_query).count(), @@ -55,7 +45,7 @@ def report(self, date): private_daily=node_qs.filter(private_query & created_today_query).count(), ), # Projects - the number of top-level only projects - projects=NodeRunningTotalsEs8( + projects=dict( total=node_qs.get_roots().count(), total_excluding_spam=node_qs.get_roots().filter(exclude_spam).count(), public=node_qs.filter(public_query).get_roots().count(), @@ -66,7 +56,7 @@ def report(self, date): private_daily=node_qs.filter(private_query & created_today_query).get_roots().count(), ), # Registered Nodes - the number of registered projects and components - registered_nodes=RegistrationRunningTotalsEs8( + registered_nodes=dict( total=registration_qs.count(), public=registration_qs.filter(public_query).count(), embargoed=registration_qs.filter(private_query).count(), @@ -80,7 +70,7 @@ def report(self, date): ), # Registered Projects - the number of registered top level projects - registered_projects=RegistrationRunningTotalsEs8( + registered_projects=dict( total=registration_qs.get_roots().count(), public=registration_qs.filter(public_query).get_roots().count(), embargoed=registration_qs.filter(private_query).get_roots().count(), @@ -93,58 +83,3 @@ def report(self, date): withdrawn_daily=registration_qs.filter(retracted_query & retracted_today_query).get_roots().count(), ), ) - reports.append(report_es8) - report = NodeSummaryReport( - report_date=date, - # Nodes - the number of projects and components - nodes=NodeRunningTotals( - total=report_es8.nodes.total, - total_excluding_spam=report_es8.nodes.total_excluding_spam, - public=report_es8.nodes.public, - private=report_es8.nodes.private, - total_daily=report_es8.nodes.total_daily, - total_daily_excluding_spam=report_es8.nodes.total_daily_excluding_spam, - public_daily=report_es8.nodes.public_daily, - private_daily=report_es8.nodes.private_daily, - ), - # Projects - the number of top-level only projects - projects=NodeRunningTotals( - total=report_es8.projects.total, - total_excluding_spam=report_es8.projects.total_excluding_spam, - public=report_es8.projects.public, - private=report_es8.projects.private, - total_daily=report_es8.projects.total_daily, - total_daily_excluding_spam=report_es8.projects.total_daily_excluding_spam, - public_daily=report_es8.projects.public_daily, - private_daily=report_es8.projects.private_daily, - ), - # Registered Nodes - the number of registered projects and components - registered_nodes=RegistrationRunningTotals( - total=report_es8.registered_nodes.total, - public=report_es8.registered_nodes.public, - embargoed=report_es8.registered_nodes.embargoed, - embargoed_v2=report_es8.registered_nodes.embargoed_v2, - withdrawn=report_es8.registered_nodes.withdrawn, - total_daily=report_es8.registered_nodes.total_daily, - public_daily=report_es8.registered_nodes.public_daily, - embargoed_daily=report_es8.registered_nodes.embargoed_daily, - embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily, - withdrawn_daily=report_es8.registered_nodes.withdrawn_daily, - ), - # Registered Projects - the number of registered top level projects - registered_projects=RegistrationRunningTotals( - total=report_es8.registered_projects.total, - public=report_es8.registered_projects.public, - embargoed=report_es8.registered_projects.embargoed, - embargoed_v2=report_es8.registered_projects.embargoed_v2, - withdrawn=report_es8.registered_projects.withdrawn, - total_daily=report_es8.registered_projects.total_daily, - public_daily=report_es8.registered_projects.public_daily, - embargoed_daily=report_es8.registered_projects.embargoed_daily, - embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily, - withdrawn_daily=report_es8.registered_projects.withdrawn_daily, - ), - ) - reports.append(report) - - return reports diff --git a/osf/metrics/reporters/osfstorage_file_count.py b/osf/metrics/reporters/osfstorage_file_count.py index 6ddeb89945b..5db9ad1cff6 100644 --- a/osf/metrics/reporters/osfstorage_file_count.py +++ b/osf/metrics/reporters/osfstorage_file_count.py @@ -2,12 +2,8 @@ from django.db.models import Q import logging -from osf.metrics.reports import OsfstorageFileCountReport, FileRunningTotals from osf.models import AbstractNode, Preprint -from osf.metrics.es8_metrics import ( - DailyOsfstorageFileCountReportEs8, - FileRunningTotals as FileRunningTotalsEs8 -) +from osf.metrics.daily_reports import DailyOsfstorageFileCountReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -36,11 +32,9 @@ def report(self, date): daily_query = Q(created__date=date) - reports = [] - - report_es8 = DailyOsfstorageFileCountReportEs8( + yield DailyOsfstorageFileCountReport( cycle_coverage=cycle_coverage_date(date), - files=FileRunningTotalsEs8( + files=dict( total=file_qs.count(), public=file_qs.filter(public_query).count(), private=file_qs.filter(private_query).count(), @@ -49,19 +43,3 @@ def report(self, date): private_daily=file_qs.filter(private_query & daily_query).count(), ), ) - reports.append(report_es8) - - report = OsfstorageFileCountReport( - report_date=date, - files=FileRunningTotals( - total=report_es8.files.total, - public=report_es8.files.public, - private=report_es8.files.private, - total_daily=report_es8.files.total_daily, - public_daily=report_es8.files.public_daily, - private_daily=report_es8.files.private_daily, - ), - ) - reports.append(report) - - return reports diff --git a/osf/metrics/reporters/preprint_count.py b/osf/metrics/reporters/preprint_count.py index 85ba639a32f..6cafa063c62 100644 --- a/osf/metrics/reporters/preprint_count.py +++ b/osf/metrics/reporters/preprint_count.py @@ -1,9 +1,8 @@ import logging import requests -from osf.metrics import PreprintSummaryReport from website import settings -from osf.metrics.es8_metrics import DailyPreprintSummaryReportEs8 +from osf.metrics.daily_reports import DailyPreprintSummaryReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -46,24 +45,12 @@ class PreprintCountReporter(DailyReporter): def report(self, date): from osf.models import PreprintProvider - reports = [] for preprint_provider in PreprintProvider.objects.all(): elastic_query = get_elastic_query(date, preprint_provider) resp = requests.post(f'{settings.SHARE_URL}api/v2/search/creativeworks/_search', json=elastic_query).json() - report_es8 = DailyPreprintSummaryReportEs8( + yield DailyPreprintSummaryReport( cycle_coverage=cycle_coverage_date(date), provider_key=preprint_provider._id, preprint_count=resp['hits']['total'], ) - reports.append(report_es8) - - report = PreprintSummaryReport( - report_date=date, - provider_key=report_es8.provider_key, - preprint_count=report_es8.preprint_count, - ) - reports.append(report) - logger.info('{} Preprints counted for the provider {}'.format(resp['hits']['total'], preprint_provider.name)) - - return reports diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index fde545247e6..c5f91206a7e 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -1,8 +1,6 @@ -from osf.metrics.reports import PrivateSpamMetricsReport from osf.external.oopspam.client import OOPSpamClient from osf.external.askismet.client import AkismetClient -from osf.metrics.es8_metrics import MonthlyPrivateSpamMetricsReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth +from osf.metrics.monthly_reports import MonthlyPrivateSpamMetricsReport from ._base import MonthlyReporter @@ -16,10 +14,8 @@ def report(self): oopspam_client = OOPSpamClient() akismet_client = AkismetClient() - reports = [] - - report_es8 = MonthlyPrivateSpamMetricsReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + yield MonthlyPrivateSpamMetricsReport( + report_yearmonth=self.yearmonth, node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), @@ -29,19 +25,3 @@ def report(self): preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'), preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint') ) - reports.append(report_es8) - - report = PrivateSpamMetricsReport( - report_yearmonth=str(self.yearmonth), - node_oopspam_flagged=report_es8.node_oopspam_flagged, - node_oopspam_hammed=report_es8.node_oopspam_hammed, - node_akismet_flagged=report_es8.node_akismet_flagged, - node_akismet_hammed=report_es8.node_akismet_hammed, - preprint_oopspam_flagged=report_es8.preprint_oopspam_flagged, - preprint_oopspam_hammed=report_es8.preprint_oopspam_hammed, - preprint_akismet_flagged=report_es8.preprint_akismet_flagged, - preprint_akismet_hammed=report_es8.preprint_akismet_hammed, - ) - reports.append(report) - - return reports diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index 985a1213be2..0fab423f85e 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -2,34 +2,12 @@ import datetime import typing -import waffle +from elasticsearch8 import dsl as esdsl -from osf.metrics.es8_metrics import MonthlyPublicItemUsageReportEs8 - -if typing.TYPE_CHECKING: - import elasticsearch6_dsl as edsl - -import osf.features from osf.metadata.osf_gathering import OsfmapPartition -from osf.metrics.counted_usage import ( - CountedAuthUsage, - get_provider_id, - get_item_type as get_legacy_item_type, -) -from osf.metrics.preprint_metrics import ( - PreprintDownload, - PreprintView, -) -from osf.metrics.reports import PublicItemUsageReport -from osf.metrics.utils import ( - YearMonth, - cycle_coverage_yearmonth, - get_database_iri, - get_item_type, -) -from osf import models as osfdb -from osf.models.base import osfid_iri -from website import settings as website_settings +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport +from osf.metrics.events import OsfCountedUsageEvent +from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth from ._base import MonthlyReporter @@ -48,60 +26,20 @@ class PublicItemUsageReporter(MonthlyReporter): includes projects, project components, registrations, registration components, and preprints ''' def iter_report_kwargs(self, continue_after: dict | None = None): - _after_osfid = continue_after['osfid'] if continue_after else None - for _osfid in _zip_sorted( - self._countedusage_osfids(_after_osfid), - self._preprintview_osfids(_after_osfid), - self._preprintdownload_osfids(_after_osfid), - ): - yield {'osfid': _osfid} + _after_item_iri = continue_after['item_iri'] if continue_after else None + for _item_iri in self._each_item_iri(_after_item_iri): + yield {'item_iri': _item_iri} def report(self, **report_kwargs): - _osfid = report_kwargs['osfid'] - # get usage metrics from several sources: - # - osf.metrics.counted_usage: - # - views and downloads for each item (using `CountedAuthUsage.item_guid`) - # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) - # - osf.metrics.preprint_metrics: - # - preprint views and downloads - # - PageCounter? (no) + _item_iri = report_kwargs['item_iri'] try: - _guid = osfdb.Guid.load(_osfid) - if _guid is None or _guid.referent is None: - raise _SkipItem - _obj = _guid.referent - _report = self._init_report(_obj) - self._fill_report_counts(_report, _obj) - if not any(( - _report.view_count, - _report.view_session_count, - _report.download_count, - _report.download_session_count, - )): - raise _SkipItem - _report_es6 = PublicItemUsageReport( - item_osfid=_report.item_osfids[0], - item_type=[get_legacy_item_type(_obj)], - provider_id=list(_report.provider_ids), - platform_iri=list(_report.platform_iris), - view_count=_report.view_count, - view_session_count=_report.view_session_count, - download_count=_report.download_count, - download_session_count=_report.download_session_count, - ) - return [_report, _report_es6] + yield self._build_report(_item_iri) except _SkipItem: - return [] + pass def followup_task(self, report): _last_month = YearMonth.from_date(datetime.date.today()).prior() - if isinstance(report, MonthlyPublicItemUsageReportEs8): - _is_last_month = (report.cycle_coverage == cycle_coverage_yearmonth(_last_month)) - elif isinstance(report, PublicItemUsageReport): - return None # followup for only one of the two reports - else: - raise ValueError(report) - if _is_last_month: + if report.report_yearmonth == _last_month: from api.share.utils import task__update_share return task__update_share.signature( args=(report.item_osfids[0],), @@ -112,212 +50,121 @@ def followup_task(self, report): countdown=30, # give index time to settle ) - def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + def _each_item_iri(self, after_item_iri: str | None) -> typing.Iterator[str]: _search = self._base_usage_search() _search.aggs.bucket( - 'agg_osfid', + 'agg_item_iri', 'composite', - sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + sources=[{'item_iri': {'terms': {'field': 'item_iri'}}}], size=_CHUNK_SIZE, ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) + return _iter_composite_bucket_keys(_search, 'agg_item_iri', 'item_iri', after=after_item_iri) - def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: - _search = ( - PreprintView.search() - .filter('range', timestamp={ - 'gte': self.yearmonth.month_start(), - 'lt': self.yearmonth.month_end(), - }) - .extra(size=0) # only aggregations, no hits - ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_CHUNK_SIZE, + def _build_report(self, item_iri) -> MonthlyPublicItemUsageReport: + # get usage metrics from OsfCountedUsageEvent: + # - views of the item and its components and files (matching `within_iris`) + # - downloads for each item (matching `item_iri`) + _search = self._build_usage_counts_search(item_iri) + _response = _search.execute() + _views_bucket = _response.aggregations.agg_by_label.buckets.views + _downloads_bucket = _response.aggregations.agg_by_label.buckets.downloads + _fields_agg = _response.aggregations.agg_for_terms + _report = MonthlyPublicItemUsageReport( + report_yearmonth=self.yearmonth, + item_iri=item_iri, + item_osfids=_bucket_keys(_fields_agg.item_osfids.buckets), + database_iris=_bucket_keys(_fields_agg.database_iris.buckets), + platform_iris=_bucket_keys(_fields_agg.platform_iris.buckets), + provider_ids=_bucket_keys(_fields_agg.provider_ids.buckets), + item_types=_bucket_keys(_fields_agg.item_types.buckets), + view_count=_views_bucket.doc_count, + view_session_count=_views_bucket.agg_session_count.value, + download_count=_downloads_bucket.doc_count, + download_session_count=_downloads_bucket.agg_session_count.value, + # same as non-cumulative counts, unless there's a prior report (added below) + cumulative_view_count=_views_bucket.doc_count, + cumulative_view_session_count=_views_bucket.agg_session_count.value, + cumulative_download_count=_downloads_bucket.doc_count, + cumulative_download_session_count=_downloads_bucket.agg_session_count.value, ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: - _search = ( - PreprintDownload.search() + _prior = self._prior_usage_report(item_iri) + if _prior is not None: + _report.cumulative_view_count += _prior.cumulative_view_count + _report.cumulative_view_session_count += _prior.cumulative_view_session_count + _report.cumulative_download_count += _prior.cumulative_download_count + _report.cumulative_download_session_count += _prior.cumulative_download_session_count + return _report + + def _base_usage_search(self): + return ( + OsfCountedUsageEvent.search() + .filter('term', item_public=True) .filter('range', timestamp={ - 'gte': self.yearmonth.month_start(), 'lt': self.yearmonth.month_end(), + 'gte': self.yearmonth.month_start() }) .extra(size=0) # only aggregations, no hits ) - _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], - size=_CHUNK_SIZE, - ) - return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - - def _init_report(self, osf_obj) -> MonthlyPublicItemUsageReportEs8: - if not _is_item_public(osf_obj): - raise _SkipItem - return MonthlyPublicItemUsageReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), - item_iri=osfid_iri(osf_obj._id), - item_osfids=[osf_obj._id], - item_types=[get_item_type(osf_obj)], - provider_ids=[get_provider_id(osf_obj)], - database_iris=[get_database_iri(osf_obj)], - platform_iris=[website_settings.DOMAIN], - # leave counts null; will be set if there's data - ) - - def _fill_report_counts(self, report, osf_obj): - if ( - isinstance(osf_obj, osfdb.Preprint) - and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined] - ): - # note: no session-count info in preprint metrics - report.view_count = PreprintView.get_count_for_preprint( - preprint=osf_obj, - after=self.yearmonth.month_start(), - before=self.yearmonth.month_end(), - ) - report.download_count = PreprintDownload.get_count_for_preprint( - preprint=osf_obj, - after=self.yearmonth.month_start(), - before=self.yearmonth.month_end(), - ) - report.cumulative_view_count = PreprintView.get_count_for_preprint( - preprint=osf_obj, - before=self.yearmonth.month_end(), - ) - report.cumulative_download_count = PreprintDownload.get_count_for_preprint( - preprint=osf_obj, - before=self.yearmonth.month_end(), - ) - else: - ( - report.view_count, - report.view_session_count, - ) = self._countedusage_view_counts(osf_obj, cumulative=False) - ( - report.download_count, - report.download_session_count, - ) = self._countedusage_download_counts(osf_obj, cumulative=False) - ( - report.cumulative_view_count, - report.cumulative_view_session_count, - ) = self._countedusage_view_counts(osf_obj, cumulative=True) - - ( - report.cumulative_download_count, - report.cumulative_download_session_count, - ) = self._countedusage_download_counts(osf_obj, cumulative=True) - - def _base_usage_search(self, cumulative: bool = False): - timestamp_filter = { - 'lt': self.yearmonth.month_end(), - } - if not cumulative: - timestamp_filter['gte'] = self.yearmonth.month_start() - return ( - CountedAuthUsage.search() - .filter('term', item_public=True) - .filter('range', timestamp=timestamp_filter) - .extra(size=0) # only aggregations, no hits - ) - - def _countedusage_view_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: - '''compute view_session_count separately to avoid double-counting - - (the same session may be represented in both the composite agg on `item_guid` - and that on `surrounding_guids`) + def _build_usage_counts_search(self, item_iri, cumulative: bool = False) -> tuple[int, int]: + '''get usage counts for the given item_iri ''' - _search = ( - self._base_usage_search(cumulative=cumulative) - .query( - 'bool', - filter=[ - {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, - ], - should=[ - {'term': {'item_guid': osf_obj._id}}, - {'term': {'surrounding_guids': osf_obj._id}}, - ], - minimum_should_match=1, - ) - ) - _search.aggs.metric( + _search = self._base_usage_search().filter('term', within_iris=item_iri) + + # aggregation for counts by action label (views, downloads) + _agg_by_label = esdsl.A('filters', filters={ + # bucket for views (including items within) + 'views': {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.VIEW.value}}, + # bucket for downloads (excluding items within) + 'downloads': { + 'bool': { + 'filter': [ + {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.DOWNLOAD.value}}, + {'term': {'item_iri': item_iri}}, + ], + }, + }, + }) + # session count for each label bucket + _agg_by_label.metric( 'agg_session_count', 'cardinality', - field='session_id', + field='sessionhour_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) - _response = _search.execute() - _view_count = _response.hits.total - _view_session_count = ( - _response.aggregations.agg_session_count.value - if 'agg_session_count' in _response.aggregations - else 0 - ) - return (_view_count, _view_session_count) + _search.aggs.bucket('agg_by_label', _agg_by_label) - def _countedusage_download_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: - '''aggregate downloads on each osfid (not including components/files)''' - _search = ( - self._base_usage_search(cumulative=cumulative) - .filter('term', item_guid=osf_obj._id) - .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) - ) - # agg: get download session count - _search.aggs.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - _response = _search.execute() - _download_count = _response.hits.total - _download_session_count = ( - _response.aggregations.agg_session_count.value - if 'agg_session_count' in _response.aggregations - else 0 - ) - return (_download_count, _download_session_count) + # aggregation for getting terms used on usage events directly on the item + # (excluding items within) -- usually one value per field per item, but could be more + _agg_for_terms = esdsl.A('filter', term={'item_iri': item_iri}) + _agg_for_terms.bucket('item_osfids', esdsl.A('terms', field='item_osfid')) + _agg_for_terms.bucket('item_types', esdsl.A('terms', field='item_type')) + _agg_for_terms.bucket('platform_iris', esdsl.A('terms', field='platform_iri')) + _agg_for_terms.bucket('database_iris', esdsl.A('terms', field='database_iri')) + _agg_for_terms.bucket('provider_ids', esdsl.A('terms', field='provider_id')) + _search.aggs.bucket('agg_for_terms', _agg_for_terms) + return _search -def _is_item_public(osfid_referent) -> bool: - if isinstance(osfid_referent, osfdb.Preprint): - return bool(osfid_referent.verified_publishable) # quacks like Preprint - return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode + def _prior_usage_report(self, item_iri): + _search = ( + MonthlyPublicItemUsageReport.search() + .filter('term', item_iri=item_iri) + .filter('range', cycle_coverage={ + 'lt': cycle_coverage_yearmonth(self.yearmonth), + }) + .sort('-cycle_coverage') # most recent first + ) + _response = _search[0].execute() + return _response[0] if _response else None -def _zip_sorted( - *iterators: typing.Iterator[str], -) -> typing.Iterator[str]: - '''loop thru multiple iterators on sorted (ascending) sequences of strings - ''' - _nexts = { # holds the next value from each iterator, or None - _i: next(_iter, None) - for _i, _iter in enumerate(iterators) - } - while True: - _nonnull_nexts = [ - _next - for _next in _nexts.values() - if _next is not None - ] - if not _nonnull_nexts: - return # all done - _value = min(_nonnull_nexts) - yield _value - for _i, _iter in enumerate(iterators): - if _nexts[_i] == _value: - _nexts[_i] = next(_iter, None) +def _bucket_keys(buckets): + return [_bucket['key'] for _bucket in buckets] def _iter_composite_bucket_keys( - search: edsl.Search, + search: esdsl.Search, composite_agg_name: str, composite_source_name: str, after: str | None = None, diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 2fbac671ad1..50af17ab452 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -1,10 +1,8 @@ from osf.models import OSFUser -from osf.metrics.reports import SpamSummaryReport from osf.models import PreprintLog, NodeLog from osf.models.spam import SpamStatus -from osf.metrics.es8_metrics import MonthlySpamSummaryReportEs8 -from osf.metrics.utils import cycle_coverage_yearmonth +from osf.metrics.monthly_reports import MonthlySpamSummaryReport from ._base import MonthlyReporter class SpamCountReporter(MonthlyReporter): @@ -13,9 +11,8 @@ def report(self, **report_kwargs): assert not report_kwargs target_month = self.yearmonth.month_start() next_month = self.yearmonth.month_end() - reports = [] - report_es8 = MonthlySpamSummaryReportEs8( - cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), + yield MonthlySpamSummaryReport( + report_yearmonth=self.yearmonth, node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, created__gt=target_month, @@ -81,23 +78,3 @@ def report(self, **report_kwargs): created__lt=next_month, ).count() ) - reports.append(report_es8) - report = SpamSummaryReport( - # Node Log entries - node_confirmed_spam=report_es8.node_confirmed_spam, - node_confirmed_ham=report_es8.node_confirmed_ham, - node_flagged=report_es8.node_flagged, - # Registration Log entries - registration_confirmed_spam=report_es8.registration_confirmed_spam, - registration_confirmed_ham=report_es8.registration_confirmed_ham, - registration_flagged=report_es8.registration_flagged, - # Preprint Log entries - preprint_confirmed_spam=report_es8.preprint_confirmed_spam, - preprint_confirmed_ham=report_es8.preprint_confirmed_ham, - preprint_flagged=report_es8.preprint_flagged, - # New Users marked as Spam/Ham - user_marked_as_spam=report_es8.user_marked_as_spam, - user_marked_as_ham=report_es8.user_marked_as_ham, - ) - reports.append(report) - return reports diff --git a/osf/metrics/reporters/storage_addon_usage.py b/osf/metrics/reporters/storage_addon_usage.py index af6dbb3ebdd..053470d578c 100644 --- a/osf/metrics/reporters/storage_addon_usage.py +++ b/osf/metrics/reporters/storage_addon_usage.py @@ -10,10 +10,9 @@ ) from addons.base.models import BaseOAuthUserSettings, BaseOAuthNodeSettings -from osf.metrics.reports import StorageAddonUsage from osf.models import SpamStatus, Tag from website import settings -from osf.metrics.es8_metrics import DailyStorageAddonUsageReportEs8 +from osf.metrics.daily_reports import DailyStorageAddonUsageReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -168,13 +167,7 @@ def report(self, date): 'total_daily': node_counts.get('deleted_daily', 0), }, }) - return [ - DailyStorageAddonUsageReportEs8( - cycle_coverage=cycle_coverage_date(date), - usage_by_addon=_usages_by_addon, - ), - StorageAddonUsage( - report_date=date, - usage_by_addon=_usages_by_addon, - ), - ] + yield DailyStorageAddonUsageReport( + cycle_coverage=cycle_coverage_date(date), + usage_by_addon=_usages_by_addon, + ) diff --git a/osf/metrics/reporters/user_count.py b/osf/metrics/reporters/user_count.py index 121b830c466..8a11b4a41d7 100644 --- a/osf/metrics/reporters/user_count.py +++ b/osf/metrics/reporters/user_count.py @@ -1,7 +1,6 @@ from osf.models import OSFUser -from osf.metrics import UserSummaryReport -from osf.metrics.es8_metrics import DailyUserSummaryReportEs8 +from osf.metrics.daily_reports import DailyUserSummaryReport from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter @@ -9,8 +8,7 @@ class UserCountReporter(DailyReporter): def report(self, report_date): - reports = [] - report_es8 = DailyUserSummaryReportEs8( + yield DailyUserSummaryReport( cycle_coverage=cycle_coverage_date(report_date), active=OSFUser.objects.filter(is_active=True, date_confirmed__date__lte=report_date).count(), deactivated=OSFUser.objects.filter(date_disabled__isnull=False, date_disabled__date__lte=report_date).count(), @@ -19,16 +17,3 @@ def report(self, report_date): new_users_with_institution_daily=OSFUser.objects.filter(is_active=True, date_confirmed__date=report_date, institutionaffiliation__isnull=False).count(), unconfirmed=OSFUser.objects.filter(date_registered__date__lte=report_date, date_confirmed__isnull=True).count(), ) - reports.append(report_es8) - report = UserSummaryReport( - report_date=report_date, - active=report_es8.active, - deactivated=report_es8.deactivated, - merged=report_es8.merged, - new_users_daily=report_es8.new_users_daily, - new_users_with_institution_daily=report_es8.new_users_with_institution_daily, - unconfirmed=report_es8.unconfirmed, - ) - reports.append(report) - - return reports diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py deleted file mode 100644 index 62479e359cd..00000000000 --- a/osf/metrics/reports.py +++ /dev/null @@ -1,353 +0,0 @@ -from __future__ import annotations -from collections import abc -import datetime - -from django.dispatch import receiver -from elasticsearch6_dsl import InnerDoc -import elasticsearch_metrics.imps.elastic6 as metrics -from elasticsearch_metrics.signals import pre_save as metrics_pre_save - -from osf.metrics.utils import stable_key, YearMonth - - -class ReportInvalid(Exception): - """Tried to save a report with invalid something-or-other - """ - pass - - -class DailyReport(metrics.Metric): - """DailyReport (abstract base for report-based metrics) - - There's something we'd like to know about every so often, - so let's regularly run a report and stash the results here. - """ - UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_date',) # override in subclasses for multiple reports per day - - report_date = metrics.Date(format='strict_date', required=True) - - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' - - def save(self, *args, **kwargs): - if self.timestamp is None: - self.timestamp = datetime.datetime( - self.report_date.year, - self.report_date.month, - self.report_date.day, - tzinfo=datetime.UTC, - ) - super().save(*args, **kwargs) - - class Meta: - abstract = True - dynamic = metrics.MetaField('strict') - source = metrics.MetaField(enabled=True) - - -class YearmonthField(metrics.Date): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, format='strict_year_month') - - def deserialize(self, data): - if isinstance(data, int): - # elasticsearch stores dates in milliseconds since the unix epoch - _as_datetime = datetime.datetime.fromtimestamp(data // 1000) - return YearMonth.from_date(_as_datetime) - elif data is None: - return None - try: - return YearMonth.from_any(data) - except ValueError: - raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') - - def serialize(self, data): - if isinstance(data, str): - return data - elif isinstance(data, YearMonth): - return str(data) - elif isinstance(data, (datetime.datetime, datetime.date)): - return str(YearMonth.from_date(data)) - elif data is None: - return None - else: - raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') - - -class MonthlyReport(metrics.Metric): - """MonthlyReport (abstract base for report-based metrics that run monthly) - """ - UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_yearmonth',) # override in subclasses for multiple reports per month - - report_yearmonth = YearmonthField(required=True) - - class Meta: - abstract = True - dynamic = metrics.MetaField('strict') - source = metrics.MetaField(enabled=True) - - @classmethod - def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None: - _search = base_search or cls.search() - _search = _search[0:0] # omit hits - _search.aggs.bucket( - 'agg_most_recent_yearmonth', - 'terms', - field='report_yearmonth', - order={'_key': 'desc'}, - size=1, - ) - _response = _search.execute() - if not _response.aggregations: - return None - - buckets = _response.aggregations.agg_most_recent_yearmonth.buckets - if not buckets: - return None - - return buckets[0].key - - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' - - def save(self, *args, **kwargs): - if self.timestamp is None: - self.timestamp = YearMonth.from_any(self.report_yearmonth).month_start() - super().save(*args, **kwargs) - - -@receiver(metrics_pre_save) -def set_report_id(sender, instance, **kwargs): - if not issubclass(sender, metrics.Metric): - return # skip es8 record types - try: - _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS - except AttributeError: - pass - else: - # Set the document id to a hash of "unique together" fields - # for "ON CONFLICT UPDATE" behavior -- if the document - # already exists, it will be updated rather than duplicated. - # Cannot detect/avoid conflicts this way, but that's ok. - _key_values = [] - for _field_name in _unique_together_fields: - _field_value = getattr(instance, _field_name) - if not _field_value or ( - isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str) - ): - raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') - _key_values.append(_field_value) - instance.meta.id = stable_key(*_key_values) - - -#### BEGIN reusable inner objects ##### - -class RunningTotal(InnerDoc): - total = metrics.Integer() - total_daily = metrics.Integer() - -class FileRunningTotals(InnerDoc): - total = metrics.Integer() - public = metrics.Integer() - private = metrics.Integer() - total_daily = metrics.Integer() - public_daily = metrics.Integer() - private_daily = metrics.Integer() - -class NodeRunningTotals(InnerDoc): - total = metrics.Integer() - total_excluding_spam = metrics.Integer() - public = metrics.Integer() - private = metrics.Integer() - total_daily = metrics.Integer() - total_daily_excluding_spam = metrics.Integer() - public_daily = metrics.Integer() - private_daily = metrics.Integer() - -class RegistrationRunningTotals(InnerDoc): - total = metrics.Integer() - public = metrics.Integer() - embargoed = metrics.Integer() - embargoed_v2 = metrics.Integer() - withdrawn = metrics.Integer() - total_daily = metrics.Integer() - public_daily = metrics.Integer() - embargoed_daily = metrics.Integer() - embargoed_v2_daily = metrics.Integer() - withdrawn_daily = metrics.Integer() - -##### END reusable inner objects ##### - - -# TODO: -# class ActiveUsersReport(DailyReport): -# past_day = metrics.Integer() -# past_week = metrics.Integer() -# past_30_days = metrics.Integer() -# past_year = metrics.Integer() - - -class UsageByStorageAddon(InnerDoc): - addon_shortname = metrics.Keyword() - - enabled_usersettings = metrics.Object(RunningTotal) - linked_usersettings = metrics.Object(RunningTotal) - deleted_usersettings = metrics.Object(RunningTotal) - usersetting_links = metrics.Object(RunningTotal) - - connected_nodesettings = metrics.Object(RunningTotal) - disconnected_nodesettings = metrics.Object(RunningTotal) - deleted_nodesettings = metrics.Object(RunningTotal) - - -class StorageAddonUsage(DailyReport): - usage_by_addon = metrics.Object(UsageByStorageAddon, multi=True) - - -class DownloadCountReport(DailyReport): - daily_file_downloads = metrics.Integer() - - -class InstitutionSummaryReport(DailyReport): - UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) - - institution_id = metrics.Keyword() - institution_name = metrics.Keyword() - users = metrics.Object(RunningTotal) - nodes = metrics.Object(NodeRunningTotals) - projects = metrics.Object(NodeRunningTotals) - registered_nodes = metrics.Object(RegistrationRunningTotals) - registered_projects = metrics.Object(RegistrationRunningTotals) - - -class NewUserDomainReport(DailyReport): - UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) - - domain_name = metrics.Keyword() - new_user_count = metrics.Integer() - - -class NodeSummaryReport(DailyReport): - nodes = metrics.Object(NodeRunningTotals) - projects = metrics.Object(NodeRunningTotals) - registered_nodes = metrics.Object(RegistrationRunningTotals) - registered_projects = metrics.Object(RegistrationRunningTotals) - - -class OsfstorageFileCountReport(DailyReport): - files = metrics.Object(FileRunningTotals) - - -class PreprintSummaryReport(DailyReport): - UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) - - provider_key = metrics.Keyword() - preprint_count = metrics.Integer() - - -class UserSummaryReport(DailyReport): - active = metrics.Integer() - deactivated = metrics.Integer() - merged = metrics.Integer() - new_users_daily = metrics.Integer() - new_users_with_institution_daily = metrics.Integer() - unconfirmed = metrics.Integer() - - -class SpamSummaryReport(MonthlyReport): - node_confirmed_spam = metrics.Integer() - node_confirmed_ham = metrics.Integer() - node_flagged = metrics.Integer() - registration_confirmed_spam = metrics.Integer() - registration_confirmed_ham = metrics.Integer() - registration_flagged = metrics.Integer() - preprint_confirmed_spam = metrics.Integer() - preprint_confirmed_ham = metrics.Integer() - preprint_flagged = metrics.Integer() - user_marked_as_spam = metrics.Integer() - user_marked_as_ham = metrics.Integer() - - -class InstitutionalUserReport(MonthlyReport): - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',) - institution_id = metrics.Keyword() - # user info: - user_id = metrics.Keyword() - user_name = metrics.Keyword() - department_name = metrics.Keyword() - month_last_login = YearmonthField() - month_last_active = YearmonthField() - account_creation_date = YearmonthField() - orcid_id = metrics.Keyword() - # counts: - public_project_count = metrics.Integer() - private_project_count = metrics.Integer() - public_registration_count = metrics.Integer() - embargoed_registration_count = metrics.Integer() - published_preprint_count = metrics.Integer() - public_file_count = metrics.Long() - storage_byte_count = metrics.Long() - - -class InstitutionMonthlySummaryReport(MonthlyReport): - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', ) - institution_id = metrics.Keyword() - user_count = metrics.Integer() - public_project_count = metrics.Integer() - private_project_count = metrics.Integer() - public_registration_count = metrics.Integer() - embargoed_registration_count = metrics.Integer() - published_preprint_count = metrics.Integer() - storage_byte_count = metrics.Long() - public_file_count = metrics.Long() - monthly_logged_in_user_count = metrics.Long() - monthly_active_user_count = metrics.Long() - - -class PublicItemUsageReport(MonthlyReport): - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid') - - # where noted, fields are meant to correspond to defined terms from COUNTER - # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html - # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html - item_osfid = metrics.Keyword() # counter:Item (or Dataset) - item_type = metrics.Keyword(multi=True) # counter:Data-Type - provider_id = metrics.Keyword(multi=True) # counter:Database(?) - platform_iri = metrics.Keyword(multi=True) # counter:Platform - - # view counts include views on components or files contained by this item - view_count = metrics.Long() # counter:Total Investigations - view_session_count = metrics.Long() # counter:Unique Investigations - - # download counts of this item only (not including contained components or files) - download_count = metrics.Long() # counter:Total Requests - download_session_count = metrics.Long() # counter:Unique Requests - - @classmethod - def for_last_month(cls, item_osfid: str) -> PublicItemUsageReport | None: - _search = ( - PublicItemUsageReport.search() - .filter('term', item_osfid=item_osfid) - # only last month's report - .filter('range', report_yearmonth={ - 'gte': 'now-2M/M', - 'lt': 'now/M', - }) - .sort('-report_yearmonth') - [:1] - ) - _response = _search.execute() - return _response[0] if _response else None - - -class PrivateSpamMetricsReport(MonthlyReport): - node_oopspam_flagged = metrics.Integer() - node_oopspam_hammed = metrics.Integer() - node_akismet_flagged = metrics.Integer() - node_akismet_hammed = metrics.Integer() - preprint_oopspam_flagged = metrics.Integer() - preprint_oopspam_hammed = metrics.Integer() - preprint_akismet_flagged = metrics.Integer() - preprint_akismet_hammed = metrics.Integer() diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 87b2d48f6fd..e0ea0b1f4e6 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -75,6 +75,36 @@ def get_item_type_from_iri(type_iri) -> str: return _shortname +def get_surrounding_osfids(osfid_referent): + """get all the parent/owner/surrounding osfids for the given osfid_referent + + @param osfid_referent: instance of a model that has GuidMixin + @returns list of str + + For AbstractNode, goes up the node hierarchy up to the root. + For WikiPage or BaseFileNode, grab the node it belongs to and + follow the node hierarchy from there. + """ + _surrounding_osfids = [] + _current_referent = osfid_referent + while _current_referent: + next_referent = get_immediate_wrapper(_current_referent) + if next_referent: + _surrounding_osfids.append(next_referent._id) + _current_referent = next_referent + return _surrounding_osfids + + +def get_immediate_wrapper(osfid_referent): + if hasattr(osfid_referent, 'verified_publishable'): + return None # quacks like Preprint + return ( + getattr(osfid_referent, 'parent_node', None) # quacks like AbstractNode + or getattr(osfid_referent, 'node', None) # quacks like WikiPage, Comment + or getattr(osfid_referent, 'target', None) # quacks like BaseFileNode + ) + + @dataclasses.dataclass(frozen=True) class YearMonth: """YearMonth: represents a specific month in a specific year""" @@ -88,6 +118,11 @@ def from_date(cls, date: datetime.date) -> YearMonth: """construct a YearMonth from a `datetime.date` (or `datetime.datetime`)""" return cls(date.year, date.month) + @classmethod + def from_today(cls) -> YearMonth: + """construct a YearMonth from the current moment""" + return cls.from_date(datetime.date.today()) + @classmethod def from_str(cls, input_str: str) -> YearMonth: """construct a YearMonth from a string in "YYYY-MM" format""" diff --git a/osf/models/base.py b/osf/models/base.py index 9e6d5f502d4..65cb73d05e8 100644 --- a/osf/models/base.py +++ b/osf/models/base.py @@ -309,6 +309,9 @@ class Meta: UniqueConstraint(fields=['guid', 'version'], name='unique_guid_version') ] + def versioned_osfid(self): + return f'{self.guid._id}{VersionedGuidMixin.GUID_VERSION_DELIMITER}{self.version}' + class BlackListGuid(BaseModel): id = models.AutoField(primary_key=True) @@ -553,12 +556,11 @@ def _id(self): f'`self.versioned_guids` does not exist: [self={self.pk}, type={type(self).__name__}]' ) return None - guid = versioned_guid.first().guid - version = versioned_guid.first().version + _current_versioned_guid = versioned_guid.first() except IndexError as e: sentry.log_exception(e) return None - return f'{guid._id}{VersionedGuidMixin.GUID_VERSION_DELIMITER}{version}' + return _current_versioned_guid.versioned_osfid() @_id.setter def _id(self, value): diff --git a/osf/models/registrations.py b/osf/models/registrations.py index f13489f1201..568a1a575ed 100644 --- a/osf/models/registrations.py +++ b/osf/models/registrations.py @@ -24,8 +24,7 @@ from osf.utils.permissions import ADMIN, READ, WRITE from osf.exceptions import NodeStateError, DraftRegistrationStateError from osf.external.internet_archive.tasks import archive_to_ia, update_ia_metadata -from osf.metrics import RegistriesModerationMetrics -from osf.metrics.es8_metrics import RegistriesModerationEventEs8 +from osf.metrics.events import RegistriesModerationEvent from osf.models.notification_type import NotificationTypeEnum from .action import RegistrationAction from .archive import ArchiveJob @@ -786,8 +785,7 @@ def _write_registration_action(self, from_state, to_state, initiated_by, comment ) action.save() if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): - RegistriesModerationMetrics.record_transitions(action) - RegistriesModerationEventEs8.record( + RegistriesModerationEvent.record( registration_id=action.target._id, provider_id=action.target.provider._id, from_state=action.from_state, diff --git a/osf_tests/management_commands/test_monthly_reporters_go.py b/osf_tests/management_commands/test_monthly_reporters_go.py index 505e7adf4bd..1fbac32a77d 100644 --- a/osf_tests/management_commands/test_monthly_reporters_go.py +++ b/osf_tests/management_commands/test_monthly_reporters_go.py @@ -2,26 +2,24 @@ from django.core.management import call_command from django.test import TestCase -from elasticsearch_metrics.tests.util import djelme_test_backends +from elasticsearch_metrics.tests.util import RealElasticTestCase from framework.celery_tasks import app as celery_app -from osf.metrics import reports as es6_reports -from osf.metrics.es8_metrics import ( - MonthlyInstitutionSummaryReportEs8, - MonthlyInstitutionalUserReportEs8, - MonthlyPrivateSpamMetricsReportEs8, - MonthlyPublicItemUsageReportEs8, - MonthlySpamSummaryReportEs8, +from osf.metrics.monthly_reports import ( + MonthlyInstitutionSummaryReport, + MonthlyInstitutionalUserReport, + MonthlyPrivateSpamMetricsReport, + MonthlyPublicItemUsageReport, + MonthlySpamSummaryReport, ) -from osf.metrics.counted_usage import CountedAuthUsage +from osf.metrics.events import OsfCountedUsageEvent from osf.metrics.utils import YearMonth from osf_tests import factories -from website import settings as website_settings -class TestMonthlyReportersGo(TestCase): +class TestMonthlyReportersGo(RealElasticTestCase, TestCase): def setUp(self): - self.enterContext(djelme_test_backends()) + super().setUp() celery_app.conf.update({ 'task_always_eager': True, 'task_eager_propagates': True, @@ -33,48 +31,26 @@ def setUp(self): _user.add_or_update_affiliated_institution(_inst) # set up for public item usage report _reg = factories.RegistrationFactory(is_public=True) - CountedAuthUsage.record( - platform_iri=website_settings.DOMAIN, - item_guid=_reg._id, - session_id='blarg', - user_is_authenticated=True, + OsfCountedUsageEvent.record( + item_osfid=_reg._id, action_labels=['view', 'web'], + user_id=_user._id, ) - CountedAuthUsage._get_connection().indices.refresh(CountedAuthUsage._template_pattern) - # TODO when switching to use es8 data - # OsfCountedUsageEvent.record( - # item_osfid=_preprint._id, - # action_labels=['view', 'web'], - # user_id=_user._id, - # ) - # OsfCountedUsageEvent.refresh() + OsfCountedUsageEvent.refresh() def test_for_smoke(self): - self._assert_count(MonthlyInstitutionSummaryReportEs8, 0) - self._assert_count(MonthlyInstitutionalUserReportEs8, 0) - self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 0) - self._assert_count(MonthlyPublicItemUsageReportEs8, 0) - self._assert_count(MonthlySpamSummaryReportEs8, 0) - self._assert_count(es6_reports.SpamSummaryReport, 0) - self._assert_count(es6_reports.InstitutionalUserReport, 0) - self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 0) - self._assert_count(es6_reports.PublicItemUsageReport, 0) - self._assert_count(es6_reports.PrivateSpamMetricsReport, 0) + self._assert_count(MonthlyInstitutionSummaryReport, 0) + self._assert_count(MonthlyInstitutionalUserReport, 0) + self._assert_count(MonthlyPrivateSpamMetricsReport, 0) + self._assert_count(MonthlyPublicItemUsageReport, 0) + self._assert_count(MonthlySpamSummaryReport, 0) call_command('monthly_reporters_go', yearmonth=str(self._report_yearmonth)) - self._assert_count(MonthlyInstitutionSummaryReportEs8, 1) - self._assert_count(MonthlyInstitutionalUserReportEs8, 1) - self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 1) - self._assert_count(MonthlyPublicItemUsageReportEs8, 1) - self._assert_count(MonthlySpamSummaryReportEs8, 1) - self._assert_count(es6_reports.SpamSummaryReport, 1) - self._assert_count(es6_reports.InstitutionalUserReport, 1) - self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 1) - self._assert_count(es6_reports.PublicItemUsageReport, 1) - self._assert_count(es6_reports.PrivateSpamMetricsReport, 1) + self._assert_count(MonthlyInstitutionSummaryReport, 1) + self._assert_count(MonthlyInstitutionalUserReport, 1) + self._assert_count(MonthlyPrivateSpamMetricsReport, 1) + self._assert_count(MonthlyPublicItemUsageReport, 1) + self._assert_count(MonthlySpamSummaryReport, 1) def _assert_count(self, recordtype, expected_count): - if hasattr(recordtype, 'refresh'): - recordtype.refresh() - else: # elasticsearch_metrics.imps.elastic6 - recordtype._get_connection().indices.refresh(recordtype._template_pattern) + recordtype.refresh() self.assertEqual(recordtype.search().count(), expected_count) diff --git a/osf_tests/management_commands/test_reindex_es6.py b/osf_tests/management_commands/test_reindex_es6.py deleted file mode 100644 index 36158c18da6..00000000000 --- a/osf_tests/management_commands/test_reindex_es6.py +++ /dev/null @@ -1,116 +0,0 @@ -import time -import pytest -from website import settings - -from osf.metrics import PreprintDownload -from django.core.management import call_command - -from osf_tests.factories import ( - PreprintFactory, - AuthUserFactory -) - -from elasticsearch6_dsl import Keyword - -from tests.json_api_test_app import JSONAPITestApp - -from api.base import settings as django_settings - - -@pytest.fixture() -def app(): - return JSONAPITestApp() - - -@pytest.mark.django_db -class TestReindexingMetrics: - - @pytest.fixture() - def preprint(self): - return PreprintFactory() - - @pytest.fixture() - def user(self): - return AuthUserFactory() - - @pytest.fixture() - def admin(self): - user = AuthUserFactory() - user.is_staff = True - user.add_system_tag('preprint_metrics') - user.save() - return user - - @pytest.fixture() - def url(self): - return f'{settings.API_DOMAIN}_/metrics/preprints/downloads/' - - @pytest.mark.es - @pytest.mark.skipif(django_settings.CI_ENV, reason='Non-deterministic fails on CI') - def test_reindexing(self, app, url, preprint, user, admin, es6_client): - preprint_download = PreprintDownload.record_for_preprint( - preprint, - user, - version=1, - path='/MalcolmJenkinsKnockedBrandinCooksOutColdInTheSuperBowl', - random_new_field='Hi!' # Here's our unmapped field! It's a text field by default. - ) - preprint_download.save() - - query = { - 'aggs': { - 'random_new_field': { - 'terms': { - 'field': 'random_new_field', # Oh no, this is a text field, you can't query it like that! - } - } - } - } - - payload = { - 'data': { - 'type': 'preprint_metrics', - 'attributes': { - 'query': query - } - } - } - - # Hacky way to simulate a re-mapped index template - index_template = preprint_download._index - mapping = index_template._mapping - mapping.properties._params['properties']['random_new_field'] = Keyword(doc_values=True, index=True) - index_template._mapping._update_from_dict(mapping.to_dict()) - - # This should 400 because random_new_field is still stored as a text field despite the our index being remapped. - res = app.post_json_api(url, payload, auth=admin.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Fielddata is disabled on text fields by default. Set ' \ - 'fielddata=true on [random_new_field] in order to load' \ - ' fielddata in memory by uninverting the inverted inde' \ - 'x. Note that this can however use significant memory.' \ - ' Alternatively use a keyword field instead.' - - call_command('reindex_es6', f'--indices={preprint_download.meta["index"]}') - time.sleep(2) - - res = app.post_json_api(url, payload, auth=admin.auth) - assert res.status_code == 200 - assert res.json['hits']['hits'][0]['_source']['random_new_field'] == 'Hi!' - - # Just checking version number incremented properly - es6_client.indices.get(f'{preprint_download.meta["index"]}_v2') - - # Just check it was aliased properly - es6_client.indices.get(f'{preprint_download.meta["index"]}') - - call_command('reindex_es6', f'--indices={preprint_download.meta["index"]}') - time.sleep(2) - - # Just checking version number incremented properly again - es6_client.indices.get(f'{preprint_download.meta["index"]}_v3') - - # Just check it was aliased properly again (to the OG index, not the v2 index) - data = es6_client.indices.get(f'{preprint_download.meta["index"]}') - - assert data[f'{preprint_download.meta["index"]}_v3']['aliases'] == {'osf_preprintdownload_2020': {}} diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py index f235488e557..23095b066ee 100644 --- a/osf_tests/metadata/test_osf_gathering.py +++ b/osf_tests/metadata/test_osf_gathering.py @@ -25,7 +25,7 @@ checksum_iri, ) from osf import models as osfdb -from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport from osf.metrics.utils import YearMonth from osf.utils import permissions, workflows from osf_tests import factories @@ -799,22 +799,23 @@ def test_gather_cedar_templates(self): def test_gather_last_month_usage(self): # no usage report: with mock.patch( - 'osf.metrics.reports.PublicItemUsageReport.for_last_month', - return_value=None, + 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month', + return_value=[], ): assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), set()) # yes usage report: _ym = YearMonth.from_date(datetime.datetime.now(tz=datetime.UTC)) with mock.patch( - 'osf.metrics.reports.PublicItemUsageReport.for_last_month', - return_value=PublicItemUsageReport( - item_osfid=self.project._id, + 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month', + return_value=[MonthlyPublicItemUsageReport( + item_iri=self.project.get_semantic_iri(), + item_osfids=[self.project._id], report_yearmonth=_ym, view_count=71, view_session_count=13, download_count=43, download_session_count=11, - ), + )], ): _usage_bnode = rdflib.BNode() assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), { diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py index 5dc4029aaf4..369e06555a7 100644 --- a/osf_tests/metadata/test_serialized_metadata.py +++ b/osf_tests/metadata/test_serialized_metadata.py @@ -9,7 +9,7 @@ from osf.metadata.osf_gathering import OsfmapPartition from osf.metadata.rdfutils import OSF, DCTERMS from osf.metadata.tools import pls_gather_metadata_file -from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport from osf.metrics.utils import YearMonth from osf.models.licenses import NodeLicense from api_tests.utils import create_test_file @@ -311,14 +311,14 @@ def setUp(self): 'resource_type_general': 'StudyRegistration', }, auth=self.user) self.enterContext(mock.patch( - 'osf.metrics.reports.PublicItemUsageReport.for_last_month', - return_value=PublicItemUsageReport( + 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month', + return_value=[MonthlyPublicItemUsageReport( report_yearmonth=YearMonth.from_date(forever_now()), view_count=7, view_session_count=5, download_count=3, download_session_count=2, - ), + )], )) self.guid_dict = { OSF.Project: self.project._id, diff --git a/osf_tests/metrics/reporters/_testutils.py b/osf_tests/metrics/reporters/_testutils.py index 3275b0f1651..ef504c06a18 100644 --- a/osf_tests/metrics/reporters/_testutils.py +++ b/osf_tests/metrics/reporters/_testutils.py @@ -1,8 +1,9 @@ +from elasticsearch_metrics.imps.elastic8 import CyclicRecord + from osf.metrics.reporters._base import MonthlyReporter -from osf.metrics.reports import MonthlyReport -def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[MonthlyReport]: +def list_monthly_reports(reporter: MonthlyReporter) -> list[CyclicRecord]: _each_reports_list = ( reporter.report(**_kwargs) for _kwargs in reporter.iter_report_kwargs() @@ -11,5 +12,4 @@ def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[Month _report for _reports_list in _each_reports_list for _report in _reports_list - if isinstance(_report, MonthlyReport) # TODO: update tests with es8 ] diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py index 02c24d86f3c..5d45056d8ec 100644 --- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -81,18 +81,18 @@ def test_report_generation(self): reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) reports = list_monthly_reports(reporter) self.assertEqual(len(reports), 1) - report = reports[0] - self.assertEqual(report.institution_id, self._institution._id) - self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user - self.assertEqual(report.public_project_count, 1) - self.assertEqual(report.private_project_count, 1) - self.assertEqual(report.public_registration_count, 1) - self.assertEqual(report.embargoed_registration_count, 1) - self.assertEqual(report.published_preprint_count, 1) - self.assertEqual(report.storage_byte_count, 1337) # test value for one file - self.assertEqual(report.public_file_count, 1) - self.assertEqual(report.monthly_logged_in_user_count, 1) - self.assertEqual(report.monthly_active_user_count, 1) + for report in reports: + self.assertEqual(report.institution_id, self._institution._id) + self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user + self.assertEqual(report.public_project_count, 1) + self.assertEqual(report.private_project_count, 1) + self.assertEqual(report.public_registration_count, 1) + self.assertEqual(report.embargoed_registration_count, 1) + self.assertEqual(report.published_preprint_count, 1) + self.assertEqual(report.storage_byte_count, 1337) # test value for one file + self.assertEqual(report.public_file_count, 1) + self.assertEqual(report.monthly_logged_in_user_count, 1) + self.assertEqual(report.monthly_active_user_count, 1) def test_report_generation_multiple_institutions(self): institution2 = InstitutionFactory() @@ -118,22 +118,24 @@ def test_report_generation_multiple_institutions(self): self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 # Extract reports by institution - report_institution = next(r for r in reports if r.institution_id == self._institution._id) - report_institution2 = next(r for r in reports if r.institution_id == institution2._id) + _reports1 = [r for r in reports if r.institution_id == self._institution._id] + _reports2 = [r for r in reports if r.institution_id == institution2._id] # Validate report for self._institution - self.assertEqual(report_institution.public_project_count, 1) - self.assertEqual(report_institution.private_project_count, 1) - self.assertEqual(report_institution.user_count, 2) - self.assertEqual(report_institution.monthly_active_user_count, 1) - self.assertEqual(report_institution.monthly_logged_in_user_count, 1) + for _report in _reports1: + self.assertEqual(_report.public_project_count, 1) + self.assertEqual(_report.private_project_count, 1) + self.assertEqual(_report.user_count, 2) + self.assertEqual(_report.monthly_active_user_count, 1) + self.assertEqual(_report.monthly_logged_in_user_count, 1) # Validate report for institution2 - self.assertEqual(report_institution2.public_project_count, 1) - self.assertEqual(report_institution2.private_project_count, 0) - self.assertEqual(report_institution2.user_count, 1) - self.assertEqual(report_institution2.monthly_active_user_count, 1) - self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users + for _report in _reports2: + self.assertEqual(_report.public_project_count, 1) + self.assertEqual(_report.private_project_count, 0) + self.assertEqual(_report.user_count, 1) + self.assertEqual(_report.monthly_active_user_count, 1) + self.assertEqual(_report.monthly_logged_in_user_count, 0) # No logged-in users class TestSummaryMonthlyReporterBenchmarker(TestCase): @@ -264,7 +266,6 @@ def test_high_counts_multiple_institutions(self): reporter_start_time = time.time() reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) reports = list_monthly_reports(reporter) - assert len(reports) == additional_institution_count + 1 if enable_benchmarking: reporter_end_time = time.time() diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py index e399d848396..a43e90d1313 100644 --- a/osf_tests/metrics/reporters/test_institutional_users_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -8,7 +8,6 @@ from api_tests.utils import create_test_file from osf import models as osfdb from osf.management.commands.populate_notification_types import populate_notification_types -from osf.metrics.reports import InstitutionalUserReport from osf.metrics.reporters import InstitutionalUsersReporter from osf.metrics.utils import YearMonth from osf_tests.factories import ( @@ -48,7 +47,7 @@ def setUpTestData(cls): ) cls._user_setup_with_stuff.fill_uncounted_objects() - def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup): + def _assert_report_matches_setup(self, report, setup: _InstiUserSetup): self.assertEqual(report.institution_id, setup.institution._id) # user info: self.assertEqual(report.user_id, setup.user._id) @@ -99,10 +98,12 @@ def test_one_user_with_stuff_and_a_file(self): _project = _user.nodes.first() with _patch_now(self._now): create_test_file(target=_project, user=_user, size=37) - (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self._assert_report_matches_setup(_report, self._user_setup_with_stuff) - self.assertEqual(_report.public_file_count, 3) # 2 preprint files - self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 1) + for _report in _reports: + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 3) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files def test_one_user_with_stuff_and_multiple_files(self): self._user_setup_with_stuff.affiliate_user() @@ -116,10 +117,12 @@ def test_one_user_with_stuff_and_multiple_files(self): create_test_file(target=_component, user=_user, size=53, filename='bla') create_test_file(target=_component, user=_user, size=51, filename='blar') create_test_file(target=_component, user=_user, size=47, filename='blarg') - (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) - self._assert_report_matches_setup(_report, self._user_setup_with_stuff) - self.assertEqual(_report.public_file_count, 7) # 2 preprint files - self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 1) + for _report in _reports: + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 7) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 def test_several_users(self): _setups = [ diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py index 69bd266285a..2f111efb6bd 100644 --- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -1,246 +1,293 @@ from datetime import datetime, timedelta +from functools import cached_property from operator import attrgetter from unittest import mock -import pytest +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase -from osf.metrics.counted_usage import CountedAuthUsage -from osf.metrics.preprint_metrics import ( - PreprintDownload, - PreprintView, -) +from osf.metadata.rdfutils import OSF +from osf.metrics.events import OsfCountedUsageEvent +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter -from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth -from osf import models as osfdb from osf_tests import factories from ._testutils import list_monthly_reports -@pytest.mark.es_metrics -@pytest.mark.django_db -class TestPublicItemUsageReporter: - @pytest.fixture(autouse=True) - def _patch_settings(self): - with mock.patch('website.settings.DOMAIN', 'http://osf.example'): - yield +class TestPublicItemUsageReporter(RealElasticTestCase, TestCase): + def setUp(self): + super().setUp() + self.enterContext(mock.patch('website.settings.DOMAIN', 'http://osf.example/')) - @pytest.fixture + @cached_property def item0(self): _item0 = factories.PreprintFactory(is_public=True, set_guid='item0') return _item0 - @pytest.fixture + @cached_property def item1(self): _item1 = factories.ProjectFactory(is_public=True) _item1._id = 'item1' return _item1 - @pytest.fixture - def item2(self, item1): - _item2 = factories.ProjectFactory(is_public=True, parent=item1) + @cached_property + def item2(self): + _item2 = factories.ProjectFactory(is_public=True, parent=self.item1) _item2._id = 'item2' return _item2 - @pytest.fixture + @cached_property def ym_empty(self) -> YearMonth: return YearMonth(2012, 7) - @pytest.fixture + @cached_property def ym_sparse(self) -> YearMonth: return YearMonth(2017, 7) - @pytest.fixture + @cached_property def ym_busy(self) -> YearMonth: return YearMonth(2023, 7) - @pytest.fixture - def sparse_month_usage(self, ym_sparse, item0, item1, item2): + def _setup_sparse_month_usage(self): # "sparse" month: # item0: 3 views, 0 downloads, 2 sessions # item1: 1 views, 1 download, 1 session (plus 1 view from child item2) # item2: 1 views, 0 downloads, 1 session - _month_start = ym_sparse.month_start() + _month_start = self.ym_sparse.month_start() _save_usage( - item0, + self.item0, timestamp=_month_start, - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['view'], ) _save_usage( - item0, + self.item0, timestamp=_month_start + timedelta(minutes=2), - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['view'], ) _save_usage( - item1, + self.item1, timestamp=_month_start + timedelta(minutes=3), - session_id='sesh0', + sessionhour_id='sesh0', action_labels=['download'], ) _save_usage( - item0, + self.item0, timestamp=_month_start + timedelta(days=17), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item1, + self.item1, timestamp=_month_start + timedelta(days=17, minutes=3), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=17, minutes=5), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=17, minutes=11), - session_id='sesh1', + sessionhour_id='sesh1', action_labels=['download'], ) - @pytest.fixture - def busy_month_item0(self, ym_busy, item0): + def _setup_busy_month_item0(self): # item0: 4 sessions, 4*7 views, 4*5 downloads - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(0, 4): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 7): _save_usage( - item0, + self.item0, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh0{_sesh}', + sessionhour_id=f'sesh0{_sesh}', action_labels=['view'], ) for _minute in range(10, 15): _save_usage( - item0, + self.item0, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh0{_sesh}', + sessionhour_id=f'sesh0{_sesh}', action_labels=['download'], ) + # plus prior report with cumulative counts: + # 4 views, 3 view sessions, 2 downloads, 1 download session + MonthlyPublicItemUsageReport.record( + report_yearmonth=self.ym_busy.prior(), + item_iri='http://osf.example/item0_v1', + item_osfids=['item0_v1'], + item_types=[OSF.Preprint], + platform_iris=['http://osf.example/'], + database_iris=[self.item0.provider.get_semantic_iri()], + provider_ids=[self.item0.provider._id], + view_count=1, + view_session_count=1, + cumulative_view_count=4, + cumulative_view_session_count=3, + download_count=2, + download_session_count=1, + cumulative_download_count=2, + cumulative_download_session_count=1, + ) - @pytest.fixture - def busy_month_item1(self, ym_busy, item1): + def _setup_busy_month_item1(self): # item1: 10 sessions, 6*9 views, 5*7 downloads # (plus 11 views in 11 sessions from child item2) - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(0, 6): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 9): _save_usage( - item1, + self.item1, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh1{_sesh}', + sessionhour_id=f'sesh1{_sesh}', action_labels=['view'], ) for _sesh in range(5, 10): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(10, 17): _save_usage( - item1, + self.item1, timestamp=_sesh_start + timedelta(minutes=_minute), - session_id=f'sesh1{_sesh}', + sessionhour_id=f'sesh1{_sesh}', action_labels=['download'], ) - @pytest.fixture - def busy_month_item2(self, ym_busy, item2): + def _setup_busy_month_item2(self): # item2: 11 sessions, 11 views, 11 downloads (child of item1) - _month_start = ym_busy.month_start() + _month_start = self.ym_busy.month_start() for _sesh in range(1, 12): _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=_sesh), - session_id=f'sesh2{_sesh}', + sessionhour_id=f'sesh2{_sesh}', action_labels=['view'], ) _save_usage( - item2, + self.item2, timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), - session_id=f'sesh2{_sesh}', + sessionhour_id=f'sesh2{_sesh}', action_labels=['download'], ) - def test_no_data(self, ym_empty): - _reporter = PublicItemUsageReporter(ym_empty) + def test_no_data(self): + _reporter = PublicItemUsageReporter(self.ym_empty) _empty = list_monthly_reports(_reporter) assert _empty == [] - def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): - _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) - _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) - _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) + def test_reporter(self): + self._setup_sparse_month_usage() + self._setup_busy_month_item0() + self._setup_busy_month_item1() + self._setup_busy_month_item2() + OsfCountedUsageEvent.refresh() + + _empty = list_monthly_reports(PublicItemUsageReporter(self.ym_empty)) + _sparse = list_monthly_reports(PublicItemUsageReporter(self.ym_sparse)) + _busy = list_monthly_reports(PublicItemUsageReporter(self.ym_busy)) # empty month: assert _empty == [] # sparse month: assert len(_sparse) == 3 - _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_osfid')) + _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_iri')) # sparse-month item0 - assert isinstance(_sparse_item0, PublicItemUsageReport) - assert _sparse_item0.item_osfid == 'item0_v1' - assert _sparse_item0.provider_id == [item0.provider._id] - assert _sparse_item0.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item0, MonthlyPublicItemUsageReport) + assert _sparse_item0.item_iri == 'http://osf.example/item0_v1' + assert _sparse_item0.item_osfids == ['item0_v1'] + assert _sparse_item0.provider_ids == [self.item0.provider._id] + assert _sparse_item0.platform_iris == ['http://osf.example'] assert _sparse_item0.view_count == 3 - assert _sparse_item0.view_session_count is None # no session count for preprints + assert _sparse_item0.view_session_count == 2 assert _sparse_item0.download_count == 0 - assert _sparse_item0.download_session_count is None # no session count for preprints + assert _sparse_item0.download_session_count == 0 + assert _sparse_item0.cumulative_view_count == 3 + assert _sparse_item0.cumulative_view_session_count == 2 + assert _sparse_item0.cumulative_download_count == 0 + assert _sparse_item0.cumulative_download_session_count == 0 # sparse-month item1 - assert isinstance(_sparse_item1, PublicItemUsageReport) - assert _sparse_item1.item_osfid == 'item1' - assert _sparse_item1.provider_id == ['osf'] - assert _sparse_item1.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item1, MonthlyPublicItemUsageReport) + assert _sparse_item1.item_iri == 'http://osf.example/item1' + assert _sparse_item1.item_osfids == ['item1'] + assert _sparse_item1.provider_ids == ['osf'] + assert _sparse_item1.platform_iris == ['http://osf.example'] assert _sparse_item1.view_count == 2 # including item2 assert _sparse_item1.view_session_count == 1 # including item2 assert _sparse_item1.download_count == 1 # NOT including item2 assert _sparse_item1.download_session_count == 1 # NOT including item2 + assert _sparse_item1.cumulative_view_count == 2 + assert _sparse_item1.cumulative_view_session_count == 1 + assert _sparse_item1.cumulative_download_count == 1 + assert _sparse_item1.cumulative_download_session_count == 1 # sparse-month item2 - assert isinstance(_sparse_item1, PublicItemUsageReport) - assert _sparse_item2.item_osfid == 'item2' - assert _sparse_item2.provider_id == ['osf'] - assert _sparse_item2.platform_iri == ['http://osf.example'] + assert isinstance(_sparse_item1, MonthlyPublicItemUsageReport) + assert _sparse_item2.item_iri == 'http://osf.example/item2' + assert _sparse_item2.item_osfids == ['item2'] + assert _sparse_item2.provider_ids == ['osf'] + assert _sparse_item2.platform_iris == ['http://osf.example'] assert _sparse_item2.view_count == 1 assert _sparse_item2.view_session_count == 1 assert _sparse_item2.download_count == 1 assert _sparse_item2.download_session_count == 1 + assert _sparse_item2.cumulative_view_count == 1 + assert _sparse_item2.cumulative_view_session_count == 1 + assert _sparse_item2.cumulative_download_count == 1 + assert _sparse_item2.cumulative_download_session_count == 1 # busy month: assert len(_busy) == 3 - _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_osfid')) - # busy-month item0 - assert isinstance(_busy_item0, PublicItemUsageReport) - assert _busy_item0.item_osfid == 'item0_v1' - assert _busy_item0.provider_id == [item0.provider._id] - assert _busy_item0.platform_iri == ['http://osf.example'] + _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_iri')) + # busy-month item0 (plus prior-month report) + assert isinstance(_busy_item0, MonthlyPublicItemUsageReport) + assert _busy_item0.item_iri == 'http://osf.example/item0_v1' + assert _busy_item0.item_osfids == ['item0_v1'] + assert _busy_item0.provider_ids == [self.item0.provider._id] + assert _busy_item0.platform_iris == ['http://osf.example'] assert _busy_item0.view_count == 4 * 7 - assert _busy_item0.view_session_count is None # no session count for preprints + assert _busy_item0.view_session_count == 4 assert _busy_item0.download_count == 4 * 5 - assert _busy_item0.download_session_count is None # no session count for preprints + assert _busy_item0.download_session_count == 4 + # plus values from prior report: + assert _busy_item0.cumulative_view_count == (4 * 7) + 4 + assert _busy_item0.cumulative_view_session_count == 4 + 3 + assert _busy_item0.cumulative_download_count == (4 * 5) + 2 + assert _busy_item0.cumulative_download_session_count == 4 + 1 # busy-month item1 - assert isinstance(_busy_item1, PublicItemUsageReport) - assert _busy_item1.item_osfid == 'item1' - assert _busy_item1.provider_id == ['osf'] - assert _busy_item1.platform_iri == ['http://osf.example'] + assert isinstance(_busy_item1, MonthlyPublicItemUsageReport) + assert _busy_item1.item_iri == 'http://osf.example/item1' + assert _busy_item1.item_osfids == ['item1'] + assert _busy_item1.provider_ids == ['osf'] + assert _busy_item1.platform_iris == ['http://osf.example'] assert _busy_item1.view_count == 6 * 9 + 11 assert _busy_item1.view_session_count == 6 + 11 assert _busy_item1.download_count == 5 * 7 assert _busy_item1.download_session_count == 5 + assert _busy_item1.cumulative_view_count == 6 * 9 + 11 + assert _busy_item1.cumulative_view_session_count == 6 + 11 + assert _busy_item1.cumulative_download_count == 5 * 7 + assert _busy_item1.cumulative_download_session_count == 5 # busy-month item2 - assert isinstance(_busy_item2, PublicItemUsageReport) - assert _busy_item2.item_osfid == 'item2' - assert _busy_item2.provider_id == ['osf'] - assert _busy_item2.platform_iri == ['http://osf.example'] + assert isinstance(_busy_item2, MonthlyPublicItemUsageReport) + assert _busy_item2.item_iri == 'http://osf.example/item2' + assert _busy_item2.item_osfids == ['item2'] + assert _busy_item2.provider_ids == ['osf'] + assert _busy_item2.platform_iris == ['http://osf.example'] assert _busy_item2.view_count == 11 assert _busy_item2.view_session_count == 11 assert _busy_item2.download_count == 11 assert _busy_item2.download_session_count == 11 + assert _busy_item2.cumulative_view_count == 11 + assert _busy_item2.cumulative_view_session_count == 11 + assert _busy_item2.cumulative_download_count == 11 + assert _busy_item2.cumulative_download_session_count == 11 def _save_usage( @@ -252,32 +299,9 @@ def _save_usage( ): _countedusage_kwargs = { 'timestamp': timestamp, - 'item_guid': item._id, + 'item_osfid': item._id, 'action_labels': action_labels, 'platform_iri': 'http://osf.example', **kwargs, } - CountedAuthUsage(**_countedusage_kwargs).save(refresh=True) - if isinstance(item, osfdb.Preprint): - if 'view' in action_labels: - _save_preprint_view(item, timestamp) - if 'download' in action_labels: - _save_preprint_download(item, timestamp) - - -def _save_preprint_view(preprint, timestamp): - PreprintView( - timestamp=timestamp, - count=1, - preprint_id=preprint._id, - provider_id=preprint.provider._id, - ).save(refresh=True) - - -def _save_preprint_download(preprint, timestamp): - PreprintDownload( - timestamp=timestamp, - count=1, - preprint_id=preprint._id, - provider_id=preprint.provider._id, - ).save(refresh=True) + OsfCountedUsageEvent.record(**_countedusage_kwargs) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py deleted file mode 100644 index 5228e2342c5..00000000000 --- a/osf_tests/metrics/test_daily_report.py +++ /dev/null @@ -1,88 +0,0 @@ -import datetime -from unittest import mock - -import pytest -import elasticsearch_metrics.imps.elastic6 as metrics - -from osf.metrics.reports import DailyReport, ReportInvalid - - -class TestDailyReportKey: - @pytest.fixture - def mock_save(self): - with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save - - def test_default(self, mock_save): - # only one of this type of report per day - class UniqueByDate(DailyReport): - blah = metrics.Keyword() - - class Meta: - app_label = 'osf' - - today = datetime.date(2022, 5, 18) - expected_timestamp = datetime.datetime( - today.year, - today.month, - today.day, - tzinfo=datetime.UTC, - ) - - reports = [ - UniqueByDate(report_date=today), - UniqueByDate(report_date=today, blah='blah'), - UniqueByDate(report_date=today, blah='fleh'), - ] - expected_key = '6fe48593af0f9d34159616759bd4678f383c912fdff3e8a338c51ecb1cf9d0d5' - - for report in reports: - report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is report - assert report.meta.id == expected_key - assert report.timestamp == expected_timestamp - mock_save.reset_mock() - - def test_with_unique_together(self, mock_save): - # multiple reports of this type per day, unique by given field - class UniqueByDateAndField(DailyReport): - UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',) - uniquefield = metrics.Keyword() - - class Meta: - app_label = 'osf' - - today = datetime.date(2022, 5, 18) - expected_timestamp = datetime.datetime( - today.year, - today.month, - today.day, - tzinfo=datetime.UTC, - ) - - expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151' - blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah') - blah_report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is blah_report - assert blah_report.meta.id == expected_blah - assert blah_report.timestamp == expected_timestamp - mock_save.reset_mock() - - expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99' - fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh') - fleh_report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is fleh_report - assert fleh_report.meta.id == expected_fleh - assert fleh_report.timestamp == expected_timestamp - mock_save.reset_mock() - - for _bad_report in ( - UniqueByDateAndField(report_date=today), - UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']), - ): - with pytest.raises(ReportInvalid): - _bad_report.save() diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 1560558abe1..a7312508a92 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -1,23 +1,15 @@ import datetime -from elasticsearch_metrics.tests.util import djelme_test_backends -import pytest +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase -from osf.metrics.es8_metrics import ( - PageviewInfo, - DailyDownloadCountReportEs8, - OsfCountedUsageEvent, -) +from osf.metrics.daily_reports import DailyDownloadCountReport +from osf.metrics.events import OsfCountedUsageEvent -class TestEs8Metrics: +class TestEs8Metrics(RealElasticTestCase, TestCase): """smoke tests to check that djelme records can be saved and searched""" - @pytest.fixture(autouse=True) - def _real_elastic(self): - with djelme_test_backends(): - yield - def test_nested_pageview_autofill(self): usage = OsfCountedUsageEvent.record( timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), @@ -29,12 +21,12 @@ def test_nested_pageview_autofill(self): item_type='Preprint', platform_iri='https://osf.example', user_is_authenticated=False, - pageview_info=PageviewInfo( - page_url='https://example.com/path/test', - referer_url='https://google.com', - route_name='foo.bar', - page_title='title title', - ), + pageview_info={ + 'page_url': 'https://example.com/path/test', + 'referer_url': 'https://google.com', + 'route_name': 'foo.bar', + 'page_title': 'title title', + }, ) assert usage.pageview_info.page_path == '/path/test' assert usage.pageview_info.referer_domain == 'google.com' @@ -80,12 +72,12 @@ def test_none_pageview_nested_autofill(self): assert usage.item_iri in usage.within_iris def test_save_report(self): - _saved = DailyDownloadCountReportEs8.record( + _saved = DailyDownloadCountReport.record( cycle_coverage='2026.1.1', daily_file_downloads=17, ) - DailyDownloadCountReportEs8.refresh() - _response = DailyDownloadCountReportEs8.search().execute() + DailyDownloadCountReport.refresh() + _response = DailyDownloadCountReport.search().execute() (_fetched,) = _response assert _fetched.meta.id == _saved.meta.id assert _fetched.cycle_coverage == '2026.1.1' diff --git a/osf_tests/metrics/test_metric_mixin.py b/osf_tests/metrics/test_metric_mixin.py deleted file mode 100644 index ec9b2d302de..00000000000 --- a/osf_tests/metrics/test_metric_mixin.py +++ /dev/null @@ -1,35 +0,0 @@ -from unittest import mock -import pytest -import elasticsearch_metrics.imps.elastic6 as metrics - -from osf.metrics.metric_mixin import MetricMixin -from osf.models import OSFUser -from osf_tests.factories import UserFactory - -class DummyMetric(MetricMixin, metrics.Metric): - count = metrics.Integer(doc_values=True, index=True, required=True) - user_id = metrics.Keyword(index=True, doc_values=True, required=False) - - class Meta: - app_label = 'osf' - -@pytest.mark.django_db -@mock.patch.object(DummyMetric, '_get_id_to_count') -def test_get_top_by_count(mock_get_id_to_count): - user1, user2 = UserFactory(), UserFactory() - mock_get_id_to_count.return_value = { - user1._id: 41, - user2._id: 42, - } - - metric_qs = DummyMetric.get_top_by_count( - qs=OSFUser.objects.all(), - model_field='guids___id', - metric_field='user_id', - annotation='dummies', - size=None, - ) - - annotated_user = metric_qs.first() - assert annotated_user._id == user2._id - assert annotated_user.dummies == 42 diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py deleted file mode 100644 index ba981e997d6..00000000000 --- a/osf_tests/metrics/test_monthly_report.py +++ /dev/null @@ -1,153 +0,0 @@ -import datetime -from unittest import mock - -import pytest -import elasticsearch_metrics.imps.elastic6 as metrics - -from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport -from osf.metrics.utils import YearMonth - - -class TestMonthlyReportKey: - @pytest.fixture - def mock_save(self): - with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save - - def test_default(self, mock_save): - # only one of this type of report per month - class UniqueByMonth(MonthlyReport): - blah = metrics.Keyword() - - class Meta: - app_label = 'osf' - - yearmonth = YearMonth(2022, 5) - expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) - - reports = [ - UniqueByMonth(report_yearmonth=yearmonth), - UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), - UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), - ] - expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' - - for report in reports: - report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is report - assert report.meta.id == expected_key - assert report.timestamp == expected_timestamp - mock_save.reset_mock() - - def test_with_unique_together(self, mock_save): - # multiple reports of this type per day, unique by given field - class UniqueByMonthAndField(MonthlyReport): - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',) - uniquefield = metrics.Keyword() - - class Meta: - app_label = 'osf' - - yearmonth = YearMonth(2022, 5) - expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) - - expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' - blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') - blah_report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is blah_report - assert blah_report.meta.id == expected_blah - assert blah_report.timestamp == expected_timestamp - mock_save.reset_mock() - - expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' - fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh') - fleh_report.save() - assert mock_save.call_count == 1 - assert mock_save.call_args[0][0] is fleh_report - assert fleh_report.meta.id == expected_fleh - assert fleh_report.timestamp == expected_timestamp - mock_save.reset_mock() - - for _bad_report in ( - UniqueByMonthAndField(report_yearmonth=yearmonth), - UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']), - ): - with pytest.raises(ReportInvalid): - _bad_report.save() - - -@pytest.mark.es_metrics -@pytest.mark.django_db -class TestLastMonthReport: - @pytest.fixture - def osfid(self): - return 'abced' - - @pytest.fixture - def this_month(self): - return YearMonth.from_date(datetime.date.today()) - - @pytest.fixture - def last_month(self, this_month): - return _prior_yearmonth(this_month) - - @pytest.fixture - def two_months_back(self, last_month): - return _prior_yearmonth(last_month) - - @pytest.fixture - def three_months_back(self, two_months_back): - return _prior_yearmonth(two_months_back) - - @pytest.fixture - def this_month_report(self, osfid, this_month): - return _item_usage_report(this_month, osfid, view_count=77) - - @pytest.fixture - def last_month_report(self, osfid, last_month): - return _item_usage_report(last_month, osfid, view_count=57) - - @pytest.fixture - def diff_last_month_report(self, last_month): - return _item_usage_report(last_month, 'zyxvt', view_count=17) - - @pytest.fixture - def two_months_back_report(self, osfid, two_months_back): - return _item_usage_report(two_months_back, osfid, view_count=27) - - @pytest.fixture - def three_months_back_report(self, osfid, three_months_back): - return _item_usage_report(three_months_back, osfid, view_count=37) - - def test_with_none(self, osfid): - assert PublicItemUsageReport.for_last_month(osfid) is None - - def test_with_others(self, osfid, this_month_report, three_months_back_report, diff_last_month_report): - assert PublicItemUsageReport.for_last_month(osfid) is None - - def test_with_prior_month(self, osfid, this_month_report, two_months_back_report, three_months_back_report, diff_last_month_report): - assert PublicItemUsageReport.for_last_month(osfid) == two_months_back_report - - def test_with_last_month(self, osfid, this_month_report, last_month_report, two_months_back_report, three_months_back_report, diff_last_month_report): - assert PublicItemUsageReport.for_last_month(osfid) == last_month_report - - -def _prior_yearmonth(ym: YearMonth) -> YearMonth: - return ( - YearMonth(ym.year - 1, 12) - if ym.month == 1 - else YearMonth(ym.year, ym.month - 1) - ) - - -def _item_usage_report(ym: YearMonth, osfid: str, **kwargs): - _report = PublicItemUsageReport( - report_yearmonth=ym, - item_osfid=osfid, - **kwargs - ) - _report.save(refresh=True) - return _report diff --git a/osf_tests/metrics/test_monthly_usage_report.py b/osf_tests/metrics/test_monthly_usage_report.py new file mode 100644 index 00000000000..8381d629f3f --- /dev/null +++ b/osf_tests/metrics/test_monthly_usage_report.py @@ -0,0 +1,103 @@ +import datetime +from functools import cached_property + +from django.test import TestCase +from elasticsearch_metrics.tests.util import RealElasticTestCase + +from osf.models.base import osfid_iri +from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport +from osf.metrics.utils import YearMonth + + +class TestEachFromLastMonth(RealElasticTestCase, TestCase): + osfid = 'abced' + + @cached_property + def item_iri(self): + return osfid_iri(self.osfid) + + @cached_property + def this_month(self): + return YearMonth.from_date(datetime.date.today()) + + @cached_property + def last_month(self): + return self.this_month.prior() + + @cached_property + def two_months_back(self): + return self.last_month.prior() + + @cached_property + def three_months_back(self): + return self.two_months_back.prior() + + @cached_property + def this_month_report(self): + return _item_usage_report(self.this_month, self.osfid, view_count=77) + + @cached_property + def last_month_report(self): + return _item_usage_report(self.last_month, self.osfid, view_count=57) + + @cached_property + def diff_last_month_report(self): + return _item_usage_report(self.last_month, 'zyxvt', view_count=17) + + @cached_property + def two_months_back_report(self): + return _item_usage_report(self.two_months_back, self.osfid, view_count=27) + + @cached_property + def three_months_back_report(self): + return _item_usage_report(self.three_months_back, self.osfid, view_count=37) + + def test_with_none(self): + self.assertEqual( + MonthlyPublicItemUsageReport.from_last_month([self.item_iri]), + [], + ) + + def test_with_others(self): + self.this_month_report + self.three_months_back_report + self.diff_last_month_report + MonthlyPublicItemUsageReport.refresh() + self.assertEqual( + MonthlyPublicItemUsageReport.from_last_month([self.item_iri]), + [], + ) + + def test_with_prior_month(self): + self.this_month_report + self.two_months_back_report + self.three_months_back_report + self.diff_last_month_report + MonthlyPublicItemUsageReport.refresh() + self.assertEqual( + MonthlyPublicItemUsageReport.from_last_month([self.item_iri]), + [self.two_months_back_report], + ) + + def test_with_last_month(self): + self.this_month_report + self.last_month_report + self.two_months_back_report + self.three_months_back_report + self.diff_last_month_report + MonthlyPublicItemUsageReport.refresh() + self.assertEqual( + MonthlyPublicItemUsageReport.from_last_month([self.item_iri]), + [self.last_month_report], + ) + + +def _item_usage_report(ym: YearMonth, osfid: str, **kwargs): + _report = MonthlyPublicItemUsageReport( + report_yearmonth=ym, + item_iri=osfid_iri(osfid), + item_osfids=osfid, + **kwargs + ) + _report.save(validate=False) + return _report diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py index 448a8136f7a..a72d1a71ab5 100644 --- a/osf_tests/metrics/test_spam_count_reporter.py +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -1,7 +1,6 @@ import pytest from datetime import datetime from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter -from osf.metrics.reports import PrivateSpamMetricsReport from osf.metrics.utils import YearMonth from osf_tests.factories import NodeLogFactory, NodeFactory from unittest.mock import patch @@ -31,10 +30,10 @@ def test_private_spam_metrics_reporter(): mock_akismet_get_hammed_count.return_value = 10 reporter = PrivateSpamMetricsReporter(report_yearmonth) - reports_raw = reporter.report() - report = next(r for r in reports_raw if isinstance(r, PrivateSpamMetricsReport)) - - assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" - assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" - assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}" - assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}" + reports = list(reporter.report()) + assert len(reports) == 1 + for report in reports: + assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" + assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" + assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}" + assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}" diff --git a/poetry.lock b/poetry.lock index 6e26c23d295..b6965cc1a35 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1085,7 +1085,7 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2026.0.4" +version = "2026.0.5" description = "Django app for storing time-series metrics in Elasticsearch." optional = false python-versions = ">=3.10,<4" @@ -1095,14 +1095,13 @@ develop = false [package.extras] anydjango = ["django"] -elastic6 = ["elasticsearch6-dsl (>=6.3.0,<7.0.0)"] elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "709ff1d5c869d5696212b9109ed79e5d9766c60c" -resolved_reference = "709ff1d5c869d5696212b9109ed79e5d9766c60c" +reference = "46890bb61d35459e9793eba92d9ae54d4ce9c6af" +resolved_reference = "46890bb61d35459e9793eba92d9ae54d4ce9c6af" [[package]] name = "django-extensions" @@ -1390,45 +1389,6 @@ files = [ [package.dependencies] urllib3 = ">=1.8,<2.0" -[[package]] -name = "elasticsearch6" -version = "6.8.2" -description = "Python client for Elasticsearch" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" -groups = ["main"] -files = [ - {file = "elasticsearch6-6.8.2-py2.py3-none-any.whl", hash = "sha256:4edf2d61f854f642185d5af915b23c57e70d9f2b54f558b62ae55fa720583f5e"}, - {file = "elasticsearch6-6.8.2.tar.gz", hash = "sha256:7c215910b6bc18928d24d6c1d0b09b0684c824af609906d5e007a9a268109678"}, -] - -[package.dependencies] -urllib3 = ">=1.21.1" - -[package.extras] -develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"] -requests = ["requests (>=2.4.0,<3.0.0)"] - -[[package]] -name = "elasticsearch6-dsl" -version = "6.4.0" -description = "Python client for Elasticsearch" -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "elasticsearch6-dsl-6.4.0.tar.gz", hash = "sha256:4bbc60919b73484d028eca31f749f0eea80d8b0bfe0a9a33b54eb0afca1d9b5f"}, - {file = "elasticsearch6_dsl-6.4.0-py2.py3-none-any.whl", hash = "sha256:a5767ef65c50f7c8af7ba6c176bd8df2c1fb501c644bc196cbd675f15c0f2be1"}, -] - -[package.dependencies] -elasticsearch6 = ">=6.0.0,<7.0.0" -python-dateutil = "*" -six = "*" - -[package.extras] -develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"] - [[package]] name = "elasticsearch8" version = "8.19.3" @@ -4727,4 +4687,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "964ae9c9b6ce89c023a6bf0216cab95cdaafe5ce20be927d7c8f7244127993db" +content-hash = "d032b7d17bbc25dbbd06a5f7134b2dfd83946a2e959a699fd13aad23c3bcedb1" diff --git a/pyproject.toml b/pyproject.toml index 28f2f4ed22d..e9c856a4eab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,6 @@ Markupsafe = "2.1.5" blinker = "1.7.0" furl = "2.1.3" elasticsearch2 = "2.5.1" -elasticsearch6= "6.8.2" -elasticsearch6-dsl = "6.4.0" elasticsearch8 = "8.19.3" elastic-transport = "8.17.1" google-api-python-client = "2.123.0" @@ -91,7 +89,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "709ff1d5c869d5696212b9109ed79e5d9766c60c"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "46890bb61d35459e9793eba92d9ae54d4ce9c6af"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" diff --git a/pytest.ini b/pytest.ini index 4417f537dd0..450117d7f68 100644 --- a/pytest.ini +++ b/pytest.ini @@ -8,5 +8,5 @@ markers = enable_enqueue_task enable_bookmark_creation enable_implicit_clean - es enable_account_status_messaging + djelme_elasticsearch_backends diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 220d03ba2e6..1d8ee10bd32 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -108,7 +108,6 @@ def parent_dir(path): SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None ELASTIC_URI = '127.0.0.1:9200' -ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201') ELASTIC8_URI = os.environ.get('ELASTIC8_URI') ELASTIC8_CERT_PATH = os.environ.get('ELASTIC8_CERT_PATH') ELASTIC8_ASSERT_HOSTNAME = os.environ.get('ELASTIC8_ASSERT_HOSTNAME') @@ -485,7 +484,6 @@ class CeleryConfig: } background_migration_modules = { - 'osf.management.commands.migrate_osfmetrics_6to8', } try: @@ -602,7 +600,6 @@ class CeleryConfig: 'scripts.remove_after_use.merge_notification_subscription_provider_ct', 'scripts.disable_removed_beat_tasks', 'osf.management.commands.delete_withdrawn_or_failed_registration_files', - 'osf.management.commands.migrate_osfmetrics_6to8', ) # Modules that need metrics and release requirements @@ -2144,8 +2141,6 @@ def from_node_usage(cls, usage_bytes, private_limit=None, public_limit=None): CAS_LOG_LEVEL = 3 # ERROR -PREPRINT_METRICS_START_DATE = datetime.datetime(2019, 1, 1) - WAFFLE_VALUES_YAML = 'osf/features.yaml' DEFAULT_DRAFT_NODE_TITLE = 'Untitled' USE_COLOR = False