diff --git a/.docker-compose.env b/.docker-compose.env
index 80eebc8707b..a712f7ac23c 100644
--- a/.docker-compose.env
+++ b/.docker-compose.env
@@ -6,7 +6,6 @@ DOMAIN=http://localhost:5000/
INTERNAL_DOMAIN=http://192.168.168.167:5000/
API_DOMAIN=http://localhost:8000/
ELASTIC_URI=192.168.168.167:9200
-ELASTIC6_URI=192.168.168.167:9201
ELASTIC8_URI=http://192.168.168.167:9202
ELASTIC8_USERNAME=elastic
OSF_DB_HOST=192.168.168.167
diff --git a/.github/actions/build-es6/action.yml b/.github/actions/build-es6/action.yml
deleted file mode 100644
index 5eb71f2147f..00000000000
--- a/.github/actions/build-es6/action.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: 'Build ElasticSearch6'
-description: 'Building and starting the ElasticSearch6 service'
-inputs:
- ELASTICSEARCH6_ARCHIVE:
- description: 'Where ES6 archive is located'
-runs:
- using: "composite"
- steps:
- - id: ES6-Step-1
- shell: bash
- run: |
- cd ~/.cache/downloads
- if [ ! -f "${{ inputs.ELASTICSEARCH6_ARCHIVE }}" ]; then
- curl -SLO https://artifacts.elastic.co/downloads/elasticsearch/${{ inputs.ELASTICSEARCH6_ARCHIVE }}
- fi
-
- mkdir -p /tmp/elasticsearch6
- tar -xzf ${{ inputs.ELASTICSEARCH6_ARCHIVE }} -C /tmp/elasticsearch6 --strip-components=1
- echo "ES6 started..."
- - id: ES6-Step-2
- shell: bash
- run: /tmp/elasticsearch6/bin/elasticsearch > /dev/null & export ELASTICSEARCH6_PID=$!
- - id: ES6-Step-3
- shell: bash
- run: |
- echo "Waiting for ES6 health..."
- sleep 5
- while [ ! $(curl -sf http://localhost:9201/_cluster/health?wait_for_status=yellow) ]; do
- echo "trying again...";
- sleep 5;
- done
- echo "ES6 started successfully!"
diff --git a/.github/actions/start-build/action.yml b/.github/actions/start-build/action.yml
index 22d8deae0d5..b75362f56f5 100644
--- a/.github/actions/start-build/action.yml
+++ b/.github/actions/start-build/action.yml
@@ -12,9 +12,6 @@ runs:
- uses: ./.github/actions/build-es
with:
ELASTICSEARCH_ARCHIVE: ${{ env.ELASTICSEARCH_ARCHIVE }}
- - uses: ./.github/actions/build-es6
- with:
- ELASTICSEARCH6_ARCHIVE: ${{ env.ELASTICSEARCH6_ARCHIVE }}
- name: Set up Python 3.12
uses: actions/setup-python@v6
with:
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index f147941c5ff..ead75f5e21e 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -8,7 +8,6 @@ permissions:
env:
WHEELHOUSE: ~/.cache/wheelhouse
ELASTICSEARCH_ARCHIVE: elasticsearch-2.4.5.tar.gz
- ELASTICSEARCH6_ARCHIVE: elasticsearch-6.3.1.tar.gz
OSF_DB_PORT: 5432
OSF_DB_PASSWORD: postgres
GITHUB_ACTIONS: true
@@ -140,6 +139,8 @@ jobs:
- uses: ./.github/actions/start-build
- name: Run tests
run: poetry run python3 -m invoke test-ci-api3-and-osf --junit
+ env:
+ ELASTIC8_URI: http://localhost:9202
- name: Upload report
if: (success() || failure()) # run this step even if previous step failed
uses: ./.github/actions/gen-report
diff --git a/README-docker-compose.md b/README-docker-compose.md
index 0956379333f..6cc0f0341b3 100644
--- a/README-docker-compose.md
+++ b/README-docker-compose.md
@@ -116,11 +116,7 @@
#### Special Instructions for Apple Chipset (M1, M2, etc.) and other ARM64 architecture
- * _NOTE: The default `elasticsearch`, `elasticsearch6`, and `sharejs` containers are incompatible with ARM64._
-
- - To run `elasticsearch6` on ARM64 architecture:
-
- - Copy `docker-compose-dist-arm64.override.yml` into your `docker-compose.override.yml` file
+ * _NOTE: The default `elasticsearch` and `sharejs` containers are incompatible with ARM64._
- Running containers with docker compose
@@ -194,7 +190,7 @@
- Start all containers
```bash
- alias dcsa="docker compose up -d assets admin_assets mfr wb fakecas sharejs worker elasticsearch elasticsearch6 web api admin preprints gv"
+ alias dcsa="docker compose up -d assets admin_assets mfr wb fakecas sharejs worker elasticsearch elasticsearch8 web api admin preprints gv"
```
- Shut down all containers
diff --git a/addons/base/views.py b/addons/base/views.py
index 12b78fb9957..04e620c4c54 100644
--- a/addons/base/views.py
+++ b/addons/base/views.py
@@ -14,7 +14,7 @@
import waffle
from django.db import transaction
from django.contrib.contenttypes.models import ContentType
-from elasticsearch6 import exceptions as es_exceptions
+from elasticsearch8 import exceptions as es_exceptions
from rest_framework import status as http_status
from api.caching.tasks import update_storage_usage_with_size
@@ -34,7 +34,7 @@
from framework.flask import redirect
from framework.sentry import log_exception
from framework.transactions.handlers import no_auto_transaction
-from osf.metrics.es8_metrics import OsfCountedUsageEvent
+from osf.metrics.events import OsfCountedUsageEvent
from website import settings
from addons.base import signals as file_signals
from addons.base.utils import format_last_known_metadata, get_mfr_url
@@ -54,7 +54,6 @@
FileVersionUserMetadata,
FileVersion, NotificationTypeEnum
)
-from osf.metrics import PreprintView, PreprintDownload
from osf.utils import permissions
from osf.external.gravy_valet import request_helpers
from website.profile.utils import get_profile_image_url
@@ -686,12 +685,6 @@ def osfstoragefile_viewed_update_metrics(self, auth, fileversion, file_node):
return
if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint):
try:
- PreprintView.record_for_preprint(
- preprint=resource,
- user=auth.user,
- version=fileversion.identifier,
- path=file_node.path,
- )
OsfCountedUsageEvent.record(
user_id=getattr(user, '_id', None),
item_osfid=resource._id,
@@ -725,12 +718,6 @@ def osfstoragefile_downloaded_update_metrics(self, auth, fileversion, file_node)
return
if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint):
try:
- PreprintDownload.record_for_preprint(
- preprint=resource,
- user=auth.user,
- version=fileversion.identifier,
- path=file_node.path,
- )
OsfCountedUsageEvent.record(
user_id=getattr(user, '_id', None),
item_osfid=resource._id,
diff --git a/admin/management/urls.py b/admin/management/urls.py
index 2e4cd7479a1..3d29a259483 100644
--- a/admin/management/urls.py
+++ b/admin/management/urls.py
@@ -1,4 +1,4 @@
-from django.urls import re_path, path
+from django.urls import re_path
from admin.management import views
@@ -22,5 +22,4 @@
name='sync_notification_templates'),
re_path(r'^remove_orcid_from_user_social', views.RemoveOrcidFromUserSocial.as_view(),
name='remove_orcid_from_user_social'),
- path('migrate_osfmetrics_6to8', views.MigrateOsfmetrics6to8.as_view(), name='migrate_osfmetrics_6to8'),
]
diff --git a/admin/management/views.py b/admin/management/views.py
index 04034bfaa08..4b6e6b4c080 100644
--- a/admin/management/views.py
+++ b/admin/management/views.py
@@ -1,5 +1,3 @@
-from io import StringIO
-
from dateutil.parser import isoparse
from django.views.generic import TemplateView, View
from django.contrib import messages
@@ -205,22 +203,3 @@ def post(self, request):
remove_orcid_from_user_social()
messages.success(request, 'Orcid from user social have been successfully removed.')
return redirect(reverse('management:commands'))
-
-
-class MigrateOsfmetrics6to8(ManagementCommandPermissionView):
- def post(self, request):
- _command_kwargs = {
- 'no_color': True,
- 'no_counts': request.POST.get('no_counts'),
- 'clear_state': request.POST.get('clear_state'),
- 'clear_es8_data': request.POST.get('clear_es8_data'),
- 'start': request.POST.get('start'),
- 'unchanged': request.POST.get('unchanged'),
- 'usage_reports': request.POST.get('usage_reports'),
- 'usage_events': request.POST.get('usage_events'),
- }
- _out_io = StringIO()
- call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io)
- for _line in _out_io.getvalue().split('\n'):
- messages.info(request, _line)
- return redirect(reverse('management:commands'))
diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html
index 03be151ddbb..fd9ceec9c1b 100644
--- a/admin/templates/management/commands.html
+++ b/admin/templates/management/commands.html
@@ -190,31 +190,6 @@
Remove existing orcid info from user social
-
- migrate osf-metrics 6to8
-
- view progress of the osf-metrics migration from elastic6 to elastic8 (or start it)
-
-
-
{% endblock %}
diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py
index ecf2825d4e8..a8f4292c33e 100644
--- a/api/base/elasticsearch_dsl_views.py
+++ b/api/base/elasticsearch_dsl_views.py
@@ -3,8 +3,9 @@
import datetime
import typing
-import elasticsearch6_dsl as edsl
+import elasticsearch8.dsl as esdsl
from rest_framework import generics, exceptions as drf_exceptions
+from rest_framework.serializers import Serializer
from rest_framework.settings import api_settings as drf_settings
from api.base.settings.defaults import REPORT_FILENAME_FORMAT
@@ -23,7 +24,7 @@
class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC):
- '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue
+ '''abstract view class using `elasticsearch8.dsl.Search` as a queryset-analogue
builds a `Search` based on `self.get_default_search()` and the request's
query parameters for filtering, sorting, and pagination -- fetches only
@@ -35,18 +36,18 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView,
ordering_fields: frozenset[str] = frozenset() # serializer field names
@abc.abstractmethod
- def get_default_search(self) -> edsl.Search | None:
- '''the base `elasticsearch6_dsl.Search` for this list, based on url path
+ def get_default_search(self) -> esdsl.Search | None:
+ '''the base `elasticsearch8.dsl.Search` for this list, based on url path
(common jsonapi query parameters will be considered automatically)
'''
...
- FILE_RENDERER_CLASSES = {
+ FILE_RENDERER_CLASSES = (
MetricsReportsCsvRenderer,
MetricsReportsTsvRenderer,
MetricsReportsJsonRenderer,
- }
+ )
def set_content_disposition(self, response, renderer: str):
"""Set the Content-Disposition header to prompt a file download with the appropriate filename.
@@ -75,7 +76,7 @@ def finalize_response(self, request, response, *args, **kwargs):
response = super().finalize_response(request, response, *args, **kwargs)
# Check if this is a direct download request or file renderer classes, set to the Content-Disposition header
# so filename and attachment for browser download
- if isinstance(request.accepted_renderer, tuple(self.FILE_RENDERER_CLASSES)):
+ if isinstance(request.accepted_renderer, self.FILE_RENDERER_CLASSES):
self.set_content_disposition(response, request.accepted_renderer)
return response
@@ -95,7 +96,7 @@ def finalize_response(self, request, response, *args, **kwargs):
# (filtering handled in-view to reuse logic from FilterMixin)
filter_backends = ()
- # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on,
+ # note: because elasticsearch8.dsl.Search supports slicing and gives results when iterated on,
# it works fine with default pagination
# override rest_framework.generics.GenericAPIView
@@ -128,10 +129,17 @@ def get_queryset(self):
)
return self.__add_sort(_search)
+ def get_serializer_context(self):
+ return (
+ super().get_serializer_context()
+ if issubclass(self.get_serializer_class(), Serializer)
+ else {} # allow custom BaseSerializer-based serializer
+ )
+
###
# private methods
- def __add_sort(self, search: edsl.Search) -> edsl.Search:
+ def __add_sort(self, search: esdsl.Search) -> esdsl.Search:
_elastic_sort = self.__get_elastic_sort()
return (search if _elastic_sort is None else search.sort(_elastic_sort))
@@ -148,17 +156,20 @@ def __get_elastic_sort(self) -> str | None:
raise drf_exceptions.ValidationError(
f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})',
)
- _serializer_field = self.get_serializer().fields[_sort_field]
- _elastic_sort_field = _serializer_field.source
+ _elastic_sort_field = (
+ self.get_serializer().fields[_sort_field].source
+ if issubclass(self.get_serializer_class(), Serializer)
+ else _sort_field # allow custom BaseSerializer-based serializer
+ )
return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}')
def __add_search_filter(
self,
- search: edsl.Search,
+ search: esdsl.Search,
elastic_field_name: str,
operator: str,
value: str,
- ) -> edsl.Search:
+ ) -> esdsl.Search:
match operator: # operators from FilterMixin
case 'eq':
if value == '':
diff --git a/api/base/metrics.py b/api/base/metrics.py
index d68f19a45b8..d5027403ca8 100644
--- a/api/base/metrics.py
+++ b/api/base/metrics.py
@@ -1,15 +1,14 @@
-import re
-from datetime import timedelta
-
+import abc
import waffle
-from django.utils import timezone
from api.base.exceptions import InvalidQueryStringError
from osf import features
-from website.settings import PREPRINT_METRICS_START_DATE
+from osf.metrics.events import OsfCountedUsageEvent
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
+from osf.models.base import osfid_iri
-class MetricsViewMixin:
+class UsageMetricsViewMixin(abc.ABC):
"""Mixin for views that expose metrics via django-elasticsearch-metrics.
Enables metrics to be requested with a query parameter, like so: ::
@@ -18,110 +17,98 @@ class MetricsViewMixin:
Any subclass of this mixin MUST do the following:
* Use a serializer_class that subclasses MetricsSerializerMixin
- * Define metric_map as a class variable. It should be dict mapping metric name
- ("downloads") to a Metric class (PreprintDownload)
- * For list views: implement `get_annotated_queryset_with_metrics`
- * For detail views: implement `add_metric_to_object`
+ * Call add_metrics_to_object(obj) to get `views` and/or `downloads`
+ assigned on the obj (according to query params)
"""
- # Adapted from FilterMixin.QUERY_PATTERN
- METRICS_QUERY_PATTERN = re.compile(r'^metrics\[(?P((?:,*\s*\w+)*))\]$')
- TIMEDELTA_MAP = {
- 'daily': timedelta(hours=24),
- 'weekly': timedelta(days=7),
- 'monthly': timedelta(days=30),
- 'yearly': timedelta(days=365),
+ METRICS_QUERY_MAP = {
+ 'metrics[views]': OsfCountedUsageEvent.ActionLabel.VIEW,
+ 'metrics[downloads]': OsfCountedUsageEvent.ActionLabel.DOWNLOAD,
+ }
+ METRICS_ATTR_MAP = {
+ OsfCountedUsageEvent.ActionLabel.VIEW: 'views',
+ OsfCountedUsageEvent.ActionLabel.DOWNLOAD: 'downloads',
+ }
+ TIMESPAN_MAP = {
+ 'daily': 'now-1d/d',
+ 'weekly': 'now-1w/d',
+ 'monthly': 'now-1M/d',
}
VALID_METRIC_PERIODS = {
'daily',
'weekly',
'monthly',
- 'yearly',
'total',
}
- @property
- def metric_map(self):
- raise NotImplementedError('MetricsViewMixin subclasses must define a metric_map class variable.')
-
- def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after):
- """Return a queryset annotated with metrics. Use for list endpoints that expose metrics."""
- raise NotImplementedError('MetricsViewMixin subclasses must define get_annotated_queryset_with_metrics().')
-
- def add_metric_to_object(self, obj, metric_class, metric_name, after):
- """Set an attribute for a metric on obj. Use for detail endpoints that expose metrics.
- Return the modified object.
- """
- raise NotImplementedError('MetricsViewMixin subclasses must define add_metric_to_object().')
-
- @property
- def metrics_default_after(self):
- """Value to be used as the `after` in metrics queries if not otherwise specified.
- Datetime or None.
- """
- return None
-
@property
def metrics_requested(self):
return (
- waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and
- bool(self.parse_metric_query_params(self.request.query_params))
+ waffle.switch_is_active(features.ELASTICSEARCH_METRICS)
+ and any(_param in self.METRICS_QUERY_MAP for _param in self.request.query_params)
)
- # Adapted from FilterMixin.parse_query_params
- # TODO: Should we get rid of query_params argument and use self.request.query_params instead?
- def parse_metric_query_params(self, query_params):
+ def get_item_iri(self, item):
+ return osfid_iri(item._id)
+
+ def parse_metric_query_params(self):
"""Parses query parameters to a dict usable for fetching metrics.
:param dict query_params:
:return dict of the format {
- : {
- 'period': <[daily|weekly|monthly|yearly|total]>,
- }
+ : <[daily|weekly|monthly|total]>,
}
"""
query = {}
- for key, value in query_params.items():
- match = self.METRICS_QUERY_PATTERN.match(key)
- if match:
- match_dict = match.groupdict()
- metric_name = match_dict['metric_name']
- query[metric_name] = value
+ for key, value in self.request.query_params.items():
+ _usage_label = self.METRICS_QUERY_MAP.get(key)
+ if _usage_label:
+ if value not in self.VALID_METRIC_PERIODS:
+ raise InvalidQueryStringError(f"Invalid period for metric: '{value}'", parameter='metrics')
+ query[_usage_label] = value
return query
- def _add_metrics(self, queryset_or_obj, method):
- """Parse the ?metric[METRIC]=PERIOD query param, validate it, and
- run ``method`` for each requested object.
-
- This is used to share code between add_metric_to_object and get_metrics_queryset.
+ def add_metrics_to_object(self, obj):
+ """Helper method used for detail views.
"""
- metrics_requested = self.parse_metric_query_params(self.request.query_params)
- if metrics_requested:
- metric_map = self.metric_map
- for metric, period in metrics_requested.items():
- if metric not in metric_map:
- raise InvalidQueryStringError(f"Invalid metric in query string: '{metric}'", parameter='metrics')
- if period not in self.VALID_METRIC_PERIODS:
- raise InvalidQueryStringError(f"Invalid period for metric: '{period}'", parameter='metrics')
- metric_class = metric_map[metric]
- if period == 'total':
- after = self.metrics_default_after
+ for _action_label, _period in self.parse_metric_query_params().items():
+ _count = self._get_usage_count(self.get_item_iri(obj), _action_label, _period)
+ setattr(obj, self.METRICS_ATTR_MAP[_action_label], _count)
+
+ def _get_usage_count(self, item_iri, action_label, period):
+ _search = (
+ OsfCountedUsageEvent.search()
+ .filter('term', item_iri=item_iri)
+ .filter('term', action_labels=action_label.value)
+ )
+ _prior_count = 0
+ if _timespan := self.TIMESPAN_MAP.get(period):
+ _search = _search.filter('range', timestamp={'gte': _timespan})
+ else: # cumulative total
+ _latest_usage_report = self._get_latest_usage_report(item_iri)
+ if _latest_usage_report:
+ _search = _search.filter(
+ 'range', timestamp={
+ 'gte': _latest_usage_report.report_yearmonth.month_end(),
+ },
+ )
+ if action_label == OsfCountedUsageEvent.ActionLabel.VIEW:
+ _prior_count = _latest_usage_report.cumulative_view_count
+ elif action_label == OsfCountedUsageEvent.ActionLabel.DOWNLOAD:
+ _prior_count = _latest_usage_report.cumulative_download_count
else:
- after = timezone.now() - self.TIMEDELTA_MAP[period]
- queryset_or_obj = method(queryset_or_obj, metric_class, metric, after)
- return queryset_or_obj
-
- def add_metrics_to_object(self, obj):
- """Helper method used for detail views."""
- return self._add_metrics(obj, method=self.add_metric_to_object)
-
- def get_metrics_queryset(self, queryset):
- """Helper method used for list views."""
- return self._add_metrics(queryset, method=self.get_annotated_queryset_with_metrics)
+ raise ValueError(f'unsupported action label {action_label!r}')
+ _response = _search[0:0].execute()
+ return _prior_count + _response.doc_count
+
+ def _get_latest_usage_report(self, item_iri):
+ _search = (
+ MonthlyPublicItemUsageReport.search()
+ .filter('term', item_iri=item_iri)
+ .sort('-cycle_coverage')
+ )
+ _response = _search[0].execute()
+ return _response[0] if _response else None
- # Override get_default_queryset for convenience
- def get_default_queryset(self):
- queryset = super().get_default_queryset()
- return self.get_metrics_queryset(queryset)
class MetricsSerializerMixin:
@property
@@ -138,9 +125,3 @@ def get_meta(self, obj):
meta = meta or {'metrics': {}}
meta['metrics'][metric] = getattr(obj, metric)
return meta
-
-
-class PreprintMetricsViewMixin(MetricsViewMixin):
- @property
- def metrics_default_after(self):
- return PREPRINT_METRICS_START_DATE
diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py
index ac9a9739f1b..b250b283ed8 100644
--- a/api/base/settings/defaults.py
+++ b/api/base/settings/defaults.py
@@ -321,12 +321,6 @@
# django-elasticsearch-metrics
DJELME_BACKENDS = {
- 'osfmetrics_es6': {
- 'elasticsearch_metrics.imps.elastic6': {
- 'hosts': osf_settings.ELASTIC6_URI,
- 'retry_on_timeout': True,
- },
- },
'osfmetrics_es8': {
'elasticsearch_metrics.imps.elastic8': {
# passthru kwargs to elasticsearch8 connection constructor
diff --git a/api/institutions/views.py b/api/institutions/views.py
index d653f5b4e77..159e303b0ef 100644
--- a/api/institutions/views.py
+++ b/api/institutions/views.py
@@ -10,8 +10,11 @@
from framework.auth.oauth_scopes import CoreScopes
from osf.models import OSFUser, Node, Institution, Registration
-from osf.metrics.reports import InstitutionalUserReport, InstitutionMonthlySummaryReport
-from osf.metrics.utils import YearMonth
+from osf.metrics.monthly_reports import (
+ MonthlyInstitutionalUserReport,
+ MonthlyInstitutionSummaryReport,
+)
+from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth
from osf.utils import permissions as osf_permissions
from api.base import permissions as base_permissions
@@ -27,11 +30,6 @@
)
from api.base.exceptions import RelationshipPostMakesNoChanges
from api.metrics.permissions import IsInstitutionalMetricsUser
-from api.metrics.renderers import (
- MetricsReportsCsvRenderer,
- MetricsReportsTsvRenderer,
- MetricsReportsJsonRenderer,
-)
from api.nodes.serializers import NodeSerializer
from api.nodes.filters import NodesFilterMixin
from api.users.serializers import UserSerializer
@@ -411,23 +409,21 @@ class InstitutionDepartmentList(InstitutionMixin, ElasticsearchListView):
serializer_class = InstitutionDepartmentMetricsSerializer
renderer_classes = (
*api_settings.DEFAULT_RENDERER_CLASSES,
- MetricsReportsCsvRenderer,
- MetricsReportsTsvRenderer,
- MetricsReportsJsonRenderer,
+ *ElasticsearchListView.FILE_RENDERER_CLASSES,
)
pagination_class = JSONAPINoPagination
def get_default_search(self):
_base_search = (
- InstitutionalUserReport.search()
+ MonthlyInstitutionalUserReport.search()
.filter('term', institution_id=self.get_institution()._id)
)
- _yearmonth = InstitutionalUserReport.most_recent_yearmonth(base_search=_base_search)
- if _yearmonth is None:
+ _most_recent_cycle = MonthlyInstitutionalUserReport.most_recent_cycle(_base_search)
+ if _most_recent_cycle is None:
return None
_search = (
_base_search
- .filter('term', report_yearmonth=str(_yearmonth))
+ .filter('term', cycle_coverage=_most_recent_cycle)
.exclude('term', user_name='Deleted user')
)
# add aggregation on department name
@@ -468,9 +464,7 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView):
view_name = 'institution-user-metrics'
renderer_classes = (
*api_settings.DEFAULT_RENDERER_CLASSES,
- MetricsReportsCsvRenderer,
- MetricsReportsTsvRenderer,
- MetricsReportsJsonRenderer,
+ *ElasticsearchListView.FILE_RENDERER_CLASSES,
)
serializer_class = InstitutionUserMetricsSerializer
@@ -492,17 +486,16 @@ class InstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView):
))
def get_default_search(self):
- base_search = InstitutionalUserReport.search().filter(
- 'term',
- institution_id=self.get_institution()._id,
+ _base_search = (
+ MonthlyInstitutionalUserReport.search()
+ .filter('term', institution_id=self.get_institution()._id)
)
- yearmonth = InstitutionalUserReport.most_recent_yearmonth(base_search=base_search)
- if yearmonth is None:
+ _most_recent_cycle = MonthlyInstitutionalUserReport.most_recent_cycle(_base_search)
+ if _most_recent_cycle is None:
return None
-
return (
- base_search
- .filter('term', report_yearmonth=str(yearmonth))
+ _base_search
+ .filter('term', cycle_coverage=_most_recent_cycle)
.exclude('term', user_name='Deleted user')
)
@@ -525,29 +518,33 @@ class InstitutionSummaryMetricsDetail(JSONAPIBaseView, generics.RetrieveAPIView,
serializer_class = InstitutionSummaryMetricsSerializer
def get_object(self):
- institution = self.get_institution()
- search_object = self.get_default_search()
- if search_object:
- object = search_object.execute()[0]
- object.id = institution._id
- return object
+ _institution = self.get_institution()
+ _search = self.get_default_search()
+ if _search:
+ _response = _search[0].execute()
+ if _response:
+ _report = _response[0]
+ _report.id = _institution._id
+ return _report
+ return None
def get_default_search(self):
- base_search = InstitutionMonthlySummaryReport.search().filter(
- 'term',
- institution_id=self.get_institution()._id,
+ _base_search = (
+ MonthlyInstitutionSummaryReport.search()
+ .filter('term', institution_id=self.get_institution()._id)
)
- yearmonth = InstitutionMonthlySummaryReport.most_recent_yearmonth(base_search=base_search)
- if report_date_str := self.request.query_params.get('report_yearmonth'):
+ _cycle_coverage = None
+ if _yearmonth_str := self.request.query_params.get('report_yearmonth'):
try:
- yearmonth = YearMonth.from_str(report_date_str)
+ _yearmonth = YearMonth.from_str(_yearmonth_str)
except ValueError:
- pass
-
- if yearmonth is None:
+ raise exceptions.ValidationError(
+ 'report_yearmonth query param must be in YYYY-MM format',
+ )
+ else:
+ _cycle_coverage = cycle_coverage_yearmonth(_yearmonth)
+ else:
+ _cycle_coverage = MonthlyInstitutionSummaryReport.most_recent_cycle(_base_search)
+ if _cycle_coverage is None:
return None
-
- return base_search.filter(
- 'term',
- report_yearmonth=str(yearmonth),
- )
+ return _base_search.filter('term', cycle_coverage=_cycle_coverage)
diff --git a/api/metrics/serializers.py b/api/metrics/serializers.py
index 9e3f61f5b50..eba54f2bb5d 100644
--- a/api/metrics/serializers.py
+++ b/api/metrics/serializers.py
@@ -1,26 +1,12 @@
import logging
-import datetime
from rest_framework import serializers as ser
-from api.base.serializers import BaseAPISerializer
from api.base.utils import absolute_reverse
-from osf.metrics.counted_usage import CountedAuthUsage, PageviewInfo
-from osf.metrics.es8_metrics import (
- OsfCountedUsageEvent,
- PageviewInfo as PageviewInfoEs8,
-)
-from website import settings as website_settings
-
-logger = logging.getLogger(__name__)
+from osf.metrics.events import OsfCountedUsageEvent
-class PreprintMetricSerializer(BaseAPISerializer):
-
- query = ser.DictField()
-
- class Meta:
- type_ = 'preprint_metrics'
+logger = logging.getLogger(__name__)
class RawMetricsSerializer():
@@ -30,9 +16,9 @@ class RawMetricsSerializer():
def validate_action_label(label):
try:
- CountedAuthUsage.ActionLabel(label)
+ OsfCountedUsageEvent.ActionLabel(label)
except ValueError:
- valid_labels = ', '.join(label.value for label in CountedAuthUsage.ActionLabel)
+ valid_labels = ', '.join(label.value for label in OsfCountedUsageEvent.ActionLabel)
raise ser.ValidationError(
f'Invalid value in action_labels! Valid labels: {valid_labels}',
)
@@ -67,31 +53,17 @@ def validate(self, data):
return data
def create(self, validated_data):
- pageview_info = None
- pageview_info_es8 = None
- if pageview_info_data := validated_data.get('pageview_info'):
- pageview_info = PageviewInfo(**pageview_info_data)
- pageview_info_es8 = PageviewInfoEs8(**pageview_info_data)
- OsfCountedUsageEvent.record(
+ return OsfCountedUsageEvent.record(
item_osfid=validated_data['item_guid'],
action_labels=validated_data.get('action_labels'),
provider_id=validated_data.get('provider_id'),
- pageview_info=pageview_info_es8,
+ pageview_info=validated_data.get('pageview_info'),
# used to create a COUNTER session-hour id, not stored:
client_session_id=validated_data.get('client_session_id'),
user_id=self.context.get('user_id'),
request_host=self.context.get('request_host'),
request_useragent=self.context.get('request_useragent'),
)
- return CountedAuthUsage.record(
- platform_iri=website_settings.DOMAIN,
- provider_id=validated_data.get('provider_id'),
- item_guid=validated_data.get('item_guid'),
- session_id=validated_data['session_id'], # must be provided by the view
- user_is_authenticated=validated_data['user_is_authenticated'], # must be provided by the view
- action_labels=validated_data.get('action_labels'),
- pageview_info=pageview_info,
- )
class ReportNameSerializer(ser.BaseSerializer):
@@ -109,44 +81,19 @@ def to_representation(self, instance):
}
-class DailyReportSerializer(ser.BaseSerializer):
- def to_representation(self, instance):
- # TODO: detangle datamodel (osf.metrics.reports) from api serialization
- # (don't use `to_dict` here)
- report_as_dict = instance.to_dict()
- report_name = self.context['report_name']
- report_date = report_as_dict['report_date']
-
- if isinstance(report_date, datetime.datetime):
- report_date = report_date.date()
- if isinstance(report_date, datetime.date):
- report_date = str(report_date)
-
- return {
- 'id': instance.meta.id,
- 'type': f'daily-report:{report_name}',
- 'attributes': {
- **report_as_dict,
- 'report_date': report_date,
- },
- }
-
-
-class MonthlyReportSerializer(ser.BaseSerializer):
+class CyclicReportSerializer(ser.BaseSerializer):
def to_representation(self, instance):
- # TODO: detangle datamodel (osf.metrics.reports) from api serialization
- # (don't use `to_dict` here)
- report_as_dict = instance.to_dict()
- report_name = self.context['report_name']
- report_yearmonth = report_as_dict['report_yearmonth']
-
+ # TODO: detangle datamodel from api serialization (don't use `to_dict` here)
+ _report_attrs = instance.to_dict()
+ for _extra_attr in ('report_date', 'report_yearmonth'):
+ if (_extra_attr not in _report_attrs) and hasattr(instance, _extra_attr):
+ _report_attrs[_extra_attr] = getattr(instance, _extra_attr)
+ del _report_attrs['cycle_coverage']
+ _report_name = self.context['report_name']
return {
'id': instance.meta.id,
- 'type': f'monthly-report:{report_name}',
- 'attributes': {
- **report_as_dict,
- 'report_month': report_yearmonth,
- },
+ 'type': f'cyclic-report:{_report_name}',
+ 'attributes': _report_attrs,
}
@@ -158,28 +105,28 @@ def to_representation(self, instance):
'path': bucket['key'],
'route': bucket['route-for-path'].buckets[0]['key'],
'title': bucket['title-for-path'].buckets[0]['key'],
- 'count': bucket['doc_count'],
+ 'count': bucket['unique-count'].value,
}
for bucket in aggs['popular-pages'].buckets
]
unique_visits = [
{
'date': bucket['key'].date(),
- 'count': bucket['doc_count'],
+ 'count': bucket['unique-count'].value,
}
for bucket in aggs['unique-visits'].buckets
]
time_of_day = [
{
'hour': bucket['key'],
- 'count': bucket['doc_count'],
+ 'count': bucket['unique-count'].value,
}
for bucket in aggs['time-of-day'].buckets
]
referer_domain = [
{
'referer_domain': bucket['key'],
- 'count': bucket['doc_count'],
+ 'count': bucket['unique-count'].value,
}
for bucket in aggs['referer-domain'].buckets
]
diff --git a/api/metrics/urls.py b/api/metrics/urls.py
index db63df3dd4c..d9bc0a92307 100644
--- a/api/metrics/urls.py
+++ b/api/metrics/urls.py
@@ -5,11 +5,8 @@
app_name = 'osf'
urlpatterns = [
- re_path(r'^raw/(?P[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}),
path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}),
path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name),
- re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name),
- re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name),
re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name),
re_path(
@@ -17,8 +14,13 @@
views.ReportNameList.as_view(),
name=views.ReportNameList.view_name,
),
- re_path(
- r'^reports/(?P<report_name>[a-z0-9_]+)/recent/$',
+ path(
+ 'reports/<report_name>/',
+ views.ReportList.as_view(),
+ name=views.ReportList.view_name,
+ ),
+ path(
+ 'reports/<report_name>/recent/',
views.RecentReportList.as_view(),
name=views.RecentReportList.view_name,
),
diff --git a/api/metrics/utils.py b/api/metrics/utils.py
index 54af7531200..3ffc515e9f1 100644
--- a/api/metrics/utils.py
+++ b/api/metrics/utils.py
@@ -8,7 +8,7 @@
from rest_framework.exceptions import ValidationError
from osf.models import AbstractNode, Guid
-from osf.metrics.counted_usage import _get_immediate_wrapper
+from osf.metrics.utils import get_immediate_wrapper
DATETIME_FORMAT = '%Y-%m-%dT%H:%M'
@@ -124,7 +124,7 @@ def _user_has_read_on_resolved_node(user, guid_referent):
"""True if ``user`` has READ on the node this referent belongs to."""
current = guid_referent
while current is not None and not isinstance(current, AbstractNode):
- current = _get_immediate_wrapper(current)
+ current = get_immediate_wrapper(current)
if current is None or not isinstance(current, AbstractNode):
return False
return current.contributors_and_group_members.filter(guids___id=user._id).exists()
diff --git a/api/metrics/views.py b/api/metrics/views.py
index bd53bee296e..829426755ed 100644
--- a/api/metrics/views.py
+++ b/api/metrics/views.py
@@ -4,25 +4,21 @@
from enum import Enum
from django.http import JsonResponse, HttpResponse, Http404
-from django.utils import timezone
-
-from elasticsearch6.exceptions import NotFoundError, RequestError
-from elasticsearch6_dsl.connections import get_connection
+from elasticsearch8.exceptions import ApiError as Es8ApiError
from elasticsearch_metrics.registry import djelme_registry
from framework.auth.oauth_scopes import CoreScopes
-from rest_framework.exceptions import ValidationError
from rest_framework import permissions as drf_permissions
from rest_framework.response import Response
from rest_framework.generics import GenericAPIView
from rest_framework.settings import api_settings as drf_api_settings
+from api.base.elasticsearch_dsl_views import ElasticsearchListView
from api.base.views import JSONAPIBaseView
from api.base.permissions import TokenHasScope
from api.base.waffle_decorators import require_switch
from api.metrics.permissions import (
- IsPreprintMetricsUser,
IsRawMetricsUser,
IsRegistriesModerationMetricsUser,
)
@@ -31,10 +27,8 @@
MetricsReportsTsvRenderer,
)
from api.metrics.serializers import (
- PreprintMetricSerializer,
RawMetricsSerializer,
- DailyReportSerializer,
- MonthlyReportSerializer,
+ CyclicReportSerializer,
ReportNameSerializer,
NodeAnalyticsSerializer,
UserVisitsSerializer,
@@ -42,169 +36,51 @@
CountedAuthUsageSerializer,
)
from api.metrics.utils import (
- parse_datetimes,
parse_date_range,
should_skip_counted_usage,
)
from api.nodes.permissions import MustBePublic
from osf.features import ENABLE_RAW_METRICS
-from osf.metrics import (
- utils,
- reports,
- PreprintDownload,
- PreprintView,
- RegistriesModerationMetrics,
- CountedAuthUsage,
+from osf.metrics.events import (
+ OsfCountedUsageEvent,
+ RegistriesModerationEvent,
+)
+from osf.metrics.daily_reports import (
+ BaseDailyReport,
+ DailyDownloadCountReport,
+ DailyInstitutionSummaryReport,
+ DailyNodeSummaryReport,
+ DailyOsfstorageFileCountReport,
+ DailyPreprintSummaryReport,
+ DailyStorageAddonUsageReport,
+ DailyUserSummaryReport,
+ DailyNewUserDomainReport,
+)
+from osf.metrics.monthly_reports import (
+ BaseMonthlyReport,
+ MonthlySpamSummaryReport,
)
from osf.metrics.openapi import get_metrics_openapi_json_dict
from osf.models import AbstractNode
+from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates
logger = logging.getLogger(__name__)
-class PreprintMetricMixin(JSONAPIBaseView):
- permission_classes = (
- drf_permissions.IsAuthenticated,
- drf_permissions.IsAdminUser,
- IsPreprintMetricsUser,
- TokenHasScope,
- )
-
- required_read_scopes = [CoreScopes.METRICS_BASIC]
- required_write_scopes = [CoreScopes.METRICS_RESTRICTED]
-
- serializer_class = PreprintMetricSerializer
-
- @property
- def metric_type(self):
- raise NotImplementedError
-
- @property
- def metric(self):
- raise NotImplementedError
-
- def add_search(self, search, query_params, **kwargs):
- """
- get list of guids from the kwargs
- use that in a query to narrow down metrics results
- """
- preprint_guid_string = query_params.get('guids')
- if not preprint_guid_string:
- raise ValidationError(
- 'To gather metrics for preprints, you must provide one or more preprint ' +
- 'guids in the `guids` query parameter.',
- )
- preprint_guids = preprint_guid_string.split(',')
-
- return search.filter('terms', preprint_id=preprint_guids)
-
- def format_response(self, response, query_params):
- data = []
- if getattr(response, 'aggregations') and response.aggregations:
- for result in response.aggregations.dates.buckets:
- guid_results = {}
- for preprint_result in result.preprints.buckets:
- guid_results[preprint_result['key']] = preprint_result['total']['value']
- # return 0 for the guids with no results for consistent payloads
- guids = query_params['guids'].split(',')
- if guid_results.keys() != guids:
- for guid in guids:
- if not guid_results.get(guid):
- guid_results[guid] = 0
- result_dict = {result.key_as_string: guid_results}
- data.append(result_dict)
-
- return {
- 'metric_type': self.metric_type,
- 'data': data,
- }
-
- def execute_search(self, search, query=None):
- try:
- # There's a bug in the ES python library the prevents us from updating the search object, so lets just make
- # the raw query. If we have it.
- if query:
- es = get_connection(search._using)
- response = search._response_class(
- search,
- es.search(
- index=search._index,
- body=query,
- ),
- )
- else:
- response = search.execute()
- except NotFoundError:
- # _get_relevant_indices returned 1 or more indices
- # that doesn't exist. Fall back to unoptimized query
- search = search.index().index(self.metric._default_index())
- response = search.execute()
- return response
-
- def get(self, *args, **kwargs):
- query_params = getattr(self.request, 'query_params', self.request.GET)
-
- interval = query_params.get('interval', 'day')
-
- start_datetime, end_datetime = parse_datetimes(query_params)
-
- search = self.metric.search(after=start_datetime)
- search = search.filter('range', timestamp={'gte': start_datetime, 'lt': end_datetime})
- search.aggs.bucket('dates', 'date_histogram', field='timestamp', interval=interval) \
- .bucket('preprints', 'terms', field='preprint_id') \
- .metric('total', 'sum', field='count')
- search = self.add_search(search, query_params, **kwargs)
- response = self.execute_search(search)
- resp_dict = self.format_response(response, query_params)
-
- return JsonResponse(resp_dict)
-
- def post(self, request, *args, **kwargs):
- """
- For a bit of future proofing, accept custom elasticsearch aggregation queries in JSON form.
- Caution - this could be slow if a very large query is executed, so use with care!
- """
- search = self.metric.search()
- query = request.data.get('query')
-
- try:
- results = self.execute_search(search, query)
- except RequestError as e:
- if e.args:
- raise ValidationError(e.info['error']['root_cause'][0]['reason'])
- raise ValidationError('Malformed elasticsearch query.')
-
- return JsonResponse(results.to_dict())
-
-
-class PreprintViewMetrics(PreprintMetricMixin):
-
- view_category = 'preprint-metrics'
- view_name = 'preprint-view-metrics'
-
- @property
- def metric_type(self):
- return 'views'
-
- @property
- def metric(self):
- return PreprintView
-
-
-class PreprintDownloadMetrics(PreprintMetricMixin):
-
- view_category = 'preprint-metrics'
- view_name = 'preprint-download-metrics'
-
- @property
- def metric_type(self):
- return 'downloads'
+VIEWABLE_REPORTS = {  # report name (as given in a reports url) -> report class to search
+ 'download_count': DailyDownloadCountReport,
+ 'institution_summary': DailyInstitutionSummaryReport,
+ 'node_summary': DailyNodeSummaryReport,
+ 'osfstorage_file_count': DailyOsfstorageFileCountReport,
+ 'preprint_summary': DailyPreprintSummaryReport,
+ 'storage_addon_usage': DailyStorageAddonUsageReport,
+ 'user_summary': DailyUserSummaryReport,
+ 'spam_summary': MonthlySpamSummaryReport,
+ 'new_user_domains': DailyNewUserDomainReport,
+}
- @property
- def metric(self):
- return PreprintDownload
class RawMetricsView(GenericAPIView):
@@ -222,47 +98,59 @@ class RawMetricsView(GenericAPIView):
serializer_class = RawMetricsSerializer
- @require_switch(ENABLE_RAW_METRICS)
- def delete(self, request, *args, **kwargs):
- raise ValidationError('DELETE not supported. Use GET/POST/PUT')
-
@require_switch(ENABLE_RAW_METRICS)
def get(self, request, *args, djelme_backend_name, url_path, **kwargs):
- _response_body = self._do_es_request(
+ return self._do_es_request(
+ request,
djelme_backend_name,
method='GET',
path=url_path,
- qp=request.GET,
)
- return JsonResponse(_response_body)
@require_switch(ENABLE_RAW_METRICS)
def post(self, request, *args, djelme_backend_name, url_path, **kwargs):
- _response_body = self._do_es_request(
+ return self._do_es_request(
+ request,
djelme_backend_name,
method='POST',
path=url_path,
- qp=request.GET,
- body=json.loads(request.body),
)
- return JsonResponse(_response_body)
@require_switch(ENABLE_RAW_METRICS)
def put(self, request, *args, djelme_backend_name, url_path, **kwargs):
- _response_body = self._do_es_request(
+ return self._do_es_request(
+ request,
djelme_backend_name,
method='PUT',
path=url_path,
- qp=request.GET,
- body=json.loads(request.body),
)
- return JsonResponse(_response_body)
- def _do_es_request(self, djelme_backend_name, method, path, qp, body=None):
+ def _do_es_request(self, django_request, djelme_backend_name, method, path):
_client = self._get_es_client(djelme_backend_name)
- _perform_fn = getattr(_client, 'perform_request', None) or _client.transport.perform_request
- _response = _perform_fn(method, f'/{path}', params=qp.dict(), body=body)
- return _response if isinstance(_response, dict) else _response.body
+ _body = (
+ json.loads(django_request.body)
+ if django_request.body else None
+ )
+ _content_type = django_request.headers.get('Content-Type')
+ _headers = (
+ {'Content-Type': _content_type, 'Accept': 'application/json'}
+ if _content_type else None
+ )
+ try:
+ _response = _client.perform_request(
+ method,
+ f'/{path}',
+ params=django_request.GET.dict(),
+ body=_body,
+ headers=_headers,
+ )
+ except Es8ApiError as _api_error:
+ return HttpResponse(
+ str(_api_error),
+ content_type='text/plain; charset=utf-8',
+ status=_api_error.status_code,
+ )
+ return JsonResponse(_response.body)
def _get_es_client(self, djelme_backend_name):
try:
@@ -287,21 +175,85 @@ class RegistriesModerationMetricsView(GenericAPIView):
view_name = 'raw-metrics-view'
def get(self, request, *args, **kwargs):
- return JsonResponse(RegistriesModerationMetrics.get_registries_info())
-
+ _search = RegistriesModerationEvent.search().update_from_dict(self._build_es_query())
+ _search_response = _search.execute()
+ _providers_agg_json = (
+ _search_response.aggregations['providers'].to_dict()
+ if _search_response.aggregations
+ else {}
+ )
+ return JsonResponse(_providers_agg_json)
+
+    def _build_es_query(self):
+        """Build the search body that aggregates moderation events per provider.
+
+        The returned dict holds a single `terms` aggregation on `provider_id`
+        (named "providers"); each provider bucket carries sub-aggregations that
+        count transitions with/without a comment, submissions, and transitions
+        matching specific (to_state, trigger) pairs.
+        """
+        _triggers = RegistrationModerationTriggers
+        _states = RegistrationModerationStates
+
+        def _state_and_trigger(state, trigger):
+            # filter aggregation requiring both the destination state and the trigger
+            return {
+                'filter': {
+                    'bool': {
+                        'must': [
+                            {'term': {'to_state': state.db_name}},
+                            {'term': {'trigger': trigger.db_name}},
+                        ],
+                    },
+                },
+            }
+
+        _per_provider_aggs = {
+            'transitions_without_comments': {'missing': {'field': 'comment'}},
+            'transitions_with_comments': {'filter': {'exists': {'field': 'comment'}}},
+            'submissions': {'filter': {'term': {'trigger': _triggers.SUBMIT.db_name}}},
+            'accepted_with_embargo': _state_and_trigger(_states.EMBARGO, _triggers.SUBMIT),
+            'accepted_without_embargo': _state_and_trigger(_states.ACCEPTED, _triggers.SUBMIT),
+            'rejected': _state_and_trigger(_states.REJECTED, _triggers.REJECT_SUBMISSION),
+            'withdrawn': _state_and_trigger(_states.WITHDRAWN, _triggers.ACCEPT_WITHDRAWAL),
+        }
+        return {
+            'aggs': {
+                'providers': {
+                    'terms': {'field': 'provider_id'},
+                    'aggs': _per_provider_aggs,
+                },
+            },
+        }
-VIEWABLE_REPORTS = {
- 'download_count': reports.DownloadCountReport,
- 'institution_summary': reports.InstitutionSummaryReport,
- 'node_summary': reports.NodeSummaryReport,
- 'osfstorage_file_count': reports.OsfstorageFileCountReport,
- 'preprint_summary': reports.PreprintSummaryReport,
- 'storage_addon_usage': reports.StorageAddonUsage,
- 'user_summary': reports.UserSummaryReport,
- 'spam_summary': reports.SpamSummaryReport,
- 'new_user_domains': reports.NewUserDomainReport,
-}
+###
+# reports
class ReportNameList(JSONAPIBaseView):
permission_classes = (
@@ -325,6 +277,51 @@ def get(self, request, *args, **kwargs):
return Response({'data': serializer.data})
+class ReportList(ElasticsearchListView):
+    """List the reports of the viewable report type named in the url."""
+
+    view_category = 'metrics'
+    view_name = 'report-list'
+
+    permission_classes = (
+        TokenHasScope,
+        drf_permissions.IsAuthenticatedOrReadOnly,
+    )
+
+    required_read_scopes = [CoreScopes.ALWAYS_PUBLIC]
+    required_write_scopes = [CoreScopes.NULL]
+
+    serializer_class = CyclicReportSerializer
+    renderer_classes = (
+        *drf_api_settings.DEFAULT_RENDERER_CLASSES,
+        *ElasticsearchListView.FILE_RENDERER_CLASSES,
+    )
+
+    default_ordering = '-cycle_coverage'
+    ordering_fields = frozenset((
+        'cycle_coverage',
+    ))
+
+    def get_default_search(self):
+        """Return the base search for the report class named in the url.
+
+        Raises Http404 when the url's report name is not in VIEWABLE_REPORTS.
+        """
+        _report_name = self.kwargs['report_name']
+        try:
+            _report_cls = VIEWABLE_REPORTS[_report_name]
+        except KeyError:
+            # raise rather than return a Response: the result is used as a search
+            raise Http404(f'unknown report: "{_report_name}"')
+        return _report_cls.search()
+
+    def get_serializer_context(self):
+        """Add the url's report name to the inherited serializer context."""
+        return {
+            **super().get_serializer_context(),
+            'report_name': self.kwargs['report_name'],
+        }
+
class RecentReportList(JSONAPIBaseView):
MAX_COUNT = 10000
DEFAULT_DAYS_BACK = 13
@@ -340,7 +337,7 @@ class RecentReportList(JSONAPIBaseView):
view_category = 'metrics'
view_name = 'recent-report-list'
- serializer_class = DailyReportSerializer
+ serializer_class = CyclicReportSerializer
renderer_classes = (
*drf_api_settings.DEFAULT_RENDERER_CLASSES,
MetricsReportsCsvRenderer,
@@ -360,23 +357,15 @@ def get(self, request, *args, report_name):
},
status=404,
)
- is_daily = issubclass(report_class, reports.DailyReport)
+ is_daily = issubclass(report_class, BaseDailyReport)
days_back = request.GET.get('days_back', self.DEFAULT_DAYS_BACK if is_daily else None)
- is_monthly = issubclass(report_class, reports.MonthlyReport)
-
- if is_daily:
- serializer_class = DailyReportSerializer
- range_field_name = 'report_date'
- elif is_monthly:
- serializer_class = MonthlyReportSerializer
- range_field_name = 'report_yearmonth'
- else:
- raise ValueError(f'report class must subclass DailyReport or MonthlyReport: {report_class}')
+ is_monthly = issubclass(report_class, BaseMonthlyReport)
+
range_filter = parse_date_range(request.GET, is_monthly=is_monthly)
search_recent = (
report_class.search()
- .filter('range', **{range_field_name: range_filter})
- .sort(range_field_name)
+ .filter('range', cycle_coverage=range_filter)
+ .sort('-cycle_coverage')
[:self.MAX_COUNT]
)
if days_back:
@@ -384,7 +373,7 @@ def get(self, request, *args, report_name):
report_date_range = parse_date_range(request.GET)
search_response = search_recent.execute()
- serializer = serializer_class(
+ serializer = self.serializer_class(
search_response,
many=True,
context={'report_name': report_name},
@@ -428,46 +417,9 @@ def post(self, request, *args, **kwargs):
pageview_info=serializer.validated_data.get('pageview_info'),
):
return HttpResponse(status=204)
- session_id, user_is_authenticated = self._get_session_id(
- request,
- client_session_id=serializer.validated_data.get('client_session_id'),
- )
- serializer.save(session_id=session_id, user_is_authenticated=user_is_authenticated)
+ serializer.save()
return HttpResponse(status=201)
- def _get_session_id(self, request, client_session_id=None):
- # NOTE: to remove after osfmetrics 6to8 migration -- logic moved to djelme
-
- # get a session id as described in the COUNTER code of practice:
- # https://cop5.projectcounter.org/en/5.0.2/07-processing/03-counting-unique-items.html
- # -- different from the "login session" tracked by `osf.models.Session` (which
- # lasts about a month), this session lasts at most a day and may time out after
- # minutes or hours of inactivity
- now = timezone.now()
- current_date_str = now.date().isoformat()
-
- user_is_authenticated = request.user.is_authenticated
- if client_session_id:
- session_id_parts = [
- client_session_id,
- current_date_str,
- ]
- elif user_is_authenticated:
- session_id_parts = [
- request.user._id,
- current_date_str,
- now.hour,
- ]
- else:
- session_id_parts = [
- request.get_host(),
- request.META.get('HTTP_USER_AGENT', ''),
- current_date_str,
- now.hour,
- ]
- user_is_authenticated = False
- return utils.stable_key(*session_id_parts), user_is_authenticated
-
class NodeAnalyticsQuery(JSONAPIBaseView):
permission_classes = (
@@ -495,7 +447,7 @@ def get(self, request, *args, node_guid, timespan):
except AbstractNode.DoesNotExist:
raise Http404
self.check_object_permissions(request, node)
- analytics_result = self._run_query(node_guid, timespan)
+ analytics_result = self._run_node_analytics_query(node.get_semantic_iri(), timespan)
serializer = self.serializer_class(
analytics_result,
context={
@@ -505,22 +457,18 @@ def get(self, request, *args, node_guid, timespan):
)
return Response({'data': serializer.data})
- def _run_query(self, node_guid, timespan):
- query_dict = self._build_query_payload(node_guid, NodeAnalyticsQuery.Timespan(timespan))
- analytics_search = CountedAuthUsage.search().update_from_dict(query_dict)
+ def _run_node_analytics_query(self, item_iri, timespan):
+ query_dict = self._build_query_payload(item_iri, NodeAnalyticsQuery.Timespan(timespan))
+ analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict)
return analytics_search.execute()
- def _build_query_payload(self, node_guid, timespan):
+ def _build_query_payload(self, item_iri, timespan):
return {
'size': 0, # don't return hits, just the aggregations
'query': {
'bool': {
- 'minimum_should_match': 1,
- 'should': [
- {'term': {'item_guid': node_guid}},
- {'term': {'surrounding_guids': node_guid}},
- ],
'filter': [
+ {'term': {'within_iris': item_iri}},
{'term': {'item_public': True}},
{'term': {'action_labels': 'view'}},
{'term': {'action_labels': 'web'}},
@@ -532,7 +480,12 @@ def _build_query_payload(self, node_guid, timespan):
'unique-visits': {
'date_histogram': {
'field': 'timestamp',
- 'interval': 'day',
+ 'calendar_interval': 'day',
+ },
+ 'aggs': {
+ 'unique-count': {
+ 'cardinality': {'field': 'sessionhour_id'},
+ },
},
},
'time-of-day': {
@@ -540,12 +493,22 @@ def _build_query_payload(self, node_guid, timespan):
'field': 'pageview_info.hour_of_day',
'size': 24,
},
+ 'aggs': {
+ 'unique-count': {
+ 'cardinality': {'field': 'sessionhour_id'},
+ },
+ },
},
'referer-domain': {
'terms': {
'field': 'pageview_info.referer_domain',
'size': 10,
},
+ 'aggs': {
+ 'unique-count': {
+ 'cardinality': {'field': 'sessionhour_id'},
+ },
+ },
},
'popular-pages': {
'terms': {
@@ -553,6 +516,9 @@ def _build_query_payload(self, node_guid, timespan):
'size': 10,
},
'aggs': {
+ 'unique-count': {
+ 'cardinality': {'field': 'sessionhour_id'},
+ },
'route-for-path': {
'terms': {
'field': 'pageview_info.route_name',
@@ -627,7 +593,7 @@ def get(self, request, *args):
pass # just fall back to days_back for now
timespan = report_date
- analytics_result = self._run_query(timespan)
+ analytics_result = self._run_user_visits_query(timespan)
serializer = self.serializer_class(
analytics_result,
context={
@@ -636,9 +602,9 @@ def get(self, request, *args):
)
return JsonResponse({'data': serializer.data})
- def _run_query(self, timespan):
+ def _run_user_visits_query(self, timespan):
query_dict = self._build_query_payload(timespan)
- analytics_search = CountedAuthUsage.search().update_from_dict(query_dict)
+ analytics_search = OsfCountedUsageEvent.search().update_from_dict(query_dict)
return analytics_search.execute()
def _build_query_payload(self, timespan):
@@ -655,13 +621,11 @@ def _build_query_payload(self, timespan):
'unique-visits': {
'date_histogram': {
'field': 'timestamp',
- 'interval': 'day',
+ 'calendar_interval': 'day',
},
'aggs': {
'user-visits': {
- 'cardinality': {
- 'field': 'session_id',
- },
+ 'cardinality': {'field': 'sessionhour_id'},
},
},
},
diff --git a/api/preprints/views.py b/api/preprints/views.py
index 7e087aaa858..3d02b8f704a 100644
--- a/api/preprints/views.py
+++ b/api/preprints/views.py
@@ -71,8 +71,7 @@
from api.requests.serializers import PreprintRequestSerializer, PreprintRequestCreateSerializer
from api.requests.views import PreprintRequestMixin
from api.subjects.views import BaseResourceSubjectsList, SubjectRelationshipBaseView
-from api.base.metrics import PreprintMetricsViewMixin
-from osf.metrics import PreprintDownload, PreprintView
+from api.base.metrics import UsageMetricsViewMixin
class PreprintOldVersionsImmutableMixin:
@@ -172,7 +171,7 @@ def get_preprint(self, check_object_permissions=True, ignore_404=False):
return preprint
-class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin):
+class PreprintList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin):
"""See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_list).
"""
# These permissions are not checked for the list of preprints, permissions handled by the query
@@ -194,10 +193,6 @@ class PreprintList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreat
ordering_fields = ('created', 'date_last_transitioned')
view_category = 'preprints'
view_name = 'preprint-list'
- metric_map = {
- 'downloads': PreprintDownload,
- 'views': PreprintView,
- }
def get_serializer_class(self):
if self.request.method == 'POST':
@@ -208,38 +203,15 @@ def get_serializer_class(self):
def get_default_queryset(self):
auth = get_user_auth(self.request)
auth_user = getattr(auth, 'user', None)
-
# Permissions on the list objects are handled by the query
- public_only = self.metrics_requested
- queryset = self.preprints_queryset(Preprint.objects.all(), auth_user, public_only=public_only)
- # Use get_metrics_queryset to return a queryset with annotated metrics
- # iff ?metrics query param is present
- if self.metrics_requested:
- return self.get_metrics_queryset(queryset)
- else:
- return queryset
+ return self.preprints_queryset(Preprint.objects.all(), auth_user)
# overrides ListAPIView
def get_queryset(self):
return self.get_queryset_from_request()
- # overrides PreprintMetricsViewMixin
- def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after):
- return metric_class.get_top_by_count(
- qs=queryset,
- model_field='guids___id',
- metric_field='preprint_id',
- annotation=metric_name,
- after=after,
- # Limit the bucket size
- # of the ES aggregation. Otherwise,
- # the number of buckets == the number of total preprints,
- # which is too many for ES to handle
- size=200,
- )
-
-class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin):
+class PreprintVersionsList(JSONAPIBaseView, generics.ListCreateAPIView, PreprintFilterMixin):
"""List existing versions of a preprint or create a new version.
GET: Returns a collection of preprint resources representing all versions of the given preprint.
@@ -265,10 +237,6 @@ class PreprintVersionsList(PreprintMetricsViewMixin, JSONAPIBaseView, generics.L
ordering_fields = ('created', 'date_last_transitioned')
view_category = 'preprints'
view_name = 'preprint-versions'
- metric_map = {
- 'downloads': PreprintDownload,
- 'views': PreprintView,
- }
def get_serializer_class(self):
if self.request.method == 'POST':
@@ -288,8 +256,7 @@ def get_queryset(self):
auth_user = getattr(auth, 'user', None)
# Permissions on the list objects are handled by the query
- public_only = self.metrics_requested
- qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user, public_only=public_only))
+ qs = qs.filter(Preprint.objects.preprint_versions_permissions_query(auth_user))
return qs
@@ -299,7 +266,7 @@ def create(self, request, *args, **kwargs):
return super().create(request, *args, **kwargs)
-class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin):
+class PreprintDetail(PreprintOldVersionsImmutableMixin, UsageMetricsViewMixin, JSONAPIBaseView, generics.RetrieveUpdateDestroyAPIView, PreprintMixin, WaterButlerMixin):
"""See [documentation for this endpoint](https://developer.osf.io/#operation/preprints_read).
Note: The resource now exposes a `versions` relationship pointing to
@@ -324,15 +291,6 @@ class PreprintDetail(PreprintOldVersionsImmutableMixin, PreprintMetricsViewMixin
view_category = 'preprints'
view_name = 'preprint-detail'
- metric_map = {
- 'downloads': PreprintDownload,
- 'views': PreprintView,
- }
-
- def add_metric_to_object(self, obj, metric_class, metric_name, after):
- count = metric_class.get_count_for_preprint(obj, after=after)
- setattr(obj, metric_name, count)
- return obj
def get_object(self):
preprint = self.get_preprint()
@@ -355,6 +313,7 @@ def delete(self, request, *args, **kwargs):
raise ValidationError('You cannot delete created preprint')
+
class PreprintNodeRelationship(PreprintOldVersionsImmutableMixin, JSONAPIBaseView, generics.RetrieveUpdateAPIView, PreprintMixin):
permission_classes = (
drf_permissions.IsAuthenticatedOrReadOnly,
diff --git a/api/providers/views.py b/api/providers/views.py
index fbfa287d4a7..4a35706bb4d 100644
--- a/api/providers/views.py
+++ b/api/providers/views.py
@@ -16,7 +16,6 @@
InvalidFilterValue,
)
from api.base.filters import ListFilterMixin, PreprintAsTargetFilterMixin, PreprintFilterMixin
-from api.base.metrics import PreprintMetricsViewMixin
from api.base.pagination import MaxSizePagination, IncreasedPageSizePagination
from api.base.settings import BULK_SETTINGS
from api.base.utils import get_object_or_error, get_user_auth, is_truthy
@@ -61,7 +60,6 @@
from framework.auth.oauth_scopes import CoreScopes
from framework.celery_tasks.handlers import enqueue_task
from guardian.shortcuts import get_objects_for_user
-from osf.metrics import PreprintDownload, PreprintView
from osf.models import (
AbstractNode,
CollectionProvider,
@@ -148,7 +146,7 @@ class RegistrationProviderList(GenericProviderList):
view_name = 'registration-providers-list'
-class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList):
+class PreprintProviderList(GenericProviderList):
"""See [documentation for this endpoint](https://developer.osf.io/#operation/preprint_provider_list).
"""
@@ -156,21 +154,6 @@ class PreprintProviderList(PreprintMetricsViewMixin, GenericProviderList):
serializer_class = PreprintProviderSerializer
view_category = 'preprint-providers'
view_name = 'preprint-providers-list'
- metric_map = {
- 'downloads': PreprintDownload,
- 'views': PreprintView,
- }
-
- # overrides PreprintMetricsViewMixin
- def get_annotated_queryset_with_metrics(self, queryset, metric_class, metric_name, after):
- return metric_class.get_top_by_count(
- qs=queryset,
- model_field='_id',
- metric_field='provider_id',
- annotation=metric_name,
- after=after,
- size=None,
- )
def get_renderer_context(self):
context = super().get_renderer_context()
diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py
index 8b785504756..c5b53395d20 100644
--- a/api_tests/institutions/views/test_institution_department_list.py
+++ b/api_tests/institutions/views/test_institution_department_list.py
@@ -1,16 +1,17 @@
-import pytest
import datetime
+import pytest
+
from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE
from osf_tests.factories import (
InstitutionFactory,
AuthUserFactory,
)
-from osf.metrics.reports import InstitutionalUserReport
+from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport
from osf.metrics.utils import YearMonth
-@pytest.mark.es_metrics
+@pytest.mark.djelme_elasticsearch_backends
@pytest.mark.django_db
class TestInstitutionDepartmentList:
@@ -37,55 +38,55 @@ def user4(self):
@pytest.fixture()
def populate_counts(self, user, user2, user3, user4, admin, institution):
# This represents a Department that had a user, but no longer has any users, so does not appear in results.
- InstitutionalUserReport(
+ MonthlyInstitutionalUserReport(
report_yearmonth=YearMonth(2017, 2),
user_id=user._id,
institution_id=institution._id,
department_name='Old Department',
public_project_count=1,
private_project_count=1,
- ).save()
+ ).save(validate=False)
_this_month = YearMonth.from_date(datetime.date.today())
# The user has left the department
- InstitutionalUserReport(
+ MonthlyInstitutionalUserReport(
report_yearmonth=_this_month,
user_id=user._id,
institution_id=institution._id,
department_name='New Department',
public_project_count=1,
private_project_count=1,
- ).save()
+ ).save(validate=False)
# A second user entered the department
- InstitutionalUserReport(
+ MonthlyInstitutionalUserReport(
report_yearmonth=_this_month,
user_id=user2._id,
institution_id=institution._id,
department_name='New Department',
public_project_count=1,
private_project_count=1,
- ).save()
+ ).save(validate=False)
# A new department with a single user to test sorting
- InstitutionalUserReport(
+ MonthlyInstitutionalUserReport(
report_yearmonth=_this_month,
user_id=user3._id,
institution_id=institution._id,
department_name='Smaller Department',
public_project_count=1,
private_project_count=1,
- ).save()
+ ).save(validate=False)
# A user with no department
- InstitutionalUserReport(
+ MonthlyInstitutionalUserReport(
report_yearmonth=_this_month,
user_id=user4._id,
institution_id=institution._id,
public_project_count=1,
private_project_count=1,
- ).save()
+ ).save(validate=False)
@pytest.fixture()
def admin(self, institution):
@@ -113,7 +114,7 @@ def test_auth(self, app, url, user, admin):
assert resp.json['data'] == []
def test_get(self, app, url, admin, institution, populate_counts):
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
resp = app.get(url, auth=admin.auth)
assert resp.json['data'] == [{
diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py
index 6dd6c5bbda3..178bed6ce6b 100644
--- a/api_tests/institutions/views/test_institution_summary_metrics.py
+++ b/api_tests/institutions/views/test_institution_summary_metrics.py
@@ -5,12 +5,13 @@
InstitutionFactory,
AuthUserFactory,
)
-from osf.metrics.reports import InstitutionMonthlySummaryReport
+from osf.metrics.monthly_reports import MonthlyInstitutionSummaryReport
-@pytest.mark.es_metrics
+@pytest.mark.djelme_elasticsearch_backends
@pytest.mark.django_db
class TestInstitutionSummaryMetricsList:
+
@pytest.fixture()
def institution(self):
return InstitutionFactory()
@@ -30,10 +31,10 @@ def unshown_reports(self, institution):
# Reports that should not be shown in the results
# Report from another institution
another_institution = InstitutionFactory()
- _summary_report_factory('2024-08', another_institution)
+ _summary_report_factory('2024-08', another_institution, validate=False)
# Old report from the same institution
- _summary_report_factory('2024-07', institution)
- _summary_report_factory('2018-02', institution)
+ _summary_report_factory('2024-07', institution, validate=False)
+ _summary_report_factory('2018-02', institution, validate=False)
@pytest.fixture()
def reports(self, institution):
@@ -84,7 +85,7 @@ def test_get_empty(self, app, url, institutional_admin):
assert resp.json['meta'] == {'version': '2.0'}
def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports):
- InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
+ MonthlyInstitutionSummaryReport.refresh()
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -150,7 +151,7 @@ def test_get_report_with_multiple_months_and_institutions(
monthly_logged_in_user_count=270,
monthly_active_user_count=260,
)
- InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
+ MonthlyInstitutionSummaryReport.refresh()
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -179,19 +180,21 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_
'2024-08',
institution,
user_count=0,
+ validate=False,
)
_summary_report_factory(
'2024-09',
institution,
user_count=999,
-
+ validate=False,
)
_summary_report_factory(
'2018-02',
institution,
user_count=4133,
+ validate=False,
)
- InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
+ MonthlyInstitutionSummaryReport.refresh()
resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -205,39 +208,25 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_
attributes = resp.json['data']['attributes']
assert attributes['user_count'] == 4133
- def test_get_with_invalid_report_date(self, app, url, institution, institutional_admin):
- _summary_report_factory(
- '2024-08',
- institution,
- user_count=0,
- )
- _summary_report_factory(
- '2024-09',
- institution,
- user_count=999,
- )
- InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
-
- # Request with an invalid report_date format
- resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth)
- assert resp.status_code == 200
-
- # Verify it defaults to the most recent report data
- attributes = resp.json['data']['attributes']
- assert attributes['user_count'] == 999
+ def test_get_with_invalid_report_yearmonth(self, app, url, institution, institutional_admin):
+ # Request with an invalid report_yearmonth format
+ resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth, expect_errors=True)
+ assert resp.status_code == 400
def test_get_without_report_date_uses_most_recent(self, app, url, institution, institutional_admin):
_summary_report_factory(
'2024-08',
institution,
user_count=0,
+ validate=False,
)
_summary_report_factory(
'2024-09',
institution,
user_count=999,
+ validate=False,
)
- InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
+ MonthlyInstitutionSummaryReport.refresh()
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -246,11 +235,11 @@ def test_get_without_report_date_uses_most_recent(self, app, url, institution, i
assert attributes['user_count'] == 999
-def _summary_report_factory(yearmonth, institution, **kwargs):
- report = InstitutionMonthlySummaryReport(
+def _summary_report_factory(yearmonth, institution, *, validate=True, **kwargs):
+ report = MonthlyInstitutionSummaryReport(
report_yearmonth=yearmonth,
institution_id=institution._id,
**kwargs,
)
- report.save()
+ report.save(validate=validate)
return report
diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py
index d2b99da435f..dc097c221c4 100644
--- a/api_tests/institutions/views/test_institution_user_metric_list.py
+++ b/api_tests/institutions/views/test_institution_user_metric_list.py
@@ -12,12 +12,12 @@
AuthUserFactory,
)
-from osf.metrics.reports import InstitutionalUserReport
+from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport
from osf.models import UserMessage
from tests.utils import capture_notifications
-@pytest.mark.es_metrics
+@pytest.mark.djelme_elasticsearch_backends
@pytest.mark.django_db
class TestInstitutionUserMetricList:
@pytest.fixture()
@@ -89,7 +89,7 @@ def test_get_empty(self, app, url, institutional_admin):
assert _resp.json['data'] == []
def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
_resp = app.get(url, auth=institutional_admin.auth)
assert _resp.status_code == 200
assert len(_resp.json['data']) == len(reports)
@@ -101,7 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports,
assert len(response_object['attributes']['contacts']) == 0
def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
for _query, _expected_user_ids in (
({'filter[department]': 'nunavum'}, set()),
({'filter[department]': 'incidentally'}, set()),
@@ -137,7 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report
assert set(_user_ids(_resp)) == _expected_user_ids
def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
for _query, _expected_user_id_list in (
({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']),
({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']),
@@ -147,7 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports,
assert list(_user_ids(_resp)) == _expected_user_id_list
def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
for _query, _expected_user_id_list in (
({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']),
({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']),
@@ -182,7 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
month_last_active='2018-02',
month_last_login='2018-02',
)
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -286,7 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
str(736662999298 + i),
f'Jalen Hurts #{i}',
])
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
# Make request for CSV format with page[size]=10
resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth)
@@ -352,7 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti
month_last_active='2018-02',
month_last_login='2018-02',
)
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -418,7 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin,
department_name='a department, or so, that happens, incidentally, to have commas',
storage_byte_count=736662999298,
)
- InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
+ MonthlyInstitutionalUserReport.refresh()
receiver = user1
with capture_notifications():
@@ -480,10 +480,10 @@ def _user_ids(api_response):
yield _datum['relationships']['user']['data']['id']
def _report_factory(yearmonth, institution, **kwargs):
- _report = InstitutionalUserReport(
+ _report = MonthlyInstitutionalUserReport(
report_yearmonth=yearmonth,
institution_id=institution._id,
**kwargs,
)
- _report.save()
+ _report.save(validate=False)
return _report
diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py
deleted file mode 100644
index 016677c3a11..00000000000
--- a/api_tests/metrics/test_composite_query.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import pytest
-from datetime import datetime
-from osf_tests.factories import (
- PreprintFactory,
- AuthUserFactory
-)
-
-from osf.metrics import PreprintDownload
-from api.base.settings import API_PRIVATE_BASE as API_BASE
-
-
-@pytest.fixture()
-def preprint():
- return PreprintFactory()
-
-
-@pytest.fixture()
-def user():
- user = AuthUserFactory()
- user.is_staff = True
- user.add_system_tag('preprint_metrics')
- user.save()
- return user
-
-
-@pytest.fixture
-def base_url():
- return f'/{API_BASE}metrics/preprints/'
-
-
-@pytest.mark.es_metrics
-@pytest.mark.django_db
-class TestElasticSearch:
-
- def test_elasticsearch_agg_query(self, app, user, base_url, preprint):
- post_url = f'{base_url}downloads/'
-
- payload = {
- 'data': {
- 'type': 'preprint_metrics',
- 'attributes': {
- 'query': {
- 'aggs': {
- 'preprints_by_year': {
- 'composite': {
- 'sources': [{
- 'date': {
- 'date_histogram': {
- 'field': 'timestamp',
- 'interval': 'year'
- }
- }
- }]
- }
- }
- }
- }
- }
- }
- }
-
- resp = app.post_json_api(post_url, payload, auth=user.auth)
-
- assert resp.status_code == 200
- assert resp.json['hits']['hits'] == []
-
- PreprintDownload.record_for_preprint(
- preprint,
- path=preprint.primary_file.path,
- timestamp=datetime(year=2020, month=1, day=1),
- )
- PreprintDownload.record_for_preprint(
- preprint,
- path=preprint.primary_file.path,
- timestamp=datetime(year=2020, month=2, day=1)
- )
- PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern)
-
- resp = app.post_json_api(post_url, payload, auth=user.auth)
- assert resp.status_code == 200
- assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 1
-
- payload['data']['attributes']['query']['aggs']['preprints_by_year']['composite']['sources'][0]['date']['date_histogram']['interval'] = 'month'
-
- resp = app.post_json_api(post_url, payload, auth=user.auth)
- assert len(resp.json['aggregations']['preprints_by_year']['buckets']) == 2
diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py
index e954248c15b..c319cc1690b 100644
--- a/api_tests/metrics/test_counted_usage.py
+++ b/api_tests/metrics/test_counted_usage.py
@@ -1,10 +1,13 @@
from datetime import datetime, timezone
+from unittest import mock
import pytest
-from unittest import mock
+from elasticsearch_metrics.util.anon_enough import opaque_key
from framework.auth.core import Auth
+from osf.utils.permissions import ADMIN, READ, WRITE
+from api_tests.utils import create_test_file
from osf_tests.factories import (
AuthUserFactory,
NodeFactory,
@@ -12,11 +15,7 @@
PrivateLinkFactory,
ProjectFactory,
RegistrationFactory,
- # UserFactory,
)
-from osf.utils.permissions import ADMIN, READ, WRITE
-from api_tests.utils import create_test_file
-from elasticsearch_metrics.tests.util import djelme_test_backends
COUNTED_USAGE_URL = '/_/metrics/events/counted_usage/'
@@ -30,23 +29,24 @@ def counted_usage_payload(**attributes):
}
-def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs):
- assert mock_save.call_count == 1
- args, kwargs = mock_save.call_args
- actual_instance = args[0]
+def assert_saved_with(mock_es8, *, expected_doc_id=None, expected_attrs):  # checks the one `index` call recorded on the mocked es8 client
+ assert mock_es8.index.call_count == 1
+ _kwargs = mock_es8.index.call_args.kwargs  # only keyword args ('id', 'body') are inspected
if expected_doc_id is not None:
- assert actual_instance.meta.id == expected_doc_id
- actual_attrs = actual_instance.to_dict()
- for attr_name, expected_value in expected_attrs.items():
- actual_value = actual_attrs.get(attr_name, None)
- assert actual_value == expected_value, repr(actual_value)
+ assert _kwargs['id'] == expected_doc_id
+ _actual_attrs = _kwargs['body']
+ for _attr_name, _expected_value in expected_attrs.items():
+ _actual_value = _actual_attrs.get(_attr_name)  # missing attrs compare as None
+ assert _actual_value == _expected_value, f'{_attr_name}: {_actual_value!r}'  # name the mismatched attr
@pytest.fixture
-def mock_save():
- with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
+def mock_es8():
+ with mock.patch('elasticsearch_metrics.imps.elastic8.TimeseriesRecord.check_djelme_setup'):
+ with mock.patch('elasticsearch_metrics.imps.elastic8.BaseDjelmeRecord._get_connection') as _mock_get_connection:
+ _mock_es8 = _mock_get_connection.return_value
+ _mock_es8.index.return_value = {'result': {}}
+ yield _mock_es8
@pytest.mark.django_db
@@ -76,21 +76,19 @@ def test_required_attributes(self, app, attrs):
@pytest.mark.django_db
class TestComputedFields:
- @pytest.fixture(autouse=True)
- def _real_elastic(self):
- with djelme_test_backends():
- yield
-
@pytest.fixture(autouse=True)
def mock_domain(self):
domain = 'http://example.foo/'
- with mock.patch('api.metrics.serializers.website_settings.DOMAIN', new=domain):
+ with mock.patch('website.settings.DOMAIN', new=domain):
yield domain
@pytest.fixture(autouse=True)
def mock_now(self):
timestamp = datetime(1981, 1, 1, 0, 1, 31, tzinfo=timezone.utc)
- with mock.patch('django.utils.timezone.now', return_value=timestamp):
+ with (
+ mock.patch('django.utils.timezone.now', return_value=timestamp),
+ mock.patch('elasticsearch_metrics.imps.elastic8.utcnow', return_value=timestamp),
+ ):
yield timestamp
@pytest.fixture
@@ -105,7 +103,7 @@ def user(self):
with mock.patch('osf.models.base.generate_guid', return_value='guidy'):
return AuthUserFactory()
- def test_by_client_session_id(self, app, mock_save, user, preprint):
+ def test_by_client_session_id(self, app, mock_es8, user, preprint):
payload = counted_usage_payload(
client_session_id='hello',
item_guid=preprint._id,
@@ -115,18 +113,25 @@ def test_by_client_session_id(self, app, mock_save, user, preprint):
headers = {
'User-Agent': 'haha',
}
- resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers)
assert resp.status_code == 201
+ _expected_sessionhour_id = opaque_key(['hello', '1981-01-01', '0'])
assert_saved_with(
- mock_save,
- # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest()
- expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2',
+ mock_es8,
+ expected_doc_id=opaque_key([
+ 'http://example.foo/',
+ _expected_sessionhour_id,
+ "['api', 'view']",
+ 'http://example.foo/blahblah/blee',
+ '1981-01-01',
+ '3',
+ ]),
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': preprint._id,
- # session_id: sha256(b'hello|1981-01-01').hexdigest()
- 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34',
- 'action_labels': ['view', 'api'],
+ 'item_osfid': preprint._id,
+ 'item_type': 'Preprint',
+ 'sessionhour_id': _expected_sessionhour_id,
+ 'action_labels': ['api', 'view'],
'pageview_info': {
'page_url': 'http://example.foo/blahblah/blee',
'page_path': '/blahblah/blee',
@@ -135,9 +140,9 @@ def test_by_client_session_id(self, app, mock_save, user, preprint):
},
)
- def test_by_client_session_id_anon(self, app, mock_save, preprint):
+ def test_by_client_session_id_anon(self, app, mock_es8, preprint):
payload = counted_usage_payload(
- client_session_id='hello',
+ client_session_id='hihi',
item_guid=preprint._id,
action_labels=['view', 'web'],
pageview_info={
@@ -150,15 +155,22 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint):
}
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers)
assert resp.status_code == 201
+ _expected_sessionhour_id = opaque_key(['hihi', '1981-01-01', '0'])
assert_saved_with(
- mock_save,
- # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest()
- expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82',
+ mock_es8,
+ expected_doc_id=opaque_key([
+ 'http://example.foo/',
+ _expected_sessionhour_id,
+ "['view', 'web']",
+ 'http://example.foo/bliz/',
+ '1981-01-01',
+ '3',
+ ]),
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': preprint._id,
- # session_id: sha256(b'hello|1981-01-01').hexdigest()
- 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34',
+ 'item_osfid': preprint._id,
+ 'item_type': 'Preprint',
+ 'sessionhour_id': _expected_sessionhour_id,
'action_labels': ['view', 'web'],
'pageview_info': {
'page_url': 'http://example.foo/bliz/',
@@ -170,7 +182,7 @@ def test_by_client_session_id_anon(self, app, mock_save, preprint):
},
)
- def test_by_user_auth(self, app, mock_save, user, preprint):
+ def test_by_user_auth(self, app, mock_es8, user, preprint):
payload = counted_usage_payload(
item_guid=preprint._id,
action_labels=['view', 'web'],
@@ -184,15 +196,22 @@ def test_by_user_auth(self, app, mock_save, user, preprint):
}
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers, auth=user.auth)
assert resp.status_code == 201
+ _expected_sessionhour_id = opaque_key(['guidy', '1981-01-01', '0'])
assert_saved_with(
- mock_save,
- # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest()
- expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2',
+ mock_es8,
+ expected_doc_id=opaque_key([
+ 'http://example.foo/',
+ _expected_sessionhour_id,
+ "['view', 'web']",
+ 'http://osf.io/mst3k',
+ '1981-01-01',
+ '3',
+ ]),
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': preprint._id,
- # session_id: sha256(b'guidy|1981-01-01|0').hexdigest()
- 'session_id': 'ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a',
+ 'item_osfid': preprint._id,
+ 'item_type': 'Preprint',
+ 'sessionhour_id': _expected_sessionhour_id,
'action_labels': ['view', 'web'],
'pageview_info': {
'page_url': 'http://osf.io/mst3k',
@@ -204,7 +223,7 @@ def test_by_user_auth(self, app, mock_save, user, preprint):
},
)
- def test_by_useragent_header(self, app, mock_save, preprint):
+ def test_by_useragent_header(self, app, mock_es8, preprint):
payload = counted_usage_payload(
item_guid=preprint._id,
action_labels=['view', 'api'],
@@ -218,16 +237,23 @@ def test_by_useragent_header(self, app, mock_save, preprint):
}
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers=headers)
assert resp.status_code == 201
+ _expected_sessionhour_id = opaque_key(['localhost:80', 'haha', '1981-01-01', '0'])  # reused for the doc id and the sessionhour_id attr
assert_saved_with(
- mock_save,
- # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest()
- expected_doc_id='6d7549df6734bb955eb832c6316ffae46c2959c95b5817ab4fcb341dbc875c23',
+ mock_es8,
+ expected_doc_id=opaque_key([
+ 'http://example.foo/',
+ _expected_sessionhour_id,
+ "['api', 'view']",
+ 'http://example.foo/bliz/',
+ '1981-01-01',
+ '3',
+ ]),
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': preprint._id,
- # session_id: sha256(b'localhost:80|haha|1981-01-01|0').hexdigest()
- 'session_id': '97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a',
- 'action_labels': ['view', 'api'],
+ 'item_osfid': preprint._id,
+ 'item_type': 'Preprint',
+ 'sessionhour_id': _expected_sessionhour_id,
+ 'action_labels': ['api', 'view'],
'pageview_info': {
'page_url': 'http://example.foo/bliz/',
'page_path': '/bliz',
@@ -244,9 +270,10 @@ def test_by_useragent_header(self, app, mock_save, preprint):
class TestGuidFields:
@pytest.fixture(autouse=True)
- def _real_elastic(self):
- with djelme_test_backends():
- yield
+ def mock_domain(self):
+ domain = 'http://example.foo/'
+ with mock.patch('website.settings.DOMAIN', new=domain):
+ yield domain
@pytest.fixture
def preprint(self, item_public):
@@ -286,7 +313,7 @@ def child_reg_file(self, child_reg):
def child_reg_file_guid(self, child_reg_file):
return child_reg_file.get_guid(create=True)._id
- def test_preprint_file(self, app, mock_save, preprint, item_public):
+ def test_preprint_file(self, app, mock_es8, preprint, item_public, mock_domain):
# test_preprint_guid
payload = counted_usage_payload(
item_guid=preprint._id,
@@ -295,16 +322,18 @@ def test_preprint_file(self, app, mock_save, preprint, item_public):
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
- mock_save,
+ mock_es8,
expected_attrs={
- 'item_guid': preprint._id,
- 'item_type': 'preprint',
+ 'item_osfid': preprint._id,
+ 'item_iri': f'{mock_domain}{preprint._id}',
+ 'item_type': 'Preprint',
'item_public': item_public,
'provider_id': preprint.provider._id,
- 'surrounding_guids': None,
+ 'database_iri': f'{mock_domain}preprints/{preprint.provider._id}',
+ 'within_iris': [f'{mock_domain}{preprint._id}'],
},
)
- mock_save.reset_mock()
+ mock_es8.reset_mock()
# test_preprint_file_guid
payload = counted_usage_payload(
@@ -314,17 +343,22 @@ def test_preprint_file(self, app, mock_save, preprint, item_public):
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
- mock_save,
+ mock_es8,
expected_attrs={
- 'item_guid': preprint.primary_file.get_guid()._id,
- 'item_type': 'osfstoragefile',
+ 'item_osfid': preprint.primary_file.get_guid()._id,
+ 'item_iri': preprint.primary_file.get_semantic_iri(),
+ 'item_type': 'File',
'item_public': item_public,
'provider_id': preprint.primary_file.provider,
- 'surrounding_guids': [preprint._id],
+ 'database_iri': f'urn:files.osf.io:{preprint.primary_file.provider}',
+ 'within_iris': sorted([
+ f'{mock_domain}{preprint._id}',
+ preprint.primary_file.get_semantic_iri(),
+ ]),
},
)
- def test_child_registration_file(self, app, mock_save, child_reg_file_guid, child_reg, parent_reg, item_public):
+ def test_child_registration_file(self, app, mock_es8, child_reg_file_guid, child_reg_file, child_reg, parent_reg, item_public, mock_domain):
# test_child_registration_file_guid
payload = counted_usage_payload(
item_guid=child_reg_file_guid,
@@ -333,20 +367,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
- mock_save,
+ mock_es8,
expected_attrs={
'action_labels': ['view', 'web'],
- 'item_guid': child_reg_file_guid,
- 'item_type': 'osfstoragefile',
+ 'item_osfid': child_reg_file_guid,
+ 'item_type': 'File',
'item_public': item_public,
'provider_id': 'osfstorage',
- 'surrounding_guids': [
- child_reg._id,
- parent_reg._id,
- ],
+ 'database_iri': 'urn:files.osf.io:osfstorage',
+ 'within_iris': sorted([
+ child_reg_file.get_semantic_iri(),
+ child_reg.get_semantic_iri(),
+ parent_reg.get_semantic_iri(),
+ ]),
},
)
- mock_save.reset_mock()
+ mock_es8.reset_mock()
# test_child_registration_guid
payload = counted_usage_payload(
@@ -356,19 +392,22 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
- mock_save,
+ mock_es8,
expected_attrs={
'action_labels': ['view', 'web'],
- 'item_guid': child_reg._id,
- 'item_type': 'registration',
+ 'item_osfid': child_reg._id,
+ 'item_type': 'RegistrationComponent',
'item_public': item_public,
'provider_id': 'osf',
- 'surrounding_guids': [
- parent_reg._id,
- ],
+ 'database_iri': f'{mock_domain}registries/osf',
+ 'item_iri': child_reg.get_semantic_iri(),
+ 'within_iris': sorted([
+ child_reg.get_semantic_iri(),
+ parent_reg.get_semantic_iri(),
+ ]),
},
)
- mock_save.reset_mock()
+ mock_es8.reset_mock()
# test_parent_registration_guid
payload = counted_usage_payload(
@@ -378,13 +417,15 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
- mock_save,
+ mock_es8,
expected_attrs={
'action_labels': ['view', 'web'],
- 'item_guid': parent_reg._id,
+ 'item_osfid': parent_reg._id,
'item_public': item_public,
'provider_id': 'osf',
- 'surrounding_guids': None,
+ 'database_iri': f'{mock_domain}registries/osf',
+ 'item_iri': parent_reg.get_semantic_iri(),
+ 'within_iris': [parent_reg.get_semantic_iri()],
},
)
@@ -392,7 +433,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
@pytest.mark.django_db
class TestContributorExclusion:
- def test_creator_pageview_not_recorded(self, app, mock_save):
+ def test_creator_pageview_not_recorded(self, app, mock_es8):
user = AuthUserFactory()
project = ProjectFactory(creator=user)
payload = counted_usage_payload(
@@ -402,14 +443,14 @@ def test_creator_pageview_not_recorded(self, app, mock_save):
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=user.auth)
assert resp.status_code == 204
- assert mock_save.call_count == 0
+ assert mock_es8.index.call_count == 0
@pytest.mark.parametrize(
'permissions',
[READ, WRITE, ADMIN],
ids=['read', 'write', 'admin'],
)
- def test_contributor_pageview_not_recorded(self, app, mock_save, permissions):
+ def test_contributor_pageview_not_recorded(self, app, mock_es8, permissions):
creator = AuthUserFactory()
contributor = AuthUserFactory()
project = ProjectFactory(creator=creator)
@@ -421,9 +462,9 @@ def test_contributor_pageview_not_recorded(self, app, mock_save, permissions):
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth)
assert resp.status_code == 204
- assert mock_save.call_count == 0
+ assert mock_es8.index.call_count == 0
- def test_non_contributor_pageview_recorded(self, app, mock_save):
+ def test_non_contributor_pageview_recorded(self, app, mock_es8):
creator = AuthUserFactory()
visitor = AuthUserFactory()
project = ProjectFactory(creator=creator, is_public=True)
@@ -434,9 +475,9 @@ def test_non_contributor_pageview_recorded(self, app, mock_save):
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth)
assert resp.status_code == 201
- assert mock_save.call_count == 1
+ assert mock_es8.index.call_count == 1
- def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_save):
+ def test_parent_contributor_not_on_child_component_pageview_recorded(self, app, mock_es8):
creator = AuthUserFactory()
child_owner = AuthUserFactory()
parent_reader = AuthUserFactory()
@@ -451,9 +492,9 @@ def test_parent_contributor_not_on_child_component_pageview_recorded(self, app,
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=parent_reader.auth)
assert resp.status_code == 201
- assert mock_save.call_count == 1
+ assert mock_es8.index.call_count == 1
- def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save):
+ def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_es8):
creator = AuthUserFactory()
project = ProjectFactory(creator=creator, is_public=False)
link = PrivateLinkFactory(anonymous=True, creator=creator)
@@ -468,9 +509,9 @@ def test_anonymous_view_only_link_visitor_pageview_recorded(self, app, mock_save
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload)
assert resp.status_code == 201
- assert mock_save.call_count == 1
+ assert mock_es8.index.call_count == 1
- def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_save):
+ def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, mock_es8):
creator = AuthUserFactory()
visitor = AuthUserFactory()
project = ProjectFactory(creator=creator, is_public=False)
@@ -485,14 +526,14 @@ def test_logged_in_non_contributor_view_only_link_pageview_recorded(self, app, m
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=visitor.auth)
assert resp.status_code == 201
- assert mock_save.call_count == 1
+ assert mock_es8.index.call_count == 1
@pytest.mark.parametrize(
'permissions',
[READ, WRITE, ADMIN],
ids=['read', 'write', 'admin'],
)
- def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_save, permissions):
+ def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, mock_es8, permissions):
creator = AuthUserFactory()
contributor = AuthUserFactory()
project = ProjectFactory(creator=creator, is_public=False)
@@ -508,4 +549,4 @@ def test_logged_in_contributor_view_only_link_pageview_not_recorded(self, app, m
)
resp = app.post_json_api(COUNTED_USAGE_URL, payload, auth=contributor.auth)
assert resp.status_code == 204
- assert mock_save.call_count == 0
+ assert mock_es8.index.call_count == 0
diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py
deleted file mode 100644
index cd9b8041c2d..00000000000
--- a/api_tests/metrics/test_preprint_metrics.py
+++ /dev/null
@@ -1,240 +0,0 @@
-import pytest
-from unittest import mock
-from datetime import datetime
-
-from website.app import setup_django
-
-setup_django()
-
-from django.utils import timezone
-from waffle.testutils import override_switch
-from elasticsearch6.exceptions import RequestError
-
-from osf import features
-from api.base.settings import API_PRIVATE_BASE as API_BASE
-from osf.metrics import PreprintDownload, PreprintView
-from osf_tests.factories import AuthUserFactory, PreprintFactory, NodeFactory
-
-pytestmark = pytest.mark.django_db
-
-
-@pytest.mark.django_db
-class TestPreprintMetrics:
-
- @pytest.fixture(autouse=True)
- def enable_elasticsearch_metrics(self):
- with override_switch(features.ELASTICSEARCH_METRICS, active=True):
- yield
-
- @pytest.fixture
- def user(self):
- user = AuthUserFactory()
- user.is_staff = True
- user.add_system_tag('preprint_metrics')
- user.save()
- return user
-
- @pytest.fixture
- def other_user(self):
- return AuthUserFactory()
-
- @pytest.fixture
- def other_admin_user(self):
- user = AuthUserFactory()
- user.is_staff = True
- user.save()
- return user
-
- @pytest.fixture
- def other_non_admin_user(self):
- user = AuthUserFactory()
- user.add_system_tag('preprint_metrics')
- user.save()
- return user
-
- @pytest.fixture
- def preprint(self, user):
- preprint = PreprintFactory(creator=user)
- return preprint
-
- @pytest.fixture
- def preprint_two(self):
- return PreprintFactory()
-
- @pytest.fixture
- def preprint_three(self):
- return PreprintFactory()
-
- @pytest.fixture
- def preprint_no_results(self):
- return PreprintFactory()
-
- @pytest.fixture
- def project(self):
- return NodeFactory()
-
- @pytest.fixture
- def project_two(self):
- return NodeFactory()
-
- @pytest.fixture
- def metric_dates(self):
- return ['2019-01-01', '2019-01-02', '2019-01-03']
-
- def add_views_and_downloads(self, preprint_to_add, user_to_use, dates_to_use):
- # create 3 timestamps for 3 days, 1 hour apart
- times = ['T00:05', 'T01:05', 'T02:05']
-
- metrics = [PreprintView, PreprintDownload]
- for metric in metrics:
- for date in dates_to_use:
- for time in times:
- metric.record_for_preprint(
- preprint=preprint_to_add,
- user=user_to_use,
- path=preprint_to_add.primary_file.path,
- timestamp=datetime.strptime(date + time, '%Y-%m-%dT%H:%M')
- )
-
- @pytest.fixture
- def base_url(self):
- return f'/{API_BASE}metrics/preprints/'
-
- @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search')
- def test_custom_metric_malformed_query(self, mock_execute, app, user, base_url):
- mock_execute.side_effect = RequestError()
- post_url = f'{base_url}downloads/'
- post_data = {
- 'data': {
- 'type': 'preprint_metric',
- 'attributes': {
- 'query': {'not_a_field': 'Yay!'}
- }
- }
- }
- res = app.post_json_api(post_url, post_data, auth=user.auth, expect_errors=True)
- assert res.status_code == 400
- assert res.json['errors'][0]['detail'] == 'Malformed elasticsearch query.'
-
- @pytest.mark.es_metrics
- def test_agg_query(self, app, user, base_url):
-
- post_url = f'{base_url}downloads/'
-
- payload = {
- 'data': {
- 'type': 'preprint_metrics',
- 'attributes': {
- 'query': {
- 'aggs': {
- 'preprints_by_year': {
- 'composite': {
- 'sources': [{
- 'date': {
- 'date_histogram': {
- 'field': 'timestamp',
- 'interval': 'year'
- }
- }
- }]
- }
- }
- }
- }
- }
- }
- }
- resp = app.post_json_api(post_url, payload, auth=user.auth)
- assert resp.status_code == 200
-
- @mock.patch('api.metrics.views.PreprintDownloadMetrics.format_response')
- @mock.patch('api.metrics.views.PreprintDownloadMetrics.execute_search')
- def test_post_custom_metric(self, mock_execute, mock_format, app, user, base_url, preprint, other_user):
- mock_return = {'good': 'job'}
- mock_execute.return_value.to_dict.return_value = mock_return
- mock_format.return_value = mock_return
- post_url = f'{base_url}downloads/'
- post_data = {
- 'data': {
- 'type': 'preprint_metrics',
- 'attributes': {
- 'query': mock_return
- }
- }
- }
- res = app.post_json_api(post_url, post_data, auth=user.auth)
- assert res.json == mock_return
-
- @pytest.mark.parametrize('metric_name', ['downloads', 'views'])
- @mock.patch('api.metrics.utils.timezone.now')
- def test_preprint_list_with_metrics_fails(self, mock_timezone, app, user, base_url, preprint, preprint_two,
- preprint_three, metric_name, other_user, project, project_two,
- other_admin_user, other_non_admin_user):
- mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc)
- url = f'{base_url}{metric_name}/'
-
- one_preprint_url = f'{url}?guids={preprint._id}'
- # test non-logged in cannot access
- res = app.get(one_preprint_url, expect_errors=True)
- assert res.status_code == 401
-
- # test logged in non-metrics, non-admin user cannot access
- res = app.get(one_preprint_url, auth=other_user.auth, expect_errors=True)
- assert res.status_code == 403
-
- # test logged in, non-metrics, admin user cannot access
- res = app.get(one_preprint_url, auth=other_admin_user.auth, expect_errors=True)
- assert res.status_code == 403
-
- # test logged in, metrics, non-admin user cannot access
- res = app.get(one_preprint_url, auth=other_non_admin_user.auth, expect_errors=True)
- assert res.status_code == 403
-
- @pytest.mark.skip('Return results will be entirely mocked so does not make a lot of sense to run on ci.')
- @mock.patch('api.metrics.utils.timezone.now')
- def test_preprint_with_metrics_succeeds(self, mock_timezone, app, user, base_url, preprint, other_user,
- preprint_no_results, metric_dates):
- mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc)
- self.add_views_and_downloads(preprint, other_user, metric_dates)
- metric_name = 'downloads'
-
- mock_timezone.return_value = datetime(2019, 1, 4, tzinfo=timezone.utc)
- url = f'{base_url}{metric_name}/'
- one_preprint_url = f'{url}?guids={preprint._id}'
-
- # base url should return all results
- res = app.get(one_preprint_url, auth=user.auth)
- assert res.json['metric_type'] == metric_name
- assert len(res.json['data']) == 3
-
- # starting a day later only returns 2 results
- later_url = f'{one_preprint_url}&start_datetime=2019-01-02'
- res = app.get(later_url, auth=user.auth)
- assert len(res.json['data']) == 2
- datetimes = [result.keys()[0] for result in res.json['data']]
- assert '2019-01-01T00:05:00.000Z' not in datetimes
-
- # filter between two specific datetimes
- two_times_url = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00'
- res = app.get(two_times_url, auth=user.auth)
- assert len(res.json['data']) == 1
- datetimes = [result.keys()[0] for result in res.json['data']]
- assert '2019-01-01T00:05:00.000Z' not in datetimes
- assert '2019-01-01T03:05:00.000Z' not in datetimes
-
- # test two specific datetimes with minute interval
- two_min_interval = f'{one_preprint_url}&start_datetime=2019-01-02T00:00&end_datetime=2019-01-02T02:00&interval=1m'
- res = app.get(two_min_interval, auth=user.auth)
- assert len(res.json['data']) == 61
- first = res.json['data'][0]
- last = res.json['data'][-1]
- assert first.keys() == ['2019-01-02T00:05:00.000Z']
- assert first['2019-01-02T00:05:00.000Z'] == {preprint._id: 1}
- assert last.keys() == ['2019-01-02T01:05:00.000Z']
- assert last['2019-01-02T01:05:00.000Z'] == {preprint._id: 1}
-
- # make sure requesting one preprint with no results is OK
- non_preprint_url = f'{url}?guids={preprint_no_results._id}'
- res = app.get(non_preprint_url, auth=user.auth)
- assert res.status_code == 200
- assert res.json['data'] == []
diff --git a/api_tests/metrics/test_queries.py b/api_tests/metrics/test_queries.py
index 8b19247f5b4..a7a2ef1e6d7 100644
--- a/api_tests/metrics/test_queries.py
+++ b/api_tests/metrics/test_queries.py
@@ -1,111 +1,259 @@
+import datetime
from unittest import mock
-import pytest
+from django.test import TestCase
+from elasticsearch_metrics.tests.util import RealElasticTestCase
+from osf.metrics.events import OsfCountedUsageEvent
from osf_tests.factories import NodeFactory, AuthUserFactory
-@pytest.mark.django_db
-class TestNodeAnalyticsQuery:
- @pytest.fixture
- def mock_search(self):
- with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search:
- yield mock_search
- @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month'])
- def test_private_node(self, app, mock_search, timespan):
- node = NodeFactory(is_public=False)
- guid = node._id
- resp = app.get(
- f'/_/metrics/query/node_analytics/{guid}/{timespan}/',
- expect_errors=True,
- )
- assert resp.status_code == 401
+class TestNodeAnalyticsQueryErrors:
+ def test_private_node_anon(self, app):
+ _node = NodeFactory(is_public=False)
+ with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search:
+ for timespan in ['week', 'fortnight', 'month']:
+ resp = app.get(
+ f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/',
+ expect_errors=True,
+ )
+ assert resp.status_code == 401
+ assert _mock_search.call_count == 0
+
+ def test_private_node_rando(self, app):
+ _node = NodeFactory(is_public=False)
+ _user = AuthUserFactory()
+ with mock.patch('elasticsearch8.Elasticsearch.search') as _mock_search:
+ for timespan in ['week', 'fortnight', 'month']:
+ resp = app.get(
+ f'/_/metrics/query/node_analytics/{_node._id}/{timespan}/',
+ expect_errors=True,
+ auth=_user.auth,
+ )
+ assert resp.status_code == 403
+ assert _mock_search.call_count == 0
- user = AuthUserFactory()
- resp = app.get(
- f'/_/metrics/query/node_analytics/{guid}/{timespan}/',
- auth=user.auth,
- expect_errors=True,
- )
- assert resp.status_code == 403
- assert mock_search.call_count == 0
+class TestNodeAnalyticsQuery(RealElasticTestCase, TestCase):
+ def setUp(self):
+ super().setUp()
+ self._node = NodeFactory(is_public=True)
+ self._osfid = self._node._id
+ self._today = datetime.date.today()
+ self._now = datetime.datetime(
+ self._today.year,
+ self._today.month,
+ self._today.day,
+ 12,
+ tzinfo=datetime.UTC,
+ )
+ ###
+ # past week
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s1',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(hours=1),
+ pageview_info={
+ 'referer_url': 'http://somewhere.example.com/there',
+ 'page_url': 'http://osf.example/page/path',
+ 'route_name': 'page.route',
+ 'page_title': 'foo',
+ }
+ )
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s2',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=1),
+ pageview_info={
+ 'referer_url': 'http://somewhere.example.com/there',
+ 'page_url': 'http://osf.example/page/path',
+ 'route_name': 'page.route',
+ 'page_title': 'foo',
+ }
+ )
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s3',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=1, hours=1),
+ pageview_info={
+ 'referer_url': 'http://somewhere.example.com/there',
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s4',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=1, hours=2),
+ pageview_info={
+ 'referer_url': 'http://elsewhere.example.com/there',
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s5',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=2, hours=1),
+ pageview_info={
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s6',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=2, hours=2),
+ pageview_info={
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ ###
+ # past fortnight
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s7',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=10, hours=1),
+ pageview_info={
+ 'referer_url': 'http://elsewhere.example.com/there',
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ ###
+ # past month
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s8',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=20, hours=1),
+ pageview_info={
+ 'referer_url': 'http://somewhere.example.com/anothere',
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ ###
+ # older than a month
+ OsfCountedUsageEvent.record(
+ sessionhour_id='s9',
+ item_osfid=self._osfid,
+ action_labels=['view', 'web'],
+ timestamp=self._now - datetime.timedelta(days=80, hours=7),
+ pageview_info={
+ 'referer_url': 'http://somewhere.example.com/anothere',
+ 'page_url': 'http://osf.example/page/another',
+ 'route_name': 'page.another',
+ 'page_title': 'blaz',
+ }
+ )
+ # refresh
+ OsfCountedUsageEvent.refresh()
- @pytest.mark.parametrize('timespan', ['week', 'fortnight', 'month'])
- def test_public_node(self, app, mock_search, timespan):
- node = NodeFactory(is_public=True)
- guid = node._id
- mock_search.return_value = {
- 'aggregations': {
- 'popular-pages': {
- 'buckets': [
- {
- 'key': '/page/path',
- 'doc_count': 17,
- 'route-for-path': {
- 'buckets': [{'key': 'page.route'}],
- },
- 'title-for-path': {
- 'buckets': [{'key': 'foo'}],
- },
- },
- {
- 'key': '/page/another',
- 'doc_count': 7,
- 'route-for-path': {
- 'buckets': [{'key': 'page.another'}],
- },
- 'title-for-path': {
- 'buckets': [{'key': 'blaz'}],
- },
- },
- ],
- },
- 'unique-visits': {
- 'buckets': [
- {'key': 1646265600000, 'key_as_string': '2022-03-03', 'doc_count': 8},
- {'key': 1646352000000, 'key_as_string': '2022-03-04', 'doc_count': 1},
- ],
- },
- 'time-of-day': {
- 'buckets': [
- {'key': 8, 'doc_count': 1},
- {'key': 9, 'doc_count': 2},
- {'key': 10, 'doc_count': 3},
- ],
- },
- 'referer-domain': {
- 'buckets': [
- {'key': 'somewhere.example.com', 'doc_count': 9},
- {'key': 'elsewhere.example.com', 'doc_count': 4},
- ],
- },
+ def test_public_node(self):
+ _week_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/week/')
+ assert _week_resp.json()['data'] == {
+ 'id': f'{self._osfid}:week',
+ 'type': 'node-analytics',
+ 'attributes': {
+ 'popular_pages': [
+ {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 4},
+ {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2},
+ ],
+ 'unique_visits': [
+ {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2},
+ {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3},
+ {'date': str(self._today), 'count': 1},
+ ],
+ 'time_of_day': [
+ {'hour': 11, 'count': 3},
+ {'hour': 10, 'count': 2},
+ {'hour': 12, 'count': 1},
+ ],
+ 'referer_domain': [
+ {'referer_domain': 'somewhere.example.com', 'count': 3},
+ {'referer_domain': 'elsewhere.example.com', 'count': 1},
+ ],
},
}
- resp = app.get(f'/_/metrics/query/node_analytics/{guid}/{timespan}/')
- assert resp.json['data'] == {
- 'id': f'{guid}:{timespan}',
+ _fortnight_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/fortnight/')
+ assert _fortnight_resp.json()['data'] == {
+ 'id': f'{self._osfid}:fortnight',
'type': 'node-analytics',
'attributes': {
'popular_pages': [
- {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 17},
- {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 7},
+ {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 5},
+ {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2},
],
'unique_visits': [
- {'date': '2022-03-03', 'count': 8},
- {'date': '2022-03-04', 'count': 1},
+ {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1},
+ *(
+ {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0}
+ for _n in range(9, 2, -1)
+ ),
+ {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2},
+ {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3},
+ {'date': str(self._today), 'count': 1},
],
'time_of_day': [
- {'hour': 8, 'count': 1},
- {'hour': 9, 'count': 2},
- {'hour': 10, 'count': 3},
+ {'hour': 11, 'count': 4},
+ {'hour': 10, 'count': 2},
+ {'hour': 12, 'count': 1},
],
'referer_domain': [
- {'referer_domain': 'somewhere.example.com', 'count': 9},
- {'referer_domain': 'elsewhere.example.com', 'count': 4},
+ {'referer_domain': 'somewhere.example.com', 'count': 3},
+ {'referer_domain': 'elsewhere.example.com', 'count': 2},
],
},
}
- assert mock_search.call_count == 1
+ _month_resp = self.client.get(f'/_/metrics/query/node_analytics/{self._osfid}/month/')
+ assert _month_resp.json()['data'] == {
+ 'id': f'{self._osfid}:month',
+ 'type': 'node-analytics',
+ 'attributes': {
+ 'popular_pages': [
+ {'route': 'page.another', 'path': '/page/another', 'title': 'blaz', 'count': 6},
+ {'route': 'page.route', 'path': '/page/path', 'title': 'foo', 'count': 2},
+ ],
+ 'unique_visits': [
+ {'date': str(self._today - datetime.timedelta(days=20)), 'count': 1},
+ *(
+ {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0}
+ for _n in range(19, 10, -1)
+ ),
+ {'date': str(self._today - datetime.timedelta(days=10)), 'count': 1},
+ *(
+ {'date': str(self._today - datetime.timedelta(days=_n)), 'count': 0}
+ for _n in range(9, 2, -1)
+ ),
+ {'date': str(self._today - datetime.timedelta(days=2)), 'count': 2},
+ {'date': str(self._today - datetime.timedelta(days=1)), 'count': 3},
+ {'date': str(self._today), 'count': 1},
+ ],
+ 'time_of_day': [
+ {'hour': 11, 'count': 5},
+ {'hour': 10, 'count': 2},
+ {'hour': 12, 'count': 1},
+ ],
+ 'referer_domain': [
+ {'referer_domain': 'somewhere.example.com', 'count': 4},
+ {'referer_domain': 'elsewhere.example.com', 'count': 2},
+ ],
+ },
+ }
diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py
index e32936d9024..a30be5584e7 100644
--- a/api_tests/metrics/test_raw_metrics.py
+++ b/api_tests/metrics/test_raw_metrics.py
@@ -1,10 +1,8 @@
-import pytest
-
-from website.app import setup_django
-setup_django()
+from http import HTTPStatus
+import pytest
from waffle.testutils import override_switch
-from elasticsearch6_dsl.connections import connections as es6_connections
+from elasticsearch8.dsl.connections import connections as es8_connections
from osf import features
from osf_tests.factories import AuthUserFactory
@@ -14,7 +12,7 @@
pytestmark = pytest.mark.django_db
-@pytest.mark.es_metrics
+@pytest.mark.djelme_elasticsearch_backends
class TestRawMetrics:
@pytest.fixture(autouse=True)
@@ -23,10 +21,11 @@ def enable_elasticsearch_metrics(self):
yield
@pytest.fixture(autouse=True)
- def teardown_customer_index(self, es6_client):
- es6_client.indices.delete(index='customer', ignore_unavailable=True)
+ def teardown_customer_index(self):
+ _es8_client = es8_connections.get_connection('osfmetrics_es8')
+ _es8_client.indices.delete(index='customer', ignore_unavailable=True)
yield
- es6_client.indices.delete(index='customer', ignore_unavailable=True)
+ _es8_client.indices.delete(index='customer', ignore_unavailable=True)
@pytest.fixture
def user(self):
@@ -40,19 +39,17 @@ def user(self):
def other_user(self):
return AuthUserFactory()
- @pytest.fixture(params=['raw', 'raw-osfmetrics_es6'])
+ @pytest.fixture
def base_url(self, request):
- return f'/{API_BASE}metrics/{request.param}/'
+ return f'/{API_BASE}metrics/raw-osfmetrics_es8/'
def test_delete(self, app, user, base_url):
res = app.delete_json_api(base_url, auth=user.auth, expect_errors=True)
- assert res.status_code == 400
- assert res.json['errors'][0]['detail'] == 'DELETE not supported. Use GET/POST/PUT'
+ assert res.status_code == HTTPStatus.METHOD_NOT_ALLOWED
def test_put(self, app, user, base_url):
put_return = {
'_index': 'customer',
- '_type': '_doc',
'_id': '1',
'_version': 1,
'result': 'created',
@@ -69,7 +66,7 @@ def test_put(self, app, user, base_url):
put_data = {
'name': 'John Doe'
}
- res = app.put_json_api(put_url, put_data, auth=user.auth)
+ res = app.put_json_api(put_url, put_data, headers={'Content-Type': 'application/json'}, auth=user.auth)
assert res.json == put_return
def test_put_no_perms(self, app, other_user, base_url):
@@ -77,14 +74,13 @@ def test_put_no_perms(self, app, other_user, base_url):
put_data = {
'name': 'John Doe'
}
- res = app.put_json_api(put_url, put_data, auth=other_user.auth, expect_errors=True)
+ res = app.put_json_api(put_url, put_data, auth=other_user.auth, headers={'Content-Type': 'application/json'}, expect_errors=True)
assert res.status_code == 403
assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.'
def test_post(self, app, user, base_url):
post_return = {
'_index': 'customer',
- '_type': '_doc',
'_id': '1',
'_version': 1,
'result': 'created',
@@ -101,7 +97,7 @@ def test_post(self, app, user, base_url):
post_data = {
'name': 'Jane Doe'
}
- res = app.post_json_api(post_url, post_data, auth=user.auth)
+ res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=user.auth)
assert res.json == post_return
def test_post_no_perms(self, app, other_user, base_url):
@@ -109,14 +105,13 @@ def test_post_no_perms(self, app, other_user, base_url):
post_data = {
'name': 'John Doe'
}
- res = app.post_json_api(post_url, post_data, auth=other_user.auth, expect_errors=True)
+ res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=other_user.auth, expect_errors=True)
assert res.status_code == 403
assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.'
def test_post_and_get(self, app, user, base_url):
post_return = {
'_index': 'customer',
- '_type': '_doc',
'_id': '1',
'_version': 1,
'result': 'created',
@@ -133,17 +128,17 @@ def test_post_and_get(self, app, user, base_url):
post_data = {
'name': 'Beyonce'
}
- res = app.post_json_api(post_url, post_data, auth=user.auth)
+ res = app.post_json_api(post_url, post_data, headers={'Content-Type': 'application/json'}, auth=user.auth)
assert res.json == post_return
- es6_connections.get_connection('osfmetrics_es6').indices.refresh(
+ es8_connections.get_connection('osfmetrics_es8').indices.refresh(
index='customer',
)
get_url = f'{base_url}customer/_search?q=*'
res = app.get(get_url, auth=user.auth)
- assert res.json['hits']['total'] == 1
+ assert res.json['hits']['total']['value'] == 1
assert res.json['hits']['hits'][0]['_source']['name'] == 'Beyonce'
get_url = f'{base_url}customer/_doc/1/'
diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py
index f5d3a047b10..feaf48b7de9 100644
--- a/api_tests/metrics/test_registries_moderation_metrics.py
+++ b/api_tests/metrics/test_registries_moderation_metrics.py
@@ -2,12 +2,11 @@
from osf_tests.factories import RegistrationFactory, AuthUserFactory
from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers
-from osf.metrics import RegistriesModerationMetrics
+from osf.metrics.events import RegistriesModerationEvent
from tests.utils import capture_notifications
-pytestmark = pytest.mark.django_db
-
+@pytest.mark.djelme_elasticsearch_backends
@pytest.mark.django_db
class TestRegistrationModerationMetrics:
@@ -15,7 +14,6 @@ class TestRegistrationModerationMetrics:
def registration(self):
return RegistrationFactory()
- @pytest.mark.es_metrics
def test_record_transitions(self, registration):
with capture_notifications():
registration._write_registration_action(
@@ -24,10 +22,10 @@ def test_record_transitions(self, registration):
registration.creator,
'Metrics is easy'
)
- RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern)
+ RegistriesModerationEvent.refresh()
- assert RegistriesModerationMetrics.search().count() == 1
- data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source']
+ assert RegistriesModerationEvent.search().count() == 1
+ data = RegistriesModerationEvent.search().execute()['hits']['hits'][0]['_source']
assert data['from_state'] == RegistrationModerationStates.INITIAL.db_name
assert data['to_state'] == RegistrationModerationStates.PENDING.db_name
@@ -36,6 +34,7 @@ def test_record_transitions(self, registration):
assert data['comment'] == 'Metrics is easy'
+@pytest.mark.djelme_elasticsearch_backends
@pytest.mark.django_db
class TestRegistrationModerationMetricsView:
@@ -59,7 +58,6 @@ def other_user(self):
def base_url(self):
return '/_/metrics/registries_moderation/transitions/'
- @pytest.mark.es_metrics
def test_registries_moderation_view(self, app, user, base_url, registration):
with capture_notifications():
registration._write_registration_action(
@@ -68,7 +66,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration):
registration.creator,
'Metrics is easy'
)
- RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern)
+ RegistriesModerationEvent.refresh()
res = app.get(base_url, auth=user.auth, expect_errors=True)
data = res.json
diff --git a/api_tests/metrics/test_reports.py b/api_tests/metrics/test_reports.py
index db748bdb05b..bebb42059b8 100644
--- a/api_tests/metrics/test_reports.py
+++ b/api_tests/metrics/test_reports.py
@@ -21,7 +21,7 @@ def mock_domain(self):
@pytest.fixture
def mock_search(self):
- with mock.patch('elasticsearch6.Elasticsearch.search', autospec=True) as mock_search:
+ with mock.patch('elasticsearch8.Elasticsearch.search', autospec=True) as mock_search:
yield mock_search
def test_report_names(self, app, mock_domain):
@@ -44,11 +44,11 @@ def test_report_names(self, app, mock_domain):
@pytest.mark.parametrize('report_name', expected_report_names)
def test_recent_reports(self, app, mock_domain, mock_search, report_name):
- mock_search.return_value = {
+ mock_search.return_value.body = {
'hits': {
'hits': [
- {'_id': 'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye'}},
- {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa'}},
+ {'_id': 'hi-by', '_source': {'report_date': '1234-12-12', 'hello': 'goodbye', 'created': '1235-12-13T01:00:00Z'}},
+ {'_id': 'doof', '_source': {'report_date': '1234-12-11', 'hello': 'upwa', 'created': '1235-12-12T01:00:00Z'}},
],
},
}
@@ -58,17 +58,19 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name):
assert resp.json['data'] == [
{
'id': 'hi-by',
- 'type': f'daily-report:{report_name}',
+ 'type': f'cyclic-report:{report_name}',
'attributes': {
'report_date': '1234-12-12',
'hello': 'goodbye',
+ 'created': '1235-12-13T01:00:00Z',
},
}, {
'id': 'doof',
- 'type': f'daily-report:{report_name}',
+ 'type': f'cyclic-report:{report_name}',
'attributes': {
'report_date': '1234-12-11',
'hello': 'upwa',
+ 'created': '1235-12-12T01:00:00Z',
},
}
]
@@ -84,12 +86,12 @@ def test_recent_reports(self, app, mock_domain, mock_search, report_name):
assert resp.unicode_body == CSV_REPORTS
-TSV_REPORTS = '''report_date hello
-1234-12-12 goodbye
-1234-12-11 upwa
+TSV_REPORTS = '''report_date created hello
+1234-12-12 1235-12-13 01:00:00+00:00 goodbye
+1234-12-11 1235-12-12 01:00:00+00:00 upwa
'''.replace('\n', '\r\n')
-CSV_REPORTS = '''report_date,hello
-1234-12-12,goodbye
-1234-12-11,upwa
+CSV_REPORTS = '''report_date,created,hello
+1234-12-12,1235-12-13 01:00:00+00:00,goodbye
+1234-12-11,1235-12-12 01:00:00+00:00,upwa
'''.replace('\n', '\r\n')
diff --git a/api_tests/preprints/views/test_preprint_detail_metrics.py b/api_tests/preprints/views/test_preprint_detail_metrics.py
index f98777be678..9d945e8159f 100644
--- a/api_tests/preprints/views/test_preprint_detail_metrics.py
+++ b/api_tests/preprints/views/test_preprint_detail_metrics.py
@@ -17,17 +17,13 @@ def enable_elasticsearch_metrics(self):
with override_switch(features.ELASTICSEARCH_METRICS, active=True):
yield
- @pytest.mark.parametrize(('metric_name', 'metric_class_name'),
- [
- ('downloads', 'PreprintDownload'),
- ('views', 'PreprintView'),
- ])
- def test_preprint_detail_with_downloads(self, app, settings, metric_name, metric_class_name):
+ @pytest.mark.parametrize('metric_name', ['downloads', 'views'])
+ def test_preprint_detail_with_downloads(self, app, settings, metric_name):
preprint = PreprintFactory()
url = f'/{API_BASE}preprints/{preprint._id}/?metrics[{metric_name}]=total'
- with mock.patch(f'api.preprints.views.{metric_class_name}.get_count_for_preprint') as mock_get_count_for_preprint:
- mock_get_count_for_preprint.return_value = 42
+ with mock.patch('api.base.metrics.UsageMetricsViewMixin._get_usage_count') as mock_get_count:
+ mock_get_count.return_value = 42
res = app.get(url)
assert res.status_code == 200
diff --git a/api_tests/preprints/views/test_preprint_list.py b/api_tests/preprints/views/test_preprint_list.py
index 3208c397893..15d12079328 100644
--- a/api_tests/preprints/views/test_preprint_list.py
+++ b/api_tests/preprints/views/test_preprint_list.py
@@ -1,9 +1,8 @@
from unittest import mock
-import datetime as dt
import pytest
from django.utils import timezone
-from waffle.testutils import override_switch, override_flag
+from waffle.testutils import override_flag
from addons.github.models import GithubFile
from api.base.settings.defaults import API_BASE
@@ -1027,65 +1026,3 @@ def provider(self):
@pytest.fixture()
def url(self, project):
return f'/{API_BASE}preprints/?version=2.2&'
-
-
-@pytest.mark.django_db
-class TestPreprintListWithMetrics:
-
- # enable the ELASTICSEARCH_METRICS switch for all tests
- @pytest.fixture(autouse=True)
- def enable_elasticsearch_metrics(self):
- with override_switch(features.ELASTICSEARCH_METRICS, active=True):
- yield
-
- @pytest.mark.parametrize(
- ('metric_name', 'metric_class_name'),
- [
- ('downloads', 'PreprintDownload'),
- ('views', 'PreprintView'),
- ],
- )
- def test_preprint_list_with_metrics(self, app, metric_name, metric_class_name):
- url = f'/{API_BASE}preprints/?metrics[{metric_name}]=total'
- preprint1 = PreprintFactory()
- preprint1.downloads = 41
- preprint2 = PreprintFactory()
- preprint2.downloads = 42
-
- with mock.patch(f'api.preprints.views.{metric_class_name}.get_top_by_count') as mock_get_top_by_count:
- mock_get_top_by_count.return_value = [preprint2, preprint1]
- res = app.get(url)
- assert res.status_code == 200
-
- preprint_2_data = res.json['data'][0]
- assert preprint_2_data['meta']['metrics']['downloads'] == 42
-
- preprint_1_data = res.json['data'][1]
- assert preprint_1_data['meta']['metrics']['downloads'] == 41
-
- @mock.patch('django.utils.timezone.now')
- @pytest.mark.parametrize(
- ('query_value', 'timedelta'),
- [
- ('daily', dt.timedelta(days=1)),
- ('weekly', dt.timedelta(days=7)),
- ('yearly', dt.timedelta(days=365)),
- ],
- )
- def test_preprint_list_filter_metric_by_time_period(self, mock_timezone_now, app, settings, query_value, timedelta):
- url = f'/{API_BASE}preprints/?metrics[views]={query_value}'
- mock_now = dt.datetime.utcnow().replace(tzinfo=timezone.utc)
- mock_timezone_now.return_value = mock_now
-
- preprint1 = PreprintFactory()
- preprint1.views = 41
- preprint2 = PreprintFactory()
- preprint2.views = 42
-
- with mock.patch('api.preprints.views.PreprintView.get_top_by_count') as mock_get_top_by_count:
- mock_get_top_by_count.return_value = [preprint2, preprint1]
- res = app.get(url)
-
- assert res.status_code == 200
- call_kwargs = mock_get_top_by_count.call_args[1]
- assert call_kwargs['after'] == mock_now - timedelta
diff --git a/api_tests/providers/preprints/views/test_preprint_provider_list.py b/api_tests/providers/preprints/views/test_preprint_provider_list.py
index c1624fd58f9..21499744d77 100644
--- a/api_tests/providers/preprints/views/test_preprint_provider_list.py
+++ b/api_tests/providers/preprints/views/test_preprint_provider_list.py
@@ -1,8 +1,5 @@
-from unittest import mock
import pytest
-from waffle.testutils import override_switch
-from osf import features
from api.base.settings.defaults import API_BASE
from osf_tests.factories import (
AuthUserFactory,
@@ -65,28 +62,3 @@ def test_preprint_provider_list_filtering(
url, filter_type, filter_value))
assert res.status_code == 200
assert len(res.json['data']) == 1
-
-
-@pytest.mark.django_db
-class TestPreprintProviderListWithMetrics:
-
- # enable the ELASTICSEARCH_METRICS switch for all tests
- @pytest.fixture(autouse=True)
- def enable_elasticsearch_metrics(self):
- with override_switch(features.ELASTICSEARCH_METRICS, active=True):
- yield
-
- def test_preprint_provider_list_with_metrics(self, app, url, provider_one, provider_two):
- provider_one.downloads = 41
- provider_two.downloads = 42
- with mock.patch('api.preprints.views.PreprintDownload.get_top_by_count') as mock_get_top_by_count:
- mock_get_top_by_count.return_value = [provider_one, provider_two]
- res = app.get(url + 'metrics[downloads]=total')
-
- assert res.status_code == 200
-
- provider_2_data = res.json['data'][0]
- provider_2_data['meta']['metrics']['downloads'] == 42
-
- provider_1_data = res.json['data'][1]
- provider_1_data['meta']['metrics']['downloads'] == 41
diff --git a/api_tests/share/test_share_node.py b/api_tests/share/test_share_node.py
index 2fcc9cc48a8..9466f603fda 100644
--- a/api_tests/share/test_share_node.py
+++ b/api_tests/share/test_share_node.py
@@ -21,7 +21,10 @@
from framework.auth.core import Auth
from api.share.utils import shtrove_ingest_url
-from ._utils import expect_ingest_request
+from ._utils import (
+ expect_ingest_request,
+ mock_share_responses,
+)
@pytest.mark.django_db
@@ -30,9 +33,12 @@ class TestNodeShare:
@pytest.fixture(scope='class', autouse=True)
def _patches(self):
- with patch('osf.models.identifiers.IdentifierMixin.request_identifier_update'):
- with patch.object(settings, 'USE_CELERY', False):
- yield
+ with (
+ patch.object(settings, 'USE_CELERY', False),
+ patch('osf.models.identifiers.IdentifierMixin.request_identifier_update'),
+ patch('osf.metadata.osf_gathering.MonthlyPublicItemUsageReport.from_last_month', return_value=()),
+ ):
+ yield
@pytest.fixture()
def user(self):
@@ -98,15 +104,21 @@ def registration_outcome(self, registration):
)
return o
- def test_update_node_share(self, mock_share_responses, node, user):
- with expect_ingest_request(mock_share_responses, node):
+ def test_update_node_share(self, node, user):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_ingest_request(_mock_share_responses, node),
+ ):
on_node_updated(node._id, user._id, False, {'is_public'})
- def test_update_registration_share(self, mock_share_responses, registration, user):
- with expect_ingest_request(mock_share_responses, registration):
+ def test_update_registration_share(self, registration, user):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_ingest_request(_mock_share_responses, registration),
+ ):
on_node_updated(registration._id, user._id, False, {'is_public'})
- def test_update_share_correctly_for_projects(self, mock_share_responses, node, user):
+ def test_update_share_correctly_for_projects(self, node, user):
cases = [{
'is_deleted': False,
'attrs': {'is_public': True, 'is_deleted': False, 'spam_status': SpamStatus.HAM}
@@ -121,14 +133,16 @@ def test_update_share_correctly_for_projects(self, mock_share_responses, node, u
'attrs': {'is_public': True, 'is_deleted': False, 'spam_status': SpamStatus.SPAM}
}]
- mock_share_responses._calls.reset() # reset after factory calls
for i, case in enumerate(cases):
for attr, value in case['attrs'].items():
setattr(node, attr, value)
- with expect_ingest_request(mock_share_responses, node, delete=case['is_deleted']):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_ingest_request(_mock_share_responses, node, delete=case['is_deleted']),
+ ):
node.save()
- def test_update_share_correctly_for_registrations(self, mock_share_responses, registration, user):
+ def test_update_share_correctly_for_registrations(self, registration, user):
cases = [{
'is_deleted': True,
'attrs': {'is_public': False, 'is_deleted': False}
@@ -140,44 +154,50 @@ def test_update_share_correctly_for_registrations(self, mock_share_responses, re
'attrs': {'is_public': True, 'is_deleted': False}
}]
- mock_share_responses._calls.reset() # reset after factory calls
for i, case in enumerate(cases):
for attr, value in case['attrs'].items():
setattr(registration, attr, value)
- with expect_ingest_request(mock_share_responses, registration, delete=case['is_deleted']):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_ingest_request(_mock_share_responses, registration, delete=case['is_deleted']),
+ ):
registration.save()
assert registration.is_registration
- def test_update_share_correctly_for_projects_with_qa_tags(self, mock_share_responses, node, user):
- with expect_ingest_request(mock_share_responses, node, delete=True):
- node.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user))
- with expect_ingest_request(mock_share_responses, node, delete=False):
- node.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True)
-
- def test_update_share_correctly_for_registrations_with_qa_tags(self, mock_share_responses, registration, user):
- with expect_ingest_request(mock_share_responses, registration, delete=True):
- registration.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user))
- with expect_ingest_request(mock_share_responses, registration):
- registration.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True)
-
- def test_update_share_correctly_for_projects_with_qa_titles(self, mock_share_responses, node, user):
+ def test_update_share_correctly_for_projects_with_qa_tags(self, node, user):
+ with mock_share_responses() as _mock_share_responses:
+ with expect_ingest_request(_mock_share_responses, node, delete=True):
+ node.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user))
+ with expect_ingest_request(_mock_share_responses, node, delete=False):
+ node.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True)
+
+ def test_update_share_correctly_for_registrations_with_qa_tags(self, registration, user):
+ with mock_share_responses() as _mock_share_responses:
+ with expect_ingest_request(_mock_share_responses, registration, delete=True):
+ registration.add_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user))
+ with expect_ingest_request(_mock_share_responses, registration):
+ registration.remove_tag(settings.DO_NOT_INDEX_LIST['tags'][0], auth=Auth(user), save=True)
+
+ def test_update_share_correctly_for_projects_with_qa_titles(self, node, user):
node.title = settings.DO_NOT_INDEX_LIST['titles'][0] + ' arbitrary text for test title.'
node.save()
- with expect_ingest_request(mock_share_responses, node, delete=True):
- on_node_updated(node._id, user._id, False, {'is_public'})
- node.title = 'Not a qa title'
- with expect_ingest_request(mock_share_responses, node):
- node.save()
- assert node.title not in settings.DO_NOT_INDEX_LIST['titles']
+ with mock_share_responses() as _mock_share_responses:
+ with expect_ingest_request(_mock_share_responses, node, delete=True):
+ on_node_updated(node._id, user._id, False, {'is_public'})
+ node.title = 'Not a qa title'
+ with expect_ingest_request(_mock_share_responses, node):
+ node.save()
+ assert node.title not in settings.DO_NOT_INDEX_LIST['titles']
- def test_update_share_correctly_for_registrations_with_qa_titles(self, mock_share_responses, registration, user):
+ def test_update_share_correctly_for_registrations_with_qa_titles(self, registration, user):
registration.title = settings.DO_NOT_INDEX_LIST['titles'][0] + ' arbitrary text for test title.'
- with expect_ingest_request(mock_share_responses, registration, delete=True):
- registration.save()
- registration.title = 'Not a qa title'
- with expect_ingest_request(mock_share_responses, registration):
- registration.save()
- assert registration.title not in settings.DO_NOT_INDEX_LIST['titles']
+ with mock_share_responses() as _mock_share_responses:
+ with expect_ingest_request(_mock_share_responses, registration, delete=True):
+ registration.save()
+ registration.title = 'Not a qa title'
+ with expect_ingest_request(_mock_share_responses, registration):
+ registration.save()
+ assert registration.title not in settings.DO_NOT_INDEX_LIST['titles']
@responses.activate
def test_skips_no_settings(self, node, user):
@@ -185,22 +205,25 @@ def test_skips_no_settings(self, node, user):
assert len(responses.calls) == 0
@mark.skip('Synchronous retries not supported if celery >=5.0')
- def test_call_async_update_on_500_retry(self, mock_share_responses, node, user):
+ def test_call_async_update_on_500_retry(self, node, user):
"""This is meant to simulate a temporary outage, so the retry mechanism should kick in and complete it."""
- mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
- mock_share_responses.add(responses.POST, shtrove_ingest_url(), status=200)
- with expect_ingest_request(mock_share_responses, node, count=2):
- on_node_updated(node._id, user._id, False, {'is_public'})
+ with mock_share_responses() as _mock_share_responses:
+ _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
+ _mock_share_responses.add(responses.POST, shtrove_ingest_url(), status=200)
+ with expect_ingest_request(_mock_share_responses, node, count=2):
+ on_node_updated(node._id, user._id, False, {'is_public'})
@mark.skip('Synchronous retries not supported if celery >=5.0')
- def test_call_async_update_on_500_failure(self, mock_share_responses, node, user):
+ def test_call_async_update_on_500_failure(self, node, user):
"""This is meant to simulate a total outage, so the retry mechanism should try X number of times and quit."""
- mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
- with expect_ingest_request(mock_share_responses, node, count=5): # tries five times
- on_node_updated(node._id, user._id, False, {'is_public'})
+ with mock_share_responses() as _mock_share_responses:
+ _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
+ with expect_ingest_request(_mock_share_responses, node, count=5): # tries five times
+ on_node_updated(node._id, user._id, False, {'is_public'})
@mark.skip('Synchronous retries not supported if celery >=5.0')
- def test_no_call_async_update_on_400_failure(self, mock_share_responses, node, user):
- mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400)
- with expect_ingest_request(mock_share_responses, node):
- on_node_updated(node._id, user._id, False, {'is_public'})
+ def test_no_call_async_update_on_400_failure(self, node, user):
+ with mock_share_responses() as _mock_share_responses:
+ _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400)
+ with expect_ingest_request(_mock_share_responses, node):
+ on_node_updated(node._id, user._id, False, {'is_public'})
diff --git a/api_tests/share/test_share_preprint.py b/api_tests/share/test_share_preprint.py
index 118abf3105b..ca58a868e47 100644
--- a/api_tests/share/test_share_preprint.py
+++ b/api_tests/share/test_share_preprint.py
@@ -17,8 +17,11 @@
)
from website import settings
from website.preprints.tasks import on_preprint_updated
-from ._utils import expect_preprint_ingest_request
from tests.utils import capture_notifications
+from ._utils import (
+ expect_preprint_ingest_request,
+ mock_share_responses,
+)
@pytest.mark.django_db
@@ -26,7 +29,10 @@
class TestPreprintShare:
@pytest.fixture(scope='class', autouse=True)
def _patches(self):
- with mock.patch.object(settings, 'USE_CELERY', False):
+ with (
+ mock.patch.object(settings, 'USE_CELERY', False),
+ mock.patch('osf.metadata.osf_gathering.MonthlyPublicItemUsageReport.from_last_month', return_value=()),
+ ):
yield
@pytest.fixture
@@ -45,7 +51,7 @@ def provider(self):
)
@pytest.fixture
- def project(self, user, mock_share_responses):
+ def project(self, user):
return ProjectFactory(creator=user, is_public=True)
@pytest.fixture
@@ -67,89 +73,121 @@ def preprint(self, project, user, provider, subject):
is_published=False
)
- def test_save_unpublished_not_called(self, mock_share_responses, preprint):
+ def test_save_unpublished_not_called(self, preprint):
# expecting no ingest requests (delete or otherwise)
- with expect_preprint_ingest_request(mock_share_responses, preprint, count=0):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint, count=0),
+ ):
preprint.save()
- def test_save_published_called(self, mock_share_responses, preprint, user, auth):
- with capture_notifications():
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.set_published(True, auth=auth, save=True)
+ def test_save_published_called(self, preprint, user, auth):
+ with (
+ capture_notifications(),
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint),
+ ):
+ preprint.set_published(True, auth=auth, save=True)
# This covers an edge case where a preprint is forced back to unpublished
# that it sends the information back to share
- def test_save_unpublished_called_forced(self, mock_share_responses, auth, preprint):
- with capture_notifications():
- with expect_preprint_ingest_request(mock_share_responses, preprint):
+ def test_save_unpublished_called_forced(self, auth, preprint):
+ with (
+ capture_notifications(),
+ mock_share_responses() as _mock_share_responses,
+ ):
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
preprint.set_published(True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True):
+ with expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True):
preprint.is_published = False
preprint.save(**{'force_update': True})
- def test_save_published_subject_change_called(self, mock_share_responses, auth, preprint, subject, subject_two):
+ def test_save_published_subject_change_called(self, auth, preprint, subject, subject_two):
with capture_notifications():
preprint.set_published(True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint),
+ ):
preprint.set_subjects([[subject_two._id]], auth=auth)
- def test_save_unpublished_subject_change_not_called(self, mock_share_responses, auth, preprint, subject_two):
- with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True):
+ def test_save_unpublished_subject_change_not_called(self, auth, preprint, subject_two):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True),
+ ):
preprint.set_subjects([[subject_two._id]], auth=auth)
- def test_send_to_share_is_true(self, mock_share_responses, auth, preprint):
+ def test_send_to_share_is_true(self, auth, preprint):
with capture_notifications():
preprint.set_published(True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint),
+ ):
on_preprint_updated(preprint._id, saved_fields=['title'])
- def test_preprint_contributor_changes_updates_preprints_share(self, mock_share_responses, user, auth):
+ def test_preprint_contributor_changes_updates_preprints_share(self, user, auth):
with capture_notifications():
preprint = PreprintFactory(is_published=True, creator=user)
preprint.set_published(True, auth=auth, save=True)
user2 = AuthUserFactory()
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.add_contributor(contributor=user2, auth=auth, save=True)
+ with mock_share_responses() as _mock_share_responses:
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
+ preprint.add_contributor(contributor=user2, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.move_contributor(contributor=user, index=0, auth=auth, save=True)
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
+ preprint.move_contributor(contributor=user, index=0, auth=auth, save=True)
- data = [{'id': user._id, 'permissions': ADMIN, 'visible': True},
- {'id': user2._id, 'permissions': WRITE, 'visible': False}]
+ data = [{'id': user._id, 'permissions': ADMIN, 'visible': True},
+ {'id': user2._id, 'permissions': WRITE, 'visible': False}]
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.manage_contributors(data, auth=auth, save=True)
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
+ preprint.manage_contributors(data, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.update_contributor(user2, READ, True, auth=auth, save=True)
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
+ preprint.update_contributor(user2, READ, True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint):
- preprint.remove_contributor(contributor=user2, auth=auth)
+ with expect_preprint_ingest_request(_mock_share_responses, preprint):
+ preprint.remove_contributor(contributor=user2, auth=auth)
@pytest.mark.skip('Synchronous retries not supported if celery >=5.0')
- def test_call_async_update_on_500_failure(self, mock_share_responses, preprint, auth):
- mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
+ def test_call_async_update_on_500_failure(self, preprint, auth):
preprint.set_published(True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint, count=5):
- preprint.update_search()
+ with mock_share_responses() as _mock_share_responses:
+ _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=500)
+ with expect_preprint_ingest_request(_mock_share_responses, preprint, count=5):
+ preprint.update_search()
- def test_no_call_async_update_on_400_failure(self, mock_share_responses, preprint, auth):
+ def test_no_call_async_update_on_400_failure(self, preprint, auth):
with capture_notifications():
- mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400)
preprint.set_published(True, auth=auth, save=True)
- with expect_preprint_ingest_request(mock_share_responses, preprint, count=1, error_response=True):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint, count=1, error_response=True),
+ ):
+ _mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400)
preprint.update_search()
- def test_delete_from_share(self, mock_share_responses):
+ def test_delete_from_share(self):
preprint = PreprintFactory()
- with expect_preprint_ingest_request(mock_share_responses, preprint):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint),
+ ):
preprint.update_search()
preprint.date_withdrawn = datetime.now()
preprint.save()
- with expect_preprint_ingest_request(mock_share_responses, preprint):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint),
+ ):
preprint.update_search()
preprint.spam_status = SpamStatus.SPAM
preprint.save()
- with expect_preprint_ingest_request(mock_share_responses, preprint, delete=True):
+ with (
+ mock_share_responses() as _mock_share_responses,
+ expect_preprint_ingest_request(_mock_share_responses, preprint, delete=True),
+ ):
preprint.update_search()
diff --git a/conftest.py b/conftest.py
index e80c4e5c566..165fae951ca 100644
--- a/conftest.py
+++ b/conftest.py
@@ -4,7 +4,6 @@
import re
from django.db import transaction
-from elasticsearch6_dsl.connections import connections
from elasticsearch_metrics.tests.util import djelme_test_backends
from faker import Factory
import pytest
@@ -129,22 +128,12 @@ def _test_speedups_disable(request, settings, _test_speedups):
patcher.start()
-@pytest.fixture(scope='session')
-def setup_connections():
- connections.create_connection(hosts=[website_settings.ELASTIC6_URI])
-
-
-@pytest.fixture(scope='function')
-def es6_client(setup_connections):
- return connections.get_connection()
-
-
@pytest.fixture(scope='function', autouse=True)
-def _es_metrics_marker(request):
+def _djelme_elasticsearch_backends_marker(request):
"""Clear out all indices and index templates before and after
- tests marked with `es_metrics`.
+ tests marked with `djelme_elasticsearch_backends`.
"""
- marker = request.node.get_closest_marker('es_metrics')
+ marker = request.node.get_closest_marker('djelme_elasticsearch_backends')
if not marker:
yield
diff --git a/docker-compose-dist-arm64.override.yml b/docker-compose-dist-arm64.override.yml
deleted file mode 100644
index cffa4bd8982..00000000000
--- a/docker-compose-dist-arm64.override.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-## Reference README-docker-compose.md for instructions.
-
-services:
-
- #######
- # OSF #
- #######
-
- elasticsearch6:
- image: quay.io/centerforopenscience/elasticsearch:es6-arm-6.3.1
- platform: linux/arm64
diff --git a/docker-compose.yml b/docker-compose.yml
index 42f7efc5ce7..c62541d6596 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,8 +11,6 @@ volumes:
external: false
elasticsearch_data_vol:
external: false
- elasticsearch6_data_vol:
- external: false
elasticsearch8_data_vol:
external: false
rabbitmq_vol:
@@ -67,22 +65,6 @@ services:
- elasticsearch_data_vol:/usr/share/elasticsearch/data
stdin_open: true
- # Temporary: Remove when we've upgraded to ES6
- elasticsearch6:
- image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1
- environment:
- - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage
- ports:
- - 9201:9200
- volumes:
- - elasticsearch6_data_vol:/usr/share/elasticsearch/data
- healthcheck:
- start_period: 15s
- test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"'
- interval: 10s
- retries: 30
- stdin_open: true
-
elasticsearch8:
image: elasticsearch:8.19.14
environment:
diff --git a/osf/features.yaml b/osf/features.yaml
index cce490a25a4..1da56e44f79 100644
--- a/osf/features.yaml
+++ b/osf/features.yaml
@@ -93,11 +93,6 @@ switches:
name: enable_inactive_schemas
note: This is no longer used
- - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024
- name: countedusage_unified_metrics_2024
- note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc
- active: false
-
- flag_name: ENABLE_MAILHOG
name: enable_mailhog
note: This is used to enable the MailHog email testing service, this will allow emails to be sent to the
diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py
index 53e13472e74..b2c36adce38 100644
--- a/osf/management/commands/fake_metrics_reports.py
+++ b/osf/management/commands/fake_metrics_reports.py
@@ -4,18 +4,19 @@
from django.conf import settings
from django.core.management.base import BaseCommand
-from osf.metrics import (
- UserSummaryReport,
- PreprintSummaryReport,
+from osf.metrics.daily_reports import (
+ DailyUserSummaryReport,
+ DailyPreprintSummaryReport,
)
-from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
from osf.metrics.utils import YearMonth
+from osf.models.base import osfid_iri
from osf.models import PreprintProvider
def fake_user_counts(days_back):
yesterday = date.today() - timedelta(days=1)
- first_report = UserSummaryReport(
+ first_report = DailyUserSummaryReport(
report_date=(yesterday - timedelta(days=days_back)),
active=randint(0, 23),
deactivated=randint(0, 2),
@@ -29,7 +30,7 @@ def fake_user_counts(days_back):
last_report = first_report
while last_report.report_date < yesterday:
new_user_count = randint(0, 500)
- new_report = UserSummaryReport(
+ new_report = DailyUserSummaryReport(
report_date=(last_report.report_date + timedelta(days=1)),
active=(last_report.active + randint(0, new_user_count)),
deactivated=(last_report.deactivated + randint(0, new_user_count)),
@@ -48,7 +49,7 @@ def fake_preprint_counts(days_back):
for day_delta in range(days_back):
for provider_key in provider_keys:
preprint_count = randint(100, 5000) * (days_back - day_delta)
- PreprintSummaryReport(
+ DailyPreprintSummaryReport(
report_date=yesterday - timedelta(days=day_delta),
provider_key=provider_key,
preprint_count=preprint_count,
@@ -57,16 +58,29 @@ def fake_preprint_counts(days_back):
def fake_usage_reports(osfid: str, count: int):
_ym = YearMonth.from_date(date.today()).prior()
+ _prior_report = None
for _months in range(count):
- PublicItemUsageReport.record(
+ _report = MonthlyPublicItemUsageReport(
item_osfid=osfid,
+ item_iri=osfid_iri(osfid),
report_yearmonth=_ym,
view_count=(_vc := randint(0, 500)),
- view_session_count=randint(0, _vc),
+ view_session_count=(_vsc := randint(0, _vc)),
+ cumulative_view_count=_vc,
+ cumulative_view_session_count=_vsc,
download_count=(_dc := randint(0, 300)),
- download_session_count=randint(0, _dc),
+ download_session_count=(_dsc := randint(0, _dc)),
+ cumulative_download_count=_dc,
+ cumulative_download_session_count=_dsc,
)
+ if _prior_report:
+ _report.cumulative_view_count += _prior_report.cumulative_view_count
+ _report.cumulative_view_session_count += _prior_report.cumulative_view_session_count
+ _report.cumulative_download_count += _prior_report.cumulative_download_count
+ _report.cumulative_download_session_count += _prior_report.cumulative_download_session_count
+ _report.save()
_ym = _ym.prior()
+ _prior_report = _report
class Command(BaseCommand):
diff --git a/osf/management/commands/make_dummy_pageviews_for_metrics.py b/osf/management/commands/make_dummy_pageviews_for_metrics.py
deleted file mode 100644
index 09de34bf7a8..00000000000
--- a/osf/management/commands/make_dummy_pageviews_for_metrics.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""osf/management/commands/poke_metrics_timespan_queries.py
-"""
-import logging
-import random
-import datetime
-
-from django.core.management.base import BaseCommand
-from osf.metrics import CountedAuthUsage
-
-
-logger = logging.getLogger(__name__)
-
-TIME_FILTERS = (
- {'gte': 'now/d-150d'},
- {'gte': '2021-11-28T23:00:00.000Z', 'lte': '2023-01-16T00:00:00.000Z'},
-)
-
-PLATFORM_IRI = 'http://localhost:9201/'
-
-ITEM_GUID = 'foo'
-
-
-class Command(BaseCommand):
-
- def add_arguments(self, parser):
- parser.add_argument(
- '--count',
- type=int,
- default=100,
- help='number of fake pageviews to generate',
- )
- parser.add_argument(
- '--seconds_back',
- type=int,
- default=60 * 60 * 24 * 14, # up to two weeks back
- help='max age in seconds of random event',
- )
-
- def handle(self, *args, **options):
- self._generate_random_countedusage(options.get('count'), options.get('seconds_back'))
-
- results = [
- self._run_date_query(time_filter)
- for time_filter in TIME_FILTERS
- ]
-
- self._print_line(
- (str(f) for f in TIME_FILTERS),
- label='timefilter:',
- )
-
- date_keys = {
- k
- for r in results
- for k in r
- }
- for date_key in sorted(date_keys):
- self._print_line(
- (r.get(date_key, 0) for r in results),
- label=str(date_key),
- )
-
- def _print_line(self, lineitems, label=''):
- print('\t'.join((label, *map(str, lineitems))))
-
- def _generate_random_countedusage(self, n, max_age):
- now = datetime.datetime.now(tz=datetime.UTC)
- for _ in range(n):
- seconds_back = random.randint(0, max_age)
- timestamp_time = now - datetime.timedelta(seconds=seconds_back)
- CountedAuthUsage.record(
- platform_iri=PLATFORM_IRI,
- timestamp=timestamp_time,
- item_guid=ITEM_GUID,
- session_id='freshen by key',
- user_is_authenticated=bool(random.randint(0, 1)),
- item_public=bool(random.randint(0, 1)),
- action_labels=[['view', 'download'][random.randint(0, 1)]],
- )
-
- def _run_date_query(self, time_range_filter):
- result = self._run_query({
- 'query': {
- 'bool': {
- 'filter': {
- 'range': {
- 'timestamp': time_range_filter,
- },
- },
- },
- },
- 'aggs': {
- 'by-date': {
- 'date_histogram': {
- 'field': 'timestamp',
- 'interval': 'day',
- },
- },
- 'max-timestamp': {
- 'max': {'field': 'timestamp'},
- },
- 'min-timestamp': {
- 'min': {'field': 'timestamp'},
- },
- },
- })
- return {
- 'min': result.aggs['min-timestamp'].value,
- 'max': result.aggs['max-timestamp'].value,
- **{
- str(bucket.key.date()): bucket.doc_count
- for bucket in result.aggs['by-date']
- },
- }
-
- def _run_query(self, query_dict):
- analytics_search = CountedAuthUsage.search().update_from_dict(query_dict)
- return analytics_search.execute()
diff --git a/osf/management/commands/metrics_backfill_pageviews.py b/osf/management/commands/metrics_backfill_pageviews.py
deleted file mode 100644
index 13898037923..00000000000
--- a/osf/management/commands/metrics_backfill_pageviews.py
+++ /dev/null
@@ -1,203 +0,0 @@
-"""osf/management/commands/metrics_backfill_pageviews.py
-
-Usage:
-
- $ dc-manage metrics_backfill_pageviews --source=$path_to_csv
- $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --dry # dry run
- $ dc-manage metrics_backfill_pageviews --source=$path_to_csv --resume-from 1264 # start from record 1264
-
-
-"""
-import csv
-import logging
-import datetime
-
-from django.core.management.base import BaseCommand
-from osf.metrics import CountedAuthUsage
-from osf.models import Guid
-
-logger = logging.getLogger(__name__)
-
-def main(source, dry_run=False, resume_from=None):
- if not source:
- logger.info('No source file detected, exiting.')
- return
-
- # keen.timestamp => _source.timestamp # "2023-01-19T04:06:45.675432+00:00",
- # page.info.protocol + page.info.domain => _source.platform_iri # "http://localhost:5000/",
- # visitor.session => _source.session_id # "fcae918a3b6a19641bd0087f84083f0d57982d8c93ab821c405561d1b5c7b305",
- # user.id => _source.user_is_authenticated # true,
- # page.url => _source.pageview_info.page_url # "http://localhost:5000/my-projects/",
- # page.title => _source.pageview_info.page_title # "OSF | My Projects",
- # referrer.url => _source.pageview_info.referer_url # "http://localhost:5000/csab4/analytics",
- # page.meta.routeName => _source.pageview_info.route_name # "OsfWebRenderer.my_projects",
- # time.utc.hour_of_day => _source.pageview_info.hour_of_day # 4,
- # page.info.path => _source.pageview_info.page_path # "/my-projects",
- # referrer.info.domain => _source.pageview_info.referer_domain # "localhost:5000"
- # page.meta.public => _source.item_public # true,
- # node.id => _source.item_guid # "ry7dn",
-
- # ??? => _source.provider_id # "osf",
- # ??? => _source.item_type # "node"
- # ??? => _source.surrounding_guids = # [parent_guids?]
- # ??? => _source.action_labels # ["web"]
-
- count = 0
- reader = csv.DictReader(source)
- for row in reader:
- if not row['page.url'].startswith('https://staging.osf.io'):
- continue
-
- count += 1
- if resume_from is not None and count < resume_from:
- continue
-
- something_wonderful = {
- 'timestamp': _timestamp_to_dt(row['keen.timestamp']),
- 'platform_iri': row['page.info.protocol'] + '://' + row['page.info.domain'],
- 'session_id': row['visitor.session'],
- 'user_is_authenticated': row['user.id'] is not None,
- 'item_guid': row['node.id'],
- 'item_public': row['page.meta.public'] or row['page.meta.pubic'], # unfortunate misspelling
- 'pageview_info': {
- 'hour_of_day': row['time.utc.hour_of_day'],
- 'page_path': row['page.info.path'],
- 'page_title': row['page.title'],
- 'page_url': row['page.url'],
- 'referer_url': row['referrer.url'],
- 'referer_domain': row['referrer.info.domain'],
- 'route_name': row['page.meta.routeName'],
- },
- }
-
- db_info = annotate_from_db(row)
- if db_info:
- something_wonderful.update(db_info)
- populate_action_labels(something_wonderful, row)
-
- logger.info(f'*** {count}: something wonderful:({something_wonderful})')
-
- if not dry_run:
- CountedAuthUsage.record(**something_wonderful)
-
-def populate_action_labels(something_wonderful, row):
- labels = ['web']
-
- if row['page.info.path']:
- path_parts = row['page.info.path'].split('/')
- if len(path_parts) == 1 and path_parts[0] not in ('my-projects', 'goodbye', 'login'):
- labels.append('view')
- elif path_parts[1] in ('wiki'):
- labels.append('view')
-
- if row['page.meta.routeName']:
- route_name = row['page.meta.routeName']
- if 'search' in route_name:
- labels.append('search')
-
- something_wonderful['action_labels'] = labels
-
-guid_cache = {}
-# this may be done by CountedAuthUsage._fill_osfguid_info
-def annotate_from_db(row):
- item_guid = row['node.id']
- if not item_guid:
- return
-
- if not guid_cache.get(item_guid, None):
- guid_info = {}
- guid_instance = Guid.load(item_guid)
-
- if guid_instance and guid_instance.referent:
- guid_info = _fill_osfguid_info(guid_instance.referent)
- guid_cache[item_guid] = guid_info
-
- return guid_cache[item_guid]
-
-# from CountedAuthUsage
-def _fill_osfguid_info(guid_referent):
- guid_info = {}
- guid_info['item_public'] = _get_ispublic(guid_referent)
- guid_info['item_type'] = type(guid_referent).__name__.lower()
- guid_info['surrounding_guids'] = _get_surrounding_guids(guid_referent)
- guid_info['provider_id'] = _get_provider_id(guid_referent)
- return guid_info
-
-def _get_ispublic(guid_referent):
- # if it quacks like BaseFileNode, look at .target instead
- maybe_public = getattr(guid_referent, 'target', None) or guid_referent
- if hasattr(maybe_public, 'verified_publishable'):
- return maybe_public.verified_publishable # quacks like Preprint
- return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode
-
-def _get_provider_id(guid_referent):
- provider = getattr(guid_referent, 'provider', None)
- if isinstance(provider, str):
- return provider # quacks like BaseFileNode
- elif provider:
- return provider._id # quacks like Registration, Preprint, Collection
- return 'osf' # quacks like Node, Comment, WikiPage
-
-def _get_immediate_wrapper(guid_referent):
- if hasattr(guid_referent, 'verified_publishable'):
- return None # quacks like Preprint
- return (
- getattr(guid_referent, 'parent_node', None) # quacks like AbstractNode
- or getattr(guid_referent, 'node', None) # quacks like WikiPage, Comment
- or getattr(guid_referent, 'target', None) # quacks like BaseFileNode
- )
-
-def _get_surrounding_guids(guid_referent):
- """get all the parent/owner/surrounding guids for the given guid_referent
-
- @param guid_referent: instance of a model that has GuidMixin
- @returns list of str
-
- For AbstractNode, goes up the node hierarchy up to the root.
- For WikiPage or BaseFileNode, grab the node it belongs to and
- follow the node hierarchy from there.
- """
- surrounding_guids = []
- current_referent = guid_referent
- while current_referent:
- next_referent = _get_immediate_wrapper(current_referent)
- if next_referent:
- surrounding_guids.append(next_referent._id)
- current_referent = next_referent
- return surrounding_guids
-
-def _timestamp_to_dt(timestamp):
- return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC)
-
-def _timestamp_to_date(timestamp):
- dt_obj = _timestamp_to_dt(timestamp)
- return str(dt_obj.date())
-
-
-class Command(BaseCommand):
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument(
- '--source',
- type=open,
- help='source file (csv format w/ header line)',
- )
- parser.add_argument(
- '--dry',
- dest='dry',
- action='store_true',
- help='Dry run'
- )
- parser.add_argument(
- '--resume-from',
- dest='resume_from',
- type=int,
- help='start from which record',
- )
-
- def handle(self, *args, **options):
- dry_run = options.get('dry', None)
- source = options.get('source', None)
- resume_from = options.get('resume_from', None)
- main(source, dry_run, resume_from)
diff --git a/osf/management/commands/metrics_backfill_summaries.py b/osf/management/commands/metrics_backfill_summaries.py
deleted file mode 100644
index d259e9b2a52..00000000000
--- a/osf/management/commands/metrics_backfill_summaries.py
+++ /dev/null
@@ -1,435 +0,0 @@
-"""osf/management/commands/metrics_backfill_summaries.py
-
-usage:
-
- $ dc-manage metrics_backfill_summaries --which=$which_metric --source=$path_to_csv
-
-where ``$which_metric`` is one of:
-
- file_summary
- download_count
- preprint_summary
- institution_summary
- user_summary
- node_summary
-
-"""
-import csv
-import logging
-import datetime
-
-from django.core.management.base import BaseCommand
-from osf.metrics import (
- DownloadCountReport,
- InstitutionSummaryReport,
- # NewUserDomainReport,
- NodeSummaryReport,
- OsfstorageFileCountReport,
- PreprintSummaryReport,
- # StorageAddonUsage,
- UserSummaryReport,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-def main(source, which, dry_run=False, resume_from=None):
- if which not in SUMMARIES:
- logger.info(f'No such summary, {which}, exiting.')
- return
-
- if not source:
- logger.info('No path to source data file, exiting.')
- return
-
- summary_meta = SUMMARIES[which]
-
- logger.info('Kicking off...')
- with open(source) as csvfile:
- reader = csv.DictReader(csvfile)
-
- count = 0
- for row in reader:
- count += 1
- if resume_from is not None and count < resume_from:
- continue
-
- something_wonderful = summary_meta['mapper'](row)
- logger.info(f'{count}: transformed:({something_wonderful})')
- if not dry_run:
- summary_meta['class'].record(**something_wonderful)
-
- logger.info('All done!')
- if which == 'preprint_summary':
- logger.error(f'Unrecognized provider names: ({bogus_preprints})')
-
-
-def _map_download_count(row):
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created_at => _source.timestamp # "2023-01-02T14:58:38.041721+00:00"
- # files.total => _source.daily_file_downloads # 0,
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'daily_file_downloads': int(row['files.total']),
- }
-
-def _map_file_summary(row):
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created_at => _source.timestamp # "2023-01-02T14:59:04.397056+00:00"
- # osfstorage_files.private => _source.files.private # 12146,
- # osfstorage_files.total_daily => _source.files.total_daily # 0,
- # osfstorage_files.public_daily => _source.files.public_daily # 0,
- # osfstorage_files.private_daily => _source.files.private_daily # 0
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'files': {
- 'total': int(row['osfstorage_files.total']),
- 'public': int(row['osfstorage_files.public']),
- 'private': int(row['osfstorage_files.private']),
- 'total_daily': int(row['osfstorage_files.total_daily']),
- 'public_daily': int(row['osfstorage_files.public_daily']),
- 'private_daily': int(row['osfstorage_files.private_daily']),
- },
- }
-
-
-def _map_institution_summary(row):
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created => _source.timestamp # "2023-01-02T14:59:01.706319+00:00"
- # institution.id => _source.institution_id # "okstate",
- # institution.name => _source.institution_name # "Oklahoma State University [Test]",
- # ### => _source.users # {}
- # users.total => _source.total # 0,
- # users.total_daily => _source.total_daily # 0
- # ### => _source.nodes # {}
- # nodes.total => _source.nodes.total": 0,
- # nodes.public => _source.nodes.public": 0,
- # nodes.private => _source.nodes.private": 0,
- # nodes.total_daily => _source.nodes.total_daily": 0,
- # nodes.public_daily => _source.nodes.public_daily": 0,
- # nodes.private_daily => _source.nodes.private_daily": 0
- # ### => _source.projects # {}
- # projects.total => _source.projects.total": 0,
- # projects.public => _source.projects.public": 0,
- # projects.private => _source.projects.private": 0,
- # projects.total_daily => _source.projects.total_daily": 0,
- # projects.public_daily => _source.projects.public_daily": 0,
- # projects.private_daily => _source.projects.private_daily": 0
- # ### => _source.registered_nodes # {}
- # registered_nodes.total => _source.registered_nodes.total": 0,
- # registered_nodes.public => _source.registered_nodes.public": 0,
- # registered_nodes.embargoed => _source.registered_nodes.embargoed": 0,
- # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2": 0,
- # registered_nodes.total_daily => _source.registered_nodes.total_daily": 0,
- # registered_nodes.public_daily => _source.registered_nodes.public_daily": 0,
- # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily": 0,
- # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily": 0
- # ### => _source.registered_projects # {}
- # registered_projects.total => _source.registered_projects.total": 0,
- # registered_projects.public => _source.registered_projects.public": 0,
- # registered_projects.embargoed => _source.registered_projects.embargoed": 0,
- # registered_projects.embargoed_v2 => _source.registered_projects.embargoed_v2": 0,
- # registered_projects.total_daily => _source.registered_projects.total_daily": 0,
- # registered_projects.public_daily => _source.registered_projects.public_daily": 0,
- # registered_projects.embargoed_daily => _source.registered_projects.embargoed_daily": 0,
- # registered_projects.embargoed_v2_daily => _source.registered_projects.embargoed_v2_daily": 0
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'institution_id': row['institution.id'],
- 'institution_name': row['institution.name'],
- 'users': {
- 'total': int(row['users.total']),
- 'total_daily': int(row['users.total_daily'] or 0),
- },
- 'nodes': {
- 'total': int(row['nodes.total']),
- 'public': int(row['nodes.public']),
- 'private': int(row['nodes.private']),
- 'total_daily': int(row['nodes.total_daily'] or 0),
- 'public_daily': int(row['nodes.public_daily'] or 0),
- 'private_daily': int(row['nodes.private_daily'] or 0),
- },
- 'projects': {
- 'total': int(row['projects.total']),
- 'public': int(row['projects.public']),
- 'private': int(row['projects.private']),
- 'total_daily': int(row['projects.total_daily'] or 0),
- 'public_daily': int(row['projects.public_daily'] or 0),
- 'private_daily': int(row['projects.private_daily'] or 0),
- },
- 'registered_nodes': {
- 'total': int(row['registered_nodes.total']),
- 'public': int(row['registered_nodes.public']),
- 'embargoed': int(row['registered_nodes.embargoed']),
- 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0),
- 'total_daily': int(row['registered_nodes.total_daily'] or 0),
- 'public_daily': int(row['registered_nodes.public_daily'] or 0),
- 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0),
- 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0),
- },
- 'registered_projects': {
- 'total': int(row['registered_projects.total']),
- 'public': int(row['registered_projects.public']),
- 'embargoed': int(row['registered_projects.embargoed']),
- 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0),
- 'total_daily': int(row['registered_projects.total_daily'] or 0),
- 'public_daily': int(row['registered_projects.public_daily'] or 0),
- 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0),
- 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0),
- },
- }
-
-def _map_node_summary(row):
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created_at => _source.timestamp # "2023-01-02T14:59:03.886999+00:00"
- # ### => _source.nodes # {}
- # nodes.total => _source.nodes.total # 58,
- # nodes.total_excluding_spam => _source.nodes.total_excluding_spam # 58,
- # nodes.public => _source.nodes.public # 14,
- # nodes.private => _source.nodes.private # 44,
- # nodes.total_daily => _source.nodes.total_daily # 0,
- # nodes.total_daily_excluding_spam => _source.nodes.total_daily_excluding_spam # 0,
- # nodes.public_daily => _source.nodes.public_daily # 0,
- # nodes.private_daily => _source.nodes.private_daily # 0
- # ### => _source.projects # {}
- # projects.total => _source.projects.total # 53,
- # projects.total_excluding_spam => _source.projects.total_excluding_spam # 53,
- # projects.public => _source.projects.public # 14,
- # projects.private => _source.projects.private # 39,
- # projects.total_daily => _source.projects.total_daily # 0,
- # projects.total_daily_excluding_spam => _source.projects.total_daily_excluding_spam # 0,
- # projects.public_daily => _source.projects.public_daily # 0,
- # projects.private_daily => _source.projects.private_daily # 0
- # ### => _source.registered_nodes # {}
- # registered_nodes.total => _source.registered_nodes.total # 10,
- # registered_nodes.public => _source.registered_nodes.public # 9,
- # registered_nodes.embargoed => _source.registered_nodes.embargoed # 1,
- # registered_nodes.embargoed_v2 => _source.registered_nodes.embargoed_v2 # 0,
- # registered_nodes.withdrawn => _source.registered_nodes.withdrawn # 0,
- # registered_nodes.total_daily => _source.registered_nodes.total_daily # 0,
- # registered_nodes.public_daily => _source.registered_nodes.public_daily # 0,
- # registered_nodes.embargoed_daily => _source.registered_nodes.embargoed_daily # 0,
- # registered_nodes.embargoed_v2_daily => _source.registered_nodes.embargoed_v2_daily # 0,
- # registered_nodes.withdrawn_daily => _source.registered_nodes.withdrawn_daily # 0
- # ### => _source.registered_projects # {}
- # registered_projects.total => _source.registered_projects."total # 10,
- # registered_projects.public => _source.registered_projects."public # 9,
- # registered_projects.embargoed => _source.registered_projects."embargoed # 1,
- # registered_projects.embargoed_v2 => _source.registered_projects."embargoed_v2 # 0,
- # registered_projects.withdrawn => _source.registered_projects."withdrawn # 0,
- # registered_projects.total_daily => _source.registered_projects."total_daily # 0,
- # registered_projects.public_daily => _source.registered_projects."public_daily # 0,
- # registered_projects.embargoed_daily => _source.registered_projects."embargoed_daily # 0,
- # registered_projects.embargoed_v2_daily => _source.registered_projects."embargoed_v2_daily # 0,
- # registered_projects.withdrawn_daily => _source.registered_projects."withdrawn_daily # 0
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'nodes': {
- 'total': int(row['nodes.total'] or 0),
- 'total_excluding_spam': int(row['nodes.total_excluding_spam'] or 0),
- 'public': int(row['nodes.public'] or 0),
- 'private': int(row['nodes.private'] or 0),
- 'total_daily': int(row['nodes.total_daily'] or 0),
- 'total_daily_excluding_spam': int(row['nodes.total_daily_excluding_spam'] or 0),
- 'public_daily': int(row['nodes.public_daily'] or 0),
- 'private_daily': int(row['nodes.private_daily'] or 0),
- },
- 'projects': {
- 'total': int(row['projects.total']),
- 'total_excluding_spam': int(row['projects.total_excluding_spam'] or 0),
- 'public': int(row['projects.public'] or 0),
- 'private': int(row['projects.private'] or 0),
- 'total_daily': int(row['projects.total_daily'] or 0),
- 'total_daily_excluding_spam': int(row['projects.total_daily_excluding_spam'] or 0),
- 'public_daily': int(row['projects.public_daily'] or 0),
- 'private_daily': int(row['projects.private_daily'] or 0),
- },
- 'registered_nodes': {
- 'total': int(row['registered_nodes.total'] or 0),
- 'public': int(row['registered_nodes.public'] or 0),
- 'embargoed': int(row['registered_nodes.embargoed'] or 0),
- 'embargoed_v2': int(row['registered_nodes.embargoed_v2'] or 0),
- 'withdrawn': int(row['registered_nodes.withdrawn'] or 0),
- 'total_daily': int(row['registered_nodes.total_daily'] or 0),
- 'public_daily': int(row['registered_nodes.public_daily'] or 0),
- 'embargoed_daily': int(row['registered_nodes.embargoed_daily'] or 0),
- 'embargoed_v2_daily': int(row['registered_nodes.embargoed_v2_daily'] or 0),
- 'withdrawn_daily': int(row['registered_nodes.withdrawn_daily'] or 0),
- },
- 'registered_projects': {
- 'total': int(row['registered_projects.total'] or 0),
- 'public': int(row['registered_projects.public'] or 0),
- 'embargoed': int(row['registered_projects.embargoed'] or 0),
- 'embargoed_v2': int(row['registered_projects.embargoed_v2'] or 0),
- 'withdrawn': int(row['registered_projects.withdrawn'] or 0),
- 'total_daily': int(row['registered_projects.total_daily'] or 0),
- 'public_daily': int(row['registered_projects.public_daily'] or 0),
- 'embargoed_daily': int(row['registered_projects.embargoed_daily'] or 0),
- 'embargoed_v2_daily': int(row['registered_projects.embargoed_v2_daily'] or 0),
- 'withdrawn_daily': int(row['registered_projects.withdrawn_daily'] or 0),
- },
- }
-
-
-preprint_name_map = {
- 'AfricArXiv': 'africarxiv',
- 'AgriXiv': 'agrixiv',
- 'Arabixiv': 'arabixiv',
- 'BioHackrXiv': 'biohackrxiv',
- 'BITSS': 'metaarxiv',
- 'BodoArXiv': 'bodoarxiv',
- 'coppreprints': 'coppreprints',
- 'EarthArXiv': 'eartharxiv',
- 'EcoEvoRxiv': 'ecoevorxiv',
- 'ECSarXiv': 'ecsarxiv',
- 'EdArXiv': 'edarxiv',
- 'engrXiv': 'engrxiv',
- 'FocUS Archive': 'focusarchive',
- 'Frenxiv': 'frenxiv',
- 'INA-Rxiv': 'inarxiv',
- 'IndiaRxiv': 'indiarxiv',
- 'LawArXiv': 'lawarxiv',
- 'LIS Scholarship Archive': 'lissa',
- 'LiveData': 'livedata',
- 'Research AZ': 'livedata',
- 'MarXiv': 'marxiv',
- 'MedArXiv': 'medarxiv',
- 'MediArXiv': 'mediarxiv',
- 'MetaArXiv': 'metaarxiv',
- 'MindRxiv': 'mindrxiv',
- 'NutriXiv': 'nutrixiv',
- 'Open Science Framework': 'osf',
- 'PaleorXiv': 'paleorxiv',
- 'PsyArXiv': 'psyarxiv',
- 'SocArXiv': 'socarxiv',
- 'SportRxiv': 'sportrxiv',
- 'Thesis Commons': 'thesiscommons',
- 'Vulnerability Assessment Testing': 'vulnerabilityassessmenttesting',
-}
-preprint_long_names = list(preprint_name_map.keys())
-preprint_short_names = list(preprint_name_map.values())
-bogus_preprints = {}
-def _map_preprint_summary(row):
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00"
- # provider.name => _source.provider_key # "psyarxiv",
- # provider.total => _source.preprint_count # 0,
-
- # normalize provider names: we used to store the formal name, now we store the short name
- provider_key = None
- provider_name = row['provider.name']
- if provider_name in preprint_short_names:
- provider_key = provider_name
- elif provider_name in preprint_long_names:
- provider_key = preprint_name_map[provider_name]
- else:
- logger.error(f'Unrecognized preprint provider name: ({provider_name})')
- if provider_name not in bogus_preprints:
- bogus_preprints[provider_name] = 0
- bogus_preprints[provider_name] += 1
- provider_key = provider_name # oh well
-
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'provider_key': provider_key,
- 'preprint_count': int(row['provider.total']),
- }
-
-def _map_user_summary(row):
- # date(keen.timestamp) => _source.report_date # "2023-01-03",
- # keen.created_at => _source.timestamp # "2023-01-04T13:47:34.216419+00:00"
- # status.active => _source.active # 7,
- # status.deactivated => _source.deactivated # 0,
- # status.merged => _source.merged # 0,
- # status.new_users_daily => _source.new_users_daily # 0,
- # status.new_users_with_institution_daily => _source.new_users_with_institution_daily # 0,
- # status.unconfirmed => _source.unconfirmed # 0,
- return {
- 'report_date': _timestamp_to_date(row['keen.timestamp']),
- 'timestamp': _timestamp_to_dt(row['keen.created_at']),
- 'active': int(row['status.active']),
- 'deactivated': int(row['status.deactivated'] or 0),
- 'merged': int(row['status.merged'] or 0),
- 'new_users_daily': int(row['status.new_users_daily'] or 0),
- 'new_users_with_institution_daily': int(row['status.new_users_with_institution_daily'] or 0),
- 'unconfirmed': int(row['status.unconfirmed'] or 0),
- }
-
-SUMMARIES = {
- 'download_count': {
- 'mapper': _map_download_count,
- 'class': DownloadCountReport,
- },
- 'file_summary': {
- 'mapper': _map_file_summary,
- 'class': OsfstorageFileCountReport,
- },
- 'institution_summary': {
- 'mapper': _map_institution_summary,
- 'class': InstitutionSummaryReport,
- },
- 'node_summary': {
- 'mapper': _map_node_summary,
- 'class': NodeSummaryReport,
- },
- 'preprint_summary': {
- 'mapper': _map_preprint_summary,
- 'class': PreprintSummaryReport,
- },
- 'user_summary': {
- 'mapper': _map_user_summary,
- 'class': UserSummaryReport,
- },
-}
-
-def _timestamp_to_dt(timestamp):
- return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
-
-def _timestamp_to_date(timestamp):
- dt_obj = _timestamp_to_dt(timestamp)
- return dt_obj.date()
-
-
-def _dt_to_date(dt):
- dt_obj = datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S.%fZ')
- return str(dt_obj.date())
-
-class Command(BaseCommand):
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument(
- '--source',
- type=str,
- help='source file path (csv format w/ header line)',
- )
- parser.add_argument(
- '--dry',
- dest='dry',
- action='store_true',
- help='Dry run'
- )
- parser.add_argument(
- '--which',
- type=str,
- help='which metric summary this data is for'
- )
- parser.add_argument(
- '--resume-from',
- dest='resume_from',
- type=int,
- help='start from which record',
- )
-
- def handle(self, *args, **options):
- dry_run = options.get('dry', None)
- source = options.get('source', None)
- which = options.get('which', None)
- resume_from = options.get('resume_from', None)
- main(source, which, dry_run, resume_from)
diff --git a/osf/management/commands/metrics_backfill_user_domains.py b/osf/management/commands/metrics_backfill_user_domains.py
deleted file mode 100644
index 685dd55243e..00000000000
--- a/osf/management/commands/metrics_backfill_user_domains.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""osf/management/commands/metrics_backfill_user_domains.py
-
-Usage:
-
- $ dc-manage metrics_backfill_user_domains --source=$path_to_csv
- $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --dry # dry run
- $ dc-manage metrics_backfill_user_domains --source=$path_to_csv --resume-from 1264 # start from record 1264
-
-
-"""
-import csv
-import logging
-import datetime
-
-from django.core.management.base import BaseCommand
-from osf.metrics import NewUserDomainReport
-
-logger = logging.getLogger(__name__)
-
-def main(source, dry_run=False, resume_from=None):
- if not source:
- logger.info('No source file detected, exiting.')
- return
-
- # new user domains report is weird, b/c old data needs to be aggregated by date & domain
-
- count = 0
- reader = csv.DictReader(source)
- tally = {}
- this_year = None
- for row in reader:
- count += 1
- if resume_from is not None and count < resume_from:
- continue
-
- logger.info(f'count:({count}) this_year:({this_year})')
-
- event_ts = _timestamp_to_dt(row['keen.timestamp'])
- event_date = event_ts.date()
- event_date_str = str(event_date)
-
- if this_year is None:
- logger.info(' >>> setting new year')
- this_year = event_date.year
-
- if this_year != event_date.year:
- # we've built up a year of data; commit and clear
- logger.info(' >>> year is up, committing data')
- _upload_data_and_purge(tally, dry_run)
- this_year = event_date.year
- logger.info(' >>> data committed, new year is:({}) and tally should be '
- 'empty:({})'.format(this_year, tally))
-
- if event_date_str not in tally:
- tally[event_date_str] = {
- 'timestamp': event_ts,
- 'report_date': event_date,
- 'domains': {},
- }
-
- domain = row['domain']
- if domain not in tally[event_date_str]['domains']:
- tally[event_date_str]['domains'][domain] = 0
- tally[event_date_str]['domains'][domain] += 1
-
- _upload_data_and_purge(tally, dry_run)
-
-
-def _upload_data_and_purge(tally, dry_run):
- for event_date_str, record in tally.items():
- for domain, count in record['domains'].items():
-
- # date(keen.timestamp) => _source.report_date # "2022-12-30",
- # keen.created_at => _source.timestamp # "2023-01-02T14:59:05.684642+00:00"
- # domain => _source.domain_name # metrics.Keyword()
- # count_agg(domain) => _source.new_user_count # metrics.Integer()
-
- something_wonderful = {
- 'timestamp': record['timestamp'],
- 'report_date': record['report_date'],
- 'domain_name': domain,
- 'new_user_count': count,
- }
-
- logger.info(f' *** {event_date_str}::{domain}::{count}')
- logger.info(' *** {}::{}: something wonderful:({})'.format(event_date_str, domain,
- something_wonderful))
-
- if not dry_run:
- NewUserDomainReport.record(**something_wonderful)
-
- # purge tally
- tally.clear()
-
-
-def _timestamp_to_dt(timestamp):
- return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.UTC)
-
-def _timestamp_to_date(timestamp):
- dt_obj = _timestamp_to_dt(timestamp)
- return str(dt_obj.date())
-
-
-class Command(BaseCommand):
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument(
- '--source',
- type=open,
- help='source file (csv format w/ header line)',
- )
- parser.add_argument(
- '--dry',
- dest='dry',
- action='store_true',
- help='Dry run'
- )
- parser.add_argument(
- '--resume-from',
- dest='resume_from',
- type=int,
- help='start from which record',
- )
-
- def handle(self, *args, **options):
- dry_run = options.get('dry', None)
- source = options.get('source', None)
- resume_from = options.get('resume_from', None)
- main(source, dry_run, resume_from)
diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py
deleted file mode 100644
index 49396d36ba3..00000000000
--- a/osf/management/commands/migrate_osfmetrics_6to8.py
+++ /dev/null
@@ -1,915 +0,0 @@
-import collections
-import datetime
-import functools
-import heapq
-import itertools
-import logging
-
-from django.apps import apps
-from django.core.management.base import BaseCommand
-from django.db import OperationalError as DjangoOperationalError
-from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError
-from elasticsearch6 import helpers as es6_helpers
-from elasticsearch6_dsl.connections import connections as es6_connections
-from elasticsearch8.exceptions import TransportError as Elastic8TransportError
-from elasticsearch8.helpers import BulkIndexError as Elastic8BulkIndexError
-from elasticsearch_metrics.registry import djelme_registry
-from elasticsearch_metrics.imps import elastic8 as djel8me
-from psycopg2 import OperationalError as PostgresOperationalError
-
-from framework.celery_tasks import app as celery_app
-from osf.metadata.rdfutils import OSF
-from osf.metadata.osfmap_utils import is_osf_component
-from osf.metrics.preprint_metrics import (
- PreprintView,
- PreprintDownload,
-)
-from osf.metrics.counted_usage import (
- CountedAuthUsage as CountedUsageEs6,
- get_provider_id,
-)
-from osf.metrics import reports as es6_reports
-from osf.metrics import es8_metrics, RegistriesModerationMetrics
-from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys
-from osf.metrics.utils import (
- YearMonth,
- get_database_iri,
- get_item_type,
- get_item_type_from_model,
- get_item_type_from_iri,
-)
-from osf import models as osfdb
-from osf.models.base import osfid_iri
-from website import settings as website_settings
-
-
-_logger = logging.getLogger(__name__)
-
-###
-# constants
-
-_USAGE_DAYS_BACK = 99
-
-_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control
-
-_COMPOSITE_CHUNK_SIZE = 500
-
-_UNCHANGED_RECORDTYPES = {
- # reports
- es6_reports.StorageAddonUsage: es8_metrics.DailyStorageAddonUsageReportEs8,
- es6_reports.DownloadCountReport: es8_metrics.DailyDownloadCountReportEs8,
- es6_reports.InstitutionSummaryReport: es8_metrics.DailyInstitutionSummaryReportEs8,
- es6_reports.NewUserDomainReport: es8_metrics.DailyNewUserDomainReportEs8,
- es6_reports.NodeSummaryReport: es8_metrics.DailyNodeSummaryReportEs8,
- es6_reports.OsfstorageFileCountReport: es8_metrics.DailyOsfstorageFileCountReportEs8,
- es6_reports.PreprintSummaryReport: es8_metrics.DailyPreprintSummaryReportEs8,
- es6_reports.UserSummaryReport: es8_metrics.DailyUserSummaryReportEs8,
- es6_reports.SpamSummaryReport: es8_metrics.MonthlySpamSummaryReportEs8,
- es6_reports.InstitutionalUserReport: es8_metrics.MonthlyInstitutionalUserReportEs8,
- es6_reports.InstitutionMonthlySummaryReport: es8_metrics.MonthlyInstitutionSummaryReportEs8,
- es6_reports.PrivateSpamMetricsReport: es8_metrics.MonthlyPrivateSpamMetricsReportEs8,
- # events
- RegistriesModerationMetrics: es8_metrics.RegistriesModerationEventEs8,
-}
-
-_TASK_KWARGS = dict(
- autoretry_for=(
- DjangoOperationalError,
- Elastic6ConnectionError,
- Elastic8TransportError,
- PostgresOperationalError,
- ),
- retry_backoff=True, # exponential backoff, with jitter
- max_retries=20,
-)
-
-###
-# celery tasks
-
-
-@celery_app.task(**_TASK_KWARGS)
-def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str):
- _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name)
- _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype]
- _convert_kwargs = (
- _convert_unchanged_cyclicrecord_kwargs
- if issubclass(_es8_recordtype, djel8me.CyclicRecord)
- else (lambda _kw: _kw) # no conversion needed for event record
- )
- _each_new = (
- _es8_recordtype(**_convert_kwargs(_hit['_source']))
- for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when)
- )
- _es8_bulk_save(_es8_recordtype, _each_new)
-
-
-@celery_app.task(**_TASK_KWARGS)
-def migrate_counted_usages(from_when: str, until_when: str):
- # CountedAuthUsage => OsfCountedUsageEvent
- _each_new = (
- _convert_counted_usage(_hit['_source'])
- for _hit in _es6_scan_range(
- CountedUsageEs6,
- from_when=from_when,
- until_when=until_when,
- addl_filter={'exists': {'field': 'item_guid'}},
- )
- )
- _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
-
-
-@celery_app.task(**_TASK_KWARGS)
-def migrate_preprint_views(from_when: str, until_when: str):
- # PreprintView => OsfCountedUsageEvent
- _action_labels = ['view', 'web']
- _each_new = (
- _convert_preprint_metric(_hit, _action_labels)
- for _hit in _es6_scan_range(
- PreprintView, from_when=from_when, until_when=until_when
- )
- )
- _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
-
-
-@celery_app.task(**_TASK_KWARGS)
-def migrate_preprint_downloads(from_when: str, until_when: str):
- # PreprintDownload => OsfCountedUsageEvent
- _action_labels = ['download']
- _each_new = (
- _convert_preprint_metric(_hit, _action_labels)
- for _hit in _es6_scan_range(
- PreprintDownload, from_when=from_when, until_when=until_when
- )
- )
- _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
-
-
-@celery_app.task(**_TASK_KWARGS)
-def schedule_migrate_usage_reports(until_when: str):
- for _osfid in _merge_sorted_osfids(
- _each_usage_report_osfid(until_when=until_when),
- _each_countedusage_osfid(until_when=until_when),
- _each_preprintview_osfid(until_when=until_when),
- _each_preprintdownload_osfid(until_when=until_when),
- ):
- migrate_usage_reports.delay(_osfid, until_when)
-
-
-@celery_app.task(**_TASK_KWARGS)
-def migrate_usage_reports(osfid: str, until_when: str):
- # from PublicItemUsageReport to MonthlyPublicItemUsageReportEs8
- _osfobj, _ = osfdb.Guid.load_referent(osfid)
- _item_is_component = is_osf_component(_osfobj) if _osfobj else False
-
- def _each_new():
- _each_hit = _es6_scan_range(
- es6_reports.PublicItemUsageReport,
- until_when=until_when,
- addl_filter={'terms': {'item_osfid': _synonymous_osfids(osfid)}},
- )
- # (only a few dozen of these per item; should be fine to load all at once)
- _hits = list(_each_hit)
- if _osfobj and not _hits:
- # this item has usages, but only before the monthly usage reparts started
- # -- create one for cumulative counts (if the object still exists)
- yield _backfill_old_usage_report(_osfobj, _item_is_component, until_when)
- else:
- for _hit in _hits:
- yield _convert_public_usage_report(
- _hit['_source'],
- item_is_component=_item_is_component,
- )
-
- _es8_bulk_save(es8_metrics.MonthlyPublicItemUsageReportEs8, _each_new())
-
-
-###
-# various helper functions
-
-
-def _es6_connection():
- return es6_connections.get_connection('osfmetrics_es6')
-
-
-def _es8_bulk_save(es8_recordtype, each_new_record):
- try:
- es8_recordtype.bulk(each_new_record, stats_only=True)
- except Elastic8BulkIndexError as _bulk_error:
- # so actual errors show in celery task result
- raise Exception(_bulk_error.errors) from _bulk_error
-
-
-def _date_range(
- range_start: datetime.date,
- range_end: datetime.date,
- step: datetime.timedelta = datetime.timedelta(days=1),
-) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]:
- _from_date = range_start
- _until_date = range_start + step
- while _from_date < range_end:
- yield (_from_date, _until_date)
- (_from_date, _until_date) = (_until_date, _until_date + step)
-
-
-def _es6_scan_range(
- es6_recordtype,
- *,
- from_when: str = '',
- until_when: str,
- addl_filter=None,
-):
- _timestamp_range = {'lt': until_when}
- if from_when:
- _timestamp_range['gte'] = from_when
- _filters = [
- {'range': {'timestamp': _timestamp_range}},
- ]
- if addl_filter:
- _filters.append(addl_filter)
- _query_body = {'query': {'bool': {'filter': _filters}}}
- return es6_helpers.scan(
- _es6_connection(),
- index=es6_recordtype._template_pattern,
- query=_query_body,
- )
-
-
-def _es6_usage_report_counts() -> tuple[int, int]:
- _search = es6_reports.PublicItemUsageReport.search()
- _search.aggs.metric(
- 'agg_item_count',
- 'cardinality',
- field='item_osfid',
- precision_threshold=_MAX_CARDINALITY_PRECISION,
- )
- _response = _search.execute()
- _total_count = _response.hits.total
- _item_count = (
- _response.aggregations.agg_item_count.value
- if 'agg_item_count' in _response.aggregations
- else 0
- )
- return (_total_count, _item_count)
-
-
-def _es8_usage_report_counts() -> tuple[int, int]:
- _search = es8_metrics.MonthlyPublicItemUsageReportEs8.search()
- _search.aggs.metric(
- 'agg_item_count',
- 'cardinality',
- field='item_osfids',
- precision_threshold=_MAX_CARDINALITY_PRECISION,
- )
- _response = _search.execute()
- _total_count = _response.hits.total.value
- _item_count = (
- _response.aggregations.agg_item_count.value
- if 'agg_item_count' in _response.aggregations
- else 0
- )
- return (_total_count, _item_count)
-
-
-def _get_es6_field_names(es6_recordtype):
- '''
- adapted from DocumentBase._get_field_names in elasticsearch8.dsl
- '''
- for _field_name in es6_recordtype._doc_type.mapping:
- _field = es6_recordtype._doc_type.mapping[_field_name]
- if hasattr(_field, '_doc_class'):
- for _sub_field in _get_es6_field_names(_field._doc_class):
- yield f'{_field_name}.{_sub_field}'
- else:
- yield _field_name
-
-
-def _assert_field_unchangedness(es6_recordtype, es8_recordtype):
- _es6_fields = set(_get_es6_field_names(es6_recordtype))
- _es8_fields = set(es8_recordtype._get_field_names())
-
- # remove fields intentionally removed in migration
- if issubclass(es6_recordtype, es6_reports.DailyReport):
- assert issubclass(es8_recordtype, djel8me.CyclicRecord)
- _es6_fields.remove('timestamp')
- _es6_fields.remove('report_date')
- elif issubclass(es6_recordtype, es6_reports.MonthlyReport):
- assert issubclass(es8_recordtype, djel8me.CyclicRecord)
- _es6_fields.remove('timestamp')
- _es6_fields.remove('report_yearmonth')
- else:
- assert issubclass(es8_recordtype, djel8me.EventRecord)
-
- # remove fields intentionally added in migration
- _es8_fields.remove('timeseries_timeparts')
- if issubclass(es8_recordtype, djel8me.CyclicRecord):
- _es8_fields.remove('created')
- _es8_fields.remove('cycle_coverage')
-
- # all remaining fields should match
- assert _es6_fields == _es8_fields
-
-
-def _semverish_from_yearmonth(given_yearmonth):
- _ym = YearMonth.from_any(given_yearmonth)
- return f'{_ym.year}.{_ym.month}'
-
-
-def _semverish_from_date(given_date: str):
- _d = datetime.date.fromisoformat(given_date)
- return f'{_d.year}.{_d.month}.{_d.day}'
-
-
-def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict:
- def _each_kwarg():
- for _key, _val in es6_source.items():
- if _key == 'report_yearmonth':
- # report_yearmonth converts to cycle_coverage Y.M
- yield ('cycle_coverage', _semverish_from_yearmonth(_val))
- elif _key == 'report_date':
- # report_date converts to cycle_coverage Y.M.D
- yield ('cycle_coverage', _semverish_from_date(_val))
- elif _key != 'timestamp':
- # skipping timestamp; on daily/monthly reports just copied from yearmonth/date
- yield (_key, _val)
-
- return dict(_each_kwarg())
-
-
-def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageEvent:
- return es8_metrics.OsfCountedUsageEvent(
- # fields from djelme.CountedUsageRecord:
- timestamp=source['timestamp'],
- sessionhour_id=source['session_id'],
- platform_iri=source.get('platform_iri') or website_settings.DOMAIN,
- database_iri=_convert_database_iri(
- provider_id=source.get('provider_id'),
- osf_model_name=source.get('item_type'),
- ),
- within_iris=[
- osfid_iri(_within_osfid)
- for _within_osfid in source.get('surrounding_guids', ())
- ],
- # fields from OsfCountedUsageEvent:
- item_osfid=source['item_guid'],
- item_type=_convert_item_type(
- source.get('item_type'),
- has_surrounding_items=bool(source.get('surrounding_guids')),
- ),
- item_public=source.get('item_public', True),
- provider_id=source.get('provider_id', 'osf'),
- user_is_authenticated=source.get('user_is_authenticated', False),
- action_labels=source.get('action_labels'),
- pageview_info=source.get('pageview_info'),
- )
-
-
-def _convert_preprint_metric(
- hit: dict, action_labels: list[str]
-) -> es8_metrics.OsfCountedUsageEvent:
- _source = hit['_source']
- _doc_id = hit['_id']
- return es8_metrics.OsfCountedUsageEvent.record(
- using=False, # don't save yet; will save in bulk
- # fields used to compute a sessionhour_id:
- timestamp=datetime.datetime.fromisoformat(_source['timestamp']),
- user_id=_source.get('user_id'),
- client_session_id=_doc_id, # unique session per event (best can do)
- # fields from djelme.CountedUsageRecord:
- platform_iri=website_settings.DOMAIN,
- database_iri=_convert_database_iri(
- provider_id=_source.get('provider_id'),
- osf_model_name='preprint',
- ),
- # fields from OsfCountedUsageEvent:
- item_osfid=_source['preprint_id'],
- item_type='Preprint',
- item_public=True,
- provider_id=_source.get('provider_id'),
- user_is_authenticated=bool(_source.get('user_id')),
- action_labels=action_labels,
- )
-
-
-def _convert_public_usage_report(
- source: dict,
- item_is_component: bool,
-) -> es8_metrics.MonthlyPublicItemUsageReportEs8:
- _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage(
- osfid=source['item_osfid'],
- until_when=YearMonth.from_str(source['report_yearmonth']).month_end(),
- is_preprint=('preprint' in source.get('item_type', ())),
- )
- return es8_metrics.MonthlyPublicItemUsageReportEs8(
- cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']),
- item_iri=osfid_iri(source['item_osfid']),
- item_osfids=[source['item_osfid']],
- item_types=_convert_item_type_list(
- source.get('item_type', []),
- has_surrounding_items=item_is_component,
- ),
- database_iris=_convert_database_iri_list(
- provider_ids=source.get('provider_id', []),
- osf_model_names=source.get('item_type', []),
- ),
- provider_ids=source.get('provider_id'),
- platform_iris=source.get('platform_iri') or [website_settings.DOMAIN],
- view_count=source.get('view_count', 0),
- view_session_count=source.get('view_session_count') or source.get('view_count', 0),
- cumulative_view_count=_c_views,
- cumulative_view_session_count=_c_view_sess or _c_views,
- download_count=source.get('download_count', 0),
- download_session_count=source.get('download_session_count') or source.get('download_count', 0),
- cumulative_download_count=_c_downloads,
- cumulative_download_session_count=_c_download_sess or _c_downloads,
- )
-
-
-def _backfill_old_usage_report(osf_obj, is_component: bool, until_when: str):
- # add a "last month" report with cumulative counts up to that point
- _last_month = YearMonth.from_date(datetime.datetime.fromisoformat(until_when)).prior()
- _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage(
- osfid=osf_obj._id,
- until_when=_last_month.month_end().isoformat(),
- is_preprint=isinstance(osf_obj, osfdb.Preprint),
- )
- return es8_metrics.MonthlyPublicItemUsageReportEs8(
- cycle_coverage=_semverish_from_yearmonth(_last_month),
- item_iri=osfid_iri(osf_obj._id),
- item_osfids=[osf_obj._id],
- item_types=[get_item_type(osf_obj)],
- provider_ids=[get_provider_id(osf_obj)],
- database_iris=[get_database_iri(osf_obj)],
- platform_iris=[website_settings.DOMAIN],
- view_count=0,
- view_session_count=0,
- cumulative_view_count=_c_views,
- cumulative_view_session_count=_c_view_sess or _c_views,
- download_count=0,
- download_session_count=0,
- cumulative_download_count=_c_downloads,
- cumulative_download_session_count=_c_download_sess or _c_downloads,
- )
-
-
-def _get_cumulative_usage(osfid: str, until_when, *, is_preprint: bool):
- if is_preprint:
- _views = _cumulative_preprint_count(PreprintView, osfid, until_when)
- _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when)
- _view_sess, _download_sess = 0, 0 # no session info on preprints (yet)
- else:
- _views, _view_sess = _cumulative_countedusage_views(osfid, until_when)
- _downloads, _download_sess = _cumulative_countedusage_downloads(
- osfid, until_when
- )
- return (_views, _view_sess, _downloads, _download_sess)
-
-
-def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]:
- '''compute view_session_count separately to avoid double-counting
-
- (the same session may be represented in both the composite agg on `item_guid`
- and that on `surrounding_guids`)
- '''
- # copied/adapted from osf.metrics.reporters.public_item_usage
- _search = (
- CountedUsageEs6.search()
- .filter('term', item_public=True)
- .filter('range', timestamp={'lt': until_when})
- .filter('term', action_labels='view')
- .filter(
- 'bool',
- should=[
- {'term': {'item_guid': osfid}},
- {'term': {'surrounding_guids': osfid}},
- ],
- minimum_should_match=1,
- )
- .extra(size=0) # only aggregations, no hits
- )
- _search.aggs.metric(
- 'agg_session_count',
- 'cardinality',
- field='session_id',
- precision_threshold=_MAX_CARDINALITY_PRECISION,
- )
- _response = _search.execute()
- _view_count = _response.hits.total
- _view_session_count = (
- _response.aggregations.agg_session_count.value
- if 'agg_session_count' in _response.aggregations
- else 0
- )
- return (_view_count, _view_session_count)
-
-
-def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]:
- '''aggregate downloads on each osfid (not including components/files)'''
- # copied/adapted from osf.metrics.reporters.public_item_usage
- _search = (
- CountedUsageEs6.search()
- .filter('term', item_public=True)
- .filter('range', timestamp={'lt': until_when})
- .filter('term', action_labels='download')
- .filter('term', item_guid=osfid)
- )
- _search.aggs.metric(
- 'agg_session_count',
- 'cardinality',
- field='session_id',
- precision_threshold=_MAX_CARDINALITY_PRECISION,
- )
- _response = _search.execute()
- _download_count = _response.hits.total
- _download_session_count = (
- _response.aggregations.agg_session_count.value
- if 'agg_session_count' in _response.aggregations
- else 0
- )
- return (_download_count, _download_session_count)
-
-
-def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int:
- '''aggregate counts on given preprint'''
- # copied/adapted from osf.metrics.preprint_metrics
- _search = (
- preprint_metric_cls.search()
- .filter('terms', preprint_id=_synonymous_osfids(osfid))
- .filter('range', timestamp={'lt': until_when})
- .extra(size=0) # no hits; only aggs
- )
- _search.aggs.metric('agg_count', 'sum', field='count')
- _response = _search.execute()
- return (
- int(_response.aggregations.agg_count.value)
- if hasattr(_response.aggregations, 'agg_count')
- else 0
- )
-
-
-def _synonymous_osfids(osfid: str) -> list[str]:
- _synonyms = [osfid]
- if osfid.endswith('_v1'):
- # include pre-versioned-guid counts for v1
- _synonyms.append(osfid.removesuffix('_v1'))
- elif '_' not in osfid:
- # include v1 (if it exists) with unversioned guid
- _synonyms.append(f'{osfid}_v1')
- return _synonyms
-
-
-def _convert_item_type_list(osf_model_names: list[str] | str, has_surrounding_items: bool):
- if isinstance(osf_model_names, str):
- osf_model_names = [osf_model_names]
- return [
- _convert_item_type(_model_name, has_surrounding_items)
- for _model_name in osf_model_names
- ]
-
-
-def _convert_item_type(osf_model_name: str | None, has_surrounding_items: bool):
- if osf_model_name:
- try:
- return get_item_type_from_model(
- apps.get_model('osf', osf_model_name),
- is_component=has_surrounding_items,
- )
- except LookupError:
- pass
- return get_item_type_from_iri(OSF.Object) # fallback abstract osf:Object
-
-
-def _convert_database_iri_list(provider_ids: list[str], osf_model_names: list[str]):
- return [
- _convert_database_iri(_id, _model_name)
- for _id in provider_ids
- for _model_name in osf_model_names
- ]
-
-
-def _convert_database_iri(provider_id: str | None, osf_model_name: str):
- if not provider_id:
- return website_settings.DOMAIN # osf is a provider, sure why not
-
- match osf_model_name: # lower-cased osf.models class names
- case 'node' | 'osfuser': # implicit untyped 'osf' provider
- return website_settings.DOMAIN
- case 'preprint': # match PreprintProvider.get_semantic_iri
- return f'{website_settings.DOMAIN}preprints/{provider_id}'
- case 'registration': # match RegistrationProvider.get_semantic_iri
- return f'{website_settings.DOMAIN}registries/{provider_id}'
- case _ if 'file' in osf_model_name:
- # file providers are a different thing that don't really have an iri, just an id
- return f'urn:files.osf.io:{provider_id}'
- case _: # give up gracefully
- _logger.error(
- f'unknown model {osf_model_name!r} with provider {provider_id!r}'
- )
- return f'urn:osf.io:{provider_id}'
-
-
-def _each_usage_report_osfid(until_when, after_osfid=None):
- _search = (
- es6_reports.PublicItemUsageReport.search()
- .filter('range', timestamp={'lt': until_when})
- .extra(size=0)
- )
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}],
- size=500,
- )
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
-
-def _each_countedusage_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]:
- _search = (
- CountedUsageEs6.search()
- .filter('term', item_public=True)
- .filter('terms', action_labels=['view', 'download'])
- .filter('range', timestamp={'lt': until_when})
- .extra(size=0) # only aggregations, no hits
- )
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'item_guid'}}}],
- size=_COMPOSITE_CHUNK_SIZE,
- )
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
-
-def _each_preprintview_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]:
- _search = (
- PreprintView.search()
- .filter('range', timestamp={'lt': until_when})
- .extra(size=0) # only aggregations, no hits
- )
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}],
- size=_COMPOSITE_CHUNK_SIZE,
- )
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
-
-def _each_preprintdownload_osfid(until_when, after_osfid=None) -> collections.abc.Iterator[str]:
- _search = (
- PreprintDownload.search()
- .filter('range', timestamp={'lt': until_when})
- .extra(size=0) # only aggregations, no hits
- )
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}],
- size=_COMPOSITE_CHUNK_SIZE,
- )
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
-
-def _merge_sorted_osfids(*osfid_iterables):
- def _osfids_group_key(osfid: str):
- return ( # v1 same as unversioned
- osfid.removesuffix('_v1')
- if osfid.endswith('_v1')
- else osfid
- )
- for _k, _g in itertools.groupby(
- heapq.merge(*osfid_iterables),
- key=_osfids_group_key,
- ):
- yield _k
-
-
-###
-# the command itself
-
-class Command(BaseCommand):
- def add_arguments(self, parser):
- parser.add_argument(
- '--no-counts',
- action='store_true',
- )
- parser.add_argument(
- '--clear-state',
- action='store_true',
- )
- parser.add_argument(
- '--clear-es8-data',
- action='store_true',
- )
- parser.add_argument(
- '--start',
- action='store_true',
- )
- parser.add_argument(
- '--unchanged',
- action='store_true',
- )
- parser.add_argument(
- '--usage-events',
- action='store_true',
- )
- parser.add_argument(
- '--usage-reports',
- action='store_true',
- )
-
- @functools.cached_property
- def _migration_started_at(self):
- return es8_metrics.Elastic6To8State.get_started_at()
-
- def handle(
- self,
- *,
- no_counts,
- clear_state,
- clear_es8_data,
- start,
- unchanged,
- usage_events,
- usage_reports,
- **kwargs,
- ):
- self._quiet_chatty_loggers()
- if clear_state:
- self._clear_state()
- if clear_es8_data:
- self._clear_es8_data(unchanged, usage_events, usage_reports)
- self._check_started_at(start_now=start)
- _default_all = not any((unchanged, usage_events, usage_reports))
- if usage_reports or _default_all:
- self._handle_usage_reports(start=start, no_counts=no_counts)
- if usage_events or _default_all:
- self._handle_usage_events(start=start, no_counts=no_counts)
- if unchanged or _default_all:
- self._handle_unchanged(start=start, no_counts=no_counts)
- if not no_counts:
- self.stdout.write('(counts may be approximate)')
-
- def _handle_unchanged(self, *, start: bool, no_counts: bool):
- # for each (unchanged) report/event:
- for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items():
- _assert_field_unchangedness(_es6_cls, _es8_cls)
- if not no_counts:
- # display counts
- _es6_count = _es6_cls.search().count()
- _es8_count = _es8_cls.search().count()
- self._write_tabbed('es6', _es6_cls, _es6_count)
- self._write_tabbed(
- 'es8',
- _es8_cls,
- _es8_count,
- style=self._eq_style(_es8_count, _es6_count),
- )
- if start: # schedule task
- self.stdout.write(
- f'starting {_es6_cls.__name__} => {_es8_cls.__name__}'
- )
- migrate_unchanged_recordtype.delay(
- _es6_cls.__name__, self._migration_started_at.isoformat()
- )
-
- def _handle_usage_events(self, *, start: bool, no_counts: bool):
- # for counted-usage events:
- _started = self._migration_started_at or datetime.datetime.now()
- _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date()
- _range_end = _started.date() + datetime.timedelta(days=1)
- if not no_counts:
- # display counts for each view/download event type
- _range_q = {
- 'range': {
- 'timestamp': {
- 'gte': _range_start.isoformat(),
- 'lt': _range_end.isoformat(),
- }
- }
- }
- _es6_usage_count_q = {
- 'bool': {
- 'filter': [_range_q, {'exists': {'field': 'item_guid'}}],
- },
- }
- _es6_pview_count = PreprintView.search().filter(_range_q).count()
- _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count()
- _es6_usage_event_count = CountedUsageEs6.search().filter(_es6_usage_count_q).count()
- _es6_count = (
- _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count
- )
- _es8_count = es8_metrics.OsfCountedUsageEvent.search().filter(_range_q).count()
- self._write_tabbed('es6', PreprintView, _es6_pview_count)
- self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count)
- self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count)
- self._write_tabbed(
- 'es6', f'(total between {_range_start} and {_range_end})', _es6_count
- )
- self._write_tabbed(
- 'es8',
- es8_metrics.OsfCountedUsageEvent,
- _es8_count,
- style=self._eq_style(_es8_count, _es6_count),
- )
- if start: # schedule (per-day?) tasks (if --start)
- self.stdout.write(
- f'starting usages => {es8_metrics.OsfCountedUsageEvent.__name__}'
- )
- for _from_date, _until_date in _date_range(_range_start, _range_end):
- _from_str = _from_date.isoformat()
- _until_str = _until_date.isoformat()
- migrate_counted_usages.delay(_from_str, _until_str)
- migrate_preprint_views.delay(_from_str, _until_str)
- migrate_preprint_downloads.delay(_from_str, _until_str)
-
- def _handle_usage_reports(self, *, start: bool, no_counts: bool):
- if not no_counts:
- # display counts of reports and distinct items
- _es6_count, _es6_item_count = _es6_usage_report_counts()
- _es8_count, _es8_item_count = _es8_usage_report_counts()
- self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count)
- self._write_tabbed(
- 'es8',
- es8_metrics.MonthlyPublicItemUsageReportEs8,
- _es8_count,
- style=self._eq_style(_es8_count, _es6_count),
- )
- self._write_tabbed(
- 'es6',
- es6_reports.PublicItemUsageReport,
- 'osfid count:',
- _es6_item_count,
- )
- self._write_tabbed(
- 'es8',
- es8_metrics.MonthlyPublicItemUsageReportEs8,
- 'osfid count:',
- _es8_item_count,
- style=self._eq_style(_es8_item_count, _es6_item_count),
- )
- # (if --start) schedule task per item (by composite agg on es6 usage reports and events)
- # each item-task iter thru reports oldest to newest, adding cumulative counts
- if start:
- self.stdout.write(
- f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.MonthlyPublicItemUsageReportEs8.__name__}'
- )
- schedule_migrate_usage_reports.delay(self._migration_started_at.isoformat())
-
- def _check_started_at(self, start_now):
- _started_at = self._migration_started_at
- if _started_at:
- self.stdout.write(
- f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}'
- )
- elif start_now:
- _started_at = es8_metrics.Elastic6To8State.set_started_at_now()
- del self._migration_started_at # clear cache
- self.stdout.write(
- f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}'
- )
- else:
- self.stdout.write(
- 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)'
- )
-
- def _clear_state(self):
- self.stdout.write(
- 'clearing all migration state (start time, etc)', self.style.NOTICE
- )
- es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete()
- es8_metrics.Elastic6To8State.refresh()
-
- def _clear_es8_data(self, unchanged, usage_events, usage_reports):
- _default_all = not any((unchanged, usage_events, usage_reports))
- _to_clear = []
- if _default_all or unchanged:
- _to_clear.extend(_UNCHANGED_RECORDTYPES.values())
- if _default_all or usage_events:
- _to_clear.append(es8_metrics.OsfCountedUsageEvent)
- if _default_all or usage_reports:
- _to_clear.append(es8_metrics.MonthlyPublicItemUsageReportEs8)
- for _es8_recordtype in _to_clear:
- self.stdout.write(
- f'clearing {_es8_recordtype.__name__}', self.style.NOTICE
- )
- _es8_recordtype.do_teardown(keep_templates=True)
-
- def _eq_style(self, num: int, should_be: int):
- return self.style.SUCCESS if (num == should_be) else self.style.WARNING
-
- def _write_tabbed(self, *strables, style=None):
- def _to_str(strable):
- if isinstance(strable, type):
- return strable.__name__
- return str(strable)
-
- self.stdout.write('\t'.join(map(_to_str, strables)), style)
-
- def _quiet_chatty_loggers(self):
- _chatty_loggers = [
- 'elasticsearch',
- 'elastic_transport',
- 'elasticsearch_metrics',
- ]
- for logger_name in _chatty_loggers:
- logging.getLogger(logger_name).setLevel(logging.ERROR)
diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py
index cfcb22bfc7f..9f6d57bc5db 100644
--- a/osf/management/commands/monthly_reporters_go.py
+++ b/osf/management/commands/monthly_reporters_go.py
@@ -3,14 +3,12 @@
from django.core.management.base import BaseCommand
from django.db import OperationalError as DjangoOperationalError
-from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError
from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError
from psycopg2 import OperationalError as PostgresOperationalError
from framework.celery_tasks import app as celery_app
import framework.sentry
from osf.metrics.reporters import AllMonthlyReporters
-from osf.metrics.reports import MonthlyReport
from osf.metrics.utils import YearMonth
@@ -19,7 +17,6 @@
_CONTINUE_AFTER_ERRORS = (
DjangoOperationalError,
- Elastic6ConnectionError,
Elastic8ConnectionError,
PostgresOperationalError,
)
@@ -86,8 +83,6 @@ def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict):
_reports = _reporter.report(**report_kwargs)
for _report in _reports:
- if isinstance(_report, MonthlyReport) and (_report.report_yearmonth is None):
- _report.report_yearmonth = _reporter.yearmonth
_report.save()
_followup_task = _reporter.followup_task(_report)
if _followup_task is not None:
diff --git a/osf/management/commands/populate_impact_preprint_metrics.py b/osf/management/commands/populate_impact_preprint_metrics.py
deleted file mode 100644
index f5fc60cd8e1..00000000000
--- a/osf/management/commands/populate_impact_preprint_metrics.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import datetime as dt
-from random import random
-from django.core.management.base import BaseCommand
-
-from osf.metrics import (
- PreprintView,
- PreprintDownload,
-)
-
-from osf.models import Preprint
-
-
-"""
-This management command can be run to populate impact with fake
-preprints metrics data.
-
-All flags are optional with the script defaulting to 3 preprints from
-your local database with metrics for the past 7 days and an average
-count of 25 for preprint views/downloads per day.
-
---preprints: Specify preprint guids
---num_preprints: Specify the number of preprint to use from the database (if
-preprint guids aren't specified)
---days: Specify the number of days to write metrics data for
---group_counts: Indicates that metric counts should be grouped
-in a single record per preprint per day
---avg_counts: The average number of view/download counts to write
-for each preprint per day
-
-Example: docker-compose run --rm web python3 manage.py populate_impact_preprint_metrics --num_preprints 1 --days 5 --group_counts --avg_counts 50
-"""
-
-
-def populate_preprint_metrics(preprints, dates, avg_counts, group_counts=False):
- for date in dates:
- for preprint in preprints:
- preprint_view_count = int((avg_counts * 2) * random())
- preprint_download_count = int((avg_counts * 2) * random())
-
- if group_counts:
- PreprintView.record_for_preprint(
- preprint=preprint,
- path=preprint.primary_file.path,
- timestamp=date,
- count=preprint_view_count
- )
-
- PreprintDownload.record_for_preprint(
- preprint=preprint,
- path=preprint.primary_file.path,
- timestamp=date,
- count=preprint_download_count
- )
- else:
- for count in range(preprint_view_count):
- PreprintView.record_for_preprint(
- preprint=preprint,
- path=preprint.primary_file.path,
- timestamp=date
- )
-
- for count in range(preprint_download_count):
- PreprintDownload.record_for_preprint(
- preprint=preprint,
- path=preprint.primary_file.path,
- timestamp=date
- )
-
-
-class Command(BaseCommand):
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument(
- '--preprints',
- nargs='*',
- help='Specify preprints guids'
- )
- parser.add_argument(
- '--num_preprints',
- type=int,
- default=3,
- help='Specify number of preprints to use if not specifying preprints'
- )
- parser.add_argument(
- '--days',
- type=int,
- default=7,
- help='Specify number of past days to write metrics data for'
- )
- parser.add_argument(
- '--group_counts',
- action='store_true',
- help='Group counts in metric records for fewer ES requests'
- )
- parser.add_argument(
- '--avg_counts',
- type=int,
- default=25,
- help='Average number of counts to write per day per preprint'
- )
-
- def handle(self, *args, **options):
- days = options.get('days')
- num_preprints = options.get('num_preprints')
- group_counts = options.get('group_counts')
- avg_counts = options.get('avg_counts')
-
- if options.get('preprints'):
- preprints = Preprint.objects.filter(guids___id__in=options.get('preprints'))
- else:
- preprints = Preprint.objects.all()[:num_preprints]
-
- today = dt.datetime.today()
- last_x_days = [(today - dt.timedelta(days=num_days)) for num_days in range(0, days)]
-
- populate_preprint_metrics(preprints, last_x_days, avg_counts, group_counts)
diff --git a/osf/management/commands/reindex_es6.py b/osf/management/commands/reindex_es6.py
deleted file mode 100644
index 8961ea6fff1..00000000000
--- a/osf/management/commands/reindex_es6.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""
-Reindex data to use current mapping for ES metrics classes
-"""
-import logging
-
-from django.core.management.base import BaseCommand
-from elasticsearch6_dsl import connections
-from elasticsearch_metrics.registry import registry
-
-logger = logging.getLogger(__name__)
-
-
-def get_metric_class(index_name: str) -> type:
- app_label, model_name = index_name.split('_')[:2]
- return registry.all_metrics[app_label][model_name]
-
-
-def increment_index_versions(client, old_indices: list):
- """
- Increment versions numbers for new indices, these kind don't matter because they should always be aliased to
- the original format of {app_label}_{cls.__name__.lower()}_{year}.
-
- :param old_indices: indices to be updated
- :return: indices names that are going to be reindexed into.
- """
- new_indices = []
- for index in old_indices:
- index_name = list(client.indices.get(index).keys())[0] # in case we've already aliased this index
- if '_v' in index_name and index_name[-1].isdigit():
- name, version_num = index_name.split('_v')
- new_index = f'{name}_v{int(version_num) + 1}'
- else:
- new_index = f'{index}_v2'
- new_indices.append(new_index)
-
- return new_indices
-
-
-def reindex_and_alias(old_indices: list, dry_run: bool = False):
- """
- To migrate data in ES with new mappings is a 4 step process:
- 1) Create an index with new mappings
- 2) Reindex data from old to new
- 3) Delete the old index
- 4) Alias the new index so it references the old.
-
- :param old_indices: indices with data that has old mappings
- :return: None
- """
- if dry_run:
- logger.info('[DRY RUN] THIS IS A DRY RUN.')
- client = connections.get_connection()
- new_indices = increment_index_versions(client, old_indices)
-
- for old_index, new_index in zip(old_indices, new_indices):
- metric_class = get_metric_class(old_index)
- if dry_run:
- logger.info(f'[DRY RUN] Would reindex {old_index} to {new_index} for {metric_class}')
- continue
- client.indices.create(new_index, body=metric_class._index.to_dict(), params={'wait_for_active_shards': 1})
- logger.info(f'Created index {new_index}')
- body = {
- 'source': {
- 'index': old_index
- },
- 'dest': {
- 'index': new_index
- }
- }
- logger.info(f'Created reindexing {old_index} to {new_index}')
- client.reindex(body, params={'wait_for_completion': 'true'})
- logger.info('Reindexing complete')
- old_index_name = list(client.indices.get(old_index).keys())[0] # in case we've already aliased this index
-
- if old_index_name == old_index: # True if not aliased
- client.indices.delete(old_index)
- logger.info(f'{old_index} deleted')
- client.indices.put_alias(new_index, old_index)
- else:
- client.indices.put_alias(new_index, old_index)
- client.indices.delete(old_index_name)
- logger.info(f'{old_index_name} deleted')
-
-
-class Command(BaseCommand):
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument(
- '--indices',
- type=str,
- nargs='+',
- help='List of indices to be reindexed and remapped'
- )
- parser.add_argument(
- '--dry',
- action='store_true',
- dest='dry_run',
- help='Run migration and roll back changes to db',
- )
-
- def handle(self, *args, **options):
- indices = options.get('indices', [])
- dry_run = options.get('dry_run', True)
- reindex_and_alias(indices, dry_run)
diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py
index a72a799402d..14c637955aa 100644
--- a/osf/metadata/osf_gathering.py
+++ b/osf/metadata/osf_gathering.py
@@ -11,12 +11,12 @@
from api.caching.tasks import get_storage_usage_total
from osf import models as osfdb
+from osf.models.base import osfid_iri
from osf.metadata import gather
from osf.metadata.definitions.datacite import DATACITE_RESOURCE_TYPES_GENERAL
from osf.metadata.osfmap_utils import (
osfmap_type,
is_osf_component,
- osfid_from_iri,
)
from osf.metadata.rdfutils import (
DATACITE,
@@ -37,7 +37,7 @@
format_dcterms_extent,
smells_like_iri,
)
-from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
from osf.metrics.utils import YearMonth
from osf.utils import (
workflows as osfworkflows,
@@ -1085,22 +1085,33 @@ def gather_cedar_templates(focus):
@gather.er(OSF.usage)
def gather_last_month_usage(focus):
+ # yield triples describing last month's usage of `focus`: one blank node linked
+ # from OSF.usage, carrying view/download counts summed over every usage report
+ # found for the focus iri (and for each versioned osfid, when the model has them)
- _usage_report = PublicItemUsageReport.for_last_month(
- item_osfid=osfid_from_iri(focus.iri),
- )
- if _usage_report is not None:
+ _item_iris = [focus.iri]
+ # items with versioned osfids may have a separate usage report for each version,
+ # but this metadata is gathered for the unversioned osfid -- add counts together
+ if hasattr(focus.dbmodel, 'versioned_guids'):
+ _item_iris.extend(
+ osfid_iri(_vg.versioned_osfid())
+ for _vg in focus.dbmodel.versioned_guids.all()
+ )
+ _usage_reports = MonthlyPublicItemUsageReport.from_last_month(_item_iris)
+ if _usage_reports:
+ # NOTE(review): `sum` raises TypeError if any report holds None for the
+ # attribute -- confirm the four count fields are always populated
+ def _sum_usage(report_attr_name):
+ return sum(
+ getattr(_usage_report, report_attr_name)
+ for _usage_report in _usage_reports
+ )
_usage_report_ref = rdflib.BNode()
yield (OSF.usage, _usage_report_ref)
yield (_usage_report_ref, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/')))
yield (_usage_report_ref, FOAF.primaryTopic, focus.iri)
yield (_usage_report_ref, DCTERMS.temporal, rdflib.Literal(
- str(_usage_report.report_yearmonth),
+ # NOTE(review): the first report's yearmonth stands in for all of them; this also
+ # needs `_usage_reports` to be indexable -- confirm `from_last_month` returns a
+ # sequence of reports that all cover the same month
+ str(_usage_reports[0].report_yearmonth),
datatype=rdflib.XSD.gYearMonth,
))
- yield (_usage_report_ref, OSF.viewCount, _usage_report.view_count)
- yield (_usage_report_ref, OSF.viewSessionCount, _usage_report.view_session_count)
- yield (_usage_report_ref, OSF.downloadCount, _usage_report.download_count)
- yield (_usage_report_ref, OSF.downloadSessionCount, _usage_report.download_session_count)
+ yield (_usage_report_ref, OSF.viewCount, _sum_usage('view_count'))
+ yield (_usage_report_ref, OSF.viewSessionCount, _sum_usage('view_session_count'))
+ yield (_usage_report_ref, OSF.downloadCount, _sum_usage('download_count'))
+ yield (_usage_report_ref, OSF.downloadSessionCount, _sum_usage('download_session_count'))
@gather.er(OSF.hasOsfAddon)
diff --git a/osf/metadata/serializers/linkset.py b/osf/metadata/serializers/linkset.py
index 3ee907d0532..499a82fcdb4 100644
--- a/osf/metadata/serializers/linkset.py
+++ b/osf/metadata/serializers/linkset.py
@@ -16,7 +16,7 @@
import rdflib
from ._base import MetadataSerializer
-from osf.metadata.osf_gathering import osfid_from_iri
+from osf.metadata.osfmap_utils import osfid_from_iri
from osf.metadata.rdfutils import (DOI, DATACITE, DCTERMS, OWL, RDF, OSF, DCAT, SCHEMA, DATACITE_SCHEMA_RESOURCE_TYPE_GENERAL_MAPPING, map_resource_type_general_datacite_to_scheme)
from website.settings import DOMAIN
from website.util import web_url_for
diff --git a/osf/metrics/README.md b/osf/metrics/README.md
index b5dad732e29..4f81049a0c9 100644
--- a/osf/metrics/README.md
+++ b/osf/metrics/README.md
@@ -8,14 +8,15 @@ but note that the COUNTER_SUSHI api has not yet been implemented atop.
## data model
usage data and periodic reports are both stored in elasticsearch using
-`elasticsearch-dsl`-based data models.
+`django-elasticsearch-metrics` and `elasticsearch8.dsl`-based data models.
-each "usage" is represented as `CountedAuthUsage` -- see `osf.metrics.counted_usage`
+each "usage" is represented as `OsfCountedUsageEvent` -- see `osf.metrics.events`
for field definitions with comments mapping fields to concepts in the COUNTER spec.
-each periodic report is represented as a subclass of `DailyReport` or `MonthlyReport`
-(see `osf.metrics.reports`) and has a "reporter" (see `osf.metrics.reporters`) that
-is invoked periodically to report.
+each periodic report is a subclass of `osf.metrics.monthly_reports.BaseMonthlyReport`
+or `osf.metrics.daily_reports.BaseDailyReport` (themselves subclasses of
+`elasticsearch_metrics.imps.elastic8.CyclicRecord`) and has a "reporter"
+(see `osf.metrics.reporters`) that is invoked periodically to report.
## api
note: the `osf.metrics` api is subject to change, is supported only for use within OSF
@@ -29,12 +30,13 @@ endpoints of interest for new development (all starting with `/_/metrics/`):
- `events/counted_usage/`: POST-only, for recording a usage
- `reports/`: GET list of available report types
- `reports/<report_name>/recent`: GET list of recent reports
+ - `reports/<report_name>/`: GET list of reports (filterable, sortable)
- `query/`: namespace for views that query usage data on demand (only for statically defined, cheap queries)
## how to
### add a new monthly report
-- add a `MonthlyReport` subclass (in `osf.metrics.reports`) with the fields you want
+- add a `BaseMonthlyReport` subclass (in `osf.metrics.monthly_reports`) with the fields you want
- add a `MonthlyReporter` subclass (in a module under `osf.metrics.reporters`)
that knows how to build your report
- to have your reporter run automatically, add it to `osf.metrics.reporters.MONTHLY_REPORTERS`
diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py
index 6056e6d92f3..7d124c501b7 100644
--- a/osf/metrics/__init__.py
+++ b/osf/metrics/__init__.py
@@ -1,42 +1,12 @@
-from .counted_usage import CountedAuthUsage
-
-from .preprint_metrics import (
- PreprintView,
- PreprintDownload,
-)
-
-from .registry_metrics import RegistriesModerationMetrics
-
-from .reports import (
- DownloadCountReport,
- InstitutionSummaryReport,
- NewUserDomainReport,
- NodeSummaryReport,
- OsfstorageFileCountReport,
- PreprintSummaryReport,
- StorageAddonUsage,
- UserSummaryReport,
-)
-from . import es8_metrics
-
-
-DAILY_REPORTS = (
- DownloadCountReport,
- InstitutionSummaryReport,
- NewUserDomainReport,
- NodeSummaryReport,
- OsfstorageFileCountReport,
- PreprintSummaryReport,
- StorageAddonUsage,
- UserSummaryReport,
+from . import (
+ events,
+ daily_reports,
+ monthly_reports,
)
__all__ = (
- 'CountedAuthUsage',
- 'DAILY_REPORTS',
- 'PreprintView',
- 'PreprintDownload',
- 'RegistriesModerationMetrics',
- 'es8_metrics',
+ 'events',
+ 'daily_reports',
+ 'monthly_reports',
)
diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py
deleted file mode 100644
index 41ea012fda5..00000000000
--- a/osf/metrics/counted_usage.py
+++ /dev/null
@@ -1,196 +0,0 @@
-from datetime import datetime
-import enum
-import logging
-from urllib.parse import urlsplit
-
-from elasticsearch6_dsl import InnerDoc, analyzer, tokenizer
-import elasticsearch_metrics.imps.elastic6 as metrics
-from elasticsearch_metrics.signals import pre_save
-from django.dispatch import receiver
-import pytz
-
-from osf.metrics.utils import stable_key
-
-
-logger = logging.getLogger(__name__)
-
-route_prefix_analyzer = analyzer(
- 'route_prefix_analyzer',
- tokenizer=tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
-)
-
-class PageviewInfo(InnerDoc):
- """PageviewInfo
-
- for CountedAuthUsage generated by viewing a web page
- """
- # fields that should be provided
- referer_url = metrics.Keyword()
- page_url = metrics.Keyword()
- page_title = metrics.Keyword()
- route_name = metrics.Keyword(
- fields={
- 'by_prefix': metrics.Text(analyzer=route_prefix_analyzer),
- },
- )
-
- # fields autofilled from the above (see `_autofill_fields`)
- page_path = metrics.Keyword()
- referer_domain = metrics.Keyword()
- hour_of_day = metrics.Integer()
-
-
-class CountedAuthUsage(metrics.Metric):
- """CountedAuthUsage
-
- Something was used! Let's quickly take note of that and
- move on, then come back later to query/analyze/investigate.
-
- Aim to support a COUNTER-style reporting api
- (see https://cop5.projectcounter.org/en/5.0.2/)
- """
-
- # where noted, fields correspond to defined terms from COUNTER
- # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html
- platform_iri = metrics.Keyword() # counter:Platform
- provider_id = metrics.Keyword() # counter:Database(?)
- session_id = metrics.Keyword() # counter:Session
- item_guid = metrics.Keyword() # counter:Item
- item_type = metrics.Keyword() # counter:Data-Type
- surrounding_guids = metrics.Keyword(multi=True) # counter:Title
- item_public = metrics.Boolean() # counter:Access-Type(?)
- user_is_authenticated = metrics.Boolean()
-
- action_labels = metrics.Keyword(multi=True)
- class ActionLabel(enum.Enum):
- SEARCH = 'search' # counter:Search
- VIEW = 'view' # counter:Investigation
- DOWNLOAD = 'download' # counter:Request
- WEB = 'web' # counter:Regular (aka "pageview")
- API = 'api' # counter:TDM (aka "non-web api usage")
- # TODO: count api usage, distinguish between web and non-web api requests
-
- # pageviews get additional info to support the "node analytics" view
- # (see `api.metrics.views.NodeAnalyticsQuery`)
- pageview_info = metrics.Object(PageviewInfo)
-
- class Meta:
- dynamic = metrics.MetaField('strict')
- source = metrics.MetaField(enabled=True)
-
-
-@receiver(pre_save, sender=CountedAuthUsage)
-def _autofill_fields(sender, instance, **kwargs):
- pageview = getattr(instance, 'pageview_info', None)
- if pageview:
- _fill_pageview_info(instance)
- item_guid = getattr(instance, 'item_guid', None)
- if item_guid:
- from osf.models import Guid
- guid_instance = Guid.load(item_guid)
- if guid_instance and guid_instance.referent:
- _fill_osfguid_info(instance, guid_instance.referent)
- _fill_document_id(instance)
-
-
-def _fill_pageview_info(counted_usage):
- pageview = counted_usage.pageview_info
- pageview_dict = pageview.to_dict()
- pageview.hour_of_day = counted_usage.timestamp.hour
- pageview.page_path = urlsplit(pageview_dict['page_url']).path.rstrip('/')
- if referer := pageview_dict.get('referer_url'):
- pageview.referer_domain = urlsplit(referer).netloc
-
-
-def _fill_osfguid_info(counted_usage, guid_referent):
- counted_usage.item_public = _get_ispublic(guid_referent)
- counted_usage.item_type = get_item_type(guid_referent)
- counted_usage.surrounding_guids = _get_surrounding_guids(guid_referent)
- if not counted_usage.provider_id:
- counted_usage.provider_id = get_provider_id(guid_referent)
-
-
-def _fill_document_id(counted_usage):
- # set the document id to a hash of "unique together"
- # values to get "ON CONFLICT UPDATE" behavior -- if
- # a matching document already exists, it will be updated,
- # not duplicated.
-
- # cannot detect/avoid conflicts this way, but that's ok
- # because we want to approximate `counter:Double-Click Filtering`
-
- if counted_usage.pageview_info is not None and counted_usage.pageview_info.page_url is not None:
- target_identifier = counted_usage.pageview_info.page_url
- else:
- target_identifier = counted_usage.item_guid
-
- # slice the day into an array of 30-second windows,
- # find this timestamp's windowslice index
- day_start = datetime(
- counted_usage.timestamp.year,
- counted_usage.timestamp.month,
- counted_usage.timestamp.day,
- tzinfo=pytz.utc,
- )
- time_in_seconds = (counted_usage.timestamp - day_start).total_seconds()
- time_window = int(time_in_seconds / 30)
-
- counted_usage.meta.id = stable_key(
- # unique-together values:
- counted_usage.platform_iri,
- target_identifier,
- counted_usage.session_id,
- counted_usage.timestamp.date(),
- time_window,
- ','.join(sorted(counted_usage.action_labels)),
- )
-
-
-def _get_ispublic(guid_referent):
- # if it quacks like BaseFileNode, look at .target instead
- maybe_public = getattr(guid_referent, 'target', None) or guid_referent
- if hasattr(maybe_public, 'verified_publishable'):
- return maybe_public.verified_publishable # quacks like Preprint
- return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode
-
-
-def get_provider_id(guid_referent):
- provider = getattr(guid_referent, 'provider', None)
- if isinstance(provider, str):
- return provider # quacks like BaseFileNode
- elif provider:
- return provider._id # quacks like Registration, Preprint, Collection
- return 'osf' # quacks like Node, Comment, WikiPage
-
-
-def get_item_type(guid_referent):
- return type(guid_referent).__name__.lower()
-
-
-def _get_immediate_wrapper(guid_referent):
- if hasattr(guid_referent, 'verified_publishable'):
- return None # quacks like Preprint
- return (
- getattr(guid_referent, 'parent_node', None) # quacks like AbstractNode
- or getattr(guid_referent, 'node', None) # quacks like WikiPage, Comment
- or getattr(guid_referent, 'target', None) # quacks like BaseFileNode
- )
-
-def _get_surrounding_guids(guid_referent):
- """get all the parent/owner/surrounding guids for the given guid_referent
-
- @param guid_referent: instance of a model that has GuidMixin
- @returns list of str
-
- For AbstractNode, goes up the node hierarchy up to the root.
- For WikiPage or BaseFileNode, grab the node it belongs to and
- follow the node hierarchy from there.
- """
- surrounding_guids = []
- current_referent = guid_referent
- while current_referent:
- next_referent = _get_immediate_wrapper(current_referent)
- if next_referent:
- surrounding_guids.append(next_referent._id)
- current_referent = next_referent
- return surrounding_guids
diff --git a/osf/metrics/daily_reports.py b/osf/metrics/daily_reports.py
new file mode 100644
index 00000000000..40eb5073236
--- /dev/null
+++ b/osf/metrics/daily_reports.py
@@ -0,0 +1,178 @@
+import datetime
+
+import elasticsearch8.dsl as esdsl
+from elasticsearch_metrics import DAILY, YEARLY
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metrics.utils import cycle_coverage_date
+
+__all__ = (
+ 'BaseDailyReport',
+ 'DailyDownloadCountReport',
+ 'DailyInstitutionSummaryReport',
+ 'DailyNewUserDomainReport',
+ 'DailyNodeSummaryReport',
+ 'DailyOsfstorageFileCountReport',
+ 'DailyPreprintSummaryReport',
+ 'DailyStorageAddonUsageReport',
+ 'DailyUserSummaryReport',
+)
+
+
+###
+# base class
+
+class BaseDailyReport(djelme.CyclicRecord):
+ # abstract base for the daily reports in this module: the day a record
+ # covers is exposed as `report_date` and stored in `cycle_coverage`
+ CYCLE_TIMEDEPTH = DAILY
+
+ class Meta:
+ abstract = True
+
+ def __init__(self, *, report_date=None, **kwargs):
+ # report_date: optional ISO-format date string or `datetime.date`;
+ # every other keyword argument is passed through to the parent constructor
+ super().__init__(**kwargs)
+ # separate out report_date, so the property setter gets used
+ if report_date is not None:
+ self.report_date = report_date
+
+ @property
+ def report_date(self):
+ # rebuild a `datetime.date` from `cycle_coverage`, read here as
+ # dot-separated year, month and day integers
+ # NOTE(review): raises if `cycle_coverage` is unset or shaped differently --
+ # confirm it is always assigned before this is read
+ _year, _month, _day = map(int, self.cycle_coverage.split('.'))
+ return datetime.date(_year, _month, _day)
+
+ @report_date.setter
+ def report_date(self, d: str | datetime.date):
+ # strings are parsed with `date.fromisoformat`; any other value is handed
+ # to `cycle_coverage_date` unchanged
+ # (presumably that helper produces the dotted form the getter parses --
+ # verify in osf.metrics.utils)
+ self.cycle_coverage = cycle_coverage_date(
+ datetime.date.fromisoformat(d) if isinstance(d, str) else d
+ )
+
+
+###
+# reusable inner objects
+
+class RunningTotal(esdsl.InnerDoc):
+ # inner document pairing a `total` with a `total_daily` that may be None
+ # (presumably the cumulative count and the count for the report day --
+ # verify against the reporters that fill it)
+ total: int
+ total_daily: int | None
+
+
+class FileRunningTotals(esdsl.InnerDoc):
+ # inner document of integer counts: total/public/private,
+ # each with a matching `*_daily` field
+ total: int
+ public: int
+ private: int
+ total_daily: int
+ public_daily: int
+ private_daily: int
+
+
+class NodeRunningTotals(esdsl.InnerDoc):
+ # inner document of integer counts: total/public/private with matching
+ # `*_daily` fields; the two `*_excluding_spam` fields may be None
+ total: int
+ total_excluding_spam: int | None
+ public: int
+ private: int
+ total_daily: int
+ total_daily_excluding_spam: int | None
+ public_daily: int
+ private_daily: int
+
+
+class RegistrationRunningTotals(esdsl.InnerDoc):
+ # inner document of integer counts: total/public/embargoed/embargoed_v2/withdrawn
+ # with matching `*_daily` fields; `withdrawn` and `withdrawn_daily` may be None
+ total: int
+ public: int
+ embargoed: int
+ embargoed_v2: int
+ withdrawn: int | None
+ total_daily: int
+ public_daily: int
+ embargoed_daily: int
+ embargoed_v2_daily: int
+ withdrawn_daily: int | None
+
+
+class UsageByStorageAddon(esdsl.InnerDoc):
+ # inner document holding an `addon_shortname` string plus one `RunningTotal`
+ # for each user-setting and node-setting category listed below
+ # (presumably one entry per storage addon -- verify against the reporter)
+ addon_shortname: str
+ enabled_usersettings: RunningTotal
+ linked_usersettings: RunningTotal
+ deleted_usersettings: RunningTotal
+ usersetting_links: RunningTotal
+ connected_nodesettings: RunningTotal
+ disconnected_nodesettings: RunningTotal
+ deleted_nodesettings: RunningTotal
+
+
+###
+# daily reports
+
+class DailyStorageAddonUsageReport(BaseDailyReport):
+ # daily report carrying a list of `UsageByStorageAddon` entries
+ usage_by_addon: list[UsageByStorageAddon]
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyDownloadCountReport(BaseDailyReport):
+ # daily report with a single integer field, `daily_file_downloads`
+ daily_file_downloads: int
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyInstitutionSummaryReport(BaseDailyReport):
+ # daily report of user, node, project and registration counts for an institution
+ # NOTE(review): UNIQUE_TOGETHER_FIELDS presumably makes (cycle_coverage,
+ # institution_id) identify a single record -- confirm how djelme uses it
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',)
+
+ institution_id: str
+ institution_name: str
+ users: RunningTotal
+ nodes: NodeRunningTotals
+ projects: NodeRunningTotals
+ registered_nodes: RegistrationRunningTotals
+ registered_projects: RegistrationRunningTotals
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyNewUserDomainReport(BaseDailyReport):
+ # daily report pairing a `domain_name` with a `new_user_count`
+ # NOTE(review): UNIQUE_TOGETHER_FIELDS presumably makes (cycle_coverage,
+ # domain_name) identify a single record -- confirm how djelme uses it
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',)
+
+ domain_name: str
+ new_user_count: int
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyNodeSummaryReport(BaseDailyReport):
+ # daily report of node and project counts (`NodeRunningTotals`) and
+ # registration counts (`RegistrationRunningTotals`)
+ nodes: NodeRunningTotals
+ projects: NodeRunningTotals
+ registered_nodes: RegistrationRunningTotals
+ registered_projects: RegistrationRunningTotals
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyOsfstorageFileCountReport(BaseDailyReport):
+ # daily report with a single `FileRunningTotals` field, `files`
+ files: FileRunningTotals
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyPreprintSummaryReport(BaseDailyReport):
+ # daily report pairing a `provider_key` with a `preprint_count`
+ # NOTE(review): UNIQUE_TOGETHER_FIELDS presumably makes (cycle_coverage,
+ # provider_key) identify a single record -- confirm how djelme uses it
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',)
+ provider_key: str
+ preprint_count: int
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
+
+
+class DailyUserSummaryReport(BaseDailyReport):
+ # daily report of integer user counts: active, deactivated, merged and
+ # unconfirmed, plus two `*_daily` new-user counts
+ active: int
+ deactivated: int
+ merged: int
+ new_users_daily: int
+ new_users_with_institution_daily: int
+ unconfirmed: int
+
+ class Meta:
+ # NOTE(review): presumably one timeseries index per year --
+ # confirm in django-elasticsearch-metrics
+ timeseries_index_timedepth = YEARLY
diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py
deleted file mode 100644
index be68883a648..00000000000
--- a/osf/metrics/es8_metrics.py
+++ /dev/null
@@ -1,576 +0,0 @@
-import datetime
-import enum
-import functools
-from urllib.parse import urlsplit
-
-import elasticsearch8.dsl as esdsl
-from elasticsearch_metrics import DAILY, MONTHLY, YEARLY
-import elasticsearch_metrics.imps.elastic8 as djelme
-
-from osf.metadata.osfmap_utils import osfid_from_iri
-from osf.metrics.counted_usage import _get_surrounding_guids
-from osf.metrics.utils import (
- YearMonth,
- get_database_iri,
- get_item_type,
-)
-from osf import models as osfdb
-from osf.models.base import osfid_iri
-from website import settings as website_settings
-
-
-###
-# custom dsl fields
-
-class YearmonthField(esdsl.Date):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs, format='strict_year_month')
-
- def deserialize(self, data):
- if isinstance(data, int):
- # elasticsearch stores dates in milliseconds since the unix epoch
- _as_datetime = datetime.datetime.fromtimestamp(data // 1000)
- return YearMonth.from_date(_as_datetime)
- elif data is None:
- return None
- try:
- return YearMonth.from_any(data)
- except ValueError:
- raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
-
- def serialize(self, data, skip_empty=True):
- if isinstance(data, str):
- return data
- elif isinstance(data, YearMonth):
- return str(data)
- elif isinstance(data, (datetime.datetime, datetime.date)):
- return str(YearMonth.from_date(data))
- elif data is None:
- return None
- else:
- raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
-
-
-###
-# inner objects for events
-
-route_prefix_analyzer = esdsl.analyzer(
- 'route_prefix_analyzer',
- tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
-)
-
-
-class PageviewInfo(esdsl.InnerDoc):
- """PageviewInfo
-
- for CountedAuthUsage generated by viewing a web page
- """
-
- # fields that should be provided
- referer_url: str | None
- page_url: str | None
- page_title: str | None
- route_name: str | None = esdsl.mapped_field(esdsl.Keyword(
- fields={
- 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer),
- },
- ))
-
- # fields auto-filled
- page_path: str | None
- referer_domain: str | None
- hour_of_day: int | None
-
-
-###
-# Event records
-
-class OsfCountedUsageEvent(djelme.CountedUsageRecord):
- '''
- Aim to support a COUNTER-style reporting api
- https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
- https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
- '''
- UNIQUE_TOGETHER_FIELDS = (
- 'platform_iri',
- 'sessionhour_id',
- 'action_labels',
- # include some non-field properties for more complex logic to
- # slightly better approximate `counter:Double-Click Filtering`
- # and allow for multiple pages describing the same item_iri
- '_page_url_or_osfid', # non-field property
- '_timestamp_date', # non-field property
- '_timestamp_30sec_window', # non-field property
- )
-
- # inherited fields:
- # timestamp: datetime.datetime
- # platform_iri: str
- # database_iri: str
- # item_iri: str
- # sessionhour_id: str
- # within_iris: list[str]
-
- # osf-specific fields:
- item_osfid: str
- item_type: str
- item_public: bool
- provider_id: str | None
- user_is_authenticated: bool
- action_labels: list[str]
- pageview_info: PageviewInfo | None
-
- class Meta:
- timeseries_index_timedepth = MONTHLY
-
- class ActionLabel(enum.Enum):
- SEARCH = 'search' # counter:Search
- VIEW = 'view' # counter:Investigation
- DOWNLOAD = 'download' # counter:Request
- WEB = 'web' # counter:Regular (aka "pageview")
- API = 'api' # counter:TDM (aka "non-web api usage")
-
- @classmethod
- def record(cls, **kwargs):
- # autofill `user_is_authenticated` before `user_id` discarded (couldn't in `clean`)
- if 'user_is_authenticated' not in kwargs:
- kwargs['user_is_authenticated'] = bool(kwargs.get('user_id'))
- return super().record(**kwargs)
-
- @property
- def _page_url_or_osfid(self):
- # for UNIQUE_TOGETHER_FIELDS
- return (
- self.pageview_info.page_url
- if self.pageview_info is not None and self.pageview_info.page_url is not None
- else self.item_osfid
- )
-
- @property
- def _timestamp_date(self):
- # for UNIQUE_TOGETHER_FIELDS
- return self.timestamp.date()
-
- @property
- def _timestamp_30sec_window(self):
- # for UNIQUE_TOGETHER_FIELDS
- # slice the day into an array of 30-second windows,
- # find this timestamp's windowslice index
- _day_start = datetime.datetime(
- self.timestamp.year,
- self.timestamp.month,
- self.timestamp.day,
- tzinfo=self.timestamp.tzinfo,
- )
- _time_in_seconds = (self.timestamp - _day_start).total_seconds()
- return int(_time_in_seconds / 30) # 30-second windows
-
- @functools.cached_property
- def _osfid_referent(self):
- # for use by autofill methods, if needed
- _osfguid = osfdb.Guid.load(self.item_osfid)
- return _osfguid.referent if _osfguid else None
-
- def clean(self):
- self._autofill_platform_iri()
- self._autofill_item_iri_and_osfid()
- self._autofill_item_public()
- self._autofill_item_type()
- self._autofill_provider_id()
- self._autofill_within_iris()
- self._autofill_pageview()
- self._autofill_database_iri()
- self._clean_action_labels()
- super().clean()
-
- def _autofill_platform_iri(self):
- if self.platform_iri is None:
- self.platform_iri = website_settings.DOMAIN
-
- def _autofill_item_iri_and_osfid(self):
- if self.item_osfid and not self.item_iri:
- self.item_iri = osfid_iri(self.item_osfid)
- elif self.item_iri and not self.item_osfid:
- try:
- self.item_osfid = osfid_from_iri(self.item_iri)
- except ValueError:
- pass
-
- def _autofill_item_public(self):
- if self.item_osfid and (self.item_public is None):
- _item = self._osfid_referent
- # if it quacks like BaseFileNode, look at .target instead
- _item = getattr(_item, 'target', None) or _item
- self.item_public = (
- _item.verified_publishable # quacks like Preprint
- if hasattr(_item, 'verified_publishable')
- else getattr(_item, 'is_public', False) # quacks like AbstractNode
- )
-
- def _autofill_item_type(self):
- if self.item_osfid and not self.item_type:
- self.item_type = get_item_type(self._osfid_referent)
-
- def _autofill_provider_id(self):
- if self.item_osfid and not self.provider_id:
- _provider = getattr(self._osfid_referent, 'provider', None)
- if _provider is None:
- self.provider_id = 'osf' # quacks like Node, Comment, WikiPage
- elif isinstance(_provider, str):
- self.provider_id = _provider # quacks like BaseFileNode
- else:
- self.provider_id = _provider._id # quacks like Registration, Preprint, Collection
-
- def _autofill_within_iris(self):
- if self.item_osfid and (self.within_iris is None) and self._osfid_referent:
- self.within_iris = [
- osfid_iri(_osfid)
- for _osfid in _get_surrounding_guids(self._osfid_referent)
- ]
- # ensure inclusive "within"
- if not self.within_iris:
- self.within_iris = [self.item_iri]
- if self.item_iri not in self.within_iris:
- self.within_iris = [self.item_iri, *self.within_iris]
-
- def _autofill_pageview(self):
- # autofill pageview_info fields from other fields
- if self.pageview_info:
- self.pageview_info.hour_of_day = self.timestamp.hour
- _url = self.pageview_info.page_url
- if _url:
- self.pageview_info.page_path = urlsplit(_url).path.rstrip('/')
- _ref_url = self.pageview_info.referer_url
- if _ref_url:
- self.pageview_info.referer_domain = urlsplit(_ref_url).netloc
-
- def _autofill_database_iri(self):
- if self.item_osfid and not self.database_iri:
- self.database_iri = get_database_iri(self._osfid_referent)
-
- def _clean_action_labels(self):
- if self.action_labels:
- self.action_labels = sorted(self.action_labels)
-
-
-class RegistriesModerationEventEs8(djelme.EventRecord):
- UNIQUE_TOGETHER_FIELDS = (
- 'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id'
- )
-
- registration_id: str
- provider_id: str
- trigger: str
- from_state: str
- to_state: str
- user_id: str
- comment: str | None
-
- class Meta:
- timeseries_recordtype_name = 'RegistriesModerationEvent'
- timeseries_index_timedepth = MONTHLY
-
-
-###
-# Reusable inner objects for reports
-
-class RunningTotal(esdsl.InnerDoc):
- total: int
- total_daily: int | None
-
-
-class FileRunningTotals(esdsl.InnerDoc):
- total: int
- public: int
- private: int
- total_daily: int
- public_daily: int
- private_daily: int
-
-
-class NodeRunningTotals(esdsl.InnerDoc):
- total: int
- total_excluding_spam: int | None
- public: int
- private: int
- total_daily: int
- total_daily_excluding_spam: int | None
- public_daily: int
- private_daily: int
-
-
-class RegistrationRunningTotals(esdsl.InnerDoc):
- total: int
- public: int
- embargoed: int
- embargoed_v2: int
- withdrawn: int | None
- total_daily: int
- public_daily: int
- embargoed_daily: int
- embargoed_v2_daily: int
- withdrawn_daily: int | None
-
-
-class UsageByStorageAddon(esdsl.InnerDoc):
- addon_shortname: str
- enabled_usersettings: RunningTotal
- linked_usersettings: RunningTotal
- deleted_usersettings: RunningTotal
- usersetting_links: RunningTotal
- connected_nodesettings: RunningTotal
- disconnected_nodesettings: RunningTotal
- deleted_nodesettings: RunningTotal
-
-
-###
-# Cyclic reports
-
-
-class DailyStorageAddonUsageReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- usage_by_addon: list[UsageByStorageAddon]
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyStorageAddonUsageReport'
-
-
-class DailyDownloadCountReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- daily_file_downloads: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyDownloadCountReport'
-
-
-class DailyInstitutionSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',)
-
- institution_id: str
- institution_name: str
- users: RunningTotal
- nodes: NodeRunningTotals
- projects: NodeRunningTotals
- registered_nodes: RegistrationRunningTotals
- registered_projects: RegistrationRunningTotals
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyInstitutionSummaryReport'
-
-
-class DailyNewUserDomainReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',)
-
- domain_name: str
- new_user_count: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyNewUserDomainReport'
-
-
-class DailyNodeSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- nodes: NodeRunningTotals
- projects: NodeRunningTotals
- registered_nodes: RegistrationRunningTotals
- registered_projects: RegistrationRunningTotals
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyNodeSummaryReport'
-
-
-class DailyOsfstorageFileCountReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- files: FileRunningTotals
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyOsfstorageFileCountReport'
-
-
-class DailyPreprintSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',)
- provider_key: str
- preprint_count: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyPreprintSummaryReport'
-
-
-class DailyUserSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = DAILY
-
- active: int
- deactivated: int
- merged: int
- new_users_daily: int
- new_users_with_institution_daily: int
- unconfirmed: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'DailyUserSummaryReport'
-
-
-class MonthlySpamSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = MONTHLY
-
- node_confirmed_spam: int
- node_confirmed_ham: int
- node_flagged: int
- registration_confirmed_spam: int
- registration_confirmed_ham: int
- registration_flagged: int
- preprint_confirmed_spam: int
- preprint_confirmed_ham: int
- preprint_flagged: int
- user_marked_as_spam: int
- user_marked_as_ham: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'MonthlySpamSummaryReport'
-
-
-class MonthlyInstitutionalUserReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = MONTHLY
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',)
-
- institution_id: str
- # user info:
- user_id: str
- user_name: str
- department_name: str | None
- month_last_login = YearmonthField()
- month_last_active = YearmonthField()
- account_creation_date = YearmonthField()
- orcid_id: str | None
- # counts:
- public_project_count: int
- private_project_count: int
- public_registration_count: int
- embargoed_registration_count: int
- published_preprint_count: int
- public_file_count: int = esdsl.mapped_field(esdsl.Long())
- storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'MonthlyInstitutionalUserReport'
-
-
-class MonthlyInstitutionSummaryReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = MONTHLY
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', )
-
- institution_id: str
- user_count: int
- public_project_count: int
- private_project_count: int
- public_registration_count: int
- embargoed_registration_count: int
- published_preprint_count: int
- storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
- public_file_count: int = esdsl.mapped_field(esdsl.Long())
- monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long())
- monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long())
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'MonthlyInstitutionSummaryReport'
-
-
-class MonthlyPublicItemUsageReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = MONTHLY
- UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_iri')
-
- # where noted, fields are meant to correspond to defined terms from COUNTER
- # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
- # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
- item_iri: str
- item_osfids: list[str]
- # fields built from aggregations -- more than one value unlikely, but possible
- item_types: list[str] # counter:Data-Type
- platform_iris: list[str] # counter:Platform
- database_iris: list[str] # counter:Database
- provider_ids: list[str] # osf-specific (usually corresponds to database_iri)
-
- # view counts include views on components or files contained by this item
- view_count: int | None = esdsl.mapped_field(esdsl.Long())
- view_session_count: int | None = esdsl.mapped_field(esdsl.Long())
- cumulative_view_count: int | None = esdsl.mapped_field(esdsl.Long())
- cumulative_view_session_count: int | None = esdsl.mapped_field(esdsl.Long())
-
- # download counts of this item only (not including contained components or files)
- download_count: int | None = esdsl.mapped_field(esdsl.Long())
- download_session_count: int | None = esdsl.mapped_field(esdsl.Long())
- cumulative_download_count: int | None = esdsl.mapped_field(esdsl.Long())
- cumulative_download_session_count: int | None = esdsl.mapped_field(esdsl.Long())
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'MonthlyPublicItemUsageReport'
-
-
-class MonthlyPrivateSpamMetricsReportEs8(djelme.CyclicRecord):
- CYCLE_TIMEDEPTH = MONTHLY
-
- node_oopspam_flagged: int
- node_oopspam_hammed: int
- node_akismet_flagged: int
- node_akismet_hammed: int
- preprint_oopspam_flagged: int
- preprint_oopspam_hammed: int
- preprint_akismet_flagged: int
- preprint_akismet_hammed: int
-
- class Meta:
- timeseries_index_timedepth = YEARLY
- timeseries_recordtype_name = 'MonthlyPrivateSpamMetricsReport'
-
-
-###
-# data migration state
-
-class Elastic6To8State(djelme.SimpleRecord):
- """index for storing values helpful for keeping track of the elastic 6->8 data migration"""
- UNIQUE_TOGETHER_FIELDS = ('key',)
- key: str
- value: str | None
- timestamp: datetime.datetime = esdsl.mapped_field(
- default_factory=lambda: datetime.datetime.now(datetime.UTC),
- )
-
- @classmethod
- def get_by_key(cls, key: str):
- _response = cls.search().query({'term': {'key': key}})[0].execute()
- return _response[0] if _response else None
-
- @classmethod
- def get_timestamp(cls, key: str) -> datetime.datetime | None:
- _record = cls.get_by_key(key)
- return _record.timestamp if _record else None
-
- @classmethod
- def get_started_at(cls):
- return cls.get_timestamp('started_at')
-
- @classmethod
- def set_started_at_now(cls):
- _record = cls.record(key='started_at')
- cls.refresh()
- return _record.timestamp
diff --git a/osf/metrics/events.py b/osf/metrics/events.py
new file mode 100644
index 00000000000..e827581d2ef
--- /dev/null
+++ b/osf/metrics/events.py
@@ -0,0 +1,242 @@
+import datetime
+import enum
+import functools
+from urllib.parse import urlsplit
+
+import elasticsearch8.dsl as esdsl
+from elasticsearch_metrics import MONTHLY
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metadata.osfmap_utils import osfid_from_iri
+from osf.metrics.utils import (
+ get_database_iri,
+ get_item_type,
+ get_surrounding_osfids,
+)
+from osf import models as osfdb
+from osf.models.base import osfid_iri
+from website import settings as website_settings
+
+__all__ = (
+ 'OsfCountedUsageEvent',
+ 'RegistriesModerationEvent',
+)
+
+
+###
+# inner objects for events
+
+route_prefix_analyzer = esdsl.analyzer(  # used by the `by_prefix` subfield of PageviewInfo.route_name
+    'route_prefix_analyzer',
+    tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),  # one token per '.'-delimited prefix
+)
+
+
+class PageviewInfo(esdsl.InnerDoc):
+    """Inner document holding details about a single web-page view.
+
+    Used as `pageview_info` on an OsfCountedUsageEvent generated by viewing a web page.
+    """
+
+    # fields that should be provided by whoever records the event
+    referer_url: str | None
+    page_url: str | None
+    page_title: str | None
+    route_name: str | None = esdsl.mapped_field(esdsl.Keyword(  # keyword, plus an analyzed `by_prefix` subfield
+        fields={
+            'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer),
+        },
+    ))
+
+    # fields auto-filled by OsfCountedUsageEvent._autofill_pageview
+    page_path: str | None  # path part of `page_url`, trailing '/' stripped
+    referer_domain: str | None  # netloc part of `referer_url`
+    hour_of_day: int | None  # `hour` of the event timestamp
+
+
+###
+# Event records
+
+class OsfCountedUsageEvent(djelme.CountedUsageRecord):
+    """
+    Usage event record that aims to support a COUNTER-style reporting api
+    https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+    """
+    UNIQUE_TOGETHER_FIELDS = (
+        'platform_iri',
+        'sessionhour_id',
+        'action_labels',
+        # the entries below are non-field properties (defined on this class);
+        # their more complex logic slightly better approximates `counter:Double-Click
+        # Filtering` and allows for multiple pages describing the same item_iri
+        '_page_url_or_osfid',  # non-field property
+        '_timestamp_date',  # non-field property
+        '_timestamp_30sec_window',  # non-field property
+    )
+
+    # inherited fields:
+    #   timestamp: datetime.datetime
+    #   platform_iri: str
+    #   database_iri: str
+    #   item_iri: str
+    #   sessionhour_id: str
+    #   within_iris: list[str]
+
+    # osf-specific fields:
+    item_osfid: str
+    item_type: str
+    item_public: bool
+    provider_id: str | None
+    user_is_authenticated: bool
+    action_labels: list[str]
+    pageview_info: PageviewInfo | None
+
+    class Meta:
+        timeseries_index_timedepth = MONTHLY
+
+    class ActionLabel(enum.Enum):
+        SEARCH = 'search'  # counter:Search
+        VIEW = 'view'  # counter:Investigation
+        DOWNLOAD = 'download'  # counter:Request
+        WEB = 'web'  # counter:Regular (aka "pageview")
+        API = 'api'  # counter:TDM (aka "non-web api usage")
+
+    @classmethod
+    def record(cls, **kwargs):
+        """Record an event, defaulting `user_is_authenticated` from the `user_id` kwarg."""
+        # done here because `user_id` is discarded before `clean` could look at it
+        kwargs.setdefault('user_is_authenticated', bool(kwargs.get('user_id')))
+        return super().record(**kwargs)
+
+    @property
+    def _page_url_or_osfid(self):
+        """The viewed page's url when there is one, else `item_osfid` (for UNIQUE_TOGETHER_FIELDS)."""
+        _pageview = self.pageview_info
+        _page_url = None if _pageview is None else _pageview.page_url
+        if _page_url is not None:
+            return _page_url
+        return self.item_osfid
+
+    @property
+    def _timestamp_date(self):
+        """Calendar date of the event timestamp (for UNIQUE_TOGETHER_FIELDS)."""
+        return self.timestamp.date()
+
+    @property
+    def _timestamp_30sec_window(self):
+        """Zero-based index of the 30-second window of the day holding the timestamp.
+
+        For UNIQUE_TOGETHER_FIELDS: the day (starting at midnight, with the
+        timestamp's own tzinfo) is sliced into consecutive 30-second windows
+        and this is the position of the slice that contains `self.timestamp`.
+        """
+        _ts = self.timestamp
+        # start of the timestamp's day
+        _midnight = datetime.datetime(_ts.year, _ts.month, _ts.day, tzinfo=_ts.tzinfo)
+        _seconds_into_day = (_ts - _midnight).total_seconds()
+        return int(_seconds_into_day / 30)  # 30-second windows
+
+    @functools.cached_property
+    def _osfid_referent(self):
+        """Referent of the Guid loaded for `item_osfid`, or None (cached; used by autofill methods)."""
+        _guid = osfdb.Guid.load(self.item_osfid)
+        return None if not _guid else _guid.referent
+
+    def clean(self):
+        """Run every autofill/cleanup step in a fixed order, then the inherited `clean`."""
+        for _step in (
+            self._autofill_platform_iri, self._autofill_item_iri_and_osfid,
+            self._autofill_item_public, self._autofill_item_type,
+            self._autofill_provider_id, self._autofill_within_iris,
+            self._autofill_pageview, self._autofill_database_iri,
+            self._clean_action_labels,
+        ):
+            _step()
+        super().clean()
+
+    def _autofill_platform_iri(self):
+        if self.platform_iri is None:  # default to the website DOMAIN setting
+            self.platform_iri = website_settings.DOMAIN
+
+    def _autofill_item_iri_and_osfid(self):  # fill whichever of the pair is missing from the other
+        if self.item_osfid and not self.item_iri:
+            self.item_iri = osfid_iri(self.item_osfid)
+        elif self.item_iri and not self.item_osfid:
+            try:
+                self.item_osfid = osfid_from_iri(self.item_iri)
+            except ValueError:
+                pass  # best-effort: an iri that osfid_from_iri rejects leaves item_osfid unset
+
+    def _autofill_item_public(self):
+        """Fill `item_public` (when None) from the referent, or from its `target` when it has one."""
+        if not self.item_osfid or self.item_public is not None:
+            return
+        _referent = self._osfid_referent
+        _item = getattr(_referent, 'target', None) or _referent  # quacks like BaseFileNode: use .target
+        if hasattr(_item, 'verified_publishable'):
+            self.item_public = _item.verified_publishable  # quacks like Preprint
+        else:
+            self.item_public = getattr(_item, 'is_public', False)  # quacks like AbstractNode
+
+    def _autofill_item_type(self):  # only when there is an osfid and no item_type yet
+        if self.item_osfid and not self.item_type:
+            self.item_type = get_item_type(self._osfid_referent)
+
+    def _autofill_provider_id(self):
+        """Fill `provider_id` (when unset) from the referent's `provider`, defaulting to 'osf'."""
+        if not self.item_osfid or self.provider_id:
+            return
+        _provider = getattr(self._osfid_referent, 'provider', None)
+        if _provider is None:
+            self.provider_id = 'osf'  # quacks like Node, Comment, WikiPage
+        elif isinstance(_provider, str):
+            self.provider_id = _provider  # quacks like BaseFileNode
+        else:
+            self.provider_id = _provider._id  # quacks like Registration, Preprint, Collection
+
+    def _autofill_within_iris(self):
+        """Fill `within_iris` (when empty) from surrounding osfids; always include `item_iri`; sort."""
+        _can_autofill = self.item_osfid and (not self.within_iris) and self._osfid_referent
+        if _can_autofill:
+            self.within_iris = list(map(osfid_iri, get_surrounding_osfids(self._osfid_referent)))
+        if self.item_iri not in self.within_iris:  # ensure inclusive "within"
+            self.within_iris = [self.item_iri, *self.within_iris]
+        self.within_iris = sorted(self.within_iris)
+
+    def _autofill_pageview(self):
+        """Derive `hour_of_day`, `page_path` and `referer_domain` on `pageview_info`, if present."""
+        _pageview = self.pageview_info
+        if not _pageview:
+            return
+        _pageview.hour_of_day = self.timestamp.hour
+        if _pageview.page_url:
+            _pageview.page_path = urlsplit(_pageview.page_url).path.rstrip('/')
+        if _pageview.referer_url:
+            _pageview.referer_domain = urlsplit(_pageview.referer_url).netloc
+
+    def _autofill_database_iri(self):  # only when there is an osfid and no database_iri yet
+        if self.item_osfid and not self.database_iri:
+            self.database_iri = get_database_iri(self._osfid_referent)
+
+    def _clean_action_labels(self):  # keep labels in sorted order
+        if self.action_labels:
+            self.action_labels = sorted(self.action_labels)
+
+
+class RegistriesModerationEvent(djelme.EventRecord):  # event record with registration, trigger and from/to state fields
+    UNIQUE_TOGETHER_FIELDS = (  # every declared field except `provider_id` and `comment`, plus `timestamp`
+        'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id'
+    )
+
+    registration_id: str
+    provider_id: str
+    trigger: str
+    from_state: str
+    to_state: str
+    user_id: str
+    comment: str | None  # the only field annotated as optional
+
+    class Meta:
+        timeseries_recordtype_name = 'RegistriesModerationEvent'  # explicit recordtype name (same as the class name)
+        timeseries_index_timedepth = MONTHLY  # NOTE(review): presumably one index per month -- confirm in elasticsearch_metrics
diff --git a/osf/metrics/fields.py b/osf/metrics/fields.py
new file mode 100644
index 00000000000..a91ad40eeea
--- /dev/null
+++ b/osf/metrics/fields.py
@@ -0,0 +1,48 @@
+import datetime
+
+import elasticsearch8.dsl as esdsl
+
+from osf.metrics.utils import YearMonth
+
+
+###
+# custom elasticsearch dsl fields
+
+class YearmonthField(esdsl.Date):
+    """Date field for year-month values, mapped with the `strict_year_month` format."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, format='strict_year_month')
+
+    def deserialize(self, data):
+        """Convert a value from elasticsearch into a `YearMonth` (`None` stays `None`).
+
+        Raises ValueError when `data` cannot be interpreted as a year-month.
+        """
+        if isinstance(data, int):
+            # elasticsearch stores dates in milliseconds since the unix epoch;
+            # convert in UTC so the resulting month never depends on the server's local timezone
+            _as_datetime = datetime.datetime.fromtimestamp(data // 1000, tz=datetime.timezone.utc)
+            return YearMonth.from_date(_as_datetime)
+        elif data is None:
+            return None
+        try:
+            return YearMonth.from_any(data)
+        except ValueError:
+            raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
+
+    def serialize(self, data, skip_empty=True):
+        """Convert a str, `YearMonth`, date/datetime, or `None` into the value sent to elasticsearch.
+
+        Strings and `None` pass through unchanged; anything unrecognized raises ValueError.
+        """
+        if isinstance(data, str):
+            return data
+        elif isinstance(data, YearMonth):
+            return str(data)
+        elif isinstance(data, (datetime.datetime, datetime.date)):
+            return str(YearMonth.from_date(data))
+        elif data is None:
+            return None
+        else:
+            raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py
deleted file mode 100644
index df87d5123b1..00000000000
--- a/osf/metrics/metric_mixin.py
+++ /dev/null
@@ -1,144 +0,0 @@
-from datetime import datetime
-
-from django.db import models
-from django.utils import timezone
-from elasticsearch6.exceptions import NotFoundError
-import pytz
-
-
-class MetricMixin:
-
- @classmethod
- def _get_all_indices(cls):
- all_aliases = cls._index.get_alias()
- indices = set()
- for index, aliases in all_aliases.items():
- indices.add(index)
- if aliases['aliases']:
- for alias in aliases['aliases'].keys():
- indices.add(alias)
- return indices
-
- @classmethod
- def _get_relevant_indices(cls, after, before):
- # NOTE: This will only work for yearly indices. This logic
- # will need to be updated if we change to monthly or daily indices
- if before and after:
- year_range = range(after.year, before.year + 1)
- elif after:
- year_range = range(after.year, timezone.now().year + 1)
- else:
- # No metric data from before 2013
- year_range = range(2013, before.year + 1)
- all_indices = cls._get_all_indices()
- relevant_indices = [
- # get_index_name takes a datetime, so get Jan 1 for each relevant year
- cls.get_index_name(datetime(year, 1, 1, tzinfo=pytz.utc))
- for year in year_range
- ]
- return [index for index in relevant_indices if index in all_indices]
-
- @classmethod
- def _get_id_to_count(cls, size, metric_field, count_field, after=None, before=None):
- """Performs the elasticsearch aggregation for get_top_by_count. Return a
- dict mapping ids to summed counts. If there's no data in the ES index, return None.
- """
- search = cls.search(after=after, before=before)
- timestamp = {}
- if after:
- timestamp['gte'] = after
- if before:
- timestamp['lt'] = before
- if timestamp:
- search = search.filter('range', timestamp=timestamp)
- search.aggs.\
- bucket('by_id', 'terms', field=metric_field, size=size, order={'sum_count': 'desc'}).\
- metric('sum_count', 'sum', field=count_field)
- # Optimization: set size to 0 so that hits aren't returned (we only care about the aggregation)
- search = search.extra(size=0)
- try:
- response = search.execute()
- except NotFoundError:
- # _get_relevant_indices returned 1 or more indices
- # that doesn't exist. Fall back to unoptimized query
- search = search.index().index(cls._default_index())
- response = search.execute()
- # No indexed data
- if not hasattr(response.aggregations, 'by_id'):
- return None
- buckets = response.aggregations.by_id.buckets
- # Map _id => count
- return {
- bucket.key: int(bucket.sum_count.value)
- for bucket in buckets
- }
-
- # Overrides Document.search to only search relevant
- # indices, determined from `after`
- @classmethod
- def search(cls, using=None, index=None, after=None, before=None, *args, **kwargs):
- if not index and (before or after):
- indices = cls._get_relevant_indices(after, before)
- index = ','.join(indices)
- return super().search(using=using, index=index, *args, **kwargs)
-
- @classmethod
- def get_top_by_count(cls, qs, model_field, metric_field,
- size, order_by=None,
- count_field='count',
- annotation='metric_count',
- after=None, before=None):
- """Return a queryset annotated with the metric counts for each item.
-
- Example: ::
-
- # Get the top 10 PreprintProviders by download count
- top_providers = PreprintDownload.get_top_by_count(
- qs=PreprintProvider.objects.all(),
- model_field='_id',
- metric_field='provider_id',
- annotation='download_count',
- size=10
- )
-
- for each in top_providers:
- print('{}: {}'.format(each._id, each.download_count))
-
- ``size`` determines the number of buckets returned by the aggregation.
- If ``size=None``, the size of the queryset is used.
- WARNING: Be careful when using size=None when using a large queryset.
-
- :param QuerySet qs: The initial queryset to annotate
- :param str model_field: Model field that corresponds to ``metric_field``.
- :param str metric_field: Metric field that corresponds to ``model_field``.
- :param int size: Size of the aggregation. Also determines the size of the final
- queryset.
- :param str order_by: Field to order queryset by. If `None`, orders by
- the metric, descending.
- :param datetime after: Minimum datetime to narrow the search (inclusive).
- :param datetime before: Maximum datetime to narrow the search (exclusive).
- :param str count_field: Name of the field where count values are stored.
- :param str annotation: Name of the annotation.
- """
- id_to_count = cls._get_id_to_count(
- size=size or qs.count(),
- metric_field=metric_field,
- count_field=count_field,
- after=after,
- before=before
- )
- if id_to_count is None:
- return qs.annotate(**{annotation: models.Value(0, models.IntegerField())})
- # Annotate the queryset with the counts for each id
- # https://stackoverflow.com/a/48187723/1157536
- whens = [
- models.When(**{
- model_field: k,
- 'then': v,
- }) for k, v in id_to_count.items()
- ]
- # By default order by annotation, desc
- order_by = order_by or f'-{annotation}'
- return qs.annotate(**{
- annotation: models.Case(*whens, default=0, output_field=models.IntegerField())
- }).order_by(order_by)
diff --git a/osf/metrics/monthly_reports.py b/osf/metrics/monthly_reports.py
new file mode 100644
index 00000000000..b0c3f4a3895
--- /dev/null
+++ b/osf/metrics/monthly_reports.py
@@ -0,0 +1,200 @@
+import collections.abc
+
+import elasticsearch8.dsl as esdsl
+from elasticsearch_metrics import MONTHLY, YEARLY
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metrics.fields import YearmonthField
+from osf.metrics.utils import (
+ YearMonth,
+ cycle_coverage_yearmonth,
+)
+
+__all__ = (
+ 'BaseMonthlyReport',
+ 'MonthlyInstitutionSummaryReport',
+ 'MonthlyInstitutionalUserReport',
+ 'MonthlyPrivateSpamMetricsReport',
+ 'MonthlyPublicItemUsageReport',
+ 'MonthlySpamSummaryReport',
+)
+
+
+###
+# base class
+
+class BaseMonthlyReport(djelme.CyclicRecord):
+    """Abstract base for monthly-cycle reports, adding a `report_yearmonth` view of `cycle_coverage`."""
+    CYCLE_TIMEDEPTH = MONTHLY
+
+    class Meta:
+        abstract = True
+
+    @classmethod
+    def most_recent_cycle(cls, base_search=None) -> str | None:
+        """Return the last `cycle_coverage` (by descending key order) among matching records, or None.
+
+        `base_search` (optional) narrows which records are considered; defaults to `cls.search()`.
+        """
+        _hitless = (base_search or cls.search())[0:0]  # omit hits; only the aggregation is needed
+        _hitless.aggs.bucket(
+            'agg_most_recent_cycle', 'terms',
+            field='cycle_coverage', order={'_key': 'desc'}, size=1,
+        )
+        _aggs = _hitless.execute().aggregations
+        _buckets = _aggs.agg_most_recent_cycle.buckets if _aggs else None
+        return _buckets[0].key if _buckets else None
+
+    def __init__(self, *, report_yearmonth=None, **kwargs):
+        """Accept `report_yearmonth` as a keyword-only way to set `cycle_coverage`."""
+        super().__init__(**kwargs)
+        if report_yearmonth is None:
+            return
+        # assigned after init (not passed through kwargs) so the property setter gets used
+        self.report_yearmonth = report_yearmonth
+
+    @property
+    def report_yearmonth(self):
+        """The report's cycle as a `YearMonth`, parsed from a '.'-separated `cycle_coverage`."""
+        _year, _month = (int(_part) for _part in self.cycle_coverage.split('.'))
+        return YearMonth(_year, _month)
+
+    @report_yearmonth.setter
+    def report_yearmonth(self, ym):
+        self.cycle_coverage = cycle_coverage_yearmonth(YearMonth.from_any(ym))
+
+
+###
+# monthly reports
+
+class MonthlySpamSummaryReport(BaseMonthlyReport):  # monthly report of spam/ham/flagged integer counts
+    node_confirmed_spam: int
+    node_confirmed_ham: int
+    node_flagged: int
+    registration_confirmed_spam: int
+    registration_confirmed_ham: int
+    registration_flagged: int
+    preprint_confirmed_spam: int
+    preprint_confirmed_ham: int
+    preprint_flagged: int
+    user_marked_as_spam: int
+    user_marked_as_ham: int
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY  # NOTE(review): presumably one index per year -- confirm in elasticsearch_metrics
+
+
+class MonthlyInstitutionalUserReport(BaseMonthlyReport):  # monthly report, unique per (cycle, institution, user)
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',)
+
+    institution_id: str
+    # user info:
+    user_id: str
+    user_name: str
+    department_name: str | None
+    month_last_login = YearmonthField()  # year-month values (see osf.metrics.fields.YearmonthField)
+    month_last_active = YearmonthField()
+    account_creation_date = YearmonthField()
+    orcid_id: str | None
+    # counts:
+    public_project_count: int
+    private_project_count: int
+    public_registration_count: int
+    embargoed_registration_count: int
+    published_preprint_count: int
+    public_file_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY  # NOTE(review): presumably one index per year -- confirm in elasticsearch_metrics
+
+
+class MonthlyInstitutionSummaryReport(BaseMonthlyReport):  # monthly report, unique per (cycle, institution)
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', )
+
+    institution_id: str
+    user_count: int
+    public_project_count: int
+    private_project_count: int
+    public_registration_count: int
+    embargoed_registration_count: int
+    published_preprint_count: int
+    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+    public_file_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+    monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+    monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long())  # explicitly mapped as Long
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY  # NOTE(review): presumably one index per year -- confirm in elasticsearch_metrics
+
+
+class MonthlyPublicItemUsageReport(BaseMonthlyReport):  # monthly usage counts, unique per (cycle, item_iri)
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_iri')
+
+    # where noted, fields are meant to correspond to defined terms from COUNTER
+    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+    item_iri: str
+    item_osfids: list[str]
+    # fields built from aggregations -- more than one value unlikely, but possible
+    item_types: list[str]  # counter:Data-Type
+    platform_iris: list[str]  # counter:Platform
+    database_iris: list[str]  # counter:Database
+    provider_ids: list[str]  # osf-specific (usually corresponds to database_iri)
+
+    # view counts include views on components or files contained by this item
+    view_count: int | None = esdsl.mapped_field(esdsl.Long())
+    view_session_count: int | None = esdsl.mapped_field(esdsl.Long())
+    cumulative_view_count: int | None = esdsl.mapped_field(esdsl.Long())
+    cumulative_view_session_count: int | None = esdsl.mapped_field(esdsl.Long())
+
+    # download counts of this item only (not including contained components or files)
+    download_count: int | None = esdsl.mapped_field(esdsl.Long())
+    download_session_count: int | None = esdsl.mapped_field(esdsl.Long())
+    cumulative_download_count: int | None = esdsl.mapped_field(esdsl.Long())
+    cumulative_download_session_count: int | None = esdsl.mapped_field(esdsl.Long())
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY
+
+    @classmethod
+    def from_last_month(
+        cls, item_iris: collections.abc.Collection[str],
+    ) -> list['MonthlyPublicItemUsageReport']:
+        """Return last month's reports for `item_iris`, or the month before's if none are found."""
+        _last_month = YearMonth.from_today().prior()
+        _from_last_month = list(cls.each_from_month(item_iris, _last_month))
+        if item_iris and not _from_last_month:
+            # monthly reporters may not run immediately at the beginning of the month,
+            # but this could -- if none exist, try the month prior
+            _from_last_month = list(cls.each_from_month(item_iris, _last_month.prior()))
+        return _from_last_month
+
+    @classmethod
+    def each_from_month(
+        cls,
+        item_iris: collections.abc.Collection[str],
+        yearmonth: YearMonth,
+    ) -> collections.abc.Iterator['MonthlyPublicItemUsageReport']:
+        """Yield the reports for the given `item_iris` and `yearmonth` (nothing if `item_iris` is empty)."""
+        if not item_iris:
+            return
+        _search = cls.search().filter('term', cycle_coverage=cycle_coverage_yearmonth(yearmonth))
+        # pass `terms` a plain list: `item_iris` may be a set, which is not JSON-serializable;
+        # presumably at most one report per item_iri per cycle (UNIQUE_TOGETHER_FIELDS), so len() hits suffice
+        _search = _search.filter('terms', item_iri=list(item_iris))[:len(item_iris)]
+        yield from _search.execute()
+
+
+class MonthlyPrivateSpamMetricsReport(BaseMonthlyReport):  # monthly report of oopspam/akismet flagged/hammed integer counts
+    node_oopspam_flagged: int
+    node_oopspam_hammed: int
+    node_akismet_flagged: int
+    node_akismet_hammed: int
+    preprint_oopspam_flagged: int
+    preprint_oopspam_hammed: int
+    preprint_akismet_flagged: int
+    preprint_akismet_hammed: int
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY  # NOTE(review): presumably one index per year -- confirm in elasticsearch_metrics
diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py
deleted file mode 100644
index d284d80827e..00000000000
--- a/osf/metrics/preprint_metrics.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from elasticsearch6.exceptions import NotFoundError
-import elasticsearch_metrics.imps.elastic6 as metrics
-
-from .metric_mixin import MetricMixin
-
-
-class BasePreprintMetric(MetricMixin, metrics.Metric):
- count = metrics.Integer(doc_values=True, index=True, required=True)
- provider_id = metrics.Keyword(index=True, doc_values=True, required=True)
- user_id = metrics.Keyword(index=True, doc_values=True, required=False)
- preprint_id = metrics.Keyword(index=True, doc_values=True, required=True)
- version = metrics.Keyword(index=True, doc_values=True)
- path = metrics.Text(index=True)
-
- # TODO: locale
-
- class Index:
- settings = {
- 'number_of_shards': 1,
- 'number_of_replicas': 1,
- 'refresh_interval': '1s',
- }
-
- class Meta:
- abstract = True
- source = metrics.MetaField(enabled=True)
-
- @classmethod
- def record_for_preprint(cls, preprint, user=None, **kwargs):
- count = kwargs.pop('count', 1)
- return cls.record(
- count=count,
- preprint_id=preprint._id,
- user_id=getattr(user, '_id', None),
- provider_id=preprint.provider._id,
- **kwargs
- )
-
- @classmethod
- def get_count_for_preprint(cls, preprint, after=None, before=None, index=None) -> int:
- if preprint.version == 1:
- search = cls.search(index=index).filter('terms', preprint_id=[preprint.get_guid()._id, preprint._id])
- else:
- search = cls.search(index=index).filter('term', preprint_id=preprint._id)
- timestamp = {}
- if after:
- timestamp['gte'] = after
- if before:
- timestamp['lt'] = before
- if timestamp:
- search = search.filter('range', timestamp=timestamp)
- search.aggs.metric('sum_count', 'sum', field='count')
- # Optimization: set size to 0 so that hits aren't returned (we only care about the aggregation)
- search = search.extra(size=0)
- try:
- response = search.execute()
- except NotFoundError:
- # _get_relevant_indices returned 1 or more indices
- # that doesn't exist. Fall back to unoptimized query
- search = search.index().index(cls._default_index())
- response = search.execute()
- # No indexed data
- if not hasattr(response.aggregations, 'sum_count'):
- return 0
- return int(response.aggregations.sum_count.value)
-
-
-class PreprintView(BasePreprintMetric):
- pass
-
-
-class PreprintDownload(BasePreprintMetric):
- pass
diff --git a/osf/metrics/registry_metrics.py b/osf/metrics/registry_metrics.py
deleted file mode 100644
index 9c779fe8c0b..00000000000
--- a/osf/metrics/registry_metrics.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import elasticsearch_metrics.imps.elastic6 as metrics
-
-from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates
-from .metric_mixin import MetricMixin
-
-
-class RegistriesModerationMetrics(MetricMixin, metrics.Metric):
- registration_id = metrics.Keyword(index=True, doc_values=True, required=True)
- provider_id = metrics.Keyword(index=True, doc_values=True, required=True)
- trigger = metrics.Keyword(index=True, doc_values=True, required=True)
- from_state = metrics.Keyword(index=True, doc_values=True, required=True)
- to_state = metrics.Keyword(index=True, doc_values=True, required=True)
- user_id = metrics.Keyword(index=True, doc_values=True, required=True)
- comment = metrics.Keyword(index=True)
-
- class Index:
- settings = {
- 'number_of_shards': 1,
- 'number_of_replicas': 1,
- 'refresh_interval': '1s',
- }
-
- class Meta:
- source = metrics.MetaField(enabled=True)
-
- @classmethod
- def record_transitions(cls, action):
- return cls.record(
- registration_id=action.target._id,
- provider_id=action.target.provider._id,
- from_state=action.from_state,
- to_state=action.to_state,
- trigger=action.trigger,
- user_id=action.creator._id,
- comment=action.comment,
- )
-
- @classmethod
- def get_registries_info(cls) -> dict:
- """
- Gets metrics info for each registry
- expected output:
- {
- 'doc_count_error_upper_bound': 0,
- 'sum_other_doc_count': 0,
- 'buckets': [{
- 'key': 'osf',
- 'doc_count': 6,
- 'rejected': {'doc_count': 0},
- 'submissions': {'doc_count': 3},
- 'not_embargoed_but_accepted': {'doc_count': 0},
- 'withdrawn': {'doc_count': 0},
- 'transitions_without_comments': {'doc_count': 1},
- 'embargoed': {'doc_count': 0},
- 'transitions_with_comments': {'doc_count': 5}
- },
- {
- 'key': 'provider2',
- 'doc_count': 4,
- 'rejected': {'doc_count': 1},
- 'submissions': {'doc_count': 1},
- 'not_embargoed_but_accepted': {'doc_count': 1},
- 'withdrawn': {'doc_count': 0},
- 'transitions_without_comments': {'doc_count': 0},
- 'embargoed': {'doc_count': 0},
- 'transitions_with_comments': {'doc_count': 4}
- }]
- }
- :return: dict
- """
- search = cls.search()
-
- return search.update_from_dict({
- 'aggs': {
- 'providers': {
- 'terms': {
- 'field': 'provider_id'
- },
- 'aggs': {
- 'transitions_without_comments': {
- 'missing': {
- 'field': 'comment'
- }
- },
- 'transitions_with_comments': {
- 'filter': {
- 'exists': {
- 'field': 'comment'
- }
- }
- },
- 'submissions': {
- 'filter': {
- 'match': {
- 'trigger': {
- 'query': RegistrationModerationTriggers.SUBMIT.db_name
- }
- }
- }
- },
- 'accepted_with_embargo': {
- 'filter': {
- 'bool': {
- 'must': [
- {
- 'match': {
- 'to_state': RegistrationModerationStates.EMBARGO.db_name
- }
- },
- {
- 'match': {
- 'trigger': RegistrationModerationTriggers.SUBMIT.db_name
- }
- }
- ]
- }
- }
- },
- 'accepted_without_embargo': {
- 'filter': {
- 'bool': {
- 'must': [
- {
- 'match': {
- 'to_state': RegistrationModerationStates.ACCEPTED.db_name
- }
- },
- {
- 'match': {
- 'trigger': RegistrationModerationTriggers.SUBMIT.db_name
- }
- }
- ]
- }
- }
- },
- 'rejected': {
- 'filter': {
- 'bool': {
- 'must': [
- {
- 'match': {
- 'to_state': RegistrationModerationStates.REJECTED.db_name
- }
- },
- {
- 'match': {
- 'trigger': RegistrationModerationTriggers.REJECT_SUBMISSION.db_name
- }
- }
- ]
- }
- }
- },
- 'withdrawn': {
- 'filter': {
- 'bool': {
- 'must': [
- {
- 'match': {
- 'to_state': RegistrationModerationStates.WITHDRAWN.db_name
- }
- },
- {
- 'match': {
- 'trigger': RegistrationModerationTriggers.ACCEPT_WITHDRAWAL.db_name
- }
- }
- ]
- }
- }
- },
- }
- }
- }
- }).execute().aggregations['providers'].to_dict()
diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py
index 707e869522b..6f1d183ee6e 100644
--- a/osf/metrics/reporters/_base.py
+++ b/osf/metrics/reporters/_base.py
@@ -1,10 +1,11 @@
-from collections import abc
+import collections
import dataclasses
import logging
import celery
-from osf.metrics.reports import MonthlyReport
+from osf.metrics.daily_reports import BaseDailyReport
+from osf.metrics.monthly_reports import BaseMonthlyReport
from osf.metrics.utils import YearMonth
@@ -15,22 +16,28 @@
class MonthlyReporter:
yearmonth: YearMonth
- def iter_report_kwargs(self, continue_after: dict | None = None) -> abc.Iterator[dict]:
- # override for multiple reports per month
+ def iter_report_kwargs(self, continue_after: dict | None = None) -> collections.abc.Iterator[dict]:
+ """yield kwargs that can be passed to `report` (in separate async tasks)
+
+ by default, `report` is called once with empty kwargs
+ (override for multiple reports per month)
+ """
if continue_after is None:
- yield {} # by default, calls `.report()` once with no kwargs
+ yield {}
- def report(self, **report_kwargs) -> MonthlyReport | None:
- """build a report for the given month
+ def report(self, **report_kwargs) -> collections.abc.Iterator[BaseMonthlyReport]:
+ """yield reports for the given month and kwargs (from iter_report_kwargs)
"""
raise NotImplementedError(f'{self.__class__.__name__} must implement `report`')
def followup_task(self, report) -> celery.Signature | None:
+ """return a task signature that will be enqueued after the report is saved
+ """
return None
class DailyReporter:
- def report(self, report_date):
+ def report(self, report_date) -> collections.abc.Iterator[BaseDailyReport]:
"""build reports for the given date
return an iterable of DailyReport (unsaved)
@@ -38,9 +45,7 @@ def report(self, report_date):
raise NotImplementedError(f'{self.__class__.__name__} must implement `report`')
def run_and_record_for_date(self, report_date):
- reports = self.report(report_date)
-
# expecting each reporter to spit out only a handful of reports per day;
- # not bothering with bulk-create
- for report in reports:
+ # not bothering with bulk-create (this allows multiple types of reports)
+ for report in self.report(report_date):
report.save()
diff --git a/osf/metrics/reporters/download_count.py b/osf/metrics/reporters/download_count.py
index 4350c1440a1..7e98d24a326 100644
--- a/osf/metrics/reporters/download_count.py
+++ b/osf/metrics/reporters/download_count.py
@@ -1,22 +1,12 @@
from osf.models import PageCounter
-from osf.metrics.reports import DownloadCountReport
-from osf.metrics.es8_metrics import DailyDownloadCountReportEs8
-from osf.metrics.utils import cycle_coverage_date
+from osf.metrics.daily_reports import DailyDownloadCountReport
from ._base import DailyReporter
class DownloadCountReporter(DailyReporter):
def report(self, date):
download_count = int(PageCounter.get_all_downloads_on_date(date) or 0)
- reports = []
- report_es8 = DailyDownloadCountReportEs8(
- cycle_coverage=cycle_coverage_date(date),
- daily_file_downloads=download_count,
- )
- reports.append(report_es8)
- report = DownloadCountReport(
- daily_file_downloads=report_es8.daily_file_downloads,
+ yield DailyDownloadCountReport(
report_date=date,
+ daily_file_downloads=download_count,
)
- reports.append(report)
- return reports
diff --git a/osf/metrics/reporters/institution_summary.py b/osf/metrics/reporters/institution_summary.py
index 1148f2456e5..b34e22c1e1f 100644
--- a/osf/metrics/reporters/institution_summary.py
+++ b/osf/metrics/reporters/institution_summary.py
@@ -2,19 +2,8 @@
from django.db.models import Q
-from osf.metrics.reports import (
- InstitutionSummaryReport,
- RunningTotal,
- NodeRunningTotals,
- RegistrationRunningTotals,
-)
from osf.models import Institution
-from osf.metrics.es8_metrics import (
- DailyInstitutionSummaryReportEs8,
- RunningTotal as RunningTotalEs8,
- NodeRunningTotals as NodeRunningTotalsEs8,
- RegistrationRunningTotals as RegistrationRunningTotalsEs8
-)
+from osf.metrics.daily_reports import DailyInstitutionSummaryReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -25,7 +14,6 @@
class InstitutionSummaryReporter(DailyReporter):
def report(self, date):
institutions = Institution.objects.all()
- reports = []
daily_query = Q(created__date=date)
public_query = Q(is_public=True)
@@ -45,15 +33,15 @@ def report(self, date):
created__date__lte=date,
type='osf.registration',
)
- report_es8 = DailyInstitutionSummaryReportEs8(
+ yield DailyInstitutionSummaryReport(
cycle_coverage=cycle_coverage_date(date),
institution_id=institution._id,
institution_name=institution.name,
- users=RunningTotalEs8(
+ users=dict(
total=institution.get_institution_users().filter(is_active=True).count(),
total_daily=institution.get_institution_users().filter(date_confirmed__date=date).count(),
),
- nodes=NodeRunningTotalsEs8(
+ nodes=dict(
total=node_qs.count(),
public=node_qs.filter(public_query).count(),
private=node_qs.filter(private_query).count(),
@@ -63,7 +51,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & daily_query).count(),
),
# Projects use get_roots to remove children
- projects=NodeRunningTotalsEs8(
+ projects=dict(
total=node_qs.get_roots().count(),
public=node_qs.filter(public_query).get_roots().count(),
private=node_qs.filter(private_query).get_roots().count(),
@@ -72,7 +60,7 @@ def report(self, date):
public_daily=node_qs.filter(public_query & daily_query).get_roots().count(),
private_daily=node_qs.filter(private_query & daily_query).get_roots().count(),
),
- registered_nodes=RegistrationRunningTotalsEs8(
+ registered_nodes=dict(
total=registration_qs.count(),
public=registration_qs.filter(public_query).count(),
embargoed=registration_qs.filter(private_query).count(),
@@ -83,7 +71,7 @@ def report(self, date):
embargoed_daily=registration_qs.filter(private_query & daily_query).count(),
embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).count(),
),
- registered_projects=RegistrationRunningTotalsEs8(
+ registered_projects=dict(
total=registration_qs.get_roots().count(),
public=registration_qs.filter(public_query).get_roots().count(),
embargoed=registration_qs.filter(private_query).get_roots().count(),
@@ -96,58 +84,3 @@ def report(self, date):
private_query & daily_query & embargo_v2_query).get_roots().count(),
),
)
- reports.append(report_es8)
-
- report = InstitutionSummaryReport(
- report_date=date,
- institution_id=institution._id,
- institution_name=institution.name,
- users=RunningTotal(
- total=report_es8.users.total,
- total_daily=report_es8.users.total_daily,
- ),
- nodes=NodeRunningTotals(
- total=report_es8.nodes.total,
- public=report_es8.nodes.public,
- private=report_es8.nodes.private,
-
- total_daily=report_es8.nodes.total_daily,
- public_daily=report_es8.nodes.public_daily,
- private_daily=report_es8.nodes.private_daily,
- ),
- # Projects use get_roots to remove children
- projects=NodeRunningTotals(
- total=report_es8.projects.total,
- public=report_es8.projects.public,
- private=report_es8.projects.private,
-
- total_daily=report_es8.projects.total_daily,
- public_daily=report_es8.projects.public_daily,
- private_daily=report_es8.projects.private_daily,
- ),
- registered_nodes=RegistrationRunningTotals(
- total=report_es8.registered_nodes.total,
- public=report_es8.registered_nodes.public,
- embargoed=report_es8.registered_nodes.embargoed,
- embargoed_v2=report_es8.registered_nodes.embargoed_v2,
-
- total_daily=report_es8.registered_nodes.total_daily,
- public_daily=report_es8.registered_nodes.public_daily,
- embargoed_daily=report_es8.registered_nodes.embargoed_daily,
- embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily,
- ),
- registered_projects=RegistrationRunningTotals(
- total=report_es8.registered_projects.total,
- public=report_es8.registered_projects.public,
- embargoed=report_es8.registered_projects.embargoed,
- embargoed_v2=report_es8.registered_projects.embargoed_v2,
-
- total_daily=report_es8.registered_projects.total_daily,
- public_daily=report_es8.registered_projects.public_daily,
- embargoed_daily=report_es8.registered_projects.embargoed_daily,
- embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily,
- ),
- )
-
- reports.append(report)
- return reports
diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py
index 88d8e1fb891..559a2c2ae1a 100644
--- a/osf/metrics/reporters/institution_summary_monthly.py
+++ b/osf/metrics/reporters/institution_summary_monthly.py
@@ -4,13 +4,11 @@
from osf.models import Institution, Preprint, AbstractNode, FileVersion, NodeLog, PreprintLog
from osf.models.spam import SpamStatus
from addons.osfstorage.models import OsfStorageFile
-from osf.metrics.reports import InstitutionMonthlySummaryReport
-from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8
-from osf.metrics.utils import cycle_coverage_yearmonth
+from osf.metrics.monthly_reports import MonthlyInstitutionSummaryReport
from ._base import MonthlyReporter
class InstitutionalSummaryMonthlyReporter(MonthlyReporter):
- """Generate an InstitutionMonthlySummaryReport for each institution."""
+ """Generate a MonthlyInstitutionSummaryReport for each institution."""
def iter_report_kwargs(self, continue_after: dict | None = None):
_inst_qs = Institution.objects.order_by('pk')
@@ -20,22 +18,16 @@ def iter_report_kwargs(self, continue_after: dict | None = None):
yield {'institution_pk': _pk}
def report(self, **report_kwargs):
- _institution = Institution.objects.get(pk=report_kwargs['institution_pk'])
- reports = self.generate_report(_institution)
- return reports
-
- def generate_report(self, institution):
+ institution = Institution.objects.get(pk=report_kwargs['institution_pk'])
node_queryset = institution.nodes.filter(
deleted__isnull=True,
created__lt=self.yearmonth.month_end()
).exclude(
spam_status=SpamStatus.SPAM,
)
-
preprint_queryset = self.get_published_preprints(institution, self.yearmonth)
- reports = []
- report_es8 = MonthlyInstitutionSummaryReportEs8(
- cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
+ yield MonthlyInstitutionSummaryReport(
+ report_yearmonth=self.yearmonth,
institution_id=institution._id,
user_count=institution.get_institution_users().count(),
private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False),
@@ -48,23 +40,6 @@ def generate_report(self, institution):
monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth),
monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth),
)
- reports.append(report_es8)
-
- report = InstitutionMonthlySummaryReport(
- institution_id=report_es8.institution_id,
- user_count=report_es8.user_count,
- private_project_count=report_es8.private_project_count,
- public_project_count=report_es8.public_project_count,
- public_registration_count=report_es8.public_registration_count,
- embargoed_registration_count=report_es8.embargoed_registration_count,
- published_preprint_count=report_es8.published_preprint_count,
- storage_byte_count=report_es8.storage_byte_count,
- public_file_count=report_es8.public_file_count,
- monthly_logged_in_user_count=report_es8.monthly_logged_in_user_count,
- monthly_active_user_count=report_es8.monthly_active_user_count,
- )
- reports.append(report)
- return reports
def _get_count(self, node_queryset, node_type, is_public):
return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count()
diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py
index a9fba3adfcb..f7a745be6ce 100644
--- a/osf/metrics/reporters/institutional_users.py
+++ b/osf/metrics/reporters/institutional_users.py
@@ -6,14 +6,13 @@
from osf import models as osfdb
from osf.models.spam import SpamStatus
from addons.osfstorage.models import OsfStorageFile
-from osf.metrics.reports import InstitutionalUserReport
-from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth
-from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8
+from osf.metrics.utils import YearMonth
+from osf.metrics.monthly_reports import MonthlyInstitutionalUserReport
from ._base import MonthlyReporter
class InstitutionalUsersReporter(MonthlyReporter):
- '''build an InstitutionalUserReport for each institution-user affiliation
+ '''build a MonthlyInstitutionalUserReport for each institution-user affiliation
built for the institution dashboard at ://osf.example/institutions//dashboard/,
which offers institutional admins insight into how people at their institution are
@@ -39,7 +38,7 @@ def report(self, **report_kwargs):
_institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk'])
_user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk'])
_helper = _InstiUserReportHelper(_institution, _user, self.yearmonth)
- return _helper.build_reports()
+ yield _helper.build_report()
# helper
@@ -49,10 +48,10 @@ class _InstiUserReportHelper:
user: osfdb.OSFUser
yearmonth: YearMonth
- def build_reports(self):
+ def build_report(self):
_affiliation = self.user.get_institution_affiliation(self.institution._id)
- report_es8 = MonthlyInstitutionalUserReportEs8(
- cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
+ return MonthlyInstitutionalUserReport(
+ report_yearmonth=self.yearmonth,
institution_id=self.institution._id,
user_id=self.user._id,
user_name=self.user.fullname,
@@ -73,24 +72,6 @@ def build_reports(self):
published_preprint_count=self._published_preprint_queryset().count(),
storage_byte_count=self._storage_byte_count(),
)
- report_es6 = InstitutionalUserReport(
- institution_id=report_es8.institution_id,
- user_id=report_es8.user_id,
- user_name=report_es8.user_name,
- department_name=report_es8.department_name,
- month_last_login=report_es8.month_last_login,
- month_last_active=report_es8.month_last_active,
- account_creation_date=report_es8.account_creation_date,
- orcid_id=report_es8.orcid_id,
- public_project_count=report_es8.public_project_count,
- private_project_count=report_es8.private_project_count,
- public_registration_count=report_es8.public_registration_count,
- embargoed_registration_count=report_es8.embargoed_registration_count,
- public_file_count=report_es8.public_file_count,
- published_preprint_count=report_es8.published_preprint_count,
- storage_byte_count=report_es8.storage_byte_count,
- )
- return [report_es8, report_es6]
@property
def before_datetime(self):
diff --git a/osf/metrics/reporters/new_user_domain.py b/osf/metrics/reporters/new_user_domain.py
index 125e02754d7..8219d3e0ef3 100644
--- a/osf/metrics/reporters/new_user_domain.py
+++ b/osf/metrics/reporters/new_user_domain.py
@@ -2,8 +2,7 @@
from collections import Counter
from osf.models import OSFUser
-from osf.metrics.reports import NewUserDomainReport
-from osf.metrics.es8_metrics import DailyNewUserDomainReportEs8
+from osf.metrics.daily_reports import DailyNewUserDomainReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -22,19 +21,9 @@ def report(self, date):
email.split('@')[-1]
for email in new_user_emails
)
- reports = []
for domain_name, count in domain_names.items():
- report_es8 = DailyNewUserDomainReportEs8(
+ yield DailyNewUserDomainReport(
cycle_coverage=cycle_coverage_date(date),
domain_name=domain_name,
new_user_count=count,
)
- reports.append(report_es8)
-
- report = NewUserDomainReport(
- report_date=date,
- domain_name=report_es8.domain_name,
- new_user_count=report_es8.new_user_count,
- )
- reports.append(report)
- return reports
diff --git a/osf/metrics/reporters/node_count.py b/osf/metrics/reporters/node_count.py
index 23f4c9bb78c..48831fdc87e 100644
--- a/osf/metrics/reporters/node_count.py
+++ b/osf/metrics/reporters/node_count.py
@@ -2,16 +2,7 @@
from django.db.models import Q
-from osf.metrics.reports import (
- NodeSummaryReport,
- NodeRunningTotals,
- RegistrationRunningTotals,
-)
-from osf.metrics.es8_metrics import (
- DailyNodeSummaryReportEs8,
- NodeRunningTotals as NodeRunningTotalsEs8,
- RegistrationRunningTotals as RegistrationRunningTotalsEs8
-)
+from osf.metrics.daily_reports import DailyNodeSummaryReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -40,11 +31,10 @@ def report(self, date):
embargo_v2_query = Q(root__embargo__end_date__date__gt=date)
exclude_spam = ~Q(spam_status__in=[SpamStatus.SPAM, SpamStatus.FLAGGED])
- reports = []
- report_es8 = DailyNodeSummaryReportEs8(
+ yield DailyNodeSummaryReport(
cycle_coverage=cycle_coverage_date(date),
# Nodes - the number of projects and components
- nodes=NodeRunningTotalsEs8(
+ nodes=dict(
total=node_qs.count(),
total_excluding_spam=node_qs.filter(exclude_spam).count(),
public=node_qs.filter(public_query).count(),
@@ -55,7 +45,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & created_today_query).count(),
),
# Projects - the number of top-level only projects
- projects=NodeRunningTotalsEs8(
+ projects=dict(
total=node_qs.get_roots().count(),
total_excluding_spam=node_qs.get_roots().filter(exclude_spam).count(),
public=node_qs.filter(public_query).get_roots().count(),
@@ -66,7 +56,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & created_today_query).get_roots().count(),
),
# Registered Nodes - the number of registered projects and components
- registered_nodes=RegistrationRunningTotalsEs8(
+ registered_nodes=dict(
total=registration_qs.count(),
public=registration_qs.filter(public_query).count(),
embargoed=registration_qs.filter(private_query).count(),
@@ -80,7 +70,7 @@ def report(self, date):
),
# Registered Projects - the number of registered top level projects
- registered_projects=RegistrationRunningTotalsEs8(
+ registered_projects=dict(
total=registration_qs.get_roots().count(),
public=registration_qs.filter(public_query).get_roots().count(),
embargoed=registration_qs.filter(private_query).get_roots().count(),
@@ -93,58 +83,3 @@ def report(self, date):
withdrawn_daily=registration_qs.filter(retracted_query & retracted_today_query).get_roots().count(),
),
)
- reports.append(report_es8)
- report = NodeSummaryReport(
- report_date=date,
- # Nodes - the number of projects and components
- nodes=NodeRunningTotals(
- total=report_es8.nodes.total,
- total_excluding_spam=report_es8.nodes.total_excluding_spam,
- public=report_es8.nodes.public,
- private=report_es8.nodes.private,
- total_daily=report_es8.nodes.total_daily,
- total_daily_excluding_spam=report_es8.nodes.total_daily_excluding_spam,
- public_daily=report_es8.nodes.public_daily,
- private_daily=report_es8.nodes.private_daily,
- ),
- # Projects - the number of top-level only projects
- projects=NodeRunningTotals(
- total=report_es8.projects.total,
- total_excluding_spam=report_es8.projects.total_excluding_spam,
- public=report_es8.projects.public,
- private=report_es8.projects.private,
- total_daily=report_es8.projects.total_daily,
- total_daily_excluding_spam=report_es8.projects.total_daily_excluding_spam,
- public_daily=report_es8.projects.public_daily,
- private_daily=report_es8.projects.private_daily,
- ),
- # Registered Nodes - the number of registered projects and components
- registered_nodes=RegistrationRunningTotals(
- total=report_es8.registered_nodes.total,
- public=report_es8.registered_nodes.public,
- embargoed=report_es8.registered_nodes.embargoed,
- embargoed_v2=report_es8.registered_nodes.embargoed_v2,
- withdrawn=report_es8.registered_nodes.withdrawn,
- total_daily=report_es8.registered_nodes.total_daily,
- public_daily=report_es8.registered_nodes.public_daily,
- embargoed_daily=report_es8.registered_nodes.embargoed_daily,
- embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily,
- withdrawn_daily=report_es8.registered_nodes.withdrawn_daily,
- ),
- # Registered Projects - the number of registered top level projects
- registered_projects=RegistrationRunningTotals(
- total=report_es8.registered_projects.total,
- public=report_es8.registered_projects.public,
- embargoed=report_es8.registered_projects.embargoed,
- embargoed_v2=report_es8.registered_projects.embargoed_v2,
- withdrawn=report_es8.registered_projects.withdrawn,
- total_daily=report_es8.registered_projects.total_daily,
- public_daily=report_es8.registered_projects.public_daily,
- embargoed_daily=report_es8.registered_projects.embargoed_daily,
- embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily,
- withdrawn_daily=report_es8.registered_projects.withdrawn_daily,
- ),
- )
- reports.append(report)
-
- return reports
diff --git a/osf/metrics/reporters/osfstorage_file_count.py b/osf/metrics/reporters/osfstorage_file_count.py
index 6ddeb89945b..5db9ad1cff6 100644
--- a/osf/metrics/reporters/osfstorage_file_count.py
+++ b/osf/metrics/reporters/osfstorage_file_count.py
@@ -2,12 +2,8 @@
from django.db.models import Q
import logging
-from osf.metrics.reports import OsfstorageFileCountReport, FileRunningTotals
from osf.models import AbstractNode, Preprint
-from osf.metrics.es8_metrics import (
- DailyOsfstorageFileCountReportEs8,
- FileRunningTotals as FileRunningTotalsEs8
-)
+from osf.metrics.daily_reports import DailyOsfstorageFileCountReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -36,11 +32,9 @@ def report(self, date):
daily_query = Q(created__date=date)
- reports = []
-
- report_es8 = DailyOsfstorageFileCountReportEs8(
+ yield DailyOsfstorageFileCountReport(
cycle_coverage=cycle_coverage_date(date),
- files=FileRunningTotalsEs8(
+ files=dict(
total=file_qs.count(),
public=file_qs.filter(public_query).count(),
private=file_qs.filter(private_query).count(),
@@ -49,19 +43,3 @@ def report(self, date):
private_daily=file_qs.filter(private_query & daily_query).count(),
),
)
- reports.append(report_es8)
-
- report = OsfstorageFileCountReport(
- report_date=date,
- files=FileRunningTotals(
- total=report_es8.files.total,
- public=report_es8.files.public,
- private=report_es8.files.private,
- total_daily=report_es8.files.total_daily,
- public_daily=report_es8.files.public_daily,
- private_daily=report_es8.files.private_daily,
- ),
- )
- reports.append(report)
-
- return reports
diff --git a/osf/metrics/reporters/preprint_count.py b/osf/metrics/reporters/preprint_count.py
index 85ba639a32f..6cafa063c62 100644
--- a/osf/metrics/reporters/preprint_count.py
+++ b/osf/metrics/reporters/preprint_count.py
@@ -1,9 +1,8 @@
import logging
import requests
-from osf.metrics import PreprintSummaryReport
from website import settings
-from osf.metrics.es8_metrics import DailyPreprintSummaryReportEs8
+from osf.metrics.daily_reports import DailyPreprintSummaryReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -46,24 +45,12 @@ class PreprintCountReporter(DailyReporter):
def report(self, date):
from osf.models import PreprintProvider
- reports = []
for preprint_provider in PreprintProvider.objects.all():
elastic_query = get_elastic_query(date, preprint_provider)
resp = requests.post(f'{settings.SHARE_URL}api/v2/search/creativeworks/_search', json=elastic_query).json()
- report_es8 = DailyPreprintSummaryReportEs8(
+ yield DailyPreprintSummaryReport(
cycle_coverage=cycle_coverage_date(date),
provider_key=preprint_provider._id,
preprint_count=resp['hits']['total'],
)
- reports.append(report_es8)
-
- report = PreprintSummaryReport(
- report_date=date,
- provider_key=report_es8.provider_key,
- preprint_count=report_es8.preprint_count,
- )
- reports.append(report)
- logger.info('{} Preprints counted for the provider {}'.format(resp['hits']['total'], preprint_provider.name))
-
- return reports
diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py
index fde545247e6..c5f91206a7e 100644
--- a/osf/metrics/reporters/private_spam_metrics.py
+++ b/osf/metrics/reporters/private_spam_metrics.py
@@ -1,8 +1,6 @@
-from osf.metrics.reports import PrivateSpamMetricsReport
from osf.external.oopspam.client import OOPSpamClient
from osf.external.askismet.client import AkismetClient
-from osf.metrics.es8_metrics import MonthlyPrivateSpamMetricsReportEs8
-from osf.metrics.utils import cycle_coverage_yearmonth
+from osf.metrics.monthly_reports import MonthlyPrivateSpamMetricsReport
from ._base import MonthlyReporter
@@ -16,10 +14,8 @@ def report(self):
oopspam_client = OOPSpamClient()
akismet_client = AkismetClient()
- reports = []
-
- report_es8 = MonthlyPrivateSpamMetricsReportEs8(
- cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
+ yield MonthlyPrivateSpamMetricsReport(
+ report_yearmonth=self.yearmonth,
node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'),
node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'),
node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'),
@@ -29,19 +25,3 @@ def report(self):
preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'),
preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint')
)
- reports.append(report_es8)
-
- report = PrivateSpamMetricsReport(
- report_yearmonth=str(self.yearmonth),
- node_oopspam_flagged=report_es8.node_oopspam_flagged,
- node_oopspam_hammed=report_es8.node_oopspam_hammed,
- node_akismet_flagged=report_es8.node_akismet_flagged,
- node_akismet_hammed=report_es8.node_akismet_hammed,
- preprint_oopspam_flagged=report_es8.preprint_oopspam_flagged,
- preprint_oopspam_hammed=report_es8.preprint_oopspam_hammed,
- preprint_akismet_flagged=report_es8.preprint_akismet_flagged,
- preprint_akismet_hammed=report_es8.preprint_akismet_hammed,
- )
- reports.append(report)
-
- return reports
diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py
index 985a1213be2..0fab423f85e 100644
--- a/osf/metrics/reporters/public_item_usage.py
+++ b/osf/metrics/reporters/public_item_usage.py
@@ -2,34 +2,12 @@
import datetime
import typing
-import waffle
+from elasticsearch8 import dsl as esdsl
-from osf.metrics.es8_metrics import MonthlyPublicItemUsageReportEs8
-
-if typing.TYPE_CHECKING:
- import elasticsearch6_dsl as edsl
-
-import osf.features
from osf.metadata.osf_gathering import OsfmapPartition
-from osf.metrics.counted_usage import (
- CountedAuthUsage,
- get_provider_id,
- get_item_type as get_legacy_item_type,
-)
-from osf.metrics.preprint_metrics import (
- PreprintDownload,
- PreprintView,
-)
-from osf.metrics.reports import PublicItemUsageReport
-from osf.metrics.utils import (
- YearMonth,
- cycle_coverage_yearmonth,
- get_database_iri,
- get_item_type,
-)
-from osf import models as osfdb
-from osf.models.base import osfid_iri
-from website import settings as website_settings
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
+from osf.metrics.events import OsfCountedUsageEvent
+from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth
from ._base import MonthlyReporter
@@ -48,60 +26,20 @@ class PublicItemUsageReporter(MonthlyReporter):
includes projects, project components, registrations, registration components, and preprints
'''
def iter_report_kwargs(self, continue_after: dict | None = None):
- _after_osfid = continue_after['osfid'] if continue_after else None
- for _osfid in _zip_sorted(
- self._countedusage_osfids(_after_osfid),
- self._preprintview_osfids(_after_osfid),
- self._preprintdownload_osfids(_after_osfid),
- ):
- yield {'osfid': _osfid}
+ _after_item_iri = continue_after['item_iri'] if continue_after else None
+ for _item_iri in self._each_item_iri(_after_item_iri):
+ yield {'item_iri': _item_iri}
def report(self, **report_kwargs):
- _osfid = report_kwargs['osfid']
- # get usage metrics from several sources:
- # - osf.metrics.counted_usage:
- # - views and downloads for each item (using `CountedAuthUsage.item_guid`)
- # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`)
- # - osf.metrics.preprint_metrics:
- # - preprint views and downloads
- # - PageCounter? (no)
+ _item_iri = report_kwargs['item_iri']
try:
- _guid = osfdb.Guid.load(_osfid)
- if _guid is None or _guid.referent is None:
- raise _SkipItem
- _obj = _guid.referent
- _report = self._init_report(_obj)
- self._fill_report_counts(_report, _obj)
- if not any((
- _report.view_count,
- _report.view_session_count,
- _report.download_count,
- _report.download_session_count,
- )):
- raise _SkipItem
- _report_es6 = PublicItemUsageReport(
- item_osfid=_report.item_osfids[0],
- item_type=[get_legacy_item_type(_obj)],
- provider_id=list(_report.provider_ids),
- platform_iri=list(_report.platform_iris),
- view_count=_report.view_count,
- view_session_count=_report.view_session_count,
- download_count=_report.download_count,
- download_session_count=_report.download_session_count,
- )
- return [_report, _report_es6]
+ yield self._build_report(_item_iri)
except _SkipItem:
- return []
+ pass
def followup_task(self, report):
_last_month = YearMonth.from_date(datetime.date.today()).prior()
- if isinstance(report, MonthlyPublicItemUsageReportEs8):
- _is_last_month = (report.cycle_coverage == cycle_coverage_yearmonth(_last_month))
- elif isinstance(report, PublicItemUsageReport):
- return None # followup for only one of the two reports
- else:
- raise ValueError(report)
- if _is_last_month:
+ if report.report_yearmonth == _last_month:
from api.share.utils import task__update_share
return task__update_share.signature(
args=(report.item_osfids[0],),
@@ -112,212 +50,121 @@ def followup_task(self, report):
countdown=30, # give index time to settle
)
- def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]:
+ def _each_item_iri(self, after_item_iri: str | None) -> typing.Iterator[str]:
_search = self._base_usage_search()
_search.aggs.bucket(
- 'agg_osfid',
+ 'agg_item_iri',
'composite',
- sources=[{'osfid': {'terms': {'field': 'item_guid'}}}],
+ sources=[{'item_iri': {'terms': {'field': 'item_iri'}}}],
size=_CHUNK_SIZE,
)
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
+ return _iter_composite_bucket_keys(_search, 'agg_item_iri', 'item_iri', after=after_item_iri)
- def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]:
- _search = (
- PreprintView.search()
- .filter('range', timestamp={
- 'gte': self.yearmonth.month_start(),
- 'lt': self.yearmonth.month_end(),
- })
- .extra(size=0) # only aggregations, no hits
- )
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}],
- size=_CHUNK_SIZE,
+ def _build_report(self, item_iri) -> MonthlyPublicItemUsageReport:
+ # get usage metrics from OsfCountedUsageEvent:
+ # - views of the item and its components and files (matching `within_iris`)
+ # - downloads for each item (matching `item_iri`)
+ _search = self._build_usage_counts_search(item_iri)
+ _response = _search.execute()
+ _views_bucket = _response.aggregations.agg_by_label.buckets.views
+ _downloads_bucket = _response.aggregations.agg_by_label.buckets.downloads
+ _fields_agg = _response.aggregations.agg_for_terms
+ _report = MonthlyPublicItemUsageReport(
+ report_yearmonth=self.yearmonth,
+ item_iri=item_iri,
+ item_osfids=_bucket_keys(_fields_agg.item_osfids.buckets),
+ database_iris=_bucket_keys(_fields_agg.database_iris.buckets),
+ platform_iris=_bucket_keys(_fields_agg.platform_iris.buckets),
+ provider_ids=_bucket_keys(_fields_agg.provider_ids.buckets),
+ item_types=_bucket_keys(_fields_agg.item_types.buckets),
+ view_count=_views_bucket.doc_count,
+ view_session_count=_views_bucket.agg_session_count.value,
+ download_count=_downloads_bucket.doc_count,
+ download_session_count=_downloads_bucket.agg_session_count.value,
+ # same as non-cumulative counts, unless there's a prior report (added below)
+ cumulative_view_count=_views_bucket.doc_count,
+ cumulative_view_session_count=_views_bucket.agg_session_count.value,
+ cumulative_download_count=_downloads_bucket.doc_count,
+ cumulative_download_session_count=_downloads_bucket.agg_session_count.value,
)
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
- def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]:
- _search = (
- PreprintDownload.search()
+ _prior = self._prior_usage_report(item_iri)
+ if _prior is not None: # older reports may have null cumulative fields; count those as 0
+ _report.cumulative_view_count += _prior.cumulative_view_count or 0
+ _report.cumulative_view_session_count += _prior.cumulative_view_session_count or 0
+ _report.cumulative_download_count += _prior.cumulative_download_count or 0
+ _report.cumulative_download_session_count += _prior.cumulative_download_session_count or 0
+ return _report
+
+ def _base_usage_search(self):
+ return (
+ OsfCountedUsageEvent.search()
+ .filter('term', item_public=True)
.filter('range', timestamp={
- 'gte': self.yearmonth.month_start(),
'lt': self.yearmonth.month_end(),
+ 'gte': self.yearmonth.month_start()
})
.extra(size=0) # only aggregations, no hits
)
- _search.aggs.bucket(
- 'agg_osfid',
- 'composite',
- sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}],
- size=_CHUNK_SIZE,
- )
- return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
-
- def _init_report(self, osf_obj) -> MonthlyPublicItemUsageReportEs8:
- if not _is_item_public(osf_obj):
- raise _SkipItem
- return MonthlyPublicItemUsageReportEs8(
- cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
- item_iri=osfid_iri(osf_obj._id),
- item_osfids=[osf_obj._id],
- item_types=[get_item_type(osf_obj)],
- provider_ids=[get_provider_id(osf_obj)],
- database_iris=[get_database_iri(osf_obj)],
- platform_iris=[website_settings.DOMAIN],
- # leave counts null; will be set if there's data
- )
-
- def _fill_report_counts(self, report, osf_obj):
- if (
- isinstance(osf_obj, osfdb.Preprint)
- and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined]
- ):
- # note: no session-count info in preprint metrics
- report.view_count = PreprintView.get_count_for_preprint(
- preprint=osf_obj,
- after=self.yearmonth.month_start(),
- before=self.yearmonth.month_end(),
- )
- report.download_count = PreprintDownload.get_count_for_preprint(
- preprint=osf_obj,
- after=self.yearmonth.month_start(),
- before=self.yearmonth.month_end(),
- )
- report.cumulative_view_count = PreprintView.get_count_for_preprint(
- preprint=osf_obj,
- before=self.yearmonth.month_end(),
- )
- report.cumulative_download_count = PreprintDownload.get_count_for_preprint(
- preprint=osf_obj,
- before=self.yearmonth.month_end(),
- )
- else:
- (
- report.view_count,
- report.view_session_count,
- ) = self._countedusage_view_counts(osf_obj, cumulative=False)
- (
- report.download_count,
- report.download_session_count,
- ) = self._countedusage_download_counts(osf_obj, cumulative=False)
- (
- report.cumulative_view_count,
- report.cumulative_view_session_count,
- ) = self._countedusage_view_counts(osf_obj, cumulative=True)
-
- (
- report.cumulative_download_count,
- report.cumulative_download_session_count,
- ) = self._countedusage_download_counts(osf_obj, cumulative=True)
-
- def _base_usage_search(self, cumulative: bool = False):
- timestamp_filter = {
- 'lt': self.yearmonth.month_end(),
- }
- if not cumulative:
- timestamp_filter['gte'] = self.yearmonth.month_start()
- return (
- CountedAuthUsage.search()
- .filter('term', item_public=True)
- .filter('range', timestamp=timestamp_filter)
- .extra(size=0) # only aggregations, no hits
- )
-
- def _countedusage_view_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]:
- '''compute view_session_count separately to avoid double-counting
-
- (the same session may be represented in both the composite agg on `item_guid`
- and that on `surrounding_guids`)
+ def _build_usage_counts_search(self, item_iri, cumulative: bool = False) -> esdsl.Search:
+ '''build (without executing) the usage-counts search for the given item_iri; `cumulative` is currently unused
'''
- _search = (
- self._base_usage_search(cumulative=cumulative)
- .query(
- 'bool',
- filter=[
- {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}},
- ],
- should=[
- {'term': {'item_guid': osf_obj._id}},
- {'term': {'surrounding_guids': osf_obj._id}},
- ],
- minimum_should_match=1,
- )
- )
- _search.aggs.metric(
+ _search = self._base_usage_search().filter('term', within_iris=item_iri)
+
+ # aggregation for counts by action label (views, downloads)
+ _agg_by_label = esdsl.A('filters', filters={
+ # bucket for views (including items within)
+ 'views': {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.VIEW.value}},
+ # bucket for downloads (excluding items within)
+ 'downloads': {
+ 'bool': {
+ 'filter': [
+ {'term': {'action_labels': OsfCountedUsageEvent.ActionLabel.DOWNLOAD.value}},
+ {'term': {'item_iri': item_iri}},
+ ],
+ },
+ },
+ })
+ # session count for each label bucket
+ _agg_by_label.metric(
'agg_session_count',
'cardinality',
- field='session_id',
+ field='sessionhour_id',
precision_threshold=_MAX_CARDINALITY_PRECISION,
)
- _response = _search.execute()
- _view_count = _response.hits.total
- _view_session_count = (
- _response.aggregations.agg_session_count.value
- if 'agg_session_count' in _response.aggregations
- else 0
- )
- return (_view_count, _view_session_count)
+ _search.aggs.bucket('agg_by_label', _agg_by_label)
- def _countedusage_download_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]:
- '''aggregate downloads on each osfid (not including components/files)'''
- _search = (
- self._base_usage_search(cumulative=cumulative)
- .filter('term', item_guid=osf_obj._id)
- .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value)
- )
- # agg: get download session count
- _search.aggs.metric(
- 'agg_session_count',
- 'cardinality',
- field='session_id',
- precision_threshold=_MAX_CARDINALITY_PRECISION,
- )
- _response = _search.execute()
- _download_count = _response.hits.total
- _download_session_count = (
- _response.aggregations.agg_session_count.value
- if 'agg_session_count' in _response.aggregations
- else 0
- )
- return (_download_count, _download_session_count)
+ # aggregation for getting terms used on usage events directly on the item
+ # (excluding items within) -- usually one value per field per item, but could be more
+ _agg_for_terms = esdsl.A('filter', term={'item_iri': item_iri})
+ _agg_for_terms.bucket('item_osfids', esdsl.A('terms', field='item_osfid'))
+ _agg_for_terms.bucket('item_types', esdsl.A('terms', field='item_type'))
+ _agg_for_terms.bucket('platform_iris', esdsl.A('terms', field='platform_iri'))
+ _agg_for_terms.bucket('database_iris', esdsl.A('terms', field='database_iri'))
+ _agg_for_terms.bucket('provider_ids', esdsl.A('terms', field='provider_id'))
+ _search.aggs.bucket('agg_for_terms', _agg_for_terms)
+ return _search
-def _is_item_public(osfid_referent) -> bool:
- if isinstance(osfid_referent, osfdb.Preprint):
- return bool(osfid_referent.verified_publishable) # quacks like Preprint
- return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode
+ def _prior_usage_report(self, item_iri):
+ _search = (
+ MonthlyPublicItemUsageReport.search()
+ .filter('term', item_iri=item_iri)
+ .filter('range', cycle_coverage={
+ 'lt': cycle_coverage_yearmonth(self.yearmonth),
+ })
+ .sort('-cycle_coverage') # most recent first
+ )
+ _response = _search[0].execute()
+ return _response[0] if _response else None
-def _zip_sorted(
- *iterators: typing.Iterator[str],
-) -> typing.Iterator[str]:
- '''loop thru multiple iterators on sorted (ascending) sequences of strings
- '''
- _nexts = { # holds the next value from each iterator, or None
- _i: next(_iter, None)
- for _i, _iter in enumerate(iterators)
- }
- while True:
- _nonnull_nexts = [
- _next
- for _next in _nexts.values()
- if _next is not None
- ]
- if not _nonnull_nexts:
- return # all done
- _value = min(_nonnull_nexts)
- yield _value
- for _i, _iter in enumerate(iterators):
- if _nexts[_i] == _value:
- _nexts[_i] = next(_iter, None)
+def _bucket_keys(buckets):
+ return [_bucket['key'] for _bucket in buckets]
def _iter_composite_bucket_keys(
- search: edsl.Search,
+ search: esdsl.Search,
composite_agg_name: str,
composite_source_name: str,
after: str | None = None,
diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py
index 2fbac671ad1..50af17ab452 100644
--- a/osf/metrics/reporters/spam_count.py
+++ b/osf/metrics/reporters/spam_count.py
@@ -1,10 +1,8 @@
from osf.models import OSFUser
-from osf.metrics.reports import SpamSummaryReport
from osf.models import PreprintLog, NodeLog
from osf.models.spam import SpamStatus
-from osf.metrics.es8_metrics import MonthlySpamSummaryReportEs8
-from osf.metrics.utils import cycle_coverage_yearmonth
+from osf.metrics.monthly_reports import MonthlySpamSummaryReport
from ._base import MonthlyReporter
class SpamCountReporter(MonthlyReporter):
@@ -13,9 +11,8 @@ def report(self, **report_kwargs):
assert not report_kwargs
target_month = self.yearmonth.month_start()
next_month = self.yearmonth.month_end()
- reports = []
- report_es8 = MonthlySpamSummaryReportEs8(
- cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
+ yield MonthlySpamSummaryReport(
+ report_yearmonth=self.yearmonth,
node_confirmed_spam=NodeLog.objects.filter(
action=NodeLog.CONFIRM_SPAM,
created__gt=target_month,
@@ -81,23 +78,3 @@ def report(self, **report_kwargs):
created__lt=next_month,
).count()
)
- reports.append(report_es8)
- report = SpamSummaryReport(
- # Node Log entries
- node_confirmed_spam=report_es8.node_confirmed_spam,
- node_confirmed_ham=report_es8.node_confirmed_ham,
- node_flagged=report_es8.node_flagged,
- # Registration Log entries
- registration_confirmed_spam=report_es8.registration_confirmed_spam,
- registration_confirmed_ham=report_es8.registration_confirmed_ham,
- registration_flagged=report_es8.registration_flagged,
- # Preprint Log entries
- preprint_confirmed_spam=report_es8.preprint_confirmed_spam,
- preprint_confirmed_ham=report_es8.preprint_confirmed_ham,
- preprint_flagged=report_es8.preprint_flagged,
- # New Users marked as Spam/Ham
- user_marked_as_spam=report_es8.user_marked_as_spam,
- user_marked_as_ham=report_es8.user_marked_as_ham,
- )
- reports.append(report)
- return reports
diff --git a/osf/metrics/reporters/storage_addon_usage.py b/osf/metrics/reporters/storage_addon_usage.py
index af6dbb3ebdd..053470d578c 100644
--- a/osf/metrics/reporters/storage_addon_usage.py
+++ b/osf/metrics/reporters/storage_addon_usage.py
@@ -10,10 +10,9 @@
)
from addons.base.models import BaseOAuthUserSettings, BaseOAuthNodeSettings
-from osf.metrics.reports import StorageAddonUsage
from osf.models import SpamStatus, Tag
from website import settings
-from osf.metrics.es8_metrics import DailyStorageAddonUsageReportEs8
+from osf.metrics.daily_reports import DailyStorageAddonUsageReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -168,13 +167,7 @@ def report(self, date):
'total_daily': node_counts.get('deleted_daily', 0),
},
})
- return [
- DailyStorageAddonUsageReportEs8(
- cycle_coverage=cycle_coverage_date(date),
- usage_by_addon=_usages_by_addon,
- ),
- StorageAddonUsage(
- report_date=date,
- usage_by_addon=_usages_by_addon,
- ),
- ]
+ yield DailyStorageAddonUsageReport(
+ cycle_coverage=cycle_coverage_date(date),
+ usage_by_addon=_usages_by_addon,
+ )
diff --git a/osf/metrics/reporters/user_count.py b/osf/metrics/reporters/user_count.py
index 121b830c466..8a11b4a41d7 100644
--- a/osf/metrics/reporters/user_count.py
+++ b/osf/metrics/reporters/user_count.py
@@ -1,7 +1,6 @@
from osf.models import OSFUser
-from osf.metrics import UserSummaryReport
-from osf.metrics.es8_metrics import DailyUserSummaryReportEs8
+from osf.metrics.daily_reports import DailyUserSummaryReport
from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
@@ -9,8 +8,7 @@
class UserCountReporter(DailyReporter):
def report(self, report_date):
- reports = []
- report_es8 = DailyUserSummaryReportEs8(
+ yield DailyUserSummaryReport(
cycle_coverage=cycle_coverage_date(report_date),
active=OSFUser.objects.filter(is_active=True, date_confirmed__date__lte=report_date).count(),
deactivated=OSFUser.objects.filter(date_disabled__isnull=False, date_disabled__date__lte=report_date).count(),
@@ -19,16 +17,3 @@ def report(self, report_date):
new_users_with_institution_daily=OSFUser.objects.filter(is_active=True, date_confirmed__date=report_date, institutionaffiliation__isnull=False).count(),
unconfirmed=OSFUser.objects.filter(date_registered__date__lte=report_date, date_confirmed__isnull=True).count(),
)
- reports.append(report_es8)
- report = UserSummaryReport(
- report_date=report_date,
- active=report_es8.active,
- deactivated=report_es8.deactivated,
- merged=report_es8.merged,
- new_users_daily=report_es8.new_users_daily,
- new_users_with_institution_daily=report_es8.new_users_with_institution_daily,
- unconfirmed=report_es8.unconfirmed,
- )
- reports.append(report)
-
- return reports
diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py
deleted file mode 100644
index 62479e359cd..00000000000
--- a/osf/metrics/reports.py
+++ /dev/null
@@ -1,353 +0,0 @@
-from __future__ import annotations
-from collections import abc
-import datetime
-
-from django.dispatch import receiver
-from elasticsearch6_dsl import InnerDoc
-import elasticsearch_metrics.imps.elastic6 as metrics
-from elasticsearch_metrics.signals import pre_save as metrics_pre_save
-
-from osf.metrics.utils import stable_key, YearMonth
-
-
-class ReportInvalid(Exception):
- """Tried to save a report with invalid something-or-other
- """
- pass
-
-
-class DailyReport(metrics.Metric):
- """DailyReport (abstract base for report-based metrics)
-
- There's something we'd like to know about every so often,
- so let's regularly run a report and stash the results here.
- """
- UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_date',) # override in subclasses for multiple reports per day
-
- report_date = metrics.Date(format='strict_date', required=True)
-
- def __init_subclass__(cls, **kwargs):
- super().__init_subclass__(**kwargs)
- assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'
-
- def save(self, *args, **kwargs):
- if self.timestamp is None:
- self.timestamp = datetime.datetime(
- self.report_date.year,
- self.report_date.month,
- self.report_date.day,
- tzinfo=datetime.UTC,
- )
- super().save(*args, **kwargs)
-
- class Meta:
- abstract = True
- dynamic = metrics.MetaField('strict')
- source = metrics.MetaField(enabled=True)
-
-
-class YearmonthField(metrics.Date):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs, format='strict_year_month')
-
- def deserialize(self, data):
- if isinstance(data, int):
- # elasticsearch stores dates in milliseconds since the unix epoch
- _as_datetime = datetime.datetime.fromtimestamp(data // 1000)
- return YearMonth.from_date(_as_datetime)
- elif data is None:
- return None
- try:
- return YearMonth.from_any(data)
- except ValueError:
- raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
-
- def serialize(self, data):
- if isinstance(data, str):
- return data
- elif isinstance(data, YearMonth):
- return str(data)
- elif isinstance(data, (datetime.datetime, datetime.date)):
- return str(YearMonth.from_date(data))
- elif data is None:
- return None
- else:
- raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
-
-
-class MonthlyReport(metrics.Metric):
- """MonthlyReport (abstract base for report-based metrics that run monthly)
- """
- UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_yearmonth',) # override in subclasses for multiple reports per month
-
- report_yearmonth = YearmonthField(required=True)
-
- class Meta:
- abstract = True
- dynamic = metrics.MetaField('strict')
- source = metrics.MetaField(enabled=True)
-
- @classmethod
- def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None:
- _search = base_search or cls.search()
- _search = _search[0:0] # omit hits
- _search.aggs.bucket(
- 'agg_most_recent_yearmonth',
- 'terms',
- field='report_yearmonth',
- order={'_key': 'desc'},
- size=1,
- )
- _response = _search.execute()
- if not _response.aggregations:
- return None
-
- buckets = _response.aggregations.agg_most_recent_yearmonth.buckets
- if not buckets:
- return None
-
- return buckets[0].key
-
- def __init_subclass__(cls, **kwargs):
- super().__init_subclass__(**kwargs)
- assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'
-
- def save(self, *args, **kwargs):
- if self.timestamp is None:
- self.timestamp = YearMonth.from_any(self.report_yearmonth).month_start()
- super().save(*args, **kwargs)
-
-
-@receiver(metrics_pre_save)
-def set_report_id(sender, instance, **kwargs):
- if not issubclass(sender, metrics.Metric):
- return # skip es8 record types
- try:
- _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS
- except AttributeError:
- pass
- else:
- # Set the document id to a hash of "unique together" fields
- # for "ON CONFLICT UPDATE" behavior -- if the document
- # already exists, it will be updated rather than duplicated.
- # Cannot detect/avoid conflicts this way, but that's ok.
- _key_values = []
- for _field_name in _unique_together_fields:
- _field_value = getattr(instance, _field_name)
- if not _field_value or (
- isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str)
- ):
- raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})')
- _key_values.append(_field_value)
- instance.meta.id = stable_key(*_key_values)
-
-
-#### BEGIN reusable inner objects #####
-
-class RunningTotal(InnerDoc):
- total = metrics.Integer()
- total_daily = metrics.Integer()
-
-class FileRunningTotals(InnerDoc):
- total = metrics.Integer()
- public = metrics.Integer()
- private = metrics.Integer()
- total_daily = metrics.Integer()
- public_daily = metrics.Integer()
- private_daily = metrics.Integer()
-
-class NodeRunningTotals(InnerDoc):
- total = metrics.Integer()
- total_excluding_spam = metrics.Integer()
- public = metrics.Integer()
- private = metrics.Integer()
- total_daily = metrics.Integer()
- total_daily_excluding_spam = metrics.Integer()
- public_daily = metrics.Integer()
- private_daily = metrics.Integer()
-
-class RegistrationRunningTotals(InnerDoc):
- total = metrics.Integer()
- public = metrics.Integer()
- embargoed = metrics.Integer()
- embargoed_v2 = metrics.Integer()
- withdrawn = metrics.Integer()
- total_daily = metrics.Integer()
- public_daily = metrics.Integer()
- embargoed_daily = metrics.Integer()
- embargoed_v2_daily = metrics.Integer()
- withdrawn_daily = metrics.Integer()
-
-##### END reusable inner objects #####
-
-
-# TODO:
-# class ActiveUsersReport(DailyReport):
-# past_day = metrics.Integer()
-# past_week = metrics.Integer()
-# past_30_days = metrics.Integer()
-# past_year = metrics.Integer()
-
-
-class UsageByStorageAddon(InnerDoc):
- addon_shortname = metrics.Keyword()
-
- enabled_usersettings = metrics.Object(RunningTotal)
- linked_usersettings = metrics.Object(RunningTotal)
- deleted_usersettings = metrics.Object(RunningTotal)
- usersetting_links = metrics.Object(RunningTotal)
-
- connected_nodesettings = metrics.Object(RunningTotal)
- disconnected_nodesettings = metrics.Object(RunningTotal)
- deleted_nodesettings = metrics.Object(RunningTotal)
-
-
-class StorageAddonUsage(DailyReport):
- usage_by_addon = metrics.Object(UsageByStorageAddon, multi=True)
-
-
-class DownloadCountReport(DailyReport):
- daily_file_downloads = metrics.Integer()
-
-
-class InstitutionSummaryReport(DailyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',)
-
- institution_id = metrics.Keyword()
- institution_name = metrics.Keyword()
- users = metrics.Object(RunningTotal)
- nodes = metrics.Object(NodeRunningTotals)
- projects = metrics.Object(NodeRunningTotals)
- registered_nodes = metrics.Object(RegistrationRunningTotals)
- registered_projects = metrics.Object(RegistrationRunningTotals)
-
-
-class NewUserDomainReport(DailyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',)
-
- domain_name = metrics.Keyword()
- new_user_count = metrics.Integer()
-
-
-class NodeSummaryReport(DailyReport):
- nodes = metrics.Object(NodeRunningTotals)
- projects = metrics.Object(NodeRunningTotals)
- registered_nodes = metrics.Object(RegistrationRunningTotals)
- registered_projects = metrics.Object(RegistrationRunningTotals)
-
-
-class OsfstorageFileCountReport(DailyReport):
- files = metrics.Object(FileRunningTotals)
-
-
-class PreprintSummaryReport(DailyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',)
-
- provider_key = metrics.Keyword()
- preprint_count = metrics.Integer()
-
-
-class UserSummaryReport(DailyReport):
- active = metrics.Integer()
- deactivated = metrics.Integer()
- merged = metrics.Integer()
- new_users_daily = metrics.Integer()
- new_users_with_institution_daily = metrics.Integer()
- unconfirmed = metrics.Integer()
-
-
-class SpamSummaryReport(MonthlyReport):
- node_confirmed_spam = metrics.Integer()
- node_confirmed_ham = metrics.Integer()
- node_flagged = metrics.Integer()
- registration_confirmed_spam = metrics.Integer()
- registration_confirmed_ham = metrics.Integer()
- registration_flagged = metrics.Integer()
- preprint_confirmed_spam = metrics.Integer()
- preprint_confirmed_ham = metrics.Integer()
- preprint_flagged = metrics.Integer()
- user_marked_as_spam = metrics.Integer()
- user_marked_as_ham = metrics.Integer()
-
-
-class InstitutionalUserReport(MonthlyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',)
- institution_id = metrics.Keyword()
- # user info:
- user_id = metrics.Keyword()
- user_name = metrics.Keyword()
- department_name = metrics.Keyword()
- month_last_login = YearmonthField()
- month_last_active = YearmonthField()
- account_creation_date = YearmonthField()
- orcid_id = metrics.Keyword()
- # counts:
- public_project_count = metrics.Integer()
- private_project_count = metrics.Integer()
- public_registration_count = metrics.Integer()
- embargoed_registration_count = metrics.Integer()
- published_preprint_count = metrics.Integer()
- public_file_count = metrics.Long()
- storage_byte_count = metrics.Long()
-
-
-class InstitutionMonthlySummaryReport(MonthlyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', )
- institution_id = metrics.Keyword()
- user_count = metrics.Integer()
- public_project_count = metrics.Integer()
- private_project_count = metrics.Integer()
- public_registration_count = metrics.Integer()
- embargoed_registration_count = metrics.Integer()
- published_preprint_count = metrics.Integer()
- storage_byte_count = metrics.Long()
- public_file_count = metrics.Long()
- monthly_logged_in_user_count = metrics.Long()
- monthly_active_user_count = metrics.Long()
-
-
-class PublicItemUsageReport(MonthlyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')
-
- # where noted, fields are meant to correspond to defined terms from COUNTER
- # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
- # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
- item_osfid = metrics.Keyword() # counter:Item (or Dataset)
- item_type = metrics.Keyword(multi=True) # counter:Data-Type
- provider_id = metrics.Keyword(multi=True) # counter:Database(?)
- platform_iri = metrics.Keyword(multi=True) # counter:Platform
-
- # view counts include views on components or files contained by this item
- view_count = metrics.Long() # counter:Total Investigations
- view_session_count = metrics.Long() # counter:Unique Investigations
-
- # download counts of this item only (not including contained components or files)
- download_count = metrics.Long() # counter:Total Requests
- download_session_count = metrics.Long() # counter:Unique Requests
-
- @classmethod
- def for_last_month(cls, item_osfid: str) -> PublicItemUsageReport | None:
- _search = (
- PublicItemUsageReport.search()
- .filter('term', item_osfid=item_osfid)
- # only last month's report
- .filter('range', report_yearmonth={
- 'gte': 'now-2M/M',
- 'lt': 'now/M',
- })
- .sort('-report_yearmonth')
- [:1]
- )
- _response = _search.execute()
- return _response[0] if _response else None
-
-
-class PrivateSpamMetricsReport(MonthlyReport):
- node_oopspam_flagged = metrics.Integer()
- node_oopspam_hammed = metrics.Integer()
- node_akismet_flagged = metrics.Integer()
- node_akismet_hammed = metrics.Integer()
- preprint_oopspam_flagged = metrics.Integer()
- preprint_oopspam_hammed = metrics.Integer()
- preprint_akismet_flagged = metrics.Integer()
- preprint_akismet_hammed = metrics.Integer()
diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py
index 87b2d48f6fd..e0ea0b1f4e6 100644
--- a/osf/metrics/utils.py
+++ b/osf/metrics/utils.py
@@ -75,6 +75,36 @@ def get_item_type_from_iri(type_iri) -> str:
return _shortname
+def get_surrounding_osfids(osfid_referent):
+ """get the osfids of everything that contains the given osfid_referent
+
+ @param osfid_referent: instance of a model that has GuidMixin
+ @returns list of str, nearest wrapper first (excludes osfid_referent's own osfid)
+
+ Follows `get_immediate_wrapper` repeatedly until it returns nothing:
+ for AbstractNode, goes up the node hierarchy to the root; for WikiPage or
+ BaseFileNode, starts from the node it belongs to and goes up from there.
+ """
+ _surrounding_osfids = []
+ _current_referent = osfid_referent
+ while _current_referent:
+ next_referent = get_immediate_wrapper(_current_referent)
+ if next_referent:
+ _surrounding_osfids.append(next_referent._id)
+ _current_referent = next_referent
+ return _surrounding_osfids
+
+
+def get_immediate_wrapper(osfid_referent):
+ if hasattr(osfid_referent, 'verified_publishable'):
+ return None # quacks like Preprint
+ return (
+ getattr(osfid_referent, 'parent_node', None) # quacks like AbstractNode
+ or getattr(osfid_referent, 'node', None) # quacks like WikiPage, Comment
+ or getattr(osfid_referent, 'target', None) # quacks like BaseFileNode
+ )
+
+
@dataclasses.dataclass(frozen=True)
class YearMonth:
"""YearMonth: represents a specific month in a specific year"""
@@ -88,6 +118,11 @@ def from_date(cls, date: datetime.date) -> YearMonth:
"""construct a YearMonth from a `datetime.date` (or `datetime.datetime`)"""
return cls(date.year, date.month)
+ @classmethod
+ def from_today(cls) -> YearMonth:
+ """construct a YearMonth from today's date (per `datetime.date.today()`, i.e. local time)"""
+ return cls.from_date(datetime.date.today())
+
@classmethod
def from_str(cls, input_str: str) -> YearMonth:
"""construct a YearMonth from a string in "YYYY-MM" format"""
diff --git a/osf/models/base.py b/osf/models/base.py
index 9e6d5f502d4..65cb73d05e8 100644
--- a/osf/models/base.py
+++ b/osf/models/base.py
@@ -309,6 +309,9 @@ class Meta:
UniqueConstraint(fields=['guid', 'version'], name='unique_guid_version')
]
+ def versioned_osfid(self):
+ return f'{self.guid._id}{VersionedGuidMixin.GUID_VERSION_DELIMITER}{self.version}'
+
class BlackListGuid(BaseModel):
id = models.AutoField(primary_key=True)
@@ -553,12 +556,11 @@ def _id(self):
f'`self.versioned_guids` does not exist: [self={self.pk}, type={type(self).__name__}]'
)
return None
- guid = versioned_guid.first().guid
- version = versioned_guid.first().version
+ _current_versioned_guid = versioned_guid.first()
except IndexError as e:
sentry.log_exception(e)
return None
- return f'{guid._id}{VersionedGuidMixin.GUID_VERSION_DELIMITER}{version}'
+ return _current_versioned_guid.versioned_osfid()
@_id.setter
def _id(self, value):
diff --git a/osf/models/registrations.py b/osf/models/registrations.py
index f13489f1201..568a1a575ed 100644
--- a/osf/models/registrations.py
+++ b/osf/models/registrations.py
@@ -24,8 +24,7 @@
from osf.utils.permissions import ADMIN, READ, WRITE
from osf.exceptions import NodeStateError, DraftRegistrationStateError
from osf.external.internet_archive.tasks import archive_to_ia, update_ia_metadata
-from osf.metrics import RegistriesModerationMetrics
-from osf.metrics.es8_metrics import RegistriesModerationEventEs8
+from osf.metrics.events import RegistriesModerationEvent
from osf.models.notification_type import NotificationTypeEnum
from .action import RegistrationAction
from .archive import ArchiveJob
@@ -786,8 +785,7 @@ def _write_registration_action(self, from_state, to_state, initiated_by, comment
)
action.save()
if waffle.switch_is_active(features.ELASTICSEARCH_METRICS):
- RegistriesModerationMetrics.record_transitions(action)
- RegistriesModerationEventEs8.record(
+ RegistriesModerationEvent.record(
registration_id=action.target._id,
provider_id=action.target.provider._id,
from_state=action.from_state,
diff --git a/osf_tests/management_commands/test_monthly_reporters_go.py b/osf_tests/management_commands/test_monthly_reporters_go.py
index 505e7adf4bd..1fbac32a77d 100644
--- a/osf_tests/management_commands/test_monthly_reporters_go.py
+++ b/osf_tests/management_commands/test_monthly_reporters_go.py
@@ -2,26 +2,24 @@
from django.core.management import call_command
from django.test import TestCase
-from elasticsearch_metrics.tests.util import djelme_test_backends
+from elasticsearch_metrics.tests.util import RealElasticTestCase
from framework.celery_tasks import app as celery_app
-from osf.metrics import reports as es6_reports
-from osf.metrics.es8_metrics import (
- MonthlyInstitutionSummaryReportEs8,
- MonthlyInstitutionalUserReportEs8,
- MonthlyPrivateSpamMetricsReportEs8,
- MonthlyPublicItemUsageReportEs8,
- MonthlySpamSummaryReportEs8,
+from osf.metrics.monthly_reports import (
+ MonthlyInstitutionSummaryReport,
+ MonthlyInstitutionalUserReport,
+ MonthlyPrivateSpamMetricsReport,
+ MonthlyPublicItemUsageReport,
+ MonthlySpamSummaryReport,
)
-from osf.metrics.counted_usage import CountedAuthUsage
+from osf.metrics.events import OsfCountedUsageEvent
from osf.metrics.utils import YearMonth
from osf_tests import factories
-from website import settings as website_settings
-class TestMonthlyReportersGo(TestCase):
+class TestMonthlyReportersGo(RealElasticTestCase, TestCase):
def setUp(self):
- self.enterContext(djelme_test_backends())
+ super().setUp()
celery_app.conf.update({
'task_always_eager': True,
'task_eager_propagates': True,
@@ -33,48 +31,26 @@ def setUp(self):
_user.add_or_update_affiliated_institution(_inst)
# set up for public item usage report
_reg = factories.RegistrationFactory(is_public=True)
- CountedAuthUsage.record(
- platform_iri=website_settings.DOMAIN,
- item_guid=_reg._id,
- session_id='blarg',
- user_is_authenticated=True,
+ OsfCountedUsageEvent.record(
+ item_osfid=_reg._id,
action_labels=['view', 'web'],
+ user_id=_user._id,
)
- CountedAuthUsage._get_connection().indices.refresh(CountedAuthUsage._template_pattern)
- # TODO when switching to use es8 data
- # OsfCountedUsageEvent.record(
- # item_osfid=_preprint._id,
- # action_labels=['view', 'web'],
- # user_id=_user._id,
- # )
- # OsfCountedUsageEvent.refresh()
+ OsfCountedUsageEvent.refresh()
def test_for_smoke(self):
- self._assert_count(MonthlyInstitutionSummaryReportEs8, 0)
- self._assert_count(MonthlyInstitutionalUserReportEs8, 0)
- self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 0)
- self._assert_count(MonthlyPublicItemUsageReportEs8, 0)
- self._assert_count(MonthlySpamSummaryReportEs8, 0)
- self._assert_count(es6_reports.SpamSummaryReport, 0)
- self._assert_count(es6_reports.InstitutionalUserReport, 0)
- self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 0)
- self._assert_count(es6_reports.PublicItemUsageReport, 0)
- self._assert_count(es6_reports.PrivateSpamMetricsReport, 0)
+ self._assert_count(MonthlyInstitutionSummaryReport, 0)
+ self._assert_count(MonthlyInstitutionalUserReport, 0)
+ self._assert_count(MonthlyPrivateSpamMetricsReport, 0)
+ self._assert_count(MonthlyPublicItemUsageReport, 0)
+ self._assert_count(MonthlySpamSummaryReport, 0)
call_command('monthly_reporters_go', yearmonth=str(self._report_yearmonth))
- self._assert_count(MonthlyInstitutionSummaryReportEs8, 1)
- self._assert_count(MonthlyInstitutionalUserReportEs8, 1)
- self._assert_count(MonthlyPrivateSpamMetricsReportEs8, 1)
- self._assert_count(MonthlyPublicItemUsageReportEs8, 1)
- self._assert_count(MonthlySpamSummaryReportEs8, 1)
- self._assert_count(es6_reports.SpamSummaryReport, 1)
- self._assert_count(es6_reports.InstitutionalUserReport, 1)
- self._assert_count(es6_reports.InstitutionMonthlySummaryReport, 1)
- self._assert_count(es6_reports.PublicItemUsageReport, 1)
- self._assert_count(es6_reports.PrivateSpamMetricsReport, 1)
+ self._assert_count(MonthlyInstitutionSummaryReport, 1)
+ self._assert_count(MonthlyInstitutionalUserReport, 1)
+ self._assert_count(MonthlyPrivateSpamMetricsReport, 1)
+ self._assert_count(MonthlyPublicItemUsageReport, 1)
+ self._assert_count(MonthlySpamSummaryReport, 1)
def _assert_count(self, recordtype, expected_count):
- if hasattr(recordtype, 'refresh'):
- recordtype.refresh()
- else: # elasticsearch_metrics.imps.elastic6
- recordtype._get_connection().indices.refresh(recordtype._template_pattern)
+ recordtype.refresh()
self.assertEqual(recordtype.search().count(), expected_count)
diff --git a/osf_tests/management_commands/test_reindex_es6.py b/osf_tests/management_commands/test_reindex_es6.py
deleted file mode 100644
index 36158c18da6..00000000000
--- a/osf_tests/management_commands/test_reindex_es6.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import time
-import pytest
-from website import settings
-
-from osf.metrics import PreprintDownload
-from django.core.management import call_command
-
-from osf_tests.factories import (
- PreprintFactory,
- AuthUserFactory
-)
-
-from elasticsearch6_dsl import Keyword
-
-from tests.json_api_test_app import JSONAPITestApp
-
-from api.base import settings as django_settings
-
-
-@pytest.fixture()
-def app():
- return JSONAPITestApp()
-
-
-@pytest.mark.django_db
-class TestReindexingMetrics:
-
- @pytest.fixture()
- def preprint(self):
- return PreprintFactory()
-
- @pytest.fixture()
- def user(self):
- return AuthUserFactory()
-
- @pytest.fixture()
- def admin(self):
- user = AuthUserFactory()
- user.is_staff = True
- user.add_system_tag('preprint_metrics')
- user.save()
- return user
-
- @pytest.fixture()
- def url(self):
- return f'{settings.API_DOMAIN}_/metrics/preprints/downloads/'
-
- @pytest.mark.es
- @pytest.mark.skipif(django_settings.CI_ENV, reason='Non-deterministic fails on CI')
- def test_reindexing(self, app, url, preprint, user, admin, es6_client):
- preprint_download = PreprintDownload.record_for_preprint(
- preprint,
- user,
- version=1,
- path='/MalcolmJenkinsKnockedBrandinCooksOutColdInTheSuperBowl',
- random_new_field='Hi!' # Here's our unmapped field! It's a text field by default.
- )
- preprint_download.save()
-
- query = {
- 'aggs': {
- 'random_new_field': {
- 'terms': {
- 'field': 'random_new_field', # Oh no, this is a text field, you can't query it like that!
- }
- }
- }
- }
-
- payload = {
- 'data': {
- 'type': 'preprint_metrics',
- 'attributes': {
- 'query': query
- }
- }
- }
-
- # Hacky way to simulate a re-mapped index template
- index_template = preprint_download._index
- mapping = index_template._mapping
- mapping.properties._params['properties']['random_new_field'] = Keyword(doc_values=True, index=True)
- index_template._mapping._update_from_dict(mapping.to_dict())
-
- # This should 400 because random_new_field is still stored as a text field despite the our index being remapped.
- res = app.post_json_api(url, payload, auth=admin.auth, expect_errors=True)
- assert res.status_code == 400
- assert res.json['errors'][0]['detail'] == 'Fielddata is disabled on text fields by default. Set ' \
- 'fielddata=true on [random_new_field] in order to load' \
- ' fielddata in memory by uninverting the inverted inde' \
- 'x. Note that this can however use significant memory.' \
- ' Alternatively use a keyword field instead.'
-
- call_command('reindex_es6', f'--indices={preprint_download.meta["index"]}')
- time.sleep(2)
-
- res = app.post_json_api(url, payload, auth=admin.auth)
- assert res.status_code == 200
- assert res.json['hits']['hits'][0]['_source']['random_new_field'] == 'Hi!'
-
- # Just checking version number incremented properly
- es6_client.indices.get(f'{preprint_download.meta["index"]}_v2')
-
- # Just check it was aliased properly
- es6_client.indices.get(f'{preprint_download.meta["index"]}')
-
- call_command('reindex_es6', f'--indices={preprint_download.meta["index"]}')
- time.sleep(2)
-
- # Just checking version number incremented properly again
- es6_client.indices.get(f'{preprint_download.meta["index"]}_v3')
-
- # Just check it was aliased properly again (to the OG index, not the v2 index)
- data = es6_client.indices.get(f'{preprint_download.meta["index"]}')
-
- assert data[f'{preprint_download.meta["index"]}_v3']['aliases'] == {'osf_preprintdownload_2020': {}}
diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py
index f235488e557..23095b066ee 100644
--- a/osf_tests/metadata/test_osf_gathering.py
+++ b/osf_tests/metadata/test_osf_gathering.py
@@ -25,7 +25,7 @@
checksum_iri,
)
from osf import models as osfdb
-from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
from osf.metrics.utils import YearMonth
from osf.utils import permissions, workflows
from osf_tests import factories
@@ -799,22 +799,23 @@ def test_gather_cedar_templates(self):
def test_gather_last_month_usage(self):
# no usage report:
with mock.patch(
- 'osf.metrics.reports.PublicItemUsageReport.for_last_month',
- return_value=None,
+ 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month',
+ return_value=[],
):
assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), set())
# yes usage report:
_ym = YearMonth.from_date(datetime.datetime.now(tz=datetime.UTC))
with mock.patch(
- 'osf.metrics.reports.PublicItemUsageReport.for_last_month',
- return_value=PublicItemUsageReport(
- item_osfid=self.project._id,
+ 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month',
+ return_value=[MonthlyPublicItemUsageReport(
+ item_iri=self.project.get_semantic_iri(),
+ item_osfids=[self.project._id],
report_yearmonth=_ym,
view_count=71,
view_session_count=13,
download_count=43,
download_session_count=11,
- ),
+ )],
):
_usage_bnode = rdflib.BNode()
assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), {
diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py
index 5dc4029aaf4..369e06555a7 100644
--- a/osf_tests/metadata/test_serialized_metadata.py
+++ b/osf_tests/metadata/test_serialized_metadata.py
@@ -9,7 +9,7 @@
from osf.metadata.osf_gathering import OsfmapPartition
from osf.metadata.rdfutils import OSF, DCTERMS
from osf.metadata.tools import pls_gather_metadata_file
-from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
from osf.metrics.utils import YearMonth
from osf.models.licenses import NodeLicense
from api_tests.utils import create_test_file
@@ -311,14 +311,14 @@ def setUp(self):
'resource_type_general': 'StudyRegistration',
}, auth=self.user)
self.enterContext(mock.patch(
- 'osf.metrics.reports.PublicItemUsageReport.for_last_month',
- return_value=PublicItemUsageReport(
+ 'osf.metrics.monthly_reports.MonthlyPublicItemUsageReport.from_last_month',
+ return_value=[MonthlyPublicItemUsageReport(
report_yearmonth=YearMonth.from_date(forever_now()),
view_count=7,
view_session_count=5,
download_count=3,
download_session_count=2,
- ),
+ )],
))
self.guid_dict = {
OSF.Project: self.project._id,
diff --git a/osf_tests/metrics/reporters/_testutils.py b/osf_tests/metrics/reporters/_testutils.py
index 3275b0f1651..ef504c06a18 100644
--- a/osf_tests/metrics/reporters/_testutils.py
+++ b/osf_tests/metrics/reporters/_testutils.py
@@ -1,8 +1,9 @@
+from elasticsearch_metrics.imps.elastic8 import CyclicRecord
+
from osf.metrics.reporters._base import MonthlyReporter
-from osf.metrics.reports import MonthlyReport
-def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[MonthlyReport]:
+def list_monthly_reports(reporter: MonthlyReporter) -> list[CyclicRecord]:
_each_reports_list = (
reporter.report(**_kwargs)
for _kwargs in reporter.iter_report_kwargs()
@@ -11,5 +12,4 @@ def list_monthly_reports(reporter: MonthlyReporter, *, flat=False) -> list[Month
_report
for _reports_list in _each_reports_list
for _report in _reports_list
- if isinstance(_report, MonthlyReport) # TODO: update tests with es8
]
diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
index 02c24d86f3c..5d45056d8ec 100644
--- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
+++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
@@ -81,18 +81,18 @@ def test_report_generation(self):
reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth)
reports = list_monthly_reports(reporter)
self.assertEqual(len(reports), 1)
- report = reports[0]
- self.assertEqual(report.institution_id, self._institution._id)
- self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user
- self.assertEqual(report.public_project_count, 1)
- self.assertEqual(report.private_project_count, 1)
- self.assertEqual(report.public_registration_count, 1)
- self.assertEqual(report.embargoed_registration_count, 1)
- self.assertEqual(report.published_preprint_count, 1)
- self.assertEqual(report.storage_byte_count, 1337) # test value for one file
- self.assertEqual(report.public_file_count, 1)
- self.assertEqual(report.monthly_logged_in_user_count, 1)
- self.assertEqual(report.monthly_active_user_count, 1)
+ for report in reports:
+ self.assertEqual(report.institution_id, self._institution._id)
+ self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user
+ self.assertEqual(report.public_project_count, 1)
+ self.assertEqual(report.private_project_count, 1)
+ self.assertEqual(report.public_registration_count, 1)
+ self.assertEqual(report.embargoed_registration_count, 1)
+ self.assertEqual(report.published_preprint_count, 1)
+ self.assertEqual(report.storage_byte_count, 1337) # test value for one file
+ self.assertEqual(report.public_file_count, 1)
+ self.assertEqual(report.monthly_logged_in_user_count, 1)
+ self.assertEqual(report.monthly_active_user_count, 1)
def test_report_generation_multiple_institutions(self):
institution2 = InstitutionFactory()
@@ -118,22 +118,24 @@ def test_report_generation_multiple_institutions(self):
self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3
# Extract reports by institution
- report_institution = next(r for r in reports if r.institution_id == self._institution._id)
- report_institution2 = next(r for r in reports if r.institution_id == institution2._id)
+ _reports1 = [r for r in reports if r.institution_id == self._institution._id]
+ _reports2 = [r for r in reports if r.institution_id == institution2._id]
# Validate report for self._institution
- self.assertEqual(report_institution.public_project_count, 1)
- self.assertEqual(report_institution.private_project_count, 1)
- self.assertEqual(report_institution.user_count, 2)
- self.assertEqual(report_institution.monthly_active_user_count, 1)
- self.assertEqual(report_institution.monthly_logged_in_user_count, 1)
+ for _report in _reports1:
+ self.assertEqual(_report.public_project_count, 1)
+ self.assertEqual(_report.private_project_count, 1)
+ self.assertEqual(_report.user_count, 2)
+ self.assertEqual(_report.monthly_active_user_count, 1)
+ self.assertEqual(_report.monthly_logged_in_user_count, 1)
# Validate report for institution2
- self.assertEqual(report_institution2.public_project_count, 1)
- self.assertEqual(report_institution2.private_project_count, 0)
- self.assertEqual(report_institution2.user_count, 1)
- self.assertEqual(report_institution2.monthly_active_user_count, 1)
- self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users
+ for _report in _reports2:
+ self.assertEqual(_report.public_project_count, 1)
+ self.assertEqual(_report.private_project_count, 0)
+ self.assertEqual(_report.user_count, 1)
+ self.assertEqual(_report.monthly_active_user_count, 1)
+ self.assertEqual(_report.monthly_logged_in_user_count, 0) # No logged-in users
class TestSummaryMonthlyReporterBenchmarker(TestCase):
@@ -264,7 +266,6 @@ def test_high_counts_multiple_institutions(self):
reporter_start_time = time.time()
reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth)
reports = list_monthly_reports(reporter)
- assert len(reports) == additional_institution_count + 1
if enable_benchmarking:
reporter_end_time = time.time()
diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py
index e399d848396..a43e90d1313 100644
--- a/osf_tests/metrics/reporters/test_institutional_users_reporter.py
+++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py
@@ -8,7 +8,6 @@
from api_tests.utils import create_test_file
from osf import models as osfdb
from osf.management.commands.populate_notification_types import populate_notification_types
-from osf.metrics.reports import InstitutionalUserReport
from osf.metrics.reporters import InstitutionalUsersReporter
from osf.metrics.utils import YearMonth
from osf_tests.factories import (
@@ -48,7 +47,7 @@ def setUpTestData(cls):
)
cls._user_setup_with_stuff.fill_uncounted_objects()
- def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup):
+ def _assert_report_matches_setup(self, report, setup: _InstiUserSetup):
self.assertEqual(report.institution_id, setup.institution._id)
# user info:
self.assertEqual(report.user_id, setup.user._id)
@@ -99,10 +98,12 @@ def test_one_user_with_stuff_and_a_file(self):
_project = _user.nodes.first()
with _patch_now(self._now):
create_test_file(target=_project, user=_user, size=37)
- (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth))
- self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
- self.assertEqual(_report.public_file_count, 3) # 2 preprint files
- self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files
+ _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth))
+ self.assertEqual(len(_reports), 1)
+ for _report in _reports:
+ self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
+ self.assertEqual(_report.public_file_count, 3) # 2 preprint files + 1 project file
+ self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files + 37
def test_one_user_with_stuff_and_multiple_files(self):
self._user_setup_with_stuff.affiliate_user()
@@ -116,10 +117,12 @@ def test_one_user_with_stuff_and_multiple_files(self):
create_test_file(target=_component, user=_user, size=53, filename='bla')
create_test_file(target=_component, user=_user, size=51, filename='blar')
create_test_file(target=_component, user=_user, size=47, filename='blarg')
- (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth))
- self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
- self.assertEqual(_report.public_file_count, 7) # 2 preprint files
- self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47
+ _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth))
+ self.assertEqual(len(_reports), 1)
+ for _report in _reports:
+ self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
+ self.assertEqual(_report.public_file_count, 7) # 2 preprint files
+ self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47
def test_several_users(self):
_setups = [
diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
index 69bd266285a..2f111efb6bd 100644
--- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
+++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
@@ -1,246 +1,293 @@
from datetime import datetime, timedelta
+from functools import cached_property
from operator import attrgetter
from unittest import mock
-import pytest
+from django.test import TestCase
+from elasticsearch_metrics.tests.util import RealElasticTestCase
-from osf.metrics.counted_usage import CountedAuthUsage
-from osf.metrics.preprint_metrics import (
- PreprintDownload,
- PreprintView,
-)
+from osf.metadata.rdfutils import OSF
+from osf.metrics.events import OsfCountedUsageEvent
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter
-from osf.metrics.reports import PublicItemUsageReport
from osf.metrics.utils import YearMonth
-from osf import models as osfdb
from osf_tests import factories
from ._testutils import list_monthly_reports
-@pytest.mark.es_metrics
-@pytest.mark.django_db
-class TestPublicItemUsageReporter:
- @pytest.fixture(autouse=True)
- def _patch_settings(self):
- with mock.patch('website.settings.DOMAIN', 'http://osf.example'):
- yield
+class TestPublicItemUsageReporter(RealElasticTestCase, TestCase):
+ def setUp(self):
+ super().setUp()
+ self.enterContext(mock.patch('website.settings.DOMAIN', 'http://osf.example/'))
- @pytest.fixture
+ @cached_property
def item0(self):
_item0 = factories.PreprintFactory(is_public=True, set_guid='item0')
return _item0
- @pytest.fixture
+ @cached_property
def item1(self):
_item1 = factories.ProjectFactory(is_public=True)
_item1._id = 'item1'
return _item1
- @pytest.fixture
- def item2(self, item1):
- _item2 = factories.ProjectFactory(is_public=True, parent=item1)
+ @cached_property
+ def item2(self):
+ _item2 = factories.ProjectFactory(is_public=True, parent=self.item1)
_item2._id = 'item2'
return _item2
- @pytest.fixture
+ @cached_property
def ym_empty(self) -> YearMonth:
return YearMonth(2012, 7)
- @pytest.fixture
+ @cached_property
def ym_sparse(self) -> YearMonth:
return YearMonth(2017, 7)
- @pytest.fixture
+ @cached_property
def ym_busy(self) -> YearMonth:
return YearMonth(2023, 7)
- @pytest.fixture
- def sparse_month_usage(self, ym_sparse, item0, item1, item2):
+ def _setup_sparse_month_usage(self):
# "sparse" month:
# item0: 3 views, 0 downloads, 2 sessions
# item1: 1 views, 1 download, 1 session (plus 1 view from child item2)
# item2: 1 views, 1 download, 1 session
- _month_start = ym_sparse.month_start()
+ _month_start = self.ym_sparse.month_start()
_save_usage(
- item0,
+ self.item0,
timestamp=_month_start,
- session_id='sesh0',
+ sessionhour_id='sesh0',
action_labels=['view'],
)
_save_usage(
- item0,
+ self.item0,
timestamp=_month_start + timedelta(minutes=2),
- session_id='sesh0',
+ sessionhour_id='sesh0',
action_labels=['view'],
)
_save_usage(
- item1,
+ self.item1,
timestamp=_month_start + timedelta(minutes=3),
- session_id='sesh0',
+ sessionhour_id='sesh0',
action_labels=['download'],
)
_save_usage(
- item0,
+ self.item0,
timestamp=_month_start + timedelta(days=17),
- session_id='sesh1',
+ sessionhour_id='sesh1',
action_labels=['view'],
)
_save_usage(
- item1,
+ self.item1,
timestamp=_month_start + timedelta(days=17, minutes=3),
- session_id='sesh1',
+ sessionhour_id='sesh1',
action_labels=['view'],
)
_save_usage(
- item2,
+ self.item2,
timestamp=_month_start + timedelta(days=17, minutes=5),
- session_id='sesh1',
+ sessionhour_id='sesh1',
action_labels=['view'],
)
_save_usage(
- item2,
+ self.item2,
timestamp=_month_start + timedelta(days=17, minutes=11),
- session_id='sesh1',
+ sessionhour_id='sesh1',
action_labels=['download'],
)
- @pytest.fixture
- def busy_month_item0(self, ym_busy, item0):
+ def _setup_busy_month_item0(self):
# item0: 4 sessions, 4*7 views, 4*5 downloads
- _month_start = ym_busy.month_start()
+ _month_start = self.ym_busy.month_start()
for _sesh in range(0, 4):
_sesh_start = _month_start + timedelta(days=_sesh)
for _minute in range(0, 7):
_save_usage(
- item0,
+ self.item0,
timestamp=_sesh_start + timedelta(minutes=_minute),
- session_id=f'sesh0{_sesh}',
+ sessionhour_id=f'sesh0{_sesh}',
action_labels=['view'],
)
for _minute in range(10, 15):
_save_usage(
- item0,
+ self.item0,
timestamp=_sesh_start + timedelta(minutes=_minute),
- session_id=f'sesh0{_sesh}',
+ sessionhour_id=f'sesh0{_sesh}',
action_labels=['download'],
)
+ # plus prior report with cumulative counts:
+ # 4 views, 3 view sessions, 2 downloads, 1 download session
+ MonthlyPublicItemUsageReport.record(
+ report_yearmonth=self.ym_busy.prior(),
+ item_iri='http://osf.example/item0_v1',
+ item_osfids=['item0_v1'],
+ item_types=[OSF.Preprint],
+ platform_iris=['http://osf.example/'],
+ database_iris=[self.item0.provider.get_semantic_iri()],
+ provider_ids=[self.item0.provider._id],
+ view_count=1,
+ view_session_count=1,
+ cumulative_view_count=4,
+ cumulative_view_session_count=3,
+ download_count=2,
+ download_session_count=1,
+ cumulative_download_count=2,
+ cumulative_download_session_count=1,
+ )
- @pytest.fixture
- def busy_month_item1(self, ym_busy, item1):
+ def _setup_busy_month_item1(self):
# item1: 10 sessions, 6*9 views, 5*7 downloads
# (plus 11 views in 11 sessions from child item2)
- _month_start = ym_busy.month_start()
+ _month_start = self.ym_busy.month_start()
for _sesh in range(0, 6):
_sesh_start = _month_start + timedelta(days=_sesh)
for _minute in range(0, 9):
_save_usage(
- item1,
+ self.item1,
timestamp=_sesh_start + timedelta(minutes=_minute),
- session_id=f'sesh1{_sesh}',
+ sessionhour_id=f'sesh1{_sesh}',
action_labels=['view'],
)
for _sesh in range(5, 10):
_sesh_start = _month_start + timedelta(days=_sesh)
for _minute in range(10, 17):
_save_usage(
- item1,
+ self.item1,
timestamp=_sesh_start + timedelta(minutes=_minute),
- session_id=f'sesh1{_sesh}',
+ sessionhour_id=f'sesh1{_sesh}',
action_labels=['download'],
)
- @pytest.fixture
- def busy_month_item2(self, ym_busy, item2):
+ def _setup_busy_month_item2(self):
# item2: 11 sessions, 11 views, 11 downloads (child of item1)
- _month_start = ym_busy.month_start()
+ _month_start = self.ym_busy.month_start()
for _sesh in range(1, 12):
_save_usage(
- item2,
+ self.item2,
timestamp=_month_start + timedelta(days=_sesh),
- session_id=f'sesh2{_sesh}',
+ sessionhour_id=f'sesh2{_sesh}',
action_labels=['view'],
)
_save_usage(
- item2,
+ self.item2,
timestamp=_month_start + timedelta(days=_sesh, hours=_sesh),
- session_id=f'sesh2{_sesh}',
+ sessionhour_id=f'sesh2{_sesh}',
action_labels=['download'],
)
- def test_no_data(self, ym_empty):
- _reporter = PublicItemUsageReporter(ym_empty)
+ def test_no_data(self):
+ _reporter = PublicItemUsageReporter(self.ym_empty)
_empty = list_monthly_reports(_reporter)
assert _empty == []
- def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0):
- _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty))
- _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse))
- _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy))
+ def test_reporter(self):
+ self._setup_sparse_month_usage()
+ self._setup_busy_month_item0()
+ self._setup_busy_month_item1()
+ self._setup_busy_month_item2()
+ OsfCountedUsageEvent.refresh()
+
+ _empty = list_monthly_reports(PublicItemUsageReporter(self.ym_empty))
+ _sparse = list_monthly_reports(PublicItemUsageReporter(self.ym_sparse))
+ _busy = list_monthly_reports(PublicItemUsageReporter(self.ym_busy))
# empty month:
assert _empty == []
# sparse month:
assert len(_sparse) == 3
- _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_osfid'))
+ _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_iri'))
# sparse-month item0
- assert isinstance(_sparse_item0, PublicItemUsageReport)
- assert _sparse_item0.item_osfid == 'item0_v1'
- assert _sparse_item0.provider_id == [item0.provider._id]
- assert _sparse_item0.platform_iri == ['http://osf.example']
+ assert isinstance(_sparse_item0, MonthlyPublicItemUsageReport)
+ assert _sparse_item0.item_iri == 'http://osf.example/item0_v1'
+ assert _sparse_item0.item_osfids == ['item0_v1']
+ assert _sparse_item0.provider_ids == [self.item0.provider._id]
+ assert _sparse_item0.platform_iris == ['http://osf.example']
assert _sparse_item0.view_count == 3
- assert _sparse_item0.view_session_count is None # no session count for preprints
+ assert _sparse_item0.view_session_count == 2
assert _sparse_item0.download_count == 0
- assert _sparse_item0.download_session_count is None # no session count for preprints
+ assert _sparse_item0.download_session_count == 0
+ assert _sparse_item0.cumulative_view_count == 3
+ assert _sparse_item0.cumulative_view_session_count == 2
+ assert _sparse_item0.cumulative_download_count == 0
+ assert _sparse_item0.cumulative_download_session_count == 0
# sparse-month item1
- assert isinstance(_sparse_item1, PublicItemUsageReport)
- assert _sparse_item1.item_osfid == 'item1'
- assert _sparse_item1.provider_id == ['osf']
- assert _sparse_item1.platform_iri == ['http://osf.example']
+ assert isinstance(_sparse_item1, MonthlyPublicItemUsageReport)
+ assert _sparse_item1.item_iri == 'http://osf.example/item1'
+ assert _sparse_item1.item_osfids == ['item1']
+ assert _sparse_item1.provider_ids == ['osf']
+ assert _sparse_item1.platform_iris == ['http://osf.example']
assert _sparse_item1.view_count == 2 # including item2
assert _sparse_item1.view_session_count == 1 # including item2
assert _sparse_item1.download_count == 1 # NOT including item2
assert _sparse_item1.download_session_count == 1 # NOT including item2
+ assert _sparse_item1.cumulative_view_count == 2
+ assert _sparse_item1.cumulative_view_session_count == 1
+ assert _sparse_item1.cumulative_download_count == 1
+ assert _sparse_item1.cumulative_download_session_count == 1
# sparse-month item2
- assert isinstance(_sparse_item1, PublicItemUsageReport)
- assert _sparse_item2.item_osfid == 'item2'
- assert _sparse_item2.provider_id == ['osf']
- assert _sparse_item2.platform_iri == ['http://osf.example']
+ assert isinstance(_sparse_item2, MonthlyPublicItemUsageReport)
+ assert _sparse_item2.item_iri == 'http://osf.example/item2'
+ assert _sparse_item2.item_osfids == ['item2']
+ assert _sparse_item2.provider_ids == ['osf']
+ assert _sparse_item2.platform_iris == ['http://osf.example']
assert _sparse_item2.view_count == 1
assert _sparse_item2.view_session_count == 1
assert _sparse_item2.download_count == 1
assert _sparse_item2.download_session_count == 1
+ assert _sparse_item2.cumulative_view_count == 1
+ assert _sparse_item2.cumulative_view_session_count == 1
+ assert _sparse_item2.cumulative_download_count == 1
+ assert _sparse_item2.cumulative_download_session_count == 1
# busy month:
assert len(_busy) == 3
- _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_osfid'))
- # busy-month item0
- assert isinstance(_busy_item0, PublicItemUsageReport)
- assert _busy_item0.item_osfid == 'item0_v1'
- assert _busy_item0.provider_id == [item0.provider._id]
- assert _busy_item0.platform_iri == ['http://osf.example']
+ _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_iri'))
+ # busy-month item0 (plus prior-month report)
+ assert isinstance(_busy_item0, MonthlyPublicItemUsageReport)
+ assert _busy_item0.item_iri == 'http://osf.example/item0_v1'
+ assert _busy_item0.item_osfids == ['item0_v1']
+ assert _busy_item0.provider_ids == [self.item0.provider._id]
+ assert _busy_item0.platform_iris == ['http://osf.example']
assert _busy_item0.view_count == 4 * 7
- assert _busy_item0.view_session_count is None # no session count for preprints
+ assert _busy_item0.view_session_count == 4
assert _busy_item0.download_count == 4 * 5
- assert _busy_item0.download_session_count is None # no session count for preprints
+ assert _busy_item0.download_session_count == 4
+ # plus values from prior report:
+ assert _busy_item0.cumulative_view_count == (4 * 7) + 4
+ assert _busy_item0.cumulative_view_session_count == 4 + 3
+ assert _busy_item0.cumulative_download_count == (4 * 5) + 2
+ assert _busy_item0.cumulative_download_session_count == 4 + 1
# busy-month item1
- assert isinstance(_busy_item1, PublicItemUsageReport)
- assert _busy_item1.item_osfid == 'item1'
- assert _busy_item1.provider_id == ['osf']
- assert _busy_item1.platform_iri == ['http://osf.example']
+ assert isinstance(_busy_item1, MonthlyPublicItemUsageReport)
+ assert _busy_item1.item_iri == 'http://osf.example/item1'
+ assert _busy_item1.item_osfids == ['item1']
+ assert _busy_item1.provider_ids == ['osf']
+ assert _busy_item1.platform_iris == ['http://osf.example']
assert _busy_item1.view_count == 6 * 9 + 11
assert _busy_item1.view_session_count == 6 + 11
assert _busy_item1.download_count == 5 * 7
assert _busy_item1.download_session_count == 5
+ assert _busy_item1.cumulative_view_count == 6 * 9 + 11
+ assert _busy_item1.cumulative_view_session_count == 6 + 11
+ assert _busy_item1.cumulative_download_count == 5 * 7
+ assert _busy_item1.cumulative_download_session_count == 5
# busy-month item2
- assert isinstance(_busy_item2, PublicItemUsageReport)
- assert _busy_item2.item_osfid == 'item2'
- assert _busy_item2.provider_id == ['osf']
- assert _busy_item2.platform_iri == ['http://osf.example']
+ assert isinstance(_busy_item2, MonthlyPublicItemUsageReport)
+ assert _busy_item2.item_iri == 'http://osf.example/item2'
+ assert _busy_item2.item_osfids == ['item2']
+ assert _busy_item2.provider_ids == ['osf']
+ assert _busy_item2.platform_iris == ['http://osf.example']
assert _busy_item2.view_count == 11
assert _busy_item2.view_session_count == 11
assert _busy_item2.download_count == 11
assert _busy_item2.download_session_count == 11
+ assert _busy_item2.cumulative_view_count == 11
+ assert _busy_item2.cumulative_view_session_count == 11
+ assert _busy_item2.cumulative_download_count == 11
+ assert _busy_item2.cumulative_download_session_count == 11
def _save_usage(
@@ -252,32 +299,9 @@ def _save_usage(
):
_countedusage_kwargs = {
'timestamp': timestamp,
- 'item_guid': item._id,
+ 'item_osfid': item._id,
'action_labels': action_labels,
'platform_iri': 'http://osf.example',
**kwargs,
}
- CountedAuthUsage(**_countedusage_kwargs).save(refresh=True)
- if isinstance(item, osfdb.Preprint):
- if 'view' in action_labels:
- _save_preprint_view(item, timestamp)
- if 'download' in action_labels:
- _save_preprint_download(item, timestamp)
-
-
-def _save_preprint_view(preprint, timestamp):
- PreprintView(
- timestamp=timestamp,
- count=1,
- preprint_id=preprint._id,
- provider_id=preprint.provider._id,
- ).save(refresh=True)
-
-
-def _save_preprint_download(preprint, timestamp):
- PreprintDownload(
- timestamp=timestamp,
- count=1,
- preprint_id=preprint._id,
- provider_id=preprint.provider._id,
- ).save(refresh=True)
+ OsfCountedUsageEvent.record(**_countedusage_kwargs)
diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py
deleted file mode 100644
index 5228e2342c5..00000000000
--- a/osf_tests/metrics/test_daily_report.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import datetime
-from unittest import mock
-
-import pytest
-import elasticsearch_metrics.imps.elastic6 as metrics
-
-from osf.metrics.reports import DailyReport, ReportInvalid
-
-
-class TestDailyReportKey:
- @pytest.fixture
- def mock_save(self):
- with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
-
- def test_default(self, mock_save):
- # only one of this type of report per day
- class UniqueByDate(DailyReport):
- blah = metrics.Keyword()
-
- class Meta:
- app_label = 'osf'
-
- today = datetime.date(2022, 5, 18)
- expected_timestamp = datetime.datetime(
- today.year,
- today.month,
- today.day,
- tzinfo=datetime.UTC,
- )
-
- reports = [
- UniqueByDate(report_date=today),
- UniqueByDate(report_date=today, blah='blah'),
- UniqueByDate(report_date=today, blah='fleh'),
- ]
- expected_key = '6fe48593af0f9d34159616759bd4678f383c912fdff3e8a338c51ecb1cf9d0d5'
-
- for report in reports:
- report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is report
- assert report.meta.id == expected_key
- assert report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- def test_with_unique_together(self, mock_save):
- # multiple reports of this type per day, unique by given field
- class UniqueByDateAndField(DailyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',)
- uniquefield = metrics.Keyword()
-
- class Meta:
- app_label = 'osf'
-
- today = datetime.date(2022, 5, 18)
- expected_timestamp = datetime.datetime(
- today.year,
- today.month,
- today.day,
- tzinfo=datetime.UTC,
- )
-
- expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151'
- blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah')
- blah_report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is blah_report
- assert blah_report.meta.id == expected_blah
- assert blah_report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99'
- fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh')
- fleh_report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is fleh_report
- assert fleh_report.meta.id == expected_fleh
- assert fleh_report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- for _bad_report in (
- UniqueByDateAndField(report_date=today),
- UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']),
- ):
- with pytest.raises(ReportInvalid):
- _bad_report.save()
diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py
index 1560558abe1..a7312508a92 100644
--- a/osf_tests/metrics/test_es8_metrics.py
+++ b/osf_tests/metrics/test_es8_metrics.py
@@ -1,23 +1,15 @@
import datetime
-from elasticsearch_metrics.tests.util import djelme_test_backends
-import pytest
+from django.test import TestCase
+from elasticsearch_metrics.tests.util import RealElasticTestCase
-from osf.metrics.es8_metrics import (
- PageviewInfo,
- DailyDownloadCountReportEs8,
- OsfCountedUsageEvent,
-)
+from osf.metrics.daily_reports import DailyDownloadCountReport
+from osf.metrics.events import OsfCountedUsageEvent
-class TestEs8Metrics:
+class TestEs8Metrics(RealElasticTestCase, TestCase):
"""smoke tests to check that djelme records can be saved and searched"""
- @pytest.fixture(autouse=True)
- def _real_elastic(self):
- with djelme_test_backends():
- yield
-
def test_nested_pageview_autofill(self):
usage = OsfCountedUsageEvent.record(
timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC),
@@ -29,12 +21,12 @@ def test_nested_pageview_autofill(self):
item_type='Preprint',
platform_iri='https://osf.example',
user_is_authenticated=False,
- pageview_info=PageviewInfo(
- page_url='https://example.com/path/test',
- referer_url='https://google.com',
- route_name='foo.bar',
- page_title='title title',
- ),
+ pageview_info={
+ 'page_url': 'https://example.com/path/test',
+ 'referer_url': 'https://google.com',
+ 'route_name': 'foo.bar',
+ 'page_title': 'title title',
+ },
)
assert usage.pageview_info.page_path == '/path/test'
assert usage.pageview_info.referer_domain == 'google.com'
@@ -80,12 +72,12 @@ def test_none_pageview_nested_autofill(self):
assert usage.item_iri in usage.within_iris
def test_save_report(self):
- _saved = DailyDownloadCountReportEs8.record(
+ _saved = DailyDownloadCountReport.record(
cycle_coverage='2026.1.1',
daily_file_downloads=17,
)
- DailyDownloadCountReportEs8.refresh()
- _response = DailyDownloadCountReportEs8.search().execute()
+ DailyDownloadCountReport.refresh()
+ _response = DailyDownloadCountReport.search().execute()
(_fetched,) = _response
assert _fetched.meta.id == _saved.meta.id
assert _fetched.cycle_coverage == '2026.1.1'
diff --git a/osf_tests/metrics/test_metric_mixin.py b/osf_tests/metrics/test_metric_mixin.py
deleted file mode 100644
index ec9b2d302de..00000000000
--- a/osf_tests/metrics/test_metric_mixin.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from unittest import mock
-import pytest
-import elasticsearch_metrics.imps.elastic6 as metrics
-
-from osf.metrics.metric_mixin import MetricMixin
-from osf.models import OSFUser
-from osf_tests.factories import UserFactory
-
-class DummyMetric(MetricMixin, metrics.Metric):
- count = metrics.Integer(doc_values=True, index=True, required=True)
- user_id = metrics.Keyword(index=True, doc_values=True, required=False)
-
- class Meta:
- app_label = 'osf'
-
-@pytest.mark.django_db
-@mock.patch.object(DummyMetric, '_get_id_to_count')
-def test_get_top_by_count(mock_get_id_to_count):
- user1, user2 = UserFactory(), UserFactory()
- mock_get_id_to_count.return_value = {
- user1._id: 41,
- user2._id: 42,
- }
-
- metric_qs = DummyMetric.get_top_by_count(
- qs=OSFUser.objects.all(),
- model_field='guids___id',
- metric_field='user_id',
- annotation='dummies',
- size=None,
- )
-
- annotated_user = metric_qs.first()
- assert annotated_user._id == user2._id
- assert annotated_user.dummies == 42
diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py
deleted file mode 100644
index ba981e997d6..00000000000
--- a/osf_tests/metrics/test_monthly_report.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import datetime
-from unittest import mock
-
-import pytest
-import elasticsearch_metrics.imps.elastic6 as metrics
-
-from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport
-from osf.metrics.utils import YearMonth
-
-
-class TestMonthlyReportKey:
- @pytest.fixture
- def mock_save(self):
- with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
-
- def test_default(self, mock_save):
- # only one of this type of report per month
- class UniqueByMonth(MonthlyReport):
- blah = metrics.Keyword()
-
- class Meta:
- app_label = 'osf'
-
- yearmonth = YearMonth(2022, 5)
- expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC)
-
- reports = [
- UniqueByMonth(report_yearmonth=yearmonth),
- UniqueByMonth(report_yearmonth=yearmonth, blah='blah'),
- UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'),
- ]
- expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2'
-
- for report in reports:
- report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is report
- assert report.meta.id == expected_key
- assert report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- def test_with_unique_together(self, mock_save):
- # multiple reports of this type per day, unique by given field
- class UniqueByMonthAndField(MonthlyReport):
- UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',)
- uniquefield = metrics.Keyword()
-
- class Meta:
- app_label = 'osf'
-
- yearmonth = YearMonth(2022, 5)
- expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC)
-
- expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08'
- blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah')
- blah_report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is blah_report
- assert blah_report.meta.id == expected_blah
- assert blah_report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f'
- fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh')
- fleh_report.save()
- assert mock_save.call_count == 1
- assert mock_save.call_args[0][0] is fleh_report
- assert fleh_report.meta.id == expected_fleh
- assert fleh_report.timestamp == expected_timestamp
- mock_save.reset_mock()
-
- for _bad_report in (
- UniqueByMonthAndField(report_yearmonth=yearmonth),
- UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']),
- ):
- with pytest.raises(ReportInvalid):
- _bad_report.save()
-
-
-@pytest.mark.es_metrics
-@pytest.mark.django_db
-class TestLastMonthReport:
- @pytest.fixture
- def osfid(self):
- return 'abced'
-
- @pytest.fixture
- def this_month(self):
- return YearMonth.from_date(datetime.date.today())
-
- @pytest.fixture
- def last_month(self, this_month):
- return _prior_yearmonth(this_month)
-
- @pytest.fixture
- def two_months_back(self, last_month):
- return _prior_yearmonth(last_month)
-
- @pytest.fixture
- def three_months_back(self, two_months_back):
- return _prior_yearmonth(two_months_back)
-
- @pytest.fixture
- def this_month_report(self, osfid, this_month):
- return _item_usage_report(this_month, osfid, view_count=77)
-
- @pytest.fixture
- def last_month_report(self, osfid, last_month):
- return _item_usage_report(last_month, osfid, view_count=57)
-
- @pytest.fixture
- def diff_last_month_report(self, last_month):
- return _item_usage_report(last_month, 'zyxvt', view_count=17)
-
- @pytest.fixture
- def two_months_back_report(self, osfid, two_months_back):
- return _item_usage_report(two_months_back, osfid, view_count=27)
-
- @pytest.fixture
- def three_months_back_report(self, osfid, three_months_back):
- return _item_usage_report(three_months_back, osfid, view_count=37)
-
- def test_with_none(self, osfid):
- assert PublicItemUsageReport.for_last_month(osfid) is None
-
- def test_with_others(self, osfid, this_month_report, three_months_back_report, diff_last_month_report):
- assert PublicItemUsageReport.for_last_month(osfid) is None
-
- def test_with_prior_month(self, osfid, this_month_report, two_months_back_report, three_months_back_report, diff_last_month_report):
- assert PublicItemUsageReport.for_last_month(osfid) == two_months_back_report
-
- def test_with_last_month(self, osfid, this_month_report, last_month_report, two_months_back_report, three_months_back_report, diff_last_month_report):
- assert PublicItemUsageReport.for_last_month(osfid) == last_month_report
-
-
-def _prior_yearmonth(ym: YearMonth) -> YearMonth:
- return (
- YearMonth(ym.year - 1, 12)
- if ym.month == 1
- else YearMonth(ym.year, ym.month - 1)
- )
-
-
-def _item_usage_report(ym: YearMonth, osfid: str, **kwargs):
- _report = PublicItemUsageReport(
- report_yearmonth=ym,
- item_osfid=osfid,
- **kwargs
- )
- _report.save(refresh=True)
- return _report
diff --git a/osf_tests/metrics/test_monthly_usage_report.py b/osf_tests/metrics/test_monthly_usage_report.py
new file mode 100644
index 00000000000..8381d629f3f
--- /dev/null
+++ b/osf_tests/metrics/test_monthly_usage_report.py
@@ -0,0 +1,103 @@
+import datetime
+from functools import cached_property
+
+from django.test import TestCase
+from elasticsearch_metrics.tests.util import RealElasticTestCase
+
+from osf.models.base import osfid_iri
+from osf.metrics.monthly_reports import MonthlyPublicItemUsageReport
+from osf.metrics.utils import YearMonth
+
+
+class TestEachFromLastMonth(RealElasticTestCase, TestCase):
+ osfid = 'abced'
+
+ @cached_property
+ def item_iri(self):
+ return osfid_iri(self.osfid)
+
+ @cached_property
+ def this_month(self):
+ return YearMonth.from_date(datetime.date.today())
+
+ @cached_property
+ def last_month(self):
+ return self.this_month.prior()
+
+ @cached_property
+ def two_months_back(self):
+ return self.last_month.prior()
+
+ @cached_property
+ def three_months_back(self):
+ return self.two_months_back.prior()
+
+ @cached_property
+ def this_month_report(self):
+ return _item_usage_report(self.this_month, self.osfid, view_count=77)
+
+ @cached_property
+ def last_month_report(self):
+ return _item_usage_report(self.last_month, self.osfid, view_count=57)
+
+ @cached_property
+ def diff_last_month_report(self):
+ return _item_usage_report(self.last_month, 'zyxvt', view_count=17)
+
+ @cached_property
+ def two_months_back_report(self):
+ return _item_usage_report(self.two_months_back, self.osfid, view_count=27)
+
+ @cached_property
+ def three_months_back_report(self):
+ return _item_usage_report(self.three_months_back, self.osfid, view_count=37)
+
+ def test_with_none(self):
+ self.assertEqual(
+ MonthlyPublicItemUsageReport.from_last_month([self.item_iri]),
+ [],
+ )
+
+ def test_with_others(self):
+ self.this_month_report
+ self.three_months_back_report
+ self.diff_last_month_report
+ MonthlyPublicItemUsageReport.refresh()
+ self.assertEqual(
+ MonthlyPublicItemUsageReport.from_last_month([self.item_iri]),
+ [],
+ )
+
+ def test_with_prior_month(self):
+ self.this_month_report
+ self.two_months_back_report
+ self.three_months_back_report
+ self.diff_last_month_report
+ MonthlyPublicItemUsageReport.refresh()
+ self.assertEqual(
+ MonthlyPublicItemUsageReport.from_last_month([self.item_iri]),
+ [self.two_months_back_report],
+ )
+
+ def test_with_last_month(self):
+ self.this_month_report
+ self.last_month_report
+ self.two_months_back_report
+ self.three_months_back_report
+ self.diff_last_month_report
+ MonthlyPublicItemUsageReport.refresh()
+ self.assertEqual(
+ MonthlyPublicItemUsageReport.from_last_month([self.item_iri]),
+ [self.last_month_report],
+ )
+
+
+def _item_usage_report(ym: YearMonth, osfid: str, **kwargs):
+ _report = MonthlyPublicItemUsageReport(
+ report_yearmonth=ym,
+ item_iri=osfid_iri(osfid),
+ item_osfids=osfid,
+ **kwargs
+ )
+ _report.save(validate=False)
+ return _report
diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py
index 448a8136f7a..a72d1a71ab5 100644
--- a/osf_tests/metrics/test_spam_count_reporter.py
+++ b/osf_tests/metrics/test_spam_count_reporter.py
@@ -1,7 +1,6 @@
import pytest
from datetime import datetime
from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter
-from osf.metrics.reports import PrivateSpamMetricsReport
from osf.metrics.utils import YearMonth
from osf_tests.factories import NodeLogFactory, NodeFactory
from unittest.mock import patch
@@ -31,10 +30,10 @@ def test_private_spam_metrics_reporter():
mock_akismet_get_hammed_count.return_value = 10
reporter = PrivateSpamMetricsReporter(report_yearmonth)
- reports_raw = reporter.report()
- report = next(r for r in reports_raw if isinstance(r, PrivateSpamMetricsReport))
-
- assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}"
- assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}"
- assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}"
- assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}"
+ reports = list(reporter.report())
+ assert len(reports) == 1
+ for report in reports:
+ assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}"
+ assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}"
+ assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}"
+ assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}"
diff --git a/poetry.lock b/poetry.lock
index 6e26c23d295..b6965cc1a35 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1085,7 +1085,7 @@ Django = ">=2.0"
[[package]]
name = "django-elasticsearch-metrics"
-version = "2026.0.4"
+version = "2026.0.5"
description = "Django app for storing time-series metrics in Elasticsearch."
optional = false
python-versions = ">=3.10,<4"
@@ -1095,14 +1095,13 @@ develop = false
[package.extras]
anydjango = ["django"]
-elastic6 = ["elasticsearch6-dsl (>=6.3.0,<7.0.0)"]
elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"]
[package.source]
type = "git"
url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git"
-reference = "709ff1d5c869d5696212b9109ed79e5d9766c60c"
-resolved_reference = "709ff1d5c869d5696212b9109ed79e5d9766c60c"
+reference = "46890bb61d35459e9793eba92d9ae54d4ce9c6af"
+resolved_reference = "46890bb61d35459e9793eba92d9ae54d4ce9c6af"
[[package]]
name = "django-extensions"
@@ -1390,45 +1389,6 @@ files = [
[package.dependencies]
urllib3 = ">=1.8,<2.0"
-[[package]]
-name = "elasticsearch6"
-version = "6.8.2"
-description = "Python client for Elasticsearch"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4"
-groups = ["main"]
-files = [
- {file = "elasticsearch6-6.8.2-py2.py3-none-any.whl", hash = "sha256:4edf2d61f854f642185d5af915b23c57e70d9f2b54f558b62ae55fa720583f5e"},
- {file = "elasticsearch6-6.8.2.tar.gz", hash = "sha256:7c215910b6bc18928d24d6c1d0b09b0684c824af609906d5e007a9a268109678"},
-]
-
-[package.dependencies]
-urllib3 = ">=1.21.1"
-
-[package.extras]
-develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"]
-requests = ["requests (>=2.4.0,<3.0.0)"]
-
-[[package]]
-name = "elasticsearch6-dsl"
-version = "6.4.0"
-description = "Python client for Elasticsearch"
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
- {file = "elasticsearch6-dsl-6.4.0.tar.gz", hash = "sha256:4bbc60919b73484d028eca31f749f0eea80d8b0bfe0a9a33b54eb0afca1d9b5f"},
- {file = "elasticsearch6_dsl-6.4.0-py2.py3-none-any.whl", hash = "sha256:a5767ef65c50f7c8af7ba6c176bd8df2c1fb501c644bc196cbd675f15c0f2be1"},
-]
-
-[package.dependencies]
-elasticsearch6 = ">=6.0.0,<7.0.0"
-python-dateutil = "*"
-six = "*"
-
-[package.extras]
-develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"]
-
[[package]]
name = "elasticsearch8"
version = "8.19.3"
@@ -4727,4 +4687,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
-content-hash = "964ae9c9b6ce89c023a6bf0216cab95cdaafe5ce20be927d7c8f7244127993db"
+content-hash = "d032b7d17bbc25dbbd06a5f7134b2dfd83946a2e959a699fd13aad23c3bcedb1"
diff --git a/pyproject.toml b/pyproject.toml
index 28f2f4ed22d..e9c856a4eab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,8 +31,6 @@ Markupsafe = "2.1.5"
blinker = "1.7.0"
furl = "2.1.3"
elasticsearch2 = "2.5.1"
-elasticsearch6= "6.8.2"
-elasticsearch6-dsl = "6.4.0"
elasticsearch8 = "8.19.3"
elastic-transport = "8.17.1"
google-api-python-client = "2.123.0"
@@ -91,7 +89,7 @@ datacite = "1.1.3"
rdflib = "7.0.0"
colorlog = "6.8.2"
# Metrics
-django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "709ff1d5c869d5696212b9109ed79e5d9766c60c"}
+django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "46890bb61d35459e9793eba92d9ae54d4ce9c6af"}
# Impact Metrics CSV Export
djangorestframework-csv = "3.0.2"
gevent = "24.2.1"
diff --git a/pytest.ini b/pytest.ini
index 4417f537dd0..450117d7f68 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -8,5 +8,5 @@ markers =
enable_enqueue_task
enable_bookmark_creation
enable_implicit_clean
- es
enable_account_status_messaging
+ djelme_elasticsearch_backends
diff --git a/website/settings/defaults.py b/website/settings/defaults.py
index 220d03ba2e6..1d8ee10bd32 100644
--- a/website/settings/defaults.py
+++ b/website/settings/defaults.py
@@ -108,7 +108,6 @@ def parent_dir(path):
SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None
ELASTIC_URI = '127.0.0.1:9200'
-ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201')
ELASTIC8_URI = os.environ.get('ELASTIC8_URI')
ELASTIC8_CERT_PATH = os.environ.get('ELASTIC8_CERT_PATH')
ELASTIC8_ASSERT_HOSTNAME = os.environ.get('ELASTIC8_ASSERT_HOSTNAME')
@@ -485,7 +484,6 @@ class CeleryConfig:
}
background_migration_modules = {
- 'osf.management.commands.migrate_osfmetrics_6to8',
}
try:
@@ -602,7 +600,6 @@ class CeleryConfig:
'scripts.remove_after_use.merge_notification_subscription_provider_ct',
'scripts.disable_removed_beat_tasks',
'osf.management.commands.delete_withdrawn_or_failed_registration_files',
- 'osf.management.commands.migrate_osfmetrics_6to8',
)
# Modules that need metrics and release requirements
@@ -2144,8 +2141,6 @@ def from_node_usage(cls, usage_bytes, private_limit=None, public_limit=None):
CAS_LOG_LEVEL = 3 # ERROR
-PREPRINT_METRICS_START_DATE = datetime.datetime(2019, 1, 1)
-
WAFFLE_VALUES_YAML = 'osf/features.yaml'
DEFAULT_DRAFT_NODE_TITLE = 'Untitled'
USE_COLOR = False