diff --git a/src/palace/manager/data_layer/bibliographic.py b/src/palace/manager/data_layer/bibliographic.py index 9b500e8c98..a3d2dfa755 100644 --- a/src/palace/manager/data_layer/bibliographic.py +++ b/src/palace/manager/data_layer/bibliographic.py @@ -81,6 +81,10 @@ class BibliographicData(BaseMutableData): # Note: brought back to keep callers of bibliographic extraction process_one() methods simple. circulation: CirculationData | None = None + # FIXME: this parameter is a stopgap measure which should be removed once we sort out how to reasonably + # determine has_changed for overdrive. + minimum_time_between_updates: datetime.timedelta = datetime.timedelta(0) + @field_validator("language") @classmethod def _convert_langage_alpha3(cls, value: str | None) -> str | None: @@ -996,25 +1000,72 @@ def update_contributions( return contributors_changed - def has_changed(self, session: Session, edition: Edition | None = None) -> bool: + def has_changed( + self, + session: Session, + edition: Edition | None = None, + ) -> bool: """ Test if the bibliographic data has changed since the last import. + Returns True (needs update) in the following cases: + - No existing edition is found for this identifier. + - Both ``data_source_last_updated`` and ``edition.updated_at`` are ``None``. + - ``edition.updated_at`` is ``None`` but ``data_source_last_updated`` is set. + - ``data_source_last_updated`` is newer than ``edition.updated_at``. + - ``data_source_last_updated`` is ``None`` but ``edition.updated_at`` is older + than ``minimum_time_between_updates`` (default ``0 seconds``, meaning + any elapsed time triggers an update). + + Returns False (skip update) when ``data_source_last_updated`` is ``None`` and + ``edition.updated_at`` is within the ``minimum_time_between_updates`` + threshold. + + NOTE: this implementation is going to change soon. The use of minimum_time_between_updates + is a stopgap measure to prevent overproduction of metadata update tasks in cases where + no data_source_last_updated is available. + + :param session: Database session used to look up the edition if not provided. + :param edition: The Edition to compare against. If ``None``, the edition is + looked up from the database using this object's primary identifier. + :return: ``True`` if an update is needed, ``False`` otherwise. + """ if edition is None: edition, _ = self.edition(session, autocreate=False) if edition is None: - # We don't have an edition, so we definitely need to create one. + # No edition exists yet: always create one. return True - # If we don't have any information about the last update time, assume we need to update. - if edition.updated_at is None or self.data_source_last_updated is None: + if edition.updated_at is None: + # No edition timestamp: always re-import. + # Covers both "edition.updated_at is None" and "both timestamps None". return True - if self.data_source_last_updated > edition.updated_at: + elif self.data_source_last_updated is None: + # FIXME: This is a stopgap measure to deal with the fact that we don't have a good way + # to detect metadata changes for Overdrive. This block should be removed once that problem is addressed. + # if we have an edition update time but we don't have a source last updated time, assume no change unless + # the minimum time between updates is exceeded + # No source timestamp: re-import only if the edition is older than the minimum interval. + result = utc_now() - edition.updated_at > self.minimum_time_between_updates + + if not result: + self.log.info( + f"Publication {self.primary_identifier_data} has no source last updated timestamp so change is " + f"indeterminate. Since it has been updated in the last {self.minimum_time_between_updates} " + f"(i.e. on {edition.updated_at}), we will not attempt to update it." + ) + + return result + + elif self.data_source_last_updated > edition.updated_at: + # Source has a newer timestamp than the edition: re-import. return True - self.log.info( - f"Publication {self.primary_identifier_data} is unchanged. Last updated at " - f"{edition.updated_at}, data source last updated at {self.data_source_last_updated}" - ) - return False + else: + # Source timestamp is not newer than the edition: skip. + self.log.info( + f"Publication {self.primary_identifier_data} is unchanged. Last updated at " + f"{edition.updated_at}, data source last updated at {self.data_source_last_updated}" + ) + return False diff --git a/src/palace/manager/integration/license/overdrive/representation.py b/src/palace/manager/integration/license/overdrive/representation.py index 7defae7c21..df20d5fc7e 100644 --- a/src/palace/manager/integration/license/overdrive/representation.py +++ b/src/palace/manager/integration/license/overdrive/representation.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import logging import re from typing import TYPE_CHECKING, Any @@ -680,6 +681,8 @@ def book_info_to_bibliographic( measurements=measurements, links=links, duration=duration, + # FIXME - this parameter should be removed once we resolve the overdrive change detection issue. + minimum_time_between_updates=datetime.timedelta(days=3), ) else: bibliographic = BibliographicData( diff --git a/tests/manager/data_layer/test_bibliographic.py b/tests/manager/data_layer/test_bibliographic.py index 7421a7922f..eb617edc9a 100644 --- a/tests/manager/data_layer/test_bibliographic.py +++ b/tests/manager/data_layer/test_bibliographic.py @@ -1327,3 +1327,149 @@ def test_thumbnail_without_representation_logs_error( ) assert thumbnail.resource.representation is None assert [] == image.resource.representation.thumbnails + + def test_has_changed_no_edition(self, db: DatabaseTransactionFixture): + """has_changed returns True when no matching edition exists.""" + bibliographic = BibliographicData( + data_source_name=DataSource.FEEDBOOKS, + primary_identifier_data=IdentifierData( + type="any-identifier-type", + identifier="nonexistent-id-for-has-changed-test", + ), + data_source_last_updated=utc_now(), + ) + # No edition exists in the DB for this identifier. + assert bibliographic.has_changed(db.session) is True + + def test_has_changed_edition_updated_at_none(self, db: DatabaseTransactionFixture): + """has_changed returns True when the edition has no updated_at timestamp.""" + edition = db.edition() + edition.updated_at = None + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=utc_now(), + ) + assert bibliographic.has_changed(db.session, edition=edition) is True + + def test_has_changed_both_timestamps_none(self, db: DatabaseTransactionFixture): + """has_changed returns True when both edition.updated_at and + data_source_last_updated are None.""" + edition = db.edition() + edition.updated_at = None + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=None, + ) + assert bibliographic.has_changed(db.session, edition=edition) is True + + def test_has_changed_no_source_timestamp_edition_older_than_minimum( + self, db: DatabaseTransactionFixture + ): + """has_changed returns True when data_source_last_updated is None but the + edition was last updated longer ago than the minimum update interval.""" + now = utc_now() + three_hours_ago = now - datetime.timedelta(hours=3) + + edition = db.edition() + edition.updated_at = three_hours_ago + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=None, + minimum_time_between_updates=datetime.timedelta(hours=2), + ) + with freeze_time(now): + # Minimum is 2 hours; edition is 3 hours old → changed. + assert bibliographic.has_changed(db.session, edition=edition) is True + + def test_has_changed_no_source_timestamp_edition_within_minimum( + self, db: DatabaseTransactionFixture + ): + """has_changed returns False when data_source_last_updated is None and the + edition was updated more recently than the minimum update interval.""" + now = utc_now() + one_hour_ago = now - datetime.timedelta(hours=1) + + edition = db.edition() + edition.updated_at = one_hour_ago + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=None, + minimum_time_between_updates=datetime.timedelta(hours=2), + ) + with freeze_time(now): + # Minimum is 2 hours; edition is only 1 hour old → no change. + assert bibliographic.has_changed(db.session, edition=edition) is False + + def test_has_changed_custom_minimum_time(self, db: DatabaseTransactionFixture): + """minimum_time_between_updates is respected when data_source_last_updated is None.""" + now = utc_now() + forty_five_minutes_ago = now - datetime.timedelta(minutes=45) + + edition = db.edition() + edition.updated_at = forty_five_minutes_ago + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=None, + minimum_time_between_updates=datetime.timedelta(minutes=30), + ) + + with freeze_time(now): + # Edition is 45 min old, minimum is 30 min → changed. + assert ( + bibliographic.has_changed( + db.session, + edition=edition, + ) + is True + ) + + bibliographic.minimum_time_between_updates = datetime.timedelta(hours=1) + + with freeze_time(now): + # Edition is 45 min old, minimum is 60 min → no change. + assert ( + bibliographic.has_changed( + db.session, + edition=edition, + ) + is False + ) + + def test_has_changed_source_newer_than_edition( + self, db: DatabaseTransactionFixture + ): + """has_changed returns True when data_source_last_updated is newer than + edition.updated_at.""" + now = utc_now() + one_day_ago = now - datetime.timedelta(days=1) + + edition = db.edition() + edition.updated_at = one_day_ago + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=now, + ) + assert bibliographic.has_changed(db.session, edition=edition) is True + + def test_has_changed_source_older_than_edition( + self, db: DatabaseTransactionFixture + ): + """has_changed returns False when data_source_last_updated is older than or + equal to edition.updated_at.""" + now = utc_now() + one_day_ago = now - datetime.timedelta(days=1) + + edition = db.edition() + edition.updated_at = now + + bibliographic = BibliographicData( + data_source_name=edition.data_source.name, + data_source_last_updated=one_day_ago, + ) + assert bibliographic.has_changed(db.session, edition=edition) is False diff --git a/tests/manager/integration/license/overdrive/test_representation.py b/tests/manager/integration/license/overdrive/test_representation.py index bdb38a9387..1eb0353d64 100644 --- a/tests/manager/integration/license/overdrive/test_representation.py +++ b/tests/manager/integration/license/overdrive/test_representation.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime from functools import partial from typing import Any from unittest.mock import MagicMock @@ -559,3 +560,18 @@ def test_internal_formats(self) -> None: # An unrecognized format does not correspond to any delivery # mechanisms. assert internal_formats("no-such-format") == [] + + def test_book_info_to_bibliographic_sets_minimum_time_between_updates( + self, overdrive_api_fixture: OverdriveAPIFixture + ): + """book_info_to_bibliographic sets a non-zero minimum_time_between_updates. + + Overdrive does not supply a data_source_last_updated timestamp, so without + a minimum update interval every record would be re-imported on every run. + The extractor sets a 3-day cooldown to prevent that churn. + """ + raw, info = overdrive_api_fixture.sample_json("overdrive_metadata.json") + metadata = OverdriveRepresentationExtractor.book_info_to_bibliographic(info) + + assert metadata is not None + assert metadata.minimum_time_between_updates == datetime.timedelta(days=3)