Skip to content
71 changes: 61 additions & 10 deletions src/palace/manager/data_layer/bibliographic.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ class BibliographicData(BaseMutableData):
# Note: brought back to keep callers of bibliographic extraction process_one() methods simple.
circulation: CirculationData | None = None

# FIXME: this parameter is a stopgap measure which should be removed once we sort out how to reasonably
# determine has_changed for overdrive.
minimum_time_between_updates: datetime.timedelta = datetime.timedelta(0)

@field_validator("language")
@classmethod
def _convert_langage_alpha3(cls, value: str | None) -> str | None:
Expand Down Expand Up @@ -996,25 +1000,72 @@ def update_contributions(

return contributors_changed

def has_changed(self, session: Session, edition: Edition | None = None) -> bool:
def has_changed(
self,
session: Session,
edition: Edition | None = None,
) -> bool:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably update the docstring here temporarily, since it doesn't accurately reflect the current situation.

"""
Test if the bibliographic data has changed since the last import.
Returns True (needs update) in the following cases:
- No existing edition is found for this identifier.
- Both ``data_source_last_updated`` and ``edition.updated_at`` are ``None``.
- ``edition.updated_at`` is ``None`` but ``data_source_last_updated`` is set.
- ``data_source_last_updated`` is newer than ``edition.updated_at``.
- ``data_source_last_updated`` is ``None`` but ``edition.updated_at`` is older
than ``minimum_time_between_updates`` (default ``0 seconds``, meaning
any elapsed time triggers an update).

Returns False (skip update) when ``data_source_last_updated`` is ``None`` and
``edition.updated_at`` is within the ``minimum_time_between_updates``
threshold.

NOTE: this implementation is going to change soon. The use of minimum_time_between_updates
is a stopgap measure to prevent overproduction of metadata update tasks in cases where
no data_source_last_updated is available.

:param session: Database session used to look up the edition if not provided.
:param edition: The Edition to compare against. If ``None``, the edition is
looked up from the database using this object's primary identifier.
:return: ``True`` if an update is needed, ``False`` otherwise.

"""
if edition is None:
edition, _ = self.edition(session, autocreate=False)
if edition is None:
# We don't have an edition, so we definitely need to create one.
# No edition exists yet: always create one.
return True

# If we don't have any information about the last update time, assume we need to update.
if edition.updated_at is None or self.data_source_last_updated is None:
if edition.updated_at is None:
# No edition timestamp: always re-import.
# Covers both "edition.updated_at is None" and "both timestamps None".
return True

if self.data_source_last_updated > edition.updated_at:
elif self.data_source_last_updated is None:
# FIXME: This is a stopgap measure to deal with the fact that we don't have a good way
# to detect metadata changes for Overdrive. This block should be removed once that problem is addressed.
# if we have an edition update time but we don't have a source last updated time, assume no change unless
# the minimum time between updates is exceeded
# No source timestamp: re-import only if the edition is older than the minimum interval.
result = utc_now() - edition.updated_at > self.minimum_time_between_updates

if not result:
self.log.info(
f"Publication {self.primary_identifier_data} has no source last updated timestamp so change is "
f"indeterminate. Since it has been updated in the last {self.minimum_time_between_updates} "
f"(i.e. on {edition.updated_at}), we will not attempt to update it."
)

return result

elif self.data_source_last_updated > edition.updated_at:
# Source has a newer timestamp than the edition: re-import.
return True

self.log.info(
f"Publication {self.primary_identifier_data} is unchanged. Last updated at "
f"{edition.updated_at}, data source last updated at {self.data_source_last_updated}"
)
return False
else:
# Source timestamp is not newer than the edition: skip.
self.log.info(
f"Publication {self.primary_identifier_data} is unchanged. Last updated at "
f"{edition.updated_at}, data source last updated at {self.data_source_last_updated}"
)
return False
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import datetime
import logging
import re
from typing import TYPE_CHECKING, Any
Expand Down Expand Up @@ -680,6 +681,8 @@ def book_info_to_bibliographic(
measurements=measurements,
links=links,
duration=duration,
# FIXME - this parameter should be removed once we resolve the overdrive change detection issue.
minimum_time_between_updates=datetime.timedelta(days=3),
)
else:
bibliographic = BibliographicData(
Expand Down
146 changes: 146 additions & 0 deletions tests/manager/data_layer/test_bibliographic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,3 +1327,149 @@ def test_thumbnail_without_representation_logs_error(
)
assert thumbnail.resource.representation is None
assert [] == image.resource.representation.thumbnails

def test_has_changed_no_edition(self, db: DatabaseTransactionFixture):
"""has_changed returns True when no matching edition exists."""
bibliographic = BibliographicData(
data_source_name=DataSource.FEEDBOOKS,
primary_identifier_data=IdentifierData(
type="any-identifier-type",
identifier="nonexistent-id-for-has-changed-test",
),
data_source_last_updated=utc_now(),
)
# No edition exists in the DB for this identifier.
assert bibliographic.has_changed(db.session) is True

def test_has_changed_edition_updated_at_none(self, db: DatabaseTransactionFixture):
"""has_changed returns True when the edition has no updated_at timestamp."""
edition = db.edition()
edition.updated_at = None

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=utc_now(),
)
assert bibliographic.has_changed(db.session, edition=edition) is True

def test_has_changed_both_timestamps_none(self, db: DatabaseTransactionFixture):
"""has_changed returns True when both edition.updated_at and
data_source_last_updated are None."""
edition = db.edition()
edition.updated_at = None

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=None,
)
assert bibliographic.has_changed(db.session, edition=edition) is True

def test_has_changed_no_source_timestamp_edition_older_than_minimum(
self, db: DatabaseTransactionFixture
):
"""has_changed returns True when data_source_last_updated is None but the
edition was last updated longer ago than the minimum update interval."""
now = utc_now()
three_hours_ago = now - datetime.timedelta(hours=3)

edition = db.edition()
edition.updated_at = three_hours_ago

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=None,
minimum_time_between_updates=datetime.timedelta(hours=2),
)
with freeze_time(now):
# Minimum is 2 hours; edition is 3 hours old → changed.
assert bibliographic.has_changed(db.session, edition=edition) is True

def test_has_changed_no_source_timestamp_edition_within_minimum(
self, db: DatabaseTransactionFixture
):
"""has_changed returns False when data_source_last_updated is None and the
edition was updated more recently than the minimum update interval."""
now = utc_now()
one_hour_ago = now - datetime.timedelta(hours=1)

edition = db.edition()
edition.updated_at = one_hour_ago

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=None,
minimum_time_between_updates=datetime.timedelta(hours=2),
)
with freeze_time(now):
# Minimum is 2 hours; edition is only 1 hour old → no change.
assert bibliographic.has_changed(db.session, edition=edition) is False

def test_has_changed_custom_minimum_time(self, db: DatabaseTransactionFixture):
"""minimum_time_between_updates is respected when data_source_last_updated is None."""
now = utc_now()
forty_five_minutes_ago = now - datetime.timedelta(minutes=45)

edition = db.edition()
edition.updated_at = forty_five_minutes_ago

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=None,
minimum_time_between_updates=datetime.timedelta(minutes=30),
)

with freeze_time(now):
# Edition is 45 min old, minimum is 30 min → changed.
assert (
bibliographic.has_changed(
db.session,
edition=edition,
)
is True
)

bibliographic.minimum_time_between_updates = datetime.timedelta(hours=1)

with freeze_time(now):
# Edition is 45 min old, minimum is 60 min → no change.
assert (
bibliographic.has_changed(
db.session,
edition=edition,
)
is False
)

def test_has_changed_source_newer_than_edition(
self, db: DatabaseTransactionFixture
):
"""has_changed returns True when data_source_last_updated is newer than
edition.updated_at."""
now = utc_now()
one_day_ago = now - datetime.timedelta(days=1)

edition = db.edition()
edition.updated_at = one_day_ago

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=now,
)
assert bibliographic.has_changed(db.session, edition=edition) is True

def test_has_changed_source_older_than_edition(
self, db: DatabaseTransactionFixture
):
"""has_changed returns False when data_source_last_updated is older than or
equal to edition.updated_at."""
now = utc_now()
one_day_ago = now - datetime.timedelta(days=1)

edition = db.edition()
edition.updated_at = now

bibliographic = BibliographicData(
data_source_name=edition.data_source.name,
data_source_last_updated=one_day_ago,
)
assert bibliographic.has_changed(db.session, edition=edition) is False
16 changes: 16 additions & 0 deletions tests/manager/integration/license/overdrive/test_representation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import datetime
from functools import partial
from typing import Any
from unittest.mock import MagicMock
Expand Down Expand Up @@ -559,3 +560,18 @@ def test_internal_formats(self) -> None:
# An unrecognized format does not correspond to any delivery
# mechanisms.
assert internal_formats("no-such-format") == []

def test_book_info_to_bibliographic_sets_minimum_time_between_updates(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test does get at verifying that this change is affecting OverDrive books, but it would be nice to see the behavior difference in a more integrated way in tests/manager/data_layer/test_bibliographic.py.

I was thinking something there with a shape like:

@pytest.mark.parametrize(
      "is_overdrive, edition_age, expected",
      [
          pytest.param(
              False,
              datetime.timedelta(seconds=1),
              True,
              id="default-minimum-always-reimports",
          ),
          pytest.param(
              True,
              datetime.timedelta(days=1),
              False,
              id="overdrive-within-cooldown-skips",
          ),
          pytest.param(
              True,
              datetime.timedelta(days=4),
              True,
              id="overdrive-outside-cooldown-reimports",
          ),
      ],
  )
  def test_has_changed_no_source_timestamp(
      self,
      db: DatabaseTransactionFixture,
      is_overdrive: bool,
      edition_age: datetime.timedelta,
      expected: bool,
  ):
      now = utc_now()
      edition = db.edition()
      edition.updated_at = now - edition_age

      data_source_name = DataSource.OVERDRIVE if is_overdrive else edition.data_source.name

      bibliographic = BibliographicData(
          data_source_name=data_source_name,
          data_source_last_updated=None,
      )
      with freeze_time(now):
          assert bibliographic.has_changed(db.session, edition=edition) is expected

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'm missing you @tdilauro: I'm probably being dense. I'll ping you on slack and maybe we can jump on a short call.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whether the DS is overdrive or not has no bearing on the expected functionality of the has_changed method. The difference is driven by the presence of a min time btw updates.

self, overdrive_api_fixture: OverdriveAPIFixture
):
"""book_info_to_bibliographic sets a non-zero minimum_time_between_updates.

Overdrive does not supply a data_source_last_updated timestamp, so without
a minimum update interval every record would be re-imported on every run.
The extractor sets a 3-day cooldown to prevent that churn.
"""
raw, info = overdrive_api_fixture.sample_json("overdrive_metadata.json")
metadata = OverdriveRepresentationExtractor.book_info_to_bibliographic(info)

assert metadata is not None
assert metadata.minimum_time_between_updates == datetime.timedelta(days=3)
Loading