Skip to content

Commit

Permalink
parser: coalesce http_etag and http_last_modified into caching_info. #…
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Sep 15, 2024
1 parent ffbd06f commit 3cad494
Show file tree
Hide file tree
Showing 16 changed files with 205 additions and 222 deletions.
5 changes: 4 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ Unreleased

.. note::

The (unstable) :class:`.RetrieverType` protocol used by custom retrievers changed.
The (unstable) :class:`.RetrieverType` protocol used by retrievers changed.

* Allow retrievers to store arbitrary caching data via
:attr:`.RetrievedFeed.caching_info`.


Version 3.14
Expand Down
10 changes: 3 additions & 7 deletions docs/dev.rst
Original file line number Diff line number Diff line change
Expand Up @@ -404,17 +404,13 @@ Lessons learned from the :ref:`Twitter` plugin:
:meth:`~reader._parser.ParserType` being generic.

* It is useful for a Retriever to store arbitrary caching data;
the plugin (mis)used
:attr:`~reader._parser.RetrieveResult.http_etag`
the plugin (mis)used ``RetrieveResult.http_etag``
to store the (integer) id of the newest tweet in the thread.

It would be nice to formalize this into a single
"arbitrary caching data" attribute;
also see `this comment <https://github.com/lemon24/reader/blob/0ada24bf4e65c69c3028641bada8eaeabbe02754/src/reader/_parser.py#L569>`_.
Update: This was formalized as :attr:`.RetrievedFeed.caching_info` in 3.15.

* It is useful for a Retriever to pass arbitrary data to itself;
the plugin (mis)used
:attr:`~reader._types.FeedForUpdate.http_etag` to pass from
the plugin (mis)used ``FeedForUpdate.http_etag`` to pass from
:meth:`~reader._parser.FeedForUpdateRetrieverType.process_feed_for_update`
to :meth:`~reader._parser.RetrieverType.__call__`:

Expand Down
47 changes: 15 additions & 32 deletions src/reader/_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .._utils import lazy_import
from ..exceptions import ParseError
from ..types import _namedtuple_compat
from ..types import JSONType
from .requests import DEFAULT_TIMEOUT
from .requests import Headers
from .requests import SessionFactory
Expand Down Expand Up @@ -112,14 +113,11 @@ def __setattr__(self, name: str, value: Any) -> None:
object.__setattr__(self._parser, name, value)

def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ParsedFeed | None:
self._lazy_init()
assert self._parser is not None
return self._parser(url, http_etag, http_last_modified)
return self._parser(url, caching_info)

def _lazy_init(self) -> None:
if self._parser:
Expand Down Expand Up @@ -168,12 +166,8 @@ def url(self) -> str:
"""The feed URL."""

@property
def http_etag(self) -> str | None:
"""The HTTP ``ETag`` header from the last update."""

@property
def http_last_modified(self) -> str | None:
"""The the HTTP ``Last-Modified`` header from the last update."""
def caching_info(self) -> JSONType | None:
""":attr:`~RetrievedFeed.caching_info` from the last update."""


T = TypeVar('T')
Expand Down Expand Up @@ -242,8 +236,6 @@ class RetrieveResult(_namedtuple_compat, Generic[F, T, E]):
class RetrievedFeed(_namedtuple_compat, Generic[T]):
"""A (successfully) retrieved feed, plus metadata."""

# TODO: coalesce http_etag and http_last_modified into a single thing?

#: The retrieved resource.
#: Usually, a readable binary file.
#: Passed to the parser.
Expand All @@ -253,13 +245,9 @@ class RetrievedFeed(_namedtuple_compat, Generic[T]):
#: Used to select an appropriate parser.
mime_type: str | None = None

#: The HTTP ``ETag`` header associated with the resource.
#: Passed back to the retriever on the next update.
http_etag: str | None = None

#: The HTTP ``Last-Modified`` header associated with the resource.
#: Passed back to the retriever on the next update.
http_last_modified: str | None = None
#: Caching info passed back to the retriever on the next update.
#: Usually, the ``ETag`` and ``Last-Modified`` headers.
caching_info: JSONType | None = None

#: Details about the HTTP response.
http_info: HTTPInfo | None = None
Expand All @@ -277,18 +265,16 @@ class RetrieverType(Protocol[T_co]): # pragma: no cover
def __call__(
self,
url: str,
http_etag: str | None,
http_last_modified: str | None,
caching_info: JSONType | None,
# FIXME also s/http_accept/accept/
http_accept: str | None,
) -> ContextManager[RetrievedFeed[T_co] | T_co]:
"""Retrieve a feed.
Args:
feed (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
http_accept (str or None):
Content types to be retrieved, as an HTTP ``Accept`` header.
Expand Down Expand Up @@ -358,16 +344,13 @@ class ParsedFeed(NamedTuple):
feed: FeedData
#: The entries.
entries: Collection[EntryData]
#: The HTTP ``ETag`` header associated with the feed resource.
#: Passed back to the retriever on the next update.
http_etag: str | None = None
#: The HTTP ``Last-Modified`` header associated with the feed resource.
#: Passed back to the retriever on the next update.
http_last_modified: str | None = None
#: The MIME type of the feed resource.
#: Used by :meth:`~reader._parser.Parser.process_entry_pairs`
#: to select an appropriate parser.
mime_type: str | None = None
#: Caching info passed back to the retriever on the next update.
#: Usually, the ``ETag`` and ``Last-Modified`` headers.
caching_info: JSONType | None = None


FeedAndEntries = tuple[FeedData, Collection[EntryData]]
Expand Down
42 changes: 13 additions & 29 deletions src/reader/_parser/_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .._utils import MapFunction
from ..exceptions import InvalidFeedURLError
from ..exceptions import ParseError
from ..types import JSONType
from . import EntryPair
from . import EntryPairsParserType
from . import F
Expand Down Expand Up @@ -133,21 +134,16 @@ def parallel(
yield cast(ParseResult[F, ParseError], result)

def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ParsedFeed | None:
"""Retrieve and parse one feed.
This is a convenience wrapper over :meth:`parallel`.
Args:
feed (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
Returns:
ParsedFeed or None:
Expand All @@ -157,9 +153,7 @@ def __call__(
ParseError
"""
feed = FeedForUpdate(
url, http_etag=http_etag, http_last_modified=http_last_modified
)
feed = FeedForUpdate(url, caching_info=caching_info)

(result,) = self.parallel([feed])

Expand All @@ -179,7 +173,7 @@ def retrieve_fn(self, feed: F) -> RetrieveResult[F, Any, Exception]:
"""
try:
context = self.retrieve(feed.url, feed.http_etag, feed.http_last_modified)
context = self.retrieve(feed.url, feed.caching_info)
return RetrieveResult(feed, context)
except Exception as e:
# pass around *all* exception types,
Expand All @@ -188,19 +182,14 @@ def retrieve_fn(self, feed: F) -> RetrieveResult[F, Any, Exception]:
return RetrieveResult(feed, e)

def retrieve(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ContextManager[RetrievedFeed[Any]]:
"""Retrieve a feed.
Args:
url (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
Returns:
contextmanager(RetrieveResult or None):
Expand All @@ -226,21 +215,18 @@ def retrieve(

retriever = self.get_retriever(url)

return self._retrieve(
retriever, url, http_etag, http_last_modified, http_accept
)
return self._retrieve(retriever, url, caching_info, http_accept)

@contextmanager
def _retrieve(
self,
retriever: RetrieverType[Any],
url: str,
http_etag: str | None,
http_last_modified: str | None,
caching_info: JSONType | None,
http_accept: str | None,
) -> Iterator[RetrievedFeed[Any]]:
with wrap_exceptions(url, 'during retriever'):
context = retriever(url, http_etag, http_last_modified, http_accept)
context = retriever(url, caching_info, http_accept)
with context as feed:
if not isinstance(feed, RetrievedFeed):
feed = RetrievedFeed(feed)
Expand Down Expand Up @@ -329,9 +315,7 @@ def parse(self, url: str, retrieved: RetrievedFeed[Any]) -> ParsedFeed:
with wrap_exceptions(url, 'during parser'):
feed, entries = parser(url, retrieved.resource, headers)
entries = list(entries)
return ParsedFeed(
feed, entries, retrieved.http_etag, retrieved.http_last_modified, mime_type
)
return ParsedFeed(feed, entries, mime_type, retrieved.caching_info)

def get_parser(
self, url: str, mime_type: str | None
Expand Down
17 changes: 7 additions & 10 deletions src/reader/_parser/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any
from typing import cast
from typing import ContextManager
from typing import IO

Expand Down Expand Up @@ -40,8 +42,7 @@ class HTTPRetriever:
def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
caching_info: Any = None,
http_accept: str | None = None,
) -> Iterator[RetrievedFeed[IO[bytes]]]:
request_headers = {
Expand All @@ -58,12 +59,8 @@ def __call__(

with self.get_session() as session, wrap_exceptions(error):
error._message = "while getting feed"
response, http_etag, http_last_modified = session.caching_get(
url,
http_etag,
http_last_modified,
headers=request_headers,
stream=True,
response, response_caching_info = session.caching_get(
url, caching_info, request_headers, stream=True
)

with response:
Expand Down Expand Up @@ -97,8 +94,8 @@ def __call__(
yield RetrievedFeed(
response.raw,
mime_type,
http_etag,
http_last_modified,
# https://github.com/python/mypy/issues/4976
cast(dict[str, Any] | None, response_caching_info),
http_info,
slow_to_read=True,
)
Expand Down
2 changes: 2 additions & 0 deletions src/reader/_parser/requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from typing import ContextManager
from typing import Protocol
from typing import TYPE_CHECKING
from typing import TypedDict
from typing import TypeVar
from typing import Union

Expand Down Expand Up @@ -86,6 +87,7 @@ def __call__(

Headers = Mapping[str, str]
TimeoutType = Union[None, float, tuple[float, float], tuple[float, None]]
CachingInfo = TypedDict('CachingInfo', {'etag': str, 'last-modified': str}, total=False)

DEFAULT_TIMEOUT = (3.05, 60)

Expand Down
32 changes: 26 additions & 6 deletions src/reader/_parser/requests/_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import requests

from . import CachingInfo
from . import TimeoutType


Expand Down Expand Up @@ -114,31 +115,50 @@ def get(
def caching_get(
self,
url: str,
etag: str | None = None,
last_modified: str | None = None,
caching_info: Any = None,
headers: Headers | None = None,
**kwargs: Any,
) -> tuple[requests.Response, str | None, str | None]:
) -> tuple[requests.Response, CachingInfo | None]:
"""Like :meth:`get()`, but set and return caching headers.
caching_get(url, etag, last_modified) -> response, etag, last_modified
caching_get(url, old_caching_info) -> response, new_caching_info
"""
headers = dict(headers or ())

etag = _str_value(caching_info, 'etag')
last_modified = _str_value(caching_info, 'last-modified')
if etag:
headers.setdefault('If-None-Match', etag)
if last_modified:
headers.setdefault('If-Modified-Since', last_modified)

response = self.get(url, headers=headers, **kwargs)

response_caching_info: CachingInfo = {}
if response.ok:
etag = response.headers.get('ETag', etag)
etag = response.headers.get('ETag')
if etag:
response_caching_info['etag'] = etag
last_modified = response.headers.get('Last-Modified', last_modified)
if last_modified:
response_caching_info['last-modified'] = last_modified

return response, etag, last_modified
return response, response_caching_info or None

def __enter__(self) -> Self:
    """Enter the context manager, returning this wrapper itself unchanged."""
    return self

def __exit__(self, *args: Any) -> None:
    """Close the underlying session on context exit (exceptions propagate)."""
    self.session.close()


def _str_value(d: Any | None, key: str) -> str | None:
if not d:
return None
assert isinstance(d, dict), d
rv = d.get(key)
if rv is None:
return None
assert isinstance(rv, str), rv
return rv
Loading

0 comments on commit 3cad494

Please sign in to comment.