Skip to content

Commit

Permalink
parser: coalesce http_etag and http_last_modified into caching_info. #…
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Sep 15, 2024
1 parent ffbd06f commit 3cad494
Show file tree
Hide file tree
Showing 16 changed files with 205 additions and 222 deletions.
5 changes: 4 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ Unreleased

.. note::

The (unstable) :class:`.RetrieverType` protocol used by custom retrievers changed.
The (unstable) :class:`.RetrieverType` protocol used by retrievers changed.

* Allow retrievers to store arbitrary caching data via
:attr:`.RetrievedFeed.caching_info`.


Version 3.14
Expand Down
10 changes: 3 additions & 7 deletions docs/dev.rst
Original file line number Diff line number Diff line change
Expand Up @@ -404,17 +404,13 @@ Lessons learned from the :ref:`Twitter` plugin:
:meth:`~reader._parser.ParserType` being generic.

* It is useful for a Retriever to store arbitrary caching data;
the plugin (mis)used
:attr:`~reader._parser.RetrieveResult.http_etag`
the plugin (mis)used ``RetrieveResult.http_etag``
to store the (integer) id of the newest tweet in the thread.

It would be nice to formalize this into a single
"arbitrary caching data" attribute;
also see `this comment <https://github.com/lemon24/reader/blob/0ada24bf4e65c69c3028641bada8eaeabbe02754/src/reader/_parser.py#L569>`_.
Update: This was formalized as :attr:`.RetrievedFeed.caching_info` in 3.15.

* It is useful for a Retriever to pass arbitrary data to itself;
the plugin (mis)used
:attr:`~reader._types.FeedForUpdate.http_etag` to pass from
the plugin (mis)used ``FeedForUpdate.http_etag`` to pass from
:meth:`~reader._parser.FeedForUpdateRetrieverType.process_feed_for_update`
to :meth:`~reader._parser.RetrieverType.__call__`:

Expand Down
47 changes: 15 additions & 32 deletions src/reader/_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .._utils import lazy_import
from ..exceptions import ParseError
from ..types import _namedtuple_compat
from ..types import JSONType
from .requests import DEFAULT_TIMEOUT
from .requests import Headers
from .requests import SessionFactory
Expand Down Expand Up @@ -112,14 +113,11 @@ def __setattr__(self, name: str, value: Any) -> None:
object.__setattr__(self._parser, name, value)

def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ParsedFeed | None:
self._lazy_init()
assert self._parser is not None
return self._parser(url, http_etag, http_last_modified)
return self._parser(url, caching_info)

def _lazy_init(self) -> None:
if self._parser:
Expand Down Expand Up @@ -168,12 +166,8 @@ def url(self) -> str:
"""The feed URL."""

@property
def http_etag(self) -> str | None:
"""The HTTP ``ETag`` header from the last update."""

@property
def http_last_modified(self) -> str | None:
"""The the HTTP ``Last-Modified`` header from the last update."""
def caching_info(self) -> JSONType | None:
""":attr:`~RetrievedFeed.caching_info` from the last update."""


T = TypeVar('T')
Expand Down Expand Up @@ -242,8 +236,6 @@ class RetrieveResult(_namedtuple_compat, Generic[F, T, E]):
class RetrievedFeed(_namedtuple_compat, Generic[T]):
"""A (successfully) retrieved feed, plus metadata."""

# TODO: coalesce http_etag and http_last_modified into a single thing?

#: The retrieved resource.
#: Usually, a readable binary file.
#: Passed to the parser.
Expand All @@ -253,13 +245,9 @@ class RetrievedFeed(_namedtuple_compat, Generic[T]):
#: Used to select an appropriate parser.
mime_type: str | None = None

#: The HTTP ``ETag`` header associated with the resource.
#: Passed back to the retriever on the next update.
http_etag: str | None = None

#: The HTTP ``Last-Modified`` header associated with the resource.
#: Passed back to the retriever on the next update.
http_last_modified: str | None = None
#: Caching info passed back to the retriever on the next update.
#: Usually, the ``ETag`` and ``Last-Modified`` headers.
caching_info: JSONType | None = None

#: Details about the HTTP response.
http_info: HTTPInfo | None = None
Expand All @@ -277,18 +265,16 @@ class RetrieverType(Protocol[T_co]): # pragma: no cover
def __call__(
self,
url: str,
http_etag: str | None,
http_last_modified: str | None,
caching_info: JSONType | None,
# FIXME also s/http_accept/accept/
http_accept: str | None,
) -> ContextManager[RetrievedFeed[T_co] | T_co]:
"""Retrieve a feed.
Args:
feed (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
http_accept (str or None):
Content types to be retrieved, as an HTTP ``Accept`` header.
Expand Down Expand Up @@ -358,16 +344,13 @@ class ParsedFeed(NamedTuple):
feed: FeedData
#: The entries.
entries: Collection[EntryData]
#: The HTTP ``ETag`` header associated with the feed resource.
#: Passed back to the retriever on the next update.
http_etag: str | None = None
#: The HTTP ``Last-Modified`` header associated with the feed resource.
#: Passed back to the retriever on the next update.
http_last_modified: str | None = None
#: The MIME type of the feed resource.
#: Used by :meth:`~reader._parser.Parser.process_entry_pairs`
#: to select an appropriate parser.
mime_type: str | None = None
#: Caching info passed back to the retriever on the next update.
#: Usually, the ``ETag`` and ``Last-Modified`` headers.
caching_info: JSONType | None = None


FeedAndEntries = tuple[FeedData, Collection[EntryData]]
Expand Down
42 changes: 13 additions & 29 deletions src/reader/_parser/_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .._utils import MapFunction
from ..exceptions import InvalidFeedURLError
from ..exceptions import ParseError
from ..types import JSONType
from . import EntryPair
from . import EntryPairsParserType
from . import F
Expand Down Expand Up @@ -133,21 +134,16 @@ def parallel(
yield cast(ParseResult[F, ParseError], result)

def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ParsedFeed | None:
"""Retrieve and parse one feed.
This is a convenience wrapper over :meth:`parallel`.
Args:
feed (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
Returns:
ParsedFeed or None:
Expand All @@ -157,9 +153,7 @@ def __call__(
ParseError
"""
feed = FeedForUpdate(
url, http_etag=http_etag, http_last_modified=http_last_modified
)
feed = FeedForUpdate(url, caching_info=caching_info)

(result,) = self.parallel([feed])

Expand All @@ -179,7 +173,7 @@ def retrieve_fn(self, feed: F) -> RetrieveResult[F, Any, Exception]:
"""
try:
context = self.retrieve(feed.url, feed.http_etag, feed.http_last_modified)
context = self.retrieve(feed.url, feed.caching_info)
return RetrieveResult(feed, context)
except Exception as e:
# pass around *all* exception types,
Expand All @@ -188,19 +182,14 @@ def retrieve_fn(self, feed: F) -> RetrieveResult[F, Any, Exception]:
return RetrieveResult(feed, e)

def retrieve(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
self, url: str, caching_info: JSONType | None = None
) -> ContextManager[RetrievedFeed[Any]]:
"""Retrieve a feed.
Args:
url (str): The feed URL.
http_etag (str or None):
The HTTP ``ETag`` header from the last update.
http_last_modified (str or None):
The HTTP ``Last-Modified`` header from the last update.
caching_info (JSONType or None):
:attr:`~RetrievedFeed.caching_info` from the last update.
Returns:
contextmanager(RetrieveResult or None):
Expand All @@ -226,21 +215,18 @@ def retrieve(

retriever = self.get_retriever(url)

return self._retrieve(
retriever, url, http_etag, http_last_modified, http_accept
)
return self._retrieve(retriever, url, caching_info, http_accept)

@contextmanager
def _retrieve(
self,
retriever: RetrieverType[Any],
url: str,
http_etag: str | None,
http_last_modified: str | None,
caching_info: JSONType | None,
http_accept: str | None,
) -> Iterator[RetrievedFeed[Any]]:
with wrap_exceptions(url, 'during retriever'):
context = retriever(url, http_etag, http_last_modified, http_accept)
context = retriever(url, caching_info, http_accept)
with context as feed:
if not isinstance(feed, RetrievedFeed):
feed = RetrievedFeed(feed)
Expand Down Expand Up @@ -329,9 +315,7 @@ def parse(self, url: str, retrieved: RetrievedFeed[Any]) -> ParsedFeed:
with wrap_exceptions(url, 'during parser'):
feed, entries = parser(url, retrieved.resource, headers)
entries = list(entries)
return ParsedFeed(
feed, entries, retrieved.http_etag, retrieved.http_last_modified, mime_type
)
return ParsedFeed(feed, entries, mime_type, retrieved.caching_info)

def get_parser(
self, url: str, mime_type: str | None
Expand Down
17 changes: 7 additions & 10 deletions src/reader/_parser/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any
from typing import cast
from typing import ContextManager
from typing import IO

Expand Down Expand Up @@ -40,8 +42,7 @@ class HTTPRetriever:
def __call__(
self,
url: str,
http_etag: str | None = None,
http_last_modified: str | None = None,
caching_info: Any = None,
http_accept: str | None = None,
) -> Iterator[RetrievedFeed[IO[bytes]]]:
request_headers = {
Expand All @@ -58,12 +59,8 @@ def __call__(

with self.get_session() as session, wrap_exceptions(error):
error._message = "while getting feed"
response, http_etag, http_last_modified = session.caching_get(
url,
http_etag,
http_last_modified,
headers=request_headers,
stream=True,
response, response_caching_info = session.caching_get(
url, caching_info, request_headers, stream=True
)

with response:
Expand Down Expand Up @@ -97,8 +94,8 @@ def __call__(
yield RetrievedFeed(
response.raw,
mime_type,
http_etag,
http_last_modified,
# https://github.com/python/mypy/issues/4976
cast(dict[str, Any] | None, response_caching_info),
http_info,
slow_to_read=True,
)
Expand Down
2 changes: 2 additions & 0 deletions src/reader/_parser/requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from typing import ContextManager
from typing import Protocol
from typing import TYPE_CHECKING
from typing import TypedDict
from typing import TypeVar
from typing import Union

Expand Down Expand Up @@ -86,6 +87,7 @@ def __call__(

Headers = Mapping[str, str]
TimeoutType = Union[None, float, tuple[float, float], tuple[float, None]]
CachingInfo = TypedDict('CachingInfo', {'etag': str, 'last-modified': str}, total=False)

DEFAULT_TIMEOUT = (3.05, 60)

Expand Down
32 changes: 26 additions & 6 deletions src/reader/_parser/requests/_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import requests

from . import CachingInfo
from . import TimeoutType


Expand Down Expand Up @@ -114,31 +115,50 @@ def get(
def caching_get(
self,
url: str,
etag: str | None = None,
last_modified: str | None = None,
caching_info: Any = None,
headers: Headers | None = None,
**kwargs: Any,
) -> tuple[requests.Response, str | None, str | None]:
) -> tuple[requests.Response, CachingInfo | None]:
"""Like :meth:`get()`, but set and return caching headers.
caching_get(url, etag, last_modified) -> response, etag, last_modified
caching_get(url, old_caching_info) -> response, new_caching_info
"""
headers = dict(headers or ())

etag = _str_value(caching_info, 'etag')
last_modified = _str_value(caching_info, 'last-modified')
if etag:
headers.setdefault('If-None-Match', etag)
if last_modified:
headers.setdefault('If-Modified-Since', last_modified)

response = self.get(url, headers=headers, **kwargs)

response_caching_info: CachingInfo = {}
if response.ok:
etag = response.headers.get('ETag', etag)
etag = response.headers.get('ETag')
if etag:
response_caching_info['etag'] = etag
last_modified = response.headers.get('Last-Modified', last_modified)
if last_modified:
response_caching_info['last-modified'] = last_modified

return response, etag, last_modified
return response, response_caching_info or None

def __enter__(self) -> Self:
    """Enter the context manager, returning this wrapper itself unchanged."""
    return self

def __exit__(self, *args: Any) -> None:
    """Close the underlying session on context exit (exceptions propagate)."""
    self.session.close()


def _str_value(d: Any | None, key: str) -> str | None:
if not d:
return None
assert isinstance(d, dict), d
rv = d.get(key)
if rv is None:
return None
assert isinstance(rv, str), rv
return rv
Loading

0 comments on commit 3cad494

Please sign in to comment.