From 59dc1bfc8cad7de95234e9d0dc1fe8e0518d9919 Mon Sep 17 00:00:00 2001 From: Aidan McMahon-Smith Date: Wed, 13 Nov 2024 15:13:43 +0100 Subject: [PATCH] Add exclusions for aliases [RHELDST-13644] Certain paths on our NetStorage fs have no symlinks/aliases, and teams/ tools that make use of these paths expect similar behaviour from exodus. This change introduces "exclude_paths" to exclude alias generation for certain paths, so we can replicate NetStorage behaviour. --- exodus_gw/aws/dynamodb.py | 13 +++--- exodus_gw/aws/util.py | 23 ++++++--- exodus_gw/routers/config.py | 21 ++++++++- exodus_gw/routers/deploy.py | 15 +++++- exodus_gw/schemas.py | 4 ++ exodus_gw/worker/cache.py | 2 +- exodus_gw/worker/deploy.py | 11 ++++- tests/aws/test_uri_alias.py | 85 +++++++++++++++++++++++++++------- tests/conftest.py | 24 ++++++++-- tests/routers/test_config.py | 6 ++- tests/worker/test_cdn_cache.py | 14 +++++- tests/worker/test_deploy.py | 22 +++++++++ 12 files changed, 197 insertions(+), 43 deletions(-) diff --git a/exodus_gw/aws/dynamodb.py b/exodus_gw/aws/dynamodb.py index 8d4b5ca1..35c15186 100644 --- a/exodus_gw/aws/dynamodb.py +++ b/exodus_gw/aws/dynamodb.py @@ -46,18 +46,19 @@ def definitions(self): self._definitions = self.query_definitions() return self._definitions - def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]: - out: list[tuple[str, str]] = [] + def _aliases(self, alias_types: list[str]) -> list[tuple[str, str, list[str]]]: + out: list[tuple[str, str, list[str]]] = [] for k, v in self.definitions.items(): if k in alias_types: for alias in v: - out.append((alias["src"], alias["dest"])) + out.append((alias["src"], alias["dest"], + alias.get("exclude_paths") or [])) return out @property - def aliases_for_write(self) -> list[tuple[str, str]]: + def aliases_for_write(self) -> list[tuple[str, str, list[str]]]: # Aliases used when writing items to DynamoDB. # # Note that these aliases are traversed only in the src => dest @@ -95,7 +96,7 @@ def aliases_for_write(self) -> list[tuple[str, str]]: return self._aliases(["origin_alias", "releasever_alias"]) @property - def aliases_for_flush(self) -> list[tuple[str, str]]: + def aliases_for_flush(self) -> list[tuple[str, str, list[str]]]: # Aliases used when flushing cache. out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"]) @@ -128,7 +129,7 @@ def aliases_for_flush(self) -> list[tuple[str, str]]: # this alias. If we don't traverse the alias from dest => src then we # will miss the fact that /content/dist/rhel8/rhui paths should also # have cache flushed. - out = out + [(dest, src) for (src, dest) in out] + out = out + [(dest, src, exclusions) for (src, dest, exclusions) in out] return out diff --git a/exodus_gw/aws/util.py b/exodus_gw/aws/util.py index ef0d4816..9a1c36c3 100644 --- a/exodus_gw/aws/util.py +++ b/exodus_gw/aws/util.py @@ -161,7 +161,7 @@ def get_reader(cls, request): return cls(request) -def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]: +def uri_alias(uri: str, aliases: list[tuple[str, str, list[str]]]) -> list[str]: # Resolve every alias between paths within the uri (e.g. # allow RHUI paths to be aliased to non-RHUI). # @@ -187,7 +187,7 @@ def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]: def uri_alias_recurse( accum: list[str], uri: str, - aliases: list[tuple[str, str]], + aliases: list[tuple[str, str, list[str]]], depth=0, maxdepth=4, ): @@ -217,8 +217,19 @@ def add_out(new_uri: str) -> bool: accum.insert(0, new_uri) return out - for src, dest in aliases: + for src, dest, exclude_paths in aliases: if uri.startswith(src + "/") or uri == src: + # Used to replicate old NetStorage-compatible behaviour. This will + # typically match non-rpm paths, such as /images/ or /isos/ + if any([exclusion in uri for exclusion in exclude_paths]): + LOG.debug("Aliasing for %s was not applied as it matches one " + "of the following exclusion paths: %s.", + uri, + ",".join(exclude_paths), + extra={"event": "publish", "success": True}, + ) + continue + new_uri = uri.replace(src, dest, 1) LOG.debug( "Resolved alias:\n\tsrc: %s\n\tdest: %s", @@ -252,8 +263,8 @@ def add_out(new_uri: str) -> bool: # But this is not desired behavior, we know that every alias is intended # to be resolved in the URL a maximum of once, hence this adjustment. sub_aliases = [ - (subsrc, subdest) - for (subsrc, subdest) in aliases + (subsrc, subdest, exclusions) + for (subsrc, subdest, exclusions) in aliases if (subsrc, subdest) != (src, dest) ] @@ -267,7 +278,7 @@ def add_out(new_uri: str) -> bool: def uris_with_aliases( - uris: Iterable[str], aliases: list[tuple[str, str]] + uris: Iterable[str], aliases: list[tuple[str, str, list[str]]] ) -> list[str]: # Given a collection of uris and aliases, returns a new collection of uris # post alias resolution, including *both* sides of each alias when applicable. diff --git a/exodus_gw/routers/config.py b/exodus_gw/routers/config.py index 759ad7a2..94a345c1 100644 --- a/exodus_gw/routers/config.py +++ b/exodus_gw/routers/config.py @@ -43,6 +43,12 @@ "pattern": PATH_PATTERN, "description": "Target of the alias, relative to CDN root.", }, + "exclude_paths": { + "type": "array", + "items": {"type": "string"}, + "description": "Paths for which alias will not be resolved, " + "treated as an unanchored regex." + } }, }, "uniqueItems": True, @@ -134,19 +140,30 @@ def config_post( }, }, "origin_alias": [ - {"src": "/content/origin", "dest": "/origin"}, - {"src": "/origin/rpm", "dest": "/origin/rpms"}, + { + "src": "/content/origin", + "dest": "/origin", + "exclude_paths": [] + }, + { + "src": "/origin/rpm", + "dest": "/origin/rpms", + "exclude_paths": ["/iso/"] + }, ], "releasever_alias": [ { "dest": "/content/dist/rhel8/8.5", "src": "/content/dist/rhel8/8", + "exclude_paths": ["/files/", "/images/", "/iso/"] }, + ], "rhui_alias": [ { "dest": "/content/dist/rhel8", "src": "/content/dist/rhel8/rhui", + "exclude_paths": ["/files/", "/images/", "/iso/"] }, ], } diff --git a/exodus_gw/routers/deploy.py b/exodus_gw/routers/deploy.py index 5ca08a5e..d3e31ffa 100644 --- a/exodus_gw/routers/deploy.py +++ b/exodus_gw/routers/deploy.py @@ -56,19 +56,30 @@ def deploy_config( }, }, "origin_alias": [ - {"src": "/content/origin", "dest": "/origin"}, - {"src": "/origin/rpm", "dest": "/origin/rpms"}, + { + "src": "/content/origin", + "dest": "/origin", + "exclude_paths": [] + }, + { + "src": "/origin/rpm", + "dest": "/origin/rpms", + "exclude_paths": ["/iso/"] + }, ], "releasever_alias": [ { "dest": "/content/dist/rhel8/8.5", "src": "/content/dist/rhel8/8", + "exclude_paths": ["/files/", "/images/", "/iso/"] }, + ], "rhui_alias": [ { "dest": "/content/dist/rhel8", "src": "/content/dist/rhel8/rhui", + "exclude_paths": ["/files/", "/images/", "/iso/"] }, ], } diff --git a/exodus_gw/schemas.py b/exodus_gw/schemas.py index a2fcfc6d..bf1a6ee2 100644 --- a/exodus_gw/schemas.py +++ b/exodus_gw/schemas.py @@ -319,6 +319,10 @@ class Alias(BaseModel): dest: str = Field( ..., description="Target of the alias, relative to CDN root." ) + exclude_paths: list[str] | None = Field( + [], description="Paths for which alias will not be resolved, " + "treated as an unanchored regex." + ) class YumVariable(Enum): diff --git a/exodus_gw/worker/cache.py b/exodus_gw/worker/cache.py index 73545f6a..8d34f694 100644 --- a/exodus_gw/worker/cache.py +++ b/exodus_gw/worker/cache.py @@ -36,7 +36,7 @@ def __init__( paths: list[str], settings: Settings, env: str, - aliases: list[tuple[str, str]], + aliases: list[tuple[str, str, list[str]]], ): self.paths = [p for p in paths if not exclude_path(p)] self.settings = settings diff --git a/exodus_gw/worker/deploy.py b/exodus_gw/worker/deploy.py index 6b0d444c..6682682a 100644 --- a/exodus_gw/worker/deploy.py +++ b/exodus_gw/worker/deploy.py @@ -93,7 +93,8 @@ def deploy_config( db = Session(bind=db_engine(settings)) ddb = DynamoDB(env, settings, from_date) - original_aliases = {src: dest for (src, dest) in ddb.aliases_for_flush} + original_aliases = {src: dest for (src, dest, _) in ddb.aliases_for_flush} + original_exclusions = {src: exc for (src, _, exc) in ddb.aliases_for_flush} message = CurrentMessage.get_current_message() assert message @@ -142,12 +143,18 @@ def deploy_config( # URLs depending on what changed in the config. flush_paths: set[str] = set() - for src, updated_dest in ddb.aliases_for_flush: + for src, updated_dest, updated_exclusions in ddb.aliases_for_flush: if original_aliases.get(src) != updated_dest: for published_path in db.query(models.PublishedPath).filter( models.PublishedPath.env == env, models.PublishedPath.web_uri.like(f"{src}/%"), ): + # If any original exclusion matches the uri, the uri wouldn't + # have been treated as an alias, thus cache flushing would be + # unnecessary. + if any([exclusion in published_path.web_uri + for exclusion in original_exclusions.get(src, [])]): + continue LOG.info( "Updated alias %s will flush cache for %s", src, diff --git a/tests/aws/test_uri_alias.py b/tests/aws/test_uri_alias.py index b15ab4b0..f25a812c 100644 --- a/tests/aws/test_uri_alias.py +++ b/tests/aws/test_uri_alias.py @@ -11,8 +11,8 @@ ( "/content/origin/rpms/path/to/file.iso", [ - ("/content/origin", "/origin"), - ("/origin/rpm", "/origin/rpms"), + ("/content/origin", "/origin", []), + ("/origin/rpm", "/origin/rpms", []), ], [ "/origin/rpms/path/to/file.iso", @@ -22,7 +22,7 @@ ( "/content/dist/rhel8/8/path/to/file.rpm", [ - ("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"), + ("/content/dist/rhel8/8", "/content/dist/rhel8/8.5", []), ], [ "/content/dist/rhel8/8.5/path/to/file.rpm", @@ -50,8 +50,8 @@ def test_uri_alias_multi_level_write(): aliases = [ # The data here is made up as there is not currently any identified # realistic scenario having multi-level aliases during write. - ("/content/testproduct/1", "/content/testproduct/1.1.0"), - ("/content/other", "/content/testproduct"), + ("/content/testproduct/1", "/content/testproduct/1.1.0", []), + ("/content/other", "/content/testproduct", []), ] out = uri_alias(uri, aliases) @@ -74,10 +74,10 @@ def test_uri_alias_multi_level_flush(): aliases = [ # The caller is providing aliases in both src => dest and # dest => src directions, as in the "cache flush" case. - ("/content/dist/rhel8/8", "/content/dist/rhel8/8.8"), - ("/content/dist/rhel8/8.8", "/content/dist/rhel8/8"), - ("/content/dist/rhel8/rhui", "/content/dist/rhel8"), - ("/content/dist/rhel8", "/content/dist/rhel8/rhui"), + ("/content/dist/rhel8/8", "/content/dist/rhel8/8.8", []), + ("/content/dist/rhel8/8.8", "/content/dist/rhel8/8", []), + ("/content/dist/rhel8/rhui", "/content/dist/rhel8", []), + ("/content/dist/rhel8", "/content/dist/rhel8/rhui", []), ] out = uri_alias(uri, aliases) @@ -108,15 +108,15 @@ def test_uri_alias_limit(caplog: pytest.LogCaptureFixture): # actually used on production. uri = "/path/a/repo" - aliases = [ - ("/path/a", "/path/b"), - ("/path/b", "/path/c"), - ("/path/c", "/path/d"), - ("/path/d", "/path/e"), - ("/path/e", "/path/f"), - ("/path/f", "/path/g"), - ("/path/g", "/path/h"), - ("/path/h", "/path/i"), + aliases: list[tuple[str, str, list]] = [ + ("/path/a", "/path/b", []), + ("/path/b", "/path/c", []), + ("/path/c", "/path/d", []), + ("/path/d", "/path/e", []), + ("/path/e", "/path/f", []), + ("/path/f", "/path/g", []), + ("/path/g", "/path/h", []), + ("/path/h", "/path/i", []), ] out = uri_alias(uri, aliases) @@ -137,3 +137,52 @@ def test_uri_alias_limit(caplog: pytest.LogCaptureFixture): assert ( "Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text ) + + +@pytest.mark.parametrize( + "input,aliases,output,log_message", + [ + ( + "/content/dist/rhel9/9/x86_64/baseos/iso/PULP_MANIFEST", + [ + ("/content/dist/rhel9/9", "/content/dist/rhel9/9.5", ["/iso/"]), + ], + # just returns the original + ["/content/dist/rhel9/9/x86_64/baseos/iso/PULP_MANIFEST"], + "Aliasing for /content/dist/rhel9/9/x86_64/baseos/iso/PULP_MANIFEST " + "was not applied as it matches one of the following exclusion paths: /iso/." + ), + ( + "/some/path/with/file/in/it", + [ + ("/some/path", "/another/different/path", ["/none/", "/here/"]), + ("/another/different/path", "/this/wont/alias", ["/file/"]) + ], + ["/another/different/path/with/file/in/it", + "/some/path/with/file/in/it"], + "Aliasing for /another/different/path/with/file/in/it was not " + "applied as it matches one of the following exclusion paths: /file/." + ), + ( + "/my/base/content/path/cool_iso_tool.rpm", + [ + ("/my/base", "/your/own", ["/iso/"]), + ], + ["/your/own/content/path/cool_iso_tool.rpm", + "/my/base/content/path/cool_iso_tool.rpm"], + "Resolved alias:\\n\\tsrc: /my/base/content/path/cool_iso_tool.rpm" + "\\n\\tdest: /your/own/content/path/cool_iso_tool.rpm" + + ), + ], + + ids=["basic", "transitive", "filename"], +) +def test_uri_alias_exclusions(input, aliases, output, log_message, caplog): + caplog.set_level(DEBUG, logger="exodus-gw") + assert uri_alias(input, aliases) == output + assert ( + f'"message": "{log_message}", ' + '"event": "publish", ' + '"success": true' in caplog.text + ) diff --git a/tests/conftest.py b/tests/conftest.py index 52b5dcd3..1f63e49f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,9 @@ from .async_utils import BlockDetector +DEFAULT_EXCLUDE_PATHS = ["/files/", "/images/", "/iso/"] + + async def fake_aexit_instancemethod(self, exc_type, exc_val, exc_tb): pass @@ -267,24 +270,39 @@ def fake_config(): }, }, "origin_alias": [ - {"src": "/content/origin", "dest": "/origin"}, - {"src": "/origin/rpm", "dest": "/origin/rpms"}, + { + "src": "/content/origin", + "dest": "/origin", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, + }, + { + "src": "/origin/rpm", + "dest": "/origin/rpms", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, + }, ], "releasever_alias": [ { "src": "/content/dist/rhel8/8", "dest": "/content/dist/rhel8/8.5", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, }, { "src": "/content/testproduct/1", "dest": "/content/testproduct/1.1.0", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, }, ], "rhui_alias": [ - {"src": "/content/dist/rhel8/rhui", "dest": "/content/dist/rhel8"}, + { + "src": "/content/dist/rhel8/rhui", + "dest": "/content/dist/rhel8", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, + }, { "src": "/content/testproduct/rhui", "dest": "/content/testproduct", + "exclude_paths": DEFAULT_EXCLUDE_PATHS, }, ], } diff --git a/tests/routers/test_config.py b/tests/routers/test_config.py index 2f487827..54df79eb 100644 --- a/tests/routers/test_config.py +++ b/tests/routers/test_config.py @@ -46,7 +46,11 @@ def test_deploy_config_typical(fake_config, auth_header, endpoint): {"listing": {"/origin/rhel/server": {"values": ["8"], "var": "nope"}}}, { "rhui_alias": [ - {"dest": "/../rhel/rhui/server", "src": "/../rhel/rhui/server"} + { + "dest": "/../rhel/rhui/server", + "src": "/../rhel/rhui/server", + "exclude_paths": ["/this/", "/is/", "/fine/"], + } ] }, {"no_dont": 123}, diff --git a/tests/worker/test_cdn_cache.py b/tests/worker/test_cdn_cache.py index ac4ef6e2..95bd1339 100644 --- a/tests/worker/test_cdn_cache.py +++ b/tests/worker/test_cdn_cache.py @@ -209,10 +209,20 @@ def test_flush_cdn_cache_typical( { "origin_alias": [], "releasever_alias": [ - {"src": "/path/one", "dest": "/path/one-dest"}, + { + "src": "/path/one", + "dest": "/path/one-dest", + "exclude_paths": ["/files/", "/images/", + "/iso/"], + }, ], "rhui_alias": [ - {"src": "/path/rhui/two", "dest": "/path/two"}, + { + "src": "/path/rhui/two", + "dest": "/path/two", + "exclude_paths": ["/files/", "/images/", + "/iso/"], + }, ], } ) diff --git a/tests/worker/test_deploy.py b/tests/worker/test_deploy.py index 378d55cb..d236a972 100644 --- a/tests/worker/test_deploy.py +++ b/tests/worker/test_deploy.py @@ -138,6 +138,26 @@ def test_deploy_config_with_flush( ) ) + # Exclusion tests. This Path should not be flushed as it contains part of + # the original alias exclusions. + db.add( + PublishedPath( + env="test", + web_uri="/content/testproduct/1/iso/file4", + updated=datetime.now(tz=timezone.utc), + ) + ) + + # This path will be flushed as it was not part of the original alias + # exclusions. + db.add( + PublishedPath( + env="test", + web_uri="/content/testproduct/1/newExclusion/file5", + updated=datetime.now(tz=timezone.utc), + ) + ) + db.commit() # We're updating the alias in the config. @@ -146,6 +166,7 @@ def test_deploy_config_with_flush( { "dest": "/content/testproduct/1.2.0", "src": "/content/testproduct/1", + "exclude_paths": ["/newExclusion/"] }, ] worker.deploy_config(updated_config, "test", NOW_UTC) @@ -201,6 +222,7 @@ def test_deploy_config_with_flush( "/content/dist/rhel/server/listing", "/content/testproduct/1/file1", "/content/testproduct/1/file2", + "/content/testproduct/1/newExclusion/file5" ] # And actor call should have been delayed by this long.