From 42a68f5701b2905245046627e7be20226ed8f5be Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 22 Sep 2022 16:31:25 +0200 Subject: [PATCH] Add SitemapReader originally developed in OERSI Reads sitemap from URL, sends each `loc` URL to the receiver. e.g. `"https://hoou.de/sitemap.xml" | read-sitemap | open-http ...` in a Flux workflow to process every document linked in the sitemap. Supports paging via `from=` query string parameter (see #464) See: https://en.wikipedia.org/wiki/Sitemaps https://gitlab.com/oersi/oersi-etl/-/issues/4 https://gitlab.com/oersi/oersi-etl/-/issues/17 --- metafacture-io/build.gradle | 3 +- .../org/metafacture/io/SitemapReader.java | 132 ++++++++++++++++++ .../main/resources/flux-commands.properties | 1 + .../org/metafacture/io/SitemapReaderTest.java | 83 +++++++++++ .../resources/org/metafacture/io/sitemap.xml | 12 ++ 5 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 metafacture-io/src/main/java/org/metafacture/io/SitemapReader.java create mode 100644 metafacture-io/src/test/java/org/metafacture/io/SitemapReaderTest.java create mode 100644 metafacture-io/src/test/resources/org/metafacture/io/sitemap.xml diff --git a/metafacture-io/build.gradle b/metafacture-io/build.gradle index e9947499a..1f5577b68 100644 --- a/metafacture-io/build.gradle +++ b/metafacture-io/build.gradle @@ -22,10 +22,11 @@ dependencies { implementation project(':metafacture-commons') implementation 'commons-io:commons-io:2.5' implementation 'org.apache.commons:commons-compress:1.21' + implementation 'org.jooq:joox-java-6:1.6.0' + implementation 'org.slf4j:slf4j-simple:1.7.21' runtimeOnly 'org.tukaani:xz:1.6' testImplementation 'com.github.tomakehurst:wiremock-jre8:2.33.2' testImplementation 'junit:junit:4.12' testImplementation 'org.mockito:mockito-core:2.5.5' testImplementation 'org.assertj:assertj-core:3.11.1' - testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21' } diff --git a/metafacture-io/src/main/java/org/metafacture/io/SitemapReader.java 
b/metafacture-io/src/main/java/org/metafacture/io/SitemapReader.java
new file mode 100644
index 000000000..88fda6123
--- /dev/null
+++ b/metafacture-io/src/main/java/org/metafacture/io/SitemapReader.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2020, 2022 Fabian Steeg, hbz
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.io;
+
+import org.metafacture.framework.FluxCommand;
+import org.metafacture.framework.MetafactureException;
+import org.metafacture.framework.ObjectReceiver;
+import org.metafacture.framework.annotations.Description;
+import org.metafacture.framework.annotations.In;
+import org.metafacture.framework.annotations.Out;
+import org.metafacture.framework.helpers.DefaultObjectPipe;
+
+import org.joox.JOOX;
+import org.joox.Match;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Reads a sitemap and emits URLs.
+ *
+ * <p>Each incoming string is taken as the URL of an XML sitemap. The trimmed text
+ * of its {@code loc} elements is sent to the receiver, restricted by the optional
+ * filter and limit. If the URL contains a numeric {@code from=} parameter, further
+ * pages are requested by advancing that offset by the number of {@code loc}
+ * entries on the current page, until a page has no entries or the limit is reached.
+ *
+ * @author Fabian Steeg (fsteeg)
+ */
+@Description("Reads an XML sitemap from a URL, sends the sitemap's `loc` URLs to the receiver. " +
+        "If the sitemap URL contains a `from=` query string parameter, the reader will keep paging until no more results are returned. " +
+        "Set `filter` to send only URLs matching a given regular expression to the receiver (defaults to sending all URLs). " +
+        "Set `limit` to limit the total number of URLs to send to the receiver (defaults to sending all URLs, set explicitly with `-1`). " +
+        "Set `wait` for the time (in milliseconds) to wait after sending a URL to the receiver (defaults to `1000` i.e. 1 second).")
+@In(String.class)
+@Out(String.class)
+@FluxCommand("read-sitemap")
+public final class SitemapReader extends DefaultObjectPipe<String, ObjectReceiver<String>> {
+
+    private static final Logger LOG = LoggerFactory.getLogger(SitemapReader.class);
+    private static final int DEFAULT_WAIT = 1000;
+    private static final int DEFAULT_LIMIT = Integer.MAX_VALUE;
+    // Paging offset: group 1 holds the digits; capped at 18 so Long.parseLong cannot overflow.
+    private static final Pattern FROM_PARAM = Pattern.compile("\\bfrom=(\\d{1,18})(?!\\d)");
+
+    private Pattern filter;
+    private int limit = DEFAULT_LIMIT;
+    private int wait = DEFAULT_WAIT;
+
+    /**
+     * Creates an instance of {@link SitemapReader}.
+     */
+    public SitemapReader() {
+    }
+
+    /**
+     * Sets the regular expression that a URL must match in full to be sent to the receiver.
+     *
+     * @param filter The regex to match for filtering which URLs should be sent to the receiver
+     *               ({@code null} sends all URLs).
+     * @throws java.util.regex.PatternSyntaxException if the regex is invalid
+     */
+    public void setFilter(final String filter) {
+        // Compile once here instead of once per URL via String.matches.
+        this.filter = filter == null ? null : Pattern.compile(filter);
+    }
+
+    /**
+     * Sets the maximum number of URLs sent per processed sitemap, counted across all of its pages.
+     *
+     * @param limit The total number of URLs that should be sent to the receiver (-1 for unlimited).
+     */
+    public void setLimit(final int limit) {
+        this.limit = limit < 0 ? DEFAULT_LIMIT : limit;
+    }
+
+    /**
+     * Sets the pause after each URL that is sent to the receiver.
+     *
+     * @param wait The time (in milliseconds) to wait after a URL has been sent to the receiver
+     *             (zero or less disables waiting).
+     */
+    public void setWait(final int wait) {
+        this.wait = wait;
+    }
+
+    /**
+     * Reads the sitemap at the given URL, following its pages if it is paged, and sends
+     * the filtered {@code loc} URLs to the receiver.
+     *
+     * @param sitemap The URL of the sitemap to read.
+     * @throws MetafactureException if the sitemap cannot be read or parsed, or if the
+     *                              thread is interrupted while waiting
+     */
+    @Override
+    public void process(final String sitemap) {
+        // The limit is a total, so all pages of this sitemap draw from one budget.
+        int remaining = limit;
+        String pageUrl = sitemap;
+        try {
+            // Loop instead of recursing per page so that long paged sitemaps do not grow the call stack.
+            while (pageUrl != null) {
+                LOG.debug("Processing sitemap URL {}", pageUrl);
+                final Match siteMapXml = JOOX.$(new URL(pageUrl));
+                final List<String> locs = siteMapXml.find("loc")
+                        .map(m -> m.element().getTextContent().trim());
+                remaining -= sendMatching(locs, remaining);
+                // Page on the unfiltered entry count: a filter must not end paging early or shift the offset.
+                final boolean isDone = locs.isEmpty() || remaining <= 0;
+                pageUrl = isDone ? null : nextPageUrl(pageUrl, locs.size());
+            }
+        }
+        catch (final SAXException | IOException e) {
+            throw new MetafactureException(e.getMessage(), e);
+        }
+        catch (final InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new MetafactureException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Sends the given URLs that pass the filter to the receiver, in order, up to {@code max}.
+     *
+     * @return the number of URLs sent
+     */
+    private int sendMatching(final List<String> locs, final int max) throws InterruptedException {
+        final List<String> urls = locs.stream()
+                .filter(s -> filter == null || filter.matcher(s).matches())
+                .limit(max)
+                .collect(Collectors.toList());
+        for (final String url : urls) {
+            LOG.trace("Processing resource URL {}", url);
+            getReceiver().process(url);
+            // Thread.sleep rejects negative values, so only wait for a positive setting.
+            if (wait > 0) {
+                Thread.sleep(wait);
+            }
+        }
+        return urls.size();
+    }
+
+    /**
+     * Builds the URL of the next page by adding the page size to the {@code from=} offset.
+     *
+     * @return the next page URL, or {@code null} if the URL has no numeric {@code from=} parameter
+     */
+    private static String nextPageUrl(final String pageUrl, final int pageSize) {
+        final Matcher from = FROM_PARAM.matcher(pageUrl);
+        if (!from.find()) {
+            return null;
+        }
+        final long nextFrom = Long.parseLong(from.group(1)) + pageSize;
+        // Splice by position so that only the matched offset is rewritten.
+        return pageUrl.substring(0, from.start(1)) + nextFrom + pageUrl.substring(from.end(1));
+    }
+
+}
diff --git a/metafacture-io/src/main/resources/flux-commands.properties b/metafacture-io/src/main/resources/flux-commands.properties
index 39540d47e..2007fadb8 100644
--- a/metafacture-io/src/main/resources/flux-commands.properties
+++ b/metafacture-io/src/main/resources/flux-commands.properties
@@ -22,3 +22,4 @@ write org.metafacture.io.ObjectWriter
 as-records org.metafacture.io.RecordReader
open-resource org.metafacture.io.ResourceOpener
 open-tar org.metafacture.io.TarReader
+read-sitemap org.metafacture.io.SitemapReader
diff --git a/metafacture-io/src/test/java/org/metafacture/io/SitemapReaderTest.java b/metafacture-io/src/test/java/org/metafacture/io/SitemapReaderTest.java
new file mode 100644
index 000000000..4aa8dafe2
--- /dev/null
+++ b/metafacture-io/src/test/java/org/metafacture/io/SitemapReaderTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2020, 2022 Fabian Steeg, hbz
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.io;
+
+import org.metafacture.framework.MetafactureException;
+import org.metafacture.framework.ObjectReceiver;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.InOrder;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests for {@link SitemapReader}.
+ *
+ * @author Fabian Steeg
+ *
+ */
+public final class SitemapReaderTest {
+
+    private String sitemap = "sitemap.xml";
+    private SitemapReader sitemapReader;
+
+    @Mock
+    private ObjectReceiver<String> receiver;
+    private InOrder inOrder;
+
+    @Before
+    public void setup() {
+        MockitoAnnotations.initMocks(this);
+        sitemapReader = new SitemapReader();
+        sitemapReader.setWait(0); // we're not actually crawling any urls in the tests
+        sitemapReader.setReceiver(receiver);
+        inOrder = Mockito.inOrder(receiver);
+    }
+
+    @Test
+    public void testShouldProcessAll() {
+        sitemapReader.process(getClass().getResource(sitemap).toString());
+        inOrder.verify(receiver).process("https://www.oncampus.de/Customer_Experience_Management");
+        inOrder.verify(receiver).process("https://www.oncampus.de/Propädeutik_Mathe_Grundlagen");
+        inOrder.verify(receiver).process("https://www.oncampus.de/MDR/Websession2020");
+        inOrder.verifyNoMoreInteractions();
+    }
+
+    @Test
+    public void testShouldProcessPattern() {
+        // The filter has to be set before processing, otherwise it is never applied.
+        sitemapReader.setFilter(".*/MDR/.*");
+        sitemapReader.process(getClass().getResource(sitemap).toString());
+        // Unordered verification: InOrder would not flag unfiltered URLs sent before the match.
+        Mockito.verify(receiver).process("https://www.oncampus.de/MDR/Websession2020");
+        Mockito.verifyNoMoreInteractions(receiver);
+    }
+
+    @Test(expected = MetafactureException.class)
+    public void testShouldThrowOnInvalidUrl() {
+        sitemapReader.process("");
+    }
+
+    @After
+    public void cleanup() {
+        sitemapReader.closeStream();
+    }
+}
diff --git a/metafacture-io/src/test/resources/org/metafacture/io/sitemap.xml b/metafacture-io/src/test/resources/org/metafacture/io/sitemap.xml
new file mode 100644
index 000000000..9b4d7d4da
--- /dev/null
+++ b/metafacture-io/src/test/resources/org/metafacture/io/sitemap.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>https://www.oncampus.de/Customer_Experience_Management</loc>
+  </url>
+  <url>
+    <loc>https://www.oncampus.de/Propädeutik_Mathe_Grundlagen</loc>
+  </url>
+  <url>
+    <loc>https://www.oncampus.de/MDR/Websession2020</loc>
+  </url>
+</urlset>