diff --git a/README.md b/README.md index 5f37ae064..21058556b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![Metafacture](https://raw.github.com/wiki/metafacture/metafacture-core/img/metafacture.png) -[![Build](https://github.com/metafacture/metafacture-core/workflows/Build/badge.svg?branch=master)](https://github.com/metafacture/metafacture-core/actions?query=workflow%3ABuild) +[![Build](https://github.com/metafacture/metafacture-core/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/metafacture/metafacture-core/actions?query=workflow%3ABuild) Metafacture is a toolkit for processing semi-structured data with a focus on library metadata. It provides a versatile set of tools for reading, writing and transforming data. Metafacture can be used as a stand-alone application or as a Java library in other applications. The name Metafacture is a portmanteau of the words *meta*data and manu*facture*. diff --git a/build.gradle b/build.gradle index 424b6af76..37dcbe95e 100644 --- a/build.gradle +++ b/build.gradle @@ -28,7 +28,7 @@ subprojects { versions = [ 'assertj_core': '3.11.1', 'commons_compress': '1.21', - 'guava': '29.0-jre', + 'guava': '32.0.1-jre', 'jackson_databind': '2.15.1', 'jdk': '11', 'junit': '4.12', diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index e6441136f..a4b76b953 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 8a1f6b97f..82dd18b20 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,7 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionSha256Sum=a4b4158601f8636cdeeab09bd76afb640030bb5b144aafe261a5e8af027dc612 -distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip +distributionSha256Sum=57dafb5c2622c6cc08b993c85b7c06956a2f53536432a30ead46166dbca0f1e9 +distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/gradlew b/gradlew index 1aa94a426..f5feea6d6 100755 --- a/gradlew +++ b/gradlew @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# SPDX-License-Identifier: Apache-2.0 +# ############################################################################## # @@ -55,7 +57,7 @@ # Darwin, MinGW, and NonStop. # # (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt # within the Gradle project. # # You can find Gradle at https://github.com/gradle/gradle/. @@ -84,7 +86,8 @@ done # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) -APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit +APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s +' "$PWD" ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum diff --git a/gradlew.bat b/gradlew.bat index 25da30dbd..9d21a2183 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -13,6 +13,8 @@ @rem See the License for the specific language governing permissions and @rem limitations under the License. @rem +@rem SPDX-License-Identifier: Apache-2.0 +@rem @if "%DEBUG%"=="" @echo off @rem ########################################################################## diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/Marc21Encoder.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/Marc21Encoder.java index 3cd536fe7..fb8ad50b2 100644 --- a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/Marc21Encoder.java +++ b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/Marc21Encoder.java @@ -81,6 +81,7 @@ public final class Marc21Encoder extends private State state = State.IN_STREAM; private boolean generateIdField; + private boolean validateLeader = true; /** * Initializes the encoder with MARC 21 constants and charset. @@ -108,6 +109,18 @@ public void setGenerateIdField(final boolean generateIdField) { this.generateIdField = generateIdField; } + /** + * Controls whether the leader should be validated. + *

+ * The default value of {@code validateLeader} is true. + *

+ * + * @param validateLeader if false the leader is not validated + */ + public void setValidateLeader(final boolean validateLeader) { + this.validateLeader = validateLeader; + } + /** * Gets the flag to decide whether the ID field is generated. * @@ -259,12 +272,14 @@ private void processLeaderAsSubfields(final String name, final char code) { } private void requireValidCode(final char code, final char[] validCodes) { - for (final char validCode: validCodes) { - if (validCode == code) { - return; + if (validateLeader) { + for (final char validCode : validCodes) { + if (validCode == code) { + return; + } } + throw new FormatException("invalid code '" + code + "'; allowed codes are: " + Arrays.toString(validCodes)); } - throw new FormatException("invalid code '" + code + "'; allowed codes are: " + Arrays.toString(validCodes)); } private void processTopLevelLiteral(final String name, final String value) { diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlEncoder.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlEncoder.java index 9dba83d51..9fb12b016 100644 --- a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlEncoder.java +++ b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlEncoder.java @@ -49,9 +49,6 @@ public final class MarcXmlEncoder extends DefaultStreamPipe"; - private static final String ROOT_CLOSE = ""; - private enum Tag { collection(" xmlns%s=\"" + NAMESPACE + "\"%s"), @@ -106,7 +103,6 @@ public String close(final Object[] args) { private static final int TAG_END = 3; private final Encoder encoder = new Encoder(); - private final Marc21Decoder decoder = new Marc21Decoder(); private final Marc21Encoder wrapper = new Marc21Encoder(); private DefaultStreamPipe> pipe; @@ -115,6 +111,7 @@ public String close(final Object[] args) { * Creates an instance of {@link MarcXmlEncoder}. */ public MarcXmlEncoder() { + final Marc21Decoder decoder = new Marc21Decoder(); decoder.setEmitLeaderAsWhole(true); wrapper @@ -136,7 +133,6 @@ public void setEmitNamespace(final boolean emitNamespace) { /** * Sets the flag to decide whether to omit the XML declaration. - * * Default value: {@value #OMIT_XML_DECLARATION} * * @param currentOmitXmlDeclaration true if the XML declaration is omitted, otherwise @@ -148,7 +144,6 @@ public void omitXmlDeclaration(final boolean currentOmitXmlDeclaration) { /** * Sets the XML version. - * * Default value: {@value #XML_VERSION} * * @param xmlVersion the XML version @@ -159,7 +154,6 @@ public void setXmlVersion(final String xmlVersion) { /** * Sets the XML encoding. - * * Default value: {@value #XML_ENCODING} * * @param xmlEncoding the XML encoding @@ -173,7 +167,6 @@ public void setXmlEncoding(final String xmlEncoding) { * If true, the input data is validated to ensure correct MARC21. Also the leader may be generated. * It acts as a wrapper: the input is piped to {@link org.metafacture.biblio.marc21.Marc21Encoder}, whose output is piped to {@link org.metafacture.biblio.marc21.Marc21Decoder}, whose output is piped to {@link org.metafacture.biblio.marc21.MarcXmlEncoder}. * This validation and treatment of the leader is more safe but comes with a performance impact. - * * Default value: {@value #ENSURE_CORRECT_MARC21_XML} * * @param ensureCorrectMarc21Xml if true the input data is validated to ensure correct MARC21. Also the leader may be generated. @@ -184,7 +177,6 @@ public void setEnsureCorrectMarc21Xml(final boolean ensureCorrectMarc21Xml) { /** * Formats the resulting xml by indentation. Aka "pretty printing". - * * Default value: {@value #PRETTY_PRINTED} * * @param formatted true if formatting is activated, otherwise false @@ -220,7 +212,7 @@ public void literal(final String name, final String value) { @Override protected void onResetStream() { - pipe.resetStream(); + encoder.onResetStream(); } @Override @@ -247,11 +239,12 @@ private static class Encoder extends DefaultStreamPipe> { private String currentEntity = ""; private boolean emitNamespace = true; - private Object[] namespacePrefix = new Object[]{emitNamespace ? NAMESPACE_PREFIX : EMPTY}; + private Object[] namespacePrefix = new Object[]{NAMESPACE_PREFIX}; private int indentationLevel; private boolean formatted = PRETTY_PRINTED; private int recordAttributeOffset; + private int recordLeaderOffset; private Encoder() { } @@ -294,7 +287,7 @@ public void startRecord(final String identifier) { writeTag(Tag.record::open); recordAttributeOffset = builder.length() - 1; prettyPrintNewLine(); - + recordLeaderOffset = builder.length(); incrementIndentationLevel(); } @@ -345,6 +338,7 @@ public void literal(final String name, final String value) { if (name.equals(Marc21EventNames.MARCXML_TYPE_LITERAL)) { if (value != null) { builder.insert(recordAttributeOffset, String.format(ATTRIBUTE_TEMPLATE, name, value)); + recordLeaderOffset = builder.length(); } } else if (!appendLeader(name, value)) { @@ -353,7 +347,7 @@ else if (!appendLeader(name, value)) { if (value != null) { writeEscaped(value.trim()); } - writeTag(Tag.controlfield::close); + writeTag(Tag.controlfield::close, false); prettyPrintNewLine(); } } @@ -378,7 +372,9 @@ protected void onResetStream() { @Override protected void onCloseStream() { - writeFooter(); + if (!atStreamStart) { + writeFooter(); + } sendAndClearData(); } @@ -408,9 +404,20 @@ private void writeFooter() { * @param str the unescaped sequence to be written */ private void writeRaw(final String str) { + builder.append(str); } + /** + * Writes the unescaped sequence to the leader position. + * + * @param str the unescaped sequence to be written to the leader position + */ + private void writeRawLeader(final String str) { + builder.insert(recordLeaderOffset, str); + recordLeaderOffset = recordLeaderOffset + str.length(); + } + private boolean appendLeader(final String name, final String value) { if (name.equals(Marc21EventNames.LEADER_ENTITY)) { leaderBuilder.append(value); @@ -432,12 +439,18 @@ private void writeEscaped(final String str) { private void writeLeader() { final String leader = leaderBuilder.toString(); - if (!leader.isEmpty()) { - prettyPrintIndentation(); - writeTag(Tag.leader::open); - writeRaw("0000" + leader.substring(0, 4) + "2200000" + leader.substring(5, 7) + "4500"); // creates a valid leader without counted elements - writeTag(Tag.leader::close); - prettyPrintNewLine(); + if (leaderBuilder.length() > 0) { + if (formatted) { + writeRawLeader(getIndentationPrefix()); + } + + writeTagLeader(Tag.leader::open); + writeRawLeader("0000" + leader.substring(0, 4) + "2200000" + leader.substring(5, 7) + "4500"); // creates a valid leader without counted elements + writeTagLeader(Tag.leader::close); + + if (formatted) { + writeRawLeader(NEW_LINE); + } } } @@ -447,10 +460,17 @@ private void writeTag(final Function function, final Object... writeRaw(function.apply(allArgs)); } + private void writeTagLeader(final Function function) { + writeRawLeader(function.apply(namespacePrefix)); + } + + private String getIndentationPrefix() { + return String.join("", Collections.nCopies(indentationLevel, INDENT)); + } + private void prettyPrintIndentation() { if (formatted) { - final String prefix = String.join("", Collections.nCopies(indentationLevel, INDENT)); - builder.append(prefix); + builder.append(getIndentationPrefix()); } } diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlHandler.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlHandler.java index 16632cc66..b0333faf0 100644 --- a/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlHandler.java +++ b/metafacture-biblio/src/main/java/org/metafacture/biblio/marc21/MarcXmlHandler.java @@ -32,7 +32,7 @@ * @author Markus Michael Geipel * */ -@Description("A marc xml reader") +@Description("A MARC XML reader. To read marc data without namespace specification set option `namespace=\"\"`") @In(XmlReceiver.class) @Out(StreamReceiver.class) @FluxCommand("handle-marcxml") @@ -63,7 +63,8 @@ public MarcXmlHandler() { * * Default value: {@value #NAMESPACE} * - * @param namespace the namespace + * @param namespace the namespace. Set to null if namespace shouldn't be checked. Set to empty string + * if the namespace is missing in the data. */ public void setNamespace(final String namespace) { this.namespace = namespace; diff --git a/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/Marc21EncoderTest.java b/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/Marc21EncoderTest.java index e8e70325d..f81d864c2 100644 --- a/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/Marc21EncoderTest.java +++ b/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/Marc21EncoderTest.java @@ -38,6 +38,8 @@ */ public final class Marc21EncoderTest { + private static final String BAD_LEADER = "00600ny a22002053n 4500"; + private Marc21Encoder marc21Encoder; @Mock @@ -147,4 +149,21 @@ public void issue524ShouldComputeValidLeader() { verify(receiver).process(matches("00055pam a2200037 c 4500021001700000\u001e.*\u001d")); } + @Test(expected = FormatException.class) + public void issue567ShouldFailValidateLeaderAsDefault() { + marc21Encoder.startRecord(""); + marc21Encoder.literal(LEADER_ENTITY, BAD_LEADER); + marc21Encoder.endRecord(); + } + + @Test + public void issue567ShouldNotValidateLeader() { + marc21Encoder.setValidateLeader(false); + marc21Encoder.startRecord(""); + marc21Encoder.literal(LEADER_ENTITY, BAD_LEADER ); + marc21Encoder.endRecord(); + + verify(receiver).process(matches("00026ny a22000253n 4500\u001e\u001d")); + } + } diff --git a/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/MarcXmlEncoderTest.java b/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/MarcXmlEncoderTest.java index 04e40652e..2e1c8df23 100644 --- a/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/MarcXmlEncoderTest.java +++ b/metafacture-biblio/src/test/java/org/metafacture/biblio/marc21/MarcXmlEncoderTest.java @@ -25,6 +25,7 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; /** @@ -50,6 +51,7 @@ public class MarcXmlEncoderTest { private static final String RECORD_ID = "92005291"; private static StringBuilder resultCollector; + private static int resultCollectorsResetStreamCount; private static MarcXmlEncoder encoder; @Before @@ -61,6 +63,11 @@ public void setUp() { public void process(final String obj) { resultCollector.append(obj); } + @Override + public void resetStream() { + ++resultCollectorsResetStreamCount; + } + }); resultCollector = new StringBuilder(); } @@ -129,10 +136,19 @@ public void createAnEmptyRecord() { @Test public void createARecordPrettyPrint() { encoder.setFormatted(true); - addOneRecord(encoder); + encoder.startRecord(RECORD_ID); + encoder.startEntity(Marc21EventNames.LEADER_ENTITY); + encoder.literal(Marc21EventNames.LEADER_ENTITY, "dummy"); + encoder.endEntity(); + encoder.literal("001", RECORD_ID); + encoder.startEntity("010 "); + encoder.literal("a", RECORD_ID); + encoder.endEntity(); + encoder.endRecord(); encoder.closeStream(); - String expected = XML_DECLARATION + "\n" + XML_ROOT_OPEN + "\n"// " \n" + String expected = XML_DECLARATION + "\n" + XML_ROOT_OPEN + "\n" + "\t\n"// + + "\t\tdummy\n" + "\t\t92005291\n"// + "\t\t\n"// + "\t\t\t92005291\n"// @@ -231,6 +247,21 @@ public void issue336_createRecordWithTopLevelLeader_dummy() { assertEquals(expected, actual); } + @Test + public void issue548_createRecordWithTypeAttributeInRecordTagAndLeader() { + encoder.startRecord(RECORD_ID); + encoder.literal("type", "Bibliographic"); + encoder.startEntity(Marc21EventNames.LEADER_ENTITY); + encoder.literal(Marc21EventNames.LEADER_ENTITY, "dummy"); + encoder.endEntity(); + encoder.endRecord(); + encoder.closeStream(); + String expected = XML_DECLARATION + XML_ROOT_OPEN + "" + + "dummy" + XML_MARC_COLLECTION_END_TAG; + String actual = resultCollector.toString(); + assertEquals(expected, actual); + } + @Test public void issue336_createRecordWithTopLevelLeader_defaultMarc21Xml() { issue336_createRecordWithTopLevelLeader(encoder, "00000naa a2200000uc 4500"); @@ -249,12 +280,26 @@ private void issue336_createRecordWithTopLevelLeader(final MarcXmlEncoder encode encoder.endRecord(); encoder.closeStream(); String expected = XML_DECLARATION + XML_ROOT_OPEN - + "8u3287432" + - "" + expectedLeader + "" + XML_MARC_COLLECTION_END_TAG; + + "" + expectedLeader + "" + + "8u3287432" + XML_MARC_COLLECTION_END_TAG; String actual = resultCollector.toString(); assertEquals(expected, actual); } + @Test + public void issue548_failWhenLeaderIsNotFirst() { + encoder.startRecord("1"); + encoder.literal("001", "8u3287432"); + encoder.literal(Marc21EventNames.LEADER_ENTITY, "00000naa a2200000uc 4500"); + encoder.endRecord(); + encoder.closeStream(); + String expected = XML_DECLARATION + XML_ROOT_OPEN + + "8u3287432" + + "00000naa a2200000uc 4500" + XML_MARC_COLLECTION_END_TAG; + String actual = resultCollector.toString(); + assertNotEquals(expected, actual); + } + @Test public void issue527_shouldEmitLeaderAlwaysAsWholeString() { createRecordWithLeader("1", "a", "o", "a", " ", "a", "z", "u", " "); @@ -350,4 +395,26 @@ public void shouldNotEncodeNestedTypeLiteralAsAttribute() { assertEquals(expected, actual); } + @Test + public void issue543_shouldNotWriteFooterWhenRecordIsEmpty() { + encoder.closeStream(); + String actual = resultCollector.toString(); + assertTrue(actual.isEmpty()); + } + + @Test + public void issue543_shouldOnlyResetStreamOnce() { + resultCollectorsResetStreamCount = 0; + encoder.resetStream(); + assertEquals(resultCollectorsResetStreamCount, 1); + } + + @Test + public void issue543_shouldOnlyResetStreamOnceUsingWrapper() { + resultCollectorsResetStreamCount = 0; + encoder.setEnsureCorrectMarc21Xml(true); + encoder.resetStream(); + assertEquals(resultCollectorsResetStreamCount, 1); + } + } diff --git a/metafacture-csv/build.gradle b/metafacture-csv/build.gradle index 8d7d743b5..ee029ff14 100644 --- a/metafacture-csv/build.gradle +++ b/metafacture-csv/build.gradle @@ -19,7 +19,7 @@ description = 'Modules for processing comma-separated values' dependencies { api project(':metafacture-framework') - implementation 'com.opencsv:opencsv:3.9' + implementation 'com.opencsv:opencsv:5.9' testImplementation "junit:junit:${versions.junit}" testImplementation "org.mockito:mockito-core:${versions.mockito}" } diff --git a/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java b/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java index 06bd6a690..45b2ac4b0 100644 --- a/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java +++ b/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java @@ -1,5 +1,5 @@ /* - * Copyright 2013, 2014 Deutsche Nationalbibliothek + * Copyright 2013-2024 Deutsche Nationalbibliothek and hbz * * Licensed under the Apache License, Version 2.0 the "License"; * you may not use this file except in compliance with the License. @@ -24,6 +24,10 @@ import org.metafacture.framework.helpers.DefaultObjectPipe; import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import com.opencsv.RFC4180Parser; +import com.opencsv.RFC4180ParserBuilder; +import com.opencsv.exceptions.CsvException; import java.io.IOException; import java.io.StringReader; @@ -48,6 +52,7 @@ public final class CsvDecoder extends DefaultObjectPipe private String[] header = new String[0]; private int count; private boolean hasHeader; + private RFC4180Parser parser; /** * Creates an instance of {@link CsvDecoder} with a given separator. @@ -56,6 +61,7 @@ public final class CsvDecoder extends DefaultObjectPipe */ public CsvDecoder(final String separator) { this.separator = separator.charAt(0); + initializeCsvParser(); } /** @@ -65,6 +71,7 @@ public CsvDecoder(final String separator) { */ public CsvDecoder(final char separator) { this.separator = separator; + initializeCsvParser(); } /** @@ -72,6 +79,13 @@ public CsvDecoder(final char separator) { * {@value #DEFAULT_SEP}. */ public CsvDecoder() { + initializeCsvParser(); + } + + private void initializeCsvParser() { + this.parser = new RFC4180ParserBuilder() + .withSeparator(separator) + .build(); } @Override @@ -105,18 +119,19 @@ else if (parts.length == header.length) { } } - private String[] parseCsv(final String string) { + private String[] parseCsv(final String csv) { String[] parts = new String[0]; try { - final CSVReader reader = new CSVReader(new StringReader(string), - separator); + final CSVReader reader = new CSVReaderBuilder(new StringReader(csv)) + .withCSVParser(parser) + .build(); final List lines = reader.readAll(); if (lines.size() > 0) { parts = lines.get(0); } reader.close(); } - catch (final IOException e) { + catch (final IOException | CsvException e) { e.printStackTrace(); } return parts; @@ -139,5 +154,6 @@ public void setHasHeader(final boolean hasHeader) { */ public void setSeparator(final String separator) { this.separator = separator.charAt(0); + initializeCsvParser(); } } diff --git a/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java b/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java index ed095383c..4958775c3 100644 --- a/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java +++ b/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java @@ -89,4 +89,22 @@ public void testTabSeparated() { ordered.verify(receiver).endRecord(); } + /** + * In: "a","b\t","c\\t","\","\cd\" + * Out: a, b , c\\t, \, \cd\ + */ + @Test + public void issue496_escaping() { + decoder.setHasHeader(false); + decoder.process("\"a\",\"b\t\",\"c\\t\",\"\\\",\"\\cd\\\""); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startRecord("1"); + ordered.verify(receiver).literal("0", "a"); + ordered.verify(receiver).literal("1", "b\t"); + ordered.verify(receiver).literal("2", "c\\t"); + ordered.verify(receiver).literal("3", "\\"); + ordered.verify(receiver).literal("4", "\\cd\\"); + ordered.verify(receiver).endRecord(); + } + } diff --git a/metafacture-io/src/main/java/org/metafacture/io/ObjectFileWriter.java b/metafacture-io/src/main/java/org/metafacture/io/ObjectFileWriter.java index f74629827..0e578d1da 100644 --- a/metafacture-io/src/main/java/org/metafacture/io/ObjectFileWriter.java +++ b/metafacture-io/src/main/java/org/metafacture/io/ObjectFileWriter.java @@ -18,7 +18,9 @@ import org.metafacture.framework.FluxCommand; import org.metafacture.framework.MetafactureException; +import org.metafacture.framework.annotations.Description; import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; import java.io.FileOutputStream; import java.io.IOException; @@ -36,7 +38,9 @@ * @author Christoph Böhme * */ +@Description("Writes objects to one (or more) file(s)") @In(Object.class) +@Out(Void.class) @FluxCommand("write-files") public final class ObjectFileWriter extends AbstractObjectWriter { @@ -90,18 +94,21 @@ public void setCompression(final String compression) { @Override public void process(final T obj) { assert !closed; - try { - if (firstObject) { - getWriter().write(getHeader()); - firstObject = false; + final String objStr = obj.toString(); + if (!objStr.isEmpty()) { + try { + if (firstObject) { + getWriter().write(getHeader()); + firstObject = false; + } + else { + getWriter().write(getSeparator()); + } + getWriter().write(objStr); } - else { - getWriter().write(getSeparator()); + catch (final IOException e) { + throw new MetafactureException(e); } - getWriter().write(obj.toString()); - } - catch (final IOException e) { - throw new MetafactureException(e); } } diff --git a/metafacture-io/src/main/java/org/metafacture/io/ObjectStdoutWriter.java b/metafacture-io/src/main/java/org/metafacture/io/ObjectStdoutWriter.java index 5394b26a7..599713557 100644 --- a/metafacture-io/src/main/java/org/metafacture/io/ObjectStdoutWriter.java +++ b/metafacture-io/src/main/java/org/metafacture/io/ObjectStdoutWriter.java @@ -19,6 +19,7 @@ import org.metafacture.framework.FluxCommand; import org.metafacture.framework.annotations.Description; import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; import java.nio.charset.Charset; @@ -31,6 +32,7 @@ @Description("Writes objects to stdout") @In(Object.class) +@Out(Void.class) @FluxCommand("print") public final class ObjectStdoutWriter extends AbstractObjectWriter { diff --git a/metafacture-io/src/test/java/org/metafacture/io/ObjectFileWriterTest.java b/metafacture-io/src/test/java/org/metafacture/io/ObjectFileWriterTest.java index 64d877fb2..77d9ad612 100644 --- a/metafacture-io/src/test/java/org/metafacture/io/ObjectFileWriterTest.java +++ b/metafacture-io/src/test/java/org/metafacture/io/ObjectFileWriterTest.java @@ -20,6 +20,12 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assume.assumeFalse; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.metafacture.commons.ResourceUtil; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -28,12 +34,6 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.metafacture.commons.ResourceUtil; - /** * Tests for class {@link ObjectFileWriter}. * @@ -105,6 +105,14 @@ public void shouldIncrementCountOnResetBeforeStartingNewFile() throws IOExceptio assertTrue(new File(tempFolder.getRoot(), "test-1").exists()); } + @Test + public void issue543_shouldResultEmptyWhenNothingIsProcessed() throws IOException { + writer.process(""); + writer.closeStream(); + + assertOutput(""); + } + @Override protected ConfigurableObjectWriter getWriter() { return writer; diff --git a/metafacture-mangling/src/main/java/org/metafacture/mangling/ObjectToLiteral.java b/metafacture-mangling/src/main/java/org/metafacture/mangling/ObjectToLiteral.java index 63cc01c23..022b55084 100644 --- a/metafacture-mangling/src/main/java/org/metafacture/mangling/ObjectToLiteral.java +++ b/metafacture-mangling/src/main/java/org/metafacture/mangling/ObjectToLiteral.java @@ -19,6 +19,7 @@ import org.metafacture.framework.FluxCommand; import org.metafacture.framework.StreamReceiver; import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; import org.metafacture.framework.annotations.Out; import org.metafacture.framework.helpers.DefaultObjectPipe; @@ -29,6 +30,7 @@ * @author Christoph Böhme, Fabian Steeg */ @Description("Outputs a record containing the input object as literal") +@In(Object.class) @Out(StreamReceiver.class) @FluxCommand("object-to-literal") public final class ObjectToLiteral extends diff --git a/metafacture-xml/src/main/java/org/metafacture/xml/GenericXmlHandler.java b/metafacture-xml/src/main/java/org/metafacture/xml/GenericXmlHandler.java index 3fbe15da6..6bfd72591 100644 --- a/metafacture-xml/src/main/java/org/metafacture/xml/GenericXmlHandler.java +++ b/metafacture-xml/src/main/java/org/metafacture/xml/GenericXmlHandler.java @@ -34,7 +34,11 @@ * @author Markus Michael Geipel * */ -@Description("A generic xml reader") +@Description("A generic XML reader. Separates XML data in distinct records with the defined record tag name (default: `recordtagname=\"record\"`) " + + "If no matching record tag is found, the output will be empty. " + + "The handler breaks down XML elements with simple string values and optional attributes " + + "into entities with a value subfield (name configurable) and additional subfields for each attribute. " + + "Record tag and value tag names can be configured. Attributes can get an attributeMarker.") @In(XmlReceiver.class) @Out(StreamReceiver.class) @FluxCommand("handle-generic-xml") diff --git a/metafacture-xml/src/main/java/org/metafacture/xml/SimpleXmlEncoder.java b/metafacture-xml/src/main/java/org/metafacture/xml/SimpleXmlEncoder.java index 485ed7c62..eb39d6671 100644 --- a/metafacture-xml/src/main/java/org/metafacture/xml/SimpleXmlEncoder.java +++ b/metafacture-xml/src/main/java/org/metafacture/xml/SimpleXmlEncoder.java @@ -47,7 +47,7 @@ * @author Christoph Böhme * */ -@Description("Encodes a stream as xml") +@Description("Encodes a stream as XML. Defaults: `rootTag=\"records\"`, `recordTag=\"record\"`, no attributeMarker.") @In(StreamReceiver.class) @Out(String.class) @FluxCommand("stream-to-xml") diff --git a/metafacture-xml/src/main/java/org/metafacture/xml/XmlDecoder.java b/metafacture-xml/src/main/java/org/metafacture/xml/XmlDecoder.java index 1712c2771..2fc113802 100644 --- a/metafacture-xml/src/main/java/org/metafacture/xml/XmlDecoder.java +++ b/metafacture-xml/src/main/java/org/metafacture/xml/XmlDecoder.java @@ -41,25 +41,24 @@ * @author Christoph Böhme * */ -@Description("Reads an XML file and passes the XML events to a receiver.") +@Description("Reads an XML file and passes the XML events to a receiver. Set `totalEntitySizeLimit=\"0\"` to allow unlimited XML entities.") @In(Reader.class) @Out(XmlReceiver.class) @FluxCommand("decode-xml") public final class XmlDecoder extends DefaultObjectPipe { private static final String SAX_PROPERTY_LEXICAL_HANDLER = "http://xml.org/sax/properties/lexical-handler"; - + private static final String TOTAL_ENTITY_SIZE_LIMIT = "http://www.oracle.com/xml/jaxp/properties/totalEntitySizeLimit"; private final XMLReader saxReader; /** - * Constructs an XmlDecoder by obtaining a new instance of an + * Creates an instance of {@link XmlDecoder} by obtaining a new instance of an * {@link org.xml.sax.XMLReader}. */ public XmlDecoder() { try { final SAXParserFactory parserFactory = SAXParserFactory.newInstance(); parserFactory.setNamespaceAware(true); - saxReader = parserFactory.newSAXParser().getXMLReader(); } catch (final ParserConfigurationException | SAXException e) { @@ -67,15 +66,29 @@ public XmlDecoder() { } } + /** + * Sets the total entity size limit for the XML parser. + * See java-api-xml-processing-jaxp-security-guide.html + * + * Defaults to "50,000,000". Set to "0" to allow unlimited entities. + * + * @param totalEntitySizeLimit the size of the allowed entities. Set to "0" if entities should be unlimited. + */ + public void setTotalEntitySizeLimit(final String totalEntitySizeLimit) { + try { + saxReader.setProperty(TOTAL_ENTITY_SIZE_LIMIT, totalEntitySizeLimit); + } + catch (final SAXException e) { + throw new MetafactureException(e); + } + } + @Override public void process(final Reader reader) { try { saxReader.parse(new InputSource(reader)); } - catch (final IOException e) { - throw new MetafactureException(e); - } - catch (final SAXException e) { + catch (final IOException | SAXException e) { throw new MetafactureException(e); } } @@ -89,10 +102,7 @@ protected void onSetReceiver() { try { saxReader.setProperty(SAX_PROPERTY_LEXICAL_HANDLER, getReceiver()); } - catch (final SAXNotRecognizedException e) { - throw new MetafactureException(e); - } - catch (final SAXNotSupportedException e) { + catch (final SAXNotRecognizedException | SAXNotSupportedException e) { throw new MetafactureException(e); } } diff --git a/metafacture-xml/src/main/java/org/metafacture/xml/XmlFilenameWriter.java b/metafacture-xml/src/main/java/org/metafacture/xml/XmlFilenameWriter.java index 77ed89d2e..f2bf6ff06 100644 --- a/metafacture-xml/src/main/java/org/metafacture/xml/XmlFilenameWriter.java +++ b/metafacture-xml/src/main/java/org/metafacture/xml/XmlFilenameWriter.java @@ -51,11 +51,11 @@ * @author Pascal Christoph * @author Christoph Böhme */ -@Description("Writes the xml into the filesystem. The filename is constructed from the xpath given as 'property'.\n" + // checkstyle-disable-line ClassDataAbstractionCoupling|ClassFanOutComplexity - " Variables are\n" + "- 'target' (determining the output directory)\n" + - "- 'property' (the element in the XML entity. Constitutes the main part of the file's name.)\n" + - "- 'startIndex' ( a subfolder will be extracted out of the filename. This marks the index' beginning )\n" + - "- 'stopIndex' ( a subfolder will be extracted out of the filename. This marks the index' end )\n") +@Description("Writes the XML into the filesystem. The filename is constructed from the XPATH given as 'property'." + // checkstyle-disable-line ClassDataAbstractionCoupling|ClassFanOutComplexity + " Variables are:" + "`target` (determining the output directory)" + + ", `property` (the element in the XML entity. Constitutes the main part of the file's name.)" + + ", `startIndex` ( a subfolder will be extracted out of the filename. This marks the index' beginning )" + + ", `stopIndex` ( a subfolder will be extracted out of the filename. This marks the index' end )") @In(StreamReceiver.class) @Out(Void.class) @FluxCommand("write-xml-files") diff --git a/metafacture-xml/src/test/java/org/metafacture/xml/XmlDecoderTest.java b/metafacture-xml/src/test/java/org/metafacture/xml/XmlDecoderTest.java new file mode 100644 index 000000000..7414aece7 --- /dev/null +++ b/metafacture-xml/src/test/java/org/metafacture/xml/XmlDecoderTest.java @@ -0,0 +1,69 @@ +/* + * Copyright 2024 Pascal Christoph (hbz) + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.metafacture.xml; + +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.MetafactureException; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +/** + * Tests for class {@link XmlDecoder}. + * + * @author Pascal Christoph (dr0i) + */ +public final class XmlDecoderTest { + + private final String TEST_XML_WITH_TWO_ENTITIES = ">>"; + private XmlDecoder xmlDecoder; + private final Reader reader = new StringReader(TEST_XML_WITH_TWO_ENTITIES); + + @Before + public void initSystemUnderTest() { + xmlDecoder = new XmlDecoder(); + } + + @Test + public void issue554_default() { + process(xmlDecoder); + } + + @Test(expected = MetafactureException.class) + public void issue554_shouldFail() { + xmlDecoder.setTotalEntitySizeLimit("1"); + process(xmlDecoder); + } + + @Test + public void issue554_unlimitedEntities() { + xmlDecoder.setTotalEntitySizeLimit("0"); + process(xmlDecoder); + } + + private void process(XmlDecoder xmlDecoder) { + try { + xmlDecoder.process(reader); + reader.close(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } +}