diff --git a/metamorph/src/main/java/org/metafacture/metamorph/Metamorph.java b/metamorph/src/main/java/org/metafacture/metamorph/Metamorph.java index 4abe8026a..e38b9d052 100644 --- a/metamorph/src/main/java/org/metafacture/metamorph/Metamorph.java +++ b/metamorph/src/main/java/org/metafacture/metamorph/Metamorph.java @@ -82,8 +82,7 @@ public final class Metamorph implements StreamPipe, NamedValuePi private static final InterceptorFactory NULL_INTERCEPTOR_FACTORY = new NullInterceptorFactory(); private static final Map NO_VARS = Collections.emptyMap(); - private final Registry dataRegistry = - new WildcardRegistry<>(); + private final Registry dataRegistry = new WildcardRegistry<>(); private final List elseSources = new ArrayList<>(); private final Map> maps = new HashMap<>(); @@ -100,7 +99,8 @@ public final class Metamorph implements StreamPipe, NamedValuePi private int recordCount; private final List recordEndListener = new ArrayList<>(); private boolean elseNested; - final private Pattern literalPatternOfEntityMarker = Pattern.compile(flattener.getEntityMarker(), Pattern.LITERAL); + private boolean elseNestedEntityStarted; + private String currentLiteralName; private static final Logger LOG = LoggerFactory.getLogger(Metamorph.class); protected Metamorph() { @@ -122,7 +122,6 @@ public Metamorph(final String morphDef, final InterceptorFactory interceptorFact public Metamorph(final String morphDef, final Map vars, final InterceptorFactory interceptorFactory) { - this(getInputSource(morphDef), vars, interceptorFactory); } @@ -140,7 +139,6 @@ public Metamorph(final Reader morphDef, final InterceptorFactory interceptorFact public Metamorph(final Reader morphDef, final Map vars, final InterceptorFactory interceptorFactory) { - this(new InputSource(morphDef), vars, interceptorFactory); } @@ -158,7 +156,6 @@ public Metamorph(final InputStream morphDef, final InterceptorFactory intercepto public Metamorph(final InputStream morphDef, final Map vars, final InterceptorFactory interceptorFactory) { - this(new InputSource(morphDef), vars, interceptorFactory); } @@ -205,7 +202,7 @@ private void init() { flattener.setReceiver(new DefaultStreamReceiver() { @Override public void literal(final String name, final String value) { - dispatch(name, value, getElseSources()); + dispatch(name, value, getElseSources(), false); } }); } @@ -224,14 +221,16 @@ public void setErrorHandler(final MorphErrorHandler errorHandler) { protected void registerNamedValueReceiver(final String source, final NamedValueReceiver data) { if (ELSE_NESTED_KEYWORD.equals(source)) { - this.elseNested = true; + elseNested = true; } + if (ELSE_KEYWORD.equals(source) || ELSE_FLATTENED_KEYWORD.equals(source) || elseNested) { - if (elseSources.isEmpty()) + if (elseSources.isEmpty()) { elseSources.add(data); - else - LOG.warn( - "Only one of '_else', '_elseFlattened' and '_elseNested' is allowed. Ignoring the superflous ones."); + } + else { + LOG.warn("Only one of '_else', '_elseFlattened' and '_elseNested' is allowed. Ignoring the superflous ones."); + } } else { dataRegistry.register(source, data); } @@ -253,12 +252,11 @@ public void startRecord(final String identifier) { final String identifierFinal = identifier; outputStreamReceiver.startRecord(identifierFinal); - dispatch(StandardEventNames.ID, identifierFinal, null); + dispatch(StandardEventNames.ID, identifierFinal, null, false); } @Override public void endRecord() { - for(final FlushListener listener: recordEndListener){ listener.flush(recordCount, currentEntityCount); } @@ -287,17 +285,16 @@ public void startEntity(final String name) { @Override public void endEntity() { - dispatch(flattener.getCurrentPath(), "", null); + dispatch(flattener.getCurrentPath(), "", getElseSources(), true); currentEntityCount = entityCountStack.pop().intValue(); flattener.endEntity(); - } @Override public void literal(final String name, final String value) { + currentLiteralName = name; flattener.literal(name, value); - } @Override @@ -318,41 +315,62 @@ public void closeStream() { outputStreamReceiver.closeStream(); } - protected void dispatch(final String path, final String value, final List fallbackReceiver) { - List matchingData = dataRegistry.get(path); - boolean fallback = false; - if (matchingData == null || matchingData.isEmpty()) { - fallback = true; - matchingData = fallbackReceiver; + private void dispatch(final String path, final String value, final List fallbackReceiver, final boolean endEntity) { + final List matchingData = getData(path); + + if (matchingData != null) { + send(path, value, matchingData); } - if (null != matchingData) { - send(path, value, matchingData, fallback); + else if (fallbackReceiver != null) { + if (endEntity) { + if (elseNestedEntityStarted) { + outputStreamReceiver.endEntity(); + elseNestedEntityStarted = false; + } + } + else { + final String entityName = elseNested ? flattener.getCurrentEntityName() : null; + + if (entityName != null) { + if (getData(entityName) == null) { + if (!elseNestedEntityStarted) { + outputStreamReceiver.startEntity(entityName); + elseNestedEntityStarted = true; + } + + send(escapeFeedbackChar(currentLiteralName), value, fallbackReceiver); + } + } + else { + send(escapeFeedbackChar(path), value, fallbackReceiver); + } + } } } - private void send(final String path, final String value, final List dataList, - final boolean fallback) { + private List getData(final String path) { + final List matchingData = dataRegistry.get(path); + return matchingData != null && !matchingData.isEmpty() ? matchingData : null; + } + + private void send(final String path, final String value, final List dataList) { for (final NamedValueReceiver data : dataList) { - String key = path; - if (fallback && elseNested) { - if (flattener.getCurrentEntityName() != null) { - outputStreamReceiver.startEntity(flattener.getCurrentEntityName()); - key = literalPatternOfEntityMarker.split(path)[1]; - } - } try { - data.receive(key, value, null, recordCount, currentEntityCount); + data.receive(path, value, null, recordCount, currentEntityCount); } catch (final RuntimeException e) { errorHandler.error(e); } - if (fallback && elseNested) { - if (flattener.getCurrentEntityName() != null) { - outputStreamReceiver.endEntity(); - } - } } } + private boolean startsWithFeedbackChar(final String name) { + return name.length() != 0 && name.charAt(0) == FEEDBACK_CHAR; + } + + private String escapeFeedbackChar(final String name) { + return name == null ? null : (startsWithFeedbackChar(name) ? ESCAPE_CHAR : "") + name; + } + /** * @param streamReceiver * the outputHandler to set @@ -378,8 +396,8 @@ public void receive(final String name, final String value, final NamedValueSourc "encountered literal with name='null'. This indicates a bug in a function or a collector."); } - if (name.length() != 0 && name.charAt(0) == FEEDBACK_CHAR) { - dispatch(name, value, null); + if (startsWithFeedbackChar(name)) { + dispatch(name, value, null, false); return; } diff --git a/metamorph/src/test/java/org/metafacture/metamorph/TestMetamorphBasics.java b/metamorph/src/test/java/org/metafacture/metamorph/TestMetamorphBasics.java index e2ee92dac..7f551ba18 100644 --- a/metamorph/src/test/java/org/metafacture/metamorph/TestMetamorphBasics.java +++ b/metamorph/src/test/java/org/metafacture/metamorph/TestMetamorphBasics.java @@ -87,45 +87,135 @@ public void shouldHandleUnmatchedLiteralsInElseSource() { @Test public void shouldHandleUnmatchedLiteralsAndEntitiesInElseSource() { - testElseData( - "" + - " " + - "" - ); + testElseData("_else"); } @Test public void shouldHandleUnmatchedLiteralsAndEntitiesInElseFlattenedSource() { - testElseData( - "" + - " " + - "" - ); + testElseData("_elseFlattened"); } - private void testElseData(final String morphDef) { - assertMorph(receiver, morphDef, + private void testElseData(final String elseKeyword) { + assertMorph(receiver, + "" + + " " + + " " + + " " + + " " + + " " + + "", i -> { i.startRecord("1"); + i.literal("@id", "123"); i.literal("Shikotan", "Aekap"); i.startEntity("Germany"); i.literal("Langeoog", "Moin"); + i.literal("Sylt", "Aloha"); + i.literal("Borkum", "Tach"); i.endEntity(); i.startEntity("Germany"); + i.literal("@foo", "bar"); i.literal("Baltrum", "Moin Moin"); i.endEntity(); i.endRecord(); }, o -> { o.get().startRecord("1"); + o.get().literal("@id", "123"); o.get().literal("Shikotan", "Aekap"); o.get().literal("Germany.Langeoog", "Moin"); + o.get().startEntity("Germany"); + o.get().literal("Hawaii", "Aloha"); + o.get().literal("Germany.Borkum", "Tach"); + o.get().endEntity(); + o.get().literal("Germany.@foo", "bar"); o.get().literal("Germany.Baltrum", "Moin Moin"); o.get().endRecord(); } ); } + @Test + public void issue338_shouldPreserveSameEntitiesInElseNestedSource() { + assertMorph(receiver, + "" + + " " + + "", + i -> { + i.startRecord("1"); + i.literal("lit1", "val1"); + i.startEntity("ent1"); + i.literal("lit2", "val2"); + i.literal("lit3", "val3"); + i.endEntity(); + i.literal("lit4", "val4"); + i.startEntity("ent2"); + i.literal("lit5", "val5"); + i.literal("lit6", "val6"); + i.literal("lit7", "val7"); + i.endEntity(); + i.startEntity("ent2"); // sic! + i.literal("lit8", "val8"); + i.literal("lit9", "val9"); + i.endEntity(); + i.endRecord(); + i.startRecord("2"); + i.startEntity("ent1"); + i.literal("lit1", "val1"); + i.literal("lit2", "val2"); + i.endEntity(); + i.startEntity("ent2"); + i.literal("lit3", "val3"); + i.literal("lit4", "val4"); + i.literal("lit5", "val5"); + i.literal("lit6", "val6"); + i.endEntity(); + i.startEntity("ent3"); + i.literal("lit7", "val7"); + i.literal("lit8", "val8"); + i.endEntity(); + i.literal("lit9", "val9"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("lit1", "val1"); + o.get().startEntity("ent1"); + o.get().literal("lit2", "val2"); + o.get().literal("lit3", "val3"); + o.get().endEntity(); + o.get().literal("lit4", "val4"); + o.get().startEntity("ent2"); + o.get().literal("lit5", "val5"); + o.get().literal("lit6", "val6"); + o.get().literal("lit7", "val7"); + o.get().endEntity(); + o.get().startEntity("ent2"); + o.get().literal("lit8", "val8"); + o.get().literal("lit9", "val9"); + o.get().endEntity(); + o.get().endRecord(); + o.get().startRecord("2"); + o.get().startEntity("ent1"); + o.get().literal("lit1", "val1"); + o.get().literal("lit2", "val2"); + o.get().endEntity(); + o.get().startEntity("ent2"); + o.get().literal("lit3", "val3"); + o.get().literal("lit4", "val4"); + o.get().literal("lit5", "val5"); + o.get().literal("lit6", "val6"); + o.get().endEntity(); + o.get().startEntity("ent3"); + o.get().literal("lit7", "val7"); + o.get().literal("lit8", "val8"); + o.get().endEntity(); + o.get().literal("lit9", "val9"); + o.get().endRecord(); + } + ); + } + @Test public void shouldHandleUnmatchedLiteralsAndEntitiesInElseNestedSource() { assertMorph(receiver, @@ -137,8 +227,10 @@ public void shouldHandleUnmatchedLiteralsAndEntitiesInElseNestedSource() { "", i -> { i.startRecord("1"); + i.literal("@id", "123"); i.literal("Shikotan", "Aekap"); i.startEntity("Germany"); + i.literal("@foo", "bar"); i.literal("Langeoog", "Moin"); i.literal("Baltrum", "Moin Moin"); i.endEntity(); @@ -149,21 +241,303 @@ public void shouldHandleUnmatchedLiteralsAndEntitiesInElseNestedSource() { }, o -> { o.get().startRecord("1"); + o.get().literal("@id", "123"); o.get().literal("Shikotan", "Aekap"); o.get().startEntity("Germany"); + o.get().literal("@foo", "bar"); o.get().literal("Langeoog", "Moin"); + o.get().literal("Baltrum", "Moin Moin"); + o.get().endEntity(); + o.get().startEntity("USA"); + o.get().literal("Hawaii", "Aloha"); o.get().endEntity(); + o.get().endRecord(); + } + ); + } + + @Test + public void shouldHandlePartiallyUnmatchedLiteralsAndEntitiesInElseNestedSource() { + assertMorph(receiver, + "" + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + "", + i -> { + i.startRecord("1"); + i.literal("Shikotan", "Aekap"); + i.startEntity("Germany"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA1"); + i.literal("Sylt", "Aloha"); + i.endEntity(); + i.startEntity("USA2"); + i.literal("Sylt", "Aloha"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA3"); + i.literal("Langeoog", "Moin"); + i.literal("Sylt", "Aloha"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA4"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.literal("Sylt", "Aloha"); + i.endEntity(); + i.endRecord(); + }, + (o, f) -> { + // Pass-through coordinates with morph whether to start/end an entity + final boolean coordinatesWithEntity = false; + + // Pass-through and morph entities are separated (one ends when the other starts) + final boolean separatesFromEntity = false; + + o.get().startRecord("1"); + o.get().literal("Shikotan", "Aekap"); o.get().startEntity("Germany"); + o.get().literal("Langeoog", "Moin"); o.get().literal("Baltrum", "Moin Moin"); o.get().endEntity(); - o.get().startEntity("USA"); + o.get().startEntity("USA1"); + o.get().literal("Hawaii", "Aloha"); + o.get().endEntity(); + o.get().startEntity("USA2"); o.get().literal("Hawaii", "Aloha"); + if (!coordinatesWithEntity) { + o.get().endEntity(); + o.get().startEntity("USA2"); + } + o.get().literal("Langeoog", "Moin"); + o.get().literal("Baltrum", "Moin Moin"); o.get().endEntity(); + o.get().startEntity("USA3"); + o.get().literal("Langeoog", "Moin"); + if (!coordinatesWithEntity) { + o.get().startEntity("USA3"); + } + else if (separatesFromEntity) { + o.get().endEntity(); + o.get().startEntity("USA3"); + } + o.get().literal("Hawaii", "Aloha"); + if (!coordinatesWithEntity) { + o.get().endEntity(); + } + else if (separatesFromEntity) { + o.get().endEntity(); + o.get().startEntity("USA3"); + } + o.get().literal("Baltrum", "Moin Moin"); + o.get().endEntity(); + o.get().startEntity("USA4"); + o.get().literal("Langeoog", "Moin"); + o.get().literal("Baltrum", "Moin Moin"); + if (!coordinatesWithEntity) { + o.get().startEntity("USA4"); + } + else if (separatesFromEntity) { + o.get().endEntity(); + o.get().startEntity("USA4"); + } + o.get().literal("Hawaii", "Aloha"); + if (!coordinatesWithEntity) { + f.apply(2).endEntity(); + } + else { + o.get().endEntity(); + } o.get().endRecord(); } ); } + @Test + public void shouldNotHandleDataByElseNestedSourceIfDataBelongingToEntityIsRuledByMorph() { + assertMorph(receiver, + "" + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + " " + + "", + i -> { + i.startRecord("1"); + i.literal("Shikotan", "Aekap"); + i.startEntity("Germany"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA1"); + i.literal("Sylt", "Aloha"); + i.endEntity(); + i.startEntity("USA2"); + i.literal("Sylt", "Aloha"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA2"); + i.literal("Langeoog", "Moin"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.startEntity("USA3"); + i.literal("Baltrum", "Moin Moin"); + i.endEntity(); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("Shikotan", "Aekap"); + o.get().startEntity("Germany"); + o.get().literal("Langeoog", "Moin"); + o.get().literal("Baltrum", "Moin Moin"); + o.get().endEntity(); + o.get().startEntity("USA1"); + o.get().literal("Hawaii", "Aloha"); + o.get().endEntity(); + o.get().startEntity("USA2"); + o.get().literal("Hawaii", "Aloha"); + o.get().literal("Langeoog", "Moin"); + o.get().endEntity(); + o.get().startEntity("USA2"); + o.get().literal("Langeoog", "Moin"); + o.get().endEntity(); + o.get().endRecord(); + } + ); + } + + // https://github.com/hagbeck/metafacture-sandbox/tree/master/else-nested-entities + @Test + public void shouldHandleUseCaseSandboxElseNestedEntities() { + assertMorph(receiver, + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "", + i -> { + i.startRecord("ID1687931"); + i.literal("leader", "00564nam a2200024 c 4500"); + i.literal("001", "1687931"); + i.literal("007", "|| ||||||||||||||||||||"); + i.literal("008", "||||||nuuuuuuuu|||||| | |||||||||||und||"); + i.startEntity("035 "); + i.literal("a", "(UNION_SEAL)HT019953476"); + i.endEntity(); + i.startEntity("24500"); + i.literal("a", "Design and analysis of an asymmetric mutation operator"); + i.literal("c", "Thomas Jansen and Dirk Sudholt"); + i.endEntity(); + i.startEntity("8564 "); + i.literal("z", "Freie Internetressource"); + i.endEntity(); + i.startEntity("85640"); + i.literal("u", "http://hdl.handle.net/2003/22116"); + i.literal("x", "Resolving-System"); + i.literal("3", "Volltext"); + i.endEntity(); + i.startEntity("85640"); + i.literal("u", "http://dx.doi.org/10.17877/DE290R-14123"); + i.literal("x", "Resolving-System"); + i.literal("3", "Volltext"); + i.endEntity(); + i.startEntity("9801 "); + i.literal("e", "HBZ"); + i.endEntity(); + i.startEntity("997 "); + i.literal("a", "20190130"); + i.endEntity(); + i.startEntity("9984 "); + i.literal("z", "Freie Internetressource"); + i.endEntity(); + }, + o -> { + o.get().startRecord("ID1687931"); + o.get().literal("leader", "00564nam a2200024 c 4500"); + o.get().literal("001", "1687931"); + o.get().literal("007", "|| ||||||||||||||||||||"); + o.get().literal("008", "||||||nuuuuuuuu|||||| | |||||||||||und||"); + o.get().startEntity("035 "); + o.get().literal("a", "(UNION_SEAL)HT019953476"); + o.get().endEntity(); + o.get().startEntity("24500"); + o.get().literal("a", "Design and analysis of an asymmetric mutation operator"); + o.get().literal("c", "Thomas Jansen and Dirk Sudholt"); + o.get().endEntity(); + o.get().startEntity("8564 "); + o.get().literal("z", "Freie Internetressource"); + o.get().endEntity(); + o.get().startEntity("85640"); + o.get().literal("u", "https://hdl.handle.net/2003/22116"); + o.get().literal("x", "Resolving-System"); + o.get().literal("3", "Volltext"); + o.get().endEntity(); + o.get().startEntity("85640"); + o.get().literal("u", "https://doi.org/10.17877/DE290R-14123"); + o.get().literal("x", "Resolving-System"); + o.get().literal("3", "Volltext"); + o.get().endEntity(); + o.get().startEntity("9801 "); + o.get().literal("e", "HBZ"); + o.get().endEntity(); + o.get().startEntity("997 "); + o.get().literal("a", "20190130"); + o.get().endEntity(); + o.get().startEntity("9984 "); + o.get().literal("z", "Freie Internetressource"); + o.get().endEntity(); + } + ); + } + + @Test public void shouldMatchCharacterWithQuestionMarkWildcard() { assertMorph(receiver,