Skip to content

Commit

Permalink
[GOBBLIN-2188] Define Initializer.AfterInitializeMemento for GoT to…
Browse files Browse the repository at this point in the history
… tunnel state from `GenerateWorkUnits` to `CommitActivity` (#4091)
  • Loading branch information
phet authored Jan 7, 2025
1 parent 87c8ab4 commit a0cef28
Show file tree
Hide file tree
Showing 12 changed files with 479 additions and 48 deletions.
2 changes: 2 additions & 0 deletions gobblin-api/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ apply plugin: 'java'
dependencies {
compile externalDependency.guava
compile externalDependency.gson
compile externalDependency.jacksonCore
compile externalDependency.jacksonMapper
compile externalDependency.jasypt
compile externalDependency.jodaTime
compile externalDependency.commonsLang3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ public class ConfigurationKeys {
public static final String TASK_DATA_ROOT_DIR_KEY = "task.data.root.dir";
public static final String SOURCE_CLASS_KEY = "source.class";
public static final String CONVERTER_CLASSES_KEY = "converter.classes";
public static final String CONVERTER_INITIALIZERS_SERIALIZED_MEMENTOS_KEY = "converter.initializers.serialized.mementos";
public static final String RECORD_STREAM_PROCESSOR_CLASSES_KEY = "recordStreamProcessor.classes";
public static final String FORK_OPERATOR_CLASS_KEY = "fork.operator.class";
public static final String DEFAULT_FORK_OPERATOR_CLASS = "org.apache.gobblin.fork.IdentityForkOperator";
Expand Down Expand Up @@ -434,6 +435,7 @@ public class ConfigurationKeys {
public static final String WRITER_TRUNCATE_STAGING_TABLE = WRITER_PREFIX + ".truncate.staging.table";
public static final String WRITER_OUTPUT_DIR = WRITER_PREFIX + ".output.dir";
public static final String WRITER_BUILDER_CLASS = WRITER_PREFIX + ".builder.class";
public static final String WRITER_INITIALIZER_SERIALIZED_MEMENTO_KEY = "writer.initializer.serialized.memento";
public static final String DEFAULT_WRITER_BUILDER_CLASS = "org.apache.gobblin.writer.AvroDataWriterBuilder";
public static final String WRITER_FILE_NAME = WRITER_PREFIX + ".file.name";
public static final String WRITER_FILE_PATH = WRITER_PREFIX + ".file.path";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,121 @@
package org.apache.gobblin.initializer;

import java.io.Closeable;
import java.util.Optional;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.annotation.JsonTypeInfo;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;


public interface Initializer extends Closeable {

/**
* Initialize for the writer.
* Marker interface to convey an opaque snapshot of the internal state of any concrete {@link Initializer}, thus affording state serialization for
* eventual "revival" as a new `Initializer` holding equivalent internal state. {@link #commemorate()} (i.e. create) the memento after
* {@link #initialize()} and subsequently {@link #recall(AfterInitializeMemento)} the state it preserved before performing {@link #close()}.
*
* When synchronous and the same instance throughout, the "Initializer Lifecycle" is:
* [concrete `My_T implements Initializer`, instance A] -
* `.initialize()`; ==PROCESSING RUNS==; `.close()`;
*
* When trading `AfterInitializeMemento` between instances (even memory-space boundaries) it becomes:
* [concrete `My_T implements Initializer`, instance A] -
* `.initialize()`; `.commemorate()`; ==PERSIST/TRANSMIT MEMENTO==
* ==PROCESSING RUNS==;
* [concrete `My_T implements Initializer`, instance B] -
* ==RECEIVE MEMENTO==; `.recall()`; `.close()`
*
* @param state
* @param workUnits WorkUnits created by Source
* Both for backwards compatibility and because not every concrete `Initializer` has internal state worth capturing, not every `Initializer`
* impl will implement an `AfterInitializeMemento`. Those that do will supply a unique impl capturing self-aware impl details of their
* `Initializer`.
*
* An `AfterInitializeMemento` impl needs simply be (de)serializable by {@link ObjectMapper}.
*
* An `Initializer` impl with an `AfterInitializeMemento` impl MUST NOT (re-)process any {@link org.apache.gobblin.source.workunit.WorkUnit}s
* during its {@link #close()} method: `WorkUnit` processing MUST occur entirely within {@link #initialize()}.
*/
@JsonTypeInfo(use = JsonTypeInfo.Id.CLASS, include = JsonTypeInfo.As.PROPERTY, property = "@class") // to handle variety of concrete impls
public interface AfterInitializeMemento {
static Logger logger = LoggerFactory.getLogger(AfterInitializeMemento.class);

/**
* Convey attempt to work with a concrete {@link AfterInitializeMemento} of type other than the single expected companion type known to `forInitializer`.
* @see #castAsOrThrow(Class, Initializer)
*/
static class MismatchedMementoException extends RuntimeException {
public MismatchedMementoException(AfterInitializeMemento memento, Class<?> asClass, Initializer forInitializer) {
super(String.format("Memento '%s' for Initializer '%s' of class '%s' - NOT '%s'", memento, forInitializer.getClass().getName(),
memento.getClass().getName(), asClass.getName()));
}
}

/** stringify as JSON */
static String serialize(AfterInitializeMemento memento) {
ObjectMapper objectMapper = new ObjectMapper();
try {
String result = objectMapper.writeValueAsString(memento);
logger.info("Serializing AfterInitializeMemento {} as '{}'", memento, result);
return result;
} catch (JsonProcessingException e) {
logger.error("Failed to serialize AfterInitializeMemento '" + memento + "'", e);
throw new RuntimeException(e);
}
}

/** destringify JSON */
static AfterInitializeMemento deserialize(String serialized) {
ObjectMapper objectMapper = new ObjectMapper();
try {
AfterInitializeMemento result = objectMapper.readValue(serialized, AfterInitializeMemento.class);
logger.info("Deserializing AfterInitializeMemento '{}' as {}", serialized, result);
return result;
} catch (JsonProcessingException e) {
logger.error("Failed to deserialize AfterInitializeMemento '" + serialized + "'", e);
throw new RuntimeException(e);
}
}

/** cast `this` (concrete `AfterInitializeMemento`) to `castClass`, else {@link MismatchedMementoException} */
default <T extends AfterInitializeMemento> T castAsOrThrow(Class<T> castClass, Initializer forInitializer)
throws MismatchedMementoException {
if (castClass.isAssignableFrom(this.getClass())) {
return (T) this;
} else {
throw new AfterInitializeMemento.MismatchedMementoException(this, castClass, forInitializer);
}
}
}

/**
* Initialize the writer/converter (e.g. using the state and/or {@link org.apache.gobblin.source.workunit.WorkUnit}s provided when constructing the instance)
*/
public void initialize();

/**
* Removed checked exception.
* {@inheritDoc}
* @see java.io.Closeable#close()
*
* NOTE: An `Initializer` impl with an `AfterInitializeMemento` impl MUST NOT (re-)process any {@link org.apache.gobblin.source.workunit.WorkUnit}s
* during its {@link #close()} method: `WorkUnit` processing MUST occur entirely within {@link #initialize()}.
*/
@Override
public void close();

/** @return the `Initializer`-specific companion memento, to convey internal state after {@link #initialize()}, and as needed to {@link #close()} */
default Optional<AfterInitializeMemento> commemorate() {
return Optional.empty();
}

/**
* "reinitialize" a fresh instance, per `Initializer`-specific companion `memento`, with (equivalent) post {@link #initialize()} internal state needed
* to {@link #close()}
*/
default void recall(AfterInitializeMemento memento) {
// noop
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,39 @@
package org.apache.gobblin.converter.initializer;

import java.util.List;
import java.util.Optional;

import lombok.ToString;

import org.apache.gobblin.initializer.Initializer;
import org.apache.gobblin.initializer.MultiInitializer;


@ToString
public class MultiConverterInitializer implements ConverterInitializer {
private final Initializer intializer;
private final Initializer initializer;

public MultiConverterInitializer(List<ConverterInitializer> converterInitializers) {
this.intializer = new MultiInitializer(converterInitializers);
this.initializer = new MultiInitializer(converterInitializers);
}

@Override
public void initialize() {
this.intializer.initialize();
this.initializer.initialize();
}

@Override
public void close() {
this.intializer.close();
this.initializer.close();
}

@Override
public Optional<AfterInitializeMemento> commemorate() {
return this.initializer.commemorate();
}

@Override
public void recall(AfterInitializeMemento memento) {
this.initializer.recall(memento);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,46 @@

import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import lombok.AccessLevel;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.ToString;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Streams;
import com.google.common.io.Closer;


/**
* Wraps multiple writer initializer behind its interface. This is useful when there're more than one branch.
* Wraps multiple writer initializers, which is useful when more than one branch.
*/
@ToString
public class MultiInitializer implements Initializer {

/** Commemorate each (`Optional`) {@link org.apache.gobblin.initializer.Initializer.AfterInitializeMemento} of every wrapped {@link Initializer} */
@Data
@Setter(AccessLevel.NONE) // NOTE: non-`final` members solely to enable deserialization
@NoArgsConstructor // IMPORTANT: for jackson (de)serialization
@RequiredArgsConstructor
private static class Memento implements AfterInitializeMemento {
// WARNING: not possible to use `List<Optional<AfterInitializeMemento>>`, as first attempted, due to:
// com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException: Unrecognized field "present" (class java.util.Optional), not marked as
// ignorable (0 known properties: ])
// at [Source:(String)"{\"@class\":\"org.apache.gobblin.initializer.MultiInitializer$Memento\",\"orderedInitializersMementos\":[{\"present\":false}]}"]
// (through reference chain: org.apache.gobblin.initializer.MultiInitializer$Memento[\"orderedInitializersMementos\"]->java.util.ArrayList[0]
// ->java.util.Optional[\"present\"])",
// the following does NOT fix, probably due to `Optional`'s nesting with `List`:
// @JsonIgnoreProperties(ignoreUnknown = true)
@NonNull private List<AfterInitializeMemento> orderedInitializersMementos;
}


private final List<Initializer> initializers;
private final Closer closer;

Expand All @@ -57,4 +85,21 @@ public void close() {
throw new RuntimeException(e);
}
}
}

@Override
public Optional<AfterInitializeMemento> commemorate() {
return Optional.of(new MultiInitializer.Memento(this.initializers.stream()
.map(Initializer::commemorate)
.map(opt -> opt.orElse(null))
.collect(Collectors.toList())));
}

@Override
public void recall(AfterInitializeMemento memento) {
Memento recollection = memento.castAsOrThrow(MultiInitializer.Memento.class, this);
Streams.zip(this.initializers.stream(), recollection.orderedInitializersMementos.stream(), (initializer, nullableInitializerMemento) -> {
Optional.ofNullable(nullableInitializerMemento).ifPresent(initializer::recall);
return null;
}).count(); // force evaluation, since `Streams.zip` used purely for side effects
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,41 @@

package org.apache.gobblin.writer.initializer;

import org.apache.gobblin.initializer.Initializer;
import org.apache.gobblin.initializer.MultiInitializer;

import java.util.Optional;
import java.util.List;

import lombok.ToString;

import org.apache.gobblin.initializer.Initializer;
import org.apache.gobblin.initializer.MultiInitializer;


@ToString
public class MultiWriterInitializer implements WriterInitializer {

private final Initializer intializer;
private final Initializer initializer;

public MultiWriterInitializer(List<WriterInitializer> writerInitializers) {
this.intializer = new MultiInitializer(writerInitializers);
this.initializer = new MultiInitializer(writerInitializers);
}

@Override
public void initialize() {
this.intializer.initialize();
this.initializer.initialize();
}

@Override
public void close() {
this.intializer.close();
this.initializer.close();
}

@Override
public Optional<AfterInitializeMemento> commemorate() {
return this.initializer.commemorate();
}

@Override
public void recall(AfterInitializeMemento memento) {
this.initializer.recall(memento);
}
}
Loading

0 comments on commit a0cef28

Please sign in to comment.