[flink] Remove state from append only unaware bucket writer #4219

Merged · 4 commits · Sep 24, 2024
Changes from 3 commits
@@ -25,6 +25,8 @@
 import org.apache.paimon.manifest.ManifestCacheFilter;
 import org.apache.paimon.operation.AppendOnlyFileStoreScan;
 import org.apache.paimon.operation.AppendOnlyFileStoreWrite;
+import org.apache.paimon.operation.AppendOnlyFixedBucketFileStoreWrite;
+import org.apache.paimon.operation.AppendOnlyUnawareBucketFileStoreWrite;
 import org.apache.paimon.operation.RawFileSplitRead;
 import org.apache.paimon.operation.ScanBucketFilter;
 import org.apache.paimon.predicate.Predicate;
@@ -94,21 +96,36 @@ public AppendOnlyFileStoreWrite newWrite(String commitUser) {
     @Override
     public AppendOnlyFileStoreWrite newWrite(
             String commitUser, ManifestCacheFilter manifestFilter) {
-        return new AppendOnlyFileStoreWrite(
-                fileIO,
-                newRead(),
-                schema.id(),
-                commitUser,
-                rowType,
-                pathFactory(),
-                snapshotManager(),
-                newScan(true).withManifestCacheFilter(manifestFilter),
-                options,
-                bucketMode(),
+        DeletionVectorsMaintainer.Factory dvMaintainerFactory =
                 options.deletionVectorsEnabled()
                         ? DeletionVectorsMaintainer.factory(newIndexFileHandler())
-                        : null,
-                tableName);
+                        : null;
+        if (bucketMode() == BucketMode.BUCKET_UNAWARE) {
+            return new AppendOnlyUnawareBucketFileStoreWrite(
+                    fileIO,
+                    newRead(),
+                    schema.id(),
+                    rowType,
+                    pathFactory(),
+                    snapshotManager(),
+                    newScan(true).withManifestCacheFilter(manifestFilter),
+                    options,
+                    dvMaintainerFactory,
+                    tableName);
+        } else {
+            return new AppendOnlyFixedBucketFileStoreWrite(
+                    fileIO,
+                    newRead(),
+                    schema.id(),
+                    commitUser,
+                    rowType,
+                    pathFactory(),
+                    snapshotManager(),
+                    newScan(true).withManifestCacheFilter(manifestFilter),
+                    options,
+                    dvMaintainerFactory,
+                    tableName);
+        }
     }

     private AppendOnlyFileStoreScan newScan(boolean forWrite) {
@@ -129,7 +146,7 @@ public void pushdown(Predicate predicate) {
                                     splitAnd(predicate),
                                     rowType.getFieldNames(),
                                     bucketKeyType.getFieldNames());
-                    if (bucketFilters.size() > 0) {
+                    if (!bucketFilters.isEmpty()) {
                         setBucketKeyFilter(and(bucketFilters));
                     }
                 }

AbstractFileStoreWrite.java
@@ -52,6 +52,7 @@
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.function.Function;

 import static org.apache.paimon.io.DataFileMeta.getMaxSequenceNumber;

@@ -64,7 +65,6 @@ public abstract class AbstractFileStoreWrite<T> implements FileStoreWrite<T> {

     private static final Logger LOG = LoggerFactory.getLogger(AbstractFileStoreWrite.class);

-    private final String commitUser;
     protected final SnapshotManager snapshotManager;
     private final FileStoreScan scan;
     private final int writerNumberMax;
@@ -85,14 +85,12 @@ public abstract class AbstractFileStoreWrite<T> implements FileStoreWrite<T> {
     private boolean isInsertOnly;

     protected AbstractFileStoreWrite(
-            String commitUser,
             SnapshotManager snapshotManager,
             FileStoreScan scan,
             @Nullable IndexMaintainer.Factory<T> indexFactory,
             @Nullable DeletionVectorsMaintainer.Factory dvMaintainerFactory,
             String tableName,
             int writerNumberMax) {
-        this.commitUser = commitUser;
         this.snapshotManager = snapshotManager;
         this.scan = scan;
         this.indexFactory = indexFactory;
@@ -169,28 +167,18 @@ public void notifyNewFiles(
     @Override
     public List<CommitMessage> prepareCommit(boolean waitCompaction, long commitIdentifier)
             throws Exception {
-        long latestCommittedIdentifier;
+        Function<WriterContainer<T>, Boolean> writerCleanChecker;
         if (writers.values().stream()
                         .map(Map::values)
                         .flatMap(Collection::stream)
                         .mapToLong(w -> w.lastModifiedCommitIdentifier)
                         .max()
                         .orElse(Long.MIN_VALUE)
                 == Long.MIN_VALUE) {
-            // Optimization for the first commit.
-            //
-            // If this is the first commit, no writer has previous modified commit, so the value of
-            // `latestCommittedIdentifier` does not matter.
-            //
-            // Without this optimization, we may need to scan through all snapshots only to find
-            // that there is no previous snapshot by this user, which is very inefficient.
-            latestCommittedIdentifier = Long.MIN_VALUE;
+            // If this is the first commit, no writer should be cleaned.
+            writerCleanChecker = writerContainer -> false;
         } else {
-            latestCommittedIdentifier =
-                    snapshotManager
-                            .latestSnapshotOfUser(commitUser)
-                            .map(Snapshot::commitIdentifier)
-                            .orElse(Long.MIN_VALUE);
+            writerCleanChecker = createWriterCleanChecker();
         }

         List<CommitMessage> result = new ArrayList<>();
@@ -226,14 +214,7 @@ public List<CommitMessage> prepareCommit(boolean waitCompaction, long commitIdentifier)
                 result.add(committable);

                 if (committable.isEmpty()) {
-                    // Condition 1: There is no more record waiting to be committed. Note that the
-                    // condition is < (instead of <=), because each commit identifier may have
-                    // multiple snapshots. We must make sure all snapshots of this identifier are
-                    // committed.
-                    // Condition 2: No compaction is in progress. That is, no more changelog will be
-                    // produced.
-                    if (writerContainer.lastModifiedCommitIdentifier < latestCommittedIdentifier
-                            && !writerContainer.writer.isCompacting()) {
+                    if (writerCleanChecker.apply(writerContainer)) {
                         // Clear writer if no update, and if its latest modification has committed.
                         //
                         // We need a mechanism to clear writers, otherwise there will be more and
@@ -242,12 +223,10 @@ public List<CommitMessage> prepareCommit(boolean waitCompaction, long commitIdentifier)
                             LOG.debug(
                                     "Closing writer for partition {}, bucket {}. "
                                             + "Writer's last modified identifier is {}, "
-                                            + "while latest committed identifier is {}, "
-                                            + "current commit identifier is {}.",
+                                            + "while current commit identifier is {}.",
                                     partition,
                                     bucket,
                                     writerContainer.lastModifiedCommitIdentifier,
-                                    latestCommittedIdentifier,
                                     commitIdentifier);
                         }
                         writerContainer.writer.close();
@@ -266,6 +245,41 @@ public List<CommitMessage> prepareCommit(boolean waitCompaction, long commitIdentifier)
         return result;
     }

+    // This abstract function returns a whole function (instead of just a boolean value),
+    // because we do not want to introduce `commitUser` into this base class.
+    //
+    // For writers with no conflicts, `commitUser` might be some random value.
+    protected abstract Function<WriterContainer<T>, Boolean> createWriterCleanChecker();

Review thread on createWriterCleanChecker():

JingsongLi (Contributor), Sep 23, 2024:
Should we do this abstraction? Or just introducing a boolean checkWriterClean?

PR author (Contributor):
This abstract function returns a whole function (instead of just a boolean value), because we do not want to introduce commitUser into this base class.

For writers with no conflicts, commitUser might be some random value.


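For context on this thread, the two concrete writers presumably implement the hook roughly as below. This is only a sketch: the subclass files are not part of the excerpt above, and the InternalRow binding and the commitUser field of the fixed-bucket writer are assumptions.

    // Sketch: assumed overrides in the two subclasses introduced by this PR.

    // In AppendOnlyFixedBucketFileStoreWrite: concurrent writers can conflict, so the
    // snapshot-based check (keyed by commitUser) is kept.
    @Override
    protected Function<WriterContainer<InternalRow>, Boolean> createWriterCleanChecker() {
        return createConflictAwareWriterCleanChecker(commitUser, snapshotManager);
    }

    // In AppendOnlyUnawareBucketFileStoreWrite: appends never conflict, so no commitUser
    // or snapshot lookup is needed.
    @Override
    protected Function<WriterContainer<InternalRow>, Boolean> createWriterCleanChecker() {
        return createNoConflictAwareWriterCleanChecker();
    }
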
+    protected static <T>
+            Function<WriterContainer<T>, Boolean> createConflictAwareWriterCleanChecker(
+                    String commitUser, SnapshotManager snapshotManager) {
+        long latestCommittedIdentifier =
+                snapshotManager
+                        .latestSnapshotOfUser(commitUser)
+                        .map(Snapshot::commitIdentifier)
+                        .orElse(Long.MIN_VALUE);
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Latest committed identifier is {}", latestCommittedIdentifier);
+        }
+
+        // Condition 1: There is no more record waiting to be committed. Note that the
+        // condition is < (instead of <=), because each commit identifier may have
+        // multiple snapshots. We must make sure all snapshots of this identifier are
+        // committed.
+        //
+        // Condition 2: No compaction is in progress. That is, no more changelog will be
+        // produced.
+        return writerContainer ->
+                writerContainer.lastModifiedCommitIdentifier < latestCommittedIdentifier
+                        && !writerContainer.writer.isCompacting();
+    }
+
+    protected static <T>
+            Function<WriterContainer<T>, Boolean> createNoConflictAwareWriterCleanChecker() {
+        return writerContainer -> true;
+    }

Review thread on createNoConflictAwareWriterCleanChecker():

Contributor:
Just inline this method in AppendOnlyUnawareBucketFileStoreWrite.
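If that suggestion were followed, the override in AppendOnlyUnawareBucketFileStoreWrite might simply be (a sketch; the InternalRow binding is assumed):

    @Override
    protected Function<WriterContainer<InternalRow>, Boolean> createWriterCleanChecker() {
        // Unaware-bucket appends never conflict, so any idle writer can be cleaned.
        return writerContainer -> true;
    }
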

     @Override
     public void close() throws Exception {
         for (Map<Integer, WriterContainer<T>> bucketWriters : writers.values()) {