Skip to content

Commit

Permalink
Explicitly cleanup SqlTask on worker when no longer needed
Browse files Browse the repository at this point in the history
Currently SqlTask objects are removed from SqlTaskManager.tasks map
(cache) after timeout (15 minutes by default). Even though the object is
not huge, we observed increased memory pressure up to OOM on busy
clusters.

With this PR, entries are dropped from SqlTaskManager as soon as they are
no longer needed, i.e. once the coordinator will no longer query the worker
for the task's information
  • Loading branch information
losipiuk committed Jan 16, 2025
1 parent af4e200 commit 275bc50
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
import java.util.function.Predicate;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
Expand Down Expand Up @@ -656,6 +657,17 @@ public TaskInfo failTask(TaskId taskId, Throwable failure)
return tasks.getUnchecked(taskId).failed(failure);
}

/**
 * Drops the entry for {@code taskId} from {@code tasks}.
 * <p>
 * Does nothing when no entry for the task is currently present.
 *
 * @param taskId id of the task to drop; must not be null
 * @throws NullPointerException if {@code taskId} is null
 * @throws IllegalStateException if the task is present but its state is not {@code TaskState.FINISHED}
 */
public void cleanupTask(TaskId taskId)
{
    requireNonNull(taskId, "taskId is null");
    SqlTask sqlTask = tasks.getIfPresent(taskId);
    if (sqlTask == null) {
        return;
    }
    // Read the state exactly once so that the failure message reports the same
    // state the check was made against (a second read could observe a newer state).
    TaskState taskState = sqlTask.getTaskState();
    checkState(taskState == TaskState.FINISHED, "cleanup called for task %s which is in state %s", taskId, taskState);
    tasks.unsafeInvalidate(taskId);
}

@VisibleForTesting
void removeOldTasks()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,15 @@ public TaskInfo failTask(
return taskManager.failTask(taskId, failTaskRequest.getFailureInfo().toException());
}

/**
 * Handles {@code POST {taskId}/cleanup} by forwarding the request to
 * {@code taskManager.cleanupTask}.
 *
 * @param taskId id of the task taken from the request path; must not be null
 * @throws NullPointerException if {@code taskId} is null
 */
@POST
@Path("{taskId}/cleanup")
public void cleanupTask(@PathParam("taskId") TaskId taskId)
{
    // Validate first, then delegate; requireNonNull returns its argument unchanged
    taskManager.cleanupTask(requireNonNull(taskId, "taskId is null"));
}

@GET
@Path("{taskId}/results/{bufferId}/{token}")
@Produces(TRINO_PAGES)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class ContinuousTaskStatusFetcher
private final Supplier<SpanBuilder> spanBuilderFactory;
private final RequestErrorTracker errorTracker;
private final RemoteTaskStats stats;
private final RemoteTaskCleaner remoteTaskCleaner;

@GuardedBy("this")
private boolean running;
Expand All @@ -84,7 +85,8 @@ public ContinuousTaskStatusFetcher(
Supplier<SpanBuilder> spanBuilderFactory,
Duration maxErrorDuration,
ScheduledExecutorService errorScheduledExecutor,
RemoteTaskStats stats)
RemoteTaskStats stats,
RemoteTaskCleaner remoteTaskCleaner)
{
requireNonNull(initialTaskStatus, "initialTaskStatus is null");

Expand All @@ -102,6 +104,7 @@ public ContinuousTaskStatusFetcher(

this.errorTracker = new RequestErrorTracker(taskId, initialTaskStatus.getSelf(), maxErrorDuration, errorScheduledExecutor, "getting task status");
this.stats = requireNonNull(stats, "stats is null");
this.remoteTaskCleaner = requireNonNull(remoteTaskCleaner, "remoteTaskCleaner is null");
}

public synchronized void start()
Expand All @@ -121,6 +124,7 @@ public synchronized void stop()
future.cancel(true);
future = null;
}
remoteTaskCleaner.markTaskStatusFetcherStopped(taskStatus.get().getState());
}

private synchronized void scheduleNextRequest()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class DynamicFiltersFetcher
private final RequestErrorTracker errorTracker;
private final RemoteTaskStats stats;
private final DynamicFilterService dynamicFilterService;
private final RemoteTaskCleaner remoteTaskCleaner;

@GuardedBy("this")
private long dynamicFiltersVersion = INITIAL_DYNAMIC_FILTERS_VERSION;
Expand All @@ -83,7 +84,8 @@ public DynamicFiltersFetcher(
Duration maxErrorDuration,
ScheduledExecutorService errorScheduledExecutor,
RemoteTaskStats stats,
DynamicFilterService dynamicFilterService)
DynamicFilterService dynamicFilterService,
RemoteTaskCleaner remoteTaskCleaner)
{
this.taskId = requireNonNull(taskId, "taskId is null");
this.taskUri = requireNonNull(taskUri, "taskUri is null");
Expand All @@ -99,6 +101,8 @@ public DynamicFiltersFetcher(
this.errorTracker = new RequestErrorTracker(taskId, taskUri, maxErrorDuration, errorScheduledExecutor, "getting dynamic filter domains");
this.stats = requireNonNull(stats, "stats is null");
this.dynamicFilterService = requireNonNull(dynamicFilterService, "dynamicFilterService is null");

this.remoteTaskCleaner = requireNonNull(remoteTaskCleaner, "remoteTaskCleaner is null");
}

public synchronized void start()
Expand All @@ -124,6 +128,7 @@ public synchronized void updateDynamicFiltersVersionAndFetchIfNecessary(long new
// Marks this fetcher as no longer running and notifies the remote task cleaner.
// Runs while holding this object's monitor (method is synchronized).
private synchronized void stop()
{
running = false;
// Tell the cleaner that this fetcher has stopped.
// NOTE(review): "Dynamid" in the callee name looks like a typo of "Dynamic"; the method is
// declared on RemoteTaskCleaner (not visible here), so the name is left unchanged — confirm and rename there.
remoteTaskCleaner.markDynamidFilterFetcherStopped();
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,13 @@ public HttpRemoteTask(

TaskInfo initialTask = createInitialTask(taskId, location, nodeId, this.speculative.get(), pipelinedBufferStates, new TaskStats(DateTime.now(), null));

RemoteTaskCleaner remoteTaskCleaner = new RemoteTaskCleaner(
taskId,
location,
httpClient,
errorScheduledExecutor,
() -> createSpanBuilder("remote-task-cleaner", span));

this.dynamicFiltersFetcher = new DynamicFiltersFetcher(
this::fatalUnacknowledgedFailure,
taskId,
Expand All @@ -334,7 +341,8 @@ public HttpRemoteTask(
maxErrorDuration,
errorScheduledExecutor,
stats,
dynamicFilterService);
dynamicFilterService,
remoteTaskCleaner);

this.taskStatusFetcher = new ContinuousTaskStatusFetcher(
this::fatalUnacknowledgedFailure,
Expand All @@ -347,12 +355,14 @@ public HttpRemoteTask(
() -> createSpanBuilder("task-status", span),
maxErrorDuration,
errorScheduledExecutor,
stats);
stats,
remoteTaskCleaner);

RetryPolicy retryPolicy = getRetryPolicy(session);
this.taskInfoFetcher = new TaskInfoFetcher(
this::fatalUnacknowledgedFailure,
taskStatusFetcher,
remoteTaskCleaner,
initialTask,
httpClient,
() -> createSpanBuilder("task-info", span),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public class TaskInfoFetcher
private final TaskId taskId;
private final Consumer<Throwable> onFail;
private final ContinuousTaskStatusFetcher taskStatusFetcher;
private final RemoteTaskCleaner remoteTaskCleaner;
private final StateMachine<TaskInfo> taskInfo;
private final StateMachine<Optional<TaskInfo>> finalTaskInfo;
private final JsonCodec<TaskInfo> taskInfoCodec;
Expand Down Expand Up @@ -100,6 +101,7 @@ public class TaskInfoFetcher
public TaskInfoFetcher(
Consumer<Throwable> onFail,
ContinuousTaskStatusFetcher taskStatusFetcher,
RemoteTaskCleaner remoteTaskCleaner,
TaskInfo initialTask,
HttpClient httpClient,
Supplier<SpanBuilder> spanBuilderFactory,
Expand All @@ -120,6 +122,7 @@ public TaskInfoFetcher(
this.taskId = initialTask.taskStatus().getTaskId();
this.onFail = requireNonNull(onFail, "onFail is null");
this.taskStatusFetcher = requireNonNull(taskStatusFetcher, "taskStatusFetcher is null");
this.remoteTaskCleaner = requireNonNull(remoteTaskCleaner, "remoteTaskCleaner is null");
this.taskInfo = new StateMachine<>("task " + taskId, executor, initialTask);
this.finalTaskInfo = new StateMachine<>("task-" + taskId, executor, Optional.empty());
this.taskInfoCodec = requireNonNull(taskInfoCodec, "taskInfoCodec is null");
Expand Down Expand Up @@ -163,6 +166,7 @@ private synchronized void stop()
if (scheduledFuture != null) {
scheduledFuture.cancel(true);
}
remoteTaskCleaner.markTaskInfoFetcherStopped();
}

/**
Expand Down

0 comments on commit 275bc50

Please sign in to comment.