First pass of job lock usage review #233
base: master
@@ -681,6 +681,7 @@ public Job createJob(String creator,
                      String minionType,
                      String command,
                      boolean defaults) throws Exception {
+    // Possibly move lock acquire to just before putJobInSpawnState? Does task assignment need to be guarded by the job lock?
     acquireJobLock();
     try {
         Job job = new Job(UUID.randomUUID().toString(), creator);
@@ -765,6 +766,7 @@ public void submitConfigUpdate(String jobId, String user, @Nullable String commi
     if (jobUUID == null) {
         return null;
     }
+    // Not needed
     acquireJobLock();
     try {
         return spawnState.jobs.get(jobUUID);
@@ -777,6 +779,7 @@ public void submitConfigUpdate(String jobId, String user, @Nullable String commi
     if (jobUUID == null) {
         return null;
     }
+    // Why? To prevent concurrent calls to getConfig? Synchronize that method!
     acquireJobLock();
     try {
         return jobConfigManager.getConfig(jobUUID);
@@ -813,6 +816,7 @@ public Response synchronizeJobState(String jobUUID, String user, String token, S

 public Collection<Job> listJobs() {
     List<Job> clones = new ArrayList<>(spawnState.jobs.size());
+    // Not needed
     acquireJobLock();
     try {
         for (Job job : spawnState.jobs.values()) {
@@ -1133,6 +1137,7 @@ public boolean swapTask(JobTask task, String replicaHostID, boolean kickOnComple
         return false;
     }
     Job job;
+    // Maybe
     acquireJobLock();
     try {
         job = getJob(task.getJobUUID());
@@ -1260,6 +1265,8 @@ public List<JobTaskMoveAssignment> executeReallocationAssignments(@Nullable List
  * @return True if the task is successfully removed
  */
 public boolean deleteTask(String jobUUID, String hostUuid, Integer node, boolean isReplica) {
+    // Maybe - but probably can be better handled by a lock on the individual job/task?
+    // Why do spawnMQ.sendControlMessage and queueJobTaskUpdateEvent need to be guarded?

Review comment: At the very least, I think the task.setReplicas call is worthy of consideration. If a task has replicas A, B, and C, and one thread tries to remove A at the same time as another thread tries to remove B, then it is possible that one removal would be swallowed, depending on the timing.
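To make the setReplicas concern concrete, here is a minimal self-contained sketch of the lost-update hazard and the lock-guarded version (ReplicaSketch and its fields are hypothetical, not the real JobTask API):

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

class ReplicaSketch {
    private final ReentrantLock jobLock = new ReentrantLock();
    private List<String> replicas = new ArrayList<>(List.of("A", "B", "C"));

    // Unsafe: two threads can both read [A, B, C], remove different
    // entries, and write back lists that each still contain the other
    // thread's "removed" replica -- one removal is silently swallowed.
    void removeReplicaUnsafe(String host) {
        List<String> copy = new ArrayList<>(replicas);
        copy.remove(host);
        replicas = copy; // task.setReplicas(copy) in the real code
    }

    // Safe: the read-modify-write happens entirely under the lock.
    void removeReplicaGuarded(String host) {
        jobLock.lock();
        try {
            List<String> copy = new ArrayList<>(replicas);
            copy.remove(host);
            replicas = copy;
        } finally {
            jobLock.unlock();
        }
    }
}
```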
     acquireJobLock();
     try {
         if ((jobUUID == null) || (node == null)) {
@@ -1313,6 +1320,7 @@ private static List<JobTaskReplica> removeReplicasForHost(String hostUuid, List<
 }

 public void queueJobTaskUpdateEvent(IJob job) {
+    // Not needed
     acquireJobLock();
     try {
         jobUpdateQueue.add(job.getId());
@@ -1339,6 +1347,7 @@ public void queueJobTaskUpdateEvent(IJob job) {
 public void updateJob(@Nullable IJob ijob, boolean reviseReplicas) throws Exception {
     checkNotNull(ijob, "ijob");
     Job job = new Job(ijob);
+    // Maybe
     acquireJobLock();
     try {
         checkArgument(getJob(job.getId()) != null, "job " + job.getId() + " does not exist");
@@ -1388,6 +1397,7 @@ public Set<String> getDataSources(String jobId) {
     if ((job == null) || (job.getParameters() == null)) {
         return dataSources;
     }
+    // Maybe

Review comment: Given that … changing the getter to return a copy rather than the actual list might be a more lightweight solution.
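A sketch of the defensive-copy idea, under the assumption that the getter currently returns the live list (the stand-in uses String where the real code has JobParameter):

```java
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

class JobParametersSketch {
    // Stand-in for the job's List<JobParameter>.
    private final List<String> parameters = new ArrayList<>();

    // Returning a snapshot means callers can iterate safely without the
    // global job lock, while writers go through addParameter below.
    public synchronized Collection<String> getParameters() {
        return Collections.unmodifiableList(new ArrayList<>(parameters));
    }

    public synchronized void addParameter(String param) {
        parameters.add(param);
    }
}
```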
     acquireJobLock();
     try {
         for (JobParameter param : job.getParameters()) {
@@ -1480,6 +1490,7 @@ public RebalanceOutcome rebalanceJob(String jobUUID, int tasksToMove, String use
  * @return A string description
  */
 public JSONObject fixTaskDir(String jobId, int node, boolean ignoreTaskState, boolean orphansOnly) {
+    // Maybe

Review comment: Given all the task state checks inside the loop, I think this one should be kept.

     acquireJobLock();
     try {
         Job job = getJob(jobId);
@@ -1552,6 +1563,7 @@ public boolean resolveJobTaskDirectoryMatches(JobTask task, boolean deleteOrphan

 public JSONArray checkTaskDirJSON(String jobId, int node) {
     JSONArray resultList = new JSONArray();
+    // Maybe
     acquireJobLock();
     try {
         Job job = getJob(jobId);
@@ -1656,6 +1668,7 @@ public boolean checkStatusForMove(String hostID) {
 }

 public boolean prepareTaskStatesForRebalance(Job job, JobTask task, boolean isMigration) {
+    // Why?

Review comment: I think it is conceivable that the task could have suddenly kicked in between the call to isInMovableState and the call to job.setTaskState. If that were the case, the task state would show up as REBALANCE, but the true state would be RUNNING, which would be a bad outcome. Holding the job lock here prevents the task from kicking until we have decided whether we are rebalancing this task.
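The hazard described here is a check-then-act race, which only goes away if the check and the state write happen under one lock. A self-contained sketch of the pattern (simplified states; not the real Job/JobTask API):

```java
import java.util.concurrent.locks.ReentrantLock;

class RebalanceSketch {
    enum TaskState { IDLE, RUNNING, REBALANCE }

    private final ReentrantLock jobLock = new ReentrantLock();
    private TaskState state = TaskState.IDLE;

    // Without the lock, another thread could move state to RUNNING
    // between the check and the set, leaving a running task marked
    // REBALANCE. Holding the job lock makes check-then-act atomic.
    boolean prepareForRebalance() {
        jobLock.lock();
        try {
            if (state != TaskState.IDLE) { // isInMovableState(task)
                return false;
            }
            state = TaskState.REBALANCE;   // job.setTaskState(...)
            return true;
        } finally {
            jobLock.unlock();
        }
    }

    // A kick must take the same lock, so it cannot interleave with the
    // movability check above.
    boolean tryKick() {
        jobLock.lock();
        try {
            if (state != TaskState.IDLE) {
                return false;
            }
            state = TaskState.RUNNING;
            return true;
        } finally {
            jobLock.unlock();
        }
    }
}
```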
     acquireJobLock();
     try {
         if (!SpawnBalancer.isInMovableState(task)) {
@@ -1672,6 +1685,7 @@ public boolean prepareTaskStatesForRebalance(Job job, JobTask task, boolean isMi
 }

 public DeleteStatus forceDeleteJob(String jobUUID) throws Exception {
+    // Job lock is overkill - a better option could be to make setEnabled synchronized, if concurrent modification to the job is the concern
     acquireJobLock();
     Job job;
     try {
@@ -1687,6 +1701,7 @@ public DeleteStatus forceDeleteJob(String jobUUID) throws Exception {
     } finally {
         releaseJobLock();
     }
+    // If the job is deleted in another thread, stopJob and killJob will throw an exception. Should catch that and return DeleteStatus.JOB_MISSING.

Review comment: I don't understand why we are calling both stopJob and killJob in the first place. If we are hoping to let the job shut down cleanly, 100 ms is certainly not enough when we consider that the job almost certainly has to replicate and back up. I think killJob should be sufficient, since our intent is to delete anyway.
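Putting both notes together, the loop could drop stopJob and treat a job that vanishes mid-loop as already deleted. A sketch of that shape, reusing the surrounding names; whether killJob actually throws when the job is missing, and how the loop refreshes `job`, are assumptions here:

```java
// Sketch only: assumes killJob throws when the job has been deleted by
// another thread; the real code may signal that differently.
while ((job != null) && (job.getCountActiveTasks() > 0)) {
    try {
        killJob(jobUUID); // no stopJob: the intent is to delete anyway
    } catch (Exception e) {
        // Job deleted by another thread between iterations.
        return DeleteStatus.JOB_MISSING;
    }
    Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
    job = getJob(jobUUID);
}
```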
     while ((job != null) && (job.getCountActiveTasks() > 0)) {
         stopJob(jobUUID);
         Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
@@ -1698,6 +1713,7 @@ public DeleteStatus forceDeleteJob(String jobUUID) throws Exception {
 }

 public DeleteStatus deleteJob(String jobUUID) throws Exception {
+    // OK
     acquireJobLock();
     try {
         Job job = getJob(jobUUID);
@@ -1932,6 +1948,7 @@ public void stopJob(String jobUUID) throws Exception {
 public void killJob(String jobUUID) throws Exception {
     boolean success = false;
     while (!success && !shuttingDown.get()) {
+        // Maybe

Review comment: I think this loop is concerning for an unrelated reason -- if the queueLock is locked, we call tryLock over and over again without any sort of delay in between. Maybe we should sleep for a few millis on failure?
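On the busy-spin point: a timed tryLock (or a short sleep on failure) keeps the loop from burning CPU while the queue lock is held elsewhere. A self-contained sketch using a plain ReentrantLock; whether taskQueuesByPriority can offer a timed tryLock is an assumption:

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

class KickLoopSketch {
    private final ReentrantLock queueLock = new ReentrantLock();

    // Instead of spinning on tryLock(), wait up to a few millis per
    // attempt; tryLock(time, unit) blocks until the lock is free or the
    // timeout elapses, so failed attempts no longer busy-spin.
    void runOnce() throws InterruptedException {
        boolean success = false;
        while (!success) {
            if (queueLock.tryLock(5, TimeUnit.MILLISECONDS)) {
                try {
                    success = true; // do the queue work here
                } finally {
                    queueLock.unlock();
                }
            }
            // else: loop and retry; the timed tryLock already waited
        }
    }
}
```

One caveat: in the real loop the job lock is acquired before the tryLock, so any added wait should probably happen after releasing it, to avoid stalling other job operations.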
         acquireJobLock();
         try {
             if (taskQueuesByPriority.tryLock()) {
@@ -2099,6 +2116,7 @@ public void handleRebalanceFinish(IJob job, JobTask task, StatusTaskEnd update) {
 public JobMacro createJobHostMacro(String job, int port) {
     String sPort = Integer.valueOf(port).toString();
     Set<String> jobHosts = new TreeSet<>(); // best set?
+    // The whole method can be removed
     acquireJobLock();
     try {
         Collection<HostState> hosts = hostManager.listHostStatus(null);
@@ -2138,6 +2156,7 @@ public JobMacro createJobHostMacro(String job, int port) {
  * sent for a while.
  */
 public void saveAllJobs() {
+    // Maybe
     acquireJobLock();
     try {
         for (Job job : listJobs()) {
@@ -2157,6 +2176,7 @@ public void saveAllJobs() {
  * send job update event to registered listeners (usually http clients)
  */
 private void sendJobUpdateEvent(Job job) {
+    // Maybe
     acquireJobLock();
     try {
         jobConfigManager.updateJob(job);
@@ -2457,6 +2477,7 @@ public void kickJobsOnQueue() {
     boolean success = false;
     while (!success && !shuttingDown.get()) {
         // need the job lock first
+        // Maybe...
         acquireJobLock();
         try {
             if (taskQueuesByPriority.tryLock()) {
@@ -2555,6 +2576,7 @@ List<HostState> getHealthyHostStatesHousingTask(JobTask task, boolean allowRepli
 }

 @VisibleForTesting protected void loadJobs() {
+    // Probably can do without
     acquireJobLock();
     try {
         for (IJob iJob : jobConfigManager.loadJobs().values()) {
@@ -2566,6 +2588,7 @@ List<HostState> getHealthyHostStatesHousingTask(JobTask task, boolean allowRepli
         releaseJobLock();
     }
     Thread loadDependencies = new Thread(() -> {
+        // FIXME just iterate over the map entries...
         Set<String> jobIds = spawnState.jobs.keySet();
         for (String jobId : jobIds) {
             IJob job = getJob(jobId);
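For the FIXME, iterating the entries directly avoids a second lookup per key. A tiny sketch, assuming spawnState.jobs behaves like a Map<String, Job> (stand-in types below):

```java
import java.util.HashMap;
import java.util.Map;

class LoadDependenciesSketch {
    static class Job { String config; }

    private final Map<String, Job> jobs = new HashMap<>();

    void loadDependencies() {
        // One pass over the entries: no per-key getJob lookup, and no
        // null check for a job removed after keySet() was snapshotted.
        for (Map.Entry<String, Job> entry : jobs.entrySet()) {
            String jobId = entry.getKey();
            Job job = entry.getValue();
            // ... update dependency graph for (jobId, job) here ...
        }
    }
}
```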
@@ -808,6 +808,7 @@ public void fixTasksForFailedHost(List<HostState> hosts, String failedHost) {

 private List<JobTask> findAllTasksAssignedToHost(String failedHostUUID) {
     List<JobTask> rv = new ArrayList<>();
+    // Why?

Review comment: I think that it is prudent to hold the job lock while failing hosts -- which is the only time this method is called. We don't want to, e.g., kick tasks before all the bad hosts have been replaced. However, this read-only method does not appear to be a useful place to get the lock. I would suggest SpawnBalancer's …
     spawn.acquireJobLock();
     try {
         for (Job job : spawn.listJobs()) {
@@ -1384,6 +1385,7 @@ private Collection<JobTaskMoveAssignment> balanceActiveJobsOnHost(HostState host
     List<JobTaskMoveAssignment> rv = purgeMisplacedTasks(host, 1);
     String hostID = host.getHostUuid();
     for (String jobID : activeJobs) {
+        // Maybe
         spawn.acquireJobLock();
         try {
             Job job = spawn.getJob(jobID);
Review comment (on the swapTask change above): Strongly believe that this lock should be kept. If we somehow tried to swap in two threads at the same time (swaps can be executed via an HTTP call, I believe), then modifications to the replicas and the hostUUID could interleave in a very harmful way -- for example, setting the replica and the host to the same UUID and "losing" a valid replica elsewhere.
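A sketch of the interleaving risk described above: the live host and the replica list are two separate fields, so the swap has to be atomic with respect to other job mutations (field names are hypothetical, not the real JobTask):

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

class SwapSketch {
    private final ReentrantLock jobLock = new ReentrantLock();
    private String hostUuid = "host-1";
    private final List<String> replicas = new ArrayList<>(List.of("host-2"));

    // Two concurrent unguarded swaps could interleave between the three
    // steps below, e.g. leaving hostUuid equal to a replica entry and
    // losing a valid replica. Holding the job lock makes the whole
    // swap atomic.
    void swapWithReplica(String replicaHost) {
        jobLock.lock();
        try {
            if (!replicas.remove(replicaHost)) {
                return; // not a replica of this task
            }
            replicas.add(hostUuid); // old live host becomes a replica
            hostUuid = replicaHost; // replica becomes the live host
        } finally {
            jobLock.unlock();
        }
    }
}
```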