Skip to content
This repository has been archived by the owner on Apr 24, 2023. It is now read-only.

Commit

Permalink
Revert "allow preemptions for tasks with unknown status (#2097)" (#2101)
Browse files Browse the repository at this point in the history
This reverts commit 27938f4.
  • Loading branch information
laurameng authored Jun 22, 2022
1 parent 3097e95 commit 07a1d7c
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 47 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,3 @@ dist
.vagrant/
jobclient/python/docs/build/
*.egg-info/
.clj-kondo/
.lsp/
15 changes: 10 additions & 5 deletions scheduler/src/cook/rebalancer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
;; limitations under the License.
;;
(ns cook.rebalancer
(:require [clojure.core.async :as async]
(:require [chime :refer [chime-at]]
[clojure.core.async :as async]
[clojure.core.cache :as cache]
[clojure.data.priority-map :as pm]
[clojure.tools.logging :as log]
Expand All @@ -27,7 +28,7 @@
[cook.scheduler.dru :as dru]
[cook.task :as task]
[cook.tools :as util]
[datomic.api :as d]
[datomic.api :as d :refer [q]]
[metatransaction.core :as mt]
[metrics.histograms :as histograms]
[metrics.timers :as timers]
Expand Down Expand Up @@ -55,8 +56,12 @@
;;;
;;; Within each cycle, the rebalancer takes the first n tasks in a global DRU queue, it then 'tries' to see if they can
;;; (and should) preempt any existing running tasks. Only tasks over the share are eligible to be preempted. We only do
;;; the preemption if the DRU change is sufficiently large. If there is a set of running tasks that satisfy the
;;; criteria, they are killed and the host is reserved to launch the waiting job.
;;; the preemption if the DRU change is sufficiently large. If there is a set of running tasks that satisfy the criteria, they are killed and the host is reserved to launch the waiting job.
;;; be killed.
;;;
;;; At the start of a cycle, Rebalancer initializes its internal state. Then, for each iteration in a given cycle,
;;; Rebalancer processes a pending job and tries to make room for it by finding a task to preempt and updates its
;;; internal state if such preemption is found.
;;;
;;; Preemption Principle
;;; Rebalancer uses a score-based preemption algorithm.
Expand Down Expand Up @@ -476,7 +481,7 @@
;; all over the place.
(let [job-eid (:db/id (:job/_instance task-ent))
task-eid (:db/id task-ent)]
[[:generic/ensure-some task-eid :instance/status #{(d/entid db :instance.status/running) (d/entid db :instance.status/unknown)}]
[[:generic/ensure task-eid :instance/status (d/entid db :instance.status/running)]
[:generic/atomic-inc job-eid :job/preemptions 1]
;; The database can become inconsistent if we make multiple calls to :instance/update-state in a single
;; transaction; see the comment in the definition of :instance/update-state for more details
Expand Down
16 changes: 0 additions & 16 deletions scheduler/src/cook/schema.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1134,22 +1134,6 @@ for a job. E.g. {:resources {:cpus 4 :mem 3} :constraints {\"unique_host_constra
(throw (ex-info "Fail to ensure attribute" {:entity e
:attribute a
:expected v}))))}}

{:db/id (d/tempid :db.part/user)
:db/ident :generic/ensure-some
:db/doc "Ensures an attribute of an entity has at least one of the expected values. Throws exception otherwise"
:db/fn #db/fn {:lang "clojure"
:params [db e a values]
:requires [[metatransaction.core :as mt]]
:code
(let [db (mt/filter-committed db)]
(if (some values
(map :v
(seq (d/datoms db :eavt e a))))
nil
(throw (ex-info "Fail to ensure attribute" {:entity e
:attribute a
:expected values}))))}}

{:db/id (d/tempid :db.part/user)
:db/ident :job/reasons->attempts-consumed
Expand Down
24 changes: 0 additions & 24 deletions scheduler/test/cook/test/rebalancer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -985,30 +985,6 @@
(doall (for [[scored expected] scored->expected]
(is (= scored expected))))))))))

(deftest test-transact-preemption
(setup)
(testing "testing conditional to preempt only running and unknown instances"
(let [datomic-uri "datomic:mem://test-init-state"
conn (restore-fresh-database! datomic-uri)
db (d/db conn)
job1 (create-dummy-job conn :user "alexh" :memory 10.0 :ncpus 10.0)
job2 (create-dummy-job conn :user "alexh" :memory 10.0 :ncpus 10.0)
job3 (create-dummy-job conn :user "alexh" :memory 10.0 :ncpus 10.0)

task1 (create-dummy-instance conn job1 :instance-status :instance.status/running)
task2 (create-dummy-instance conn job2 :instance-status :instance.status/unknown)
task3 (create-dummy-instance conn job3 :instance-status :instance.status/success)

task-ent1 (d/entity (d/db conn) task1)
task-ent2 (d/entity (d/db conn) task2)
task-ent3 (d/entity (d/db conn) task3)]

;; Transaction exceptions are wrapped and converted to a log, so test for nil return.
(is (not (nil? (rebalancer/transact-preemption! db conn "pool1" task-ent1))))
(is (not (nil? (rebalancer/transact-preemption! db conn "pool1" task-ent2))))

(is (nil? (rebalancer/transact-preemption! db conn "pool1" task-ent3))))))

(defn rebalance
"Calculates the jobs to make for and the initial state, and then delegates to rebalancer/rebalance"
[db agent-attributes-cache pending-job-ents host->spare-resources rebalancer-reservation-atom
Expand Down

0 comments on commit 07a1d7c

Please sign in to comment.