-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VirtualCluster] Dashboard supports CRUD of virtual clusters [2/N] #430
Conversation
allocation_mode = "mixed" | ||
wait_for_condition(_create_or_update_virtual_cluster, timeout=10) | ||
|
||
# TODO: uncomment this when `MixedCluster::IsIdleNodeInstance` is fixed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have to fix MixedCluster::IsIdleNodeInstance
, making it work with ClusterResourceManager
revision=0, | ||
) | ||
assert result["result"] is False | ||
assert str(result["msg"]).endswith("No enough node instances to assign.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may have to tell users which node type and how many nodes we lack, so that users know how to scale up the primary cluster or adapt the virtual cluster requirements.
# The virtual cluster can not be removed because some nodes | ||
# are still in use. | ||
assert result["result"] is False | ||
assert str(result["msg"]).endswith("still in use.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may have to tell users which nodes are still in use, so that they know which nodes to drain before removing. We should tell similar things when users are trying to scale down a virtual cluster.
@@ -328,15 +348,40 @@ void ExclusiveCluster::ForeachJobCluster( | |||
|
|||
///////////////////////// MixedCluster ///////////////////////// | |||
bool MixedCluster::IsIdleNodeInstance(const std::string &job_cluster_id, | |||
const std::string &node_instance_id, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Put node_instance_id
into struct NodeInstance
} | ||
|
||
bool MixedCluster::InUse() const { | ||
// TODO(Shanly): Check if the virtual cluster still running jobs or placement groups. | ||
bool MixedCluster::InUse(ReplicaInstances *in_use_instances) const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool MixedCluster::InUse(ReplicaInstances *in_use_instances) const { | |
bool MixedCluster::InUse() const { |
auto message = ostr.str(); | ||
RAY_LOG(ERROR) << message; | ||
// TODO(Shanly): build a new status. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return Status::UnsafeToRemove();
@@ -261,10 +276,11 @@ class MixedCluster : public VirtualCluster { | |||
/// Check if the virtual cluster is in use. | |||
/// | |||
/// \return True if the virtual cluster is in use, false otherwise. | |||
bool InUse() const override; | |||
bool InUse(ReplicaInstances *in_use_instances = nullptr) const override; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool InUse(ReplicaInstances *in_use_instances = nullptr) const override; | |
bool InUse() const override; |
@@ -238,10 +250,11 @@ class ExclusiveCluster : public VirtualCluster { | |||
/// Check if the virtual cluster is in use. | |||
/// | |||
/// \return True if the virtual cluster is in use, false otherwise. | |||
bool InUse() const override; | |||
bool InUse(ReplicaInstances *in_use_instances = nullptr) const override; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool InUse(ReplicaInstances *in_use_instances = nullptr) const override; | |
bool InUse() const override; |
/// \return True if the virtual cluster is in use, false otherwise. | ||
virtual bool InUse() const = 0; | ||
virtual bool InUse(ReplicaInstances *in_use_instances = nullptr) const = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
virtual bool InUse(ReplicaInstances *in_use_instances = nullptr) const = 0; | |
virtual bool InUse() const = 0; |
/// | ||
/// \param replica_sets The demand final replica sets. | ||
/// \param replica_instances The node instances lookuped best effort from the visible | ||
/// node instances. | ||
/// \return OK if the lookup is successful, otherwise return an error. | ||
Status LookupIdleNodeInstances(const ReplicaSets &replica_sets, | ||
Status LookupIdleNodeInstances(ReplicaSets &replica_sets, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Status LookupIdleNodeInstances(ReplicaSets &replica_sets, | |
Status LookupIdleNodeInstances(const ReplicaSets &replica_sets, |
// TODO(Shanly): Give a more detailed error message about the demand replica set and | ||
// the idle replica instances. | ||
return Status::OutOfResource("No enough node instances to assign."); | ||
return Status::OutOfResource(""); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just return success directly.
# Create a new virtual cluster with exclusive allocation mode. | ||
virtual_cluster_id = "virtual_cluster_1" | ||
allocation_mode = "exclusive" | ||
replica_sets = {"4c8g": 1, "8c16g": 1} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add an argument to _create_or_update_virtual_cluster
and fill it with replica_sets
@@ -460,6 +471,32 @@ Status PrimaryCluster::CreateOrUpdateVirtualCluster( | |||
replica_instances_to_add_to_logical_cluster, | |||
replica_instances_to_remove_from_logical_cluster); | |||
if (!status.ok()) { | |||
ReplicaInstances *replica_instances = nullptr; | |||
if (status.IsOutOfResource()) { | |||
replica_instances = &replica_instances_to_add_to_logical_cluster; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (replica_sets) {
*replica_sets = toReplicaSets(replica_instances_to_add_to_logical_cluster);
}
@@ -530,6 +572,7 @@ void PrimaryCluster::OnNodeAdd(const rpc::GcsNodeInfo &node) { | |||
const auto &template_id = node.node_type_name(); | |||
auto node_instance_id = NodeID::FromBinary(node.node_id()).Hex(); | |||
auto node_instance = std::make_shared<gcs::NodeInstance>(); | |||
node_instance->set_node_instance_id(node_instance_id); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
auto node_instance = std::make_sharedgcs::NodeInstance(node_instance_id);
RAY_LOG(WARNING) << "Failed to create or update virtual cluster " | ||
<< virtual_cluster_id << ", status = " << status.ToString(); | ||
if (data != nullptr && (status.IsOutOfResource() || status.IsUnsafeToRemove())) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove there logic from callback
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Why are these changes needed?
This PR adds some e2e tests for the CRUD of virtual clusters.
Related issue number
#417 #418
Checks
git commit -s
) in this PR.scripts/format.sh
to lint the changes in this PR.method in Tune, I've added it in
doc/source/tune/api/
under thecorresponding
.rst
file.