Skip to content

Commit

Permalink
Add latency stats around cluster config file operations (valkey-io#1534)
Browse files Browse the repository at this point in the history
When the cluster changes, we need to persist the cluster configuration,
and these file IO operations may cause latency.

Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin authored Jan 11, 2025
1 parent 10357ce commit 11cb8ee
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 2 deletions.
30 changes: 28 additions & 2 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,7 @@ int clusterSaveConfig(int do_fsync) {
ssize_t written_bytes;
int fd = -1;
int retval = C_ERR;
mstime_t latency;

server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG;

Expand All @@ -830,11 +831,15 @@ int clusterSaveConfig(int do_fsync) {

/* Create a temp file with the new content. */
tmpfilename = sdscatfmt(sdsempty(), "%s.tmp-%i-%I", server.cluster_configfile, (int)getpid(), mstime());
latencyStartMonitor(latency);
if ((fd = open(tmpfilename, O_WRONLY | O_CREAT, 0644)) == -1) {
serverLog(LL_WARNING, "Could not open temp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-open", latency);

latencyStartMonitor(latency);
while (offset < content_size) {
written_bytes = write(fd, ci + offset, content_size - offset);
if (written_bytes <= 0) {
Expand All @@ -845,31 +850,52 @@ int clusterSaveConfig(int do_fsync) {
}
offset += written_bytes;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-write", latency);

if (do_fsync) {
latencyStartMonitor(latency);
server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG;
if (valkey_fsync(fd) == -1) {
serverLog(LL_WARNING, "Could not sync tmp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-fsync", latency);
}

latencyStartMonitor(latency);
if (rename(tmpfilename, server.cluster_configfile) == -1) {
serverLog(LL_WARNING, "Could not rename tmp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-rename", latency);

if (do_fsync) {
latencyStartMonitor(latency);
if (fsyncFileDir(server.cluster_configfile) == -1) {
serverLog(LL_WARNING, "Could not sync cluster config file dir: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-dir-fsync", latency);
}
retval = C_OK; /* If we reached this point, everything is fine. */

cleanup:
if (fd != -1) close(fd);
if (retval == C_ERR) unlink(tmpfilename);
if (fd != -1) {
latencyStartMonitor(latency);
close(fd);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-close", latency);
}
if (retval == C_ERR) {
latencyStartMonitor(latency);
unlink(tmpfilename);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-unlink", latency);
}
sdsfree(tmpfilename);
sdsfree(ci);
return retval;
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/latency-monitor.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,15 @@ tags {"needs:debug"} {
assert_match "*wrong number of arguments for 'latency|help' command" $e
}
}

start_cluster 1 1 {tags {"latency-monitor cluster external:skip needs:latency"} overrides {latency-monitor-threshold 1}} {
test "Cluster config file latency" {
# This test just a sanity test so that we can make sure the code path is cover.
# We don't assert anything since we can't be sure whether it will be counted.
R 0 cluster saveconfig
R 1 cluster saveconfig
R 1 cluster failover force
R 0 latency latest
R 1 latency latest
}
}

0 comments on commit 11cb8ee

Please sign in to comment.