From db84a399042ed4f9b1cbf7a96fc463a91edcb3c4 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 22:56:54 +0800 Subject: [PATCH 01/60] fix empty command which cause titan to crash in parsing command --- client.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/client.go b/client.go index f02360e5..a8523bba 100644 --- a/client.go +++ b/client.go @@ -102,6 +102,13 @@ func (c *client) serve(conn net.Conn) error { c.server.servCtx.Pause = 0 } + if len(cmd) <= 0 { + zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID)) + resp.ReplyError(c, command.ErrEmptyArray.Error()) + continue + } + c.cliCtx.Updated = time.Now() c.cliCtx.LastCmd = cmd[0] From 3e4b89e0f684b9161e528d95c9c078564f02f427 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 23:03:01 +0800 Subject: [PATCH 02/60] change empty command's reply error from ErrEmptyArray to ErrEmptyCommand --- client.go | 2 +- command/error.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index a8523bba..5e7467f6 100644 --- a/client.go +++ b/client.go @@ -105,7 +105,7 @@ func (c *client) serve(conn net.Conn) error { if len(cmd) <= 0 { zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) - resp.ReplyError(c, command.ErrEmptyArray.Error()) + resp.ReplyError(c, command.ErrEmptyCommand.Error()) continue } diff --git a/command/error.go b/command/error.go index e4b46287..5ae788c4 100644 --- a/command/error.go +++ b/command/error.go @@ -91,6 +91,8 @@ var ( // ErrEmptyArray error ErrEmptyArray = errors.New("EmptyArray error") + ErrEmptyCommand = errors.New("command is empty") + //ErrExec exec without multi ErrExec = errors.New("ERR EXEC without MULTI") From c89b27f4e1de6356b08665b255c5f8c3ec72b1a9 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 23:23:38 +0800 Subject: [PATCH 03/60] close the connection when meet empty command --- client.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index 5e7467f6..cf6e5834 100644 --- a/client.go +++ b/client.go @@ -106,7 +106,8 @@ func (c *client) serve(conn net.Conn) error { zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) resp.ReplyError(c, command.ErrEmptyCommand.Error()) - continue + c.conn.Close() + return nil } c.cliCtx.Updated = time.Now() From 43bd6c355fc5765e1f2f14ca4963ea44a1774198 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Sep 2019 15:30:01 +0800 Subject: [PATCH 04/60] some illegal clients send unreadable command which may cause titan crash. 
just after read the command we check if it is in command list we support, else will skip it --- client.go | 12 ++++++++++-- command/command.go | 26 ++++++++++++++++---------- command/init.go | 2 ++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/client.go b/client.go index cf6e5834..87f1348c 100644 --- a/client.go +++ b/client.go @@ -103,15 +103,23 @@ func (c *client) serve(conn net.Conn) error { } if len(cmd) <= 0 { - zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), + err := command.ErrEmptyCommand + zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) - resp.ReplyError(c, command.ErrEmptyCommand.Error()) + resp.ReplyError(c, err.Error()) c.conn.Close() return nil } c.cliCtx.Updated = time.Now() c.cliCtx.LastCmd = cmd[0] + if !c.exec.CanExecute(c.cliCtx.LastCmd) { + err := command.ErrUnKnownCommand(c.cliCtx.LastCmd) + zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID)) + resp.ReplyError(c, err.Error()) + continue + } ctx := &command.Context{ Name: cmd[0], diff --git a/command/command.go b/command/command.go index 61e26af0..186105ec 100644 --- a/command/command.go +++ b/command/command.go @@ -71,7 +71,7 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { continue } resp.ReplyBulkString(w, string(a[i])) - if i % 10 == 9 { + if i%10 == 9 { zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) start = time.Now() } @@ -80,9 +80,9 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { } func BytesArrayOnce(w io.Writer, a [][]byte) OnCommit { - return func() { - resp.ReplyStringArray(w, a) - } + return func() { + resp.ReplyStringArray(w, a) + } } // TxnCommand runs a command in transaction @@ -182,18 +182,18 @@ func AutoCommit(cmd TxnCommand) Command { return func(ctx *Context) { retry.Ensure(ctx, func() error { mt := metrics.GetMetrics() - start := time.Now() + start := time.Now() txn, err := ctx.Client.DB.Begin() key := "" if len(ctx.Args) > 0 { key = ctx.Args[0] if len(ctx.Args) > 1 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args)-1)) + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args) - 1)) } } cost := time.Since(start).Seconds() - zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) - mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) + mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, "ERR "+err.Error()) @@ -277,9 +277,9 @@ func feedMonitors(ctx *Context) { id := strconv.FormatInt(int64(ctx.Client.DB.ID), 10) line := ts + " [" + id + " " + ctx.Client.RemoteAddr + "]" + " " + ctx.Name + " " + strings.Join(ctx.Args, " ") - start := time.Now() + start := time.Now() err := resp.ReplySimpleString(mCtx.Out, line) - zap.L().Debug("feedMonitors reply", zap.String("name", ctx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) + zap.L().Debug("feedMonitors reply", zap.String("name", ctx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) if err != nil { 
ctx.Server.Monitors.Delete(k) } @@ -299,6 +299,12 @@ func NewExecutor() *Executor { return &Executor{txnCommands: txnCommands, commands: commands} } +func (e *Executor) CanExecute(cmd string) bool { + lowerName := strings.ToLower(cmd) + _, ok := commands[lowerName] + return ok +} + // Execute a command func (e *Executor) Execute(ctx *Context) { start := time.Now() diff --git a/command/init.go b/command/init.go index 5e1189f3..fa62cf15 100644 --- a/command/init.go +++ b/command/init.go @@ -117,6 +117,8 @@ func init() { // transactions, exec and discard should called explicitly, so they are registered here "multi": Desc{Proc: Multi, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, + "exec": Desc{Proc: Exec, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, + "discard": Desc{Proc: Discard, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, "watch": Desc{Proc: Watch, Cons: Constraint{-2, flags("sF"), 1, -1, 1}}, "unwatch": Desc{Proc: Unwatch, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, From 95bd04b6e08ba54615c2d0fab3c71f0f7df4753b Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Sep 2019 18:31:08 +0800 Subject: [PATCH 05/60] if a connection send unknown commands 3 times, close it --- client.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index 87f1348c..2728481b 100644 --- a/client.go +++ b/client.go @@ -78,6 +78,7 @@ func (c *client) serve(conn net.Conn) error { var cmd []string var err error + unknownCmdTimes := int(0) for { select { case <-c.cliCtx.Done: @@ -118,7 +119,13 @@ func (c *client) serve(conn net.Conn) error { zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) resp.ReplyError(c, err.Error()) - continue + unknownCmdTimes++ + if unknownCmdTimes >= 3 { + c.conn.Close() + return nil + } else { + continue + } } ctx := &command.Context{ From 63b131d12986f3d73a8146d8dc311c84e253c482 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 18 Sep 2019 20:00:26 +0800 Subject: [PATCH 06/60] add expire left seconds metrics --- db/expire.go | 13 ++++++++++--- metrics/prometheus.go | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/db/expire.go b/db/expire.go index 76d769b4..4528dac1 100644 --- a/db/expire.go +++ b/db/expire.go @@ -151,9 +151,10 @@ func runExpire(db *DB, batchLimit int) { limit := batchLimit now := time.Now().UnixNano() + ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { rawKey := iter.Key() - ts := DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) + ts = DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) if ts > now { if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] not need to expire key"); logEnv != nil { logEnv.Write(zap.String("raw-key", string(rawKey)), zap.Int64("last-timestamp", ts)) @@ -198,10 +199,17 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } + diff := (ts - now) / int64(time.Second) + if diff >= 0 { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + } else { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + } + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) } -func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte)error{ +func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { dkey := toTikvDataKey(namespace, dbid, id) if err := 
gc(txn.t, dkey); err != nil { zap.L().Error("[Expire] gc failed", @@ -251,4 +259,3 @@ func doExpire(txn *Transaction, mkey, id []byte) error { } return gcDataKey(txn, namespace, dbid, key, id) } - diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 8c7f42be..c1f0ab60 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -45,10 +45,11 @@ type Metrics struct { ConnectionOnlineGaugeVec *prometheus.GaugeVec //command - ZTInfoCounterVec *prometheus.CounterVec - IsLeaderGaugeVec *prometheus.GaugeVec - LRangeSeekHistogram prometheus.Histogram - GCKeysCounterVec *prometheus.CounterVec + ZTInfoCounterVec *prometheus.CounterVec + IsLeaderGaugeVec *prometheus.GaugeVec + ExpireLeftSecondsVec *prometheus.GaugeVec + LRangeSeekHistogram prometheus.Histogram + GCKeysCounterVec *prometheus.CounterVec //expire ExpireKeysTotal *prometheus.CounterVec @@ -171,6 +172,14 @@ func init() { }, bizLabel) prometheus.MustRegister(gm.ConnectionOnlineGaugeVec) + gm.ExpireLeftSecondsVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "expire_left_seconds", + Help: "The seconds after which from now will do expire", + }, expireLabel) + prometheus.MustRegister(gm.ExpireLeftSecondsVec) + gm.LRangeSeekHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, From a9c28fe2209fb29a1b63f37dbef039703ea763cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 18 Sep 2019 20:46:37 +0800 Subject: [PATCH 07/60] fix expire left time calculating --- db/expire.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/db/expire.go b/db/expire.go index 4528dac1..4b21b82c 100644 --- a/db/expire.go +++ b/db/expire.go @@ -190,6 +190,14 @@ func runExpire(db *DB, batchLimit int) { limit-- } + now = time.Now().UnixNano() + diff := (ts - now) / int64(time.Second) + if diff >= 0 { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + } else { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + } + if err := txn.Commit(context.Background()); err != nil { txn.Rollback() zap.L().Error("[Expire] commit failed", zap.Error(err)) @@ -199,13 +207,6 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } - diff := (ts - now) / int64(time.Second) - if diff >= 0 { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) - } else { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) - } - metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) } From e5ff5013b4c8235fe9a28d634bdbfd20b09398a8 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 19 Sep 2019 11:13:29 +0800 Subject: [PATCH 08/60] when set left/delay current seconds(expire), also set other seconds to 0 --- db/expire.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/expire.go b/db/expire.go index 4b21b82c..36acc2b6 100644 --- a/db/expire.go +++ b/db/expire.go @@ -194,8 +194,10 @@ func runExpire(db *DB, batchLimit int) { diff := (ts - now) / int64(time.Second) if diff >= 0 { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(0) } else { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) } if err := txn.Commit(context.Background()); 
err != nil { From fb11c671ae648fc3fc7b3f8c01a2f60d5e9507b5 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 25 Oct 2019 15:28:22 +0800 Subject: [PATCH 09/60] delete zadd args output --- command/zsets.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/command/zsets.go b/command/zsets.go index 35a1813d..6eb930c3 100644 --- a/command/zsets.go +++ b/command/zsets.go @@ -2,7 +2,6 @@ package command import ( "errors" - "fmt" "math" "strconv" "strings" @@ -14,8 +13,6 @@ import ( func ZAdd(ctx *Context, txn *db.Transaction) (OnCommit, error) { key := []byte(ctx.Args[0]) - fmt.Println("zadd", ctx.Args) - kvs := ctx.Args[1:] if len(kvs)%2 != 0 { return nil, errors.New("ERR syntax error") From 586b30d0ab9fb0653cef4c81a05c0a1e1a11246f Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 7 Nov 2019 20:02:40 +0800 Subject: [PATCH 10/60] first version of rate limit --- command/command.go | 1 + conf/config.go | 23 +++++--- db/ratelimiter.go | 134 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 db/ratelimiter.go diff --git a/command/command.go b/command/command.go index 186105ec..bbac665a 100644 --- a/command/command.go +++ b/command/command.go @@ -159,6 +159,7 @@ func Call(ctx *Context) { feedMonitors(ctx) start := time.Now() + // judge ctx.name and ctx.args cmdInfoCommand.Proc(ctx) cost := time.Since(start) diff --git a/conf/config.go b/conf/config.go index d97adb9a..e677a415 100644 --- a/conf/config.go +++ b/conf/config.go @@ -1,6 +1,8 @@ package conf -import "time" +import ( + "time" +) // Titan configuration center type Titan struct { @@ -34,12 +36,13 @@ type Server struct { // Tikv config is the config of tikv sdk type Tikv struct { - PdAddrs string `cfg:"pd-addrs;required; ;pd address in tidb"` - DB DB `cfg:"db"` - GC GC `cfg:"gc"` - Expire Expire `cfg:"expire"` - ZT ZT `cfg:"zt"` - TikvGC TikvGC `cfg:"tikv-gc"` + PdAddrs string `cfg:"pd-addrs;required; ;pd address in tidb"` + DB DB `cfg:"db"` + GC GC `cfg:"gc"` + Expire Expire `cfg:"expire"` + ZT ZT `cfg:"zt"` + TikvGC TikvGC `cfg:"tikv-gc"` + RateLimit RateLimit `cfg:"rate-limit"` } // TikvGC config is the config of implement tikv sdk gcwork @@ -99,3 +102,9 @@ type Status struct { SSLCertFile string `cfg:"ssl-cert-file;;;status server SSL certificate file (enables SSL support)"` SSLKeyFile string `cfg:"ssl-key-file;;;status server SSL key file"` } + +type RateLimit struct { + LimitPeriod int `cfg:"limit-period; 20; numeric; the period in ms to limit rate"` + BalancePeriod int `cfg:"sync-global-period; 3000; numeric; the period in ms to balance rate limiting with other titan nodes"` + Rule string `cfg:"rule; setex:10000:1000000,set:20000:1000000; ; the rule to limit rate(command1:maxqps:maxrate,command2:maxqps:maxrate...)"` +} diff --git a/db/ratelimiter.go b/db/ratelimiter.go new file mode 100644 index 00000000..bb1b6381 --- /dev/null +++ b/db/ratelimiter.go @@ -0,0 +1,134 @@ +package db + +import ( + "sync" + "time" +) + +const ( + ALL_NAMESPACE = "*" +) + +type CommandLimiter struct { + lock sync.Mutex + startTime time.Time + commandsCount int + commandsSize int + maxCommandsCount int + maxCommandsSize int + period int + periodsNumToTriggerBalance int + totalCommandsCount int + totalCommandsSize int +} + +type NamespaceLimiter struct { + lock sync.Mutex + commandLimiter map[string]*CommandLimiter +} + +type LimitRule struct { + //namespace string + //command string + maxqps int + maxrate int +} +type Limiter struct { + lock sync.Mutex + + limitPeriod int + balancePeriod 
int + currentIp string + + limitRules map[string]map[string]LimitRule + limiter map[string]NamespaceLimiter +} + +func NewLimitRule(rule string) map[string]map[string]LimitRule { + return nil +} + +func NewLimiter(limitPeriod int, balancePeriod int, rule string) *Limiter { + l := &Limiter{ + limitPeriod: limitPeriod, + balancePeriod: balancePeriod, + currentIp: "127.0.0.1", //set current ip + limitRules: NewLimitRule(rule), + limiter: make(map[string]NamespaceLimiter), + } + + return l +} + +func (l *Limiter) addAllNamespaceLimiter(rule string) { + +} + +//func (l *Limiter) addNamespaceLimiter(namespace string, command string, maxQps int, maxRate int) { +// +//} + +func (l *Limiter) get(namespace string, command string) *CommandLimiter { + return nil +} + +func (l *Limiter) setCommandLimiter(namespace string, command string, commandLimiter *CommandLimiter) *CommandLimiter { + return nil +} + +func (l *Limiter) checkLimit(namespace string, command string, argSize int) { + commandLimiter := l.get(namespace, command) + if commandLimiter == nil { + //get rule from limitRule + limitRule, ok := l.limitRules[namespace][command] + if ok { + tmp := NewCommandLimiter(limitRule) + commandLimiter = l.setCommandLimiter(namespace, command, tmp) + } else { + limitRule, ok = l.limitRules[ALL_NAMESPACE][command] + if ok { + tmp := NewCommandLimiter(limitRule) + commandLimiter = l.setCommandLimiter(namespace, command, tmp) + } + } + } + if commandLimiter != nil { + commandLimiter.checkLimit(len(command), argSize) + } +} + +func (l *Limiter) watchConfigChanged() { + +} + +func (l *Limiter) clearLimitPeriod() { + +} + +func (l *Limiter) balanceLimitInNodes() { + +} + +//func NewNamespaceLimiter() *NamespaceLimiter { +// +//} + +func (nl *NamespaceLimiter) getOrSetCommandLimiter(command string, maxQps int, maxRate int) *CommandLimiter { + return nil +} + +//func (nl *NamespaceLimiter) checkLimit (command string, argSize int) { +// +//} + +func NewCommandLimiter(limitRule LimitRule) *CommandLimiter { + return nil +} + +func (cl *CommandLimiter) checkLimit(commandSize int, argSize int) { + +} + +func (cl *CommandLimiter) reset() { + +} From 27f54e7bdf99fe4275d75122cac9472f18843f5a Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 11:30:50 +0800 Subject: [PATCH 11/60] 1 use rate.limiter to implement limit and read tikv key/value to get new limit 2 just balance limit base in active titan server num 3 unit test of rate limit passed --- bin/titan/main.go | 7 +- command/command.go | 5 +- conf/config.go | 7 +- conf/mockconfig.go | 5 + context/context.go | 1 + db/limitersMgr.go | 465 ++++++++++++++++++++++++++++++++++++ db/ratelimiter.go | 134 ----------- go.mod | 2 +- metrics/prometheus.go | 28 +++ tools/autotest/auto.go | 165 ++++++++++++- tools/autotest/auto_test.go | 1 + tools/integration/titan.go | 9 +- tools/test/main.go | 1 + 13 files changed, 682 insertions(+), 148 deletions(-) create mode 100644 db/limitersMgr.go delete mode 100644 db/ratelimiter.go diff --git a/bin/titan/main.go b/bin/titan/main.go index 4adb1122..4616110b 100644 --- a/bin/titan/main.go +++ b/bin/titan/main.go @@ -68,11 +68,16 @@ func main() { } svr := metrics.NewServer(&config.Status) - + limitersMgr, err := db.NewLimitersMgr(store, config.Tikv.RateLimit) + if err != nil { + zap.L().Fatal("create limitersMgr failed", zap.Error(err)) + os.Exit(1) + } serv := titan.New(&context.ServerContext{ RequirePass: config.Server.Auth, Store: store, ListZipThreshold: config.Server.ListZipThreshold, + LimitersMgr: limitersMgr, }) var 
servOpts, statusOpts []continuous.ServerOption diff --git a/command/command.go b/command/command.go index bbac665a..0a17d06d 100644 --- a/command/command.go +++ b/command/command.go @@ -92,6 +92,10 @@ type TxnCommand func(ctx *Context, txn *db.Transaction) (OnCommit, error) func Call(ctx *Context) { ctx.Name = strings.ToLower(ctx.Name) + if _, ok := txnCommands[ctx.Name]; ok && ctx.Server.LimitersMgr != nil { + ctx.Server.LimitersMgr.CheckLimit(ctx.Client.Namespace, ctx.Name, ctx.Args) + } + if ctx.Name != "auth" && ctx.Server.RequirePass != "" && ctx.Client.Authenticated == false { @@ -159,7 +163,6 @@ func Call(ctx *Context) { feedMonitors(ctx) start := time.Now() - // judge ctx.name and ctx.args cmdInfoCommand.Proc(ctx) cost := time.Since(start) diff --git a/conf/config.go b/conf/config.go index e677a415..8228c40d 100644 --- a/conf/config.go +++ b/conf/config.go @@ -104,7 +104,8 @@ type Status struct { } type RateLimit struct { - LimitPeriod int `cfg:"limit-period; 20; numeric; the period in ms to limit rate"` - BalancePeriod int `cfg:"sync-global-period; 3000; numeric; the period in ms to balance rate limiting with other titan nodes"` - Rule string `cfg:"rule; setex:10000:1000000,set:20000:1000000; ; the rule to limit rate(command1:maxqps:maxrate,command2:maxqps:maxrate...)"` + InterfaceName string `cfg:interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit` + GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` + TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` + SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` } diff --git a/conf/mockconfig.go b/conf/mockconfig.go index eab70988..9649b490 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -33,6 +33,11 @@ func MockConf() *Titan { SafePointLifeTime: 10 * time.Minute, Concurrency: 2, }, + RateLimit: RateLimit{ + SyncSetPeriod: 1 * time.Second, + GlobalBalancePeriod: 15 * time.Second, + TitanStatusLifetime: 30 * time.Second, + }, }, } } diff --git a/context/context.go b/context/context.go index 5082436e..22880a74 100644 --- a/context/context.go +++ b/context/context.go @@ -77,6 +77,7 @@ type ServerContext struct { Store *db.RedisStore Monitors sync.Map Clients sync.Map + LimitersMgr *db.LimitersMgr Pause time.Duration // elapse to pause all clients StartAt time.Time ListZipThreshold int diff --git a/db/limitersMgr.go b/db/limitersMgr.go new file mode 100644 index 00000000..8a5da922 --- /dev/null +++ b/db/limitersMgr.go @@ -0,0 +1,465 @@ +package db + +import ( + "context" + "errors" + "fmt" + "github.com/distributedio/titan/conf" + "github.com/distributedio/titan/metrics" + sdk_kv "github.com/pingcap/tidb/kv" + "go.uber.org/zap" + "golang.org/x/time/rate" + "net" + "strconv" + "strings" + "sync" + "time" +) + +const ( + LIMITDATA_NAMESPACE = "sys_ratelimit" + LIMITDATA_DBID = 0 + ALL_NAMESPACE = "*" + NAMESPACE_COMMAND_TOKEN = "@" + QPS_PREFIX = "qps:" + RATE_PREFIX = "rate:" + RATE_BURST_TOKEN = " " + TITAN_STATUS_TOKEN = "titan_status:" + TIME_FORMAT = "2006-01-02 15:04:05" + DEFAULT_BURST = 1 +) + +type CommandLimiter struct { + localIp string + limiterName string + + qpsl *rate.Limiter + ratel *rate.Limiter + localPercent float64 + + lock sync.Mutex + lastTime time.Time + totalCommandsCount int64 + totalCommandsSize int64 +} + +type 
LimitersMgr struct { + limitDatadb *DB + globalBalancePeriod time.Duration + titanStatusLifeTime time.Duration + syncSetPeriod time.Duration + localIp string + localPercent float64 + + limiters sync.Map + lock sync.Mutex +} + +func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { + var addrs []net.Addr + var err error + if rateLimit.InterfaceName != "" { + iface, err := net.InterfaceByName(rateLimit.InterfaceName) + if err != nil { + return nil, err + } + + addrs, err = iface.Addrs() + if err != nil { + return nil, err + } + } else { + addrs, err = net.InterfaceAddrs() + if err != nil { + return nil, err + } + } + + localIp := "" + for _, a := range addrs { + if ipnet, ok := a.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && ipnet.IP.To4() != nil { + localIp = ipnet.IP.String() + break + } + } + if localIp == "" { + return nil, errors.New(rateLimit.InterfaceName + " adds is empty") + } + + l := &LimitersMgr{ + limitDatadb: store.DB(LIMITDATA_NAMESPACE, LIMITDATA_DBID), + globalBalancePeriod: rateLimit.GlobalBalancePeriod, + titanStatusLifeTime: rateLimit.TitanStatusLifetime, + syncSetPeriod: rateLimit.SyncSetPeriod, + localIp: localIp, + localPercent: 1, + } + + go l.startBalanceLimit() + go l.startSyncNewLimit() + return l, nil +} + +func (l *LimitersMgr) init(limiterName string) *CommandLimiter { + //lock is just prevent many new connection of same namespace to getlimit from tikv in same time + l.lock.Lock() + defer l.lock.Unlock() + + v, ok := l.limiters.Load(limiterName) + if ok { + return v.(*CommandLimiter) + } + + qpsLimit, _ := l.getLimit(limiterName, true) + rateLimit, rateBurst := l.getLimit(limiterName, false) + if qpsLimit > 0 || + (rateLimit > 0 && rateBurst > 0) { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + v, _ := l.limiters.LoadOrStore(limiterName, newCl) + return v.(*CommandLimiter) + } else { + l.limiters.LoadOrStore(limiterName, (*CommandLimiter)(nil)) + return nil + } +} + +func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + zap.L().Error("[Limit] wrong format limiter name", zap.String("limiterName", limiterName)) + return 0, 0 + } + allMatchLimiter := fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) + limiterNames := []string{limiterName, allMatchLimiter} + limit := float64(0) + burst := int64(0) + + txn, err := l.limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("limiterName", limiterName), zap.Bool("isQps", isQps), zap.Error(err)) + return 0, 0 + } + for i := range limiterNames { + var limiterKey string + if isQps { + limiterKey = QPS_PREFIX + limiterNames[i] + } else { + limiterKey = RATE_PREFIX + limiterNames[i] + } + + str, err := txn.String([]byte(limiterKey)) + if err != nil { + zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) + return 0, 0 + } + val, err := str.Get() + if err != nil { + zap.L().Info("[Limit] limit isn't set", zap.String("key", limiterKey), zap.Error(err)) + continue + } + + if isQps { + if limit, err = strconv.ParseFloat(string(val), 64); err != nil { + zap.L().Error("[Limit] qps limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + } else { + rateLimitVals := strings.Split(string(val), RATE_BURST_TOKEN) + if len(rateLimitVals) < 2 { + zap.L().Error("[Limit] rate limit 
hasn't enough parameters, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + rateLimit := rateLimitVals[0] + rateBurst := rateLimitVals[1] + if len(rateLimit) < 2 { + zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + var strUnit uint8 + var unit float64 + strUnit = rateLimit[len(rateLimit)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + } else { + zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + if limit, err = strconv.ParseFloat(rateLimit[:len(rateLimit)-1], 64); err != nil { + zap.L().Error("[Limit] rate limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + limit *= unit + if burst, err = strconv.ParseInt(rateBurst, 10, 64); err != nil { + zap.L().Error("[Limit] rate burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + } + + zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + break + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) + txn.t.Rollback() + return 0, 0 + } + return limit, int(burst) +} + +func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []string) { + if namespace == LIMITDATA_NAMESPACE { + return + } + limiterName := fmt.Sprintf("%s%s%s", namespace, NAMESPACE_COMMAND_TOKEN, cmdName) + v, ok := l.limiters.Load(limiterName) + var commandLimiter *CommandLimiter + if !ok { + commandLimiter = l.init(limiterName) + } else { + commandLimiter = v.(*CommandLimiter) + } + + if commandLimiter != nil { + now := time.Now() + commandLimiter.checkLimit(cmdName, cmdArgs) + cost := time.Since(now).Seconds() + metrics.GetMetrics().LimitCostHistogramVec.WithLabelValues(namespace, cmdName).Observe(cost) + } +} + +func (l *LimitersMgr) startBalanceLimit() { + ticker := time.NewTicker(l.globalBalancePeriod) + defer ticker.Stop() + for range ticker.C { + l.runBalanceLimit() + } +} + +func (l *LimitersMgr) runBalanceLimit() { + txn, err := l.limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("titan", l.localIp), zap.Error(err)) + return + } + + prefix := MetaKey(l.limitDatadb, []byte(TITAN_STATUS_TOKEN)) + endPrefix := sdk_kv.Key(prefix).PrefixNext() + iter, err := txn.t.Iter(prefix, endPrefix) + if err != nil { + zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) + txn.Rollback() + return + } + defer iter.Close() + + activeNum := 1 + prefixLen := len(prefix) + for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { + if err != nil { + zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) + txn.Rollback() + return + } + + key := iter.Key() + if len(key) <= prefixLen { + zap.L().Error("ip is null", zap.ByteString("key", key)) + continue + } + ip := key[prefixLen:] + if string(ip) == l.localIp { + continue + } + + lastActive := iter.Value() + lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) + if err != nil { + zap.L().Error("[Limit] decode time string failed", 
zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Error(err)) + continue + } + + zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) + if time.Since(lastActiveT) <= l.titanStatusLifeTime { + activeNum++ + } + } + newPercent := float64(1.0 / activeNum) + if l.localPercent != newPercent { + zap.L().Info("[Limit] balance limit in all titan server", zap.Int("active server num", activeNum), + zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) + l.limiters.Range(func(k, v interface{}) bool { + commandLimiter := v.(*CommandLimiter) + if commandLimiter != nil { + commandLimiter.updateLimitPercent(newPercent) + } + return true + }) + l.localPercent = newPercent + } + + key := []byte(TITAN_STATUS_TOKEN + l.localIp) + s := NewString(txn, key) + now := time.Now() + value := now.Format(TIME_FORMAT) + if err := s.Set([]byte(value), 0); err != nil { + txn.Rollback() + return + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", l.localIp)) + txn.Rollback() + return + } +} + +func (l *LimitersMgr) startSyncNewLimit() { + ticker := time.NewTicker(l.syncSetPeriod) + defer ticker.Stop() + for range ticker.C { + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + qpsLimit, _ := l.getLimit(limiterName, true) + rateLimit, rateBurst := l.getLimit(limiterName, false) + if qpsLimit > 0 || + (rateLimit > 0 && rateBurst > 0) { + if commandLimiter == nil { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + l.limiters.Store(limiterName, newCl) + } else { + commandLimiter.updateLimit(qpsLimit, rateLimit, rateBurst) + } + } else { + if commandLimiter != nil { + l.limiters.Store(limiterName, (*CommandLimiter)(nil)) + } + } + return true + }) + } +} + +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rateLimit float64, rateBurst int) *CommandLimiter { + var qpsl, ratel *rate.Limiter + if qpsLimit > 0 { + qpsl = rate.NewLimiter(rate.Limit(qpsLimit), DEFAULT_BURST) + } + if rateLimit > 0 && rateBurst > 0 { + ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) + } + cl := &CommandLimiter{ + limiterName: limiterName, + localIp: localIp, + qpsl: qpsl, + ratel: ratel, + localPercent: 1, + lastTime: time.Now(), + } + return cl +} + +func (cl *CommandLimiter) updateLimit(qpsLimit float64, rateLimit float64, rateBurst int) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if qpsLimit > 0 { + if cl.qpsl != nil { + if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { + cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) + } + } else { + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), DEFAULT_BURST) + } + } else { + cl.qpsl = nil + } + + if rateLimit > 0 && rateBurst > 0 { + if cl.ratel != nil { + if cl.ratel.Burst() != rateBurst { + cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + } else if cl.ratel.Limit() != rate.Limit(rateLimit*cl.localPercent) { + cl.ratel.SetLimit(rate.Limit(rateLimit * cl.localPercent)) + } + } else { + cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + } + } else { + cl.ratel = nil + } + + var qpsLocal, rateLocal float64 + seconds := time.Since(cl.lastTime).Seconds() + if seconds >= 0 { + qpsLocal = float64(cl.totalCommandsCount) / seconds + rateLocal = float64(cl.totalCommandsSize) / 1024 / 
seconds + } else { + qpsLocal = 0 + rateLocal = 0 + } + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) + cl.totalCommandsCount = 0 + cl.totalCommandsSize = 0 + cl.lastTime = time.Now() +} + +func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { + if cl.qpsl != nil { + qpsLimit := float64(cl.qpsl.Limit()) / cl.localPercent + cl.qpsl.SetLimit(rate.Limit(qpsLimit * newPercent)) + } + if cl.ratel != nil { + rateLimit := float64(cl.ratel.Limit()) / cl.localPercent + cl.ratel.SetLimit(rate.Limit(rateLimit * newPercent)) + } + cl.localPercent = newPercent + } +} + +func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if cl.qpsl != nil { + r := cl.qpsl.Reserve() + if !r.OK() { + zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) + } else { + d := r.Delay() + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) + } + } + + cmdSize := len(cmdName) + for i := range cmdArgs { + cmdSize += len(cmdArgs[i]) + 1 + } + if cl.ratel != nil { + r := cl.ratel.ReserveN(time.Now(), cmdSize) + if !r.OK() { + zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) + } else { + d := r.Delay() + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) + } + } + + cl.totalCommandsCount++ + cl.totalCommandsSize += int64(cmdSize) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) + } +} diff --git a/db/ratelimiter.go b/db/ratelimiter.go deleted file mode 100644 index bb1b6381..00000000 --- a/db/ratelimiter.go +++ /dev/null @@ -1,134 +0,0 @@ -package db - -import ( - "sync" - "time" -) - -const ( - ALL_NAMESPACE = "*" -) - -type CommandLimiter struct { - lock sync.Mutex - startTime time.Time - commandsCount int - commandsSize int - maxCommandsCount int - maxCommandsSize int - period int - periodsNumToTriggerBalance int - totalCommandsCount int - totalCommandsSize int -} - -type NamespaceLimiter struct { - lock sync.Mutex - commandLimiter map[string]*CommandLimiter -} - -type LimitRule struct { - //namespace string - //command string - maxqps int - maxrate int -} -type Limiter struct { - lock sync.Mutex - - limitPeriod int - balancePeriod int - currentIp string - - limitRules map[string]map[string]LimitRule - limiter map[string]NamespaceLimiter -} - -func NewLimitRule(rule string) map[string]map[string]LimitRule { - return nil -} - -func NewLimiter(limitPeriod int, balancePeriod int, rule string) *Limiter { - l := &Limiter{ - limitPeriod: limitPeriod, - balancePeriod: balancePeriod, - currentIp: "127.0.0.1", //set current ip - limitRules: 
NewLimitRule(rule), - limiter: make(map[string]NamespaceLimiter), - } - - return l -} - -func (l *Limiter) addAllNamespaceLimiter(rule string) { - -} - -//func (l *Limiter) addNamespaceLimiter(namespace string, command string, maxQps int, maxRate int) { -// -//} - -func (l *Limiter) get(namespace string, command string) *CommandLimiter { - return nil -} - -func (l *Limiter) setCommandLimiter(namespace string, command string, commandLimiter *CommandLimiter) *CommandLimiter { - return nil -} - -func (l *Limiter) checkLimit(namespace string, command string, argSize int) { - commandLimiter := l.get(namespace, command) - if commandLimiter == nil { - //get rule from limitRule - limitRule, ok := l.limitRules[namespace][command] - if ok { - tmp := NewCommandLimiter(limitRule) - commandLimiter = l.setCommandLimiter(namespace, command, tmp) - } else { - limitRule, ok = l.limitRules[ALL_NAMESPACE][command] - if ok { - tmp := NewCommandLimiter(limitRule) - commandLimiter = l.setCommandLimiter(namespace, command, tmp) - } - } - } - if commandLimiter != nil { - commandLimiter.checkLimit(len(command), argSize) - } -} - -func (l *Limiter) watchConfigChanged() { - -} - -func (l *Limiter) clearLimitPeriod() { - -} - -func (l *Limiter) balanceLimitInNodes() { - -} - -//func NewNamespaceLimiter() *NamespaceLimiter { -// -//} - -func (nl *NamespaceLimiter) getOrSetCommandLimiter(command string, maxQps int, maxRate int) *CommandLimiter { - return nil -} - -//func (nl *NamespaceLimiter) checkLimit (command string, argSize int) { -// -//} - -func NewCommandLimiter(limitRule LimitRule) *CommandLimiter { - return nil -} - -func (cl *CommandLimiter) checkLimit(commandSize int, argSize int) { - -} - -func (cl *CommandLimiter) reset() { - -} diff --git a/go.mod b/go.mod index a0880a92..989ae37d 100644 --- a/go.mod +++ b/go.mod @@ -55,7 +55,7 @@ require ( go.uber.org/multierr v1.1.0 // indirect go.uber.org/zap v1.9.1 golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect - golang.org/x/time v0.0.0-20181108054448-85acf8d2951c // indirect + golang.org/x/time v0.0.0-20181108054448-85acf8d2951c google.golang.org/genproto v0.0.0-20190108161440-ae2f86662275 // indirect google.golang.org/grpc v1.17.0 // indirect gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect diff --git a/metrics/prometheus.go b/metrics/prometheus.go index c1f0ab60..949e3991 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -48,6 +48,8 @@ type Metrics struct { ZTInfoCounterVec *prometheus.CounterVec IsLeaderGaugeVec *prometheus.GaugeVec ExpireLeftSecondsVec *prometheus.GaugeVec + LimiterQpsVec *prometheus.GaugeVec + LimiterRateVec *prometheus.GaugeVec LRangeSeekHistogram prometheus.Histogram GCKeysCounterVec *prometheus.CounterVec @@ -59,6 +61,7 @@ type Metrics struct { //command biz CommandCallHistogramVec *prometheus.HistogramVec + LimitCostHistogramVec *prometheus.HistogramVec TxnBeginHistogramVec *prometheus.HistogramVec CommandFuncDoneHistogramVec *prometheus.HistogramVec TxnCommitHistogramVec *prometheus.HistogramVec @@ -111,6 +114,15 @@ func init() { }, multiLabel) prometheus.MustRegister(gm.CommandArgsNumHistogramVec) + gm.LimitCostHistogramVec = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: namespace, + Name: "limit_cost_seconds", + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 10), + Help: "the cost times of command execute's limit", + }, multiLabel) + prometheus.MustRegister(gm.LimitCostHistogramVec) + gm.TxnBeginHistogramVec = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: 
namespace, @@ -180,6 +192,22 @@ func init() { }, expireLabel) prometheus.MustRegister(gm.ExpireLeftSecondsVec) + gm.LimiterQpsVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "qps_limiter_status", + Help: "the qps of a namespace's command in a titan server", + }, multiLabel) + prometheus.MustRegister(gm.LimiterQpsVec) + + gm.LimiterRateVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rate_limiter_status", + Help: "the rate of a namespace's command in a titan server(KB/s)", + }, multiLabel) + prometheus.MustRegister(gm.LimiterRateVec) + gm.LRangeSeekHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index ed28b560..a1cf5138 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -1,13 +1,16 @@ package autotest import ( + "fmt" + "github.com/distributedio/titan/tools/autotest/cmd" + "github.com/gomodule/redigo/redis" + "github.com/stretchr/testify/assert" + "math" "strconv" + "strings" + "sync" "testing" "time" - - "github.com/distributedio/titan/tools/autotest/cmd" - - "github.com/gomodule/redigo/redis" ) //AutoClient check redis comman @@ -19,8 +22,10 @@ type AutoClient struct { *cmd.ExampleSystem em *cmd.ExampleMulti // addr string - pool *redis.Pool - conn redis.Conn + pool *redis.Pool + conn redis.Conn + conn2 redis.Conn + limitConn redis.Conn } //NewAutoClient creat auto client @@ -40,6 +45,27 @@ func (ac *AutoClient) Start(addr string) { panic(err) } ac.conn = conn + + conn2, err := redis.Dial("tcp", addr) + if err != nil { + panic(err) + } + _, err = redis.String(conn2.Do("auth", "test-1542098935-1-7ca41bda4efc2a1889c04e")) + if err != nil { + panic(err) + } + ac.conn2 = conn2 + + limitConn, err := redis.Dial("tcp", addr) + if err != nil { + panic(err) + } + _, err = redis.String(limitConn.Do("auth", "sys_ratelimit-1574130304-1-36c153b109ebca80b43769")) + if err != nil { + panic(err) + } + ac.limitConn = limitConn + ac.es = cmd.NewExampleString(conn) ac.ek = cmd.NewExampleKey(conn) ac.el = cmd.NewExampleList(conn) @@ -54,6 +80,21 @@ func (ac *AutoClient) Close() { ac.conn.Close() } +func (ac *AutoClient) setLimit(t *testing.T, key string, value string) { + reply, err := redis.String(ac.limitConn.Do("SET", key, value)) + assert.Equal(t, "OK", reply) + assert.NoError(t, err) + data, err := redis.Bytes(ac.limitConn.Do("GET", key)) + assert.Equal(t, value, string(data)) + assert.NoError(t, err) +} + +func (ac *AutoClient) delLimit(t *testing.T, expectReply int, key string) { + reply, err := redis.Int(ac.limitConn.Do("DEL", key)) + assert.Equal(t, expectReply, reply) + assert.NoError(t, err) +} + //StringCase check string case func (ac *AutoClient) StringCase(t *testing.T) { ac.es.SetNxEqual(t, "key-setx", "v1") @@ -275,3 +316,115 @@ func (ac *AutoClient) MultiCase(t *testing.T) { // exec ac.em.ExecEqual(t) } + +func (ac *AutoClient) LimitCase(t *testing.T) { + ac.es.SetEqual(t, "key1", "1") + //first command invoke won't be limited + times := []int{100, 101} + conns := []redis.Conn{ac.conn, ac.conn2} + + cost := ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, cost <= 0.1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "qps:*@get", "100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.setLimit(t, 
"qps:test@get", "200") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.1) + + ac.delLimit(t, 1, "qps:test@get") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.setLimit(t, "qps:*@get", "100a") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "qps:*@mget", "100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.delLimit(t, 1, "qps:*@mget") + ac.setLimit(t, "rate:*@mget", "1k") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1s 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "kk 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1k 2a") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1k 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "28k 100") + time.Sleep(time.Second * 1) + times = []int{1024, 1025} + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + + ac.setLimit(t, "rate:*@mget", "0.028M 100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + + ac.delLimit(t, 1, "rate:*@mget") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.3) + + ac.ek.DelEqual(t, 1, "key1") +} + +func (ac *AutoClient) runCmdInGoRoutines(t *testing.T, cmd string, key string, times []int, conns []redis.Conn) float64 { + gonum := len(times) + if gonum != len(conns) { + return 0 + } + + var wg sync.WaitGroup + wg.Add(gonum) + now := time.Now() + for i := 0; i < gonum; i++ { + go func(times int, conn redis.Conn, wg *sync.WaitGroup) { + cmd := strings.ToLower(cmd) + for j := 0; j < times; j++ { + _, err := conn.Do(cmd, key) + if err != nil { + fmt.Printf("cmd=%s, key=%s, err=%s\n", cmd, key, err) + break + } + } + wg.Done() + }(times[i], conns[i], &wg) + } + wg.Wait() + return time.Since(now).Seconds() +} diff --git a/tools/autotest/auto_test.go b/tools/autotest/auto_test.go index caefbd1d..3ffe7c8e 100644 --- a/tools/autotest/auto_test.go +++ b/tools/autotest/auto_test.go @@ -6,6 +6,7 @@ import ( func Test(t *testing.T) { at.SystemCase(t) + at.LimitCase(t) at.ZSetCase(t) at.StringCase(t) at.KeyCase(t) diff --git a/tools/integration/titan.go b/tools/integration/titan.go index 9bc36c87..9d400e2d 100644 --- a/tools/integration/titan.go +++ b/tools/integration/titan.go @@ -48,10 +48,15 @@ func Start() { 
log.Fatalln(err) } + limitersMgr, err := db.NewLimitersMgr(store, tikvConf.RateLimit) + if err != nil { + log.Fatalln(err) + } svr = titan.New(&context.ServerContext{ - RequirePass: cfg.Auth, - Store: store, + RequirePass: cfg.Auth, + Store: store, ListZipThreshold: 100, + LimitersMgr: limitersMgr, }) err = svr.ListenAndServe(cfg.Listen) if err != nil { diff --git a/tools/test/main.go b/tools/test/main.go index e7222570..ae84dd86 100644 --- a/tools/test/main.go +++ b/tools/test/main.go @@ -28,6 +28,7 @@ func main() { client.StringCase(t) client.ListCase(t) client.KeyCase(t) + client.LimitCase(t) } client.Close() } From e5018e45bf2a8c73d0f36590173d946529ca29cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 12:51:39 +0800 Subject: [PATCH 12/60] qps also can be set burst, its limit also support k/K/m/M suffix --- db/limitersMgr.go | 100 ++++++++++++++++++++--------------------- tools/autotest/auto.go | 50 ++++++++++++--------- 2 files changed, 79 insertions(+), 71 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 8a5da922..3cad1fd8 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -23,7 +23,7 @@ const ( NAMESPACE_COMMAND_TOKEN = "@" QPS_PREFIX = "qps:" RATE_PREFIX = "rate:" - RATE_BURST_TOKEN = " " + LIMIT_BURST_TOKEN = " " TITAN_STATUS_TOKEN = "titan_status:" TIME_FORMAT = "2006-01-02 15:04:05" DEFAULT_BURST = 1 @@ -110,11 +110,11 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { return v.(*CommandLimiter) } - qpsLimit, _ := l.getLimit(limiterName, true) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) - if qpsLimit > 0 || + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -158,43 +158,39 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { continue } - if isQps { - if limit, err = strconv.ParseFloat(string(val), 64); err != nil { - zap.L().Error("[Limit] qps limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } + limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + if len(limitStrs) < 2 { + zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + limitStr := limitStrs[0] + burstStr := limitStrs[1] + if len(limitStr) < 1 { + zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + var strUnit uint8 + var unit float64 + strUnit = limitStr[len(limitStr)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + limitStr = limitStr[:len(limitStr)-1] + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + limitStr = limitStr[:len(limitStr)-1] } else { - rateLimitVals := strings.Split(string(val), RATE_BURST_TOKEN) - if len(rateLimitVals) < 2 { - zap.L().Error("[Limit] rate limit hasn't enough parameters, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - rateLimit := rateLimitVals[0] - rateBurst := rateLimitVals[1] - if len(rateLimit) < 2 { - zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", 
zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - var strUnit uint8 - var unit float64 - strUnit = rateLimit[len(rateLimit)-1] - if strUnit == 'k' || strUnit == 'K' { - unit = 1024 - } else if strUnit == 'm' || strUnit == 'M' { - unit = 1024 * 1024 - } else { - zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - if limit, err = strconv.ParseFloat(rateLimit[:len(rateLimit)-1], 64); err != nil { - zap.L().Error("[Limit] rate limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } - limit *= unit - if burst, err = strconv.ParseInt(rateBurst, 10, 64); err != nil { - zap.L().Error("[Limit] rate burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } + unit = 1 + //zap.L().Error("[Limit] limit is invaild, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + //continue + } + if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { + zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + limit *= unit + if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue } zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) @@ -321,15 +317,15 @@ func (l *LimitersMgr) startSyncNewLimit() { l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) - qpsLimit, _ := l.getLimit(limiterName, true) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) - if qpsLimit > 0 || + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) l.limiters.Store(limiterName, newCl) } else { - commandLimiter.updateLimit(qpsLimit, rateLimit, rateBurst) + commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) } } else { if commandLimiter != nil { @@ -341,10 +337,10 @@ func (l *LimitersMgr) startSyncNewLimit() { } } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rateLimit float64, rateBurst int) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { var qpsl, ratel *rate.Limiter - if qpsLimit > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit), DEFAULT_BURST) + if qpsLimit > 0 && qpsBurst > 0 { + qpsl = rate.NewLimiter(rate.Limit(qpsLimit), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) @@ -360,17 +356,19 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rat return cl } -func (cl *CommandLimiter) updateLimit(qpsLimit float64, rateLimit float64, rateBurst int) { +func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) { cl.lock.Lock() defer cl.lock.Unlock() - if qpsLimit > 0 { + if qpsLimit > 0 && qpsBurst > 0 { if cl.qpsl != 
nil { - if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { + if cl.qpsl.Burst() != qpsBurst { + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) + } else if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) } } else { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), DEFAULT_BURST) + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) } } else { cl.qpsl = nil diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index a1cf5138..8d546106 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -324,81 +324,91 @@ func (ac *AutoClient) LimitCase(t *testing.T) { conns := []redis.Conn{ac.conn, ac.conn2} cost := ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "qps:*@get", "100") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "qps:test@get", "200") + ac.setLimit(t, "qps:*@get", "k 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.1) + assert.Equal(t, true, cost <= 0.2) + + ac.setLimit(t, "qps:*@get", "100 1") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) + + ac.setLimit(t, "qps:test@get", "0.2k 20") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.2) ac.delLimit(t, 1, "qps:test@get") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) - ac.setLimit(t, "qps:*@get", "100a") + ac.setLimit(t, "qps:*@get", "100a 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "qps:*@mget", "100") + ac.setLimit(t, "qps:*@mget", "100 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) ac.delLimit(t, 1, "qps:*@mget") ac.setLimit(t, "rate:*@mget", "1k") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1s 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "kk 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1k 2a") time.Sleep(time.Second * 1) cost = 
ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1k 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "rate:*@mget", "28k 100") + ac.setLimit(t, "rate:*@mget", "0.028m 100") time.Sleep(time.Second * 1) times = []int{1024, 1025} cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + assert.Equal(t, true, math.Abs(cost-1) <= 0.4) ac.setLimit(t, "rate:*@mget", "0.028M 100") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + assert.Equal(t, true, math.Abs(cost-1) <= 0.4) ac.delLimit(t, 1, "rate:*@mget") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.3) + assert.Equal(t, true, cost <= 0.4) ac.ek.DelEqual(t, 1, "key1") } From ecd78a27d24e0268a07e2b12bcf8c4081422acfa Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 17:52:52 +0800 Subject: [PATCH 13/60] 1 change log level--limit not set, to debug 2 add setlimit.sh --- db/limitersMgr.go | 6 ++-- setlimit.sh | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 setlimit.sh diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 3cad1fd8..90b877bb 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -154,7 +154,9 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } val, err := str.Get() if err != nil { - zap.L().Info("[Limit] limit isn't set", zap.String("key", limiterKey), zap.Error(err)) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) + } continue } @@ -180,8 +182,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { limitStr = limitStr[:len(limitStr)-1] } else { unit = 1 - //zap.L().Error("[Limit] limit is invaild, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - //continue } if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) diff --git a/setlimit.sh b/setlimit.sh new file mode 100644 index 00000000..742d34d8 --- /dev/null +++ b/setlimit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +host=tkv7369.test.zhuaninc.com +port=7369 +rediscli="./redis-cli" +token="sys_ratelimit-1574130304-1-36c153b109ebca80b43769" +usage_exit() +{ + echo "usage:" + echo "$0 set qps=(1|0) cmd= namespace= limit=[k/K/m/M] burst=" + echo "or" + echo "$0 del qps=(1|0) cmd= namespace=" + echo ": all means matching all namespaces" + exit 1 +} + +if [ $# -lt 1 ]; then + usage_exit +fi +op=$1 +if [ "$op" != "set" -a "$op" != "del" ]; then + usage_exit +fi + +limitname= +cmd= +namespace= +limit= +burst= + +for arg in $* +do + if [ "$arg" = "$op" ]; then + continue + fi + + optname=`echo $arg | sed 's/=.*//'` + optvalue=`echo $arg | sed 's/.*=//'` + if [ -z $optname -o -z $optvalue ]; then + usage_exit + else + if [ "$optname" = "qps" ]; then + if [ $optvalue = 1 ]; then + limitname=qps + elif [ $optvalue = 0 ]; then + limitname=rate + else + usage_exit + fi + elif [ "$optname" = 
"cmd" ]; then + cmd=$optvalue + elif [ "$optname" = "namespace" ]; then + namespace=$optvalue + elif [ "$optname" = "limit" ]; then + limit=$optvalue + elif [ "$optname" = "burst" ]; then + burst=$optvalue + else + usage_exit + fi + fi +done + + +if [ -z "$limitname" -o -z "$cmd" -o -z "$namespace" ]; then + usage_exit +else + key= + if [ "$namespace" = "all" ]; then + key="${limitname}:*@${cmd}" + else + key="${limitname}:${namespace}@${cmd}" + fi + + if [ "$op" = "set" ]; then + if [ -z "$limit" -o -z "$burst" ]; then + usage_exit + else + $rediscli -h $host -p $port -a $token set $key "${limit} ${burst}" + fi + elif [ "$op" = "del" ]; then + $rediscli -h $host -p $port -a $token del $key + else + usage_exit + fi +fi From 0648ab89eb2423e8dabd0b61a07ea60c484b5fb6 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 18:16:07 +0800 Subject: [PATCH 14/60] add limit default config items and fix error in config.go --- conf/config.go | 2 +- conf/titan.toml | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/conf/config.go b/conf/config.go index 8228c40d..347dbb70 100644 --- a/conf/config.go +++ b/conf/config.go @@ -104,7 +104,7 @@ type Status struct { } type RateLimit struct { - InterfaceName string `cfg:interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit` + InterfaceName string `cfg:"interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit"` GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` diff --git a/conf/titan.toml b/conf/titan.toml index be46308d..c938f58d 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -182,6 +182,26 @@ pd-addrs = "mocktikv://" #default: 2 #concurrency = 2 +[tikv.rate-limit] +#type: string +#default: eth0 +#description: the interface name to get ip and write local titan status to tikv for balancing rate limit +interface-name = "eth0" + +#type: time.Duration +#description: the period in seconds to balance rate limiting with other titan nodes +#default: 15s +#global-balance-period = 15s + +#type: time.Duration +#description: how long if a titan didn't update its status, we consider it dead +#default: 1m +#titanstatus-life-time = 1m + +#type: time.Duration +#description: the period in seconds to sync new limit set in tikv +#default: 3s +#sync-set-period = 3s [tikv-logger] From f86245a215e6d3fa8f73366be846383318de54fb Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 21:21:42 +0800 Subject: [PATCH 15/60] in startSyncNewLimit, just read all match limit once for every command's qps/rate --- db/limitersMgr.go | 217 ++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 77 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 90b877bb..3c8bcd49 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -26,7 +26,6 @@ const ( LIMIT_BURST_TOKEN = " " TITAN_STATUS_TOKEN = "titan_status:" TIME_FORMAT = "2006-01-02 15:04:05" - DEFAULT_BURST = 1 ) type CommandLimiter struct { @@ -43,6 +42,11 @@ type CommandLimiter struct { totalCommandsSize int64 } +type LimitData struct { + limit float64 + burst int +} + type LimitersMgr 
struct { limitDatadb *DB globalBalancePeriod time.Duration @@ -51,8 +55,18 @@ type LimitersMgr struct { localIp string localPercent float64 - limiters sync.Map - lock sync.Mutex + limiters sync.Map + qpsAllmatchLimit sync.Map + rateAllmatchLimit sync.Map + lock sync.Mutex +} + +func getAllmatchLimiterName(limiterName string) string { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + return "" + } + return fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) } func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { @@ -110,6 +124,10 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { return v.(*CommandLimiter) } + allmatchLimiterName := getAllmatchLimiterName(limiterName) + l.qpsAllmatchLimit.LoadOrStore(allmatchLimiterName, (*LimitData)(nil)) + l.rateAllmatchLimit.LoadOrStore(allmatchLimiterName, (*LimitData)(nil)) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || @@ -124,13 +142,6 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { } func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { - strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) - if len(strs) < 2 { - zap.L().Error("[Limit] wrong format limiter name", zap.String("limiterName", limiterName)) - return 0, 0 - } - allMatchLimiter := fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) - limiterNames := []string{limiterName, allMatchLimiter} limit := float64(0) burst := int64(0) @@ -139,68 +150,70 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { zap.L().Error("[Limit] transection begin failed", zap.String("limiterName", limiterName), zap.Bool("isQps", isQps), zap.Error(err)) return 0, 0 } - for i := range limiterNames { - var limiterKey string - if isQps { - limiterKey = QPS_PREFIX + limiterNames[i] - } else { - limiterKey = RATE_PREFIX + limiterNames[i] + defer func() { + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) + txn.t.Rollback() } + }() - str, err := txn.String([]byte(limiterKey)) - if err != nil { - zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) - return 0, 0 - } - val, err := str.Get() - if err != nil { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) - } - continue - } + var limiterKey string + if isQps { + limiterKey = QPS_PREFIX + limiterName + } else { + limiterKey = RATE_PREFIX + limiterName + } - limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) - if len(limitStrs) < 2 { - zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - limitStr := limitStrs[0] - burstStr := limitStrs[1] - if len(limitStr) < 1 { - zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - var strUnit uint8 - var unit float64 - strUnit = limitStr[len(limitStr)-1] - if strUnit == 'k' || strUnit == 'K' { - unit = 1024 - limitStr = limitStr[:len(limitStr)-1] - } else if strUnit == 'm' || strUnit == 'M' { - unit = 1024 * 1024 - limitStr = limitStr[:len(limitStr)-1] - } else { - 
unit = 1 - } - if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { - zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } - limit *= unit - if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { - zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue + str, err := txn.String([]byte(limiterKey)) + if err != nil { + zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) + return 0, 0 + } + val, err := str.Get() + if err != nil { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) } + return 0, 0 + } - zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) - break + limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + if len(limitStrs) < 2 { + zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + return 0, 0 } - if err := txn.t.Commit(context.Background()); err != nil { - zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) - txn.t.Rollback() + limitStr := limitStrs[0] + burstStr := limitStrs[1] + if len(limitStr) < 1 { + zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) return 0, 0 } + var strUnit uint8 + var unit float64 + strUnit = limitStr[len(limitStr)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + limitStr = limitStr[:len(limitStr)-1] + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + limitStr = limitStr[:len(limitStr)-1] + } else { + unit = 1 + } + if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { + zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + return 0, 0 + } + limit *= unit + if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + return 0, 0 + } + + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] got limit"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + } + return limit, int(burst) } @@ -314,27 +327,77 @@ func (l *LimitersMgr) startSyncNewLimit() { ticker := time.NewTicker(l.syncSetPeriod) defer ticker.Stop() for range ticker.C { - l.limiters.Range(func(k, v interface{}) bool { + l.runSyncNewLimit() + } +} + +func (l *LimitersMgr) runSyncNewLimit() { + allmatchLimits := []*sync.Map{&l.qpsAllmatchLimit, &l.rateAllmatchLimit} + for i, allmatchLimit := range allmatchLimits { + allmatchLimit.Range(func(k, v interface{}) bool { limiterName := k.(string) - commandLimiter := v.(*CommandLimiter) - qpsLimit, qpsBurst := l.getLimit(limiterName, true) - rateLimit, rateBurst := l.getLimit(limiterName, false) - if (qpsLimit > 0 && qpsBurst > 0) || - (rateLimit > 0 && rateBurst > 0) { - if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) - l.limiters.Store(limiterName, newCl) + limitData := v.(*LimitData) + isQps := false + if i 
== 0 { + isQps = true + } + limit, burst := l.getLimit(limiterName, isQps) + if limit > 0 && burst > 0 { + if limitData == nil { + limitData = &LimitData{limit, burst} + allmatchLimit.Store(limiterName, limitData) } else { - commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) + limitData.limit = limit + limitData.burst = burst } } else { - if commandLimiter != nil { - l.limiters.Store(limiterName, (*CommandLimiter)(nil)) - } + allmatchLimit.Store(limiterName, (*LimitData)(nil)) } return true }) } + + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + allmatchLimiterName := getAllmatchLimiterName(limiterName) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) + if !(qpsLimit > 0 && qpsBurst > 0) { + v, ok := l.qpsAllmatchLimit.Load(allmatchLimiterName) + if ok { + limitData := v.(*LimitData) + if limitData != nil { + qpsLimit = limitData.limit + qpsBurst = limitData.burst + } + } + } + rateLimit, rateBurst := l.getLimit(limiterName, false) + if !(rateLimit > 0 && rateBurst > 0) { + v, ok := l.rateAllmatchLimit.Load(allmatchLimiterName) + if ok { + limitData := v.(*LimitData) + if limitData != nil { + rateLimit = limitData.limit + rateBurst = limitData.burst + } + } + } + if (qpsLimit > 0 && qpsBurst > 0) || + (rateLimit > 0 && rateBurst > 0) { + if commandLimiter == nil { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + l.limiters.Store(limiterName, newCl) + } else { + commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) + } + } else { + if commandLimiter != nil { + l.limiters.Store(limiterName, (*CommandLimiter)(nil)) + } + } + return true + }) } func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { From 157d70a45af13460e32ebb24226a4e45a5b2e052 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 11:15:26 +0800 Subject: [PATCH 16/60] change got limit log trace --- db/limitersMgr.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 3c8bcd49..565eb9b8 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -171,9 +171,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } val, err := str.Get() if err != nil { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) - } return 0, 0 } @@ -383,8 +380,13 @@ func (l *LimitersMgr) runSyncNewLimit() { } } } + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { + logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) l.limiters.Store(limiterName, newCl) From fd0d8cc171c8eb18b4f78ce6854f9a0788b5aeb1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 12:02:25 +0800 Subject: [PATCH 17/60] add limit cleared log trace, just log limit is trigger when delay > 0 --- db/limitersMgr.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 565eb9b8..9f8802c6 100644 
--- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -395,6 +395,10 @@ func (l *LimitersMgr) runSyncNewLimit() { } } else { if commandLimiter != nil { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is cleared"); logEnv != nil { + logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + } l.limiters.Store(limiterName, (*CommandLimiter)(nil)) } } @@ -496,10 +500,12 @@ func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) } else { d := r.Delay() - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + if d > 0 { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) } - time.Sleep(d) } } @@ -513,10 +519,12 @@ func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) } else { d := r.Delay() - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + if d > 0 { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) } - time.Sleep(d) } } From 756a0e60c5dd34f37d3e0413d952544f918e92cb Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 17:03:54 +0800 Subject: [PATCH 18/60] change limit/commandFunc cost seconds factor from 2 to 1.4 --- metrics/prometheus.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 949e3991..7215f37c 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -118,7 +118,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "limit_cost_seconds", - Buckets: prometheus.ExponentialBuckets(0.0001, 2, 10), + Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), Help: "the cost times of command execute's limit", }, multiLabel) prometheus.MustRegister(gm.LimitCostHistogramVec) @@ -136,7 +136,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 2, 10), + Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) From 46bb6daf407561117ce8ad8e150d0a41db7dee96 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 18:19:36 +0800 Subject: [PATCH 19/60] fix titan active time decoding bug --- db/limitersMgr.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9f8802c6..a56b8ff3 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -275,19 +275,21 @@ func (l *LimitersMgr) 
runBalanceLimit() { continue } ip := key[prefixLen:] - if string(ip) == l.localIp { + obj := NewString(txn, key) + if err = obj.decode(iter.Value()); err != nil { + zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) continue } - lastActive := iter.Value() + lastActive := obj.Meta.Value lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) if err != nil { - zap.L().Error("[Limit] decode time string failed", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Error(err)) + zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) continue } zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) - if time.Since(lastActiveT) <= l.titanStatusLifeTime { + if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { activeNum++ } } From a839995a03e5579c61b4d03c7d64816799378163 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 18:38:40 +0800 Subject: [PATCH 20/60] fix limit local percent bug --- db/limitersMgr.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index a56b8ff3..e10408c3 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -260,7 +260,7 @@ func (l *LimitersMgr) runBalanceLimit() { } defer iter.Close() - activeNum := 1 + activeNum := float64(1) prefixLen := len(prefix) for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { if err != nil { @@ -293,9 +293,9 @@ func (l *LimitersMgr) runBalanceLimit() { activeNum++ } } - newPercent := float64(1.0 / activeNum) + newPercent := 1 / activeNum if l.localPercent != newPercent { - zap.L().Info("[Limit] balance limit in all titan server", zap.Int("active server num", activeNum), + zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) l.limiters.Range(func(k, v interface{}) bool { commandLimiter := v.(*CommandLimiter) From 815efa4c4744f6fbdbf85ee84ec48adae722f511 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 19:57:31 +0800 Subject: [PATCH 21/60] fix titan active time parsing bug --- db/limitersMgr.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index e10408c3..d1a0ab8b 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -282,13 +282,14 @@ func (l *LimitersMgr) runBalanceLimit() { } lastActive := obj.Meta.Value - lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) + lastActiveT, err := time.ParseInLocation(TIME_FORMAT, string(lastActive), time.Local) if err != nil { zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) continue } - zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) + diff := time.Since(lastActiveT).Seconds() + zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Float64("activePast", diff)) if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { activeNum++ } From 961257161fcbb943fe97741fb491fa6c4abdc79a Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 20:27:29 +0800 Subject: [PATCH 22/60] add updateLimitPercent log --- 
db/limitersMgr.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index d1a0ab8b..56b95867 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -482,12 +482,14 @@ func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { - qpsLimit := float64(cl.qpsl.Limit()) / cl.localPercent - cl.qpsl.SetLimit(rate.Limit(qpsLimit * newPercent)) + qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent + zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) + cl.qpsl.SetLimit(rate.Limit(qpsLimit)) } if cl.ratel != nil { - rateLimit := float64(cl.ratel.Limit()) / cl.localPercent - cl.ratel.SetLimit(rate.Limit(rateLimit * newPercent)) + rateLimit := float64(cl.ratel.Limit()) / cl.localPercent * newPercent + zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) + cl.ratel.SetLimit(rate.Limit(rateLimit)) } cl.localPercent = newPercent } From cc4255a93ec9d24538c1cc0d7f1845b297f9c9ff Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 20:42:14 +0800 Subject: [PATCH 23/60] when create commandLimiter, also use localPercent to set limit --- db/limitersMgr.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 56b95867..74d8d8c5 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -132,7 +132,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -391,7 +391,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) From f00d8c5e13f5c5cd831c68fbfa836020b44936de Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 25 Nov 2019 20:33:26 +0800 Subject: [PATCH 24/60] fix limitersMgr localPercent using bug and lock before use it --- db/limitersMgr.go | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 74d8d8c5..ac4c58b1 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -132,7 +132,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.localPercent) v, _ := 
l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -295,9 +295,13 @@ func (l *LimitersMgr) runBalanceLimit() { } } newPercent := 1 / activeNum - if l.localPercent != newPercent { + var oldPercent float64 + l.lock.Lock() + oldPercent = l.localPercent + l.lock.Unlock() + if oldPercent != newPercent { zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), - zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) + zap.Float64("oldPercent", oldPercent), zap.Float64("newPercent", newPercent)) l.limiters.Range(func(k, v interface{}) bool { commandLimiter := v.(*CommandLimiter) if commandLimiter != nil { @@ -305,7 +309,10 @@ func (l *LimitersMgr) runBalanceLimit() { } return true }) + + l.lock.Lock() l.localPercent = newPercent + l.lock.Unlock() } key := []byte(TITAN_STATUS_TOKEN + l.localIp) @@ -357,6 +364,10 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } + var localPercent float64 + l.lock.Lock() + localPercent = l.localPercent + l.lock.Unlock() l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) @@ -391,7 +402,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, localPercent) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) @@ -409,20 +420,20 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, localPercent float64) *CommandLimiter { var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit), qpsBurst) + qpsl = rate.NewLimiter(rate.Limit(qpsLimit*localPercent), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) + ratel = rate.NewLimiter(rate.Limit(rateLimit*localPercent), rateBurst) } cl := &CommandLimiter{ limiterName: limiterName, localIp: localIp, qpsl: qpsl, ratel: ratel, - localPercent: 1, + localPercent: localPercent, lastTime: time.Now(), } return cl From da49bfae49ac6ca7836f964e5bc479852447e3d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 25 Nov 2019 21:31:33 +0800 Subject: [PATCH 25/60] reportLocalStat every globalBalancePeriod,even the commandLimiter is nil --- db/limitersMgr.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index ac4c58b1..9cdc0918 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -240,9 +240,24 @@ func (l *LimitersMgr) startBalanceLimit() { defer ticker.Stop() for range ticker.C { l.runBalanceLimit() + l.reportLocalStat() } } +func (l *LimitersMgr) reportLocalStat() { + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + if commandLimiter != nil { + commandLimiter.reportLocalStat() + } else { + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(l.localIp, limiterName).Set(0) + 
metrics.GetMetrics().LimiterRateVec.WithLabelValues(l.localIp, limiterName).Set(0) + } + return true + }) +} + func (l *LimitersMgr) runBalanceLimit() { txn, err := l.limitDatadb.Begin() if err != nil { @@ -470,6 +485,11 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } else { cl.ratel = nil } +} + +func (cl *CommandLimiter) reportLocalStat() { + cl.lock.Lock() + defer cl.lock.Unlock() var qpsLocal, rateLocal float64 seconds := time.Since(cl.lastTime).Seconds() From 30f7d2c7c61df7d794f79b1f88f9328218d35d6e Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 16:26:18 +0800 Subject: [PATCH 26/60] limit can also work on auth-disabled titan server --- conf/config.go | 1 + conf/titan.toml | 5 +++++ db/limitersMgr.go | 9 +++++---- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/conf/config.go b/conf/config.go index 347dbb70..e4a01d9c 100644 --- a/conf/config.go +++ b/conf/config.go @@ -105,6 +105,7 @@ type Status struct { type RateLimit struct { InterfaceName string `cfg:"interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit"` + LimiterNamespace string `cfg:"limiter-namespace; sys_ratelimit;; the namespace of getting limit/balance data"` GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` diff --git a/conf/titan.toml b/conf/titan.toml index c938f58d..b1128f0a 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -188,6 +188,11 @@ pd-addrs = "mocktikv://" #description: the interface name to get ip and write local titan status to tikv for balancing rate limit interface-name = "eth0" +#type: string +#default: sys_ratelimit +#the namespace of getting limit/balance data +#limiter-namespace = "sys_ratelimit" + #type: time.Duration #description: the period in seconds to balance rate limiting with other titan nodes #default: 15s diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9cdc0918..dc477ffc 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -100,8 +100,12 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, return nil, errors.New(rateLimit.InterfaceName + " adds is empty") } + if rateLimit.LimiterNamespace == "" { + return nil, errors.New("limiter-namespace is configured with empty") + } + l := &LimitersMgr{ - limitDatadb: store.DB(LIMITDATA_NAMESPACE, LIMITDATA_DBID), + limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), globalBalancePeriod: rateLimit.GlobalBalancePeriod, titanStatusLifeTime: rateLimit.TitanStatusLifetime, syncSetPeriod: rateLimit.SyncSetPeriod, @@ -215,9 +219,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []string) { - if namespace == LIMITDATA_NAMESPACE { - return - } limiterName := fmt.Sprintf("%s%s%s", namespace, NAMESPACE_COMMAND_TOKEN, cmdName) v, ok := l.limiters.Load(limiterName) var commandLimiter *CommandLimiter From bcbd5a72b3be8d89f78cc4bb44b7cb1bb3e29dc7 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 18:32:51 +0800 Subject: [PATCH 27/60] decrease the lock range in reportStat --- db/limitersMgr.go | 10 +++++----- 1 file changed, 
5 insertions(+), 5 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index dc477ffc..9cef2b71 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -489,10 +489,8 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } func (cl *CommandLimiter) reportLocalStat() { - cl.lock.Lock() - defer cl.lock.Unlock() - var qpsLocal, rateLocal float64 + cl.lock.Lock() seconds := time.Since(cl.lastTime).Seconds() if seconds >= 0 { qpsLocal = float64(cl.totalCommandsCount) / seconds @@ -501,11 +499,13 @@ func (cl *CommandLimiter) reportLocalStat() { qpsLocal = 0 rateLocal = 0 } - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) cl.totalCommandsCount = 0 cl.totalCommandsSize = 0 cl.lastTime = time.Now() + cl.lock.Unlock() + + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) } func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { From 1a5069558ed1354085a04d65362fe8db0a377270 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 21:50:03 +0800 Subject: [PATCH 28/60] reportLocalStat run in itself go routine --- conf/mockconfig.go | 1 + conf/titan.toml | 6 +++--- db/limitersMgr.go | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/conf/mockconfig.go b/conf/mockconfig.go index 9649b490..b21ed652 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -37,6 +37,7 @@ func MockConf() *Titan { SyncSetPeriod: 1 * time.Second, GlobalBalancePeriod: 15 * time.Second, TitanStatusLifetime: 30 * time.Second, + LimiterNamespace: "sys_ratelimit", }, }, } diff --git a/conf/titan.toml b/conf/titan.toml index b1128f0a..37e2311a 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -196,17 +196,17 @@ interface-name = "eth0" #type: time.Duration #description: the period in seconds to balance rate limiting with other titan nodes #default: 15s -#global-balance-period = 15s +#global-balance-period = "15s" #type: time.Duration #description: how long if a titan didn't update its status, we consider it dead #default: 1m -#titanstatus-life-time = 1m +#titanstatus-life-time = "1m" #type: time.Duration #description: the period in seconds to sync new limit set in tikv #default: 3s -#sync-set-period = 3s +#sync-set-period = "3s" [tikv-logger] diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9cef2b71..37b7c18e 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -115,6 +115,7 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, go l.startBalanceLimit() go l.startSyncNewLimit() + go l.startReportLocalStat() return l, nil } @@ -241,6 +242,13 @@ func (l *LimitersMgr) startBalanceLimit() { defer ticker.Stop() for range ticker.C { l.runBalanceLimit() + } +} + +func (l *LimitersMgr) startReportLocalStat() { + ticker := time.NewTicker(l.globalBalancePeriod) + defer ticker.Stop() + for range ticker.C { l.reportLocalStat() } } From a7753c4021984b93e59e03cf1eb6e3381d29c856 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 29 Nov 2019 22:45:27 +0800 Subject: [PATCH 29/60] 1 change how to balance 2 change qps/rate metrics 3 labels: namespace, command, localip --- bin/titan/main.go | 2 +- conf/config.go | 4 + conf/mockconfig.go | 10 +- db/limitersMgr.go | 382 +++++++++++++++++++++++-------------- metrics/prometheus.go | 6 +- 
tools/integration/titan.go | 2 +- 6 files changed, 256 insertions(+), 150 deletions(-) diff --git a/bin/titan/main.go b/bin/titan/main.go index 4616110b..8c3cff8c 100644 --- a/bin/titan/main.go +++ b/bin/titan/main.go @@ -68,7 +68,7 @@ func main() { } svr := metrics.NewServer(&config.Status) - limitersMgr, err := db.NewLimitersMgr(store, config.Tikv.RateLimit) + limitersMgr, err := db.NewLimitersMgr(store, &config.Tikv.RateLimit) if err != nil { zap.L().Fatal("create limitersMgr failed", zap.Error(err)) os.Exit(1) diff --git a/conf/config.go b/conf/config.go index e4a01d9c..e7b00e08 100644 --- a/conf/config.go +++ b/conf/config.go @@ -109,4 +109,8 @@ type RateLimit struct { GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` + UsageToDivide float64 `cfg:"usage-to-divide; 0.6;; if the qps/weighted limit < the percent, will divide change Factor to balance limit"` + UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit > the percent, will multiply change Factor to balance limit"` + WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the weight change unit when balance limit, is also the minimum weight"` + InitialPercent float64 `cfg:"initial-percent; 0.33;; the percent in a commandLimiter is created"` } diff --git a/conf/mockconfig.go b/conf/mockconfig.go index b21ed652..9a23a2bb 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -34,10 +34,14 @@ func MockConf() *Titan { Concurrency: 2, }, RateLimit: RateLimit{ - SyncSetPeriod: 1 * time.Second, - GlobalBalancePeriod: 15 * time.Second, - TitanStatusLifetime: 30 * time.Second, LimiterNamespace: "sys_ratelimit", + SyncSetPeriod: 1 * time.Second, + GlobalBalancePeriod: 3 * time.Second, + TitanStatusLifetime: 6 * time.Second, + UsageToDivide: 0.6, + UsageToMultiply: 0.9, + WeightChangeFactor: 1.5, + InitialPercent: 1, }, }, } diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 37b7c18e..dc5006a7 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -17,15 +17,17 @@ import ( ) const ( - LIMITDATA_NAMESPACE = "sys_ratelimit" - LIMITDATA_DBID = 0 - ALL_NAMESPACE = "*" - NAMESPACE_COMMAND_TOKEN = "@" - QPS_PREFIX = "qps:" - RATE_PREFIX = "rate:" - LIMIT_BURST_TOKEN = " " - TITAN_STATUS_TOKEN = "titan_status:" - TIME_FORMAT = "2006-01-02 15:04:05" + LIMITDATA_DBID = 0 + ALL_NAMESPACE = "*" + NAMESPACE_COMMAND_TOKEN = "@" + QPS_PREFIX = "qps:" + RATE_PREFIX = "rate:" + LIMIT_VALUE_TOKEN = " " + LIMITER_STATUS_PREFIX = "limiter_status:" + LIMITER_STATUS_VALUE_TOKEN = "," + TIME_FORMAT = "2006-01-02 15:04:05" + MAXIMUM_WEIGHT = 1 + MINIMUM_WEIGHT = 0.1 ) type CommandLimiter struct { @@ -35,6 +37,8 @@ type CommandLimiter struct { qpsl *rate.Limiter ratel *rate.Limiter localPercent float64 + weight float64 + skipBalance bool lock sync.Mutex lastTime time.Time @@ -48,12 +52,9 @@ type LimitData struct { } type LimitersMgr struct { - limitDatadb *DB - globalBalancePeriod time.Duration - titanStatusLifeTime time.Duration - syncSetPeriod time.Duration - localIp string - localPercent float64 + limitDatadb *DB + conf *conf.RateLimit + localIp string limiters sync.Map qpsAllmatchLimit sync.Map @@ -69,7 +70,24 @@ func getAllmatchLimiterName(limiterName string) string { return 
fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) } -func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { +func getLimiterKey(limiterName string) []byte { + var key []byte + key = append(key, []byte(LIMITER_STATUS_PREFIX)...) + key = append(key, []byte(limiterName)...) + key = append(key, ':') + return key +} + +func getNamespaceAndCmd(limiterName string) []string { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + return nil + } + return strs + +} + +func NewLimitersMgr(store *RedisStore, rateLimit *conf.RateLimit) (*LimitersMgr, error) { var addrs []net.Addr var err error if rateLimit.InterfaceName != "" { @@ -103,19 +121,24 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, if rateLimit.LimiterNamespace == "" { return nil, errors.New("limiter-namespace is configured with empty") } + if rateLimit.WeightChangeFactor <= 1 { + return nil, errors.New("weight-change-factor should > 1") + } + if !(rateLimit.UsageToDivide > 0 && rateLimit.UsageToDivide < rateLimit.UsageToMultiply && rateLimit.UsageToMultiply < 1) { + return nil, errors.New("should config 0 < usage-to-divide < usage-to-multiply < 1") + } + if rateLimit.InitialPercent > 1 || rateLimit.InitialPercent <= 0 { + return nil, errors.New("initial-percent should in (0, 1]") + } l := &LimitersMgr{ - limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), - globalBalancePeriod: rateLimit.GlobalBalancePeriod, - titanStatusLifeTime: rateLimit.TitanStatusLifetime, - syncSetPeriod: rateLimit.SyncSetPeriod, - localIp: localIp, - localPercent: 1, + limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), + conf: rateLimit, + localIp: localIp, } - go l.startBalanceLimit() go l.startSyncNewLimit() - go l.startReportLocalStat() + go l.startReportAndBalance() return l, nil } @@ -137,7 +160,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.localPercent) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -179,7 +202,7 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { return 0, 0 } - limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + limitStrs := strings.Split(string(val), LIMIT_VALUE_TOKEN) if len(limitStrs) < 2 { zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) return 0, 0 @@ -237,125 +260,33 @@ func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []str } } -func (l *LimitersMgr) startBalanceLimit() { - ticker := time.NewTicker(l.globalBalancePeriod) +func (l *LimitersMgr) startReportAndBalance() { + ticker := time.NewTicker(l.conf.GlobalBalancePeriod) defer ticker.Stop() for range ticker.C { - l.runBalanceLimit() + l.runReportAndBalance() } } -func (l *LimitersMgr) startReportLocalStat() { - ticker := time.NewTicker(l.globalBalancePeriod) - defer ticker.Stop() - for range ticker.C { - l.reportLocalStat() - } -} - -func (l *LimitersMgr) reportLocalStat() { +func (l *LimitersMgr) runReportAndBalance() { l.limiters.Range(func(k, v interface{}) bool { limiterName := 
k.(string) commandLimiter := v.(*CommandLimiter) if commandLimiter != nil { - commandLimiter.reportLocalStat() + averageQps := commandLimiter.reportLocalStat(l.conf.GlobalBalancePeriod) + commandLimiter.balanceLimit(averageQps, l.limitDatadb, l.conf.TitanStatusLifetime, l.conf.UsageToDivide, l.conf.UsageToMultiply, l.conf.WeightChangeFactor) + } else { - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(l.localIp, limiterName).Set(0) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(l.localIp, limiterName).Set(0) + namespaceAndCmd := getNamespaceAndCmd(limiterName) + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(namespaceAndCmd[0], namespaceAndCmd[1], l.localIp).Set(0) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(namespaceAndCmd[0], namespaceAndCmd[1], l.localIp).Set(0) } return true }) } -func (l *LimitersMgr) runBalanceLimit() { - txn, err := l.limitDatadb.Begin() - if err != nil { - zap.L().Error("[Limit] transection begin failed", zap.String("titan", l.localIp), zap.Error(err)) - return - } - - prefix := MetaKey(l.limitDatadb, []byte(TITAN_STATUS_TOKEN)) - endPrefix := sdk_kv.Key(prefix).PrefixNext() - iter, err := txn.t.Iter(prefix, endPrefix) - if err != nil { - zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) - txn.Rollback() - return - } - defer iter.Close() - - activeNum := float64(1) - prefixLen := len(prefix) - for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { - if err != nil { - zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) - txn.Rollback() - return - } - - key := iter.Key() - if len(key) <= prefixLen { - zap.L().Error("ip is null", zap.ByteString("key", key)) - continue - } - ip := key[prefixLen:] - obj := NewString(txn, key) - if err = obj.decode(iter.Value()); err != nil { - zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) - continue - } - - lastActive := obj.Meta.Value - lastActiveT, err := time.ParseInLocation(TIME_FORMAT, string(lastActive), time.Local) - if err != nil { - zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) - continue - } - - diff := time.Since(lastActiveT).Seconds() - zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Float64("activePast", diff)) - if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { - activeNum++ - } - } - newPercent := 1 / activeNum - var oldPercent float64 - l.lock.Lock() - oldPercent = l.localPercent - l.lock.Unlock() - if oldPercent != newPercent { - zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), - zap.Float64("oldPercent", oldPercent), zap.Float64("newPercent", newPercent)) - l.limiters.Range(func(k, v interface{}) bool { - commandLimiter := v.(*CommandLimiter) - if commandLimiter != nil { - commandLimiter.updateLimitPercent(newPercent) - } - return true - }) - - l.lock.Lock() - l.localPercent = newPercent - l.lock.Unlock() - } - - key := []byte(TITAN_STATUS_TOKEN + l.localIp) - s := NewString(txn, key) - now := time.Now() - value := now.Format(TIME_FORMAT) - if err := s.Set([]byte(value), 0); err != nil { - txn.Rollback() - return - } - if err := txn.t.Commit(context.Background()); err != nil { - zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", l.localIp)) - txn.Rollback() - return - } -} - func (l 
*LimitersMgr) startSyncNewLimit() { - ticker := time.NewTicker(l.syncSetPeriod) + ticker := time.NewTicker(l.conf.SyncSetPeriod) defer ticker.Stop() for range ticker.C { l.runSyncNewLimit() @@ -388,10 +319,6 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } - var localPercent float64 - l.lock.Lock() - localPercent = l.localPercent - l.lock.Unlock() l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) @@ -426,7 +353,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, localPercent) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) @@ -444,20 +371,29 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, localPercent float64) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, initialPercent float64) *CommandLimiter { + if !(qpsLimit > 0 && qpsBurst > 0) && + !(rateLimit > 0 && rateBurst > 0) { + return nil + } + if initialPercent <= 0 { + return nil + } var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit*localPercent), qpsBurst) + qpsl = rate.NewLimiter(rate.Limit(qpsLimit*initialPercent), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit*localPercent), rateBurst) + ratel = rate.NewLimiter(rate.Limit(rateLimit*initialPercent), rateBurst) } cl := &CommandLimiter{ limiterName: limiterName, localIp: localIp, qpsl: qpsl, ratel: ratel, - localPercent: localPercent, + localPercent: initialPercent, + weight: MAXIMUM_WEIGHT, + skipBalance: true, lastTime: time.Now(), } return cl @@ -467,6 +403,20 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit cl.lock.Lock() defer cl.lock.Unlock() + ////when limit is changed, the qps can't be used to balanceLimit + var orgQpsLimit, orgRateLimit float64 + var orgQpsBurst, orgRateBurst int + if cl.qpsl != nil { + orgQpsLimit = float64(cl.qpsl.Limit()) / cl.localPercent + orgQpsBurst = cl.qpsl.Burst() + } + if cl.ratel != nil { + orgRateLimit = float64(cl.ratel.Limit()) / cl.localPercent + orgRateBurst = cl.ratel.Burst() + } + if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { + cl.skipBalance = true + } if qpsLimit > 0 && qpsBurst > 0 { if cl.qpsl != nil { if cl.qpsl.Burst() != qpsBurst { @@ -496,9 +446,10 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } } -func (cl *CommandLimiter) reportLocalStat() { +func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { var qpsLocal, rateLocal float64 cl.lock.Lock() + defer cl.lock.Unlock() seconds := time.Since(cl.lastTime).Seconds() if seconds >= 0 { qpsLocal = float64(cl.totalCommandsCount) / seconds @@ -510,16 +461,161 @@ func (cl *CommandLimiter) reportLocalStat() { cl.totalCommandsCount = 0 cl.totalCommandsSize = 0 cl.lastTime = time.Now() - cl.lock.Unlock() - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, 
cl.limiterName).Set(qpsLocal) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) + namespaceCmd := getNamespaceAndCmd(cl.limiterName) + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(namespaceCmd[0], namespaceCmd[1], cl.localIp).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(namespaceCmd[0], namespaceCmd[1], cl.localIp).Set(rateLocal) + + return qpsLocal } -func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { +func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, titanStatusLifetime time.Duration, + devideUsage float64, multiplyUsage float64, weightChangeFactor float64) { cl.lock.Lock() defer cl.lock.Unlock() + if cl.qpsl == nil { + return + } + if cl.skipBalance { + cl.skipBalance = false + return + } + + txn, err := limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("titan", cl.localIp), zap.Error(err)) + return + } + + weights, qpss, err := cl.scanStatusInOtherTitan(limitDatadb, txn, titanStatusLifetime) + if err != nil { + txn.Rollback() + return + } + + totalWeight := cl.weight + for i := range weights { + totalWeight += weights[i] + } + + originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent + selfLimitInTarget := originalLimit * (cl.weight / totalWeight) + if averageQps < selfLimitInTarget*devideUsage { + otherFull := false + for i := range qpss { + otherLimitInTarget := originalLimit * (weights[i] / totalWeight) + if qpss[i] >= otherLimitInTarget*multiplyUsage { + otherFull = true + break + } + } + if otherFull { + cl.weight /= weightChangeFactor + if cl.weight < MINIMUM_WEIGHT { + cl.weight = MINIMUM_WEIGHT + } + } + } else if averageQps >= selfLimitInTarget*multiplyUsage { + cl.weight *= weightChangeFactor + if cl.weight > MAXIMUM_WEIGHT { + cl.weight = MAXIMUM_WEIGHT + } + } + + totalWeight = cl.weight + for i := range weights { + totalWeight += weights[i] + } + newPercent := cl.weight / totalWeight + + key := getLimiterKey(cl.limiterName) + key = append(key, []byte(cl.localIp)...) 
+ s := NewString(txn, key) + now := time.Now() + strTime := now.Format(TIME_FORMAT) + value := fmt.Sprintf("%f%s%f%s%s", cl.weight, LIMITER_STATUS_VALUE_TOKEN, averageQps, LIMITER_STATUS_VALUE_TOKEN, strTime) + if err := s.Set([]byte(value), 0); err != nil { + txn.Rollback() + return + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", cl.localIp)) + txn.Rollback() + return + } + zap.L().Info("[Limit] balance limit", zap.String("limiterName", cl.limiterName), + zap.Float64("qps", averageQps), zap.Float64("newWeight", cl.weight), zap.Float64("newPercent", newPercent)) + cl.updateLimitPercent(newPercent) +} + +func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transaction, titanStatusLifetime time.Duration) ([]float64, []float64, error) { + key := getLimiterKey(cl.limiterName) + prefix := MetaKey(limitDatadb, key) + endPrefix := sdk_kv.Key(prefix).PrefixNext() + iter, err := txn.t.Iter(prefix, endPrefix) + if err != nil { + zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) + return nil, nil, err + } + defer iter.Close() + + prefixLen := len(prefix) + var weights, qpss []float64 + var weight, qps float64 + for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { + if err != nil { + zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) + return nil, nil, err + } + + key := iter.Key() + if len(key) <= prefixLen { + zap.L().Error("ip is null", zap.ByteString("key", key)) + continue + } + ip := key[prefixLen:] + obj := NewString(txn, key) + if err = obj.decode(iter.Value()); err != nil { + zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) + continue + } + + val := string(obj.Meta.Value) + vals := strings.Split(val, LIMITER_STATUS_VALUE_TOKEN) + if len(vals) < 3 { + zap.L().Error("[Limit] short of values(should 3 values)", zap.ByteString("key", key), zap.String("value", val)) + continue + } + sWeight := vals[0] + sQps := vals[1] + lastActive := vals[2] + + if weight, err = strconv.ParseFloat(sWeight, 64); err != nil { + zap.L().Error("[Limit] weight can't be decoded to float", zap.ByteString("key", key), zap.String("weight", sWeight), zap.Error(err)) + continue + } + if qps, err = strconv.ParseFloat(sQps, 64); err != nil { + zap.L().Error("[Limit] qps can't be decoded to float", zap.ByteString("key", key), zap.String("qps", sQps), zap.Error(err)) + continue + } + + lastActiveT, err := time.ParseInLocation(TIME_FORMAT, lastActive, time.Local) + if err != nil { + zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.String("lastActive", lastActive), zap.Error(err)) + continue + } + + zap.L().Info("[Limit] other titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) + if string(ip) != cl.localIp && time.Since(lastActiveT) <= titanStatusLifetime { + weights = append(weights, weight) + qpss = append(qpss, qps) + } + } + return weights, qpss, nil +} + +func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 7215f37c..b01639ac 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -22,6 +22,7 @@ const 
( gckeys = "gckeys" expire = "expire" tikvGC = "tikvgc" + titanip = "titanip" ) var ( @@ -34,6 +35,7 @@ var ( gcKeysLabel = []string{gckeys} expireLabel = []string{expire} tikvGCLabel = []string{tikvGC} + limitLabel = []string{biz, command, titanip} // global prometheus object gm *Metrics @@ -197,7 +199,7 @@ func init() { Namespace: namespace, Name: "qps_limiter_status", Help: "the qps of a namespace's command in a titan server", - }, multiLabel) + }, limitLabel) prometheus.MustRegister(gm.LimiterQpsVec) gm.LimiterRateVec = prometheus.NewGaugeVec( @@ -205,7 +207,7 @@ func init() { Namespace: namespace, Name: "rate_limiter_status", Help: "the rate of a namespace's command in a titan server(KB/s)", - }, multiLabel) + }, limitLabel) prometheus.MustRegister(gm.LimiterRateVec) gm.LRangeSeekHistogram = prometheus.NewHistogram( diff --git a/tools/integration/titan.go b/tools/integration/titan.go index 9d400e2d..2e63fd1c 100644 --- a/tools/integration/titan.go +++ b/tools/integration/titan.go @@ -48,7 +48,7 @@ func Start() { log.Fatalln(err) } - limitersMgr, err := db.NewLimitersMgr(store, tikvConf.RateLimit) + limitersMgr, err := db.NewLimitersMgr(store, &tikvConf.RateLimit) if err != nil { log.Fatalln(err) } From fda1e666e59330d1dc34f23954a2f0836bd3f24f Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 11:11:24 +0800 Subject: [PATCH 30/60] update some config item's description and log --- conf/config.go | 6 +++--- conf/titan.toml | 20 ++++++++++++++++++++ db/limitersMgr.go | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/conf/config.go b/conf/config.go index e7b00e08..f5eff6dc 100644 --- a/conf/config.go +++ b/conf/config.go @@ -110,7 +110,7 @@ type RateLimit struct { TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` UsageToDivide float64 `cfg:"usage-to-divide; 0.6;; if the qps/weighted limit < the percent, will divide change Factor to balance limit"` - UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit > the percent, will multiply change Factor to balance limit"` - WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the weight change unit when balance limit, is also the minimum weight"` - InitialPercent float64 `cfg:"initial-percent; 0.33;; the percent in a commandLimiter is created"` + UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit >= the percent, will multiply change Factor to balance limit"` + WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the factor to devide/multipy in current weight"` + InitialPercent float64 `cfg:"initial-percent; 0.33;; the limit is set in the percent when a commandLimiter is created"` } diff --git a/conf/titan.toml b/conf/titan.toml index 37e2311a..aada95e2 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -208,6 +208,26 @@ interface-name = "eth0" #default: 3s #sync-set-period = "3s" +#type: float64 +#description: if the qps/weighted limit < the percent, will divide change Factor to balance limit +#default: 0.6 +usage-to-divide = 0.6 + +#type: float64 +#description: if the qps/weighted limit >= the percent, will multiply change Factor to balance limit +#default: 0.9 +usage-to-multiply = 0.9 + +#type: float64 +#description: the factor to devide/multipy in current weight +#default: 1.5 +weight-change-factor = 1.5 + +#type: float64 +#description: the limit 
is set in the percent when a commandLimiter is created +#default: 0.33 +initial-percent = 0.33 + [tikv-logger] #type: string diff --git a/db/limitersMgr.go b/db/limitersMgr.go index dc5006a7..7716a20e 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -606,7 +606,7 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti continue } - zap.L().Info("[Limit] other titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) + zap.L().Info("[Limit] titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) if string(ip) != cl.localIp && time.Since(lastActiveT) <= titanStatusLifetime { weights = append(weights, weight) qpss = append(qpss, qps) From a474656d77763dc0d5a2ed822ad23edcf0e735b1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 15:12:21 +0800 Subject: [PATCH 31/60] add limit changed log --- db/limitersMgr.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 7716a20e..95176d97 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -415,6 +415,8 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit orgRateBurst = cl.ratel.Burst() } if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { + zap.L().Info("limit changed", zap.Float64("orgQpsLimit", orgQpsLimit), zap.Float64("qpsLimit", qpsLimit), zap.Int("orgQpsBurst", orgQpsBurst), zap.Int("qpsBurst", qpsBurst), + zap.Float64("orgRateLimit", orgRateLimit), zap.Float64("rateLimit", rateLimit), zap.Int("orgRateBurst", orgRateBurst), zap.Int("rateBurst", rateBurst)) cl.skipBalance = true } if qpsLimit > 0 && qpsBurst > 0 { From 8ed06984da977c5b0901462ff0a72ae019980152 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 18:09:22 +0800 Subject: [PATCH 32/60] commandLimiter store globalQpsLimit/globalRateLimit in int64, use it to calculate the local limit in updatePercent() and judge if limit is changed, this resolve float precision lost problem --- db/limitersMgr.go | 116 +++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 95176d97..d081fe01 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -34,11 +34,13 @@ type CommandLimiter struct { localIp string limiterName string - qpsl *rate.Limiter - ratel *rate.Limiter - localPercent float64 - weight float64 - skipBalance bool + qpsl *rate.Limiter + ratel *rate.Limiter + globalQpsLimit int64 + globalRateLimit int64 + localPercent float64 + weight float64 + skipBalance bool lock sync.Mutex lastTime time.Time @@ -47,7 +49,7 @@ type CommandLimiter struct { } type LimitData struct { - limit float64 + limit int64 burst int } @@ -169,8 +171,8 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { } } -func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { - limit := float64(0) +func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (int64, int) { + limit := int64(0) burst := int64(0) txn, err := l.limitDatadb.Begin() @@ -214,7 +216,7 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { return 0, 0 } var strUnit uint8 - var unit float64 + var unit int64 strUnit = limitStr[len(limitStr)-1] if strUnit == 'k' || strUnit == 'K' { unit = 1024 @@ -225,18 +227,19 @@ func (l *LimitersMgr) 
getLimit(limiterName string, isQps bool) (float64, int) { } else { unit = 1 } - if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { - zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + limitInUnit, err := strconv.ParseFloat(limitStr, 64) + if err != nil { + zap.L().Error("[Limit] limit's number part can't be decoded to number", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) return 0, 0 } - limit *= unit - if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + limit = int64(limitInUnit * float64(unit)) + if burst, err = strconv.ParseInt(burstStr, 10, 32); err != nil { zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) return 0, 0 } if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] got limit"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + logEnv.Write(zap.String("key", limiterKey), zap.Int64("limit", limit), zap.Int64("burst", burst)) } return limit, int(burst) @@ -349,8 +352,8 @@ func (l *LimitersMgr) runSyncNewLimit() { if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) @@ -361,8 +364,8 @@ func (l *LimitersMgr) runSyncNewLimit() { } else { if commandLimiter != nil { if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is cleared"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } l.limiters.Store(limiterName, (*CommandLimiter)(nil)) } @@ -371,7 +374,7 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, initialPercent float64) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit int64, qpsBurst int, rateLimit int64, rateBurst int, initialPercent float64) *CommandLimiter { if !(qpsLimit > 0 && qpsBurst > 0) && !(rateLimit > 0 && rateBurst > 0) { return nil @@ -381,71 +384,78 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qps } var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit*initialPercent), qpsBurst) + localQpsLimit := float64(qpsLimit) * initialPercent + qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit*initialPercent), rateBurst) + localRateLimit := float64(rateLimit) * 
initialPercent + ratel = rate.NewLimiter(rate.Limit(localRateLimit), rateBurst) } cl := &CommandLimiter{ - limiterName: limiterName, - localIp: localIp, - qpsl: qpsl, - ratel: ratel, - localPercent: initialPercent, - weight: MAXIMUM_WEIGHT, - skipBalance: true, - lastTime: time.Now(), + limiterName: limiterName, + localIp: localIp, + qpsl: qpsl, + ratel: ratel, + globalQpsLimit: qpsLimit, + globalRateLimit: rateLimit, + localPercent: initialPercent, + weight: MAXIMUM_WEIGHT, + skipBalance: true, + lastTime: time.Now(), } return cl } -func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) { +func (cl *CommandLimiter) updateLimit(newQpsLimit int64, newQpsBurst int, newRateLimit int64, newRateBurst int) { cl.lock.Lock() defer cl.lock.Unlock() ////when limit is changed, the qps can't be used to balanceLimit - var orgQpsLimit, orgRateLimit float64 - var orgQpsBurst, orgRateBurst int + var qpsBurst, rateBurst int if cl.qpsl != nil { - orgQpsLimit = float64(cl.qpsl.Limit()) / cl.localPercent - orgQpsBurst = cl.qpsl.Burst() + qpsBurst = cl.qpsl.Burst() } if cl.ratel != nil { - orgRateLimit = float64(cl.ratel.Limit()) / cl.localPercent - orgRateBurst = cl.ratel.Burst() + rateBurst = cl.ratel.Burst() } - if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { - zap.L().Info("limit changed", zap.Float64("orgQpsLimit", orgQpsLimit), zap.Float64("qpsLimit", qpsLimit), zap.Int("orgQpsBurst", orgQpsBurst), zap.Int("qpsBurst", qpsBurst), - zap.Float64("orgRateLimit", orgRateLimit), zap.Float64("rateLimit", rateLimit), zap.Int("orgRateBurst", orgRateBurst), zap.Int("rateBurst", rateBurst)) + if cl.globalQpsLimit != newQpsLimit || qpsBurst != newQpsBurst || cl.globalRateLimit != newRateLimit || rateBurst != newRateBurst { + zap.L().Info("limit changed", zap.Int64("globalQpsLimit", cl.globalQpsLimit), zap.Int64("newQpsLimit", newQpsLimit), zap.Int("qpsBurst", qpsBurst), zap.Int("newQpsBurst", newQpsBurst), + zap.Int64("globalRateLimit", cl.globalRateLimit), zap.Int64("newRateLimit", newRateLimit), zap.Int("rateBurst", rateBurst), zap.Int("newRateBurst", newRateBurst)) cl.skipBalance = true } - if qpsLimit > 0 && qpsBurst > 0 { + + if newQpsLimit > 0 && newQpsBurst > 0 { + localQpsLimit := float64(newQpsLimit) * cl.localPercent if cl.qpsl != nil { - if cl.qpsl.Burst() != qpsBurst { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) - } else if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { - cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) + if cl.qpsl.Burst() != newQpsBurst { + cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) + } else if cl.globalQpsLimit != newQpsLimit { + cl.qpsl.SetLimit(rate.Limit(localQpsLimit)) } } else { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) + cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) } } else { cl.qpsl = nil } - if rateLimit > 0 && rateBurst > 0 { + if newRateLimit > 0 && newRateBurst > 0 { + localRateLimit := float64(newRateLimit) * cl.localPercent if cl.ratel != nil { - if cl.ratel.Burst() != rateBurst { - cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) - } else if cl.ratel.Limit() != rate.Limit(rateLimit*cl.localPercent) { - cl.ratel.SetLimit(rate.Limit(rateLimit * cl.localPercent)) + if cl.ratel.Burst() != newRateBurst { + cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) + } else if cl.globalRateLimit != 
newRateLimit { + cl.ratel.SetLimit(rate.Limit(localRateLimit)) } } else { - cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) } } else { cl.ratel = nil } + + cl.globalQpsLimit = newQpsLimit + cl.globalRateLimit = newRateLimit } func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { @@ -620,12 +630,12 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { - qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent + qpsLimit := float64(cl.globalQpsLimit) * newPercent zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) cl.qpsl.SetLimit(rate.Limit(qpsLimit)) } if cl.ratel != nil { - rateLimit := float64(cl.ratel.Limit()) / cl.localPercent * newPercent + rateLimit := float64(cl.globalRateLimit) * newPercent zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) cl.ratel.SetLimit(rate.Limit(rateLimit)) } From 6ae93be7bf9627f2034402f503b72dc26d37d79c Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Dec 2019 14:56:24 +0800 Subject: [PATCH 33/60] when all titan'a limiter qps is < devideUsage of its weight percent, add current titan weight again --- db/limitersMgr.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index d081fe01..e6e63189 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -514,19 +514,30 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent selfLimitInTarget := originalLimit * (cl.weight / totalWeight) if averageQps < selfLimitInTarget*devideUsage { - otherFull := false + otherHaveHigh := false + otherAllLow := true for i := range qpss { otherLimitInTarget := originalLimit * (weights[i] / totalWeight) if qpss[i] >= otherLimitInTarget*multiplyUsage { - otherFull = true + otherHaveHigh = true + otherAllLow = false + break + } + if qpss[i] >= otherLimitInTarget*devideUsage { + otherAllLow = false break } } - if otherFull { + if otherHaveHigh { cl.weight /= weightChangeFactor if cl.weight < MINIMUM_WEIGHT { cl.weight = MINIMUM_WEIGHT } + } else if otherAllLow { + cl.weight *= weightChangeFactor + if cl.weight > MAXIMUM_WEIGHT { + cl.weight = MAXIMUM_WEIGHT + } } } else if averageQps >= selfLimitInTarget*multiplyUsage { cl.weight *= weightChangeFactor From 70b3f15445c592cda2f755f1763014c609dda192 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 4 Dec 2019 20:59:55 +0800 Subject: [PATCH 34/60] add showlimit.sh and add a conf parameter for setlimit.sh --- setlimit.sh | 25 +++++++++++++++---------- showlimit.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 10 deletions(-) create mode 100644 showlimit.sh diff --git a/setlimit.sh b/setlimit.sh index 742d34d8..bf2441f6 100644 --- a/setlimit.sh +++ b/setlimit.sh @@ -1,22 +1,25 @@ #!/bin/bash -host=tkv7369.test.zhuaninc.com -port=7369 -rediscli="./redis-cli" -token="sys_ratelimit-1574130304-1-36c153b109ebca80b43769" + usage_exit() { echo "usage:" - echo "$0 set qps=(1|0) cmd= namespace= limit=[k/K/m/M] burst=" + echo " set qps=(1|0) 
cmd= namespace= limit=[k/K/m/M] burst=" echo "or" - echo "$0 del qps=(1|0) cmd= namespace=" + echo " del qps=(1|0) cmd= namespace=" echo ": all means matching all namespaces" exit 1 } - -if [ $# -lt 1 ]; then +if [ $# -lt 5 ]; then usage_exit fi -op=$1 + +configpath=$1 +host=`grep host= $configpath|sed 's/host=//'` +port=`grep port= $configpath|sed 's/port=//'` +rediscli=`grep rediscli= $configpath|sed 's/rediscli=//'` +token=`grep token= $configpath|sed 's/token=//'` + +op=$2 if [ "$op" != "set" -a "$op" != "del" ]; then usage_exit fi @@ -27,9 +30,11 @@ namespace= limit= burst= +i=0 for arg in $* do - if [ "$arg" = "$op" ]; then + i=`expr $i + 1` + if [ $i -le 2 ]; then continue fi diff --git a/showlimit.sh b/showlimit.sh new file mode 100644 index 00000000..9697eb3c --- /dev/null +++ b/showlimit.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +usage_exit() +{ + echo "usage: " + exit 1 +} +if [ $# -lt 1 ]; then + usage_exit +fi + +configpath=$1 +host=`grep host= $configpath|sed 's/host=//'` +port=`grep port= $configpath|sed 's/port=//'` +rediscli=`grep rediscli= $configpath|sed 's/rediscli=//'` +token=`grep token= $configpath|sed 's/token=//'` + +for prefix in qps: rate: limiter_status: +do + echo "-------------------------- $prefix data --------------------------" + $rediscli -h $host -p $port -a $token scan 0 count 256 match ${prefix}*|sed '1d'|grep -v '^$'|while read key + do + value=`$rediscli -h $host -p $port -a $token get $key` + echo -e "key=${key}\tvalue=$value" + done + echo +done From a22ca0fb9cecc4fe24dd176931c084d4fb5c0a8a Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Dec 2019 12:05:19 +0800 Subject: [PATCH 35/60] 1 resolve the problem that titan still process command for remote closed pipelined connection 2 add namespace log for readCommand failure and client processing go routine's exit --- client.go | 18 +++++++++++------- command/command.go | 4 +++- titan.go | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/client.go b/client.go index 2728481b..9b776a8c 100644 --- a/client.go +++ b/client.go @@ -54,11 +54,11 @@ func (c *client) Write(p []byte) (int, error) { zap.L().Debug("write to client", zap.Int64("clientid", c.cliCtx.ID), zap.String("msg", string(p))) n, err := c.conn.Write(p) if err != nil { - c.conn.Close() if err == io.EOF { - zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr), - zap.Int64("clientid", c.cliCtx.ID)) + zap.L().Info("connection was half-closed by remote peer", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace)) } else { + //may be unknown error with message "connection reset by peer" zap.L().Error("write net failed", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace), @@ -66,10 +66,14 @@ func (c *client) Write(p []byte) (int, error) { zap.Bool("watching", c.cliCtx.Txn != nil), zap.String("command", c.cliCtx.LastCmd), zap.String("error", err.Error())) - return 0, err } + //client.serve() will get the channel close event, close the connection, exit current go routine + //if the remote client create the connection and use pipeline to invoke command, then it close the connection, titan still can get command from client.bufio.Reader and process + //close ClientContext.Done will help client.serve() to interrupt command processing + close(c.cliCtx.Done) } - return n, nil + //return err for above write() error, then replying many times command can break its sending to a half-closed 
connection, etc BytesArray(lrange invoke it).
+	return n, err
 }
 
 func (c *client) serve(conn net.Conn) error {
@@ -89,11 +93,11 @@ func (c *client) serve(conn net.Conn) error {
 			c.conn.Close()
 			if err == io.EOF {
 				zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr),
-					zap.Int64("clientid", c.cliCtx.ID))
+					zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace))
 				return nil
 			}
 			zap.L().Error("read command failed", zap.String("addr", c.cliCtx.RemoteAddr),
-				zap.Int64("clientid", c.cliCtx.ID), zap.Error(err))
+				zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace), zap.Error(err))
 			return err
 		}
 	}
diff --git a/command/command.go b/command/command.go
index 0a17d06d..c96b8ec3 100644
--- a/command/command.go
+++ b/command/command.go
@@ -70,7 +70,9 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit {
 			resp.ReplyNullBulkString(w)
 			continue
 		}
-		resp.ReplyBulkString(w, string(a[i]))
+		if err := resp.ReplyBulkString(w, string(a[i])); err != nil {
+			break
+		}
 		if i%10 == 9 {
 			zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000))
 			start = time.Now()
diff --git a/titan.go b/titan.go
index b96f0bbf..198c2924 100644
--- a/titan.go
+++ b/titan.go
@@ -48,7 +48,7 @@ func (s *Server) Serve(lis net.Listener) error {
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Inc()
 			if err := cli.serve(conn); err != nil {
 				zap.L().Error("serve conn failed", zap.String("addr", cli.cliCtx.RemoteAddr),
-					zap.Int64("clientid", cliCtx.ID), zap.Error(err))
+					zap.Int64("clientid", cliCtx.ID), zap.String("namespace", cli.cliCtx.Namespace), zap.Error(err))
 			}
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec()
 			s.servCtx.Clients.Delete(cli.cliCtx.ID)
From 8403a21bea8ad49aa89e5f4c42c7ed9beaaff7e4 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Fri, 6 Dec 2019 16:57:12 +0800
Subject: [PATCH 36/60] 1 BytesArray: check for write errors 2 use a
 remoteClosed flag to interrupt processing when a pipelined connection is
 closed; if we close the ctx.Done channel and then write() again, titan will
 crash by re-closing the channel

---
 client.go          | 37 +++++++++++++------------------------
 command/command.go | 10 +++++++---
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/client.go b/client.go
index 9b776a8c..df8b2540 100644
--- a/client.go
+++ b/client.go
@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"net"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/distributedio/titan/command"
@@ -22,33 +21,18 @@ type client struct {
 	exec   *command.Executor
 	r      *bufio.Reader
 
-	eofLock sync.Mutex //the lock of reading_writing 'eof'
-	eof     bool       //is over when read data from socket
+	remoteClosed bool //is the connection closed by remote peer?
} func newClient(cliCtx *context.ClientContext, s *Server, exec *command.Executor) *client { return &client{ - cliCtx: cliCtx, - server: s, - exec: exec, - eof: false, + cliCtx: cliCtx, + server: s, + exec: exec, + remoteClosed: false, } } -func (c *client) readEof() { - c.eofLock.Lock() - defer c.eofLock.Unlock() - - c.eof = true -} - -func (c *client) isEof() bool { - c.eofLock.Lock() - defer c.eofLock.Unlock() - - return c.eof -} - // Write to conn and log error if needed func (c *client) Write(p []byte) (int, error) { zap.L().Debug("write to client", zap.Int64("clientid", c.cliCtx.ID), zap.String("msg", string(p))) @@ -68,9 +52,9 @@ func (c *client) Write(p []byte) (int, error) { zap.String("error", err.Error())) } //client.serve() will get the channel close event, close the connection, exit current go routine - //if the remote client create the connection and use pipeline to invoke command, then it close the connection, titan still can get command from client.bufio.Reader and process - //close ClientContext.Done will help client.serve() to interrupt command processing - close(c.cliCtx.Done) + //if the remote client use pipeline to invoke command, then close the connection(timeout etc), titan still get command from client.bufio.Reader and process + //setting client.remoteClosed to true will help client.serve() to interrupt command processing + c.remoteClosed = true } //return err for above write() error, then replying many times command can break its sending to a half-closed connection, etc BytesArray(lrange invoke it). return n, err @@ -88,6 +72,11 @@ func (c *client) serve(conn net.Conn) error { case <-c.cliCtx.Done: return c.conn.Close() default: + if c.remoteClosed { + zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace)) + return c.conn.Close() + } cmd, err = c.readCommand() if err != nil { c.conn.Close() diff --git a/command/command.go b/command/command.go index c96b8ec3..5f7b5ce4 100644 --- a/command/command.go +++ b/command/command.go @@ -62,16 +62,20 @@ func Integer(w io.Writer, v int64) OnCommit { func BytesArray(w io.Writer, a [][]byte) OnCommit { return func() { start := time.Now() - resp.ReplyArray(w, len(a)) + if _, err := resp.ReplyArray(w, len(a)); err != nil { + return + } zap.L().Debug("reply array size", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) start = time.Now() for i := range a { if a[i] == nil { - resp.ReplyNullBulkString(w) + if err := resp.ReplyNullBulkString(w); err != nil { + return + } continue } if err := resp.ReplyBulkString(w, string(a[i])); err != nil { - break + return } if i%10 == 9 { zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) From 2d59108eb26a1b528409c4ea151101f866c14f71 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sat, 7 Dec 2019 10:28:18 +0800 Subject: [PATCH 37/60] refactor commandLimiter's qpsLimiter/ratelimiter and their globalLimit, lock, localPercent to a LimiterWrapper, and decrease the lock range --- db/limitersMgr.go | 258 +++++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 130 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index e6e63189..ce26bf39 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -30,19 +30,24 @@ const ( MINIMUM_WEIGHT = 0.1 ) +type LimiterWrapper struct { + limiterName string + limiter *rate.Limiter + globalLimit int64 + localPercent float64 + lock sync.Mutex +} + type CommandLimiter struct 
{ localIp string limiterName string - qpsl *rate.Limiter - ratel *rate.Limiter - globalQpsLimit int64 - globalRateLimit int64 - localPercent float64 - weight float64 - skipBalance bool + qpsLw LimiterWrapper + rateLw LimiterWrapper + weight float64 lock sync.Mutex + skipBalance bool lastTime time.Time totalCommandsCount int64 totalCommandsSize int64 @@ -351,10 +356,6 @@ func (l *LimitersMgr) runSyncNewLimit() { if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) - } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) l.limiters.Store(limiterName, newCl) @@ -382,80 +383,27 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit int64, qpsBu if initialPercent <= 0 { return nil } - var qpsl, ratel *rate.Limiter - if qpsLimit > 0 && qpsBurst > 0 { - localQpsLimit := float64(qpsLimit) * initialPercent - qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), qpsBurst) - } - if rateLimit > 0 && rateBurst > 0 { - localRateLimit := float64(rateLimit) * initialPercent - ratel = rate.NewLimiter(rate.Limit(localRateLimit), rateBurst) - } cl := &CommandLimiter{ - limiterName: limiterName, - localIp: localIp, - qpsl: qpsl, - ratel: ratel, - globalQpsLimit: qpsLimit, - globalRateLimit: rateLimit, - localPercent: initialPercent, - weight: MAXIMUM_WEIGHT, - skipBalance: true, - lastTime: time.Now(), - } + limiterName: limiterName, + localIp: localIp, + qpsLw: LimiterWrapper{localPercent: initialPercent, limiterName: limiterName + "-qps"}, + rateLw: LimiterWrapper{localPercent: initialPercent, limiterName: limiterName + "-rate"}, + weight: MAXIMUM_WEIGHT, + skipBalance: true, + lastTime: time.Now(), + } + cl.qpsLw.updateLimit(qpsLimit, qpsBurst) + cl.rateLw.updateLimit(rateLimit, rateBurst) return cl } func (cl *CommandLimiter) updateLimit(newQpsLimit int64, newQpsBurst int, newRateLimit int64, newRateBurst int) { - cl.lock.Lock() - defer cl.lock.Unlock() - - ////when limit is changed, the qps can't be used to balanceLimit - var qpsBurst, rateBurst int - if cl.qpsl != nil { - qpsBurst = cl.qpsl.Burst() + qpsLimitChanged := cl.qpsLw.updateLimit(newQpsLimit, newQpsBurst) + rateLimitChanged := cl.rateLw.updateLimit(newRateLimit, newRateBurst) + if qpsLimitChanged || rateLimitChanged { + ////when limit is changed, the qps can't be used to balanceLimit + cl.setSkipBalance(true) } - if cl.ratel != nil { - rateBurst = cl.ratel.Burst() - } - if cl.globalQpsLimit != newQpsLimit || qpsBurst != newQpsBurst || cl.globalRateLimit != newRateLimit || rateBurst != newRateBurst { - zap.L().Info("limit changed", zap.Int64("globalQpsLimit", cl.globalQpsLimit), zap.Int64("newQpsLimit", newQpsLimit), zap.Int("qpsBurst", qpsBurst), zap.Int("newQpsBurst", newQpsBurst), - zap.Int64("globalRateLimit", cl.globalRateLimit), zap.Int64("newRateLimit", newRateLimit), zap.Int("rateBurst", rateBurst), zap.Int("newRateBurst", newRateBurst)) - cl.skipBalance = true - } - - if newQpsLimit > 0 && newQpsBurst > 0 { - localQpsLimit := float64(newQpsLimit) * cl.localPercent - if cl.qpsl != nil { - if cl.qpsl.Burst() != newQpsBurst { - cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) - } else if cl.globalQpsLimit != newQpsLimit { - 
cl.qpsl.SetLimit(rate.Limit(localQpsLimit)) - } - } else { - cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) - } - } else { - cl.qpsl = nil - } - - if newRateLimit > 0 && newRateBurst > 0 { - localRateLimit := float64(newRateLimit) * cl.localPercent - if cl.ratel != nil { - if cl.ratel.Burst() != newRateBurst { - cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) - } else if cl.globalRateLimit != newRateLimit { - cl.ratel.SetLimit(rate.Limit(localRateLimit)) - } - } else { - cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) - } - } else { - cl.ratel = nil - } - - cl.globalQpsLimit = newQpsLimit - cl.globalRateLimit = newRateLimit } func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { @@ -483,14 +431,12 @@ func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) flo func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, titanStatusLifetime time.Duration, devideUsage float64, multiplyUsage float64, weightChangeFactor float64) { - cl.lock.Lock() - defer cl.lock.Unlock() - - if cl.qpsl == nil { + qpsGlobalLimit := float64(cl.qpsLw.getLimit()) + if qpsGlobalLimit <= 0 { return } - if cl.skipBalance { - cl.skipBalance = false + if cl.getSkipBalance() { + cl.setSkipBalance(false) return } @@ -511,13 +457,12 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita totalWeight += weights[i] } - originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent - selfLimitInTarget := originalLimit * (cl.weight / totalWeight) + selfLimitInTarget := qpsGlobalLimit * (cl.weight / totalWeight) if averageQps < selfLimitInTarget*devideUsage { otherHaveHigh := false otherAllLow := true for i := range qpss { - otherLimitInTarget := originalLimit * (weights[i] / totalWeight) + otherLimitInTarget := qpsGlobalLimit * (weights[i] / totalWeight) if qpss[i] >= otherLimitInTarget*multiplyUsage { otherHaveHigh = true otherAllLow = false @@ -569,7 +514,8 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita } zap.L().Info("[Limit] balance limit", zap.String("limiterName", cl.limiterName), zap.Float64("qps", averageQps), zap.Float64("newWeight", cl.weight), zap.Float64("newPercent", newPercent)) - cl.updateLimitPercent(newPercent) + cl.qpsLw.updatePercent(newPercent) + cl.rateLw.updatePercent(newPercent) } func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transaction, titanStatusLifetime time.Duration) ([]float64, []float64, error) { @@ -638,63 +584,115 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti return weights, qpss, nil } -func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { - if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { - if cl.qpsl != nil { - qpsLimit := float64(cl.globalQpsLimit) * newPercent - zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) - cl.qpsl.SetLimit(rate.Limit(qpsLimit)) - } - if cl.ratel != nil { - rateLimit := float64(cl.globalRateLimit) * newPercent - zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) - cl.ratel.SetLimit(rate.Limit(rateLimit)) - } - cl.localPercent = newPercent +func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { + d := cl.qpsLw.waitTime(1) + time.Sleep(d) + + cmdSize := len(cmdName) + 
for i := range cmdArgs { + cmdSize += len(cmdArgs[i]) + 1 + } + d = cl.rateLw.waitTime(cmdSize) + time.Sleep(d) + + cl.lock.Lock() + defer cl.lock.Unlock() + cl.totalCommandsCount++ + cl.totalCommandsSize += int64(cmdSize) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), + zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) } } -func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { +func (cl *CommandLimiter) setSkipBalance(skipBalance bool) { cl.lock.Lock() defer cl.lock.Unlock() + cl.skipBalance = skipBalance +} - if cl.qpsl != nil { - r := cl.qpsl.Reserve() - if !r.OK() { - zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) - } else { - d := r.Delay() - if d > 0 { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) - } - time.Sleep(d) +func (cl *CommandLimiter) getSkipBalance() bool { + cl.lock.Lock() + defer cl.lock.Unlock() + return cl.skipBalance +} + +func (lw *LimiterWrapper) updateLimit(newLimit int64, newBurst int) bool { + lw.lock.Lock() + defer lw.lock.Unlock() + + var burst int + if lw.limiter != nil { + burst = lw.limiter.Burst() + } + + changed := false + if lw.globalLimit != newLimit || burst != newBurst { + zap.L().Info("limit changed", zap.String("limiterName", lw.limiterName), + zap.Int64("globalLimit", lw.globalLimit), zap.Int64("newGlobalLimit", newLimit), + zap.Int("burst", burst), zap.Int("newBurst", newBurst)) + changed = true + } + + if newLimit > 0 && newBurst > 0 { + localLimit := float64(newLimit) * lw.localPercent + if lw.limiter != nil { + if lw.limiter.Burst() != newBurst { + lw.limiter = rate.NewLimiter(rate.Limit(localLimit), newBurst) + } else if lw.globalLimit != newLimit { + lw.limiter.SetLimit(rate.Limit(localLimit)) } + } else { + lw.limiter = rate.NewLimiter(rate.Limit(localLimit), newBurst) } + } else { + lw.limiter = nil } - cmdSize := len(cmdName) - for i := range cmdArgs { - cmdSize += len(cmdArgs[i]) + 1 + lw.globalLimit = newLimit + return changed +} + +func (lw *LimiterWrapper) getLimit() int64 { + lw.lock.Lock() + defer lw.lock.Unlock() + + return lw.globalLimit +} + +func (lw *LimiterWrapper) updatePercent(newPercent float64) { + lw.lock.Lock() + defer lw.lock.Unlock() + + if lw.localPercent != newPercent && lw.localPercent > 0 && newPercent > 0 { + if lw.limiter != nil { + limit := float64(lw.globalLimit) * newPercent + zap.L().Info("percent changed", zap.String("limiterName", lw.limiterName), + zap.Float64("limit", limit), zap.Int("burst", lw.limiter.Burst())) + lw.limiter.SetLimit(rate.Limit(limit)) + } + + lw.localPercent = newPercent } - if cl.ratel != nil { - r := cl.ratel.ReserveN(time.Now(), cmdSize) +} + +func (lw *LimiterWrapper) waitTime(eventsNum int) time.Duration { + lw.lock.Lock() + defer lw.lock.Unlock() + + var d time.Duration + if lw.limiter != nil { + r := lw.limiter.ReserveN(time.Now(), eventsNum) if !r.OK() { - zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) + zap.L().Error("[Limit] request events num exceed limiter burst", zap.String("limiter name", lw.limiterName), zap.Int("burst", lw.limiter.Burst()), zap.Int("request 
events num", eventsNum)) } else { - d := r.Delay() + d = r.Delay() if d > 0 { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", lw.limiterName), zap.Int("request events num", eventsNum), zap.Int64("sleep us", int64(d/time.Microsecond))) } - time.Sleep(d) } } } - - cl.totalCommandsCount++ - cl.totalCommandsSize += int64(cmdSize) - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) - } + return d } From b034581c518a08c6dc0e41084f51ce63e722dc51 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sat, 7 Dec 2019 11:40:42 +0800 Subject: [PATCH 38/60] fix balance limit's bug --- db/limitersMgr.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index ce26bf39..927a8dab 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -467,10 +467,8 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita otherHaveHigh = true otherAllLow = false break - } - if qpss[i] >= otherLimitInTarget*devideUsage { + } else if qpss[i] >= otherLimitInTarget*devideUsage { otherAllLow = false - break } } if otherHaveHigh { From 3b71e69d45465a07b0318f5cfbae754e4d861302 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 15:42:14 +0800 Subject: [PATCH 39/60] 1 in Exec(), add transaction begin, cmd args num, cmd func, reply func metrics 2 cmd args num metrics include the key, which doesn't before 3 in AutoCommit, do adding transaction begin, cmd func, reply func metrics before log.debug() --- command/command.go | 10 ++++------ command/transactions.go | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/command/command.go b/command/command.go index 5f7b5ce4..2f6e418e 100644 --- a/command/command.go +++ b/command/command.go @@ -197,13 +197,11 @@ func AutoCommit(cmd TxnCommand) Command { key := "" if len(ctx.Args) > 0 { key = ctx.Args[0] - if len(ctx.Args) > 1 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args) - 1)) - } + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) } cost := time.Since(start).Seconds() - zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, "ERR "+err.Error()) @@ -218,8 +216,8 @@ func AutoCommit(cmd TxnCommand) Command { start = time.Now() onCommit, err := cmd(ctx, txn) cost = time.Since(start).Seconds() - zap.L().Debug("command done", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, 
ctx.Name).Observe(cost) + zap.L().Debug("command done", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, err.Error()) @@ -267,8 +265,8 @@ func AutoCommit(cmd TxnCommand) Command { onCommit() } cost = time.Since(start).Seconds() - zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.ReplyFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mtFunc() return nil }) diff --git a/command/transactions.go b/command/transactions.go index f2d63713..89b55c39 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -38,9 +38,15 @@ func Exec(ctx *Context) { var outputs []*bytes.Buffer var onCommits []OnCommit err = retry.Ensure(ctx, func() error { + mt := metrics.GetMetrics() if !watching { + start := time.Now() txn, err = ctx.Client.DB.Begin() + cost := time.Since(start).Seconds() + mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { + mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() zap.L().Error("begin txn failed", zap.Int64("clientid", ctx.Client.ID), zap.String("command", ctx.Name), @@ -63,11 +69,16 @@ func Exec(ctx *Context) { Out: out, Context: ctx.Context, } + if len(ctx.Args) > 0 { + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) + } name := strings.ToLower(cmd.Name) if _, ok := txnCommands[name]; ok { start := time.Now() onCommit, err = TxnCall(subCtx, txn) - zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) + cost := time.Since(start).Seconds() + mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { resp.ReplyError(out, err.Error()) } @@ -79,7 +90,6 @@ func Exec(ctx *Context) { commandCount++ } start := time.Now() - mt := metrics.GetMetrics() mt.MultiCommandHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(commandCount)) defer func() { cost := time.Since(start).Seconds() @@ -123,7 +133,7 @@ func Exec(ctx *Context) { return } - + start := time.Now() resp.ReplyArray(ctx.Out, size) // run OnCommit that fill reply to outputs for i := range onCommits { @@ -141,6 +151,9 @@ func Exec(ctx *Context) { break } } + cost := time.Since(start).Seconds() + metrics.GetMetrics().ReplyFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.Int64("cost(us)", int64(cost*1000000))) } // Watch starts a transaction, watch is a global transaction and is not key associated(this is different from redis) From 0fb99b72051163a53efc2bdcc86804678826ca9d Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 16:45:26 +0800 Subject: [PATCH 40/60] simplify the msg recv/send log in debug level --- client.go | 4 ++-- command/command.go | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/client.go 
b/client.go index df8b2540..87f899d3 100644 --- a/client.go +++ b/client.go @@ -130,7 +130,6 @@ func (c *client) serve(conn net.Conn) error { } ctx.Context = context.New(c.cliCtx, c.server.servCtx) - zap.L().Debug("recv msg", zap.String("command", ctx.Name), zap.Strings("arguments", ctx.Args)) // Skip reply if necessary if c.cliCtx.SkipN != 0 { @@ -143,7 +142,8 @@ func (c *client) serve(conn net.Conn) error { env.Write(zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID), zap.String("traceid", ctx.TraceID), - zap.String("command", ctx.Name)) + zap.String("command", ctx.Name), + zap.Strings("arguments", ctx.Args)) } c.exec.Execute(ctx) diff --git a/command/command.go b/command/command.go index 2f6e418e..68f14158 100644 --- a/command/command.go +++ b/command/command.go @@ -61,12 +61,9 @@ func Integer(w io.Writer, v int64) OnCommit { // BytesArray replies a [][]byte when commit func BytesArray(w io.Writer, a [][]byte) OnCommit { return func() { - start := time.Now() if _, err := resp.ReplyArray(w, len(a)); err != nil { return } - zap.L().Debug("reply array size", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) - start = time.Now() for i := range a { if a[i] == nil { if err := resp.ReplyNullBulkString(w); err != nil { @@ -77,10 +74,6 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { if err := resp.ReplyBulkString(w, string(a[i])); err != nil { return } - if i%10 == 9 { - zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) - start = time.Now() - } } } } From 37c1c6815be07126637b2cfdbe851d3beba8f0d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 18:03:29 +0800 Subject: [PATCH 41/60] fix bug of setting cmd arg num metric in Exec() --- command/transactions.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/command/transactions.go b/command/transactions.go index 89b55c39..aa3f610a 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -69,16 +69,16 @@ func Exec(ctx *Context) { Out: out, Context: ctx.Context, } - if len(ctx.Args) > 0 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) + if len(cmd.Args) > 0 { + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(float64(len(cmd.Args))) } name := strings.ToLower(cmd.Name) if _, ok := txnCommands[name]; ok { start := time.Now() onCommit, err = TxnCall(subCtx, txn) cost := time.Since(start).Seconds() - mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) - zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", int64(cost*1000000))) + mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(cost) + zap.L().Debug("execute", zap.String("command", cmd.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { resp.ReplyError(out, err.Error()) } From 9de2ae953d384b2a27032fd4fd5aa5e2a00759cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 12 Dec 2019 18:51:20 +0800 Subject: [PATCH 42/60] add expire/gc round/seek/commit cost metrics --- db/expire.go | 22 ++++++++++++++++-- db/gc.go | 17 +++++++++++++- metrics/prometheus.go | 52 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/db/expire.go b/db/expire.go index 36acc2b6..de6109e2 100644 --- a/db/expire.go +++ b/db/expire.go @@ -21,6 +21,10 @@ var ( expireMetakeyOffset = expireTimestampOffset + 8 
/*sizeof(int64)*/ + len(":") ) +const ( + expire_worker = "expire" +) + // IsExpired judge object expire through now func IsExpired(obj *Object, now int64) bool { if obj.ExpireAt == 0 || obj.ExpireAt > now { @@ -88,6 +92,8 @@ func StartExpire(db *DB, conf *conf.Expire) error { if conf.Disable { continue } + + start := time.Now() isLeader, err := isLeader(db, sysExpireLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[Expire] check expire leader failed", zap.Error(err)) @@ -102,6 +108,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { continue } runExpire(db, conf.BatchLimit) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) } return nil } @@ -142,7 +149,9 @@ func runExpire(db *DB, batchLimit int) { return } endPrefix := kv.Key(expireKeyPrefix).PrefixNext() + start := time.Now() iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) txn.Rollback() @@ -180,7 +189,13 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.ByteString("mkey", mkey)) } - if err := iter.Next(); err != nil { + start = time.Now() + err := iter.Next() + cost := time.Since(start) + if cost >= time.Millisecond { + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) + } + if err != nil { zap.L().Error("[Expire] next failed", zap.ByteString("mkey", mkey), zap.Error(err)) @@ -200,7 +215,10 @@ func runExpire(db *DB, batchLimit int) { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) } - if err := txn.Commit(context.Background()); err != nil { + start = time.Now() + err = txn.Commit(context.Background()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if err != nil { txn.Rollback() zap.L().Error("[Expire] commit failed", zap.Error(err)) } diff --git a/db/gc.go b/db/gc.go index 29fd81ea..e1aa8f96 100644 --- a/db/gc.go +++ b/db/gc.go @@ -15,6 +15,10 @@ var ( sysGCLeader = []byte("$sys:0:GCL:GCLeader") ) +const ( + gc_worker = "gc" +) + func toTikvGCKey(key []byte) []byte { b := []byte{} b = append(b, sysNamespace...) 
@@ -45,7 +49,9 @@ func gcDeleteRange(txn store.Transaction, prefix []byte, limit int) (int, error) count int ) endPrefix := kv.Key(prefix).PrefixNext() + start := time.Now() itr, err := txn.Iter(prefix, endPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) if err != nil { return count, err } @@ -83,7 +89,9 @@ func doGC(db *DB, limit int) error { txn := dbTxn.t store.SetOption(txn, store.KeyOnly, true) + start := time.Now() itr, err := txn.Iter(gcPrefix, endGCPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) if err != nil { return err } @@ -133,7 +141,11 @@ func doGC(db *DB, limit int) error { txn.Rollback() return resultErr } - if err := txn.Commit(context.Background()); err != nil { + + start = time.Now() + err = txn.Commit(context.Background()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) + if err != nil { txn.Rollback() return err } @@ -158,6 +170,8 @@ func StartGC(db *DB, conf *conf.GC) { if conf.Disable { continue } + + start := time.Now() isLeader, err := isLeader(db, sysGCLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[GC] check GC leader failed", @@ -183,5 +197,6 @@ func StartGC(db *DB, conf *conf.GC) { zap.Error(err)) continue } + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index b01639ac..4213a7e7 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -23,6 +23,7 @@ const ( expire = "expire" tikvGC = "tikvgc" titanip = "titanip" + worker = "worker" ) var ( @@ -36,6 +37,7 @@ var ( expireLabel = []string{expire} tikvGCLabel = []string{tikvGC} limitLabel = []string{biz, command, titanip} + workerLabel = []string{worker} // global prometheus object gm *Metrics @@ -62,17 +64,20 @@ type Metrics struct { TikvGCTotal *prometheus.CounterVec //command biz - CommandCallHistogramVec *prometheus.HistogramVec - LimitCostHistogramVec *prometheus.HistogramVec - TxnBeginHistogramVec *prometheus.HistogramVec - CommandFuncDoneHistogramVec *prometheus.HistogramVec - TxnCommitHistogramVec *prometheus.HistogramVec - ReplyFuncDoneHistogramVec *prometheus.HistogramVec - CommandArgsNumHistogramVec *prometheus.HistogramVec - TxnRetriesCounterVec *prometheus.CounterVec - TxnConflictsCounterVec *prometheus.CounterVec - TxnFailuresCounterVec *prometheus.CounterVec - MultiCommandHistogramVec *prometheus.HistogramVec + CommandCallHistogramVec *prometheus.HistogramVec + LimitCostHistogramVec *prometheus.HistogramVec + TxnBeginHistogramVec *prometheus.HistogramVec + CommandFuncDoneHistogramVec *prometheus.HistogramVec + TxnCommitHistogramVec *prometheus.HistogramVec + ReplyFuncDoneHistogramVec *prometheus.HistogramVec + CommandArgsNumHistogramVec *prometheus.HistogramVec + TxnRetriesCounterVec *prometheus.CounterVec + TxnConflictsCounterVec *prometheus.CounterVec + TxnFailuresCounterVec *prometheus.CounterVec + MultiCommandHistogramVec *prometheus.HistogramVec + WorkerRoundCostHistogramVec *prometheus.HistogramVec + WorkerSeekCostHistogramVec *prometheus.HistogramVec + WorkerCommitCostHistogramVec *prometheus.HistogramVec //logger LogMetricsCounterVec *prometheus.CounterVec @@ -268,6 +273,31 @@ func init() { []string{labelName}, ) prometheus.MustRegister(gm.LogMetricsCounterVec) + + gm.WorkerRoundCostHistogramVec = prometheus.NewHistogramVec( + 
prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_round_cost",
+			Help:      "the cost of a round expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerRoundCostHistogramVec)
+
+	gm.WorkerSeekCostHistogramVec = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_seek_cost",
+			Help:      "the cost of tikv seek in expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerSeekCostHistogramVec)
+
+	gm.WorkerCommitCostHistogramVec = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_commit_cost",
+			Help:      "the cost of commit in expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerCommitCostHistogramVec)
+
 	RegisterSDKMetrics()
 }
From 6c2328d0b27e92e409bf8a5041751aa5676e0286 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Sun, 22 Dec 2019 12:38:49 +0800
Subject: [PATCH 43/60] prevent the connection count from growing endlessly:
 when tikv commands cost too much time, clients reconnect before the previous
 requests finish

---
 bin/titan/main.go  | 10 ++++++----
 conf/config.go     | 13 +++++++------
 context/context.go | 20 ++++++++++++--------
 titan.go           | 22 ++++++++++++++++++++++
 4 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/bin/titan/main.go b/bin/titan/main.go
index 8c3cff8c..215078a3 100644
--- a/bin/titan/main.go
+++ b/bin/titan/main.go
@@ -74,10 +74,12 @@ func main() {
 		os.Exit(1)
 	}
 	serv := titan.New(&context.ServerContext{
-		RequirePass:      config.Server.Auth,
-		Store:            store,
-		ListZipThreshold: config.Server.ListZipThreshold,
-		LimitersMgr:      limitersMgr,
+		RequirePass:       config.Server.Auth,
+		Store:             store,
+		ListZipThreshold:  config.Server.ListZipThreshold,
+		LimitersMgr:       limitersMgr,
+		MaxConnection:     config.Server.MaxConnection,
+		MaxConnectionWait: config.Server.MaxConnectionWait,
 	})
 
 	var servOpts, statusOpts []continuous.ServerOption
diff --git a/conf/config.go b/conf/config.go
index f5eff6dc..784de138 100644
--- a/conf/config.go
+++ b/conf/config.go
@@ -26,12 +26,13 @@ type Hash struct {
 
 // Server config is the config of titan server
 type Server struct {
-	Auth             string `cfg:"auth;;;client connetion auth"`
-	Listen           string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"`
-	SSLCertFile      string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"`
-	SSLKeyFile       string `cfg:"ssl-key-file;;;server SSL key file"`
-	MaxConnection    int64  `cfg:"max-connection;1000;numeric;client connection count"`
-	ListZipThreshold int    `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"`
+	Auth              string `cfg:"auth;;;client connetion auth"`
+	Listen            string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"`
+	SSLCertFile       string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"`
+	SSLKeyFile        string `cfg:"ssl-key-file;;;server SSL key file"`
+	MaxConnection     int64  `cfg:"max-connection;500;numeric;client connection count"`
+	ListZipThreshold  int    `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"`
+	MaxConnectionWait int64  `cfg:"max-connection-wait;1000;numeric;wait ms before close connection when exceed max connection"`
 }
 
 // Tikv config is the config of tikv sdk
diff --git a/context/context.go b/context/context.go
index 22880a74..78e7aa63 100644
--- a/context/context.go
+++ b/context/context.go
@@ -73,14 +73,18 @@ func NewClientContext(id int64, conn net.Conn) *ClientContext {
 
 // ServerContext is the runtime context of the server
 type ServerContext struct {
-	RequirePass      string
-	Store            *db.RedisStore
-	Monitors         sync.Map
-	Clients          sync.Map
-	LimitersMgr      *db.LimitersMgr
-	Pause            time.Duration // elapse to pause all clients
-	StartAt          time.Time
-	ListZipThreshold int
+	RequirePass       string
+	Store             *db.RedisStore
+	Monitors          sync.Map
+	Clients           sync.Map
+	LimitersMgr       *db.LimitersMgr
+	Pause             time.Duration // elapse to pause all clients
+	StartAt           time.Time
+	ListZipThreshold  int
+	MaxConnection     int64
+	MaxConnectionWait int64
+	ClientsNum        int64
+	Lock              sync.Mutex
 }
 
 // Context combines the client and server context
diff --git a/titan.go b/titan.go
index 198c2924..726b96d9 100644
--- a/titan.go
+++ b/titan.go
@@ -36,8 +36,27 @@ func (s *Server) Serve(lis net.Listener) error {
 		}
 
 		cliCtx := context.NewClientContext(s.idgen(), conn)
+		connectExceed := false
+		s.servCtx.Lock.Lock()
+		if s.servCtx.ClientsNum >= s.servCtx.MaxConnection {
+			connectExceed = true
+		}
+		s.servCtx.Lock.Unlock()
+		if connectExceed {
+			zap.L().Warn("max connection exceed, will close after some time",
+				zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait),
+				zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
+			go func() {
+				time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond)
+				zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
+				conn.Close()
+			}()
+		}
 		cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0)
 		s.servCtx.Clients.Store(cliCtx.ID, cliCtx)
+		s.servCtx.Lock.Lock()
+		s.servCtx.ClientsNum++
+		s.servCtx.Lock.Unlock()
 
 		cli := newClient(cliCtx, s, command.NewExecutor())
 
@@ -52,6 +71,9 @@ func (s *Server) Serve(lis net.Listener) error {
 			}
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec()
 			s.servCtx.Clients.Delete(cli.cliCtx.ID)
+			s.servCtx.Lock.Lock()
+			s.servCtx.ClientsNum--
+			s.servCtx.Lock.Unlock()
 		}(cli, conn)
 	}
 }
From bbfce19dd5cf423acfba3bf253a0ad111ff89381 Mon Sep 17 00:00:00 2001
From: lugang
Date: Wed, 25 Dec 2019 14:42:01 +0800
Subject: [PATCH 44/60] fix bug of connection limit

---
 titan.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/titan.go b/titan.go
index 726b96d9..840872d4 100644
--- a/titan.go
+++ b/titan.go
@@ -51,6 +51,7 @@ func (s *Server) Serve(lis net.Listener) error {
 				zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
 				conn.Close()
 			}()
+			continue
 		}
 		cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0)
 		s.servCtx.Clients.Store(cliCtx.ID, cliCtx)
From 93cd7df384ca5fe46de2b7a3c8eb9baa2ffcc098 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Tue, 18 Feb 2020 18:07:55 +0800
Subject: [PATCH 45/60] 1 widen metrics bucket ranges to fix the problem that
 costs exceed the max bucket value 2 add a config item to enable/disable the
 max-connection limit 3 add txn failure metrics in the multi/exec scenario

---
 bin/titan/main.go       | 1 +
 command/transactions.go | 1 +
 conf/config.go          | 1 +
 context/context.go      | 1 +
 metrics/prometheus.go   | 4 ++--
 titan.go                | 8 +++++---
 6 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/bin/titan/main.go b/bin/titan/main.go
index 215078a3..70a47e5c 100644
--- a/bin/titan/main.go
+++ b/bin/titan/main.go
@@ -78,6 +78,7 @@ func main() {
 		Store:             store,
 		ListZipThreshold:  config.Server.ListZipThreshold,
 		LimitersMgr:       limitersMgr,
+		LimitConnection:   config.Server.LimitConnection,
 		MaxConnection:     config.Server.MaxConnection,
 		MaxConnectionWait: config.Server.MaxConnectionWait,
 	})
diff --git a/command/transactions.go b/command/transactions.go index
aa3f610a..d78476c0 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -80,6 +80,7 @@ func Exec(ctx *Context) { mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(cost) zap.L().Debug("execute", zap.String("command", cmd.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { + mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Inc() resp.ReplyError(out, err.Error()) } } else { diff --git a/conf/config.go b/conf/config.go index 784de138..6d88c2ac 100644 --- a/conf/config.go +++ b/conf/config.go @@ -30,6 +30,7 @@ type Server struct { Listen string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"` SSLCertFile string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"` SSLKeyFile string `cfg:"ssl-key-file;;;server SSL key file"` + LimitConnection bool `cfg:"limit-connection; false; boolean; limit max connection num when it's true"` MaxConnection int64 `cfg:"max-connection;500;numeric;client connection count"` ListZipThreshold int `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"` MaxConnectionWait int64 `cfg:"max-connection-wait;1000;numeric;wait ms before close connection when exceed max connection"` diff --git a/context/context.go b/context/context.go index 78e7aa63..2aca5916 100644 --- a/context/context.go +++ b/context/context.go @@ -81,6 +81,7 @@ type ServerContext struct { Pause time.Duration // elapse to pause all clients StartAt time.Time ListZipThreshold int + LimitConnection bool MaxConnection int64 MaxConnectionWait int64 ClientsNum int64 diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 4213a7e7..5e72b959 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -91,7 +91,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), Help: "The cost times of command call", }, multiLabel) prometheus.MustRegister(gm.CommandCallHistogramVec) @@ -143,7 +143,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), + Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) diff --git a/titan.go b/titan.go index 840872d4..3084c294 100644 --- a/titan.go +++ b/titan.go @@ -37,9 +37,11 @@ func (s *Server) Serve(lis net.Listener) error { cliCtx := context.NewClientContext(s.idgen(), conn) connectExceed := false - s.servCtx.Lock.Lock() - if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { - connectExceed = true + if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { + connectExceed = true + } } s.servCtx.Lock.Unlock() if connectExceed { From ad7cff1b71ccd0d4b056795fbae7cedcc501b31c Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 17:35:14 +0800 Subject: [PATCH 46/60] 1 add example for limit-connection and max-connection-wait in titan.toml --- conf/titan.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/conf/titan.toml b/conf/titan.toml index aada95e2..0c5375dd 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -23,12 +23,24 @@ ssl-cert-file = "" #description: server SSL key file ssl-key-file = "" +#type: bool +#rules: boolean +#description: limit max connection num when 
it's true +#default: false +#limit-connection = false + #type: int64 #rules: numeric #description: client connection count #default: 1000 #max-connection = 1000 +#type: int64 +#rules: numeric +#description: wait ms before close connection when exceed max connection +#default: 1000 +#max-connection-wait = 1000 + #type: int #rules: numeric #description: the max limit length of elements in list From aeca4b3a5ecc2be8ffc88c73f0972e1c2da5654d Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 17:57:07 +0800 Subject: [PATCH 47/60] 1 fix unlock bug in limit-connection --- titan.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/titan.go b/titan.go index 3084c294..4135908d 100644 --- a/titan.go +++ b/titan.go @@ -42,8 +42,8 @@ func (s *Server) Serve(lis net.Listener) error { if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { connectExceed = true } + s.servCtx.Lock.Unlock() } - s.servCtx.Lock.Unlock() if connectExceed { zap.L().Warn("max connection exceed, will close after some time", zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), From bb617b060c50005f6c3ca6af3669578a48fefddb Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 18:29:05 +0800 Subject: [PATCH 48/60] don't lock s.serverctx.lock when limit-connection disabled --- titan.go | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/titan.go b/titan.go index 4135908d..3d1bc79c 100644 --- a/titan.go +++ b/titan.go @@ -36,30 +36,32 @@ func (s *Server) Serve(lis net.Listener) error { } cliCtx := context.NewClientContext(s.idgen(), conn) - connectExceed := false if s.servCtx.LimitConnection { + connectExceed := false s.servCtx.Lock.Lock() if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { connectExceed = true } s.servCtx.Lock.Unlock() - } - if connectExceed { - zap.L().Warn("max connection exceed, will close after some time", - zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), - zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) - go func() { - time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond) - zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) - conn.Close() - }() - continue + if connectExceed { + zap.L().Warn("max connection exceed, will close after some time", + zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), + zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) + go func() { + time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond) + zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) + conn.Close() + }() + continue + } } cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0) s.servCtx.Clients.Store(cliCtx.ID, cliCtx) - s.servCtx.Lock.Lock() - s.servCtx.ClientsNum++ - s.servCtx.Lock.Unlock() + if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + s.servCtx.ClientsNum++ + s.servCtx.Lock.Unlock() + } cli := newClient(cliCtx, s, command.NewExecutor()) @@ -74,9 +76,11 @@ func (s *Server) Serve(lis net.Listener) error { } metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec() s.servCtx.Clients.Delete(cli.cliCtx.ID) - s.servCtx.Lock.Lock() - s.servCtx.ClientsNum-- - s.servCtx.Lock.Unlock() + 
if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + s.servCtx.ClientsNum-- + s.servCtx.Lock.Unlock() + } }(cli, conn) } } From 003e6b667d899f81abffaf24dbe5e849f2d0120a Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 25 Feb 2020 19:21:37 +0800 Subject: [PATCH 49/60] end expire seek at the at:{now} key to reduce the rocksdb tombstone problem --- db/expire.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/db/expire.go b/db/expire.go index de6109e2..572adf65 100644 --- a/db/expire.go +++ b/db/expire.go @@ -8,7 +8,6 @@ import ( "github.com/distributedio/titan/conf" "github.com/distributedio/titan/db/store" "github.com/distributedio/titan/metrics" - "github.com/pingcap/tidb/kv" "go.uber.org/zap" ) @@ -148,7 +147,15 @@ func runExpire(db *DB, batchLimit int) { zap.L().Error("[Expire] txn begin failed", zap.Error(err)) return } - endPrefix := kv.Key(expireKeyPrefix).PrefixNext() + + now := time.Now().UnixNano() + //Iter returns keys in [start, upperBound), so passing now+1 as the upper bound covers every key up to the "at:{now}:" prefix + //ending the seek at "at:{now+1}" instead of the whole "at:" prefix shrinks the seek range and touches as few deleted expired keys as possible + //this should reduce the expire delay (previously up to days) and the get/mget timeouts caused by the rocksdb tombstone problem + var endPrefix []byte + endPrefix = append(endPrefix, expireKeyPrefix...) + endPrefix = append(endPrefix, EncodeInt64(now+1)...) + start := time.Now() iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) @@ -158,7 +165,6 @@ func runExpire(db *DB, batchLimit int) { return } limit := batchLimit - now := time.Now().UnixNano() ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { From 4423eaffee99e592fa4dd36f4698c96c1948df71 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 26 Feb 2020 16:47:43 +0800 Subject: [PATCH 50/60] fix a failed case in tests/redis/unit/expire.tcl --- tests/redis/unit/expire.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/redis/unit/expire.tcl b/tests/redis/unit/expire.tcl index de24eabe..a7fe15ed 100644 --- a/tests/redis/unit/expire.tcl +++ b/tests/redis/unit/expire.tcl @@ -3,11 +3,11 @@ start_server {tags {"expire"}} { r set x foobar set v1 [r expire x 5] set v2 [r ttl x] - set v3 [r expire x 10] + set v3 [r expire x 9] set v4 [r ttl x] r expire x 2 list $v1 $v2 $v3 $v4 - } {1 [45] 1 10} + } {1 [45] 1 [89]} test {EXPIRE - It should be still possible to read 'x'} { r get x From cc15799ca087c8aa81db0900a7d441ecde7073d3 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 18:47:42 +0800 Subject: [PATCH 51/60] start expire seek from the last processed expire key to reduce the rocksdb tombstone problem --- db/expire.go | 36 +++++++++++++++++++++++++++--------- db/expire_test.go | 8 ++++---- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/db/expire.go b/db/expire.go index 572adf65..8c11eec9 100644 --- a/db/expire.go +++ b/db/expire.go @@ -87,6 +87,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { ticker := time.NewTicker(conf.Interval) defer ticker.Stop() id := UUID() + lastExpireEndTs := int64(0) for range ticker.C { if conf.Disable { continue @@ -106,7 +107,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { } continue } - runExpire(db, conf.BatchLimit) + lastExpireEndTs = runExpire(db, conf.BatchLimit, lastExpireEndTs) 
metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) } return nil @@ -141,11 +142,11 @@ func toTikvScorePrefix(namespace []byte, id DBID, key []byte) []byte { return b } -func runExpire(db *DB, batchLimit int) { +func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { txn, err := db.Begin() if err != nil { zap.L().Error("[Expire] txn begin failed", zap.Error(err)) - return + return 0 } now := time.Now().UnixNano() @@ -156,16 +157,29 @@ func runExpire(db *DB, batchLimit int) { endPrefix = append(endPrefix, expireKeyPrefix...) endPrefix = append(endPrefix, EncodeInt64(now+1)...) + var startPrefix []byte + if lastExpireEndTs > 0 { + startPrefix = append(startPrefix, expireKeyPrefix...) + startPrefix = append(startPrefix, EncodeInt64(lastExpireEndTs)...) + startPrefix = append(startPrefix, ':') + } else { + startPrefix = expireKeyPrefix + } + start := time.Now() - iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) + iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { + logEnv.Write(zap.ByteString("[startPrefix", startPrefix), zap.ByteString("endPrefix)", endPrefix)) + } if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) txn.Rollback() - return + return 0 } limit := batchLimit + thisExpireEndTs := int64(0) ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { rawKey := iter.Key() @@ -179,7 +193,7 @@ func runExpire(db *DB, batchLimit int) { mkey := rawKey[expireMetakeyOffset:] if err := doExpire(txn, mkey, iter.Value()); err != nil { txn.Rollback() - return + return 0 } // Remove from expire list @@ -188,11 +202,11 @@ func runExpire(db *DB, batchLimit int) { zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() - return + return 0 } if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete expire list item"); logEnv != nil { - logEnv.Write(zap.ByteString("mkey", mkey)) + logEnv.Write(zap.Int64("ts", ts), zap.ByteString("mkey", mkey)) } start = time.Now() @@ -206,8 +220,11 @@ func runExpire(db *DB, batchLimit int) { zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() - return + return 0 } + + //just use the latest processed expireKey(don't include the last expire key in the loop which is > now) as next seek's start key + thisExpireEndTs = ts limit-- } @@ -234,6 +251,7 @@ func runExpire(db *DB, batchLimit int) { } metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + return thisExpireEndTs } func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { diff --git a/db/expire_test.go b/db/expire_test.go index 5eb3abb1..2a75c850 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -111,7 +111,7 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) - runExpire(txn.db, 1) + runExpire(txn.db, 1, 0) txn.Commit(context.TODO()) txn = getTxn(t) @@ -211,7 +211,7 @@ func Test_doExpire(t *testing.T) { name: "TestExpiredNotExistsMeta", args: args{ mkey: MetaKey(txn.db, []byte("TestExpiredRewriteHash")), - id: nmateHashId, + id: nmateHashId, }, want: want{ gckey: true, @@ -222,7 +222,7 @@ func Test_doExpire(t *testing.T) { args: args{ mkey: MetaKey(txn.db, 
[]byte("TestExpiredHash_dirty_data")), id: dirtyDataHashID, - tp: byte(ObjectHash), + tp: byte(ObjectHash), }, want: want{ gckey: true, @@ -233,7 +233,7 @@ func Test_doExpire(t *testing.T) { args: args{ mkey: MetaKey(txn.db, []byte("TestExpiredRewriteHash_dirty_data")), id: rDHashId, - tp: byte(ObjectHash), + tp: byte(ObjectHash), }, want: want{ gckey: true, From d9a1bafbcc68795f563c07f616753ee4edf5d1f1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 19:28:14 +0800 Subject: [PATCH 52/60] fix expire log --- db/expire.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/expire.go b/db/expire.go index 8c11eec9..5f5776e8 100644 --- a/db/expire.go +++ b/db/expire.go @@ -170,7 +170,7 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { - logEnv.Write(zap.ByteString("[startPrefix", startPrefix), zap.ByteString("endPrefix)", endPrefix)) + logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) From 83a370d03f026df3b18988bd24da51e876652b2b Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 20:06:06 +0800 Subject: [PATCH 53/60] also reduce expire seek range when no expire keys or all expire keys > now --- db/expire.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/expire.go b/db/expire.go index 5f5776e8..6a85e27a 100644 --- a/db/expire.go +++ b/db/expire.go @@ -227,6 +227,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { thisExpireEndTs = ts limit-- } + if limit == batchLimit { + //means: no expire keys or all expire keys > now in current loop + thisExpireEndTs = now + } now = time.Now().UnixNano() diff := (ts - now) / int64(time.Second) From d75dd34073f72166113551118d398e728d9fc15e Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 28 Feb 2020 19:50:17 +0800 Subject: [PATCH 54/60] also reduce expire seek range when no expire keys or all expire keys > now --- command/strings.go | 3 +++ db/kv.go | 7 ++++++- tools/autotest/auto.go | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/command/strings.go b/command/strings.go index 3422742c..383b0ebc 100644 --- a/command/strings.go +++ b/command/strings.go @@ -337,6 +337,9 @@ func PSetEx(ctx *Context, txn *db.Transaction) (OnCommit, error) { if err != nil { return nil, ErrInteger } + if ui <= 0 { + return nil, ErrExpireSetEx + } unit := ui * uint64(time.Millisecond) if err := s.Set([]byte(ctx.Args[2]), int64(unit)); err != nil { return nil, errors.New("ERR " + err.Error()) diff --git a/db/kv.go b/db/kv.go index 93dda37c..ecc38c3d 100644 --- a/db/kv.go +++ b/db/kv.go @@ -121,8 +121,13 @@ func (kv *Kv) ExpireAt(key []byte, at int64) error { return err } } - if at > 0 { + if at <= now { + //expire goroutine just seek forward and processed higher and higher ts expireKey, can't seek backward + //so, if expire at a ts <= now, delete it at once + return kv.txn.Destory(obj, key) + } + if err := expireAt(kv.txn.t, mkey, obj.ID, obj.Type, obj.ExpireAt, at); err != nil { return err } diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index 8d546106..8397e33e 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -298,6 
+298,20 @@ func (ac *AutoClient) KeyCase(t *testing.T) { ac.ez.ZAddEqual(t, "key-zset1", "2.0", "member1") ac.ek.DelEqual(t, 1, "key-zset1") ac.ek.ExistsEqual(t, 0, "key-zset1") + + //test negative expire + ac.ez.ZAddEqual(t, "key-zset2", "2.0", "member1") + ac.ek.ExpireEqual(t, "key-zset2", -1, 1) + ac.ek.ExistsEqual(t, 0, "key-zset2") + + ac.es.SetEqual(t, "key-set", "value") + ac.ek.ExpireEqual(t, "key-set", -1, 1) + ac.ek.ExistsEqual(t, 0, "key-set") + + ac.es.SetEqual(t, "key-set", "value") + at = time.Now().Unix() - 1 + ac.ek.ExpireAtEqual(t, "key-set", int(at), 1) + ac.ek.ExistsEqual(t, 0, "key-set") } //SystemCase check system case From 3519ff9c6936a94a6b98d7aadeacc8980c666c36 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Mar 2020 16:30:19 +0800 Subject: [PATCH 55/60] comment unsupported string functions in command list, and delete its redis test case --- command/init.go | 46 ++++++++++++++--------------- tests/redis/support/test_common.tcl | 6 ---- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/command/init.go b/command/init.go index fa62cf15..579b21d0 100644 --- a/command/init.go +++ b/command/init.go @@ -28,16 +28,16 @@ func init() { "getset": GetSet, "getrange": GetRange, // "msetnx": MSetNx, - "setnx": SetNx, - "setex": SetEx, - "psetex": PSetEx, - "setrange": SetRange, - "setbit": SetBit, + "setnx": SetNx, + "setex": SetEx, + "psetex": PSetEx, + //"setrange": SetRange, + //"setbit": SetBit, // "bitop": BitOp, // "bitfield": BitField, - "getbit": GetBit, - "bitpos": BitPos, - "bitcount": BitCount, + //"getbit": GetBit, + //"bitpos": BitPos, + //"bitcount": BitCount, "incr": Incr, "incrby": IncrBy, "decr": Decr, @@ -135,29 +135,29 @@ func init() { "rpushx": Desc{Proc: AutoCommit(RPushx), Cons: Constraint{-3, flags("wmF"), 1, 1, 1}}, // strings - "get": Desc{Proc: AutoCommit(Get), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, - "set": Desc{Proc: AutoCommit(Set), Cons: Constraint{-3, flags("wm"), 1, 1, 1}}, - "setnx": Desc{Proc: AutoCommit(SetNx), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, - "setex": Desc{Proc: AutoCommit(SetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, - "psetex": Desc{Proc: AutoCommit(PSetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, - "mget": Desc{Proc: AutoCommit(MGet), Cons: Constraint{-2, flags("rF"), 1, -1, 1}}, - "mset": Desc{Proc: AutoCommit(MSet), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, - "msetnx": Desc{Proc: AutoCommit(MSetNx), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, - "strlen": Desc{Proc: AutoCommit(Strlen), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, - "append": Desc{Proc: AutoCommit(Append), Cons: Constraint{3, flags("wm"), 1, 1, 1}}, - "setrange": Desc{Proc: AutoCommit(SetRange), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "get": Desc{Proc: AutoCommit(Get), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, + "set": Desc{Proc: AutoCommit(Set), Cons: Constraint{-3, flags("wm"), 1, 1, 1}}, + "setnx": Desc{Proc: AutoCommit(SetNx), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, + "setex": Desc{Proc: AutoCommit(SetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "psetex": Desc{Proc: AutoCommit(PSetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "mget": Desc{Proc: AutoCommit(MGet), Cons: Constraint{-2, flags("rF"), 1, -1, 1}}, + "mset": Desc{Proc: AutoCommit(MSet), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, + //"msetnx": Desc{Proc: AutoCommit(MSetNx), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, //run test in tests/redis/unit/type/string failed + "strlen": Desc{Proc: AutoCommit(Strlen), Cons: Constraint{2, 
flags("rF"), 1, 1, 1}}, + "append": Desc{Proc: AutoCommit(Append), Cons: Constraint{3, flags("wm"), 1, 1, 1}}, + //"setrange": Desc{Proc: AutoCommit(SetRange), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, //run test in tests/redis/unit/type/string failed "getrange": Desc{Proc: AutoCommit(GetRange), Cons: Constraint{4, flags("r"), 1, 1, 1}}, "incr": Desc{Proc: AutoCommit(Incr), Cons: Constraint{2, flags("wmF"), 1, 1, 1}}, "decr": Desc{Proc: AutoCommit(Decr), Cons: Constraint{2, flags("wmF"), 1, 1, 1}}, "incrby": Desc{Proc: AutoCommit(IncrBy), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, "decrby": Desc{Proc: AutoCommit(DecrBy), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, "incrbyfloat": Desc{Proc: AutoCommit(IncrByFloat), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, - "setbit": Desc{Proc: AutoCommit(SetBit), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + //"setbit": Desc{Proc: AutoCommit(SetBit), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, // "bitop": Desc{Proc: AutoCommit(BitOp), Cons: Constraint{-4, flags("wm"), 2, -1, 1}}, // "bitfield": Desc{Proc: AutoCommit(BitField), Cons: Constraint{-2, flags("wm"), 1, 1, 1}}, - "getbit": Desc{Proc: AutoCommit(GetBit), Cons: Constraint{3, flags("r"), 1, 1, 1}}, - "bitcount": Desc{Proc: AutoCommit(BitCount), Cons: Constraint{-2, flags("r"), 1, 1, 1}}, - "bitpos": Desc{Proc: AutoCommit(BitPos), Cons: Constraint{-3, flags("r"), 1, 1, 1}}, + //"getbit": Desc{Proc: AutoCommit(GetBit), Cons: Constraint{3, flags("r"), 1, 1, 1}}, + //"bitcount": Desc{Proc: AutoCommit(BitCount), Cons: Constraint{-2, flags("r"), 1, 1, 1}}, + //"bitpos": Desc{Proc: AutoCommit(BitPos), Cons: Constraint{-3, flags("r"), 1, 1, 1}}, // keys "type": Desc{Proc: AutoCommit(Type), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, diff --git a/tests/redis/support/test_common.tcl b/tests/redis/support/test_common.tcl index 3bde792a..ec087751 100644 --- a/tests/redis/support/test_common.tcl +++ b/tests/redis/support/test_common.tcl @@ -14,15 +14,9 @@ array set ::toTestCase { "MGET against non-string key" 1 "MSET base case" 1 "MSET wrong number of args" 1 - "MSETNX with already existent key" 1 - "MSETNX with not existing keys" 1 "STRLEN against non-existing key" 1 "STRLEN against integer-encoded value" 1 "STRLEN against plain string" 1 - "SETRANGE against non-existing key" 1 - "SETRANGE against string-encoded key" 1 - "SETRANGE against key with wrong type" 1 - "SETRANGE with out of range offset" 1 "GETRANGE against non-existing key" 1 "GETRANGE against string value" 1 "GETRANGE fuzzing" 1 From c4eea836c17e374e4d4baea2cc02294965dda169 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Mar 2020 16:47:44 +0800 Subject: [PATCH 56/60] comment unit test of unsupported string functions in strings_test.go --- command/strings_test.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/command/strings_test.go b/command/strings_test.go index a1065092..5fb3e92a 100644 --- a/command/strings_test.go +++ b/command/strings_test.go @@ -323,7 +323,7 @@ func TestStringPSetEx(t *testing.T) { } -func TestStringSetRange(t *testing.T) { +/*func TestStringSetRange(t *testing.T) { args := make([]string, 3) key := "setrange" args[0] = key @@ -364,7 +364,7 @@ func TestStringSetRange(t *testing.T) { ctx = ContextTest("setrange", args...) 
Call(ctx) assert.Contains(t, ctxString(ctx.Out), ErrMaximum.Error()) -} +}*/ func TestStringIncr(t *testing.T) { args := make([]string, 1) args[0] = "incr" @@ -469,7 +469,7 @@ func TestStringMset(t *testing.T) { assert.Contains(t, ctxString(ctx.Out), ErrMSet.Error()) } -func TestStringMsetNx(t *testing.T) { +/*func TestStringMsetNx(t *testing.T) { args := make([]string, 4) args[0] = "MsetN1" args[1] = "MsetN3" @@ -490,7 +490,7 @@ func TestStringMsetNx(t *testing.T) { Call(ctx) assert.Contains(t, ctxString(ctx.Out), "0") EqualGet(t, args[0], args[1], nil) -} +}*/ func TestStringAppend(t *testing.T) { args := make([]string, 2) @@ -504,7 +504,7 @@ func TestStringAppend(t *testing.T) { assert.Contains(t, out.String(), strconv.Itoa(len(args[1])*2)) } -func TestStringSetBit(t *testing.T) { +/*func TestStringSetBit(t *testing.T) { tests := []struct { name string args []string @@ -548,9 +548,9 @@ func TestStringSetBit(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringGetBit(t *testing.T) { +/*func TestStringGetBit(t *testing.T) { CallTest("setbit", "getbit", "5", "1") tests := []struct { name string @@ -590,9 +590,9 @@ func TestStringGetBit(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringBitCount(t *testing.T) { +/*func TestStringBitCount(t *testing.T) { CallTest("setbit", "bit-count", "5", "1") CallTest("setbit", "bit-count", "9", "1") tests := []struct { @@ -633,9 +633,9 @@ func TestStringBitCount(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringBitPos(t *testing.T) { +/*func TestStringBitPos(t *testing.T) { CallTest("set", "bit-pos", "5") tests := []struct { name string @@ -685,4 +685,4 @@ func TestStringBitPos(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ From bd43ec2322c66d4912501541f7d206a12177d3bc Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 21 Apr 2020 20:44:19 +0800 Subject: [PATCH 57/60] hash expire-key, it will improve the keys num handled every seconds, and prevent that current expire-keys region writing node have higher load than other nodes --- db/db.go | 8 ++- db/expire.go | 157 +++++++++++++++++++++++++++++++----------- db/expire_test.go | 20 ++++-- metrics/prometheus.go | 25 ++++--- 4 files changed, 154 insertions(+), 56 deletions(-) diff --git a/db/db.go b/db/db.go index d431ef38..c9145ac9 100644 --- a/db/db.go +++ b/db/db.go @@ -118,8 +118,14 @@ func Open(conf *conf.Tikv) (*RedisStore, error) { } rds := &RedisStore{Storage: s, conf: conf} sysdb := rds.DB(sysNamespace, sysDatabaseID) + ls := NewLeaderStatus() go StartGC(sysdb, &conf.GC) - go StartExpire(sysdb, &conf.Expire) + go setExpireIsLeader(sysdb, &conf.Expire, ls) + go startExpire(sysdb, &conf.Expire, ls, "") + for i := 0; i < EXPIRE_HASH_NUM; i++ { + expireHash := fmt.Sprintf("%04d", i) + go startExpire(sysdb, &conf.Expire, ls, expireHash) + } go StartZT(sysdb, &conf.ZT) go StartTikvGC(sysdb, &conf.TikvGC) return rds, nil diff --git a/db/expire.go b/db/expire.go index 6a85e27a..4a07f1aa 100644 --- a/db/expire.go +++ b/db/expire.go @@ -3,6 +3,9 @@ package db import ( "bytes" "context" + "fmt" + "hash/crc32" + "sync" "time" "github.com/distributedio/titan/conf" @@ -21,9 +24,37 @@ var ( ) const ( - expire_worker = "expire" + expire_worker = "expire" + EXPIRE_HASH_NUM = 256 ) +type LeaderStatus struct { + isLeader bool + cond *sync.Cond +} + +func NewLeaderStatus() *LeaderStatus { + return &LeaderStatus{ + cond: sync.NewCond(new(sync.Mutex)), + } +} + +func (ls *LeaderStatus) 
setIsLeader(isLeader bool) { + ls.cond.L.Lock() + defer ls.cond.L.Unlock() + + ls.isLeader = isLeader + ls.cond.Broadcast() +} + +func (ls *LeaderStatus) getIsLeader() bool { + ls.cond.L.Lock() + defer ls.cond.L.Unlock() + + ls.cond.Wait() + return ls.isLeader +} + // IsExpired judge object expire through now func IsExpired(obj *Object, now int64) bool { if obj.ExpireAt == 0 || obj.ExpireAt > now { @@ -33,8 +64,12 @@ func IsExpired(obj *Object, now int64) bool { } func expireKey(key []byte, ts int64) []byte { + hashnum := crc32.ChecksumIEEE(key) + hashPrefix := fmt.Sprintf("%04d", hashnum%EXPIRE_HASH_NUM) var buf []byte buf = append(buf, expireKeyPrefix...) + buf = append(buf, []byte(hashPrefix)...) + buf = append(buf, ':') buf = append(buf, EncodeInt64(ts)...) buf = append(buf, ':') buf = append(buf, key...) @@ -82,21 +117,21 @@ func unExpireAt(txn store.Transaction, mkey []byte, expireAt int64) error { return nil } -// StartExpire get leader from db -func StartExpire(db *DB, conf *conf.Expire) error { +// setExpireIsLeader get leader from db +func setExpireIsLeader(db *DB, conf *conf.Expire, ls *LeaderStatus) error { ticker := time.NewTicker(conf.Interval) defer ticker.Stop() id := UUID() - lastExpireEndTs := int64(0) for range ticker.C { if conf.Disable { + ls.setIsLeader(false) continue } - start := time.Now() isLeader, err := isLeader(db, sysExpireLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[Expire] check expire leader failed", zap.Error(err)) + ls.setIsLeader(false) continue } if !isLeader { @@ -105,14 +140,29 @@ func StartExpire(db *DB, conf *conf.Expire) error { zap.ByteString("uuid", id), zap.Duration("leader-life-time", conf.LeaderLifeTime)) } + ls.setIsLeader(isLeader) continue } - lastExpireEndTs = runExpire(db, conf.BatchLimit, lastExpireEndTs) - metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + ls.setIsLeader(isLeader) } return nil } +func startExpire(db *DB, conf *conf.Expire, ls *LeaderStatus, expireHash string) { + ticker := time.NewTicker(conf.Interval) + defer ticker.Stop() + lastExpireEndTs := int64(0) + for range ticker.C { + if !ls.getIsLeader() { + continue + } + + start := time.Now() + lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + } +} + // split a meta key with format: {namespace}:{id}:M:{key} func splitMetaKey(key []byte) ([]byte, DBID, []byte) { idx := bytes.Index(key, []byte{':'}) @@ -142,10 +192,25 @@ func toTikvScorePrefix(namespace []byte, id DBID, key []byte) []byte { return b } -func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { +func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) int64 { + curExpireTimestampOffset := expireTimestampOffset + curExpireMetakeyOffset := expireMetakeyOffset + var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine + curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) + var expireLogFlag string + if expireHash != "" { + curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) 
+ curExpireKeyPrefix = append(curExpireKeyPrefix, ':') + curExpireTimestampOffset += len(expireHash) + len(":") + curExpireMetakeyOffset += len(expireHash) + len(":") + expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) + } else { + expireLogFlag = "[Expire]" + } + txn, err := db.Begin() if err != nil { - zap.L().Error("[Expire] txn begin failed", zap.Error(err)) + zap.L().Error(expireLogFlag+" txn begin failed", zap.Error(err)) return 0 } @@ -154,26 +219,26 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { //we seek end in "at:" replace in "at;" , it can reduce the seek range and seek the deleted expired keys as little as possible. //the behavior should reduce the expire delay in days and get/mget timeout, which are caused by rocksdb tomstone problem var endPrefix []byte - endPrefix = append(endPrefix, expireKeyPrefix...) + endPrefix = append(endPrefix, curExpireKeyPrefix...) endPrefix = append(endPrefix, EncodeInt64(now+1)...) var startPrefix []byte if lastExpireEndTs > 0 { - startPrefix = append(startPrefix, expireKeyPrefix...) + startPrefix = append(startPrefix, curExpireKeyPrefix...) startPrefix = append(startPrefix, EncodeInt64(lastExpireEndTs)...) startPrefix = append(startPrefix, ':') } else { - startPrefix = expireKeyPrefix + startPrefix = curExpireKeyPrefix } start := time.Now() iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" seek expire keys"); logEnv != nil { logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } if err != nil { - zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) + zap.L().Error(expireLogFlag+" seek failed", zap.ByteString("prefix", curExpireKeyPrefix), zap.Error(err)) txn.Rollback() return 0 } @@ -181,31 +246,31 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { thisExpireEndTs := int64(0) ts := now - for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { + for iter.Valid() && iter.Key().HasPrefix(curExpireKeyPrefix) && limit > 0 { rawKey := iter.Key() - ts = DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) + ts = DecodeInt64(rawKey[curExpireTimestampOffset : curExpireTimestampOffset+8]) if ts > now { - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] not need to expire key"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" not need to expire key"); logEnv != nil { logEnv.Write(zap.String("raw-key", string(rawKey)), zap.Int64("last-timestamp", ts)) } break } - mkey := rawKey[expireMetakeyOffset:] - if err := doExpire(txn, mkey, iter.Value()); err != nil { + mkey := rawKey[curExpireMetakeyOffset:] + if err := doExpire(txn, mkey, iter.Value(), expireLogFlag, ts); err != nil { txn.Rollback() return 0 } // Remove from expire list if err := txn.t.Delete(rawKey); err != nil { - zap.L().Error("[Expire] delete failed", + zap.L().Error(expireLogFlag+" delete failed", zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() return 0 } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete expire list item"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" delete expire list item"); logEnv != nil { logEnv.Write(zap.Int64("ts", ts), zap.ByteString("mkey", mkey)) } @@ -216,7 
+281,7 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) } if err != nil { - zap.L().Error("[Expire] next failed", + zap.L().Error(expireLogFlag+" next failed", zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() @@ -233,13 +298,11 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { } now = time.Now().UnixNano() - diff := (ts - now) / int64(time.Second) - if diff >= 0 { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(0) + if ts < now { + diff := (now - ts) / int64(time.Second) + metrics.GetMetrics().ExpireDelaySecondsVec.WithLabelValues("delay-" + expireHash).Set(float64(diff)) } else { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) + metrics.GetMetrics().ExpireDelaySecondsVec.WithLabelValues("delay-" + expireHash).Set(0) } start = time.Now() @@ -247,10 +310,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if err != nil { txn.Rollback() - zap.L().Error("[Expire] commit failed", zap.Error(err)) + zap.L().Error(expireLogFlag+" commit failed", zap.Error(err)) } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] expired end"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" expired end"); logEnv != nil { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } @@ -258,10 +321,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { return thisExpireEndTs } -func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { +func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte, expireLogFlag string) error { dkey := toTikvDataKey(namespace, dbid, id) if err := gc(txn.t, dkey); err != nil { - zap.L().Error("[Expire] gc failed", + zap.L().Error(expireLogFlag+" gc failed", zap.ByteString("key", key), zap.ByteString("namepace", namespace), zap.Int64("db_id", int64(dbid)), @@ -269,17 +332,18 @@ func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) er zap.Error(err)) return err } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] gc data key"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" gc data key"); logEnv != nil { logEnv.Write(zap.ByteString("obj_id", id)) } return nil } -func doExpire(txn *Transaction, mkey, id []byte) error { + +func doExpire(txn *Transaction, mkey, id []byte, expireLogFlag string, expireAt int64) error { namespace, dbid, key := splitMetaKey(mkey) obj, err := getObject(txn, mkey) // Check for dirty data due to copying or flushdb/flushall if err == ErrKeyNotFound { - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) } if err != nil { return err @@ -288,23 +352,38 @@ func doExpire(txn *Transaction, mkey, id []byte) error { if len(id) > idLen { id = id[:idLen] } + + //if a not-string structure haven't been deleted and set by user again after expire-time, because the expire go-routine is too slow and delayed. 
+ //the id in old expire-keys's value is different with the new one in the new key's value + //so comparing id in doExpire() is necessary and also can handle below scenarios(should just delete old id object's data): + //a not-string structure was set with unhashed expire-key, and then deleted and set again with hashed expire-key + //or a string was set with unhashed expire-key, and set again with hashed expire-key if !bytes.Equal(obj.ID, id) { - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) + } + + //compare expire-key's ts with object.expireat(their object id is same in the condition), + //if different, means it's a not-string structure and its expire-key was rewriten in hashed prefix, but old ones was writen in unhashed prefix + if obj.ExpireAt != expireAt { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" it should be unhashed expire key un-matching key's expireAt, skip doExpire"); logEnv != nil { + logEnv.Write(zap.ByteString("mkey", mkey), zap.Int64("this expire key's ts", expireAt), zap.Int64("key's expireAt", obj.ExpireAt)) + } + return nil } // Delete object meta if err := txn.t.Delete(mkey); err != nil { - zap.L().Error("[Expire] delete failed", + zap.L().Error(expireLogFlag+" delete failed", zap.ByteString("key", key), zap.Error(err)) return err } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete metakey"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" delete metakey"); logEnv != nil { logEnv.Write(zap.ByteString("mkey", mkey)) } if obj.Type == ObjectString { return nil } - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) } diff --git a/db/expire_test.go b/db/expire_test.go index 2a75c850..4d30186a 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -3,6 +3,7 @@ package db import ( "bytes" "context" + "fmt" "testing" "time" @@ -111,7 +112,10 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) - runExpire(txn.db, 1, 0) + for i := 0; i < EXPIRE_HASH_NUM; i++ { + expireHash := fmt.Sprintf("%04d", i) + runExpire(txn.db, 1, expireHash, 0) + } txn.Commit(context.TODO()) txn = getTxn(t) @@ -183,9 +187,10 @@ func Test_doExpire(t *testing.T) { } tests := []struct { - name string - args args - want want + name string + args args + want want + expireAt int64 }{ { name: "TestExpiredHash", @@ -196,6 +201,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredRewriteHash", @@ -206,6 +212,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: expireAt, }, { name: "TestExpiredNotExistsMeta", @@ -216,6 +223,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredHash_dirty_data", @@ -227,6 +235,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredRewriteHash_dirty_data", @@ -238,6 +247,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: expireAt, }, } for _, tt := range tests { @@ -247,7 +257,7 @@ func Test_doExpire(t *testing.T) { if tt.args.tp == byte(ObjectHash) { id = append(id, tt.args.tp) } - err := doExpire(txn, tt.args.mkey, id) + err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) txn.Commit(context.TODO()) assert.NoError(t, err) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 5e72b959..f057471e 100644 
--- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -49,13 +49,13 @@ type Metrics struct { ConnectionOnlineGaugeVec *prometheus.GaugeVec //command - ZTInfoCounterVec *prometheus.CounterVec - IsLeaderGaugeVec *prometheus.GaugeVec - ExpireLeftSecondsVec *prometheus.GaugeVec - LimiterQpsVec *prometheus.GaugeVec - LimiterRateVec *prometheus.GaugeVec - LRangeSeekHistogram prometheus.Histogram - GCKeysCounterVec *prometheus.CounterVec + ZTInfoCounterVec *prometheus.CounterVec + IsLeaderGaugeVec *prometheus.GaugeVec + ExpireDelaySecondsVec *prometheus.GaugeVec + LimiterQpsVec *prometheus.GaugeVec + LimiterRateVec *prometheus.GaugeVec + LRangeSeekHistogram prometheus.Histogram + GCKeysCounterVec *prometheus.CounterVec //expire ExpireKeysTotal *prometheus.CounterVec @@ -191,13 +191,13 @@ func init() { }, bizLabel) prometheus.MustRegister(gm.ConnectionOnlineGaugeVec) - gm.ExpireLeftSecondsVec = prometheus.NewGaugeVec( + gm.ExpireDelaySecondsVec = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, - Name: "expire_left_seconds", - Help: "The seconds after which from now will do expire", + Name: "expire_delay_seconds", + Help: "how mach the processing expire-ts delay to now", }, expireLabel) - prometheus.MustRegister(gm.ExpireLeftSecondsVec) + prometheus.MustRegister(gm.ExpireDelaySecondsVec) gm.LimiterQpsVec = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -278,6 +278,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_round_cost", + Buckets: prometheus.ExponentialBuckets(0.02, 2, 8), Help: "the cost of a round expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerRoundCostHistogramVec) @@ -286,6 +287,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_seek_cost", + Buckets: prometheus.ExponentialBuckets(0.002, 2, 12), Help: "the cost of tikv seek in expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerSeekCostHistogramVec) @@ -294,6 +296,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_commit_cost", + Buckets: prometheus.ExponentialBuckets(0.005, 2, 6), Help: "the cost of commit in expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerCommitCostHistogramVec) From fe6e84b5548a443bc4629229c28982fbe25a5e9e Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 24 Apr 2020 19:35:08 +0800 Subject: [PATCH 58/60] 1 fix runExpire's problem that unhashed goroutine scan hashed expire-key 2 fix expire_test.go: gc key can get before runExpire/doExpire if expireat < now; add 2 testCases for hash/string expire: check key is deleted and gc key exists after runExpire --- db/expire.go | 14 +++--- db/expire_test.go | 115 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 115 insertions(+), 14 deletions(-) diff --git a/db/expire.go b/db/expire.go index 4a07f1aa..44cb2953 100644 --- a/db/expire.go +++ b/db/expire.go @@ -15,8 +15,9 @@ import ( ) var ( - expireKeyPrefix = []byte("$sys:0:at:") - sysExpireLeader = []byte("$sys:0:EXL:EXLeader") + expireKeyPrefix = []byte("$sys:0:at:") + hashExpireKeyPrefix = expireKeyPrefix[:len(expireKeyPrefix)-1] + sysExpireLeader = []byte("$sys:0:EXL:EXLeader") // $sys:0:at:{ts}:{metaKey} expireTimestampOffset = len(expireKeyPrefix) @@ -67,7 +68,7 @@ func expireKey(key []byte, ts int64) []byte { hashnum := crc32.ChecksumIEEE(key) hashPrefix := fmt.Sprintf("%04d", hashnum%EXPIRE_HASH_NUM) var buf []byte - buf = append(buf, expireKeyPrefix...) + buf = append(buf, hashExpireKeyPrefix...) 
buf = append(buf, []byte(hashPrefix)...) buf = append(buf, ':') buf = append(buf, EncodeInt64(ts)...) @@ -196,15 +197,16 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireTimestampOffset := expireTimestampOffset curExpireMetakeyOffset := expireMetakeyOffset var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine - curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) var expireLogFlag string if expireHash != "" { + curExpireKeyPrefix = append(curExpireKeyPrefix, hashExpireKeyPrefix...) curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) curExpireKeyPrefix = append(curExpireKeyPrefix, ':') - curExpireTimestampOffset += len(expireHash) + len(":") - curExpireMetakeyOffset += len(expireHash) + len(":") + curExpireTimestampOffset += len(expireHash) + curExpireMetakeyOffset += len(expireHash) expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) } else { + curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) expireLogFlag = "[Expire]" } diff --git a/db/expire_test.go b/db/expire_test.go index 4d30186a..dbb1a4f1 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -18,7 +18,7 @@ func getTxn(t *testing.T) *Transaction { return txn } -func Test_runExpire(t *testing.T) { +func Test_setExpired_runExpire(t *testing.T) { hashKey := []byte("TestExpiredHash") strKey := []byte("TestExpiredString") expireAt := (time.Now().Unix() - 30) * int64(time.Second) @@ -112,6 +112,99 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) + gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, id)) + + _, err := txn.t.Get(gcKey) + txn.Commit(context.TODO()) + if tt.want.gckey { + assert.NoError(t, err) + } else { + assert.Equal(t, true, store.IsErrNotFound(err)) + } + }) + } + +} + +func Test_runExpire(t *testing.T) { + hashKey := []byte("TestHashToExpire") + strKey := []byte("TestStringToExpire") + expireAt := (time.Now().Unix() + 1) * int64(time.Second) + hashCall := func(t *testing.T, key []byte) []byte { + hash, txn, err := getHash(t, []byte(key)) + oldID := hash.meta.ID + assert.NoError(t, err) + assert.NotNil(t, txn) + assert.NotNil(t, hash) + hash.HSet([]byte("field1"), []byte("val")) + + kv := GetKv(txn) + err = kv.ExpireAt([]byte(key), expireAt) + assert.NoError(t, err) + txn.Commit(context.TODO()) + return oldID + } + + stringCall := func(t *testing.T, key []byte) []byte { + txn := getTxn(t) + str, err := GetString(txn, key) + oldID := str.Meta.ID + assert.NoError(t, err) + assert.NotNil(t, txn) + assert.NotNil(t, str) + str.Set([]byte("value"), 0) + + kv := GetKv(txn) + err = kv.ExpireAt([]byte(key), expireAt) + assert.NoError(t, err) + txn.Commit(context.TODO()) + return oldID + } + + type args struct { + key []byte + call func(*testing.T, []byte) []byte + } + type want struct { + gckey bool + } + + tests := []struct { + name string + args args + want want + }{ + { + name: "TestHashExpire", + args: args{ + key: hashKey, + call: hashCall, + }, + want: want{ + gckey: true, + }, + }, + { + name: "TestStringExpire", + args: args{ + key: strKey, + call: stringCall, + }, + want: want{ + gckey: false, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + id := tt.args.call(t, tt.args.key) + txn := getTxn(t) + now := time.Now().UnixNano() + if now < expireAt { + time.Sleep(time.Duration(expireAt - now)) + } + runExpire(txn.db, 1, "", 0) for i := 0; i < EXPIRE_HASH_NUM; i++ { expireHash := 
fmt.Sprintf("%04d", i) runExpire(txn.db, 1, expireHash, 0) @@ -119,21 +212,25 @@ func Test_runExpire(t *testing.T) { txn.Commit(context.TODO()) txn = getTxn(t) + metaKey := MetaKey(txn.db, tt.args.key) + _, err := txn.t.Get(metaKey) + assert.Equal(t, true, store.IsErrNotFound(err)) + gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, id)) - _, err := txn.t.Get(gcKey) + _, err = txn.t.Get(gcKey) txn.Commit(context.TODO()) if tt.want.gckey { assert.NoError(t, err) } else { assert.Equal(t, true, store.IsErrNotFound(err)) } + expireAt = (time.Now().Unix() + 1) * int64(time.Second) }) } - } -func Test_doExpire(t *testing.T) { +func Test_setExpired_doExpire(t *testing.T) { initHash := func(t *testing.T, key []byte) []byte { hash, txn, err := getHash(t, key) assert.NoError(t, err) @@ -257,14 +354,16 @@ func Test_doExpire(t *testing.T) { if tt.args.tp == byte(ObjectHash) { id = append(id, tt.args.tp) } - err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) - txn.Commit(context.TODO()) - assert.NoError(t, err) + if tt.expireAt == 0 { + err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) + txn.Commit(context.TODO()) + assert.NoError(t, err) + } txn = getTxn(t) gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, tt.args.id)) - _, err = txn.t.Get(gcKey) + _, err := txn.t.Get(gcKey) txn.Commit(context.TODO()) if tt.want.gckey { assert.NoError(t, err) } else { assert.Equal(t, true, store.IsErrNotFound(err)) From 977b4578248f8bdcefbb6a0728e89ac3d55249d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sun, 26 Apr 2020 12:27:04 +0800 Subject: [PATCH 59/60] unhashed expire goroutine uses an individual limit configuration item and individual expired&round&seek&commit metrics labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conf/config.go | 9 +++++---- conf/mockconfig.go | 9 +++++---- conf/titan.toml | 10 ++++++++-- db/expire.go | 30 ++++++++++++++++++++++-------- 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/conf/config.go b/conf/config.go index 6d88c2ac..5d767090 100644 --- a/conf/config.go +++ b/conf/config.go @@ -66,10 +66,11 @@ type GC struct { // Expire config is the config of Titan expire work type Expire struct { - Disable bool `cfg:"disable; false; boolean; false is used to disable expire"` - Interval time.Duration `cfg:"interval;1s;;expire work tick interval"` - LeaderLifeTime time.Duration `cfg:"leader-life-time;3m;;lease flush leader interval"` - BatchLimit int `cfg:"batch-limit;256;numeric;key count limitation per-transection"` + Disable bool `cfg:"disable; false; boolean; false is used to disable expire"` + Interval time.Duration `cfg:"interval;1s;;expire work tick interval"` + LeaderLifeTime time.Duration `cfg:"leader-life-time;3m;;lease flush leader interval"` + BatchLimit int `cfg:"batch-limit;10;numeric;hashed expire-key count limitation per-transaction"` + UnhashBatchLimit int `cfg:"unhash-batch-limit;256;numeric;unhashed expire-key count limitation per-transaction"` } // ZT config is the config of zlist diff --git a/conf/mockconfig.go b/conf/mockconfig.go index 9a23a2bb..faacdfd1 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -14,10 +14,11 @@ func MockConf() *Titan { BatchLimit: 256, }, Expire: Expire{ - Disable: false, - Interval: time.Second, - LeaderLifeTime: 3 * time.Minute, - BatchLimit: 256, + Disable: false, + Interval: time.Second, + LeaderLifeTime: 3 * time.Minute, + BatchLimit: 10, + UnhashBatchLimit: 256, }, ZT: ZT{ Disable: false, diff --git a/conf/titan.toml 
b/conf/titan.toml index 0c5375dd..400b4b99 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -129,9 +129,15 @@ pd-addrs = "mocktikv://" #type: int #rules: numeric -#description: key count limitation per-transection +#description: hashed expire-key count limitation per-transection +#default: 10 +#batch-limit = 10 + +#type: int +#rules: numeric +#description: unhashed expire-key count limitation per-transection #default: 256 -#batch-limit = 256 +#unhash-batch-limit = 256 [tikv.zt] diff --git a/db/expire.go b/db/expire.go index 44cb2953..b174a141 100644 --- a/db/expire.go +++ b/db/expire.go @@ -25,8 +25,9 @@ var ( ) const ( - expire_worker = "expire" - EXPIRE_HASH_NUM = 256 + expire_worker = "expire" + expire_unhash_worker = "expire-unhash" + EXPIRE_HASH_NUM = 256 ) type LeaderStatus struct { @@ -159,8 +160,14 @@ func startExpire(db *DB, conf *conf.Expire, ls *LeaderStatus, expireHash string) } start := time.Now() - lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) - metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if expireHash != "" { + lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + } else { + lastExpireEndTs = runExpire(db, conf.UnhashBatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_unhash_worker).Observe(time.Since(start).Seconds()) + } + } } @@ -198,6 +205,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireMetakeyOffset := expireMetakeyOffset var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine var expireLogFlag string + var metricsLabel string if expireHash != "" { curExpireKeyPrefix = append(curExpireKeyPrefix, hashExpireKeyPrefix...) curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) @@ -205,9 +213,11 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireTimestampOffset += len(expireHash) curExpireMetakeyOffset += len(expireHash) expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) + metricsLabel = expire_worker } else { curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) 
expireLogFlag = "[Expire]" + metricsLabel = expire_unhash_worker } txn, err := db.Begin() @@ -235,7 +245,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) start := time.Now() iter, err := txn.t.Iter(startPrefix, endPrefix) - metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(metricsLabel).Observe(time.Since(start).Seconds()) if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" seek expire keys"); logEnv != nil { logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } @@ -280,7 +290,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) err := iter.Next() cost := time.Since(start) if cost >= time.Millisecond { - metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(metricsLabel).Observe(cost.Seconds()) } if err != nil { zap.L().Error(expireLogFlag+" next failed", @@ -309,7 +319,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) start = time.Now() err = txn.Commit(context.Background()) - metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(metricsLabel).Observe(time.Since(start).Seconds()) if err != nil { txn.Rollback() zap.L().Error(expireLogFlag+" commit failed", zap.Error(err)) @@ -319,7 +329,11 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } - metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + if expireHash != "" { + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + } else { + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired-unhash").Add(float64(batchLimit - limit)) + } return thisExpireEndTs } From 9161d7e5156bb2fac37f032456ea965b10d40853 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sun, 26 Apr 2020 15:10:53 +0800 Subject: [PATCH 60/60] buckets of comand/limit/commandFunc/txn commit cost is too many to load for dashboard, replace 1.4 with 2, buckets num decrease 50% --- metrics/prometheus.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index f057471e..7eb897c0 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -91,7 +91,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 20), Help: "The cost times of command call", }, multiLabel) prometheus.MustRegister(gm.CommandCallHistogramVec) @@ -125,7 +125,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "limit_cost_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), + Buckets: prometheus.ExponentialBuckets(0.0002, 2, 10), Help: "the cost times of command execute's limit", }, multiLabel) prometheus.MustRegister(gm.LimitCostHistogramVec) @@ -143,7 +143,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), + Buckets: 
prometheus.ExponentialBuckets(0.0005, 2, 20), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) @@ -152,7 +152,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "txn_commit_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 15), Help: "The cost times of txn commit", }, multiLabel) prometheus.MustRegister(gm.TxnCommitHistogramVec) @@ -219,7 +219,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "lrange_seek_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 15), Help: "The cost times of list lrange seek", }) prometheus.MustRegister(gm.LRangeSeekHistogram)
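
Note on the bucket change in PATCH 60/60: prometheus.ExponentialBuckets(start, factor, count) returns count upper bounds growing geometrically from start, so 20 buckets at factor 2 starting at 0.5ms top out around 262s, roughly the same ceiling as the previous 40 buckets at factor 1.4 (about 250s), while halving the number of bucket series each histogram exports. The small standalone Go sketch below is not part of the patch series; it only calls the upstream client_golang API to print both layouts for comparison.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Previous layout: 40 buckets growing by a factor of 1.4 from 0.5ms.
	oldBuckets := prometheus.ExponentialBuckets(0.0005, 1.4, 40)
	// Layout after PATCH 60/60: 20 buckets growing by a factor of 2 from 0.5ms.
	newBuckets := prometheus.ExponentialBuckets(0.0005, 2, 20)

	// Both top boundaries land in the same few-minute range (~250s vs ~262s),
	// so the histograms keep their coverage with half the buckets.
	fmt.Printf("old: %d buckets, top boundary %.1fs\n", len(oldBuckets), oldBuckets[len(oldBuckets)-1])
	fmt.Printf("new: %d buckets, top boundary %.1fs\n", len(newBuckets), newBuckets[len(newBuckets)-1])
}

The same trade-off applies to the limit_cost_seconds, command_func_done_seconds, txn_commit_seconds, and lrange_seek_duration_seconds histograms touched by the patch: a coarser growth factor gives up some per-bucket resolution but reduces the number of exported series, which is the dashboard-load concern the commit message describes.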