From db84a399042ed4f9b1cbf7a96fc463a91edcb3c4 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 22:56:54 +0800 Subject: [PATCH 01/60] fix empty command which cause titan to crash in parsing command --- client.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/client.go b/client.go index f02360e5..a8523bba 100644 --- a/client.go +++ b/client.go @@ -102,6 +102,13 @@ func (c *client) serve(conn net.Conn) error { c.server.servCtx.Pause = 0 } + if len(cmd) <= 0 { + zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID)) + resp.ReplyError(c, command.ErrEmptyArray.Error()) + continue + } + c.cliCtx.Updated = time.Now() c.cliCtx.LastCmd = cmd[0] From 3e4b89e0f684b9161e528d95c9c078564f02f427 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 23:03:01 +0800 Subject: [PATCH 02/60] change empty command's reply error from ErrEmptyArray to ErrEmptyCommand --- client.go | 2 +- command/error.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index a8523bba..5e7467f6 100644 --- a/client.go +++ b/client.go @@ -105,7 +105,7 @@ func (c *client) serve(conn net.Conn) error { if len(cmd) <= 0 { zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) - resp.ReplyError(c, command.ErrEmptyArray.Error()) + resp.ReplyError(c, command.ErrEmptyCommand.Error()) continue } diff --git a/command/error.go b/command/error.go index e4b46287..5ae788c4 100644 --- a/command/error.go +++ b/command/error.go @@ -91,6 +91,8 @@ var ( // ErrEmptyArray error ErrEmptyArray = errors.New("EmptyArray error") + ErrEmptyCommand = errors.New("command is empty") + //ErrExec exec without multi ErrExec = errors.New("ERR EXEC without MULTI") From c89b27f4e1de6356b08665b255c5f8c3ec72b1a9 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Sep 2019 23:23:38 +0800 Subject: [PATCH 03/60] close the connection when meet empty command --- client.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index 5e7467f6..cf6e5834 100644 --- a/client.go +++ b/client.go @@ -106,7 +106,8 @@ func (c *client) serve(conn net.Conn) error { zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) resp.ReplyError(c, command.ErrEmptyCommand.Error()) - continue + c.conn.Close() + return nil } c.cliCtx.Updated = time.Now() From 43bd6c355fc5765e1f2f14ca4963ea44a1774198 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Sep 2019 15:30:01 +0800 Subject: [PATCH 04/60] some illegal clients send unreadable command which may cause titan crash. 
just after read the command we check if it is in command list we support, else will skip it --- client.go | 12 ++++++++++-- command/command.go | 26 ++++++++++++++++---------- command/init.go | 2 ++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/client.go b/client.go index cf6e5834..87f1348c 100644 --- a/client.go +++ b/client.go @@ -103,15 +103,23 @@ func (c *client) serve(conn net.Conn) error { } if len(cmd) <= 0 { - zap.L().Error("command is empty", zap.String("addr", c.cliCtx.RemoteAddr), + err := command.ErrEmptyCommand + zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) - resp.ReplyError(c, command.ErrEmptyCommand.Error()) + resp.ReplyError(c, err.Error()) c.conn.Close() return nil } c.cliCtx.Updated = time.Now() c.cliCtx.LastCmd = cmd[0] + if !c.exec.CanExecute(c.cliCtx.LastCmd) { + err := command.ErrUnKnownCommand(c.cliCtx.LastCmd) + zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID)) + resp.ReplyError(c, err.Error()) + continue + } ctx := &command.Context{ Name: cmd[0], diff --git a/command/command.go b/command/command.go index 61e26af0..186105ec 100644 --- a/command/command.go +++ b/command/command.go @@ -71,7 +71,7 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { continue } resp.ReplyBulkString(w, string(a[i])) - if i % 10 == 9 { + if i%10 == 9 { zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) start = time.Now() } @@ -80,9 +80,9 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { } func BytesArrayOnce(w io.Writer, a [][]byte) OnCommit { - return func() { - resp.ReplyStringArray(w, a) - } + return func() { + resp.ReplyStringArray(w, a) + } } // TxnCommand runs a command in transaction @@ -182,18 +182,18 @@ func AutoCommit(cmd TxnCommand) Command { return func(ctx *Context) { retry.Ensure(ctx, func() error { mt := metrics.GetMetrics() - start := time.Now() + start := time.Now() txn, err := ctx.Client.DB.Begin() key := "" if len(ctx.Args) > 0 { key = ctx.Args[0] if len(ctx.Args) > 1 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args)-1)) + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args) - 1)) } } cost := time.Since(start).Seconds() - zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) - mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) + mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, "ERR "+err.Error()) @@ -277,9 +277,9 @@ func feedMonitors(ctx *Context) { id := strconv.FormatInt(int64(ctx.Client.DB.ID), 10) line := ts + " [" + id + " " + ctx.Client.RemoteAddr + "]" + " " + ctx.Name + " " + strings.Join(ctx.Args, " ") - start := time.Now() + start := time.Now() err := resp.ReplySimpleString(mCtx.Out, line) - zap.L().Debug("feedMonitors reply", zap.String("name", ctx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) + zap.L().Debug("feedMonitors reply", zap.String("name", ctx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) if err != nil { 
ctx.Server.Monitors.Delete(k) } @@ -299,6 +299,12 @@ func NewExecutor() *Executor { return &Executor{txnCommands: txnCommands, commands: commands} } +func (e *Executor) CanExecute(cmd string) bool { + lowerName := strings.ToLower(cmd) + _, ok := commands[lowerName] + return ok +} + // Execute a command func (e *Executor) Execute(ctx *Context) { start := time.Now() diff --git a/command/init.go b/command/init.go index 5e1189f3..fa62cf15 100644 --- a/command/init.go +++ b/command/init.go @@ -117,6 +117,8 @@ func init() { // transactions, exec and discard should called explicitly, so they are registered here "multi": Desc{Proc: Multi, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, + "exec": Desc{Proc: Exec, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, + "discard": Desc{Proc: Discard, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, "watch": Desc{Proc: Watch, Cons: Constraint{-2, flags("sF"), 1, -1, 1}}, "unwatch": Desc{Proc: Unwatch, Cons: Constraint{1, flags("sF"), 0, 0, 0}}, From 95bd04b6e08ba54615c2d0fab3c71f0f7df4753b Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Sep 2019 18:31:08 +0800 Subject: [PATCH 05/60] if a connection send unknown commands 3 times, close it --- client.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/client.go b/client.go index 87f1348c..2728481b 100644 --- a/client.go +++ b/client.go @@ -78,6 +78,7 @@ func (c *client) serve(conn net.Conn) error { var cmd []string var err error + unknownCmdTimes := int(0) for { select { case <-c.cliCtx.Done: @@ -118,7 +119,13 @@ func (c *client) serve(conn net.Conn) error { zap.L().Error(err.Error(), zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID)) resp.ReplyError(c, err.Error()) - continue + unknownCmdTimes++ + if unknownCmdTimes >= 3 { + c.conn.Close() + return nil + } else { + continue + } } ctx := &command.Context{ From 63b131d12986f3d73a8146d8dc311c84e253c482 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 18 Sep 2019 20:00:26 +0800 Subject: [PATCH 06/60] add expire left seconds metrics --- db/expire.go | 13 ++++++++++--- metrics/prometheus.go | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/db/expire.go b/db/expire.go index 76d769b4..4528dac1 100644 --- a/db/expire.go +++ b/db/expire.go @@ -151,9 +151,10 @@ func runExpire(db *DB, batchLimit int) { limit := batchLimit now := time.Now().UnixNano() + ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { rawKey := iter.Key() - ts := DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) + ts = DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) if ts > now { if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] not need to expire key"); logEnv != nil { logEnv.Write(zap.String("raw-key", string(rawKey)), zap.Int64("last-timestamp", ts)) @@ -198,10 +199,17 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } + diff := (ts - now) / int64(time.Second) + if diff >= 0 { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + } else { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + } + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) } -func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte)error{ +func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { dkey := toTikvDataKey(namespace, dbid, id) if err := 
gc(txn.t, dkey); err != nil { zap.L().Error("[Expire] gc failed", @@ -251,4 +259,3 @@ func doExpire(txn *Transaction, mkey, id []byte) error { } return gcDataKey(txn, namespace, dbid, key, id) } - diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 8c7f42be..c1f0ab60 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -45,10 +45,11 @@ type Metrics struct { ConnectionOnlineGaugeVec *prometheus.GaugeVec //command - ZTInfoCounterVec *prometheus.CounterVec - IsLeaderGaugeVec *prometheus.GaugeVec - LRangeSeekHistogram prometheus.Histogram - GCKeysCounterVec *prometheus.CounterVec + ZTInfoCounterVec *prometheus.CounterVec + IsLeaderGaugeVec *prometheus.GaugeVec + ExpireLeftSecondsVec *prometheus.GaugeVec + LRangeSeekHistogram prometheus.Histogram + GCKeysCounterVec *prometheus.CounterVec //expire ExpireKeysTotal *prometheus.CounterVec @@ -171,6 +172,14 @@ func init() { }, bizLabel) prometheus.MustRegister(gm.ConnectionOnlineGaugeVec) + gm.ExpireLeftSecondsVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "expire_left_seconds", + Help: "The seconds after which from now will do expire", + }, expireLabel) + prometheus.MustRegister(gm.ExpireLeftSecondsVec) + gm.LRangeSeekHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, From a9c28fe2209fb29a1b63f37dbef039703ea763cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 18 Sep 2019 20:46:37 +0800 Subject: [PATCH 07/60] fix expire left time calculating --- db/expire.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/db/expire.go b/db/expire.go index 4528dac1..4b21b82c 100644 --- a/db/expire.go +++ b/db/expire.go @@ -190,6 +190,14 @@ func runExpire(db *DB, batchLimit int) { limit-- } + now = time.Now().UnixNano() + diff := (ts - now) / int64(time.Second) + if diff >= 0 { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + } else { + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + } + if err := txn.Commit(context.Background()); err != nil { txn.Rollback() zap.L().Error("[Expire] commit failed", zap.Error(err)) @@ -199,13 +207,6 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } - diff := (ts - now) / int64(time.Second) - if diff >= 0 { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) - } else { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) - } - metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) } From e5ff5013b4c8235fe9a28d634bdbfd20b09398a8 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 19 Sep 2019 11:13:29 +0800 Subject: [PATCH 08/60] when set left/delay current seconds(expire), also set other seconds to 0 --- db/expire.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/expire.go b/db/expire.go index 4b21b82c..36acc2b6 100644 --- a/db/expire.go +++ b/db/expire.go @@ -194,8 +194,10 @@ func runExpire(db *DB, batchLimit int) { diff := (ts - now) / int64(time.Second) if diff >= 0 { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(0) } else { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) + metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) } if err := txn.Commit(context.Background()); 
err != nil { From fb11c671ae648fc3fc7b3f8c01a2f60d5e9507b5 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 25 Oct 2019 15:28:22 +0800 Subject: [PATCH 09/60] delete zadd args output --- command/zsets.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/command/zsets.go b/command/zsets.go index 35a1813d..6eb930c3 100644 --- a/command/zsets.go +++ b/command/zsets.go @@ -2,7 +2,6 @@ package command import ( "errors" - "fmt" "math" "strconv" "strings" @@ -14,8 +13,6 @@ import ( func ZAdd(ctx *Context, txn *db.Transaction) (OnCommit, error) { key := []byte(ctx.Args[0]) - fmt.Println("zadd", ctx.Args) - kvs := ctx.Args[1:] if len(kvs)%2 != 0 { return nil, errors.New("ERR syntax error") From 586b30d0ab9fb0653cef4c81a05c0a1e1a11246f Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 7 Nov 2019 20:02:40 +0800 Subject: [PATCH 10/60] first version of rate limit --- command/command.go | 1 + conf/config.go | 23 +++++--- db/ratelimiter.go | 134 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 db/ratelimiter.go diff --git a/command/command.go b/command/command.go index 186105ec..bbac665a 100644 --- a/command/command.go +++ b/command/command.go @@ -159,6 +159,7 @@ func Call(ctx *Context) { feedMonitors(ctx) start := time.Now() + // judge ctx.name and ctx.args cmdInfoCommand.Proc(ctx) cost := time.Since(start) diff --git a/conf/config.go b/conf/config.go index d97adb9a..e677a415 100644 --- a/conf/config.go +++ b/conf/config.go @@ -1,6 +1,8 @@ package conf -import "time" +import ( + "time" +) // Titan configuration center type Titan struct { @@ -34,12 +36,13 @@ type Server struct { // Tikv config is the config of tikv sdk type Tikv struct { - PdAddrs string `cfg:"pd-addrs;required; ;pd address in tidb"` - DB DB `cfg:"db"` - GC GC `cfg:"gc"` - Expire Expire `cfg:"expire"` - ZT ZT `cfg:"zt"` - TikvGC TikvGC `cfg:"tikv-gc"` + PdAddrs string `cfg:"pd-addrs;required; ;pd address in tidb"` + DB DB `cfg:"db"` + GC GC `cfg:"gc"` + Expire Expire `cfg:"expire"` + ZT ZT `cfg:"zt"` + TikvGC TikvGC `cfg:"tikv-gc"` + RateLimit RateLimit `cfg:"rate-limit"` } // TikvGC config is the config of implement tikv sdk gcwork @@ -99,3 +102,9 @@ type Status struct { SSLCertFile string `cfg:"ssl-cert-file;;;status server SSL certificate file (enables SSL support)"` SSLKeyFile string `cfg:"ssl-key-file;;;status server SSL key file"` } + +type RateLimit struct { + LimitPeriod int `cfg:"limit-period; 20; numeric; the period in ms to limit rate"` + BalancePeriod int `cfg:"sync-global-period; 3000; numeric; the period in ms to balance rate limiting with other titan nodes"` + Rule string `cfg:"rule; setex:10000:1000000,set:20000:1000000; ; the rule to limit rate(command1:maxqps:maxrate,command2:maxqps:maxrate...)"` +} diff --git a/db/ratelimiter.go b/db/ratelimiter.go new file mode 100644 index 00000000..bb1b6381 --- /dev/null +++ b/db/ratelimiter.go @@ -0,0 +1,134 @@ +package db + +import ( + "sync" + "time" +) + +const ( + ALL_NAMESPACE = "*" +) + +type CommandLimiter struct { + lock sync.Mutex + startTime time.Time + commandsCount int + commandsSize int + maxCommandsCount int + maxCommandsSize int + period int + periodsNumToTriggerBalance int + totalCommandsCount int + totalCommandsSize int +} + +type NamespaceLimiter struct { + lock sync.Mutex + commandLimiter map[string]*CommandLimiter +} + +type LimitRule struct { + //namespace string + //command string + maxqps int + maxrate int +} +type Limiter struct { + lock sync.Mutex + + limitPeriod int + balancePeriod 
int + currentIp string + + limitRules map[string]map[string]LimitRule + limiter map[string]NamespaceLimiter +} + +func NewLimitRule(rule string) map[string]map[string]LimitRule { + return nil +} + +func NewLimiter(limitPeriod int, balancePeriod int, rule string) *Limiter { + l := &Limiter{ + limitPeriod: limitPeriod, + balancePeriod: balancePeriod, + currentIp: "127.0.0.1", //set current ip + limitRules: NewLimitRule(rule), + limiter: make(map[string]NamespaceLimiter), + } + + return l +} + +func (l *Limiter) addAllNamespaceLimiter(rule string) { + +} + +//func (l *Limiter) addNamespaceLimiter(namespace string, command string, maxQps int, maxRate int) { +// +//} + +func (l *Limiter) get(namespace string, command string) *CommandLimiter { + return nil +} + +func (l *Limiter) setCommandLimiter(namespace string, command string, commandLimiter *CommandLimiter) *CommandLimiter { + return nil +} + +func (l *Limiter) checkLimit(namespace string, command string, argSize int) { + commandLimiter := l.get(namespace, command) + if commandLimiter == nil { + //get rule from limitRule + limitRule, ok := l.limitRules[namespace][command] + if ok { + tmp := NewCommandLimiter(limitRule) + commandLimiter = l.setCommandLimiter(namespace, command, tmp) + } else { + limitRule, ok = l.limitRules[ALL_NAMESPACE][command] + if ok { + tmp := NewCommandLimiter(limitRule) + commandLimiter = l.setCommandLimiter(namespace, command, tmp) + } + } + } + if commandLimiter != nil { + commandLimiter.checkLimit(len(command), argSize) + } +} + +func (l *Limiter) watchConfigChanged() { + +} + +func (l *Limiter) clearLimitPeriod() { + +} + +func (l *Limiter) balanceLimitInNodes() { + +} + +//func NewNamespaceLimiter() *NamespaceLimiter { +// +//} + +func (nl *NamespaceLimiter) getOrSetCommandLimiter(command string, maxQps int, maxRate int) *CommandLimiter { + return nil +} + +//func (nl *NamespaceLimiter) checkLimit (command string, argSize int) { +// +//} + +func NewCommandLimiter(limitRule LimitRule) *CommandLimiter { + return nil +} + +func (cl *CommandLimiter) checkLimit(commandSize int, argSize int) { + +} + +func (cl *CommandLimiter) reset() { + +} From 27f54e7bdf99fe4275d75122cac9472f18843f5a Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 11:30:50 +0800 Subject: [PATCH 11/60] 1 use rate.limiter to implement limit and read tikv key/value to get new limit 2 just balance limit base in active titan server num 3 unit test of rate limit passed --- bin/titan/main.go | 7 +- command/command.go | 5 +- conf/config.go | 7 +- conf/mockconfig.go | 5 + context/context.go | 1 + db/limitersMgr.go | 465 ++++++++++++++++++++++++++++++++++++ db/ratelimiter.go | 134 ----------- go.mod | 2 +- metrics/prometheus.go | 28 +++ tools/autotest/auto.go | 165 ++++++++++++- tools/autotest/auto_test.go | 1 + tools/integration/titan.go | 9 +- tools/test/main.go | 1 + 13 files changed, 682 insertions(+), 148 deletions(-) create mode 100644 db/limitersMgr.go delete mode 100644 db/ratelimiter.go diff --git a/bin/titan/main.go b/bin/titan/main.go index 4adb1122..4616110b 100644 --- a/bin/titan/main.go +++ b/bin/titan/main.go @@ -68,11 +68,16 @@ func main() { } svr := metrics.NewServer(&config.Status) - + limitersMgr, err := db.NewLimitersMgr(store, config.Tikv.RateLimit) + if err != nil { + zap.L().Fatal("create limitersMgr failed", zap.Error(err)) + os.Exit(1) + } serv := titan.New(&context.ServerContext{ RequirePass: config.Server.Auth, Store: store, ListZipThreshold: config.Server.ListZipThreshold, + LimitersMgr: limitersMgr, }) var 
servOpts, statusOpts []continuous.ServerOption diff --git a/command/command.go b/command/command.go index bbac665a..0a17d06d 100644 --- a/command/command.go +++ b/command/command.go @@ -92,6 +92,10 @@ type TxnCommand func(ctx *Context, txn *db.Transaction) (OnCommit, error) func Call(ctx *Context) { ctx.Name = strings.ToLower(ctx.Name) + if _, ok := txnCommands[ctx.Name]; ok && ctx.Server.LimitersMgr != nil { + ctx.Server.LimitersMgr.CheckLimit(ctx.Client.Namespace, ctx.Name, ctx.Args) + } + if ctx.Name != "auth" && ctx.Server.RequirePass != "" && ctx.Client.Authenticated == false { @@ -159,7 +163,6 @@ func Call(ctx *Context) { feedMonitors(ctx) start := time.Now() - // judge ctx.name and ctx.args cmdInfoCommand.Proc(ctx) cost := time.Since(start) diff --git a/conf/config.go b/conf/config.go index e677a415..8228c40d 100644 --- a/conf/config.go +++ b/conf/config.go @@ -104,7 +104,8 @@ type Status struct { } type RateLimit struct { - LimitPeriod int `cfg:"limit-period; 20; numeric; the period in ms to limit rate"` - BalancePeriod int `cfg:"sync-global-period; 3000; numeric; the period in ms to balance rate limiting with other titan nodes"` - Rule string `cfg:"rule; setex:10000:1000000,set:20000:1000000; ; the rule to limit rate(command1:maxqps:maxrate,command2:maxqps:maxrate...)"` + InterfaceName string `cfg:interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit` + GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` + TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` + SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` } diff --git a/conf/mockconfig.go b/conf/mockconfig.go index eab70988..9649b490 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -33,6 +33,11 @@ func MockConf() *Titan { SafePointLifeTime: 10 * time.Minute, Concurrency: 2, }, + RateLimit: RateLimit{ + SyncSetPeriod: 1 * time.Second, + GlobalBalancePeriod: 15 * time.Second, + TitanStatusLifetime: 30 * time.Second, + }, }, } } diff --git a/context/context.go b/context/context.go index 5082436e..22880a74 100644 --- a/context/context.go +++ b/context/context.go @@ -77,6 +77,7 @@ type ServerContext struct { Store *db.RedisStore Monitors sync.Map Clients sync.Map + LimitersMgr *db.LimitersMgr Pause time.Duration // elapse to pause all clients StartAt time.Time ListZipThreshold int diff --git a/db/limitersMgr.go b/db/limitersMgr.go new file mode 100644 index 00000000..8a5da922 --- /dev/null +++ b/db/limitersMgr.go @@ -0,0 +1,465 @@ +package db + +import ( + "context" + "errors" + "fmt" + "github.com/distributedio/titan/conf" + "github.com/distributedio/titan/metrics" + sdk_kv "github.com/pingcap/tidb/kv" + "go.uber.org/zap" + "golang.org/x/time/rate" + "net" + "strconv" + "strings" + "sync" + "time" +) + +const ( + LIMITDATA_NAMESPACE = "sys_ratelimit" + LIMITDATA_DBID = 0 + ALL_NAMESPACE = "*" + NAMESPACE_COMMAND_TOKEN = "@" + QPS_PREFIX = "qps:" + RATE_PREFIX = "rate:" + RATE_BURST_TOKEN = " " + TITAN_STATUS_TOKEN = "titan_status:" + TIME_FORMAT = "2006-01-02 15:04:05" + DEFAULT_BURST = 1 +) + +type CommandLimiter struct { + localIp string + limiterName string + + qpsl *rate.Limiter + ratel *rate.Limiter + localPercent float64 + + lock sync.Mutex + lastTime time.Time + totalCommandsCount int64 + totalCommandsSize int64 +} + +type 
LimitersMgr struct { + limitDatadb *DB + globalBalancePeriod time.Duration + titanStatusLifeTime time.Duration + syncSetPeriod time.Duration + localIp string + localPercent float64 + + limiters sync.Map + lock sync.Mutex +} + +func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { + var addrs []net.Addr + var err error + if rateLimit.InterfaceName != "" { + iface, err := net.InterfaceByName(rateLimit.InterfaceName) + if err != nil { + return nil, err + } + + addrs, err = iface.Addrs() + if err != nil { + return nil, err + } + } else { + addrs, err = net.InterfaceAddrs() + if err != nil { + return nil, err + } + } + + localIp := "" + for _, a := range addrs { + if ipnet, ok := a.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && ipnet.IP.To4() != nil { + localIp = ipnet.IP.String() + break + } + } + if localIp == "" { + return nil, errors.New(rateLimit.InterfaceName + " adds is empty") + } + + l := &LimitersMgr{ + limitDatadb: store.DB(LIMITDATA_NAMESPACE, LIMITDATA_DBID), + globalBalancePeriod: rateLimit.GlobalBalancePeriod, + titanStatusLifeTime: rateLimit.TitanStatusLifetime, + syncSetPeriod: rateLimit.SyncSetPeriod, + localIp: localIp, + localPercent: 1, + } + + go l.startBalanceLimit() + go l.startSyncNewLimit() + return l, nil +} + +func (l *LimitersMgr) init(limiterName string) *CommandLimiter { + //lock is just prevent many new connection of same namespace to getlimit from tikv in same time + l.lock.Lock() + defer l.lock.Unlock() + + v, ok := l.limiters.Load(limiterName) + if ok { + return v.(*CommandLimiter) + } + + qpsLimit, _ := l.getLimit(limiterName, true) + rateLimit, rateBurst := l.getLimit(limiterName, false) + if qpsLimit > 0 || + (rateLimit > 0 && rateBurst > 0) { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + v, _ := l.limiters.LoadOrStore(limiterName, newCl) + return v.(*CommandLimiter) + } else { + l.limiters.LoadOrStore(limiterName, (*CommandLimiter)(nil)) + return nil + } +} + +func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + zap.L().Error("[Limit] wrong format limiter name", zap.String("limiterName", limiterName)) + return 0, 0 + } + allMatchLimiter := fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) + limiterNames := []string{limiterName, allMatchLimiter} + limit := float64(0) + burst := int64(0) + + txn, err := l.limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("limiterName", limiterName), zap.Bool("isQps", isQps), zap.Error(err)) + return 0, 0 + } + for i := range limiterNames { + var limiterKey string + if isQps { + limiterKey = QPS_PREFIX + limiterNames[i] + } else { + limiterKey = RATE_PREFIX + limiterNames[i] + } + + str, err := txn.String([]byte(limiterKey)) + if err != nil { + zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) + return 0, 0 + } + val, err := str.Get() + if err != nil { + zap.L().Info("[Limit] limit isn't set", zap.String("key", limiterKey), zap.Error(err)) + continue + } + + if isQps { + if limit, err = strconv.ParseFloat(string(val), 64); err != nil { + zap.L().Error("[Limit] qps limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + } else { + rateLimitVals := strings.Split(string(val), RATE_BURST_TOKEN) + if len(rateLimitVals) < 2 { + zap.L().Error("[Limit] rate limit 
hasn't enough parameters, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + rateLimit := rateLimitVals[0] + rateBurst := rateLimitVals[1] + if len(rateLimit) < 2 { + zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + var strUnit uint8 + var unit float64 + strUnit = rateLimit[len(rateLimit)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + } else { + zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + if limit, err = strconv.ParseFloat(rateLimit[:len(rateLimit)-1], 64); err != nil { + zap.L().Error("[Limit] rate limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + limit *= unit + if burst, err = strconv.ParseInt(rateBurst, 10, 64); err != nil { + zap.L().Error("[Limit] rate burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + } + + zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + break + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) + txn.t.Rollback() + return 0, 0 + } + return limit, int(burst) +} + +func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []string) { + if namespace == LIMITDATA_NAMESPACE { + return + } + limiterName := fmt.Sprintf("%s%s%s", namespace, NAMESPACE_COMMAND_TOKEN, cmdName) + v, ok := l.limiters.Load(limiterName) + var commandLimiter *CommandLimiter + if !ok { + commandLimiter = l.init(limiterName) + } else { + commandLimiter = v.(*CommandLimiter) + } + + if commandLimiter != nil { + now := time.Now() + commandLimiter.checkLimit(cmdName, cmdArgs) + cost := time.Since(now).Seconds() + metrics.GetMetrics().LimitCostHistogramVec.WithLabelValues(namespace, cmdName).Observe(cost) + } +} + +func (l *LimitersMgr) startBalanceLimit() { + ticker := time.NewTicker(l.globalBalancePeriod) + defer ticker.Stop() + for range ticker.C { + l.runBalanceLimit() + } +} + +func (l *LimitersMgr) runBalanceLimit() { + txn, err := l.limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("titan", l.localIp), zap.Error(err)) + return + } + + prefix := MetaKey(l.limitDatadb, []byte(TITAN_STATUS_TOKEN)) + endPrefix := sdk_kv.Key(prefix).PrefixNext() + iter, err := txn.t.Iter(prefix, endPrefix) + if err != nil { + zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) + txn.Rollback() + return + } + defer iter.Close() + + activeNum := 1 + prefixLen := len(prefix) + for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { + if err != nil { + zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) + txn.Rollback() + return + } + + key := iter.Key() + if len(key) <= prefixLen { + zap.L().Error("ip is null", zap.ByteString("key", key)) + continue + } + ip := key[prefixLen:] + if string(ip) == l.localIp { + continue + } + + lastActive := iter.Value() + lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) + if err != nil { + zap.L().Error("[Limit] decode time string failed", 
zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Error(err)) + continue + } + + zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) + if time.Since(lastActiveT) <= l.titanStatusLifeTime { + activeNum++ + } + } + newPercent := float64(1.0 / activeNum) + if l.localPercent != newPercent { + zap.L().Info("[Limit] balance limit in all titan server", zap.Int("active server num", activeNum), + zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) + l.limiters.Range(func(k, v interface{}) bool { + commandLimiter := v.(*CommandLimiter) + if commandLimiter != nil { + commandLimiter.updateLimitPercent(newPercent) + } + return true + }) + l.localPercent = newPercent + } + + key := []byte(TITAN_STATUS_TOKEN + l.localIp) + s := NewString(txn, key) + now := time.Now() + value := now.Format(TIME_FORMAT) + if err := s.Set([]byte(value), 0); err != nil { + txn.Rollback() + return + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", l.localIp)) + txn.Rollback() + return + } +} + +func (l *LimitersMgr) startSyncNewLimit() { + ticker := time.NewTicker(l.syncSetPeriod) + defer ticker.Stop() + for range ticker.C { + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + qpsLimit, _ := l.getLimit(limiterName, true) + rateLimit, rateBurst := l.getLimit(limiterName, false) + if qpsLimit > 0 || + (rateLimit > 0 && rateBurst > 0) { + if commandLimiter == nil { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + l.limiters.Store(limiterName, newCl) + } else { + commandLimiter.updateLimit(qpsLimit, rateLimit, rateBurst) + } + } else { + if commandLimiter != nil { + l.limiters.Store(limiterName, (*CommandLimiter)(nil)) + } + } + return true + }) + } +} + +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rateLimit float64, rateBurst int) *CommandLimiter { + var qpsl, ratel *rate.Limiter + if qpsLimit > 0 { + qpsl = rate.NewLimiter(rate.Limit(qpsLimit), DEFAULT_BURST) + } + if rateLimit > 0 && rateBurst > 0 { + ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) + } + cl := &CommandLimiter{ + limiterName: limiterName, + localIp: localIp, + qpsl: qpsl, + ratel: ratel, + localPercent: 1, + lastTime: time.Now(), + } + return cl +} + +func (cl *CommandLimiter) updateLimit(qpsLimit float64, rateLimit float64, rateBurst int) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if qpsLimit > 0 { + if cl.qpsl != nil { + if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { + cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) + } + } else { + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), DEFAULT_BURST) + } + } else { + cl.qpsl = nil + } + + if rateLimit > 0 && rateBurst > 0 { + if cl.ratel != nil { + if cl.ratel.Burst() != rateBurst { + cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + } else if cl.ratel.Limit() != rate.Limit(rateLimit*cl.localPercent) { + cl.ratel.SetLimit(rate.Limit(rateLimit * cl.localPercent)) + } + } else { + cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + } + } else { + cl.ratel = nil + } + + var qpsLocal, rateLocal float64 + seconds := time.Since(cl.lastTime).Seconds() + if seconds >= 0 { + qpsLocal = float64(cl.totalCommandsCount) / seconds + rateLocal = float64(cl.totalCommandsSize) / 1024 / 
seconds + } else { + qpsLocal = 0 + rateLocal = 0 + } + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) + cl.totalCommandsCount = 0 + cl.totalCommandsSize = 0 + cl.lastTime = time.Now() +} + +func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { + if cl.qpsl != nil { + qpsLimit := float64(cl.qpsl.Limit()) / cl.localPercent + cl.qpsl.SetLimit(rate.Limit(qpsLimit * newPercent)) + } + if cl.ratel != nil { + rateLimit := float64(cl.ratel.Limit()) / cl.localPercent + cl.ratel.SetLimit(rate.Limit(rateLimit * newPercent)) + } + cl.localPercent = newPercent + } +} + +func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { + cl.lock.Lock() + defer cl.lock.Unlock() + + if cl.qpsl != nil { + r := cl.qpsl.Reserve() + if !r.OK() { + zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) + } else { + d := r.Delay() + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) + } + } + + cmdSize := len(cmdName) + for i := range cmdArgs { + cmdSize += len(cmdArgs[i]) + 1 + } + if cl.ratel != nil { + r := cl.ratel.ReserveN(time.Now(), cmdSize) + if !r.OK() { + zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) + } else { + d := r.Delay() + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) + } + } + + cl.totalCommandsCount++ + cl.totalCommandsSize += int64(cmdSize) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) + } +} diff --git a/db/ratelimiter.go b/db/ratelimiter.go deleted file mode 100644 index bb1b6381..00000000 --- a/db/ratelimiter.go +++ /dev/null @@ -1,134 +0,0 @@ -package db - -import ( - "sync" - "time" -) - -const ( - ALL_NAMESPACE = "*" -) - -type CommandLimiter struct { - lock sync.Mutex - startTime time.Time - commandsCount int - commandsSize int - maxCommandsCount int - maxCommandsSize int - period int - periodsNumToTriggerBalance int - totalCommandsCount int - totalCommandsSize int -} - -type NamespaceLimiter struct { - lock sync.Mutex - commandLimiter map[string]*CommandLimiter -} - -type LimitRule struct { - //namespace string - //command string - maxqps int - maxrate int -} -type Limiter struct { - lock sync.Mutex - - limitPeriod int - balancePeriod int - currentIp string - - limitRules map[string]map[string]LimitRule - limiter map[string]NamespaceLimiter -} - -func NewLimitRule(rule string) map[string]map[string]LimitRule { - return nil -} - -func NewLimiter(limitPeriod int, balancePeriod int, rule string) *Limiter { - l := &Limiter{ - limitPeriod: limitPeriod, - balancePeriod: balancePeriod, - currentIp: "127.0.0.1", //set current ip - limitRules: 
NewLimitRule(rule), - limiter: make(map[string]NamespaceLimiter), - } - - return l -} - -func (l *Limiter) addAllNamespaceLimiter(rule string) { - -} - -//func (l *Limiter) addNamespaceLimiter(namespace string, command string, maxQps int, maxRate int) { -// -//} - -func (l *Limiter) get(namespace string, command string) *CommandLimiter { - return nil -} - -func (l *Limiter) setCommandLimiter(namespace string, command string, commandLimiter *CommandLimiter) *CommandLimiter { - return nil -} - -func (l *Limiter) checkLimit(namespace string, command string, argSize int) { - commandLimiter := l.get(namespace, command) - if commandLimiter == nil { - //get rule from limitRule - limitRule, ok := l.limitRules[namespace][command] - if ok { - tmp := NewCommandLimiter(limitRule) - commandLimiter = l.setCommandLimiter(namespace, command, tmp) - } else { - limitRule, ok = l.limitRules[ALL_NAMESPACE][command] - if ok { - tmp := NewCommandLimiter(limitRule) - commandLimiter = l.setCommandLimiter(namespace, command, tmp) - } - } - } - if commandLimiter != nil { - commandLimiter.checkLimit(len(command), argSize) - } -} - -func (l *Limiter) watchConfigChanged() { - -} - -func (l *Limiter) clearLimitPeriod() { - -} - -func (l *Limiter) balanceLimitInNodes() { - -} - -//func NewNamespaceLimiter() *NamespaceLimiter { -// -//} - -func (nl *NamespaceLimiter) getOrSetCommandLimiter(command string, maxQps int, maxRate int) *CommandLimiter { - return nil -} - -//func (nl *NamespaceLimiter) checkLimit (command string, argSize int) { -// -//} - -func NewCommandLimiter(limitRule LimitRule) *CommandLimiter { - return nil -} - -func (cl *CommandLimiter) checkLimit(commandSize int, argSize int) { - -} - -func (cl *CommandLimiter) reset() { - -} diff --git a/go.mod b/go.mod index a0880a92..989ae37d 100644 --- a/go.mod +++ b/go.mod @@ -55,7 +55,7 @@ require ( go.uber.org/multierr v1.1.0 // indirect go.uber.org/zap v1.9.1 golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect - golang.org/x/time v0.0.0-20181108054448-85acf8d2951c // indirect + golang.org/x/time v0.0.0-20181108054448-85acf8d2951c google.golang.org/genproto v0.0.0-20190108161440-ae2f86662275 // indirect google.golang.org/grpc v1.17.0 // indirect gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect diff --git a/metrics/prometheus.go b/metrics/prometheus.go index c1f0ab60..949e3991 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -48,6 +48,8 @@ type Metrics struct { ZTInfoCounterVec *prometheus.CounterVec IsLeaderGaugeVec *prometheus.GaugeVec ExpireLeftSecondsVec *prometheus.GaugeVec + LimiterQpsVec *prometheus.GaugeVec + LimiterRateVec *prometheus.GaugeVec LRangeSeekHistogram prometheus.Histogram GCKeysCounterVec *prometheus.CounterVec @@ -59,6 +61,7 @@ type Metrics struct { //command biz CommandCallHistogramVec *prometheus.HistogramVec + LimitCostHistogramVec *prometheus.HistogramVec TxnBeginHistogramVec *prometheus.HistogramVec CommandFuncDoneHistogramVec *prometheus.HistogramVec TxnCommitHistogramVec *prometheus.HistogramVec @@ -111,6 +114,15 @@ func init() { }, multiLabel) prometheus.MustRegister(gm.CommandArgsNumHistogramVec) + gm.LimitCostHistogramVec = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: namespace, + Name: "limit_cost_seconds", + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 10), + Help: "the cost times of command execute's limit", + }, multiLabel) + prometheus.MustRegister(gm.LimitCostHistogramVec) + gm.TxnBeginHistogramVec = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: 
namespace, @@ -180,6 +192,22 @@ func init() { }, expireLabel) prometheus.MustRegister(gm.ExpireLeftSecondsVec) + gm.LimiterQpsVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "qps_limiter_status", + Help: "the qps of a namespace's command in a titan server", + }, multiLabel) + prometheus.MustRegister(gm.LimiterQpsVec) + + gm.LimiterRateVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rate_limiter_status", + Help: "the rate of a namespace's command in a titan server(KB/s)", + }, multiLabel) + prometheus.MustRegister(gm.LimiterRateVec) + gm.LRangeSeekHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index ed28b560..a1cf5138 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -1,13 +1,16 @@ package autotest import ( + "fmt" + "github.com/distributedio/titan/tools/autotest/cmd" + "github.com/gomodule/redigo/redis" + "github.com/stretchr/testify/assert" + "math" "strconv" + "strings" + "sync" "testing" "time" - - "github.com/distributedio/titan/tools/autotest/cmd" - - "github.com/gomodule/redigo/redis" ) //AutoClient check redis comman @@ -19,8 +22,10 @@ type AutoClient struct { *cmd.ExampleSystem em *cmd.ExampleMulti // addr string - pool *redis.Pool - conn redis.Conn + pool *redis.Pool + conn redis.Conn + conn2 redis.Conn + limitConn redis.Conn } //NewAutoClient creat auto client @@ -40,6 +45,27 @@ func (ac *AutoClient) Start(addr string) { panic(err) } ac.conn = conn + + conn2, err := redis.Dial("tcp", addr) + if err != nil { + panic(err) + } + _, err = redis.String(conn2.Do("auth", "test-1542098935-1-7ca41bda4efc2a1889c04e")) + if err != nil { + panic(err) + } + ac.conn2 = conn2 + + limitConn, err := redis.Dial("tcp", addr) + if err != nil { + panic(err) + } + _, err = redis.String(limitConn.Do("auth", "sys_ratelimit-1574130304-1-36c153b109ebca80b43769")) + if err != nil { + panic(err) + } + ac.limitConn = limitConn + ac.es = cmd.NewExampleString(conn) ac.ek = cmd.NewExampleKey(conn) ac.el = cmd.NewExampleList(conn) @@ -54,6 +80,21 @@ func (ac *AutoClient) Close() { ac.conn.Close() } +func (ac *AutoClient) setLimit(t *testing.T, key string, value string) { + reply, err := redis.String(ac.limitConn.Do("SET", key, value)) + assert.Equal(t, "OK", reply) + assert.NoError(t, err) + data, err := redis.Bytes(ac.limitConn.Do("GET", key)) + assert.Equal(t, value, string(data)) + assert.NoError(t, err) +} + +func (ac *AutoClient) delLimit(t *testing.T, expectReply int, key string) { + reply, err := redis.Int(ac.limitConn.Do("DEL", key)) + assert.Equal(t, expectReply, reply) + assert.NoError(t, err) +} + //StringCase check string case func (ac *AutoClient) StringCase(t *testing.T) { ac.es.SetNxEqual(t, "key-setx", "v1") @@ -275,3 +316,115 @@ func (ac *AutoClient) MultiCase(t *testing.T) { // exec ac.em.ExecEqual(t) } + +func (ac *AutoClient) LimitCase(t *testing.T) { + ac.es.SetEqual(t, "key1", "1") + //first command invoke won't be limited + times := []int{100, 101} + conns := []redis.Conn{ac.conn, ac.conn2} + + cost := ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, cost <= 0.1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "qps:*@get", "100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.setLimit(t, 
"qps:test@get", "200") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.1) + + ac.delLimit(t, 1, "qps:test@get") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.setLimit(t, "qps:*@get", "100a") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "qps:*@mget", "100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + + ac.delLimit(t, 1, "qps:*@mget") + ac.setLimit(t, "rate:*@mget", "1k") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1s 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "kk 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1k 2a") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "1k 2") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.1) + + ac.setLimit(t, "rate:*@mget", "28k 100") + time.Sleep(time.Second * 1) + times = []int{1024, 1025} + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + + ac.setLimit(t, "rate:*@mget", "0.028M 100") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + + ac.delLimit(t, 1, "rate:*@mget") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) + assert.Equal(t, true, cost <= 0.3) + + ac.ek.DelEqual(t, 1, "key1") +} + +func (ac *AutoClient) runCmdInGoRoutines(t *testing.T, cmd string, key string, times []int, conns []redis.Conn) float64 { + gonum := len(times) + if gonum != len(conns) { + return 0 + } + + var wg sync.WaitGroup + wg.Add(gonum) + now := time.Now() + for i := 0; i < gonum; i++ { + go func(times int, conn redis.Conn, wg *sync.WaitGroup) { + cmd := strings.ToLower(cmd) + for j := 0; j < times; j++ { + _, err := conn.Do(cmd, key) + if err != nil { + fmt.Printf("cmd=%s, key=%s, err=%s\n", cmd, key, err) + break + } + } + wg.Done() + }(times[i], conns[i], &wg) + } + wg.Wait() + return time.Since(now).Seconds() +} diff --git a/tools/autotest/auto_test.go b/tools/autotest/auto_test.go index caefbd1d..3ffe7c8e 100644 --- a/tools/autotest/auto_test.go +++ b/tools/autotest/auto_test.go @@ -6,6 +6,7 @@ import ( func Test(t *testing.T) { at.SystemCase(t) + at.LimitCase(t) at.ZSetCase(t) at.StringCase(t) at.KeyCase(t) diff --git a/tools/integration/titan.go b/tools/integration/titan.go index 9bc36c87..9d400e2d 100644 --- a/tools/integration/titan.go +++ b/tools/integration/titan.go @@ -48,10 +48,15 @@ func Start() { 
log.Fatalln(err) } + limitersMgr, err := db.NewLimitersMgr(store, tikvConf.RateLimit) + if err != nil { + log.Fatalln(err) + } svr = titan.New(&context.ServerContext{ - RequirePass: cfg.Auth, - Store: store, + RequirePass: cfg.Auth, + Store: store, ListZipThreshold: 100, + LimitersMgr: limitersMgr, }) err = svr.ListenAndServe(cfg.Listen) if err != nil { diff --git a/tools/test/main.go b/tools/test/main.go index e7222570..ae84dd86 100644 --- a/tools/test/main.go +++ b/tools/test/main.go @@ -28,6 +28,7 @@ func main() { client.StringCase(t) client.ListCase(t) client.KeyCase(t) + client.LimitCase(t) } client.Close() } From e5018e45bf2a8c73d0f36590173d946529ca29cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 12:51:39 +0800 Subject: [PATCH 12/60] qps also can be set burst, its limit also support k/K/m/M suffix --- db/limitersMgr.go | 100 ++++++++++++++++++++--------------------- tools/autotest/auto.go | 50 ++++++++++++--------- 2 files changed, 79 insertions(+), 71 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 8a5da922..3cad1fd8 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -23,7 +23,7 @@ const ( NAMESPACE_COMMAND_TOKEN = "@" QPS_PREFIX = "qps:" RATE_PREFIX = "rate:" - RATE_BURST_TOKEN = " " + LIMIT_BURST_TOKEN = " " TITAN_STATUS_TOKEN = "titan_status:" TIME_FORMAT = "2006-01-02 15:04:05" DEFAULT_BURST = 1 @@ -110,11 +110,11 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { return v.(*CommandLimiter) } - qpsLimit, _ := l.getLimit(limiterName, true) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) - if qpsLimit > 0 || + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -158,43 +158,39 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { continue } - if isQps { - if limit, err = strconv.ParseFloat(string(val), 64); err != nil { - zap.L().Error("[Limit] qps limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } + limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + if len(limitStrs) < 2 { + zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + limitStr := limitStrs[0] + burstStr := limitStrs[1] + if len(limitStr) < 1 { + zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + continue + } + var strUnit uint8 + var unit float64 + strUnit = limitStr[len(limitStr)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + limitStr = limitStr[:len(limitStr)-1] + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + limitStr = limitStr[:len(limitStr)-1] } else { - rateLimitVals := strings.Split(string(val), RATE_BURST_TOKEN) - if len(rateLimitVals) < 2 { - zap.L().Error("[Limit] rate limit hasn't enough parameters, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - rateLimit := rateLimitVals[0] - rateBurst := rateLimitVals[1] - if len(rateLimit) < 2 { - zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", 
zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - var strUnit uint8 - var unit float64 - strUnit = rateLimit[len(rateLimit)-1] - if strUnit == 'k' || strUnit == 'K' { - unit = 1024 - } else if strUnit == 'm' || strUnit == 'M' { - unit = 1024 * 1024 - } else { - zap.L().Error("[Limit] rate limit is invaild, should be: K|k|M|m ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - if limit, err = strconv.ParseFloat(rateLimit[:len(rateLimit)-1], 64); err != nil { - zap.L().Error("[Limit] rate limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } - limit *= unit - if burst, err = strconv.ParseInt(rateBurst, 10, 64); err != nil { - zap.L().Error("[Limit] rate burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } + unit = 1 + //zap.L().Error("[Limit] limit is invaild, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + //continue + } + if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { + zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue + } + limit *= unit + if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + continue } zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) @@ -321,15 +317,15 @@ func (l *LimitersMgr) startSyncNewLimit() { l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) - qpsLimit, _ := l.getLimit(limiterName, true) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) - if qpsLimit > 0 || + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) l.limiters.Store(limiterName, newCl) } else { - commandLimiter.updateLimit(qpsLimit, rateLimit, rateBurst) + commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) } } else { if commandLimiter != nil { @@ -341,10 +337,10 @@ func (l *LimitersMgr) startSyncNewLimit() { } } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rateLimit float64, rateBurst int) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { var qpsl, ratel *rate.Limiter - if qpsLimit > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit), DEFAULT_BURST) + if qpsLimit > 0 && qpsBurst > 0 { + qpsl = rate.NewLimiter(rate.Limit(qpsLimit), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) @@ -360,17 +356,19 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, rat return cl } -func (cl *CommandLimiter) updateLimit(qpsLimit float64, rateLimit float64, rateBurst int) { +func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) { cl.lock.Lock() defer cl.lock.Unlock() - if qpsLimit > 0 { + if qpsLimit > 0 && qpsBurst > 0 { if cl.qpsl != 
nil { - if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { + if cl.qpsl.Burst() != qpsBurst { + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) + } else if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) } } else { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), DEFAULT_BURST) + cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) } } else { cl.qpsl = nil diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index a1cf5138..8d546106 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -324,81 +324,91 @@ func (ac *AutoClient) LimitCase(t *testing.T) { conns := []redis.Conn{ac.conn, ac.conn2} cost := ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "qps:*@get", "100") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "qps:test@get", "200") + ac.setLimit(t, "qps:*@get", "k 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.1) + assert.Equal(t, true, cost <= 0.2) + + ac.setLimit(t, "qps:*@get", "100 1") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) + + ac.setLimit(t, "qps:test@get", "0.2k 20") + time.Sleep(time.Second * 1) + cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) + assert.Equal(t, true, math.Abs(cost-1) <= 0.2) ac.delLimit(t, 1, "qps:test@get") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) - ac.setLimit(t, "qps:*@get", "100a") + ac.setLimit(t, "qps:*@get", "100a 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "get", "key1", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "qps:*@mget", "100") + ac.setLimit(t, "qps:*@mget", "100 1") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-2) <= 0.1) + assert.Equal(t, true, math.Abs(cost-2) <= 0.2) ac.delLimit(t, 1, "qps:*@mget") ac.setLimit(t, "rate:*@mget", "1k") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1s 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "kk 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1k 2a") time.Sleep(time.Second * 1) cost = 
ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) ac.setLimit(t, "rate:*@mget", "1k 2") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.1) + assert.Equal(t, true, cost <= 0.2) - ac.setLimit(t, "rate:*@mget", "28k 100") + ac.setLimit(t, "rate:*@mget", "0.028m 100") time.Sleep(time.Second * 1) times = []int{1024, 1025} cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + assert.Equal(t, true, math.Abs(cost-1) <= 0.4) ac.setLimit(t, "rate:*@mget", "0.028M 100") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, math.Abs(cost-1) <= 0.3) + assert.Equal(t, true, math.Abs(cost-1) <= 0.4) ac.delLimit(t, 1, "rate:*@mget") time.Sleep(time.Second * 1) cost = ac.runCmdInGoRoutines(t, "mget", "key1 key2", times, conns) - assert.Equal(t, true, cost <= 0.3) + assert.Equal(t, true, cost <= 0.4) ac.ek.DelEqual(t, 1, "key1") } From ecd78a27d24e0268a07e2b12bcf8c4081422acfa Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 17:52:52 +0800 Subject: [PATCH 13/60] 1 change log level--limit not set, to debug 2 add setlimit.sh --- db/limitersMgr.go | 6 ++-- setlimit.sh | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 setlimit.sh diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 3cad1fd8..90b877bb 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -154,7 +154,9 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } val, err := str.Get() if err != nil { - zap.L().Info("[Limit] limit isn't set", zap.String("key", limiterKey), zap.Error(err)) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) + } continue } @@ -180,8 +182,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { limitStr = limitStr[:len(limitStr)-1] } else { unit = 1 - //zap.L().Error("[Limit] limit is invaild, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - //continue } if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) diff --git a/setlimit.sh b/setlimit.sh new file mode 100644 index 00000000..742d34d8 --- /dev/null +++ b/setlimit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +host=tkv7369.test.zhuaninc.com +port=7369 +rediscli="./redis-cli" +token="sys_ratelimit-1574130304-1-36c153b109ebca80b43769" +usage_exit() +{ + echo "usage:" + echo "$0 set qps=(1|0) cmd= namespace= limit=[k/K/m/M] burst=" + echo "or" + echo "$0 del qps=(1|0) cmd= namespace=" + echo ": all means matching all namespaces" + exit 1 +} + +if [ $# -lt 1 ]; then + usage_exit +fi +op=$1 +if [ "$op" != "set" -a "$op" != "del" ]; then + usage_exit +fi + +limitname= +cmd= +namespace= +limit= +burst= + +for arg in $* +do + if [ "$arg" = "$op" ]; then + continue + fi + + optname=`echo $arg | sed 's/=.*//'` + optvalue=`echo $arg | sed 's/.*=//'` + if [ -z $optname -o -z $optvalue ]; then + usage_exit + else + if [ "$optname" = "qps" ]; then + if [ $optvalue = 1 ]; then + limitname=qps + elif [ $optvalue = 0 ]; then + limitname=rate + else + usage_exit + fi + elif [ "$optname" = 
"cmd" ]; then + cmd=$optvalue + elif [ "$optname" = "namespace" ]; then + namespace=$optvalue + elif [ "$optname" = "limit" ]; then + limit=$optvalue + elif [ "$optname" = "burst" ]; then + burst=$optvalue + else + usage_exit + fi + fi +done + + +if [ -z "$limitname" -o -z "$cmd" -o -z "$namespace" ]; then + usage_exit +else + key= + if [ "$namespace" = "all" ]; then + key="${limitname}:*@${cmd}" + else + key="${limitname}:${namespace}@${cmd}" + fi + + if [ "$op" = "set" ]; then + if [ -z "$limit" -o -z "$burst" ]; then + usage_exit + else + $rediscli -h $host -p $port -a $token set $key "${limit} ${burst}" + fi + elif [ "$op" = "del" ]; then + $rediscli -h $host -p $port -a $token del $key + else + usage_exit + fi +fi From 0648ab89eb2423e8dabd0b61a07ea60c484b5fb6 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 18:16:07 +0800 Subject: [PATCH 14/60] add limit default config items and fix error in config.go --- conf/config.go | 2 +- conf/titan.toml | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/conf/config.go b/conf/config.go index 8228c40d..347dbb70 100644 --- a/conf/config.go +++ b/conf/config.go @@ -104,7 +104,7 @@ type Status struct { } type RateLimit struct { - InterfaceName string `cfg:interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit` + InterfaceName string `cfg:"interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit"` GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` diff --git a/conf/titan.toml b/conf/titan.toml index be46308d..c938f58d 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -182,6 +182,26 @@ pd-addrs = "mocktikv://" #default: 2 #concurrency = 2 +[tikv.rate-limit] +#type: string +#default: eth0 +#description: the interface name to get ip and write local titan status to tikv for balancing rate limit +interface-name = "eth0" + +#type: time.Duration +#description: the period in seconds to balance rate limiting with other titan nodes +#default: 15s +#global-balance-period = 15s + +#type: time.Duration +#description: how long if a titan didn't update its status, we consider it dead +#default: 1m +#titanstatus-life-time = 1m + +#type: time.Duration +#description: the period in seconds to sync new limit set in tikv +#default: 3s +#sync-set-period = 3s [tikv-logger] From f86245a215e6d3fa8f73366be846383318de54fb Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 21 Nov 2019 21:21:42 +0800 Subject: [PATCH 15/60] in startSyncNewLimit, just read all match limit once for every command's qps/rate --- db/limitersMgr.go | 217 ++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 77 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 90b877bb..3c8bcd49 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -26,7 +26,6 @@ const ( LIMIT_BURST_TOKEN = " " TITAN_STATUS_TOKEN = "titan_status:" TIME_FORMAT = "2006-01-02 15:04:05" - DEFAULT_BURST = 1 ) type CommandLimiter struct { @@ -43,6 +42,11 @@ type CommandLimiter struct { totalCommandsSize int64 } +type LimitData struct { + limit float64 + burst int +} + type LimitersMgr 
struct { limitDatadb *DB globalBalancePeriod time.Duration @@ -51,8 +55,18 @@ type LimitersMgr struct { localIp string localPercent float64 - limiters sync.Map - lock sync.Mutex + limiters sync.Map + qpsAllmatchLimit sync.Map + rateAllmatchLimit sync.Map + lock sync.Mutex +} + +func getAllmatchLimiterName(limiterName string) string { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + return "" + } + return fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) } func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { @@ -110,6 +124,10 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { return v.(*CommandLimiter) } + allmatchLimiterName := getAllmatchLimiterName(limiterName) + l.qpsAllmatchLimit.LoadOrStore(allmatchLimiterName, (*LimitData)(nil)) + l.rateAllmatchLimit.LoadOrStore(allmatchLimiterName, (*LimitData)(nil)) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || @@ -124,13 +142,6 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { } func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { - strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) - if len(strs) < 2 { - zap.L().Error("[Limit] wrong format limiter name", zap.String("limiterName", limiterName)) - return 0, 0 - } - allMatchLimiter := fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) - limiterNames := []string{limiterName, allMatchLimiter} limit := float64(0) burst := int64(0) @@ -139,68 +150,70 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { zap.L().Error("[Limit] transection begin failed", zap.String("limiterName", limiterName), zap.Bool("isQps", isQps), zap.Error(err)) return 0, 0 } - for i := range limiterNames { - var limiterKey string - if isQps { - limiterKey = QPS_PREFIX + limiterNames[i] - } else { - limiterKey = RATE_PREFIX + limiterNames[i] + defer func() { + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) + txn.t.Rollback() } + }() - str, err := txn.String([]byte(limiterKey)) - if err != nil { - zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) - return 0, 0 - } - val, err := str.Get() - if err != nil { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) - } - continue - } + var limiterKey string + if isQps { + limiterKey = QPS_PREFIX + limiterName + } else { + limiterKey = RATE_PREFIX + limiterName + } - limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) - if len(limitStrs) < 2 { - zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - limitStr := limitStrs[0] - burstStr := limitStrs[1] - if len(limitStr) < 1 { - zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) - continue - } - var strUnit uint8 - var unit float64 - strUnit = limitStr[len(limitStr)-1] - if strUnit == 'k' || strUnit == 'K' { - unit = 1024 - limitStr = limitStr[:len(limitStr)-1] - } else if strUnit == 'm' || strUnit == 'M' { - unit = 1024 * 1024 - limitStr = limitStr[:len(limitStr)-1] - } else { - 
unit = 1 - } - if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { - zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue - } - limit *= unit - if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { - zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) - continue + str, err := txn.String([]byte(limiterKey)) + if err != nil { + zap.L().Error("[Limit] get limit's value failed", zap.String("key", limiterKey), zap.Error(err)) + return 0, 0 + } + val, err := str.Get() + if err != nil { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) } + return 0, 0 + } - zap.L().Info("[Limit] got limit", zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) - break + limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + if len(limitStrs) < 2 { + zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) + return 0, 0 } - if err := txn.t.Commit(context.Background()); err != nil { - zap.L().Error("[Limit] commit after get limit failed", zap.String("limiterName", limiterName), zap.Error(err)) - txn.t.Rollback() + limitStr := limitStrs[0] + burstStr := limitStrs[1] + if len(limitStr) < 1 { + zap.L().Error("[Limit] limit part's length isn't enough, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) return 0, 0 } + var strUnit uint8 + var unit float64 + strUnit = limitStr[len(limitStr)-1] + if strUnit == 'k' || strUnit == 'K' { + unit = 1024 + limitStr = limitStr[:len(limitStr)-1] + } else if strUnit == 'm' || strUnit == 'M' { + unit = 1024 * 1024 + limitStr = limitStr[:len(limitStr)-1] + } else { + unit = 1 + } + if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { + zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + return 0, 0 + } + limit *= unit + if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + return 0, 0 + } + + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] got limit"); logEnv != nil { + logEnv.Write(zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + } + return limit, int(burst) } @@ -314,27 +327,77 @@ func (l *LimitersMgr) startSyncNewLimit() { ticker := time.NewTicker(l.syncSetPeriod) defer ticker.Stop() for range ticker.C { - l.limiters.Range(func(k, v interface{}) bool { + l.runSyncNewLimit() + } +} + +func (l *LimitersMgr) runSyncNewLimit() { + allmatchLimits := []*sync.Map{&l.qpsAllmatchLimit, &l.rateAllmatchLimit} + for i, allmatchLimit := range allmatchLimits { + allmatchLimit.Range(func(k, v interface{}) bool { limiterName := k.(string) - commandLimiter := v.(*CommandLimiter) - qpsLimit, qpsBurst := l.getLimit(limiterName, true) - rateLimit, rateBurst := l.getLimit(limiterName, false) - if (qpsLimit > 0 && qpsBurst > 0) || - (rateLimit > 0 && rateBurst > 0) { - if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) - l.limiters.Store(limiterName, newCl) + limitData := v.(*LimitData) + isQps := false + if i 
== 0 { + isQps = true + } + limit, burst := l.getLimit(limiterName, isQps) + if limit > 0 && burst > 0 { + if limitData == nil { + limitData = &LimitData{limit, burst} + allmatchLimit.Store(limiterName, limitData) } else { - commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) + limitData.limit = limit + limitData.burst = burst } } else { - if commandLimiter != nil { - l.limiters.Store(limiterName, (*CommandLimiter)(nil)) - } + allmatchLimit.Store(limiterName, (*LimitData)(nil)) } return true }) } + + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + allmatchLimiterName := getAllmatchLimiterName(limiterName) + qpsLimit, qpsBurst := l.getLimit(limiterName, true) + if !(qpsLimit > 0 && qpsBurst > 0) { + v, ok := l.qpsAllmatchLimit.Load(allmatchLimiterName) + if ok { + limitData := v.(*LimitData) + if limitData != nil { + qpsLimit = limitData.limit + qpsBurst = limitData.burst + } + } + } + rateLimit, rateBurst := l.getLimit(limiterName, false) + if !(rateLimit > 0 && rateBurst > 0) { + v, ok := l.rateAllmatchLimit.Load(allmatchLimiterName) + if ok { + limitData := v.(*LimitData) + if limitData != nil { + rateLimit = limitData.limit + rateBurst = limitData.burst + } + } + } + if (qpsLimit > 0 && qpsBurst > 0) || + (rateLimit > 0 && rateBurst > 0) { + if commandLimiter == nil { + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + l.limiters.Store(limiterName, newCl) + } else { + commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) + } + } else { + if commandLimiter != nil { + l.limiters.Store(limiterName, (*CommandLimiter)(nil)) + } + } + return true + }) } func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { From 157d70a45af13460e32ebb24226a4e45a5b2e052 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 11:15:26 +0800 Subject: [PATCH 16/60] change got limit log trace --- db/limitersMgr.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 3c8bcd49..565eb9b8 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -171,9 +171,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } val, err := str.Get() if err != nil { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit isn't set"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Error(err)) - } return 0, 0 } @@ -383,8 +380,13 @@ func (l *LimitersMgr) runSyncNewLimit() { } } } + if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { + logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) l.limiters.Store(limiterName, newCl) From fd0d8cc171c8eb18b4f78ce6854f9a0788b5aeb1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 12:02:25 +0800 Subject: [PATCH 17/60] add limit cleared log trace, just log limit is trigger when delay > 0 --- db/limitersMgr.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 565eb9b8..9f8802c6 100644 
--- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -395,6 +395,10 @@ func (l *LimitersMgr) runSyncNewLimit() { } } else { if commandLimiter != nil { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is cleared"); logEnv != nil { + logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + } l.limiters.Store(limiterName, (*CommandLimiter)(nil)) } } @@ -496,10 +500,12 @@ func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) } else { d := r.Delay() - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + if d > 0 { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) } - time.Sleep(d) } } @@ -513,10 +519,12 @@ func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) } else { d := r.Delay() - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + if d > 0 { + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + } + time.Sleep(d) } - time.Sleep(d) } } From 756a0e60c5dd34f37d3e0413d952544f918e92cb Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 17:03:54 +0800 Subject: [PATCH 18/60] change limit/commandFunc cost seconds factor from 2 to 1.4 --- metrics/prometheus.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 949e3991..7215f37c 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -118,7 +118,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "limit_cost_seconds", - Buckets: prometheus.ExponentialBuckets(0.0001, 2, 10), + Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), Help: "the cost times of command execute's limit", }, multiLabel) prometheus.MustRegister(gm.LimitCostHistogramVec) @@ -136,7 +136,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 2, 10), + Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) From 46bb6daf407561117ce8ad8e150d0a41db7dee96 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 18:19:36 +0800 Subject: [PATCH 19/60] fix titan active time decoding bug --- db/limitersMgr.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9f8802c6..a56b8ff3 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -275,19 +275,21 @@ func (l *LimitersMgr) 
runBalanceLimit() { continue } ip := key[prefixLen:] - if string(ip) == l.localIp { + obj := NewString(txn, key) + if err = obj.decode(iter.Value()); err != nil { + zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) continue } - lastActive := iter.Value() + lastActive := obj.Meta.Value lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) if err != nil { - zap.L().Error("[Limit] decode time string failed", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Error(err)) + zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) continue } zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) - if time.Since(lastActiveT) <= l.titanStatusLifeTime { + if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { activeNum++ } } From a839995a03e5579c61b4d03c7d64816799378163 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 18:38:40 +0800 Subject: [PATCH 20/60] fix limit local percent bug --- db/limitersMgr.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index a56b8ff3..e10408c3 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -260,7 +260,7 @@ func (l *LimitersMgr) runBalanceLimit() { } defer iter.Close() - activeNum := 1 + activeNum := float64(1) prefixLen := len(prefix) for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { if err != nil { @@ -293,9 +293,9 @@ func (l *LimitersMgr) runBalanceLimit() { activeNum++ } } - newPercent := float64(1.0 / activeNum) + newPercent := 1 / activeNum if l.localPercent != newPercent { - zap.L().Info("[Limit] balance limit in all titan server", zap.Int("active server num", activeNum), + zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) l.limiters.Range(func(k, v interface{}) bool { commandLimiter := v.(*CommandLimiter) From 815efa4c4744f6fbdbf85ee84ec48adae722f511 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 19:57:31 +0800 Subject: [PATCH 21/60] fix titan active time parsing bug --- db/limitersMgr.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index e10408c3..d1a0ab8b 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -282,13 +282,14 @@ func (l *LimitersMgr) runBalanceLimit() { } lastActive := obj.Meta.Value - lastActiveT, err := time.Parse(TIME_FORMAT, string(lastActive)) + lastActiveT, err := time.ParseInLocation(TIME_FORMAT, string(lastActive), time.Local) if err != nil { zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) continue } - zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive)) + diff := time.Since(lastActiveT).Seconds() + zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Float64("activePast", diff)) if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { activeNum++ } From 961257161fcbb943fe97741fb491fa6c4abdc79a Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 20:27:29 +0800 Subject: [PATCH 22/60] add updateLimitPercent log --- 
db/limitersMgr.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index d1a0ab8b..56b95867 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -482,12 +482,14 @@ func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { - qpsLimit := float64(cl.qpsl.Limit()) / cl.localPercent - cl.qpsl.SetLimit(rate.Limit(qpsLimit * newPercent)) + qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent + zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) + cl.qpsl.SetLimit(rate.Limit(qpsLimit)) } if cl.ratel != nil { - rateLimit := float64(cl.ratel.Limit()) / cl.localPercent - cl.ratel.SetLimit(rate.Limit(rateLimit * newPercent)) + rateLimit := float64(cl.ratel.Limit()) / cl.localPercent * newPercent + zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) + cl.ratel.SetLimit(rate.Limit(rateLimit)) } cl.localPercent = newPercent } From cc4255a93ec9d24538c1cc0d7f1845b297f9c9ff Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 22 Nov 2019 20:42:14 +0800 Subject: [PATCH 23/60] when create commandLimiter, also use localPercent to set limit --- db/limitersMgr.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 56b95867..74d8d8c5 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -132,7 +132,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -391,7 +391,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) From f00d8c5e13f5c5cd831c68fbfa836020b44936de Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 25 Nov 2019 20:33:26 +0800 Subject: [PATCH 24/60] fix limitersMgr localPercent using bug and lock before use it --- db/limitersMgr.go | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 74d8d8c5..ac4c58b1 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -132,7 +132,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.localPercent) v, _ := 
l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -295,9 +295,13 @@ func (l *LimitersMgr) runBalanceLimit() { } } newPercent := 1 / activeNum - if l.localPercent != newPercent { + var oldPercent float64 + l.lock.Lock() + oldPercent = l.localPercent + l.lock.Unlock() + if oldPercent != newPercent { zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), - zap.Float64("oldPercent", l.localPercent), zap.Float64("newPercent", newPercent)) + zap.Float64("oldPercent", oldPercent), zap.Float64("newPercent", newPercent)) l.limiters.Range(func(k, v interface{}) bool { commandLimiter := v.(*CommandLimiter) if commandLimiter != nil { @@ -305,7 +309,10 @@ func (l *LimitersMgr) runBalanceLimit() { } return true }) + + l.lock.Lock() l.localPercent = newPercent + l.lock.Unlock() } key := []byte(TITAN_STATUS_TOKEN + l.localIp) @@ -357,6 +364,10 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } + var localPercent float64 + l.lock.Lock() + localPercent = l.localPercent + l.lock.Unlock() l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) @@ -391,7 +402,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit*l.localPercent, qpsBurst, rateLimit*l.localPercent, rateBurst) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, localPercent) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) @@ -409,20 +420,20 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, localPercent float64) *CommandLimiter { var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit), qpsBurst) + qpsl = rate.NewLimiter(rate.Limit(qpsLimit*localPercent), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit), rateBurst) + ratel = rate.NewLimiter(rate.Limit(rateLimit*localPercent), rateBurst) } cl := &CommandLimiter{ limiterName: limiterName, localIp: localIp, qpsl: qpsl, ratel: ratel, - localPercent: 1, + localPercent: localPercent, lastTime: time.Now(), } return cl From da49bfae49ac6ca7836f964e5bc479852447e3d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 25 Nov 2019 21:31:33 +0800 Subject: [PATCH 25/60] reportLocalStat every globalBalancePeriod,even the commandLimiter is nil --- db/limitersMgr.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index ac4c58b1..9cdc0918 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -240,9 +240,24 @@ func (l *LimitersMgr) startBalanceLimit() { defer ticker.Stop() for range ticker.C { l.runBalanceLimit() + l.reportLocalStat() } } +func (l *LimitersMgr) reportLocalStat() { + l.limiters.Range(func(k, v interface{}) bool { + limiterName := k.(string) + commandLimiter := v.(*CommandLimiter) + if commandLimiter != nil { + commandLimiter.reportLocalStat() + } else { + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(l.localIp, limiterName).Set(0) + 
metrics.GetMetrics().LimiterRateVec.WithLabelValues(l.localIp, limiterName).Set(0) + } + return true + }) +} + func (l *LimitersMgr) runBalanceLimit() { txn, err := l.limitDatadb.Begin() if err != nil { @@ -470,6 +485,11 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } else { cl.ratel = nil } +} + +func (cl *CommandLimiter) reportLocalStat() { + cl.lock.Lock() + defer cl.lock.Unlock() var qpsLocal, rateLocal float64 seconds := time.Since(cl.lastTime).Seconds() From 30f7d2c7c61df7d794f79b1f88f9328218d35d6e Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 16:26:18 +0800 Subject: [PATCH 26/60] limit can also work on auth-disabled titan server --- conf/config.go | 1 + conf/titan.toml | 5 +++++ db/limitersMgr.go | 9 +++++---- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/conf/config.go b/conf/config.go index 347dbb70..e4a01d9c 100644 --- a/conf/config.go +++ b/conf/config.go @@ -105,6 +105,7 @@ type Status struct { type RateLimit struct { InterfaceName string `cfg:"interface-name; eth0; ; the interface name to get ip and write local titan status to tikv for balancing rate limit"` + LimiterNamespace string `cfg:"limiter-namespace; sys_ratelimit;; the namespace of getting limit/balance data"` GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` diff --git a/conf/titan.toml b/conf/titan.toml index c938f58d..b1128f0a 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -188,6 +188,11 @@ pd-addrs = "mocktikv://" #description: the interface name to get ip and write local titan status to tikv for balancing rate limit interface-name = "eth0" +#type: string +#default: sys_ratelimit +#the namespace of getting limit/balance data +#limiter-namespace = "sys_ratelimit" + #type: time.Duration #description: the period in seconds to balance rate limiting with other titan nodes #default: 15s diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9cdc0918..dc477ffc 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -100,8 +100,12 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, return nil, errors.New(rateLimit.InterfaceName + " adds is empty") } + if rateLimit.LimiterNamespace == "" { + return nil, errors.New("limiter-namespace is configured with empty") + } + l := &LimitersMgr{ - limitDatadb: store.DB(LIMITDATA_NAMESPACE, LIMITDATA_DBID), + limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), globalBalancePeriod: rateLimit.GlobalBalancePeriod, titanStatusLifeTime: rateLimit.TitanStatusLifetime, syncSetPeriod: rateLimit.SyncSetPeriod, @@ -215,9 +219,6 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { } func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []string) { - if namespace == LIMITDATA_NAMESPACE { - return - } limiterName := fmt.Sprintf("%s%s%s", namespace, NAMESPACE_COMMAND_TOKEN, cmdName) v, ok := l.limiters.Load(limiterName) var commandLimiter *CommandLimiter From bcbd5a72b3be8d89f78cc4bb44b7cb1bb3e29dc7 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 18:32:51 +0800 Subject: [PATCH 27/60] decrease the lock range in reportStat --- db/limitersMgr.go | 10 +++++----- 1 file changed, 
5 insertions(+), 5 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index dc477ffc..9cef2b71 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -489,10 +489,8 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } func (cl *CommandLimiter) reportLocalStat() { - cl.lock.Lock() - defer cl.lock.Unlock() - var qpsLocal, rateLocal float64 + cl.lock.Lock() seconds := time.Since(cl.lastTime).Seconds() if seconds >= 0 { qpsLocal = float64(cl.totalCommandsCount) / seconds @@ -501,11 +499,13 @@ func (cl *CommandLimiter) reportLocalStat() { qpsLocal = 0 rateLocal = 0 } - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) cl.totalCommandsCount = 0 cl.totalCommandsSize = 0 cl.lastTime = time.Now() + cl.lock.Unlock() + + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, cl.limiterName).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) } func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { From 1a5069558ed1354085a04d65362fe8db0a377270 Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 26 Nov 2019 21:50:03 +0800 Subject: [PATCH 28/60] reportLocalStat run in itself go routine --- conf/mockconfig.go | 1 + conf/titan.toml | 6 +++--- db/limitersMgr.go | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/conf/mockconfig.go b/conf/mockconfig.go index 9649b490..b21ed652 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -37,6 +37,7 @@ func MockConf() *Titan { SyncSetPeriod: 1 * time.Second, GlobalBalancePeriod: 15 * time.Second, TitanStatusLifetime: 30 * time.Second, + LimiterNamespace: "sys_ratelimit", }, }, } diff --git a/conf/titan.toml b/conf/titan.toml index b1128f0a..37e2311a 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -196,17 +196,17 @@ interface-name = "eth0" #type: time.Duration #description: the period in seconds to balance rate limiting with other titan nodes #default: 15s -#global-balance-period = 15s +#global-balance-period = "15s" #type: time.Duration #description: how long if a titan didn't update its status, we consider it dead #default: 1m -#titanstatus-life-time = 1m +#titanstatus-life-time = "1m" #type: time.Duration #description: the period in seconds to sync new limit set in tikv #default: 3s -#sync-set-period = 3s +#sync-set-period = "3s" [tikv-logger] diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 9cef2b71..37b7c18e 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -115,6 +115,7 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, go l.startBalanceLimit() go l.startSyncNewLimit() + go l.startReportLocalStat() return l, nil } @@ -241,6 +242,13 @@ func (l *LimitersMgr) startBalanceLimit() { defer ticker.Stop() for range ticker.C { l.runBalanceLimit() + } +} + +func (l *LimitersMgr) startReportLocalStat() { + ticker := time.NewTicker(l.globalBalancePeriod) + defer ticker.Stop() + for range ticker.C { l.reportLocalStat() } } From a7753c4021984b93e59e03cf1eb6e3381d29c856 Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 29 Nov 2019 22:45:27 +0800 Subject: [PATCH 29/60] 1 change how to balance 2 change qps/rate metrics 3 labels: namespace, command, localip --- bin/titan/main.go | 2 +- conf/config.go | 4 + conf/mockconfig.go | 10 +- db/limitersMgr.go | 382 +++++++++++++++++++++++-------------- metrics/prometheus.go | 6 +- 
tools/integration/titan.go | 2 +- 6 files changed, 256 insertions(+), 150 deletions(-) diff --git a/bin/titan/main.go b/bin/titan/main.go index 4616110b..8c3cff8c 100644 --- a/bin/titan/main.go +++ b/bin/titan/main.go @@ -68,7 +68,7 @@ func main() { } svr := metrics.NewServer(&config.Status) - limitersMgr, err := db.NewLimitersMgr(store, config.Tikv.RateLimit) + limitersMgr, err := db.NewLimitersMgr(store, &config.Tikv.RateLimit) if err != nil { zap.L().Fatal("create limitersMgr failed", zap.Error(err)) os.Exit(1) diff --git a/conf/config.go b/conf/config.go index e4a01d9c..e7b00e08 100644 --- a/conf/config.go +++ b/conf/config.go @@ -109,4 +109,8 @@ type RateLimit struct { GlobalBalancePeriod time.Duration `cfg:"global-balance-period; 15s;; the period in seconds to balance rate limiting with other titan nodes"` TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` + UsageToDivide float64 `cfg:"usage-to-divide; 0.6;; if the qps/weighted limit < the percent, will divide change Factor to balance limit"` + UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit > the percent, will multiply change Factor to balance limit"` + WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the weight change unit when balance limit, is also the minimum weight"` + InitialPercent float64 `cfg:"initial-percent; 0.33;; the percent in a commandLimiter is created"` } diff --git a/conf/mockconfig.go b/conf/mockconfig.go index b21ed652..9a23a2bb 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -34,10 +34,14 @@ func MockConf() *Titan { Concurrency: 2, }, RateLimit: RateLimit{ - SyncSetPeriod: 1 * time.Second, - GlobalBalancePeriod: 15 * time.Second, - TitanStatusLifetime: 30 * time.Second, LimiterNamespace: "sys_ratelimit", + SyncSetPeriod: 1 * time.Second, + GlobalBalancePeriod: 3 * time.Second, + TitanStatusLifetime: 6 * time.Second, + UsageToDivide: 0.6, + UsageToMultiply: 0.9, + WeightChangeFactor: 1.5, + InitialPercent: 1, }, }, } diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 37b7c18e..dc5006a7 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -17,15 +17,17 @@ import ( ) const ( - LIMITDATA_NAMESPACE = "sys_ratelimit" - LIMITDATA_DBID = 0 - ALL_NAMESPACE = "*" - NAMESPACE_COMMAND_TOKEN = "@" - QPS_PREFIX = "qps:" - RATE_PREFIX = "rate:" - LIMIT_BURST_TOKEN = " " - TITAN_STATUS_TOKEN = "titan_status:" - TIME_FORMAT = "2006-01-02 15:04:05" + LIMITDATA_DBID = 0 + ALL_NAMESPACE = "*" + NAMESPACE_COMMAND_TOKEN = "@" + QPS_PREFIX = "qps:" + RATE_PREFIX = "rate:" + LIMIT_VALUE_TOKEN = " " + LIMITER_STATUS_PREFIX = "limiter_status:" + LIMITER_STATUS_VALUE_TOKEN = "," + TIME_FORMAT = "2006-01-02 15:04:05" + MAXIMUM_WEIGHT = 1 + MINIMUM_WEIGHT = 0.1 ) type CommandLimiter struct { @@ -35,6 +37,8 @@ type CommandLimiter struct { qpsl *rate.Limiter ratel *rate.Limiter localPercent float64 + weight float64 + skipBalance bool lock sync.Mutex lastTime time.Time @@ -48,12 +52,9 @@ type LimitData struct { } type LimitersMgr struct { - limitDatadb *DB - globalBalancePeriod time.Duration - titanStatusLifeTime time.Duration - syncSetPeriod time.Duration - localIp string - localPercent float64 + limitDatadb *DB + conf *conf.RateLimit + localIp string limiters sync.Map qpsAllmatchLimit sync.Map @@ -69,7 +70,24 @@ func getAllmatchLimiterName(limiterName string) string { return 
fmt.Sprintf("%s%s%s", ALL_NAMESPACE, NAMESPACE_COMMAND_TOKEN, strs[1]) } -func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, error) { +func getLimiterKey(limiterName string) []byte { + var key []byte + key = append(key, []byte(LIMITER_STATUS_PREFIX)...) + key = append(key, []byte(limiterName)...) + key = append(key, ':') + return key +} + +func getNamespaceAndCmd(limiterName string) []string { + strs := strings.Split(limiterName, NAMESPACE_COMMAND_TOKEN) + if len(strs) < 2 { + return nil + } + return strs + +} + +func NewLimitersMgr(store *RedisStore, rateLimit *conf.RateLimit) (*LimitersMgr, error) { var addrs []net.Addr var err error if rateLimit.InterfaceName != "" { @@ -103,19 +121,24 @@ func NewLimitersMgr(store *RedisStore, rateLimit conf.RateLimit) (*LimitersMgr, if rateLimit.LimiterNamespace == "" { return nil, errors.New("limiter-namespace is configured with empty") } + if rateLimit.WeightChangeFactor <= 1 { + return nil, errors.New("weight-change-factor should > 1") + } + if !(rateLimit.UsageToDivide > 0 && rateLimit.UsageToDivide < rateLimit.UsageToMultiply && rateLimit.UsageToMultiply < 1) { + return nil, errors.New("should config 0 < usage-to-divide < usage-to-multiply < 1") + } + if rateLimit.InitialPercent > 1 || rateLimit.InitialPercent <= 0 { + return nil, errors.New("initial-percent should in (0, 1]") + } l := &LimitersMgr{ - limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), - globalBalancePeriod: rateLimit.GlobalBalancePeriod, - titanStatusLifeTime: rateLimit.TitanStatusLifetime, - syncSetPeriod: rateLimit.SyncSetPeriod, - localIp: localIp, - localPercent: 1, + limitDatadb: store.DB(rateLimit.LimiterNamespace, LIMITDATA_DBID), + conf: rateLimit, + localIp: localIp, } - go l.startBalanceLimit() go l.startSyncNewLimit() - go l.startReportLocalStat() + go l.startReportAndBalance() return l, nil } @@ -137,7 +160,7 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { rateLimit, rateBurst := l.getLimit(limiterName, false) if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.localPercent) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) v, _ := l.limiters.LoadOrStore(limiterName, newCl) return v.(*CommandLimiter) } else { @@ -179,7 +202,7 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { return 0, 0 } - limitStrs := strings.Split(string(val), LIMIT_BURST_TOKEN) + limitStrs := strings.Split(string(val), LIMIT_VALUE_TOKEN) if len(limitStrs) < 2 { zap.L().Error("[Limit] limit hasn't enough parameters, should be: [K|k|M|m] ", zap.String("key", limiterKey), zap.ByteString("val", val)) return 0, 0 @@ -237,125 +260,33 @@ func (l *LimitersMgr) CheckLimit(namespace string, cmdName string, cmdArgs []str } } -func (l *LimitersMgr) startBalanceLimit() { - ticker := time.NewTicker(l.globalBalancePeriod) +func (l *LimitersMgr) startReportAndBalance() { + ticker := time.NewTicker(l.conf.GlobalBalancePeriod) defer ticker.Stop() for range ticker.C { - l.runBalanceLimit() + l.runReportAndBalance() } } -func (l *LimitersMgr) startReportLocalStat() { - ticker := time.NewTicker(l.globalBalancePeriod) - defer ticker.Stop() - for range ticker.C { - l.reportLocalStat() - } -} - -func (l *LimitersMgr) reportLocalStat() { +func (l *LimitersMgr) runReportAndBalance() { l.limiters.Range(func(k, v interface{}) bool { limiterName := 
k.(string) commandLimiter := v.(*CommandLimiter) if commandLimiter != nil { - commandLimiter.reportLocalStat() + averageQps := commandLimiter.reportLocalStat(l.conf.GlobalBalancePeriod) + commandLimiter.balanceLimit(averageQps, l.limitDatadb, l.conf.TitanStatusLifetime, l.conf.UsageToDivide, l.conf.UsageToMultiply, l.conf.WeightChangeFactor) + } else { - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(l.localIp, limiterName).Set(0) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(l.localIp, limiterName).Set(0) + namespaceAndCmd := getNamespaceAndCmd(limiterName) + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(namespaceAndCmd[0], namespaceAndCmd[1], l.localIp).Set(0) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(namespaceAndCmd[0], namespaceAndCmd[1], l.localIp).Set(0) } return true }) } -func (l *LimitersMgr) runBalanceLimit() { - txn, err := l.limitDatadb.Begin() - if err != nil { - zap.L().Error("[Limit] transection begin failed", zap.String("titan", l.localIp), zap.Error(err)) - return - } - - prefix := MetaKey(l.limitDatadb, []byte(TITAN_STATUS_TOKEN)) - endPrefix := sdk_kv.Key(prefix).PrefixNext() - iter, err := txn.t.Iter(prefix, endPrefix) - if err != nil { - zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) - txn.Rollback() - return - } - defer iter.Close() - - activeNum := float64(1) - prefixLen := len(prefix) - for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { - if err != nil { - zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) - txn.Rollback() - return - } - - key := iter.Key() - if len(key) <= prefixLen { - zap.L().Error("ip is null", zap.ByteString("key", key)) - continue - } - ip := key[prefixLen:] - obj := NewString(txn, key) - if err = obj.decode(iter.Value()); err != nil { - zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) - continue - } - - lastActive := obj.Meta.Value - lastActiveT, err := time.ParseInLocation(TIME_FORMAT, string(lastActive), time.Local) - if err != nil { - zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.ByteString("lastActive", lastActive), zap.Error(err)) - continue - } - - diff := time.Since(lastActiveT).Seconds() - zap.L().Info("[Limit] last active time", zap.ByteString("ip", ip), zap.ByteString("lastActive", lastActive), zap.Float64("activePast", diff)) - if string(ip) != l.localIp && time.Since(lastActiveT) <= l.titanStatusLifeTime { - activeNum++ - } - } - newPercent := 1 / activeNum - var oldPercent float64 - l.lock.Lock() - oldPercent = l.localPercent - l.lock.Unlock() - if oldPercent != newPercent { - zap.L().Info("[Limit] balance limit in all titan server", zap.Float64("active server num", activeNum), - zap.Float64("oldPercent", oldPercent), zap.Float64("newPercent", newPercent)) - l.limiters.Range(func(k, v interface{}) bool { - commandLimiter := v.(*CommandLimiter) - if commandLimiter != nil { - commandLimiter.updateLimitPercent(newPercent) - } - return true - }) - - l.lock.Lock() - l.localPercent = newPercent - l.lock.Unlock() - } - - key := []byte(TITAN_STATUS_TOKEN + l.localIp) - s := NewString(txn, key) - now := time.Now() - value := now.Format(TIME_FORMAT) - if err := s.Set([]byte(value), 0); err != nil { - txn.Rollback() - return - } - if err := txn.t.Commit(context.Background()); err != nil { - zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", l.localIp)) - txn.Rollback() - return - } -} - func (l 
*LimitersMgr) startSyncNewLimit() { - ticker := time.NewTicker(l.syncSetPeriod) + ticker := time.NewTicker(l.conf.SyncSetPeriod) defer ticker.Stop() for range ticker.C { l.runSyncNewLimit() @@ -388,10 +319,6 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } - var localPercent float64 - l.lock.Lock() - localPercent = l.localPercent - l.lock.Unlock() l.limiters.Range(func(k, v interface{}) bool { limiterName := k.(string) commandLimiter := v.(*CommandLimiter) @@ -426,7 +353,7 @@ func (l *LimitersMgr) runSyncNewLimit() { zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { - newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, localPercent) + newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) l.limiters.Store(limiterName, newCl) } else { commandLimiter.updateLimit(qpsLimit, qpsBurst, rateLimit, rateBurst) @@ -444,20 +371,29 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, localPercent float64) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, initialPercent float64) *CommandLimiter { + if !(qpsLimit > 0 && qpsBurst > 0) && + !(rateLimit > 0 && rateBurst > 0) { + return nil + } + if initialPercent <= 0 { + return nil + } var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit*localPercent), qpsBurst) + qpsl = rate.NewLimiter(rate.Limit(qpsLimit*initialPercent), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit*localPercent), rateBurst) + ratel = rate.NewLimiter(rate.Limit(rateLimit*initialPercent), rateBurst) } cl := &CommandLimiter{ limiterName: limiterName, localIp: localIp, qpsl: qpsl, ratel: ratel, - localPercent: localPercent, + localPercent: initialPercent, + weight: MAXIMUM_WEIGHT, + skipBalance: true, lastTime: time.Now(), } return cl @@ -467,6 +403,20 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit cl.lock.Lock() defer cl.lock.Unlock() + ////when limit is changed, the qps can't be used to balanceLimit + var orgQpsLimit, orgRateLimit float64 + var orgQpsBurst, orgRateBurst int + if cl.qpsl != nil { + orgQpsLimit = float64(cl.qpsl.Limit()) / cl.localPercent + orgQpsBurst = cl.qpsl.Burst() + } + if cl.ratel != nil { + orgRateLimit = float64(cl.ratel.Limit()) / cl.localPercent + orgRateBurst = cl.ratel.Burst() + } + if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { + cl.skipBalance = true + } if qpsLimit > 0 && qpsBurst > 0 { if cl.qpsl != nil { if cl.qpsl.Burst() != qpsBurst { @@ -496,9 +446,10 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit } } -func (cl *CommandLimiter) reportLocalStat() { +func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { var qpsLocal, rateLocal float64 cl.lock.Lock() + defer cl.lock.Unlock() seconds := time.Since(cl.lastTime).Seconds() if seconds >= 0 { qpsLocal = float64(cl.totalCommandsCount) / seconds @@ -510,16 +461,161 @@ func (cl *CommandLimiter) reportLocalStat() { cl.totalCommandsCount = 0 cl.totalCommandsSize = 0 cl.lastTime = time.Now() - cl.lock.Unlock() - metrics.GetMetrics().LimiterQpsVec.WithLabelValues(cl.localIp, 
cl.limiterName).Set(qpsLocal) - metrics.GetMetrics().LimiterRateVec.WithLabelValues(cl.localIp, cl.limiterName).Set(rateLocal) + namespaceCmd := getNamespaceAndCmd(cl.limiterName) + metrics.GetMetrics().LimiterQpsVec.WithLabelValues(namespaceCmd[0], namespaceCmd[1], cl.localIp).Set(qpsLocal) + metrics.GetMetrics().LimiterRateVec.WithLabelValues(namespaceCmd[0], namespaceCmd[1], cl.localIp).Set(rateLocal) + + return qpsLocal } -func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { +func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, titanStatusLifetime time.Duration, + devideUsage float64, multiplyUsage float64, weightChangeFactor float64) { cl.lock.Lock() defer cl.lock.Unlock() + if cl.qpsl == nil { + return + } + if cl.skipBalance { + cl.skipBalance = false + return + } + + txn, err := limitDatadb.Begin() + if err != nil { + zap.L().Error("[Limit] transection begin failed", zap.String("titan", cl.localIp), zap.Error(err)) + return + } + + weights, qpss, err := cl.scanStatusInOtherTitan(limitDatadb, txn, titanStatusLifetime) + if err != nil { + txn.Rollback() + return + } + + totalWeight := cl.weight + for i := range weights { + totalWeight += weights[i] + } + + originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent + selfLimitInTarget := originalLimit * (cl.weight / totalWeight) + if averageQps < selfLimitInTarget*devideUsage { + otherFull := false + for i := range qpss { + otherLimitInTarget := originalLimit * (weights[i] / totalWeight) + if qpss[i] >= otherLimitInTarget*multiplyUsage { + otherFull = true + break + } + } + if otherFull { + cl.weight /= weightChangeFactor + if cl.weight < MINIMUM_WEIGHT { + cl.weight = MINIMUM_WEIGHT + } + } + } else if averageQps >= selfLimitInTarget*multiplyUsage { + cl.weight *= weightChangeFactor + if cl.weight > MAXIMUM_WEIGHT { + cl.weight = MAXIMUM_WEIGHT + } + } + + totalWeight = cl.weight + for i := range weights { + totalWeight += weights[i] + } + newPercent := cl.weight / totalWeight + + key := getLimiterKey(cl.limiterName) + key = append(key, []byte(cl.localIp)...) 
+ s := NewString(txn, key) + now := time.Now() + strTime := now.Format(TIME_FORMAT) + value := fmt.Sprintf("%f%s%f%s%s", cl.weight, LIMITER_STATUS_VALUE_TOKEN, averageQps, LIMITER_STATUS_VALUE_TOKEN, strTime) + if err := s.Set([]byte(value), 0); err != nil { + txn.Rollback() + return + } + if err := txn.t.Commit(context.Background()); err != nil { + zap.L().Error("[Limit] commit after balance limit failed", zap.String("titan", cl.localIp)) + txn.Rollback() + return + } + zap.L().Info("[Limit] balance limit", zap.String("limiterName", cl.limiterName), + zap.Float64("qps", averageQps), zap.Float64("newWeight", cl.weight), zap.Float64("newPercent", newPercent)) + cl.updateLimitPercent(newPercent) +} + +func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transaction, titanStatusLifetime time.Duration) ([]float64, []float64, error) { + key := getLimiterKey(cl.limiterName) + prefix := MetaKey(limitDatadb, key) + endPrefix := sdk_kv.Key(prefix).PrefixNext() + iter, err := txn.t.Iter(prefix, endPrefix) + if err != nil { + zap.L().Error("[Limit] seek failed", zap.ByteString("prefix", prefix), zap.Error(err)) + return nil, nil, err + } + defer iter.Close() + + prefixLen := len(prefix) + var weights, qpss []float64 + var weight, qps float64 + for ; iter.Valid() && iter.Key().HasPrefix(prefix); err = iter.Next() { + if err != nil { + zap.L().Error("[Limit] next failed", zap.ByteString("prefix", prefix), zap.Error(err)) + return nil, nil, err + } + + key := iter.Key() + if len(key) <= prefixLen { + zap.L().Error("ip is null", zap.ByteString("key", key)) + continue + } + ip := key[prefixLen:] + obj := NewString(txn, key) + if err = obj.decode(iter.Value()); err != nil { + zap.L().Error("[Limit] Strings decoded value error", zap.ByteString("key", key), zap.Error(err)) + continue + } + + val := string(obj.Meta.Value) + vals := strings.Split(val, LIMITER_STATUS_VALUE_TOKEN) + if len(vals) < 3 { + zap.L().Error("[Limit] short of values(should 3 values)", zap.ByteString("key", key), zap.String("value", val)) + continue + } + sWeight := vals[0] + sQps := vals[1] + lastActive := vals[2] + + if weight, err = strconv.ParseFloat(sWeight, 64); err != nil { + zap.L().Error("[Limit] weight can't be decoded to float", zap.ByteString("key", key), zap.String("weight", sWeight), zap.Error(err)) + continue + } + if qps, err = strconv.ParseFloat(sQps, 64); err != nil { + zap.L().Error("[Limit] qps can't be decoded to float", zap.ByteString("key", key), zap.String("qps", sQps), zap.Error(err)) + continue + } + + lastActiveT, err := time.ParseInLocation(TIME_FORMAT, lastActive, time.Local) + if err != nil { + zap.L().Error("[Limit] value can't decoded into a time", zap.ByteString("key", key), zap.String("lastActive", lastActive), zap.Error(err)) + continue + } + + zap.L().Info("[Limit] other titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) + if string(ip) != cl.localIp && time.Since(lastActiveT) <= titanStatusLifetime { + weights = append(weights, weight) + qpss = append(qpss, qps) + } + } + return weights, qpss, nil +} + +func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 7215f37c..b01639ac 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -22,6 +22,7 @@ const 
( gckeys = "gckeys" expire = "expire" tikvGC = "tikvgc" + titanip = "titanip" ) var ( @@ -34,6 +35,7 @@ var ( gcKeysLabel = []string{gckeys} expireLabel = []string{expire} tikvGCLabel = []string{tikvGC} + limitLabel = []string{biz, command, titanip} // global prometheus object gm *Metrics @@ -197,7 +199,7 @@ func init() { Namespace: namespace, Name: "qps_limiter_status", Help: "the qps of a namespace's command in a titan server", - }, multiLabel) + }, limitLabel) prometheus.MustRegister(gm.LimiterQpsVec) gm.LimiterRateVec = prometheus.NewGaugeVec( @@ -205,7 +207,7 @@ func init() { Namespace: namespace, Name: "rate_limiter_status", Help: "the rate of a namespace's command in a titan server(KB/s)", - }, multiLabel) + }, limitLabel) prometheus.MustRegister(gm.LimiterRateVec) gm.LRangeSeekHistogram = prometheus.NewHistogram( diff --git a/tools/integration/titan.go b/tools/integration/titan.go index 9d400e2d..2e63fd1c 100644 --- a/tools/integration/titan.go +++ b/tools/integration/titan.go @@ -48,7 +48,7 @@ func Start() { log.Fatalln(err) } - limitersMgr, err := db.NewLimitersMgr(store, tikvConf.RateLimit) + limitersMgr, err := db.NewLimitersMgr(store, &tikvConf.RateLimit) if err != nil { log.Fatalln(err) } From fda1e666e59330d1dc34f23954a2f0836bd3f24f Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 11:11:24 +0800 Subject: [PATCH 30/60] update some config item's description and log --- conf/config.go | 6 +++--- conf/titan.toml | 20 ++++++++++++++++++++ db/limitersMgr.go | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/conf/config.go b/conf/config.go index e7b00e08..f5eff6dc 100644 --- a/conf/config.go +++ b/conf/config.go @@ -110,7 +110,7 @@ type RateLimit struct { TitanStatusLifetime time.Duration `cfg:"titanstatus-life-time; 1m;; how long if a titan didn't update its status, we consider it dead"` SyncSetPeriod time.Duration `cfg:"sync-set-period; 3s;; the period in seconds to sync new limit set in tikv"` UsageToDivide float64 `cfg:"usage-to-divide; 0.6;; if the qps/weighted limit < the percent, will divide change Factor to balance limit"` - UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit > the percent, will multiply change Factor to balance limit"` - WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the weight change unit when balance limit, is also the minimum weight"` - InitialPercent float64 `cfg:"initial-percent; 0.33;; the percent in a commandLimiter is created"` + UsageToMultiply float64 `cfg:"usage-to-multiply; 0.9;; if the qps/weighted limit >= the percent, will multiply change Factor to balance limit"` + WeightChangeFactor float64 `cfg:"weight-change-factor; 1.5;; the factor to devide/multipy in current weight"` + InitialPercent float64 `cfg:"initial-percent; 0.33;; the limit is set in the percent when a commandLimiter is created"` } diff --git a/conf/titan.toml b/conf/titan.toml index 37e2311a..aada95e2 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -208,6 +208,26 @@ interface-name = "eth0" #default: 3s #sync-set-period = "3s" +#type: float64 +#description: if the qps/weighted limit < the percent, will divide change Factor to balance limit +#default: 0.6 +usage-to-divide = 0.6 + +#type: float64 +#description: if the qps/weighted limit >= the percent, will multiply change Factor to balance limit +#default: 0.9 +usage-to-multiply = 0.9 + +#type: float64 +#description: the factor to devide/multipy in current weight +#default: 1.5 +weight-change-factor = 1.5 + +#type: float64 +#description: the limit 
is set in the percent when a commandLimiter is created +#default: 0.33 +initial-percent = 0.33 + [tikv-logger] #type: string diff --git a/db/limitersMgr.go b/db/limitersMgr.go index dc5006a7..7716a20e 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -606,7 +606,7 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti continue } - zap.L().Info("[Limit] other titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) + zap.L().Info("[Limit] titan status", zap.ByteString("key", key), zap.Float64("weight", weight), zap.Float64("qps", qps), zap.String("lastActive", lastActive)) if string(ip) != cl.localIp && time.Since(lastActiveT) <= titanStatusLifetime { weights = append(weights, weight) qpss = append(qpss, qps) From a474656d77763dc0d5a2ed822ad23edcf0e735b1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 15:12:21 +0800 Subject: [PATCH 31/60] add limit changed log --- db/limitersMgr.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 7716a20e..95176d97 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -415,6 +415,8 @@ func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit orgRateBurst = cl.ratel.Burst() } if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { + zap.L().Info("limit changed", zap.Float64("orgQpsLimit", orgQpsLimit), zap.Float64("qpsLimit", qpsLimit), zap.Int("orgQpsBurst", orgQpsBurst), zap.Int("qpsBurst", qpsBurst), + zap.Float64("orgRateLimit", orgRateLimit), zap.Float64("rateLimit", rateLimit), zap.Int("orgRateBurst", orgRateBurst), zap.Int("rateBurst", rateBurst)) cl.skipBalance = true } if qpsLimit > 0 && qpsBurst > 0 { From 8ed06984da977c5b0901462ff0a72ae019980152 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Dec 2019 18:09:22 +0800 Subject: [PATCH 32/60] commandLimiter store globalQpsLimit/globalRateLimit in int64, use it to calculate the local limit in updatePercent() and judge if limit is changed, this resolve float precision lost problem --- db/limitersMgr.go | 116 +++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index 95176d97..d081fe01 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -34,11 +34,13 @@ type CommandLimiter struct { localIp string limiterName string - qpsl *rate.Limiter - ratel *rate.Limiter - localPercent float64 - weight float64 - skipBalance bool + qpsl *rate.Limiter + ratel *rate.Limiter + globalQpsLimit int64 + globalRateLimit int64 + localPercent float64 + weight float64 + skipBalance bool lock sync.Mutex lastTime time.Time @@ -47,7 +49,7 @@ type CommandLimiter struct { } type LimitData struct { - limit float64 + limit int64 burst int } @@ -169,8 +171,8 @@ func (l *LimitersMgr) init(limiterName string) *CommandLimiter { } } -func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { - limit := float64(0) +func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (int64, int) { + limit := int64(0) burst := int64(0) txn, err := l.limitDatadb.Begin() @@ -214,7 +216,7 @@ func (l *LimitersMgr) getLimit(limiterName string, isQps bool) (float64, int) { return 0, 0 } var strUnit uint8 - var unit float64 + var unit int64 strUnit = limitStr[len(limitStr)-1] if strUnit == 'k' || strUnit == 'K' { unit = 1024 @@ -225,18 +227,19 @@ func (l *LimitersMgr) 
getLimit(limiterName string, isQps bool) (float64, int) { } else { unit = 1 } - if limit, err = strconv.ParseFloat(limitStr, 64); err != nil { - zap.L().Error("[Limit] limit can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) + limitInUnit, err := strconv.ParseFloat(limitStr, 64) + if err != nil { + zap.L().Error("[Limit] limit's number part can't be decoded to number", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) return 0, 0 } - limit *= unit - if burst, err = strconv.ParseInt(burstStr, 10, 64); err != nil { + limit = int64(limitInUnit * float64(unit)) + if burst, err = strconv.ParseInt(burstStr, 10, 32); err != nil { zap.L().Error("[Limit] burst can't be decoded to integer", zap.String("key", limiterKey), zap.ByteString("val", val), zap.Error(err)) return 0, 0 } if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] got limit"); logEnv != nil { - logEnv.Write(zap.String("key", limiterKey), zap.Float64("limit", limit), zap.Int64("burst", burst)) + logEnv.Write(zap.String("key", limiterKey), zap.Int64("limit", limit), zap.Int64("burst", burst)) } return limit, int(burst) @@ -349,8 +352,8 @@ func (l *LimitersMgr) runSyncNewLimit() { if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) @@ -361,8 +364,8 @@ func (l *LimitersMgr) runSyncNewLimit() { } else { if commandLimiter != nil { if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is cleared"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Float64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) + logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), + zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) } l.limiters.Store(limiterName, (*CommandLimiter)(nil)) } @@ -371,7 +374,7 @@ func (l *LimitersMgr) runSyncNewLimit() { }) } -func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int, initialPercent float64) *CommandLimiter { +func NewCommandLimiter(localIp string, limiterName string, qpsLimit int64, qpsBurst int, rateLimit int64, rateBurst int, initialPercent float64) *CommandLimiter { if !(qpsLimit > 0 && qpsBurst > 0) && !(rateLimit > 0 && rateBurst > 0) { return nil @@ -381,71 +384,78 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit float64, qps } var qpsl, ratel *rate.Limiter if qpsLimit > 0 && qpsBurst > 0 { - qpsl = rate.NewLimiter(rate.Limit(qpsLimit*initialPercent), qpsBurst) + localQpsLimit := float64(qpsLimit) * initialPercent + qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), qpsBurst) } if rateLimit > 0 && rateBurst > 0 { - ratel = rate.NewLimiter(rate.Limit(rateLimit*initialPercent), rateBurst) + localRateLimit := float64(rateLimit) * 
initialPercent + ratel = rate.NewLimiter(rate.Limit(localRateLimit), rateBurst) } cl := &CommandLimiter{ - limiterName: limiterName, - localIp: localIp, - qpsl: qpsl, - ratel: ratel, - localPercent: initialPercent, - weight: MAXIMUM_WEIGHT, - skipBalance: true, - lastTime: time.Now(), + limiterName: limiterName, + localIp: localIp, + qpsl: qpsl, + ratel: ratel, + globalQpsLimit: qpsLimit, + globalRateLimit: rateLimit, + localPercent: initialPercent, + weight: MAXIMUM_WEIGHT, + skipBalance: true, + lastTime: time.Now(), } return cl } -func (cl *CommandLimiter) updateLimit(qpsLimit float64, qpsBurst int, rateLimit float64, rateBurst int) { +func (cl *CommandLimiter) updateLimit(newQpsLimit int64, newQpsBurst int, newRateLimit int64, newRateBurst int) { cl.lock.Lock() defer cl.lock.Unlock() ////when limit is changed, the qps can't be used to balanceLimit - var orgQpsLimit, orgRateLimit float64 - var orgQpsBurst, orgRateBurst int + var qpsBurst, rateBurst int if cl.qpsl != nil { - orgQpsLimit = float64(cl.qpsl.Limit()) / cl.localPercent - orgQpsBurst = cl.qpsl.Burst() + qpsBurst = cl.qpsl.Burst() } if cl.ratel != nil { - orgRateLimit = float64(cl.ratel.Limit()) / cl.localPercent - orgRateBurst = cl.ratel.Burst() + rateBurst = cl.ratel.Burst() } - if orgQpsLimit != qpsLimit || orgQpsBurst != qpsBurst || orgRateLimit != rateLimit || orgRateBurst != rateBurst { - zap.L().Info("limit changed", zap.Float64("orgQpsLimit", orgQpsLimit), zap.Float64("qpsLimit", qpsLimit), zap.Int("orgQpsBurst", orgQpsBurst), zap.Int("qpsBurst", qpsBurst), - zap.Float64("orgRateLimit", orgRateLimit), zap.Float64("rateLimit", rateLimit), zap.Int("orgRateBurst", orgRateBurst), zap.Int("rateBurst", rateBurst)) + if cl.globalQpsLimit != newQpsLimit || qpsBurst != newQpsBurst || cl.globalRateLimit != newRateLimit || rateBurst != newRateBurst { + zap.L().Info("limit changed", zap.Int64("globalQpsLimit", cl.globalQpsLimit), zap.Int64("newQpsLimit", newQpsLimit), zap.Int("qpsBurst", qpsBurst), zap.Int("newQpsBurst", newQpsBurst), + zap.Int64("globalRateLimit", cl.globalRateLimit), zap.Int64("newRateLimit", newRateLimit), zap.Int("rateBurst", rateBurst), zap.Int("newRateBurst", newRateBurst)) cl.skipBalance = true } - if qpsLimit > 0 && qpsBurst > 0 { + + if newQpsLimit > 0 && newQpsBurst > 0 { + localQpsLimit := float64(newQpsLimit) * cl.localPercent if cl.qpsl != nil { - if cl.qpsl.Burst() != qpsBurst { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) - } else if cl.qpsl.Limit() != rate.Limit(qpsLimit*cl.localPercent) { - cl.qpsl.SetLimit(rate.Limit(qpsLimit * cl.localPercent)) + if cl.qpsl.Burst() != newQpsBurst { + cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) + } else if cl.globalQpsLimit != newQpsLimit { + cl.qpsl.SetLimit(rate.Limit(localQpsLimit)) } } else { - cl.qpsl = rate.NewLimiter(rate.Limit(qpsLimit*cl.localPercent), qpsBurst) + cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) } } else { cl.qpsl = nil } - if rateLimit > 0 && rateBurst > 0 { + if newRateLimit > 0 && newRateBurst > 0 { + localRateLimit := float64(newRateLimit) * cl.localPercent if cl.ratel != nil { - if cl.ratel.Burst() != rateBurst { - cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) - } else if cl.ratel.Limit() != rate.Limit(rateLimit*cl.localPercent) { - cl.ratel.SetLimit(rate.Limit(rateLimit * cl.localPercent)) + if cl.ratel.Burst() != newRateBurst { + cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) + } else if cl.globalRateLimit != 
newRateLimit { + cl.ratel.SetLimit(rate.Limit(localRateLimit)) } } else { - cl.ratel = rate.NewLimiter(rate.Limit(rateLimit*cl.localPercent), rateBurst) + cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) } } else { cl.ratel = nil } + + cl.globalQpsLimit = newQpsLimit + cl.globalRateLimit = newRateLimit } func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { @@ -620,12 +630,12 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { if cl.qpsl != nil { - qpsLimit := (float64(cl.qpsl.Limit()) / cl.localPercent) * newPercent + qpsLimit := float64(cl.globalQpsLimit) * newPercent zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) cl.qpsl.SetLimit(rate.Limit(qpsLimit)) } if cl.ratel != nil { - rateLimit := float64(cl.ratel.Limit()) / cl.localPercent * newPercent + rateLimit := float64(cl.globalRateLimit) * newPercent zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) cl.ratel.SetLimit(rate.Limit(rateLimit)) } From 6ae93be7bf9627f2034402f503b72dc26d37d79c Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 3 Dec 2019 14:56:24 +0800 Subject: [PATCH 33/60] when all titan'a limiter qps is < devideUsage of its weight percent, add current titan weight again --- db/limitersMgr.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index d081fe01..e6e63189 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -514,19 +514,30 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent selfLimitInTarget := originalLimit * (cl.weight / totalWeight) if averageQps < selfLimitInTarget*devideUsage { - otherFull := false + otherHaveHigh := false + otherAllLow := true for i := range qpss { otherLimitInTarget := originalLimit * (weights[i] / totalWeight) if qpss[i] >= otherLimitInTarget*multiplyUsage { - otherFull = true + otherHaveHigh = true + otherAllLow = false + break + } + if qpss[i] >= otherLimitInTarget*devideUsage { + otherAllLow = false break } } - if otherFull { + if otherHaveHigh { cl.weight /= weightChangeFactor if cl.weight < MINIMUM_WEIGHT { cl.weight = MINIMUM_WEIGHT } + } else if otherAllLow { + cl.weight *= weightChangeFactor + if cl.weight > MAXIMUM_WEIGHT { + cl.weight = MAXIMUM_WEIGHT + } } } else if averageQps >= selfLimitInTarget*multiplyUsage { cl.weight *= weightChangeFactor From 70b3f15445c592cda2f755f1763014c609dda192 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 4 Dec 2019 20:59:55 +0800 Subject: [PATCH 34/60] add showlimit.sh and add a conf parameter for setlimit.sh --- setlimit.sh | 25 +++++++++++++++---------- showlimit.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 10 deletions(-) create mode 100644 showlimit.sh diff --git a/setlimit.sh b/setlimit.sh index 742d34d8..bf2441f6 100644 --- a/setlimit.sh +++ b/setlimit.sh @@ -1,22 +1,25 @@ #!/bin/bash -host=tkv7369.test.zhuaninc.com -port=7369 -rediscli="./redis-cli" -token="sys_ratelimit-1574130304-1-36c153b109ebca80b43769" + usage_exit() { echo "usage:" - echo "$0 set qps=(1|0) cmd= namespace= limit=[k/K/m/M] burst=" + echo " set qps=(1|0) 
cmd= namespace= limit=[k/K/m/M] burst=" echo "or" - echo "$0 del qps=(1|0) cmd= namespace=" + echo " del qps=(1|0) cmd= namespace=" echo ": all means matching all namespaces" exit 1 } - -if [ $# -lt 1 ]; then +if [ $# -lt 5 ]; then usage_exit fi -op=$1 + +configpath=$1 +host=`grep host= $configpath|sed 's/host=//'` +port=`grep port= $configpath|sed 's/port=//'` +rediscli=`grep rediscli= $configpath|sed 's/rediscli=//'` +token=`grep token= $configpath|sed 's/token=//'` + +op=$2 if [ "$op" != "set" -a "$op" != "del" ]; then usage_exit fi @@ -27,9 +30,11 @@ namespace= limit= burst= +i=0 for arg in $* do - if [ "$arg" = "$op" ]; then + i=`expr $i + 1` + if [ $i -le 2 ]; then continue fi diff --git a/showlimit.sh b/showlimit.sh new file mode 100644 index 00000000..9697eb3c --- /dev/null +++ b/showlimit.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +usage_exit() +{ + echo "usage: " + exit 1 +} +if [ $# -lt 1 ]; then + usage_exit +fi + +configpath=$1 +host=`grep host= $configpath|sed 's/host=//'` +port=`grep port= $configpath|sed 's/port=//'` +rediscli=`grep rediscli= $configpath|sed 's/rediscli=//'` +token=`grep token= $configpath|sed 's/token=//'` + +for prefix in qps: rate: limiter_status: +do + echo "-------------------------- $prefix data --------------------------" + $rediscli -h $host -p $port -a $token scan 0 count 256 match ${prefix}*|sed '1d'|grep -v '^$'|while read key + do + value=`$rediscli -h $host -p $port -a $token get $key` + echo -e "key=${key}\tvalue=$value" + done + echo +done From a22ca0fb9cecc4fe24dd176931c084d4fb5c0a8a Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 6 Dec 2019 12:05:19 +0800 Subject: [PATCH 35/60] 1 resolve the problem that titan still process command for remote closed pipelined connection 2 add namespace log for readCommand failure and client processing go routine's exit --- client.go | 18 +++++++++++------- command/command.go | 4 +++- titan.go | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/client.go b/client.go index 2728481b..9b776a8c 100644 --- a/client.go +++ b/client.go @@ -54,11 +54,11 @@ func (c *client) Write(p []byte) (int, error) { zap.L().Debug("write to client", zap.Int64("clientid", c.cliCtx.ID), zap.String("msg", string(p))) n, err := c.conn.Write(p) if err != nil { - c.conn.Close() if err == io.EOF { - zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr), - zap.Int64("clientid", c.cliCtx.ID)) + zap.L().Info("connection was half-closed by remote peer", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace)) } else { + //may be unknown error with message "connection reset by peer" zap.L().Error("write net failed", zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace), @@ -66,10 +66,14 @@ func (c *client) Write(p []byte) (int, error) { zap.Bool("watching", c.cliCtx.Txn != nil), zap.String("command", c.cliCtx.LastCmd), zap.String("error", err.Error())) - return 0, err } + //client.serve() will get the channel close event, close the connection, exit current go routine + //if the remote client create the connection and use pipeline to invoke command, then it close the connection, titan still can get command from client.bufio.Reader and process + //close ClientContext.Done will help client.serve() to interrupt command processing + close(c.cliCtx.Done) } - return n, nil + //return err for above write() error, then replying many times command can break its sending to a half-closed 
connection, etc BytesArray(lrange invoke it).
+	return n, err
 }
 
 func (c *client) serve(conn net.Conn) error {
@@ -89,11 +93,11 @@ func (c *client) serve(conn net.Conn) error {
 			c.conn.Close()
 			if err == io.EOF {
 				zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr),
-					zap.Int64("clientid", c.cliCtx.ID))
+					zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace))
 				return nil
 			}
 			zap.L().Error("read command failed", zap.String("addr", c.cliCtx.RemoteAddr),
-				zap.Int64("clientid", c.cliCtx.ID), zap.Error(err))
+				zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace), zap.Error(err))
 			return err
 		}
 	}
diff --git a/command/command.go b/command/command.go
index 0a17d06d..c96b8ec3 100644
--- a/command/command.go
+++ b/command/command.go
@@ -70,7 +70,9 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit {
 			resp.ReplyNullBulkString(w)
 			continue
 		}
-		resp.ReplyBulkString(w, string(a[i]))
+		if err := resp.ReplyBulkString(w, string(a[i])); err != nil {
+			break
+		}
 		if i%10 == 9 {
 			zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000))
 			start = time.Now()
diff --git a/titan.go b/titan.go
index b96f0bbf..198c2924 100644
--- a/titan.go
+++ b/titan.go
@@ -48,7 +48,7 @@ func (s *Server) Serve(lis net.Listener) error {
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Inc()
 			if err := cli.serve(conn); err != nil {
 				zap.L().Error("serve conn failed", zap.String("addr", cli.cliCtx.RemoteAddr),
-					zap.Int64("clientid", cliCtx.ID), zap.Error(err))
+					zap.Int64("clientid", cliCtx.ID), zap.String("namespace", cli.cliCtx.Namespace), zap.Error(err))
 			}
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec()
 			s.servCtx.Clients.Delete(cli.cliCtx.ID)
From 8403a21bea8ad49aa89e5f4c42c7ed9beaaff7e4 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Fri, 6 Dec 2019 16:57:12 +0800
Subject: [PATCH 36/60] 1 BytesArray: check for write errors 2 use a
 remoteClosed flag to interrupt processing when a pipelined connection is
 closed; if we close the ctx.Done channel and then write() again, titan will
 crash by re-closing the channel

---
 client.go          | 37 +++++++++++++------------------------
 command/command.go | 10 +++++++---
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/client.go b/client.go
index 9b776a8c..df8b2540 100644
--- a/client.go
+++ b/client.go
@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"net"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/distributedio/titan/command"
@@ -22,33 +21,18 @@ type client struct {
 	exec   *command.Executor
 	r      *bufio.Reader
 
-	eofLock sync.Mutex //the lock of reading_writing 'eof'
-	eof     bool       //is over when read data from socket
+	remoteClosed bool //is the connection closed by remote peer?
} func newClient(cliCtx *context.ClientContext, s *Server, exec *command.Executor) *client { return &client{ - cliCtx: cliCtx, - server: s, - exec: exec, - eof: false, + cliCtx: cliCtx, + server: s, + exec: exec, + remoteClosed: false, } } -func (c *client) readEof() { - c.eofLock.Lock() - defer c.eofLock.Unlock() - - c.eof = true -} - -func (c *client) isEof() bool { - c.eofLock.Lock() - defer c.eofLock.Unlock() - - return c.eof -} - // Write to conn and log error if needed func (c *client) Write(p []byte) (int, error) { zap.L().Debug("write to client", zap.Int64("clientid", c.cliCtx.ID), zap.String("msg", string(p))) @@ -68,9 +52,9 @@ func (c *client) Write(p []byte) (int, error) { zap.String("error", err.Error())) } //client.serve() will get the channel close event, close the connection, exit current go routine - //if the remote client create the connection and use pipeline to invoke command, then it close the connection, titan still can get command from client.bufio.Reader and process - //close ClientContext.Done will help client.serve() to interrupt command processing - close(c.cliCtx.Done) + //if the remote client use pipeline to invoke command, then close the connection(timeout etc), titan still get command from client.bufio.Reader and process + //setting client.remoteClosed to true will help client.serve() to interrupt command processing + c.remoteClosed = true } //return err for above write() error, then replying many times command can break its sending to a half-closed connection, etc BytesArray(lrange invoke it). return n, err @@ -88,6 +72,11 @@ func (c *client) serve(conn net.Conn) error { case <-c.cliCtx.Done: return c.conn.Close() default: + if c.remoteClosed { + zap.L().Info("close connection", zap.String("addr", c.cliCtx.RemoteAddr), + zap.Int64("clientid", c.cliCtx.ID), zap.String("namespace", c.cliCtx.Namespace)) + return c.conn.Close() + } cmd, err = c.readCommand() if err != nil { c.conn.Close() diff --git a/command/command.go b/command/command.go index c96b8ec3..5f7b5ce4 100644 --- a/command/command.go +++ b/command/command.go @@ -62,16 +62,20 @@ func Integer(w io.Writer, v int64) OnCommit { func BytesArray(w io.Writer, a [][]byte) OnCommit { return func() { start := time.Now() - resp.ReplyArray(w, len(a)) + if _, err := resp.ReplyArray(w, len(a)); err != nil { + return + } zap.L().Debug("reply array size", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) start = time.Now() for i := range a { if a[i] == nil { - resp.ReplyNullBulkString(w) + if err := resp.ReplyNullBulkString(w); err != nil { + return + } continue } if err := resp.ReplyBulkString(w, string(a[i])); err != nil { - break + return } if i%10 == 9 { zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) From 2d59108eb26a1b528409c4ea151101f866c14f71 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sat, 7 Dec 2019 10:28:18 +0800 Subject: [PATCH 37/60] refactor commandLimiter's qpsLimiter/ratelimiter and their globalLimit, lock, localPercent to a LimiterWrapper, and decrease the lock range --- db/limitersMgr.go | 258 +++++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 130 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index e6e63189..ce26bf39 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -30,19 +30,24 @@ const ( MINIMUM_WEIGHT = 0.1 ) +type LimiterWrapper struct { + limiterName string + limiter *rate.Limiter + globalLimit int64 + localPercent float64 + lock sync.Mutex +} + type CommandLimiter struct 
{ localIp string limiterName string - qpsl *rate.Limiter - ratel *rate.Limiter - globalQpsLimit int64 - globalRateLimit int64 - localPercent float64 - weight float64 - skipBalance bool + qpsLw LimiterWrapper + rateLw LimiterWrapper + weight float64 lock sync.Mutex + skipBalance bool lastTime time.Time totalCommandsCount int64 totalCommandsSize int64 @@ -351,10 +356,6 @@ func (l *LimitersMgr) runSyncNewLimit() { if (qpsLimit > 0 && qpsBurst > 0) || (rateLimit > 0 && rateBurst > 0) { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limit is set"); logEnv != nil { - logEnv.Write(zap.String("limiter name", limiterName), zap.Int64("qps limit", qpsLimit), zap.Int("qps burst", qpsBurst), - zap.Int64("rate limit", rateLimit), zap.Int("rate burst", rateBurst)) - } if commandLimiter == nil { newCl := NewCommandLimiter(l.localIp, limiterName, qpsLimit, qpsBurst, rateLimit, rateBurst, l.conf.InitialPercent) l.limiters.Store(limiterName, newCl) @@ -382,80 +383,27 @@ func NewCommandLimiter(localIp string, limiterName string, qpsLimit int64, qpsBu if initialPercent <= 0 { return nil } - var qpsl, ratel *rate.Limiter - if qpsLimit > 0 && qpsBurst > 0 { - localQpsLimit := float64(qpsLimit) * initialPercent - qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), qpsBurst) - } - if rateLimit > 0 && rateBurst > 0 { - localRateLimit := float64(rateLimit) * initialPercent - ratel = rate.NewLimiter(rate.Limit(localRateLimit), rateBurst) - } cl := &CommandLimiter{ - limiterName: limiterName, - localIp: localIp, - qpsl: qpsl, - ratel: ratel, - globalQpsLimit: qpsLimit, - globalRateLimit: rateLimit, - localPercent: initialPercent, - weight: MAXIMUM_WEIGHT, - skipBalance: true, - lastTime: time.Now(), - } + limiterName: limiterName, + localIp: localIp, + qpsLw: LimiterWrapper{localPercent: initialPercent, limiterName: limiterName + "-qps"}, + rateLw: LimiterWrapper{localPercent: initialPercent, limiterName: limiterName + "-rate"}, + weight: MAXIMUM_WEIGHT, + skipBalance: true, + lastTime: time.Now(), + } + cl.qpsLw.updateLimit(qpsLimit, qpsBurst) + cl.rateLw.updateLimit(rateLimit, rateBurst) return cl } func (cl *CommandLimiter) updateLimit(newQpsLimit int64, newQpsBurst int, newRateLimit int64, newRateBurst int) { - cl.lock.Lock() - defer cl.lock.Unlock() - - ////when limit is changed, the qps can't be used to balanceLimit - var qpsBurst, rateBurst int - if cl.qpsl != nil { - qpsBurst = cl.qpsl.Burst() + qpsLimitChanged := cl.qpsLw.updateLimit(newQpsLimit, newQpsBurst) + rateLimitChanged := cl.rateLw.updateLimit(newRateLimit, newRateBurst) + if qpsLimitChanged || rateLimitChanged { + ////when limit is changed, the qps can't be used to balanceLimit + cl.setSkipBalance(true) } - if cl.ratel != nil { - rateBurst = cl.ratel.Burst() - } - if cl.globalQpsLimit != newQpsLimit || qpsBurst != newQpsBurst || cl.globalRateLimit != newRateLimit || rateBurst != newRateBurst { - zap.L().Info("limit changed", zap.Int64("globalQpsLimit", cl.globalQpsLimit), zap.Int64("newQpsLimit", newQpsLimit), zap.Int("qpsBurst", qpsBurst), zap.Int("newQpsBurst", newQpsBurst), - zap.Int64("globalRateLimit", cl.globalRateLimit), zap.Int64("newRateLimit", newRateLimit), zap.Int("rateBurst", rateBurst), zap.Int("newRateBurst", newRateBurst)) - cl.skipBalance = true - } - - if newQpsLimit > 0 && newQpsBurst > 0 { - localQpsLimit := float64(newQpsLimit) * cl.localPercent - if cl.qpsl != nil { - if cl.qpsl.Burst() != newQpsBurst { - cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) - } else if cl.globalQpsLimit != newQpsLimit { - 
cl.qpsl.SetLimit(rate.Limit(localQpsLimit)) - } - } else { - cl.qpsl = rate.NewLimiter(rate.Limit(localQpsLimit), newQpsBurst) - } - } else { - cl.qpsl = nil - } - - if newRateLimit > 0 && newRateBurst > 0 { - localRateLimit := float64(newRateLimit) * cl.localPercent - if cl.ratel != nil { - if cl.ratel.Burst() != newRateBurst { - cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) - } else if cl.globalRateLimit != newRateLimit { - cl.ratel.SetLimit(rate.Limit(localRateLimit)) - } - } else { - cl.ratel = rate.NewLimiter(rate.Limit(localRateLimit), newRateBurst) - } - } else { - cl.ratel = nil - } - - cl.globalQpsLimit = newQpsLimit - cl.globalRateLimit = newRateLimit } func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) float64 { @@ -483,14 +431,12 @@ func (cl *CommandLimiter) reportLocalStat(globalBalancePeriod time.Duration) flo func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, titanStatusLifetime time.Duration, devideUsage float64, multiplyUsage float64, weightChangeFactor float64) { - cl.lock.Lock() - defer cl.lock.Unlock() - - if cl.qpsl == nil { + qpsGlobalLimit := float64(cl.qpsLw.getLimit()) + if qpsGlobalLimit <= 0 { return } - if cl.skipBalance { - cl.skipBalance = false + if cl.getSkipBalance() { + cl.setSkipBalance(false) return } @@ -511,13 +457,12 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita totalWeight += weights[i] } - originalLimit := float64(cl.qpsl.Limit()) / cl.localPercent - selfLimitInTarget := originalLimit * (cl.weight / totalWeight) + selfLimitInTarget := qpsGlobalLimit * (cl.weight / totalWeight) if averageQps < selfLimitInTarget*devideUsage { otherHaveHigh := false otherAllLow := true for i := range qpss { - otherLimitInTarget := originalLimit * (weights[i] / totalWeight) + otherLimitInTarget := qpsGlobalLimit * (weights[i] / totalWeight) if qpss[i] >= otherLimitInTarget*multiplyUsage { otherHaveHigh = true otherAllLow = false @@ -569,7 +514,8 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita } zap.L().Info("[Limit] balance limit", zap.String("limiterName", cl.limiterName), zap.Float64("qps", averageQps), zap.Float64("newWeight", cl.weight), zap.Float64("newPercent", newPercent)) - cl.updateLimitPercent(newPercent) + cl.qpsLw.updatePercent(newPercent) + cl.rateLw.updatePercent(newPercent) } func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transaction, titanStatusLifetime time.Duration) ([]float64, []float64, error) { @@ -638,63 +584,115 @@ func (cl *CommandLimiter) scanStatusInOtherTitan(limitDatadb *DB, txn *Transacti return weights, qpss, nil } -func (cl *CommandLimiter) updateLimitPercent(newPercent float64) { - if cl.localPercent != newPercent && cl.localPercent > 0 && newPercent > 0 { - if cl.qpsl != nil { - qpsLimit := float64(cl.globalQpsLimit) * newPercent - zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("qps limit", qpsLimit), zap.Int("burst", cl.qpsl.Burst())) - cl.qpsl.SetLimit(rate.Limit(qpsLimit)) - } - if cl.ratel != nil { - rateLimit := float64(cl.globalRateLimit) * newPercent - zap.L().Info("percent changed", zap.String("limiterName", cl.limiterName), zap.Float64("rate limit", rateLimit), zap.Int("burst", cl.ratel.Burst())) - cl.ratel.SetLimit(rate.Limit(rateLimit)) - } - cl.localPercent = newPercent +func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { + d := cl.qpsLw.waitTime(1) + time.Sleep(d) + + cmdSize := len(cmdName) + 
for i := range cmdArgs { + cmdSize += len(cmdArgs[i]) + 1 + } + d = cl.rateLw.waitTime(cmdSize) + time.Sleep(d) + + cl.lock.Lock() + defer cl.lock.Unlock() + cl.totalCommandsCount++ + cl.totalCommandsSize += int64(cmdSize) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { + logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), + zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) } } -func (cl *CommandLimiter) checkLimit(cmdName string, cmdArgs []string) { +func (cl *CommandLimiter) setSkipBalance(skipBalance bool) { cl.lock.Lock() defer cl.lock.Unlock() + cl.skipBalance = skipBalance +} - if cl.qpsl != nil { - r := cl.qpsl.Reserve() - if !r.OK() { - zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("qps limiter burst", cl.qpsl.Burst())) - } else { - d := r.Delay() - if d > 0 { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger qps limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Int64("sleep us", int64(d/time.Microsecond))) - } - time.Sleep(d) +func (cl *CommandLimiter) getSkipBalance() bool { + cl.lock.Lock() + defer cl.lock.Unlock() + return cl.skipBalance +} + +func (lw *LimiterWrapper) updateLimit(newLimit int64, newBurst int) bool { + lw.lock.Lock() + defer lw.lock.Unlock() + + var burst int + if lw.limiter != nil { + burst = lw.limiter.Burst() + } + + changed := false + if lw.globalLimit != newLimit || burst != newBurst { + zap.L().Info("limit changed", zap.String("limiterName", lw.limiterName), + zap.Int64("globalLimit", lw.globalLimit), zap.Int64("newGlobalLimit", newLimit), + zap.Int("burst", burst), zap.Int("newBurst", newBurst)) + changed = true + } + + if newLimit > 0 && newBurst > 0 { + localLimit := float64(newLimit) * lw.localPercent + if lw.limiter != nil { + if lw.limiter.Burst() != newBurst { + lw.limiter = rate.NewLimiter(rate.Limit(localLimit), newBurst) + } else if lw.globalLimit != newLimit { + lw.limiter.SetLimit(rate.Limit(localLimit)) } + } else { + lw.limiter = rate.NewLimiter(rate.Limit(localLimit), newBurst) } + } else { + lw.limiter = nil } - cmdSize := len(cmdName) - for i := range cmdArgs { - cmdSize += len(cmdArgs[i]) + 1 + lw.globalLimit = newLimit + return changed +} + +func (lw *LimiterWrapper) getLimit() int64 { + lw.lock.Lock() + defer lw.lock.Unlock() + + return lw.globalLimit +} + +func (lw *LimiterWrapper) updatePercent(newPercent float64) { + lw.lock.Lock() + defer lw.lock.Unlock() + + if lw.localPercent != newPercent && lw.localPercent > 0 && newPercent > 0 { + if lw.limiter != nil { + limit := float64(lw.globalLimit) * newPercent + zap.L().Info("percent changed", zap.String("limiterName", lw.limiterName), + zap.Float64("limit", limit), zap.Int("burst", lw.limiter.Burst())) + lw.limiter.SetLimit(rate.Limit(limit)) + } + + lw.localPercent = newPercent } - if cl.ratel != nil { - r := cl.ratel.ReserveN(time.Now(), cmdSize) +} + +func (lw *LimiterWrapper) waitTime(eventsNum int) time.Duration { + lw.lock.Lock() + defer lw.lock.Unlock() + + var d time.Duration + if lw.limiter != nil { + r := lw.limiter.ReserveN(time.Now(), eventsNum) if !r.OK() { - zap.L().Error("[Limit] request events num exceed limiter burst", zap.Int("rate limiter burst", cl.ratel.Burst()), zap.Int("command size", cmdSize)) + zap.L().Error("[Limit] request events num exceed limiter burst", zap.String("limiter name", lw.limiterName), zap.Int("burst", lw.limiter.Burst()), zap.Int("request 
events num", eventsNum)) } else { - d := r.Delay() + d = r.Delay() if d > 0 { - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger rate limit"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Strings("args", cmdArgs), zap.Int64("sleep us", int64(d/time.Microsecond))) + if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] trigger limit"); logEnv != nil { + logEnv.Write(zap.String("limiter name", lw.limiterName), zap.Int("request events num", eventsNum), zap.Int64("sleep us", int64(d/time.Microsecond))) } - time.Sleep(d) } } } - - cl.totalCommandsCount++ - cl.totalCommandsSize += int64(cmdSize) - if logEnv := zap.L().Check(zap.DebugLevel, "[Limit] limiter state"); logEnv != nil { - logEnv.Write(zap.String("limiter name", cl.limiterName), zap.Time("last time", cl.lastTime), zap.Int64("command count", cl.totalCommandsCount), zap.Int64("command size", cl.totalCommandsSize)) - } + return d } From b034581c518a08c6dc0e41084f51ce63e722dc51 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sat, 7 Dec 2019 11:40:42 +0800 Subject: [PATCH 38/60] fix balance limit's bug --- db/limitersMgr.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/db/limitersMgr.go b/db/limitersMgr.go index ce26bf39..927a8dab 100644 --- a/db/limitersMgr.go +++ b/db/limitersMgr.go @@ -467,10 +467,8 @@ func (cl *CommandLimiter) balanceLimit(averageQps float64, limitDatadb *DB, tita otherHaveHigh = true otherAllLow = false break - } - if qpss[i] >= otherLimitInTarget*devideUsage { + } else if qpss[i] >= otherLimitInTarget*devideUsage { otherAllLow = false - break } } if otherHaveHigh { From 3b71e69d45465a07b0318f5cfbae754e4d861302 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 15:42:14 +0800 Subject: [PATCH 39/60] 1 in Exec(), add transaction begin, cmd args num, cmd func, reply func metrics 2 cmd args num metrics include the key, which doesn't before 3 in AutoCommit, do adding transaction begin, cmd func, reply func metrics before log.debug() --- command/command.go | 10 ++++------ command/transactions.go | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/command/command.go b/command/command.go index 5f7b5ce4..2f6e418e 100644 --- a/command/command.go +++ b/command/command.go @@ -197,13 +197,11 @@ func AutoCommit(cmd TxnCommand) Command { key := "" if len(ctx.Args) > 0 { key = ctx.Args[0] - if len(ctx.Args) > 1 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args) - 1)) - } + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) } cost := time.Since(start).Seconds() - zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, "ERR "+err.Error()) @@ -218,8 +216,8 @@ func AutoCommit(cmd TxnCommand) Command { start = time.Now() onCommit, err := cmd(ctx, txn) cost = time.Since(start).Seconds() - zap.L().Debug("command done", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, 
ctx.Name).Observe(cost) + zap.L().Debug("command done", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() resp.ReplyError(ctx.Out, err.Error()) @@ -267,8 +265,8 @@ func AutoCommit(cmd TxnCommand) Command { onCommit() } cost = time.Since(start).Seconds() - zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mt.ReplyFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.String("key", key), zap.Int64("cost(us)", int64(cost*1000000))) mtFunc() return nil }) diff --git a/command/transactions.go b/command/transactions.go index f2d63713..89b55c39 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -38,9 +38,15 @@ func Exec(ctx *Context) { var outputs []*bytes.Buffer var onCommits []OnCommit err = retry.Ensure(ctx, func() error { + mt := metrics.GetMetrics() if !watching { + start := time.Now() txn, err = ctx.Client.DB.Begin() + cost := time.Since(start).Seconds() + mt.TxnBeginHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("transation begin", zap.String("name", ctx.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { + mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Inc() zap.L().Error("begin txn failed", zap.Int64("clientid", ctx.Client.ID), zap.String("command", ctx.Name), @@ -63,11 +69,16 @@ func Exec(ctx *Context) { Out: out, Context: ctx.Context, } + if len(ctx.Args) > 0 { + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) + } name := strings.ToLower(cmd.Name) if _, ok := txnCommands[name]; ok { start := time.Now() onCommit, err = TxnCall(subCtx, txn) - zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) + cost := time.Since(start).Seconds() + mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { resp.ReplyError(out, err.Error()) } @@ -79,7 +90,6 @@ func Exec(ctx *Context) { commandCount++ } start := time.Now() - mt := metrics.GetMetrics() mt.MultiCommandHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(commandCount)) defer func() { cost := time.Since(start).Seconds() @@ -123,7 +133,7 @@ func Exec(ctx *Context) { return } - + start := time.Now() resp.ReplyArray(ctx.Out, size) // run OnCommit that fill reply to outputs for i := range onCommits { @@ -141,6 +151,9 @@ func Exec(ctx *Context) { break } } + cost := time.Since(start).Seconds() + metrics.GetMetrics().ReplyFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) + zap.L().Debug("onCommit ", zap.String("name", ctx.Name), zap.Int64("cost(us)", int64(cost*1000000))) } // Watch starts a transaction, watch is a global transaction and is not key associated(this is different from redis) From 0fb99b72051163a53efc2bdcc86804678826ca9d Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 16:45:26 +0800 Subject: [PATCH 40/60] simplify the msg recv/send log in debug level --- client.go | 4 ++-- command/command.go | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/client.go 
b/client.go index df8b2540..87f899d3 100644 --- a/client.go +++ b/client.go @@ -130,7 +130,6 @@ func (c *client) serve(conn net.Conn) error { } ctx.Context = context.New(c.cliCtx, c.server.servCtx) - zap.L().Debug("recv msg", zap.String("command", ctx.Name), zap.Strings("arguments", ctx.Args)) // Skip reply if necessary if c.cliCtx.SkipN != 0 { @@ -143,7 +142,8 @@ func (c *client) serve(conn net.Conn) error { env.Write(zap.String("addr", c.cliCtx.RemoteAddr), zap.Int64("clientid", c.cliCtx.ID), zap.String("traceid", ctx.TraceID), - zap.String("command", ctx.Name)) + zap.String("command", ctx.Name), + zap.Strings("arguments", ctx.Args)) } c.exec.Execute(ctx) diff --git a/command/command.go b/command/command.go index 2f6e418e..68f14158 100644 --- a/command/command.go +++ b/command/command.go @@ -61,12 +61,9 @@ func Integer(w io.Writer, v int64) OnCommit { // BytesArray replies a [][]byte when commit func BytesArray(w io.Writer, a [][]byte) OnCommit { return func() { - start := time.Now() if _, err := resp.ReplyArray(w, len(a)); err != nil { return } - zap.L().Debug("reply array size", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) - start = time.Now() for i := range a { if a[i] == nil { if err := resp.ReplyNullBulkString(w); err != nil { @@ -77,10 +74,6 @@ func BytesArray(w io.Writer, a [][]byte) OnCommit { if err := resp.ReplyBulkString(w, string(a[i])); err != nil { return } - if i%10 == 9 { - zap.L().Debug("reply 10 bulk string", zap.Int64("cost(us)", time.Since(start).Nanoseconds()/1000)) - start = time.Now() - } } } } From 37c1c6815be07126637b2cfdbe851d3beba8f0d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 11 Dec 2019 18:03:29 +0800 Subject: [PATCH 41/60] fix bug of setting cmd arg num metric in Exec() --- command/transactions.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/command/transactions.go b/command/transactions.go index 89b55c39..aa3f610a 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -69,16 +69,16 @@ func Exec(ctx *Context) { Out: out, Context: ctx.Context, } - if len(ctx.Args) > 0 { - mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(float64(len(ctx.Args))) + if len(cmd.Args) > 0 { + mt.CommandArgsNumHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(float64(len(cmd.Args))) } name := strings.ToLower(cmd.Name) if _, ok := txnCommands[name]; ok { start := time.Now() onCommit, err = TxnCall(subCtx, txn) cost := time.Since(start).Seconds() - mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, ctx.Name).Observe(cost) - zap.L().Debug("execute", zap.String("command", subCtx.Name), zap.Int64("cost(us)", int64(cost*1000000))) + mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(cost) + zap.L().Debug("execute", zap.String("command", cmd.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { resp.ReplyError(out, err.Error()) } From 9de2ae953d384b2a27032fd4fd5aa5e2a00759cd Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 12 Dec 2019 18:51:20 +0800 Subject: [PATCH 42/60] add expire/gc round/seek/commit cost metrics --- db/expire.go | 22 ++++++++++++++++-- db/gc.go | 17 +++++++++++++- metrics/prometheus.go | 52 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/db/expire.go b/db/expire.go index 36acc2b6..de6109e2 100644 --- a/db/expire.go +++ b/db/expire.go @@ -21,6 +21,10 @@ var ( expireMetakeyOffset = expireTimestampOffset + 8 
/*sizeof(int64)*/ + len(":") ) +const ( + expire_worker = "expire" +) + // IsExpired judge object expire through now func IsExpired(obj *Object, now int64) bool { if obj.ExpireAt == 0 || obj.ExpireAt > now { @@ -88,6 +92,8 @@ func StartExpire(db *DB, conf *conf.Expire) error { if conf.Disable { continue } + + start := time.Now() isLeader, err := isLeader(db, sysExpireLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[Expire] check expire leader failed", zap.Error(err)) @@ -102,6 +108,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { continue } runExpire(db, conf.BatchLimit) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) } return nil } @@ -142,7 +149,9 @@ func runExpire(db *DB, batchLimit int) { return } endPrefix := kv.Key(expireKeyPrefix).PrefixNext() + start := time.Now() iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) txn.Rollback() @@ -180,7 +189,13 @@ func runExpire(db *DB, batchLimit int) { logEnv.Write(zap.ByteString("mkey", mkey)) } - if err := iter.Next(); err != nil { + start = time.Now() + err := iter.Next() + cost := time.Since(start) + if cost >= time.Millisecond { + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) + } + if err != nil { zap.L().Error("[Expire] next failed", zap.ByteString("mkey", mkey), zap.Error(err)) @@ -200,7 +215,10 @@ func runExpire(db *DB, batchLimit int) { metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) } - if err := txn.Commit(context.Background()); err != nil { + start = time.Now() + err = txn.Commit(context.Background()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if err != nil { txn.Rollback() zap.L().Error("[Expire] commit failed", zap.Error(err)) } diff --git a/db/gc.go b/db/gc.go index 29fd81ea..e1aa8f96 100644 --- a/db/gc.go +++ b/db/gc.go @@ -15,6 +15,10 @@ var ( sysGCLeader = []byte("$sys:0:GCL:GCLeader") ) +const ( + gc_worker = "gc" +) + func toTikvGCKey(key []byte) []byte { b := []byte{} b = append(b, sysNamespace...) 
@@ -45,7 +49,9 @@ func gcDeleteRange(txn store.Transaction, prefix []byte, limit int) (int, error) count int ) endPrefix := kv.Key(prefix).PrefixNext() + start := time.Now() itr, err := txn.Iter(prefix, endPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) if err != nil { return count, err } @@ -83,7 +89,9 @@ func doGC(db *DB, limit int) error { txn := dbTxn.t store.SetOption(txn, store.KeyOnly, true) + start := time.Now() itr, err := txn.Iter(gcPrefix, endGCPrefix) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) if err != nil { return err } @@ -133,7 +141,11 @@ func doGC(db *DB, limit int) error { txn.Rollback() return resultErr } - if err := txn.Commit(context.Background()); err != nil { + + start = time.Now() + err = txn.Commit(context.Background()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) + if err != nil { txn.Rollback() return err } @@ -158,6 +170,8 @@ func StartGC(db *DB, conf *conf.GC) { if conf.Disable { continue } + + start := time.Now() isLeader, err := isLeader(db, sysGCLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[GC] check GC leader failed", @@ -183,5 +197,6 @@ func StartGC(db *DB, conf *conf.GC) { zap.Error(err)) continue } + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(gc_worker).Observe(time.Since(start).Seconds()) } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index b01639ac..4213a7e7 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -23,6 +23,7 @@ const ( expire = "expire" tikvGC = "tikvgc" titanip = "titanip" + worker = "worker" ) var ( @@ -36,6 +37,7 @@ var ( expireLabel = []string{expire} tikvGCLabel = []string{tikvGC} limitLabel = []string{biz, command, titanip} + workerLabel = []string{worker} // global prometheus object gm *Metrics @@ -62,17 +64,20 @@ type Metrics struct { TikvGCTotal *prometheus.CounterVec //command biz - CommandCallHistogramVec *prometheus.HistogramVec - LimitCostHistogramVec *prometheus.HistogramVec - TxnBeginHistogramVec *prometheus.HistogramVec - CommandFuncDoneHistogramVec *prometheus.HistogramVec - TxnCommitHistogramVec *prometheus.HistogramVec - ReplyFuncDoneHistogramVec *prometheus.HistogramVec - CommandArgsNumHistogramVec *prometheus.HistogramVec - TxnRetriesCounterVec *prometheus.CounterVec - TxnConflictsCounterVec *prometheus.CounterVec - TxnFailuresCounterVec *prometheus.CounterVec - MultiCommandHistogramVec *prometheus.HistogramVec + CommandCallHistogramVec *prometheus.HistogramVec + LimitCostHistogramVec *prometheus.HistogramVec + TxnBeginHistogramVec *prometheus.HistogramVec + CommandFuncDoneHistogramVec *prometheus.HistogramVec + TxnCommitHistogramVec *prometheus.HistogramVec + ReplyFuncDoneHistogramVec *prometheus.HistogramVec + CommandArgsNumHistogramVec *prometheus.HistogramVec + TxnRetriesCounterVec *prometheus.CounterVec + TxnConflictsCounterVec *prometheus.CounterVec + TxnFailuresCounterVec *prometheus.CounterVec + MultiCommandHistogramVec *prometheus.HistogramVec + WorkerRoundCostHistogramVec *prometheus.HistogramVec + WorkerSeekCostHistogramVec *prometheus.HistogramVec + WorkerCommitCostHistogramVec *prometheus.HistogramVec //logger LogMetricsCounterVec *prometheus.CounterVec @@ -268,6 +273,31 @@ func init() { []string{labelName}, ) prometheus.MustRegister(gm.LogMetricsCounterVec) + + gm.WorkerRoundCostHistogramVec = prometheus.NewHistogramVec( + 
prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_round_cost",
+			Help:      "the cost of a round expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerRoundCostHistogramVec)
+
+	gm.WorkerSeekCostHistogramVec = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_seek_cost",
+			Help:      "the cost of tikv seek in expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerSeekCostHistogramVec)
+
+	gm.WorkerCommitCostHistogramVec = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Name:      "worker_commit_cost",
+			Help:      "the cost of commit in expire/gc worker",
+		}, workerLabel)
+	prometheus.MustRegister(gm.WorkerCommitCostHistogramVec)
+
 	RegisterSDKMetrics()
 }
From 6c2328d0b27e92e409bf8a5041751aa5676e0286 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Sun, 22 Dec 2019 12:38:49 +0800
Subject: [PATCH 43/60] prevent the connection count from growing endlessly:
 when tikv commands cost too much time, clients reconnect before the previous
 requests finish

---
 bin/titan/main.go  | 10 ++++++----
 conf/config.go     | 13 +++++++------
 context/context.go | 20 ++++++++++++--------
 titan.go           | 22 ++++++++++++++++++++++
 4 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/bin/titan/main.go b/bin/titan/main.go
index 8c3cff8c..215078a3 100644
--- a/bin/titan/main.go
+++ b/bin/titan/main.go
@@ -74,10 +74,12 @@ func main() {
 		os.Exit(1)
 	}
 	serv := titan.New(&context.ServerContext{
-		RequirePass:      config.Server.Auth,
-		Store:            store,
-		ListZipThreshold: config.Server.ListZipThreshold,
-		LimitersMgr:      limitersMgr,
+		RequirePass:       config.Server.Auth,
+		Store:             store,
+		ListZipThreshold:  config.Server.ListZipThreshold,
+		LimitersMgr:       limitersMgr,
+		MaxConnection:     config.Server.MaxConnection,
+		MaxConnectionWait: config.Server.MaxConnectionWait,
 	})
 
 	var servOpts, statusOpts []continuous.ServerOption
diff --git a/conf/config.go b/conf/config.go
index f5eff6dc..784de138 100644
--- a/conf/config.go
+++ b/conf/config.go
@@ -26,12 +26,13 @@ type Hash struct {
 
 // Server config is the config of titan server
 type Server struct {
-	Auth             string `cfg:"auth;;;client connetion auth"`
-	Listen           string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"`
-	SSLCertFile      string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"`
-	SSLKeyFile       string `cfg:"ssl-key-file;;;server SSL key file"`
-	MaxConnection    int64  `cfg:"max-connection;1000;numeric;client connection count"`
-	ListZipThreshold int    `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"`
+	Auth              string `cfg:"auth;;;client connetion auth"`
+	Listen            string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"`
+	SSLCertFile       string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"`
+	SSLKeyFile        string `cfg:"ssl-key-file;;;server SSL key file"`
+	MaxConnection     int64  `cfg:"max-connection;500;numeric;client connection count"`
+	ListZipThreshold  int    `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"`
+	MaxConnectionWait int64  `cfg:"max-connection-wait;1000;numeric;wait ms before close connection when exceed max connection"`
 }
 
 // Tikv config is the config of tikv sdk
diff --git a/context/context.go b/context/context.go
index 22880a74..78e7aa63 100644
--- a/context/context.go
+++ b/context/context.go
@@ -73,14 +73,18 @@ func NewClientContext(id int64, conn net.Conn) *ClientContext {
 
 // ServerContext is the runtime context of the server
 type ServerContext struct {
-	RequirePass      string
-	Store            *db.RedisStore
-	Monitors         sync.Map
-	Clients          sync.Map
-	LimitersMgr      *db.LimitersMgr
-	Pause            time.Duration // elapse to pause all clients
-	StartAt          time.Time
-	ListZipThreshold int
+	RequirePass       string
+	Store             *db.RedisStore
+	Monitors          sync.Map
+	Clients           sync.Map
+	LimitersMgr       *db.LimitersMgr
+	Pause             time.Duration // elapse to pause all clients
+	StartAt           time.Time
+	ListZipThreshold  int
+	MaxConnection     int64
+	MaxConnectionWait int64
+	ClientsNum        int64
+	Lock              sync.Mutex
 }
 
 // Context combines the client and server context
diff --git a/titan.go b/titan.go
index 198c2924..726b96d9 100644
--- a/titan.go
+++ b/titan.go
@@ -36,8 +36,27 @@ func (s *Server) Serve(lis net.Listener) error {
 		}
 
 		cliCtx := context.NewClientContext(s.idgen(), conn)
+		connectExceed := false
+		s.servCtx.Lock.Lock()
+		if s.servCtx.ClientsNum >= s.servCtx.MaxConnection {
+			connectExceed = true
+		}
+		s.servCtx.Lock.Unlock()
+		if connectExceed {
+			zap.L().Warn("max connection exceed, will close after some time",
+				zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait),
+				zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
+			go func() {
+				time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond)
+				zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
+				conn.Close()
+			}()
+		}
 		cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0)
 		s.servCtx.Clients.Store(cliCtx.ID, cliCtx)
+		s.servCtx.Lock.Lock()
+		s.servCtx.ClientsNum++
+		s.servCtx.Lock.Unlock()
 
 		cli := newClient(cliCtx, s, command.NewExecutor())
 
@@ -52,6 +71,9 @@ func (s *Server) Serve(lis net.Listener) error {
 			}
 			metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec()
 			s.servCtx.Clients.Delete(cli.cliCtx.ID)
+			s.servCtx.Lock.Lock()
+			s.servCtx.ClientsNum--
+			s.servCtx.Lock.Unlock()
 		}(cli, conn)
 	}
 }
From bbfce19dd5cf423acfba3bf253a0ad111ff89381 Mon Sep 17 00:00:00 2001
From: lugang
Date: Wed, 25 Dec 2019 14:42:01 +0800
Subject: [PATCH 44/60] fix bug of connection limit

---
 titan.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/titan.go b/titan.go
index 726b96d9..840872d4 100644
--- a/titan.go
+++ b/titan.go
@@ -51,6 +51,7 @@ func (s *Server) Serve(lis net.Listener) error {
 				zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID))
 				conn.Close()
 			}()
+			continue
 		}
 		cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0)
 		s.servCtx.Clients.Store(cliCtx.ID, cliCtx)
From 93cd7df384ca5fe46de2b7a3c8eb9baa2ffcc098 Mon Sep 17 00:00:00 2001
From: xufangping
Date: Tue, 18 Feb 2020 18:07:55 +0800
Subject: [PATCH 45/60] 1 widen metrics bucket ranges to fix the problem that
 costs exceed the max bucket value 2 add a config item to enable/disable the
 max-connection limit 3 add txn failure metrics in the multi/exec scenario

---
 bin/titan/main.go       | 1 +
 command/transactions.go | 1 +
 conf/config.go          | 1 +
 context/context.go      | 1 +
 metrics/prometheus.go   | 4 ++--
 titan.go                | 8 +++++---
 6 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/bin/titan/main.go b/bin/titan/main.go
index 215078a3..70a47e5c 100644
--- a/bin/titan/main.go
+++ b/bin/titan/main.go
@@ -78,6 +78,7 @@ func main() {
 		Store:             store,
 		ListZipThreshold:  config.Server.ListZipThreshold,
 		LimitersMgr:       limitersMgr,
+		LimitConnection:   config.Server.LimitConnection,
 		MaxConnection:     config.Server.MaxConnection,
 		MaxConnectionWait: config.Server.MaxConnectionWait,
 	})
diff --git a/command/transactions.go b/command/transactions.go index
aa3f610a..d78476c0 100644 --- a/command/transactions.go +++ b/command/transactions.go @@ -80,6 +80,7 @@ func Exec(ctx *Context) { mt.CommandFuncDoneHistogramVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Observe(cost) zap.L().Debug("execute", zap.String("command", cmd.Name), zap.Int64("cost(us)", int64(cost*1000000))) if err != nil { + mt.TxnFailuresCounterVec.WithLabelValues(ctx.Client.Namespace, cmd.Name).Inc() resp.ReplyError(out, err.Error()) } } else { diff --git a/conf/config.go b/conf/config.go index 784de138..6d88c2ac 100644 --- a/conf/config.go +++ b/conf/config.go @@ -30,6 +30,7 @@ type Server struct { Listen string `cfg:"listen; 0.0.0.0:7369; netaddr; address to listen"` SSLCertFile string `cfg:"ssl-cert-file;;;server SSL certificate file (enables SSL support)"` SSLKeyFile string `cfg:"ssl-key-file;;;server SSL key file"` + LimitConnection bool `cfg:"limit-connection; false; boolean; limit max connection num when it's true"` MaxConnection int64 `cfg:"max-connection;500;numeric;client connection count"` ListZipThreshold int `cfg:"list-zip-threshold;100;numeric;the max limit length of elements in list"` MaxConnectionWait int64 `cfg:"max-connection-wait;1000;numeric;wait ms before close connection when exceed max connection"` diff --git a/context/context.go b/context/context.go index 78e7aa63..2aca5916 100644 --- a/context/context.go +++ b/context/context.go @@ -81,6 +81,7 @@ type ServerContext struct { Pause time.Duration // elapse to pause all clients StartAt time.Time ListZipThreshold int + LimitConnection bool MaxConnection int64 MaxConnectionWait int64 ClientsNum int64 diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 4213a7e7..5e72b959 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -91,7 +91,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), Help: "The cost times of command call", }, multiLabel) prometheus.MustRegister(gm.CommandCallHistogramVec) @@ -143,7 +143,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), + Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) diff --git a/titan.go b/titan.go index 840872d4..3084c294 100644 --- a/titan.go +++ b/titan.go @@ -37,9 +37,11 @@ func (s *Server) Serve(lis net.Listener) error { cliCtx := context.NewClientContext(s.idgen(), conn) connectExceed := false - s.servCtx.Lock.Lock() - if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { - connectExceed = true + if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { + connectExceed = true + } } s.servCtx.Lock.Unlock() if connectExceed { From ad7cff1b71ccd0d4b056795fbae7cedcc501b31c Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 17:35:14 +0800 Subject: [PATCH 46/60] 1 add example for limit-connection and max-connection-wait in titan.toml --- conf/titan.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/conf/titan.toml b/conf/titan.toml index aada95e2..0c5375dd 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -23,12 +23,24 @@ ssl-cert-file = "" #description: server SSL key file ssl-key-file = "" +#type: bool +#rules: boolean +#description: limit max connection num when 
it's true +#default: false +#limit-connection = false + #type: int64 #rules: numeric #description: client connection count #default: 1000 #max-connection = 1000 +#type: int64 +#rules: numeric +#description: wait ms before close connection when exceed max connection +#default: 1000 +#max-connection-wait = 1000 + #type: int #rules: numeric #description: the max limit length of elements in list From aeca4b3a5ecc2be8ffc88c73f0972e1c2da5654d Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 17:57:07 +0800 Subject: [PATCH 47/60] 1 fix unlock bug in limit-connection --- titan.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/titan.go b/titan.go index 3084c294..4135908d 100644 --- a/titan.go +++ b/titan.go @@ -42,8 +42,8 @@ func (s *Server) Serve(lis net.Listener) error { if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { connectExceed = true } + s.servCtx.Lock.Unlock() } - s.servCtx.Lock.Unlock() if connectExceed { zap.L().Warn("max connection exceed, will close after some time", zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), From bb617b060c50005f6c3ca6af3669578a48fefddb Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 19 Feb 2020 18:29:05 +0800 Subject: [PATCH 48/60] don't lock s.serverctx.lock when limit-connection disabled --- titan.go | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/titan.go b/titan.go index 4135908d..3d1bc79c 100644 --- a/titan.go +++ b/titan.go @@ -36,30 +36,32 @@ func (s *Server) Serve(lis net.Listener) error { } cliCtx := context.NewClientContext(s.idgen(), conn) - connectExceed := false if s.servCtx.LimitConnection { + connectExceed := false s.servCtx.Lock.Lock() if s.servCtx.ClientsNum >= s.servCtx.MaxConnection { connectExceed = true } s.servCtx.Lock.Unlock() - } - if connectExceed { - zap.L().Warn("max connection exceed, will close after some time", - zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), - zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) - go func() { - time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond) - zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) - conn.Close() - }() - continue + if connectExceed { + zap.L().Warn("max connection exceed, will close after some time", + zap.Int64("max connection num", s.servCtx.MaxConnection), zap.Int64("wait ms", s.servCtx.MaxConnectionWait), + zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) + go func() { + time.Sleep(time.Duration(s.servCtx.MaxConnectionWait) * time.Millisecond) + zap.L().Warn("close connection for max connection exceed", zap.String("addr", cliCtx.RemoteAddr), zap.Int64("clientid", cliCtx.ID)) + conn.Close() + }() + continue + } } cliCtx.DB = s.servCtx.Store.DB(cliCtx.Namespace, 0) s.servCtx.Clients.Store(cliCtx.ID, cliCtx) - s.servCtx.Lock.Lock() - s.servCtx.ClientsNum++ - s.servCtx.Lock.Unlock() + if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + s.servCtx.ClientsNum++ + s.servCtx.Lock.Unlock() + } cli := newClient(cliCtx, s, command.NewExecutor()) @@ -74,9 +76,11 @@ func (s *Server) Serve(lis net.Listener) error { } metrics.GetMetrics().ConnectionOnlineGaugeVec.WithLabelValues(cli.cliCtx.Namespace).Dec() s.servCtx.Clients.Delete(cli.cliCtx.ID) - s.servCtx.Lock.Lock() - s.servCtx.ClientsNum-- - s.servCtx.Lock.Unlock() + 
if s.servCtx.LimitConnection { + s.servCtx.Lock.Lock() + s.servCtx.ClientsNum-- + s.servCtx.Lock.Unlock() + } }(cli, conn) } } From 003e6b667d899f81abffaf24dbe5e849f2d0120a Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 25 Feb 2020 19:21:37 +0800 Subject: [PATCH 49/60] end expire seek at the at:{now} key to reduce the rocksdb tombstone problem --- db/expire.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/db/expire.go b/db/expire.go index de6109e2..572adf65 100644 --- a/db/expire.go +++ b/db/expire.go @@ -8,7 +8,6 @@ import ( "github.com/distributedio/titan/conf" "github.com/distributedio/titan/db/store" "github.com/distributedio/titan/metrics" - "github.com/pingcap/tidb/kv" "go.uber.org/zap" ) @@ -148,7 +147,15 @@ func runExpire(db *DB, batchLimit int) { zap.L().Error("[Expire] txn begin failed", zap.Error(err)) return } - endPrefix := kv.Key(expireKeyPrefix).PrefixNext() + + now := time.Now().UnixNano() + //Iter returns keys in [start, upperBound), so passing now+1 as the upper bound covers every key up to the "at:{now}:" prefix + //ending the seek at "at:{now+1}" instead of the whole "at:" prefix shrinks the seek range and touches as few deleted expired keys as possible + //this should reduce the expire delay (previously up to days) and the get/mget timeouts caused by the rocksdb tombstone problem + var endPrefix []byte + endPrefix = append(endPrefix, expireKeyPrefix...) + endPrefix = append(endPrefix, EncodeInt64(now+1)...) + start := time.Now() iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) @@ -158,7 +165,6 @@ func runExpire(db *DB, batchLimit int) { return } limit := batchLimit - now := time.Now().UnixNano() ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { From 4423eaffee99e592fa4dd36f4698c96c1948df71 Mon Sep 17 00:00:00 2001 From: xufangping Date: Wed, 26 Feb 2020 16:47:43 +0800 Subject: [PATCH 50/60] fix a failed case in tests/redis/unit/expire.tcl --- tests/redis/unit/expire.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/redis/unit/expire.tcl b/tests/redis/unit/expire.tcl index de24eabe..a7fe15ed 100644 --- a/tests/redis/unit/expire.tcl +++ b/tests/redis/unit/expire.tcl @@ -3,11 +3,11 @@ start_server {tags {"expire"}} { r set x foobar set v1 [r expire x 5] set v2 [r ttl x] - set v3 [r expire x 10] + set v3 [r expire x 9] set v4 [r ttl x] r expire x 2 list $v1 $v2 $v3 $v4 - } {1 [45] 1 10} + } {1 [45] 1 [89]} test {EXPIRE - It should be still possible to read 'x'} { r get x From cc15799ca087c8aa81db0900a7d441ecde7073d3 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 18:47:42 +0800 Subject: [PATCH 51/60] start expire seek from the last processed expire key to reduce the rocksdb tombstone problem --- db/expire.go | 36 +++++++++++++++++++++++++++--------- db/expire_test.go | 8 ++++---- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/db/expire.go b/db/expire.go index 572adf65..8c11eec9 100644 --- a/db/expire.go +++ b/db/expire.go @@ -87,6 +87,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { ticker := time.NewTicker(conf.Interval) defer ticker.Stop() id := UUID() + lastExpireEndTs := int64(0) for range ticker.C { if conf.Disable { continue @@ -106,7 +107,7 @@ func StartExpire(db *DB, conf *conf.Expire) error { } continue } - runExpire(db, conf.BatchLimit) + lastExpireEndTs = runExpire(db, conf.BatchLimit, lastExpireEndTs) 
metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) } return nil @@ -141,11 +142,11 @@ func toTikvScorePrefix(namespace []byte, id DBID, key []byte) []byte { return b } -func runExpire(db *DB, batchLimit int) { +func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { txn, err := db.Begin() if err != nil { zap.L().Error("[Expire] txn begin failed", zap.Error(err)) - return + return 0 } now := time.Now().UnixNano() @@ -156,16 +157,29 @@ func runExpire(db *DB, batchLimit int) { endPrefix = append(endPrefix, expireKeyPrefix...) endPrefix = append(endPrefix, EncodeInt64(now+1)...) + var startPrefix []byte + if lastExpireEndTs > 0 { + startPrefix = append(startPrefix, expireKeyPrefix...) + startPrefix = append(startPrefix, EncodeInt64(lastExpireEndTs)...) + startPrefix = append(startPrefix, ':') + } else { + startPrefix = expireKeyPrefix + } + start := time.Now() - iter, err := txn.t.Iter(expireKeyPrefix, endPrefix) + iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { + logEnv.Write(zap.ByteString("[startPrefix", startPrefix), zap.ByteString("endPrefix)", endPrefix)) + } if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) txn.Rollback() - return + return 0 } limit := batchLimit + thisExpireEndTs := int64(0) ts := now for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { rawKey := iter.Key() @@ -179,7 +193,7 @@ func runExpire(db *DB, batchLimit int) { mkey := rawKey[expireMetakeyOffset:] if err := doExpire(txn, mkey, iter.Value()); err != nil { txn.Rollback() - return + return 0 } // Remove from expire list @@ -188,11 +202,11 @@ func runExpire(db *DB, batchLimit int) { zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() - return + return 0 } if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete expire list item"); logEnv != nil { - logEnv.Write(zap.ByteString("mkey", mkey)) + logEnv.Write(zap.Int64("ts", ts), zap.ByteString("mkey", mkey)) } start = time.Now() @@ -206,8 +220,11 @@ func runExpire(db *DB, batchLimit int) { zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() - return + return 0 } + + //just use the latest processed expireKey(don't include the last expire key in the loop which is > now) as next seek's start key + thisExpireEndTs = ts limit-- } @@ -234,6 +251,7 @@ func runExpire(db *DB, batchLimit int) { } metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + return thisExpireEndTs } func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { diff --git a/db/expire_test.go b/db/expire_test.go index 5eb3abb1..2a75c850 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -111,7 +111,7 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) - runExpire(txn.db, 1) + runExpire(txn.db, 1, 0) txn.Commit(context.TODO()) txn = getTxn(t) @@ -211,7 +211,7 @@ func Test_doExpire(t *testing.T) { name: "TestExpiredNotExistsMeta", args: args{ mkey: MetaKey(txn.db, []byte("TestExpiredRewriteHash")), - id: nmateHashId, + id: nmateHashId, }, want: want{ gckey: true, @@ -222,7 +222,7 @@ func Test_doExpire(t *testing.T) { args: args{ mkey: MetaKey(txn.db, 
[]byte("TestExpiredHash_dirty_data")), id: dirtyDataHashID, - tp: byte(ObjectHash), + tp: byte(ObjectHash), }, want: want{ gckey: true, @@ -233,7 +233,7 @@ func Test_doExpire(t *testing.T) { args: args{ mkey: MetaKey(txn.db, []byte("TestExpiredRewriteHash_dirty_data")), id: rDHashId, - tp: byte(ObjectHash), + tp: byte(ObjectHash), }, want: want{ gckey: true, From d9a1bafbcc68795f563c07f616753ee4edf5d1f1 Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 19:28:14 +0800 Subject: [PATCH 52/60] fix expire log --- db/expire.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/expire.go b/db/expire.go index 8c11eec9..5f5776e8 100644 --- a/db/expire.go +++ b/db/expire.go @@ -170,7 +170,7 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { - logEnv.Write(zap.ByteString("[startPrefix", startPrefix), zap.ByteString("endPrefix)", endPrefix)) + logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } if err != nil { zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) From 83a370d03f026df3b18988bd24da51e876652b2b Mon Sep 17 00:00:00 2001 From: xufangping Date: Thu, 27 Feb 2020 20:06:06 +0800 Subject: [PATCH 53/60] also reduce expire seek range when no expire keys or all expire keys > now --- db/expire.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/expire.go b/db/expire.go index 5f5776e8..6a85e27a 100644 --- a/db/expire.go +++ b/db/expire.go @@ -227,6 +227,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { thisExpireEndTs = ts limit-- } + if limit == batchLimit { + //means: no expire keys or all expire keys > now in current loop + thisExpireEndTs = now + } now = time.Now().UnixNano() diff := (ts - now) / int64(time.Second) From d75dd34073f72166113551118d398e728d9fc15e Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 28 Feb 2020 19:50:17 +0800 Subject: [PATCH 54/60] also reduce expire seek range when no expire keys or all expire keys > now --- command/strings.go | 3 +++ db/kv.go | 7 ++++++- tools/autotest/auto.go | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/command/strings.go b/command/strings.go index 3422742c..383b0ebc 100644 --- a/command/strings.go +++ b/command/strings.go @@ -337,6 +337,9 @@ func PSetEx(ctx *Context, txn *db.Transaction) (OnCommit, error) { if err != nil { return nil, ErrInteger } + if ui <= 0 { + return nil, ErrExpireSetEx + } unit := ui * uint64(time.Millisecond) if err := s.Set([]byte(ctx.Args[2]), int64(unit)); err != nil { return nil, errors.New("ERR " + err.Error()) diff --git a/db/kv.go b/db/kv.go index 93dda37c..ecc38c3d 100644 --- a/db/kv.go +++ b/db/kv.go @@ -121,8 +121,13 @@ func (kv *Kv) ExpireAt(key []byte, at int64) error { return err } } - if at > 0 { + if at <= now { + //expire goroutine just seek forward and processed higher and higher ts expireKey, can't seek backward + //so, if expire at a ts <= now, delete it at once + return kv.txn.Destory(obj, key) + } + if err := expireAt(kv.txn.t, mkey, obj.ID, obj.Type, obj.ExpireAt, at); err != nil { return err } diff --git a/tools/autotest/auto.go b/tools/autotest/auto.go index 8d546106..8397e33e 100644 --- a/tools/autotest/auto.go +++ b/tools/autotest/auto.go @@ -298,6 
+298,20 @@ func (ac *AutoClient) KeyCase(t *testing.T) { ac.ez.ZAddEqual(t, "key-zset1", "2.0", "member1") ac.ek.DelEqual(t, 1, "key-zset1") ac.ek.ExistsEqual(t, 0, "key-zset1") + + //test negative expire + ac.ez.ZAddEqual(t, "key-zset2", "2.0", "member1") + ac.ek.ExpireEqual(t, "key-zset2", -1, 1) + ac.ek.ExistsEqual(t, 0, "key-zset2") + + ac.es.SetEqual(t, "key-set", "value") + ac.ek.ExpireEqual(t, "key-set", -1, 1) + ac.ek.ExistsEqual(t, 0, "key-set") + + ac.es.SetEqual(t, "key-set", "value") + at = time.Now().Unix() - 1 + ac.ek.ExpireAtEqual(t, "key-set", int(at), 1) + ac.ek.ExistsEqual(t, 0, "key-set") } //SystemCase check system case From 3519ff9c6936a94a6b98d7aadeacc8980c666c36 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Mar 2020 16:30:19 +0800 Subject: [PATCH 55/60] comment unsupported string functions in command list, and delete its redis test case --- command/init.go | 46 ++++++++++++++--------------- tests/redis/support/test_common.tcl | 6 ---- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/command/init.go b/command/init.go index fa62cf15..579b21d0 100644 --- a/command/init.go +++ b/command/init.go @@ -28,16 +28,16 @@ func init() { "getset": GetSet, "getrange": GetRange, // "msetnx": MSetNx, - "setnx": SetNx, - "setex": SetEx, - "psetex": PSetEx, - "setrange": SetRange, - "setbit": SetBit, + "setnx": SetNx, + "setex": SetEx, + "psetex": PSetEx, + //"setrange": SetRange, + //"setbit": SetBit, // "bitop": BitOp, // "bitfield": BitField, - "getbit": GetBit, - "bitpos": BitPos, - "bitcount": BitCount, + //"getbit": GetBit, + //"bitpos": BitPos, + //"bitcount": BitCount, "incr": Incr, "incrby": IncrBy, "decr": Decr, @@ -135,29 +135,29 @@ func init() { "rpushx": Desc{Proc: AutoCommit(RPushx), Cons: Constraint{-3, flags("wmF"), 1, 1, 1}}, // strings - "get": Desc{Proc: AutoCommit(Get), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, - "set": Desc{Proc: AutoCommit(Set), Cons: Constraint{-3, flags("wm"), 1, 1, 1}}, - "setnx": Desc{Proc: AutoCommit(SetNx), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, - "setex": Desc{Proc: AutoCommit(SetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, - "psetex": Desc{Proc: AutoCommit(PSetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, - "mget": Desc{Proc: AutoCommit(MGet), Cons: Constraint{-2, flags("rF"), 1, -1, 1}}, - "mset": Desc{Proc: AutoCommit(MSet), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, - "msetnx": Desc{Proc: AutoCommit(MSetNx), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, - "strlen": Desc{Proc: AutoCommit(Strlen), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, - "append": Desc{Proc: AutoCommit(Append), Cons: Constraint{3, flags("wm"), 1, 1, 1}}, - "setrange": Desc{Proc: AutoCommit(SetRange), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "get": Desc{Proc: AutoCommit(Get), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, + "set": Desc{Proc: AutoCommit(Set), Cons: Constraint{-3, flags("wm"), 1, 1, 1}}, + "setnx": Desc{Proc: AutoCommit(SetNx), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, + "setex": Desc{Proc: AutoCommit(SetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "psetex": Desc{Proc: AutoCommit(PSetEx), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + "mget": Desc{Proc: AutoCommit(MGet), Cons: Constraint{-2, flags("rF"), 1, -1, 1}}, + "mset": Desc{Proc: AutoCommit(MSet), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, + //"msetnx": Desc{Proc: AutoCommit(MSetNx), Cons: Constraint{-3, flags("wm"), 1, -1, 2}}, //run test in tests/redis/unit/type/string failed + "strlen": Desc{Proc: AutoCommit(Strlen), Cons: Constraint{2, 
flags("rF"), 1, 1, 1}}, + "append": Desc{Proc: AutoCommit(Append), Cons: Constraint{3, flags("wm"), 1, 1, 1}}, + //"setrange": Desc{Proc: AutoCommit(SetRange), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, //run test in tests/redis/unit/type/string failed "getrange": Desc{Proc: AutoCommit(GetRange), Cons: Constraint{4, flags("r"), 1, 1, 1}}, "incr": Desc{Proc: AutoCommit(Incr), Cons: Constraint{2, flags("wmF"), 1, 1, 1}}, "decr": Desc{Proc: AutoCommit(Decr), Cons: Constraint{2, flags("wmF"), 1, 1, 1}}, "incrby": Desc{Proc: AutoCommit(IncrBy), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, "decrby": Desc{Proc: AutoCommit(DecrBy), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, "incrbyfloat": Desc{Proc: AutoCommit(IncrByFloat), Cons: Constraint{3, flags("wmF"), 1, 1, 1}}, - "setbit": Desc{Proc: AutoCommit(SetBit), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, + //"setbit": Desc{Proc: AutoCommit(SetBit), Cons: Constraint{4, flags("wm"), 1, 1, 1}}, // "bitop": Desc{Proc: AutoCommit(BitOp), Cons: Constraint{-4, flags("wm"), 2, -1, 1}}, // "bitfield": Desc{Proc: AutoCommit(BitField), Cons: Constraint{-2, flags("wm"), 1, 1, 1}}, - "getbit": Desc{Proc: AutoCommit(GetBit), Cons: Constraint{3, flags("r"), 1, 1, 1}}, - "bitcount": Desc{Proc: AutoCommit(BitCount), Cons: Constraint{-2, flags("r"), 1, 1, 1}}, - "bitpos": Desc{Proc: AutoCommit(BitPos), Cons: Constraint{-3, flags("r"), 1, 1, 1}}, + //"getbit": Desc{Proc: AutoCommit(GetBit), Cons: Constraint{3, flags("r"), 1, 1, 1}}, + //"bitcount": Desc{Proc: AutoCommit(BitCount), Cons: Constraint{-2, flags("r"), 1, 1, 1}}, + //"bitpos": Desc{Proc: AutoCommit(BitPos), Cons: Constraint{-3, flags("r"), 1, 1, 1}}, // keys "type": Desc{Proc: AutoCommit(Type), Cons: Constraint{2, flags("rF"), 1, 1, 1}}, diff --git a/tests/redis/support/test_common.tcl b/tests/redis/support/test_common.tcl index 3bde792a..ec087751 100644 --- a/tests/redis/support/test_common.tcl +++ b/tests/redis/support/test_common.tcl @@ -14,15 +14,9 @@ array set ::toTestCase { "MGET against non-string key" 1 "MSET base case" 1 "MSET wrong number of args" 1 - "MSETNX with already existent key" 1 - "MSETNX with not existing keys" 1 "STRLEN against non-existing key" 1 "STRLEN against integer-encoded value" 1 "STRLEN against plain string" 1 - "SETRANGE against non-existing key" 1 - "SETRANGE against string-encoded key" 1 - "SETRANGE against key with wrong type" 1 - "SETRANGE with out of range offset" 1 "GETRANGE against non-existing key" 1 "GETRANGE against string value" 1 "GETRANGE fuzzing" 1 From c4eea836c17e374e4d4baea2cc02294965dda169 Mon Sep 17 00:00:00 2001 From: xufangping Date: Mon, 2 Mar 2020 16:47:44 +0800 Subject: [PATCH 56/60] comment unit test of unsupported string functions in strings_test.go --- command/strings_test.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/command/strings_test.go b/command/strings_test.go index a1065092..5fb3e92a 100644 --- a/command/strings_test.go +++ b/command/strings_test.go @@ -323,7 +323,7 @@ func TestStringPSetEx(t *testing.T) { } -func TestStringSetRange(t *testing.T) { +/*func TestStringSetRange(t *testing.T) { args := make([]string, 3) key := "setrange" args[0] = key @@ -364,7 +364,7 @@ func TestStringSetRange(t *testing.T) { ctx = ContextTest("setrange", args...) 
Call(ctx) assert.Contains(t, ctxString(ctx.Out), ErrMaximum.Error()) -} +}*/ func TestStringIncr(t *testing.T) { args := make([]string, 1) args[0] = "incr" @@ -469,7 +469,7 @@ func TestStringMset(t *testing.T) { assert.Contains(t, ctxString(ctx.Out), ErrMSet.Error()) } -func TestStringMsetNx(t *testing.T) { +/*func TestStringMsetNx(t *testing.T) { args := make([]string, 4) args[0] = "MsetN1" args[1] = "MsetN3" @@ -490,7 +490,7 @@ func TestStringMsetNx(t *testing.T) { Call(ctx) assert.Contains(t, ctxString(ctx.Out), "0") EqualGet(t, args[0], args[1], nil) -} +}*/ func TestStringAppend(t *testing.T) { args := make([]string, 2) @@ -504,7 +504,7 @@ func TestStringAppend(t *testing.T) { assert.Contains(t, out.String(), strconv.Itoa(len(args[1])*2)) } -func TestStringSetBit(t *testing.T) { +/*func TestStringSetBit(t *testing.T) { tests := []struct { name string args []string @@ -548,9 +548,9 @@ func TestStringSetBit(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringGetBit(t *testing.T) { +/*func TestStringGetBit(t *testing.T) { CallTest("setbit", "getbit", "5", "1") tests := []struct { name string @@ -590,9 +590,9 @@ func TestStringGetBit(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringBitCount(t *testing.T) { +/*func TestStringBitCount(t *testing.T) { CallTest("setbit", "bit-count", "5", "1") CallTest("setbit", "bit-count", "9", "1") tests := []struct { @@ -633,9 +633,9 @@ func TestStringBitCount(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ -func TestStringBitPos(t *testing.T) { +/*func TestStringBitPos(t *testing.T) { CallTest("set", "bit-pos", "5") tests := []struct { name string @@ -685,4 +685,4 @@ func TestStringBitPos(t *testing.T) { assert.Contains(t, out.String(), tt.want) }) } -} +}*/ From bd43ec2322c66d4912501541f7d206a12177d3bc Mon Sep 17 00:00:00 2001 From: xufangping Date: Tue, 21 Apr 2020 20:44:19 +0800 Subject: [PATCH 57/60] hash expire-key, it will improve the keys num handled every seconds, and prevent that current expire-keys region writing node have higher load than other nodes --- db/db.go | 8 ++- db/expire.go | 157 +++++++++++++++++++++++++++++++----------- db/expire_test.go | 20 ++++-- metrics/prometheus.go | 25 ++++--- 4 files changed, 154 insertions(+), 56 deletions(-) diff --git a/db/db.go b/db/db.go index d431ef38..c9145ac9 100644 --- a/db/db.go +++ b/db/db.go @@ -118,8 +118,14 @@ func Open(conf *conf.Tikv) (*RedisStore, error) { } rds := &RedisStore{Storage: s, conf: conf} sysdb := rds.DB(sysNamespace, sysDatabaseID) + ls := NewLeaderStatus() go StartGC(sysdb, &conf.GC) - go StartExpire(sysdb, &conf.Expire) + go setExpireIsLeader(sysdb, &conf.Expire, ls) + go startExpire(sysdb, &conf.Expire, ls, "") + for i := 0; i < EXPIRE_HASH_NUM; i++ { + expireHash := fmt.Sprintf("%04d", i) + go startExpire(sysdb, &conf.Expire, ls, expireHash) + } go StartZT(sysdb, &conf.ZT) go StartTikvGC(sysdb, &conf.TikvGC) return rds, nil diff --git a/db/expire.go b/db/expire.go index 6a85e27a..4a07f1aa 100644 --- a/db/expire.go +++ b/db/expire.go @@ -3,6 +3,9 @@ package db import ( "bytes" "context" + "fmt" + "hash/crc32" + "sync" "time" "github.com/distributedio/titan/conf" @@ -21,9 +24,37 @@ var ( ) const ( - expire_worker = "expire" + expire_worker = "expire" + EXPIRE_HASH_NUM = 256 ) +type LeaderStatus struct { + isLeader bool + cond *sync.Cond +} + +func NewLeaderStatus() *LeaderStatus { + return &LeaderStatus{ + cond: sync.NewCond(new(sync.Mutex)), + } +} + +func (ls *LeaderStatus) 
setIsLeader(isLeader bool) { + ls.cond.L.Lock() + defer ls.cond.L.Unlock() + + ls.isLeader = isLeader + ls.cond.Broadcast() +} + +func (ls *LeaderStatus) getIsLeader() bool { + ls.cond.L.Lock() + defer ls.cond.L.Unlock() + + ls.cond.Wait() + return ls.isLeader +} + // IsExpired judge object expire through now func IsExpired(obj *Object, now int64) bool { if obj.ExpireAt == 0 || obj.ExpireAt > now { @@ -33,8 +64,12 @@ func IsExpired(obj *Object, now int64) bool { } func expireKey(key []byte, ts int64) []byte { + hashnum := crc32.ChecksumIEEE(key) + hashPrefix := fmt.Sprintf("%04d", hashnum%EXPIRE_HASH_NUM) var buf []byte buf = append(buf, expireKeyPrefix...) + buf = append(buf, []byte(hashPrefix)...) + buf = append(buf, ':') buf = append(buf, EncodeInt64(ts)...) buf = append(buf, ':') buf = append(buf, key...) @@ -82,21 +117,21 @@ func unExpireAt(txn store.Transaction, mkey []byte, expireAt int64) error { return nil } -// StartExpire get leader from db -func StartExpire(db *DB, conf *conf.Expire) error { +// setExpireIsLeader get leader from db +func setExpireIsLeader(db *DB, conf *conf.Expire, ls *LeaderStatus) error { ticker := time.NewTicker(conf.Interval) defer ticker.Stop() id := UUID() - lastExpireEndTs := int64(0) for range ticker.C { if conf.Disable { + ls.setIsLeader(false) continue } - start := time.Now() isLeader, err := isLeader(db, sysExpireLeader, id, conf.LeaderLifeTime) if err != nil { zap.L().Error("[Expire] check expire leader failed", zap.Error(err)) + ls.setIsLeader(false) continue } if !isLeader { @@ -105,14 +140,29 @@ func StartExpire(db *DB, conf *conf.Expire) error { zap.ByteString("uuid", id), zap.Duration("leader-life-time", conf.LeaderLifeTime)) } + ls.setIsLeader(isLeader) continue } - lastExpireEndTs = runExpire(db, conf.BatchLimit, lastExpireEndTs) - metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + ls.setIsLeader(isLeader) } return nil } +func startExpire(db *DB, conf *conf.Expire, ls *LeaderStatus, expireHash string) { + ticker := time.NewTicker(conf.Interval) + defer ticker.Stop() + lastExpireEndTs := int64(0) + for range ticker.C { + if !ls.getIsLeader() { + continue + } + + start := time.Now() + lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + } +} + // split a meta key with format: {namespace}:{id}:M:{key} func splitMetaKey(key []byte) ([]byte, DBID, []byte) { idx := bytes.Index(key, []byte{':'}) @@ -142,10 +192,25 @@ func toTikvScorePrefix(namespace []byte, id DBID, key []byte) []byte { return b } -func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { +func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) int64 { + curExpireTimestampOffset := expireTimestampOffset + curExpireMetakeyOffset := expireMetakeyOffset + var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine + curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) + var expireLogFlag string + if expireHash != "" { + curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) 
+ curExpireKeyPrefix = append(curExpireKeyPrefix, ':') + curExpireTimestampOffset += len(expireHash) + len(":") + curExpireMetakeyOffset += len(expireHash) + len(":") + expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) + } else { + expireLogFlag = "[Expire]" + } + txn, err := db.Begin() if err != nil { - zap.L().Error("[Expire] txn begin failed", zap.Error(err)) + zap.L().Error(expireLogFlag+" txn begin failed", zap.Error(err)) return 0 } @@ -154,26 +219,26 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { //we seek end in "at:" replace in "at;" , it can reduce the seek range and seek the deleted expired keys as little as possible. //the behavior should reduce the expire delay in days and get/mget timeout, which are caused by rocksdb tomstone problem var endPrefix []byte - endPrefix = append(endPrefix, expireKeyPrefix...) + endPrefix = append(endPrefix, curExpireKeyPrefix...) endPrefix = append(endPrefix, EncodeInt64(now+1)...) var startPrefix []byte if lastExpireEndTs > 0 { - startPrefix = append(startPrefix, expireKeyPrefix...) + startPrefix = append(startPrefix, curExpireKeyPrefix...) startPrefix = append(startPrefix, EncodeInt64(lastExpireEndTs)...) startPrefix = append(startPrefix, ':') } else { - startPrefix = expireKeyPrefix + startPrefix = curExpireKeyPrefix } start := time.Now() iter, err := txn.t.Iter(startPrefix, endPrefix) metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] seek expire keys"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" seek expire keys"); logEnv != nil { logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } if err != nil { - zap.L().Error("[Expire] seek failed", zap.ByteString("prefix", expireKeyPrefix), zap.Error(err)) + zap.L().Error(expireLogFlag+" seek failed", zap.ByteString("prefix", curExpireKeyPrefix), zap.Error(err)) txn.Rollback() return 0 } @@ -181,31 +246,31 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { thisExpireEndTs := int64(0) ts := now - for iter.Valid() && iter.Key().HasPrefix(expireKeyPrefix) && limit > 0 { + for iter.Valid() && iter.Key().HasPrefix(curExpireKeyPrefix) && limit > 0 { rawKey := iter.Key() - ts = DecodeInt64(rawKey[expireTimestampOffset : expireTimestampOffset+8]) + ts = DecodeInt64(rawKey[curExpireTimestampOffset : curExpireTimestampOffset+8]) if ts > now { - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] not need to expire key"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" not need to expire key"); logEnv != nil { logEnv.Write(zap.String("raw-key", string(rawKey)), zap.Int64("last-timestamp", ts)) } break } - mkey := rawKey[expireMetakeyOffset:] - if err := doExpire(txn, mkey, iter.Value()); err != nil { + mkey := rawKey[curExpireMetakeyOffset:] + if err := doExpire(txn, mkey, iter.Value(), expireLogFlag, ts); err != nil { txn.Rollback() return 0 } // Remove from expire list if err := txn.t.Delete(rawKey); err != nil { - zap.L().Error("[Expire] delete failed", + zap.L().Error(expireLogFlag+" delete failed", zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() return 0 } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete expire list item"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" delete expire list item"); logEnv != nil { logEnv.Write(zap.Int64("ts", ts), zap.ByteString("mkey", mkey)) } @@ -216,7 
+281,7 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) } if err != nil { - zap.L().Error("[Expire] next failed", + zap.L().Error(expireLogFlag+" next failed", zap.ByteString("mkey", mkey), zap.Error(err)) txn.Rollback() @@ -233,13 +298,11 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { } now = time.Now().UnixNano() - diff := (ts - now) / int64(time.Second) - if diff >= 0 { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(float64(diff)) - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(0) + if ts < now { + diff := (now - ts) / int64(time.Second) + metrics.GetMetrics().ExpireDelaySecondsVec.WithLabelValues("delay-" + expireHash).Set(float64(diff)) } else { - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("delay").Set(float64(-1 * diff)) - metrics.GetMetrics().ExpireLeftSecondsVec.WithLabelValues("left").Set(0) + metrics.GetMetrics().ExpireDelaySecondsVec.WithLabelValues("delay-" + expireHash).Set(0) } start = time.Now() @@ -247,10 +310,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) if err != nil { txn.Rollback() - zap.L().Error("[Expire] commit failed", zap.Error(err)) + zap.L().Error(expireLogFlag+" commit failed", zap.Error(err)) } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] expired end"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" expired end"); logEnv != nil { logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } @@ -258,10 +321,10 @@ func runExpire(db *DB, batchLimit int, lastExpireEndTs int64) int64 { return thisExpireEndTs } -func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) error { +func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte, expireLogFlag string) error { dkey := toTikvDataKey(namespace, dbid, id) if err := gc(txn.t, dkey); err != nil { - zap.L().Error("[Expire] gc failed", + zap.L().Error(expireLogFlag+" gc failed", zap.ByteString("key", key), zap.ByteString("namepace", namespace), zap.Int64("db_id", int64(dbid)), @@ -269,17 +332,18 @@ func gcDataKey(txn *Transaction, namespace []byte, dbid DBID, key, id []byte) er zap.Error(err)) return err } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] gc data key"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" gc data key"); logEnv != nil { logEnv.Write(zap.ByteString("obj_id", id)) } return nil } -func doExpire(txn *Transaction, mkey, id []byte) error { + +func doExpire(txn *Transaction, mkey, id []byte, expireLogFlag string, expireAt int64) error { namespace, dbid, key := splitMetaKey(mkey) obj, err := getObject(txn, mkey) // Check for dirty data due to copying or flushdb/flushall if err == ErrKeyNotFound { - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) } if err != nil { return err @@ -288,23 +352,38 @@ func doExpire(txn *Transaction, mkey, id []byte) error { if len(id) > idLen { id = id[:idLen] } + + //if a not-string structure haven't been deleted and set by user again after expire-time, because the expire go-routine is too slow and delayed. 
+ //the id in old expire-keys's value is different with the new one in the new key's value + //so comparing id in doExpire() is necessary and also can handle below scenarios(should just delete old id object's data): + //a not-string structure was set with unhashed expire-key, and then deleted and set again with hashed expire-key + //or a string was set with unhashed expire-key, and set again with hashed expire-key if !bytes.Equal(obj.ID, id) { - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) + } + + //compare expire-key's ts with object.expireat(their object id is same in the condition), + //if different, means it's a not-string structure and its expire-key was rewriten in hashed prefix, but old ones was writen in unhashed prefix + if obj.ExpireAt != expireAt { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" it should be unhashed expire key un-matching key's expireAt, skip doExpire"); logEnv != nil { + logEnv.Write(zap.ByteString("mkey", mkey), zap.Int64("this expire key's ts", expireAt), zap.Int64("key's expireAt", obj.ExpireAt)) + } + return nil } // Delete object meta if err := txn.t.Delete(mkey); err != nil { - zap.L().Error("[Expire] delete failed", + zap.L().Error(expireLogFlag+" delete failed", zap.ByteString("key", key), zap.Error(err)) return err } - if logEnv := zap.L().Check(zap.DebugLevel, "[Expire] delete metakey"); logEnv != nil { + if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" delete metakey"); logEnv != nil { logEnv.Write(zap.ByteString("mkey", mkey)) } if obj.Type == ObjectString { return nil } - return gcDataKey(txn, namespace, dbid, key, id) + return gcDataKey(txn, namespace, dbid, key, id, expireLogFlag) } diff --git a/db/expire_test.go b/db/expire_test.go index 2a75c850..4d30186a 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -3,6 +3,7 @@ package db import ( "bytes" "context" + "fmt" "testing" "time" @@ -111,7 +112,10 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) - runExpire(txn.db, 1, 0) + for i := 0; i < EXPIRE_HASH_NUM; i++ { + expireHash := fmt.Sprintf("%04d", i) + runExpire(txn.db, 1, expireHash, 0) + } txn.Commit(context.TODO()) txn = getTxn(t) @@ -183,9 +187,10 @@ func Test_doExpire(t *testing.T) { } tests := []struct { - name string - args args - want want + name string + args args + want want + expireAt int64 }{ { name: "TestExpiredHash", @@ -196,6 +201,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredRewriteHash", @@ -206,6 +212,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: expireAt, }, { name: "TestExpiredNotExistsMeta", @@ -216,6 +223,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredHash_dirty_data", @@ -227,6 +235,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: 0, }, { name: "TestExpiredRewriteHash_dirty_data", @@ -238,6 +247,7 @@ func Test_doExpire(t *testing.T) { want: want{ gckey: true, }, + expireAt: expireAt, }, } for _, tt := range tests { @@ -247,7 +257,7 @@ func Test_doExpire(t *testing.T) { if tt.args.tp == byte(ObjectHash) { id = append(id, tt.args.tp) } - err := doExpire(txn, tt.args.mkey, id) + err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) txn.Commit(context.TODO()) assert.NoError(t, err) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 5e72b959..f057471e 100644 
--- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -49,13 +49,13 @@ type Metrics struct { ConnectionOnlineGaugeVec *prometheus.GaugeVec //command - ZTInfoCounterVec *prometheus.CounterVec - IsLeaderGaugeVec *prometheus.GaugeVec - ExpireLeftSecondsVec *prometheus.GaugeVec - LimiterQpsVec *prometheus.GaugeVec - LimiterRateVec *prometheus.GaugeVec - LRangeSeekHistogram prometheus.Histogram - GCKeysCounterVec *prometheus.CounterVec + ZTInfoCounterVec *prometheus.CounterVec + IsLeaderGaugeVec *prometheus.GaugeVec + ExpireDelaySecondsVec *prometheus.GaugeVec + LimiterQpsVec *prometheus.GaugeVec + LimiterRateVec *prometheus.GaugeVec + LRangeSeekHistogram prometheus.Histogram + GCKeysCounterVec *prometheus.CounterVec //expire ExpireKeysTotal *prometheus.CounterVec @@ -191,13 +191,13 @@ func init() { }, bizLabel) prometheus.MustRegister(gm.ConnectionOnlineGaugeVec) - gm.ExpireLeftSecondsVec = prometheus.NewGaugeVec( + gm.ExpireDelaySecondsVec = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, - Name: "expire_left_seconds", - Help: "The seconds after which from now will do expire", + Name: "expire_delay_seconds", + Help: "how mach the processing expire-ts delay to now", }, expireLabel) - prometheus.MustRegister(gm.ExpireLeftSecondsVec) + prometheus.MustRegister(gm.ExpireDelaySecondsVec) gm.LimiterQpsVec = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -278,6 +278,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_round_cost", + Buckets: prometheus.ExponentialBuckets(0.02, 2, 8), Help: "the cost of a round expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerRoundCostHistogramVec) @@ -286,6 +287,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_seek_cost", + Buckets: prometheus.ExponentialBuckets(0.002, 2, 12), Help: "the cost of tikv seek in expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerSeekCostHistogramVec) @@ -294,6 +296,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "worker_commit_cost", + Buckets: prometheus.ExponentialBuckets(0.005, 2, 6), Help: "the cost of commit in expire/gc worker", }, workerLabel) prometheus.MustRegister(gm.WorkerCommitCostHistogramVec) From fe6e84b5548a443bc4629229c28982fbe25a5e9e Mon Sep 17 00:00:00 2001 From: xufangping Date: Fri, 24 Apr 2020 19:35:08 +0800 Subject: [PATCH 58/60] 1 fix runExpire's problem that unhashed goroutine scan hashed expire-key 2 fix expire_test.go: gc key can get before runExpire/doExpire if expireat < now; add 2 testCases for hash/string expire: check key is deleted and gc key exists after runExpire --- db/expire.go | 14 +++--- db/expire_test.go | 115 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 115 insertions(+), 14 deletions(-) diff --git a/db/expire.go b/db/expire.go index 4a07f1aa..44cb2953 100644 --- a/db/expire.go +++ b/db/expire.go @@ -15,8 +15,9 @@ import ( ) var ( - expireKeyPrefix = []byte("$sys:0:at:") - sysExpireLeader = []byte("$sys:0:EXL:EXLeader") + expireKeyPrefix = []byte("$sys:0:at:") + hashExpireKeyPrefix = expireKeyPrefix[:len(expireKeyPrefix)-1] + sysExpireLeader = []byte("$sys:0:EXL:EXLeader") // $sys:0:at:{ts}:{metaKey} expireTimestampOffset = len(expireKeyPrefix) @@ -67,7 +68,7 @@ func expireKey(key []byte, ts int64) []byte { hashnum := crc32.ChecksumIEEE(key) hashPrefix := fmt.Sprintf("%04d", hashnum%EXPIRE_HASH_NUM) var buf []byte - buf = append(buf, expireKeyPrefix...) + buf = append(buf, hashExpireKeyPrefix...) 
buf = append(buf, []byte(hashPrefix)...) buf = append(buf, ':') buf = append(buf, EncodeInt64(ts)...) @@ -196,15 +197,16 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireTimestampOffset := expireTimestampOffset curExpireMetakeyOffset := expireMetakeyOffset var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine - curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) var expireLogFlag string if expireHash != "" { + curExpireKeyPrefix = append(curExpireKeyPrefix, hashExpireKeyPrefix...) curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) curExpireKeyPrefix = append(curExpireKeyPrefix, ':') - curExpireTimestampOffset += len(expireHash) + len(":") - curExpireMetakeyOffset += len(expireHash) + len(":") + curExpireTimestampOffset += len(expireHash) + curExpireMetakeyOffset += len(expireHash) expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) } else { + curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) expireLogFlag = "[Expire]" } diff --git a/db/expire_test.go b/db/expire_test.go index 4d30186a..dbb1a4f1 100644 --- a/db/expire_test.go +++ b/db/expire_test.go @@ -18,7 +18,7 @@ func getTxn(t *testing.T) *Transaction { return txn } -func Test_runExpire(t *testing.T) { +func Test_setExpired_runExpire(t *testing.T) { hashKey := []byte("TestExpiredHash") strKey := []byte("TestExpiredString") expireAt := (time.Now().Unix() - 30) * int64(time.Second) @@ -112,6 +112,99 @@ func Test_runExpire(t *testing.T) { t.Run(tt.name, func(t *testing.T) { id := tt.args.call(t, tt.args.key) txn := getTxn(t) + gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, id)) + + _, err := txn.t.Get(gcKey) + txn.Commit(context.TODO()) + if tt.want.gckey { + assert.NoError(t, err) + } else { + assert.Equal(t, true, store.IsErrNotFound(err)) + } + }) + } + +} + +func Test_runExpire(t *testing.T) { + hashKey := []byte("TestHashToExpire") + strKey := []byte("TestStringToExpire") + expireAt := (time.Now().Unix() + 1) * int64(time.Second) + hashCall := func(t *testing.T, key []byte) []byte { + hash, txn, err := getHash(t, []byte(key)) + oldID := hash.meta.ID + assert.NoError(t, err) + assert.NotNil(t, txn) + assert.NotNil(t, hash) + hash.HSet([]byte("field1"), []byte("val")) + + kv := GetKv(txn) + err = kv.ExpireAt([]byte(key), expireAt) + assert.NoError(t, err) + txn.Commit(context.TODO()) + return oldID + } + + stringCall := func(t *testing.T, key []byte) []byte { + txn := getTxn(t) + str, err := GetString(txn, key) + oldID := str.Meta.ID + assert.NoError(t, err) + assert.NotNil(t, txn) + assert.NotNil(t, str) + str.Set([]byte("value"), 0) + + kv := GetKv(txn) + err = kv.ExpireAt([]byte(key), expireAt) + assert.NoError(t, err) + txn.Commit(context.TODO()) + return oldID + } + + type args struct { + key []byte + call func(*testing.T, []byte) []byte + } + type want struct { + gckey bool + } + + tests := []struct { + name string + args args + want want + }{ + { + name: "TestHashExpire", + args: args{ + key: hashKey, + call: hashCall, + }, + want: want{ + gckey: true, + }, + }, + { + name: "TestStringExpire", + args: args{ + key: strKey, + call: stringCall, + }, + want: want{ + gckey: false, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + id := tt.args.call(t, tt.args.key) + txn := getTxn(t) + now := time.Now().UnixNano() + if now < expireAt { + time.Sleep(time.Duration(expireAt - now)) + } + runExpire(txn.db, 1, "", 0) for i := 0; i < EXPIRE_HASH_NUM; i++ { expireHash := 
fmt.Sprintf("%04d", i) runExpire(txn.db, 1, expireHash, 0) @@ -119,21 +212,25 @@ func Test_runExpire(t *testing.T) { txn.Commit(context.TODO()) txn = getTxn(t) + metaKey := MetaKey(txn.db, tt.args.key) + _, err := txn.t.Get(metaKey) + assert.Equal(t, true, store.IsErrNotFound(err)) + gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, id)) - _, err := txn.t.Get(gcKey) + _, err = txn.t.Get(gcKey) txn.Commit(context.TODO()) if tt.want.gckey { assert.NoError(t, err) } else { assert.Equal(t, true, store.IsErrNotFound(err)) } + expireAt = (time.Now().Unix() + 1) * int64(time.Second) }) } - } -func Test_doExpire(t *testing.T) { +func Test_setExpired_doExpire(t *testing.T) { initHash := func(t *testing.T, key []byte) []byte { hash, txn, err := getHash(t, key) assert.NoError(t, err) @@ -257,14 +354,16 @@ func Test_doExpire(t *testing.T) { if tt.args.tp == byte(ObjectHash) { id = append(id, tt.args.tp) } - err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) - txn.Commit(context.TODO()) - assert.NoError(t, err) + if tt.expireAt == 0 { + err := doExpire(txn, tt.args.mkey, id, "", tt.expireAt) + txn.Commit(context.TODO()) + assert.NoError(t, err) + } txn = getTxn(t) gcKey := toTikvGCKey(toTikvDataKey([]byte(txn.db.Namespace), txn.db.ID, tt.args.id)) - _, err = txn.t.Get(gcKey) + _, err := txn.t.Get(gcKey) txn.Commit(context.TODO()) if tt.want.gckey { assert.NoError(t, err) } else { assert.Equal(t, true, store.IsErrNotFound(err)) From 977b4578248f8bdcefbb6a0728e89ac3d55249d0 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sun, 26 Apr 2020 12:27:04 +0800 Subject: [PATCH 59/60] unhashed expire goroutine uses an individual limit configuration item and individual expired&round&seek&commit metrics labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conf/config.go | 9 +++++---- conf/mockconfig.go | 9 +++++---- conf/titan.toml | 10 ++++++++-- db/expire.go | 30 ++++++++++++++++++++++-------- 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/conf/config.go b/conf/config.go index 6d88c2ac..5d767090 100644 --- a/conf/config.go +++ b/conf/config.go @@ -66,10 +66,11 @@ type GC struct { // Expire config is the config of Titan expire work type Expire struct { - Disable bool `cfg:"disable; false; boolean; false is used to disable expire"` - Interval time.Duration `cfg:"interval;1s;;expire work tick interval"` - LeaderLifeTime time.Duration `cfg:"leader-life-time;3m;;lease flush leader interval"` - BatchLimit int `cfg:"batch-limit;256;numeric;key count limitation per-transection"` + Disable bool `cfg:"disable; false; boolean; false is used to disable expire"` + Interval time.Duration `cfg:"interval;1s;;expire work tick interval"` + LeaderLifeTime time.Duration `cfg:"leader-life-time;3m;;lease flush leader interval"` + BatchLimit int `cfg:"batch-limit;10;numeric;hashed expire-key count limitation per-transaction"` + UnhashBatchLimit int `cfg:"unhash-batch-limit;256;numeric;unhashed expire-key count limitation per-transaction"` } // ZT config is the config of zlist diff --git a/conf/mockconfig.go b/conf/mockconfig.go index 9a23a2bb..faacdfd1 100644 --- a/conf/mockconfig.go +++ b/conf/mockconfig.go @@ -14,10 +14,11 @@ func MockConf() *Titan { BatchLimit: 256, }, Expire: Expire{ - Disable: false, - Interval: time.Second, - LeaderLifeTime: 3 * time.Minute, - BatchLimit: 256, + Disable: false, + Interval: time.Second, + LeaderLifeTime: 3 * time.Minute, + BatchLimit: 10, + UnhashBatchLimit: 256, }, ZT: ZT{ Disable: false, diff --git a/conf/titan.toml 
b/conf/titan.toml index 0c5375dd..400b4b99 100644 --- a/conf/titan.toml +++ b/conf/titan.toml @@ -129,9 +129,15 @@ pd-addrs = "mocktikv://" #type: int #rules: numeric -#description: key count limitation per-transection +#description: hashed expire-key count limitation per-transection +#default: 10 +#batch-limit = 10 + +#type: int +#rules: numeric +#description: unhashed expire-key count limitation per-transection #default: 256 -#batch-limit = 256 +#unhash-batch-limit = 256 [tikv.zt] diff --git a/db/expire.go b/db/expire.go index 44cb2953..b174a141 100644 --- a/db/expire.go +++ b/db/expire.go @@ -25,8 +25,9 @@ var ( ) const ( - expire_worker = "expire" - EXPIRE_HASH_NUM = 256 + expire_worker = "expire" + expire_unhash_worker = "expire-unhash" + EXPIRE_HASH_NUM = 256 ) type LeaderStatus struct { @@ -159,8 +160,14 @@ func startExpire(db *DB, conf *conf.Expire, ls *LeaderStatus, expireHash string) } start := time.Now() - lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) - metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + if expireHash != "" { + lastExpireEndTs = runExpire(db, conf.BatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + } else { + lastExpireEndTs = runExpire(db, conf.UnhashBatchLimit, expireHash, lastExpireEndTs) + metrics.GetMetrics().WorkerRoundCostHistogramVec.WithLabelValues(expire_unhash_worker).Observe(time.Since(start).Seconds()) + } + } } @@ -198,6 +205,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireMetakeyOffset := expireMetakeyOffset var curExpireKeyPrefix []byte //expireKeyPrefix of current go routine var expireLogFlag string + var metricsLabel string if expireHash != "" { curExpireKeyPrefix = append(curExpireKeyPrefix, hashExpireKeyPrefix...) curExpireKeyPrefix = append(curExpireKeyPrefix, expireHash...) @@ -205,9 +213,11 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) curExpireTimestampOffset += len(expireHash) curExpireMetakeyOffset += len(expireHash) expireLogFlag = fmt.Sprintf("[Expire-%s]", expireHash) + metricsLabel = expire_worker } else { curExpireKeyPrefix = append(curExpireKeyPrefix, expireKeyPrefix...) 
expireLogFlag = "[Expire]" + metricsLabel = expire_unhash_worker } txn, err := db.Begin() @@ -235,7 +245,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) start := time.Now() iter, err := txn.t.Iter(startPrefix, endPrefix) - metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(metricsLabel).Observe(time.Since(start).Seconds()) if logEnv := zap.L().Check(zap.DebugLevel, expireLogFlag+" seek expire keys"); logEnv != nil { logEnv.Write(zap.Int64("[startTs", lastExpireEndTs), zap.Int64("endTs)", now+1)) } @@ -280,7 +290,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) err := iter.Next() cost := time.Since(start) if cost >= time.Millisecond { - metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(expire_worker).Observe(cost.Seconds()) + metrics.GetMetrics().WorkerSeekCostHistogramVec.WithLabelValues(metricsLabel).Observe(cost.Seconds()) } if err != nil { zap.L().Error(expireLogFlag+" next failed", @@ -309,7 +319,7 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) start = time.Now() err = txn.Commit(context.Background()) - metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(expire_worker).Observe(time.Since(start).Seconds()) + metrics.GetMetrics().WorkerCommitCostHistogramVec.WithLabelValues(metricsLabel).Observe(time.Since(start).Seconds()) if err != nil { txn.Rollback() zap.L().Error(expireLogFlag+" commit failed", zap.Error(err)) @@ -319,7 +329,11 @@ func runExpire(db *DB, batchLimit int, expireHash string, lastExpireEndTs int64) logEnv.Write(zap.Int("expired_num", batchLimit-limit)) } - metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + if expireHash != "" { + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired").Add(float64(batchLimit - limit)) + } else { + metrics.GetMetrics().ExpireKeysTotal.WithLabelValues("expired-unhash").Add(float64(batchLimit - limit)) + } return thisExpireEndTs } From 9161d7e5156bb2fac37f032456ea965b10d40853 Mon Sep 17 00:00:00 2001 From: xufangping Date: Sun, 26 Apr 2020 15:10:53 +0800 Subject: [PATCH 60/60] buckets of comand/limit/commandFunc/txn commit cost is too many to load for dashboard, replace 1.4 with 2, buckets num decrease 50% --- metrics/prometheus.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index f057471e..7eb897c0 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -91,7 +91,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 20), Help: "The cost times of command call", }, multiLabel) prometheus.MustRegister(gm.CommandCallHistogramVec) @@ -125,7 +125,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "limit_cost_seconds", - Buckets: prometheus.ExponentialBuckets(0.0002, 1.4, 20), + Buckets: prometheus.ExponentialBuckets(0.0002, 2, 10), Help: "the cost times of command execute's limit", }, multiLabel) prometheus.MustRegister(gm.LimitCostHistogramVec) @@ -143,7 +143,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "command_func_done_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 40), + Buckets: 
prometheus.ExponentialBuckets(0.0005, 2, 20), Help: "The cost times of command func", }, multiLabel) prometheus.MustRegister(gm.CommandFuncDoneHistogramVec) @@ -152,7 +152,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "txn_commit_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 15), Help: "The cost times of txn commit", }, multiLabel) prometheus.MustRegister(gm.TxnCommitHistogramVec) @@ -219,7 +219,7 @@ func init() { prometheus.HistogramOpts{ Namespace: namespace, Name: "lrange_seek_duration_seconds", - Buckets: prometheus.ExponentialBuckets(0.0005, 1.4, 30), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 15), Help: "The cost times of list lrange seek", }) prometheus.MustRegister(gm.LRangeSeekHistogram)
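
Note on the bucket change in PATCH 60/60: prometheus.ExponentialBuckets(start, factor, count) returns count upper bounds growing geometrically from start, so 20 buckets at factor 2 starting at 0.5ms top out around 262s, roughly the same ceiling as the previous 40 buckets at factor 1.4 (about 250s), while halving the number of bucket series each histogram exports. The small standalone Go sketch below is not part of the patch series; it only calls the upstream client_golang API to print both layouts for comparison.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Previous layout: 40 buckets growing by a factor of 1.4 from 0.5ms.
	oldBuckets := prometheus.ExponentialBuckets(0.0005, 1.4, 40)
	// Layout after PATCH 60/60: 20 buckets growing by a factor of 2 from 0.5ms.
	newBuckets := prometheus.ExponentialBuckets(0.0005, 2, 20)

	// Both top boundaries land in the same few-minute range (~250s vs ~262s),
	// so the histograms keep their coverage with half the buckets.
	fmt.Printf("old: %d buckets, top boundary %.1fs\n", len(oldBuckets), oldBuckets[len(oldBuckets)-1])
	fmt.Printf("new: %d buckets, top boundary %.1fs\n", len(newBuckets), newBuckets[len(newBuckets)-1])
}

The same trade-off applies to the limit_cost_seconds, command_func_done_seconds, txn_commit_seconds, and lrange_seek_duration_seconds histograms touched by the patch: a coarser growth factor gives up some per-bucket resolution but reduces the number of exported series, which is the dashboard-load concern the commit message describes.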