Skip to content

Commit

Permalink
优化相似度判断, 并添加了distance/sim字段用来获取相似度距离.
Browse files Browse the repository at this point in the history
优化fuzzybaseline的逻辑, 移动到处理线程中.
优化expr的性能
修复--fuzzy没启用也会生效的bug
  • Loading branch information
M09Ic committed Jan 9, 2023
1 parent 797ac74 commit a94f9e3
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 11 deletions.
12 changes: 7 additions & 5 deletions internal/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ var (
maxRedirect = 3
maxCrawl = 3
maxRecursion = 0
nilBaseline = &pkg.Baseline{}
)

func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
Expand Down Expand Up @@ -54,6 +55,10 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
// 挂起一个异步的处理结果线程, 不干扰主线程的请求并发
go func() {
for bl := range pool.tempCh {
if bl.IsValid {
pool.addFuzzyBaseline(bl)
}

if _, ok := pool.Statistor.Counts[bl.Status]; ok {
pool.Statistor.Counts[bl.Status]++
} else {
Expand All @@ -71,7 +76,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
if bl, ok := pool.baselines[status]; ok {
params["bl"+strconv.Itoa(status)] = bl
} else {
params["bl"+strconv.Itoa(status)] = &pkg.Baseline{}
params["bl"+strconv.Itoa(status)] = nilBaseline
}
}
}
Expand Down Expand Up @@ -257,7 +262,7 @@ Loop:
if pool.Mod == pkg.HostSpray {
pool.reqPool.Invoke(newUnit(pkg.RandHost(), source))
} else if pool.Mod == pkg.PathSpray {
pool.reqPool.Invoke(newUnit(pkg.RandPath(), source))
pool.reqPool.Invoke(newUnit(safePath(pool.BaseURL, pkg.RandPath()), source))
}
case unit, ok := <-pool.additionCh:
if !ok {
Expand Down Expand Up @@ -323,7 +328,6 @@ func (pool *Pool) Invoke(v interface{}) {
pool.wg.Add(1)
pool.doRedirect(bl, unit.depth)
}
pool.addFuzzyBaseline(bl)
} else {
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
}
Expand Down Expand Up @@ -643,11 +647,9 @@ func (pool *Pool) addAddition(u *Unit) {
// addFuzzyBaseline records bl as the fuzzy baseline for its status code.
//
// It acts only when pool.baselines has no entry for bl.Status yet and
// bl.Status is contained in FuzzyStatus; otherwise it does nothing. For an
// accepted baseline it calls bl.Collect(), adds one to pool.wg and invokes
// pool.doCrawl(bl), stores bl in pool.baselines, and logs a summary line
// built from the status, length, spend, title, frame and redirect fields.
//
// NOTE(review): the existence check on pool.baselines runs before pool.locker
// is taken, so only the crawl dispatch and the map write are guarded here.
// Confirm whether callers can run this concurrently.
func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
if _, ok := pool.baselines[bl.Status]; !ok && IntsContains(FuzzyStatus, bl.Status) {
bl.Collect()
pool.locker.Lock()
// The wait-group count is raised before doCrawl is called; presumably
// doCrawl (or the work it schedules) calls pool.wg.Done() — verify.
pool.wg.Add(1)
pool.doCrawl(bl)
pool.baselines[bl.Status] = bl
pool.locker.Unlock()
logs.Log.Infof("[baseline.%dinit] %s", bl.Status, bl.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
}
}
Expand Down
5 changes: 3 additions & 2 deletions internal/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,6 @@ func (r *Runner) Outputting() {
} else {
logs.Log.Debug(bl.String())
}

}
}
}
Expand Down Expand Up @@ -410,7 +409,9 @@ func (r *Runner) Outputting() {
if !ok {
return
}
fuzzySaveFunc(bl)
if r.Fuzzy {
fuzzySaveFunc(bl)
}
}
}
}()
Expand Down
14 changes: 12 additions & 2 deletions pkg/baseline.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ type Baseline struct {
IsFuzzy bool `json:"fuzzy"`
Source int `json:"source"`
ReqDepth int `json:"depth"`
Distance uint8 `json:"distance"`
Recu bool `json:"-"`
RecuDepth int `json:"-"`
URLs []string `json:"-"`
Expand Down Expand Up @@ -233,10 +234,11 @@ func (bl *Baseline) Compare(other *Baseline) int {
return -1
}

var Distance uint8 = 5
var Distance uint8 = 5 // 数字越小越相似, 数字为0则为完全一致.

func (bl *Baseline) FuzzyCompare(other *Baseline) bool {
if parsers.SimhashCompare(other.BodySimhash, bl.BodySimhash) < Distance {
// 这里使用rawsimhash, 是为了保证一定数量的字符串, 否则超短的body会导致simhash偏差值较大
if other.Distance = parsers.SimhashCompare(other.RawSimhash, bl.RawSimhash); other.Distance < Distance {
return true
}
return false
Expand Down Expand Up @@ -278,6 +280,8 @@ func (bl *Baseline) Get(key string) string {
return strconv.Itoa(int(bl.Spended)) + "ms"
case "length":
return strconv.Itoa(bl.BodyLength)
case "sim", "distance":
return "sim:" + strconv.Itoa(int(bl.Distance))
case "source":
return GetSourceName(bl.Source)
case "extract":
Expand Down Expand Up @@ -366,6 +370,9 @@ func (bl *Baseline) ColorString() string {
line.WriteString(logs.YellowBold(strconv.Itoa(int(bl.Spended)) + "ms"))
line.WriteString(logs.YellowBold(" - " + GetSourceName(bl.Source)))
line.WriteString(logs.GreenLine(bl.Additional("title")))
if bl.Distance != 0 {
line.WriteString(logs.GreenLine(bl.Additional("sim")))
}
line.WriteString(logs.Cyan(bl.Frameworks.String()))
line.WriteString(logs.Cyan(bl.Extracteds.String()))
if bl.RedirectURL != "" {
Expand Down Expand Up @@ -416,6 +423,9 @@ func (bl *Baseline) String() string {
line.WriteString(" - ")
line.WriteString(strconv.Itoa(int(bl.Spended)) + "ms")
line.WriteString(bl.Additional("title"))
if bl.Distance != 0 {
line.WriteString(logs.GreenLine(bl.Additional("sim")))
}
line.WriteString(bl.Frameworks.String())
line.WriteString(bl.Extracteds.String())
if bl.RedirectURL != "" {
Expand Down
3 changes: 1 addition & 2 deletions pkg/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,8 @@ const (
func RandPath() string {
n := 16
b := make([]byte, n)
b[0] = byte(0x2f)
// A src.Int63() call generates 63 random bits, enough for letterIdMax letters!
for i, cache, remain := n-1, src.Int63(), letterIdMax; i >= 1; {
for i, cache, remain := n-1, src.Int63(), letterIdMax; i >= 0; {
if remain == 0 {
cache, remain = src.Int63(), letterIdMax
}
Expand Down

0 comments on commit a94f9e3

Please sign in to comment.