Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

多路召回融合排序 #23

Merged
merged 7 commits into from
Jan 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/main/java/com/search/docsearch/constant/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ public class Constants {

public static final String HTTPS_PREFIX = "https://";


/**
* Max score used to normalize the result
*/
Expand All @@ -31,4 +30,34 @@ public class Constants {
* Min score used to normalize the result
*/
public static final int MIN_SCORE = -1;

/**
 * Start index for a Google search.
 * NOTE(review): presumably 1-based to match the Google paging parameter — usage is not visible here, confirm.
 */
public static final int GOOGLE_START = 1;

/**
 * Number of results used for a Google search request.
 */
public static final int GOOGLE_NUM = 10;

/**
 * Start offset for an ES search.
 * NOTE(review): usage is not visible here — confirm against the caller.
 */
public static final int ES_START = 0;

/**
 * Number of results for an ES search.
 * NOTE(review): usage is not visible here — confirm against the caller.
 */
public static final int ES_NUM = 100;

/**
 * Fallback score assigned to an entry when score normalization fails.
 */
public static final double CONSTANT_SCORE = 1.0;

/**
 * Fixed bonus added to the fused score of Google recall entries to boost them.
 */
public static final double MAGIC_SCORE = 0.3;
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,9 @@ public class SearchCondition {
private List<Map<String, String>> limit;
@Size(max = 30)
private List<Map<String, String>> filter;

/**
 * Requested sort mode. When the value is "desc" the search request is sorted
 * by the "date" field in descending order; any other value adds no explicit
 * sort to the request.
 */
private String sort;
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,31 @@
*/
package com.search.docsearch.multirecall.composite;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import lombok.Data;

import com.search.docsearch.constant.Constants;
import com.search.docsearch.properties.FusionSortProperties;
import com.search.docsearch.utils.MergeUtil;


@Data
public class DataComposite implements Component {

/**
 * Logger for this class. It is bound to DataComposite itself so that log
 * output is attributed to the class that actually emits it.
 */
private static final Logger LOGGER = LoggerFactory.getLogger(DataComposite.class);

/**
* insert fusion sort properties
*/
private FusionSortProperties fuProperties;

/**
* Recall results list.
Expand Down Expand Up @@ -99,7 +108,8 @@ public int getSize(){
public Map<String, Object> mergeResult(){
Map<String, Object> baseRes = this.getChild(0).getResList();
int pageSize = (int) baseRes.get("pageSize");
List<Map<String, Object>> mergedList = weightedMerge(pageSize);
int page = (int) baseRes.get("page");
List<Map<String, Object>> mergedList = weightedMerge(page, pageSize);
baseRes.put("records", mergedList);
return baseRes;
}
Expand Down Expand Up @@ -136,9 +146,10 @@ private Map<String, Object> postionMerge(){
*
* @return the merged result lists
*/
public List<Map<String, Object>> weightedMerge(int pageSize){
public List<Map<String, Object>> weightedMerge(int page, int pageSize){
List<Map<String, Object>> mergeList = new ArrayList<>();

Map<String, Object> hashMap = new HashMap();
for (Component recall : this.children){
double minScore = Constants.MAX_SCORE;
double maxScore = Constants.MIN_SCORE;
Expand All @@ -152,14 +163,40 @@ public List<Map<String, Object>> weightedMerge(int pageSize){
// do norm
for (Map<String, Object> entity : rcords) {
double score = (double) entity.get("score");
double normedScore = MergeUtil.normalize(score, minScore, maxScore);
entity.put("score", normedScore);
try {
double normedScore = MergeUtil.normalize(score, minScore, maxScore);
entity.put("score", normedScore);
} catch (IllegalArgumentException e) {
LOGGER.error("failed normalize score, google recall 1 resuslts");
entity.put("score", Constants.CONSTANT_SCORE);
}

mergeList.add(entity);
}
// do fuse
for(Map<String, Object> entity : mergeList) {
double score = (double) entity.get("score");
double initScore = 0;
if (hashMap.containsKey(entity.get("path"))) {
Map<String, Object> preMap = (Map<String, Object>) hashMap.get(entity.get("path"));
initScore = (double) preMap.get("score");
}
if ("E".equals(entity.get("recallType"))) {
entity.put("score", initScore + score * (double)fuProperties.getEsRecallWeight());
} else {
entity.put("score", initScore + score * (double)fuProperties.getGRecallWeight() + Constants.MAGIC_SCORE);
}
hashMap.put((String) entity.get("path"), entity);
}
}
List<Map<String, Object>> resList = new ArrayList<>();
for (Map.Entry<String, Object> entry : hashMap.entrySet()) {
Map<String, Object> value = (Map<String, Object>) entry.getValue();
resList.add(value);
}

mergeList = mergeList.stream().sorted((a, b) -> Double.compare((Double) b.get("score"), (Double) a.get("score"))).collect(Collectors.toList());
return mergeList.subList(0, Math.min(pageSize, mergeList.size()));
}
resList = resList.stream().sorted((a, b) -> Double.compare((Double) b.get("score"), (Double) a.get("score"))).collect(Collectors.toList());

return resList.subList(0 , Math.min(pageSize, resList.size()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,20 @@
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.elasticsearch.search.sort.SortOrder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;
import org.springframework.web.util.HtmlUtils;
import com.search.docsearch.utils.Trie;
import com.search.docsearch.config.EsfunctionScoreConfig;
import com.search.docsearch.constant.Constants;
import com.search.docsearch.entity.vo.SearchCondition;
import com.search.docsearch.except.ServiceImplException;
import com.search.docsearch.multirecall.composite.Component;
import com.search.docsearch.multirecall.composite.cdata.EsRecallData;
import com.search.docsearch.multirecall.recall.SearchStrategy;
import com.search.docsearch.properties.FusionSortProperties;
import com.search.docsearch.utils.General;
import org.elasticsearch.client.RestHighLevelClient;

Expand Down Expand Up @@ -79,6 +82,11 @@ public class EsSearchStrategy implements SearchStrategy {
*/
private EsfunctionScoreConfig esfunctionScoreConfig;

/**
* insert fusion sort properties
*/
private FusionSortProperties fuProperties;

/**
* roughly filter the recalled results
*
Expand All @@ -87,11 +95,12 @@ public class EsSearchStrategy implements SearchStrategy {
* @param paratire the algorithm toolkit
* @param config the boost score config used for ranking the result list
*/
public EsSearchStrategy(RestHighLevelClient pararestHighLevelClient, String paraindex, Trie paratire,EsfunctionScoreConfig config){
public EsSearchStrategy(RestHighLevelClient pararestHighLevelClient, String paraindex, Trie paratire,EsfunctionScoreConfig config, FusionSortProperties fuProperties){
this.restHighLevelClient = pararestHighLevelClient;
this.index = paraindex;
this.trie = paratire;
this.esfunctionScoreConfig = config;
this.fuProperties = fuProperties;
}

/**
Expand Down Expand Up @@ -167,7 +176,8 @@ private Component searchByCondition(SearchCondition condition) throws ServiceImp
if (highlightFields.containsKey("title")) {
map.put("title", highlightFields.get("title").getFragments()[0].toString());
}

reCaculateScore(map);
map.put("recallType", "E");
data.add(map);
}
if (data.isEmpty()) {
Expand All @@ -185,6 +195,28 @@ private Component searchByCondition(SearchCondition condition) throws ServiceImp
return resData;
}

/**
* caculate the es recall data by using the date
*
* @param entity the map entity of search result
*/
public void reCaculateScore(Map<String, Object> entity) {
double score = (double) entity.get("score");
try {
if (entity.containsKey("date")) {
String[] parts = entity.get("date").toString().split("-");
int year = Integer.parseInt(parts[0]);
int month = Integer.parseInt(parts[1]);
int day = Integer.parseInt(parts[2]);
List<Double> dateWeight = fuProperties.getDateWeight();
score += (year * dateWeight.get(0) + month * dateWeight.get(1) + day * dateWeight.get(2));
Comment on lines +208 to +212
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try catch异常,避免split(-) 出现错误,导致下标越界

Comment on lines +208 to +212
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

出错后设置默认分数

entity.put("score", score);
}
} catch (Exception e) {
LOGGER.error("es recall score caculate error: {}", e.getMessage());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

服务器中不打印具体错误信息

}
}

/**
* build the es query from search condition
*
Expand Down Expand Up @@ -290,6 +322,9 @@ private SearchRequest BuildSearchRequest(SearchCondition condition, String index
sourceBuilder.highlighter(highlightBuilder);
sourceBuilder.from(startIndex).size(condition.getPageSize());
sourceBuilder.timeout(TimeValue.timeValueMinutes(1L));
if ("desc".equals(condition.getSort())) {
sourceBuilder.sort("date", SortOrder.DESC);
}
request.source(sourceBuilder);
return request;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.search.docsearch.constant.Constants;
import com.search.docsearch.entity.vo.GoogleSearchParams;
import com.search.docsearch.entity.vo.SearchCondition;
import com.search.docsearch.except.ServiceImplException;
Expand Down Expand Up @@ -94,6 +95,9 @@ public Component search(SearchCondition condition) {
* @throws IOException
*/
private Component searchByCondition(SearchCondition condition) throws ServiceImplException, IOException {
if (!"".equals(condition.getType())) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

添加注释

return null;
}
// google search 处理无效字符
condition.setKeyword(condition.getKeyword().replace(" ", ""));
condition.setKeyword(condition.getKeyword().replace(".", ""));
Expand All @@ -103,7 +107,7 @@ private Component searchByCondition(SearchCondition condition) throws ServiceImp
googleSearchParams.setLr("lang_en");
}
int start = (condition.getPage() - 1) * condition.getPageSize() + 1;
int num = Math.min(10, condition.getPageSize());
int num = Constants.GOOGLE_NUM;
if(start + num > 100) {
return null;
} else {
Expand Down Expand Up @@ -154,6 +158,7 @@ private Component searchByCondition(SearchCondition condition) throws ServiceImp
map.put("lang", "zh");
}
map.put("score", (double) (5000 - (count + start) * 50));
map.put("recallType","G");
count++;
data.add(map);
}
Expand Down Expand Up @@ -185,12 +190,12 @@ public String highLightContent(String searchkey, String content){
List<String> segments = this.segmenter.sentenceProcess(searchkey);
String lightContent = content;
for (String keyword : segments){
Pattern pattern = Pattern.compile(Pattern.quote(keyword));
Pattern pattern = Pattern.compile(Pattern.quote(keyword), Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(lightContent);
StringBuffer result = new StringBuffer();
while (matcher.find()) {
matcher.appendReplacement(result, "<span>" + matcher.group() + "</span>");
}
}
matcher.appendTail(result);
lightContent = result.toString();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* Copyright (c) 2024 openEuler Community
EasySoftware is licensed under the Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
*/
package com.search.docsearch.properties;

import java.util.List;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

import lombok.Getter;
import lombok.Setter;

@Getter
@Setter
@Component
@ConfigurationProperties(prefix = "fusion-sort")
public class FusionSortProperties {

    /**
     * Weight multiplied with the score of ES recall entries during fusion.
     */
    private double esRecallWeight;

    /**
     * Weight multiplied with the score of Google recall entries during fusion.
     */
    private double gRecallWeight;

    /**
     * Date weights used when re-scoring ES entries: index 0 scales the year,
     * index 1 the month and index 2 the day.
     */
    private List<Double> dateWeight;
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import com.search.docsearch.multirecall.recall.MultiSearchContext;
import com.search.docsearch.multirecall.recall.cstrategy.EsSearchStrategy;
import com.search.docsearch.multirecall.recall.cstrategy.GSearchStrategy;
import com.search.docsearch.properties.FusionSortProperties;
import com.search.docsearch.properties.GoogleSearchProperties;
import com.search.docsearch.service.SearchService;
import com.search.docsearch.utils.General;
Expand Down Expand Up @@ -124,6 +125,12 @@ public class SearchServiceImpl implements SearchService {
*/
@Autowired
private HttpConnectFactory httpConnectFactory;

/**
* insert fusion sort properties
*/
@Autowired
private FusionSortProperties fuProperties;

@Autowired
private EsfunctionScoreConfig esfunctionScoreConfig;
Expand Down Expand Up @@ -222,14 +229,18 @@ public Map<String, Object> getSuggestion(String keyword, String lang) throws Ser
@Override
public Map<String, Object> searchByCondition(SearchCondition condition) throws ServiceImplException {
//create es search strategy
EsSearchStrategy esRecall = new EsSearchStrategy(restHighLevelClient,mySystem.index,trie,esfunctionScoreConfig);
EsSearchStrategy esRecall = new EsSearchStrategy(restHighLevelClient,mySystem.index,trie,esfunctionScoreConfig,fuProperties);
GSearchStrategy gRecall = new GSearchStrategy(gProperties, httpConnectFactory);
MultiSearchContext multirecall = new MultiSearchContext();
//set es search into search context
multirecall.setSearchStrategy(esRecall);
multirecall.setSearchStrategy(gRecall);
//do recall and fetch the result
DataComposite multiRecallRes = multirecall.executeMultiSearch(condition);
if ("desc".equals(condition.getSort())) {
return multiRecallRes.getChild(0).getResList();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

注意下标

}
multiRecallRes.setFuProperties(fuProperties);
// multiRecallRes.filter("policy") filtering data here
return multiRecallRes.mergeResult();
//return multiRecallRes.getChild(1).getResList();
Expand Down Expand Up @@ -334,6 +345,9 @@ public SearchRequest BuildSearchRequest(SearchCondition condition, String index)
sourceBuilder.highlighter(highlightBuilder);
sourceBuilder.from(startIndex).size(condition.getPageSize());
sourceBuilder.timeout(TimeValue.timeValueMinutes(1L));
if ("desc".equals(condition.getSort())) {
sourceBuilder.sort("date", SortOrder.DESC);
}
request.source(sourceBuilder);
return request;
}
Expand Down
Loading