Skip to content

Commit

Permalink
(improvement)(Headless) Filtering based on dataSetIds during Mapper detection, compatible with term
Browse files Browse the repository at this point in the history
  • Loading branch information
jolunoluo committed Jun 5, 2024
1 parent b3b9687 commit 606f516
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 46 deletions.
15 changes: 1 addition & 14 deletions common/src/main/java/com/hankcs/hanlp/LoadRemoveService.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;

@Data
@Slf4j
Expand All @@ -19,23 +18,11 @@ public class LoadRemoveService {
@Value("${mapper.remove.nature.prefix:}")
private String mapperRemoveNaturePrefix;

public List removeNatures(List value, Set<Long> detectModelIds) {
public List removeNatures(List value) {
if (CollectionUtils.isEmpty(value)) {
return value;
}
List<String> resultList = new ArrayList<>(value);
if (!CollectionUtils.isEmpty(detectModelIds)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
return false;
}
Long modelId = getDataSetId(nature);
if (Objects.nonNull(modelId)) {
return !detectModelIds.contains(modelId);
}
return false;
});
}
if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import com.hankcs.hanlp.LoadRemoveService;
import com.hankcs.hanlp.corpus.io.ByteArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
Expand All @@ -14,8 +17,6 @@
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public abstract class BaseNode<V> implements Comparable<BaseNode> {
Expand Down Expand Up @@ -286,12 +287,12 @@ public String toString() {
+ '}';
}

public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelIds) {
public void walkNode(Set<Map.Entry<String, V>> entrySet) {
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
logger.debug("detectModelIds:{},before:{}", detectModelIds, value.toString());
List natures = new LoadRemoveService().removeNatures((List) value, detectModelIds);
logger.debug("walkNode before:{}", value.toString());
List natures = new LoadRemoveService().removeNatures((List) value);
String name = this.prefix != null ? this.prefix + c : "" + c;
logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
logger.debug("walkNode name:{},after:{},natures:{}", name, (List) value, natures);
entrySet.add(new TrieEntry(name, (V) natures));
}
}
Expand All @@ -300,21 +301,17 @@ public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelId
* walk limit
* @param sb
* @param entrySet
* @param limit
*/
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Set<Long> detectModelIds) {
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
Queue<BaseNode> queue = new ArrayDeque<>();
this.prefix = sb.toString();
queue.add(this);
while (!queue.isEmpty()) {
if (entrySet.size() >= limit) {
break;
}
BaseNode root = queue.poll();
if (root == null) {
continue;
}
root.walkNode(entrySet, detectModelIds);
root.walkNode(entrySet);
if (root.child == null) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,16 @@ public static List<HanlpMapResult> prefixSearch(String key, int limit, Map<Long,

/**
 * Prefix search over the trie: collects all entries starting with {@code key},
 * maps them to {@link HanlpMapResult}s ordered by name length, then converts
 * model-scoped natures to data-set natures and filters by the detected data sets.
 *
 * @param key                  search prefix (matched case-insensitively by {@code search})
 * @param limit                maximum number of results to return after filtering
 * @param binTrie              dictionary trie to search
 * @param modelIdToDataSetIds  mapping from model id to the data sets it belongs to
 * @param detectDataSetIds     data sets to keep; empty means keep all
 * @return at most {@code limit} results whose natures belong to the detected data sets
 */
public static List<HanlpMapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie,
        Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
    Set<Map.Entry<String, List<String>>> result = search(key, binTrie);
    List<HanlpMapResult> hanlpMapResults = result.stream()
            .map(entry -> {
                // Trie stores spaces as '#'; restore them for the caller-visible name.
                String name = entry.getKey().replace("#", " ");
                return new HanlpMapResult(name, entry.getValue(), key);
            })
            // Ascending name length; comparingInt avoids int-subtraction comparators.
            .sorted(Comparator.comparingInt(r -> r.getName().length()))
            .limit(SEARCH_SIZE)
            .collect(Collectors.toList());
    // Data-set filtering (and the user-supplied limit) are applied after mapping.
    return transformAndFilterByDataSet(hanlpMapResults, modelIdToDataSetIds,
            detectDataSetIds, limit);
}

Expand All @@ -80,11 +74,8 @@ public static List<HanlpMapResult> suffixSearch(String key, int limit, Map<Long,

public static List<HanlpMapResult> suffixSearch(String key, int limit, BinTrie<List<String>> binTrie,
Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {

Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, modelIdToDataSetIds,
detectDataSetIds);

return result.stream().map(
Set<Map.Entry<String, List<String>>> result = search(key, binTrie);
List<HanlpMapResult> hanlpMapResults = result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
List<String> natures = entry.getValue().stream()
Expand All @@ -94,15 +85,34 @@ public static List<HanlpMapResult> suffixSearch(String key, int limit, BinTrie<L
return new HanlpMapResult(name, natures, key);
}
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
return transformAndFilterByDataSet(hanlpMapResults, modelIdToDataSetIds, detectDataSetIds, limit);
}

private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key, int limit,
BinTrie<List<String>> binTrie, Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {

Set<Long> detectModelIds = NatureHelper.getModelIds(modelIdToDataSetIds, detectDataSetIds);
/**
 * Rewrites each result's natures from model scope to data-set scope, drops natures
 * outside {@code detectDataSetIds} (empty set = keep all), removes results left with
 * no natures, and truncates to {@code limit}.
 *
 * <p>Implementation note: the mutation of each result's natures happens in a
 * {@code map} stage rather than {@code peek} — the downstream filter depends on the
 * updated natures, and {@code Stream.peek} is documented as a debugging aid whose
 * action may be elided by the pipeline.
 *
 * @param hanlpMapResults      results to transform in place and filter
 * @param modelIdToDataSetIds  mapping from model id to the data sets it belongs to
 * @param detectDataSetIds     data sets to keep; empty means keep all
 * @param limit                maximum number of results to return
 * @return at most {@code limit} results, each with a non-empty filtered nature list
 */
private static List<HanlpMapResult> transformAndFilterByDataSet(List<HanlpMapResult> hanlpMapResults,
        Map<Long, List<Long>> modelIdToDataSetIds,
        Set<Long> detectDataSetIds, int limit) {
    return hanlpMapResults.stream()
            .map(hanlpMapResult -> {
                List<String> natures = hanlpMapResult.getNatures().stream()
                        .map(nature -> NatureHelper.changeModel2DataSet(nature, modelIdToDataSetIds))
                        .flatMap(Collection::stream)
                        .filter(nature -> belongsToDetectedDataSet(nature, detectDataSetIds))
                        .collect(Collectors.toList());
                hanlpMapResult.setNatures(natures);
                return hanlpMapResult;
            })
            .filter(hanlpMapResult -> !CollectionUtils.isEmpty(hanlpMapResult.getNatures()))
            .limit(limit)
            .collect(Collectors.toList());
}

/**
 * True when the nature's data-set id is in {@code detectDataSetIds}, or when
 * no data-set filter is active (empty/null set). Natures without a resolvable
 * data-set id are rejected once a filter is active.
 */
private static boolean belongsToDetectedDataSet(String nature, Set<Long> detectDataSetIds) {
    if (CollectionUtils.isEmpty(detectDataSetIds)) {
        return true;
    }
    Long dataSetId = NatureHelper.getDataSetId(nature);
    return dataSetId != null && detectDataSetIds.contains(dataSetId);
}

private static Set<Map.Entry<String, List<String>>> search(String key,
BinTrie<List<String>> binTrie) {
key = key.toLowerCase();
Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<Map.Entry<String, List<String>>>();

Expand All @@ -122,7 +132,7 @@ private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key
if (branch == null) {
return entrySet;
}
branch.walkLimit(sb, entrySet, limit, detectModelIds);
branch.walkLimit(sb, entrySet);
return entrySet;
}

Expand Down

0 comments on commit 606f516

Please sign in to comment.