Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OR-1972 Duplicates #552

Merged
merged 12 commits into from
Nov 17, 2023
1 change: 1 addition & 0 deletions AssociationRegistry.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=dulste/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=einddatum/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Feitelijke/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=foezie/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Geregistreerd/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=hoofdactivititeit/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=hoofdactivititeiten/@EntryIndexedValue">True</s:Boolean>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
namespace AssociationRegistry.Admin.Api.DuplicateDetection;

using DuplicateVerenigingDetection;
using Marten;
using Schema.Constants;
using Schema.Detail;
using Nest;
using Schema.Search;
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using System.Linq.Expressions;
using System.Threading.Tasks;
using Vereniging;

public class SearchDuplicateVerenigingDetectionService : IDuplicateVerenigingDetectionService
{
private readonly IQuerySession _session;
private readonly IElasticClient _client;

public SearchDuplicateVerenigingDetectionService(IQuerySession session)
public SearchDuplicateVerenigingDetectionService(IElasticClient client)
{
_session = session;
_client = client;
}

public async Task<IReadOnlyCollection<DuplicaatVereniging>> GetDuplicates(VerenigingsNaam naam, Locatie[] locaties)
Expand All @@ -26,39 +26,87 @@ public async Task<IReadOnlyCollection<DuplicaatVereniging>> GetDuplicates(Vereni
var postcodes = locatiesMetAdres.Select(l => l.Adres!.Postcode).ToArray();
var gemeentes = locatiesMetAdres.Select(l => l.Adres!.Gemeente).ToArray();

return (await _session.Query<BeheerVerenigingDetailDocument>()
.Where(
document =>
document.Status.Equals(VerenigingStatus.Actief) &&
document.Naam.Equals(naam, StringComparison.InvariantCultureIgnoreCase) &&
document.Locaties.Any(
locatie =>
locatie.Adres != null && (
locatie.Adres.Postcode.IsOneOf(postcodes) ||
locatie.Adres.Gemeente.IsOneOf(gemeentes))
)
)
.ToListAsync())
.Select(ToDuplicateVereniging)
.ToArray();
var searchResponse =
await _client
.SearchAsync<DuplicateDetectionDocument>(
s => s.Query(
q => q.Bool(
b => b.Must(must => must.Match(m => FuzzyMatchOpNaam(m, f => f.Naam, naam)))
.Filter(f => f.Bool(
fb => fb.Should(MatchGemeente(gemeentes),
MatchPostcode(postcodes))
.MinimumShouldMatch(1))))));

return searchResponse.Documents.Select(ToDuplicateVereniging)
.ToArray();
}

/// <summary>
/// Builds a nested query over <c>Locaties</c> that uses a terms query on the
/// <c>Postcode</c> field of the nested locations.
/// </summary>
/// <param name="postcodes">The postal codes handed to the terms query.</param>
/// <returns>A query factory suitable for use as a clause of a bool query.</returns>
private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchPostcode(string[] postcodes)
    => descriptor => descriptor.Nested(
        nested => nested
            .Path(document => document.Locaties)
            .Query(locatieQuery => locatieQuery.Terms(
                terms => terms
                    // First() only expresses the field path into the collection.
                    .Field(document => document.Locaties.First().Postcode)
                    .Terms(postcodes))));

/// <summary>
/// Builds a nested query over <c>Locaties</c> that fuzzy-matches the <c>Gemeente</c>
/// field of the nested locations, using the same match settings as
/// <see cref="FuzzyMatchOpNaam"/>.
/// </summary>
/// <param name="gemeentes">
/// The municipality names to look for. They are joined with a single space into one
/// query string, so the match settings apply to the combined text.
/// </param>
/// <returns>A query factory suitable for use as a clause of a bool query.</returns>
private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchGemeente(string[] gemeentes)
    => descriptor => descriptor.Nested(
        nested => nested
            .Path(document => document.Locaties)
            .Query(locatieQuery => locatieQuery.Match(
                match => FuzzyMatchOpNaam(
                    match,
                    // First() only expresses the field path into the collection.
                    document => document.Locaties.First().Gemeente,
                    string.Join(" ", gemeentes)))));

/// <summary>
/// Configures a match query on the given field with the duplicate-detection analyzer,
/// automatic fuzziness and a minimum-should-match of 90%.
/// </summary>
/// <param name="m">The match query descriptor to configure.</param>
/// <param name="path">Expression selecting the document field to match against.</param>
/// <param name="query">The text to search for.</param>
/// <returns>The configured descriptor, so the call can be used inside a query lambda.</returns>
private static MatchQueryDescriptor<DuplicateDetectionDocument> FuzzyMatchOpNaam(
    MatchQueryDescriptor<DuplicateDetectionDocument> m,
    Expression<Func<DuplicateDetectionDocument, string>> path,
    string query)
    => m.Field(path)
        .Query(query)
        // Assumes this analyzer applies lowercase and asciifolding
        .Analyzer(DuplicateDetectionDocumentMapping.DuplicateAnalyzer)
        .Fuzziness(Fuzziness.Auto)
        .MinimumShouldMatch("90%");

private static DuplicaatVereniging ToDuplicateVereniging(BeheerVerenigingDetailDocument document)
private static DuplicaatVereniging ToDuplicateVereniging(DuplicateDetectionDocument document)
=> new(
document.VCode,
new DuplicaatVereniging.VerenigingsType(document.Type.Code, document.Type.Beschrijving),
new DuplicaatVereniging.VerenigingsType(document.VerenigingsTypeCode,
Verenigingstype.Parse(document.VerenigingsTypeCode).Beschrijving),
document.Naam,
document.KorteNaam ?? string.Empty,
document.HoofdactiviteitenVerenigingsloket
.Select(h => new DuplicaatVereniging.HoofdactiviteitVerenigingsloket(h.Code, h.Beschrijving)).ToImmutableArray(),
document.KorteNaam,
document.HoofdactiviteitVerenigingsloket
.Select(h => new DuplicaatVereniging.HoofdactiviteitVerenigingsloket(
h, HoofdactiviteitVerenigingsloket.Create(h).Beschrijving)).ToImmutableArray(),
document.Locaties.Select(ToLocatie).ToImmutableArray());

private static DuplicaatVereniging.Locatie ToLocatie(BeheerVerenigingDetailDocument.Locatie loc)
private static DuplicaatVereniging.Locatie ToLocatie(DuplicateDetectionDocument.Locatie loc)
=> new(
loc.Locatietype,
loc.IsPrimair,
loc.Adresvoorstelling,
loc.Naam,
loc.Adres?.Postcode ?? string.Empty,
loc.Adres?.Gemeente ?? string.Empty);
loc.Postcode ?? string.Empty,
loc.Gemeente ?? string.Empty);
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ public class ElasticSearchOptionsSection
public string? Username { get; set; }
public string? Password { get; set; }
public IndicesOptionsSection? Indices { get; set; }
public bool EnableDevelopmentLogs { get; set; }

/// <summary>
/// Configuration values for the Elasticsearch indices, one property per index.
/// </summary>
public class IndicesOptionsSection
{
    /// <summary>Index name used for the vereniging search documents.</summary>
    public string? Verenigingen { get; set; }

    /// <summary>Index name used for the duplicate-detection documents.</summary>
    public string? DuplicateDetection { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -1,42 +1,70 @@
namespace AssociationRegistry.Admin.Api.Infrastructure.Extensions;

using System;
using ConfigurationBindings;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Nest;
using Schema.Search;
using System;
using System.Text;

public static class ElasticSearchExtensions
{
public static IServiceCollection AddElasticSearch(
this IServiceCollection services,
ElasticSearchOptionsSection elasticSearchOptions)
{
var elasticClient = CreateElasticClient(elasticSearchOptions);
var elasticClient = (IServiceProvider serviceProvider)
=> CreateElasticClient(elasticSearchOptions, serviceProvider.GetRequiredService<ILogger<ElasticClient>>());

services.AddSingleton(_ => elasticClient);
services.AddSingleton<IElasticClient>(_ => elasticClient);
services.AddSingleton(sp => elasticClient(sp));
services.AddSingleton<IElasticClient>(serviceProvider => serviceProvider.GetRequiredService<ElasticClient>());

return services;
}

private static ElasticClient CreateElasticClient(ElasticSearchOptionsSection elasticSearchOptions)
private static ElasticClient CreateElasticClient(ElasticSearchOptionsSection elasticSearchOptions, ILogger logger)
{
var settings = new ConnectionSettings(new Uri(elasticSearchOptions.Uri!))
.BasicAuthentication(
elasticSearchOptions.Username,
elasticSearchOptions.Password)
.MapVerenigingDocument(elasticSearchOptions.Indices!.Verenigingen!);
.BasicAuthentication(
elasticSearchOptions.Username,
elasticSearchOptions.Password)
.MapVerenigingDocument(elasticSearchOptions.Indices!.Verenigingen!)
.MapDuplicateDetectionDocument(elasticSearchOptions.Indices!.DuplicateDetection!);

if (elasticSearchOptions.EnableDevelopmentLogs)
settings = settings.DisableDirectStreaming()
.PrettyJson()
.OnRequestCompleted(apiCallDetails =>
{
if (apiCallDetails.RequestBodyInBytes != null)
logger.LogDebug(
message: "{HttpMethod} {Uri} \n {RequestBody}",
apiCallDetails.HttpMethod,
apiCallDetails.Uri,
Encoding.UTF8.GetString(apiCallDetails.RequestBodyInBytes));

if (apiCallDetails.ResponseBodyInBytes != null)
logger.LogDebug(message: "Response: {ResponseBody}",
Encoding.UTF8.GetString(apiCallDetails.ResponseBodyInBytes));
});

var elasticClient = new ElasticClient(settings);
return elasticClient;
return new ElasticClient(settings);
}

public static ConnectionSettings MapVerenigingDocument(this ConnectionSettings settings, string indexName)
{
return settings.DefaultMappingFor(
typeof(VerenigingZoekDocument),
descriptor => descriptor.IndexName(indexName)
.IdProperty(nameof(VerenigingZoekDocument.VCode)));
selector: descriptor => descriptor.IndexName(indexName)
.IdProperty(nameof(VerenigingZoekDocument.VCode)));
}

/// <summary>
/// Registers the default mapping for <see cref="DuplicateDetectionDocument"/>: the
/// index it lives in and the property (<c>VCode</c>) used as the document id.
/// </summary>
/// <param name="settings">The connection settings to extend.</param>
/// <param name="indexName">Name of the index holding the duplicate-detection documents.</param>
/// <returns>The connection settings, to allow further chaining.</returns>
public static ConnectionSettings MapDuplicateDetectionDocument(this ConnectionSettings settings, string indexName)
    => settings.DefaultMappingFor(
        typeof(DuplicateDetectionDocument),
        mapping => mapping
            .IndexName(indexName)
            .IdProperty(nameof(DuplicateDetectionDocument.VCode)));
}
1 change: 1 addition & 0 deletions src/AssociationRegistry.Admin.Api/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ namespace AssociationRegistry.Admin.Api;
using Infrastructure.Json;
using Infrastructure.Middleware;
using JasperFx.CodeGeneration;
using JasperFx.Core;
using Kbo;
using Lamar.Microsoft.DependencyInjection;
using Magda;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ public class RegistreerFeitelijkeVerenigingRequest
/// <summary>Naam van de vereniging</summary>
[DataMember]
[Required]
public string Naam { get; init; } = null!;
public string Naam { get; set; } = null!;

/// <summary>Korte naam van de vereniging</summary>
[DataMember]
public string? KorteNaam { get; init; }
public string? KorteNaam { get; set; }

/// <summary>Korte beschrijving van de vereniging</summary>
[DataMember]
public string? KorteBeschrijving { get; init; }
public string? KorteBeschrijving { get; set; }

/// <summary>Datum waarop de vereniging gestart is. Deze datum mag niet later zijn dan vandaag</summary>
[DataMember]
public DateOnly? Startdatum { get; init; }
public DateOnly? Startdatum { get; set; }

/// <summary>
/// De doelgroep waar de activiteiten van deze vereniging zich op concentreert
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
namespace AssociationRegistry.Admin.Api.Verenigingen.Registreer;

using DuplicateVerenigingDetection;
using Infrastructure.ConfigurationBindings;
using System;
using System.Collections.Immutable;
using System.Linq;
using System.Runtime.Serialization;
using DuplicateVerenigingDetection;
using Infrastructure.ConfigurationBindings;

[DataContract]
public class PotentialDuplicatesResponse
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
"Username": "elastic",
"Password": "local_development",
"Indices": {
"Verenigingen": "verenigingsregister-verenigingen-admin"
"Verenigingen": "verenigingsregister-verenigingen-admin",
"DuplicateDetection": "verenigingsregister-duplicate-detection"
}
},
"PostgreSQLOptions": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ public class ElasticSearchOptionsSection
/// <summary>
/// Configuration values for the Elasticsearch indices, one property per index.
/// </summary>
public class IndicesOptionsSection
{
    /// <summary>Presumably the index name for the vereniging search documents.</summary>
    public string? Verenigingen { get; set; }

    /// <summary>Presumably the index name for the duplicate-detection documents.</summary>
    public string? DuplicateDetection { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,37 @@
namespace AssociationRegistry.Admin.ProjectionHost.Infrastructure.Extensions;

using Schema.Search;
using Nest;
using Nest.Specification.IndicesApi;
using Schema.Search;

public static class ElasticClientExtensions
{
public static void CreateVerenigingIndex(this IndicesNamespace indicesNamespace, IndexName index)
=> indicesNamespace.Create(
index,
descriptor =>
selector: descriptor =>
descriptor.Map<VerenigingZoekDocument>(VerenigingZoekDocumentMapping.Get));

/// <summary>
/// Name under which the Dutch stop-word token filter is registered in the index
/// settings. The custom analyzer refers to the filter by this exact name, so both
/// places share this constant.
/// </summary>
private const string DutchStopFilterName = "dutch_stop";

/// <summary>
/// Creates the duplicate-detection index with its custom analyzer, the Dutch
/// stop-word token filter, and the mapping for <see cref="DuplicateDetectionDocument"/>.
/// </summary>
/// <param name="indicesNamespace">The indices API of the Elasticsearch client.</param>
/// <param name="index">Name of the index to create.</param>
public static void CreateDuplicateDetectionIndex(this IndicesNamespace indicesNamespace, IndexName index)
    => indicesNamespace.Create(
        index,
        selector: c => c
            .Settings(s => s
                .Analysis(a => a
                    .Analyzers(AddDuplicateDetectionAnalyzer)
                    .TokenFilters(AddDutchStopWordsFilter)))
            .Map<DuplicateDetectionDocument>(DuplicateDetectionDocumentMapping.Get));

/// <summary>
/// Registers a stop token filter, named <see cref="DutchStopFilterName"/>, that uses
/// the <c>_dutch_</c> stop-word list.
/// </summary>
private static TokenFiltersDescriptor AddDutchStopWordsFilter(TokenFiltersDescriptor tf)
    => tf.Stop(name: DutchStopFilterName, selector: st => st
        .StopWords("_dutch_") // Or provide your custom list
    );

/// <summary>
/// Registers the custom duplicate-detection analyzer: standard tokenizer followed by
/// the lowercase, asciifolding and Dutch stop-word filters, in that order.
/// </summary>
private static AnalyzersDescriptor AddDuplicateDetectionAnalyzer(AnalyzersDescriptor ad)
    => ad.Custom(DuplicateDetectionDocumentMapping.DuplicateAnalyzer,
        selector: ca
            => ca
                .Tokenizer("standard")
                // The last entry must be the name the stop filter was registered under.
                .Filters("lowercase", "asciifolding", DutchStopFilterName)
    );
}
Loading
Loading