Skip to content

Commit

Permalink
Provide a new plugin for indexing repeating subfields
Browse files Browse the repository at this point in the history
The `scheming_subfields_index` plugin will group the values of the
same subfields in a text field that will make the values findable.
They are indexed as `extras_{field_name}__{key}`. `extras_*` is a
dynamic `text` Solr field that will allow free-text search on these values.

Added tests and updated the docs.
  • Loading branch information
amercader committed Jun 13, 2024
1 parent 604fb67 commit 156033f
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 5 deletions.
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,13 @@ When using a plain string translations will be provided with gettext:
This field is the parent of a group of repeating subfields. The value is
a list of fields entered the same way as normal fields.

> **_NOTE:_** CKAN needs an IPackageController plugin with `before_index` to
> convert repeating subfields to formats that can be indexed by solr. For
> testing you may use the included `scheming_nerf_index` plugin to encode
> all repeating fields as JSON strings to prevent solr errors.
> [!NOTE]
> CKAN needs an IPackageController plugin with `before_dataset_index` to
> convert repeating subfields to formats that can be indexed by solr. The
> included `scheming_subfields_index` plugin will group the values of the
> same subfields in a text field that will make the values findable. If
> you require more precise handling of a particular subfield,
> you will need to customize the Solr schema to add the necessary fields.

`repeating_label` may be used to provide a singular version of the label
for each group.
Expand Down
45 changes: 45 additions & 0 deletions ckanext/scheming/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,51 @@ def before_index(self, data_dict):
return data_dict


class SchemingSubfieldsIndexPlugin(p.SingletonPlugin):
    """
    Index suitable repeating dataset fields in before_dataset_index to prevent
    failures on unmodified solr schema. This will allow hitting results in most
    text and list subfields. Ideally you probably want to select the relevant
    subfields that will get indexed and modify the Solr schema if necessary.

    This implementation will group the values of the same subfields into an
    `extras_{field_name}__{key}`, a text Solr field that will allow free-text
    search on its value. Again, if you require more precise handling of a
    particular subfield, you will need to customize the Solr schema to add
    the particular fields needed.
    """
    p.implements(p.IPackageController, inherit=True)

    def before_dataset_index(self, data_dict):
        """Delegate to :meth:`before_index`.

        NOTE(review): both hook names are provided, presumably so the plugin
        works whichever of the two names the running CKAN calls -- confirm.
        """
        return self.before_index(data_dict)

    def before_index(self, data_dict):
        """Flatten repeating subfields of ``data_dict`` into text entries.

        For every schema field that declares ``repeating_subfields`` and
        whose value in ``data_dict`` is a list, the values found under each
        subfield key are joined with single spaces and stored under
        ``extras_{field_name}__{key}``. The original field is then removed
        from ``data_dict``.

        Nested dict values are not indexed. ``None`` values are skipped and
        every other scalar is converted with ``str()`` so that non-string
        values (numbers, booleans) no longer raise ``TypeError`` while being
        concatenated. Field values that are not lists are left untouched.

        :param data_dict: the dataset dict about to be indexed; it is
            modified in place
        :returns: the same ``data_dict`` object
        """
        schemas = SchemingDatasetsPlugin.instance._expanded_schemas
        if data_dict['type'] not in schemas:
            return data_dict

        schema = schemas[data_dict['type']]

        for field in schema['dataset_fields']:
            field_name = field['field_name']
            if field_name not in data_dict or 'repeating_subfields' not in field:
                continue

            items = data_dict[field_name]
            if not isinstance(items, list):
                # Not a list of subfield groups, so there is nothing that
                # can be flattened here; leave the value as it is.
                continue

            for item in items:
                if not isinstance(item, dict):
                    continue
                for key, value in item.items():
                    text = self._subfield_text(value)
                    if text is None:
                        continue
                    # Index a flattened version
                    new_key = f'extras_{field_name}__{key}'
                    if not data_dict.get(new_key):
                        data_dict[new_key] = text
                    else:
                        data_dict[new_key] += ' ' + text

            data_dict.pop(field_name, None)

        return data_dict

    @staticmethod
    def _subfield_text(value):
        """Return the text to index for one subfield value, or ``None``.

        ``None`` and dict values yield ``None`` (nothing to index). Lists are
        joined with spaces, ignoring ``None`` and nested dict/list members.
        Strings are returned unchanged and any other value is passed
        through ``str()``.
        """
        if value is None or isinstance(value, dict):
            return None
        if isinstance(value, list):
            return ' '.join(
                str(member) for member in value
                if member is not None and not isinstance(member, (dict, list))
            )
        return value if isinstance(value, str) else str(value)


def _load_schemas(schemas, type_field):
out = {}
for n in schemas:
Expand Down
40 changes: 40 additions & 0 deletions ckanext/scheming/tests/test_subfields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from unittest import mock

import pytest
import ckantoolkit

from ckantoolkit.tests.factories import Dataset
from ckantoolkit.tests.helpers import call_action


# Dataset payload used by the tests in this module: a "test-subfields"
# dataset carrying two groups in its repeating "contact_address" field.
dataset_dict = {
    "name": "test-dataset",
    "type": "test-subfields",
    # Repeating subfields
    "contact_address": [
        {
            "address": "Maple Street 123",
            "city": "New Paris",
            "country": "Maplonia",
        },
        {
            "address": "Rose Avenue 452",
            "city": "Old York",
            "country": "Rosestan",
        },
    ],
}


@pytest.mark.usefixtures("with_plugins", "clean_db")
def test_repeating_subfields_index():
    """Creating the dataset sends Solr a doc with the subfields flattened."""
    with mock.patch("ckan.lib.search.index.make_connection") as make_connection:
        call_action("package_create", **dataset_dict)

    # Dict sent to Solr: first entry of the "docs" keyword argument of the
    # second call recorded on the patched connection factory.
    recorded_call = make_connection.mock_calls[1]
    solr_doc = recorded_call.kwargs["docs"][0]

    city_text = solr_doc["extras_contact_address__city"]
    assert city_text == "New Paris Old York"

    country_text = solr_doc["extras_contact_address__country"]
    assert country_text == "Maplonia Rosestan"


@pytest.mark.usefixtures("with_plugins", "clean_db")
def test_repeating_subfields_search():
    """A free-text query on a subfield value returns the created dataset."""
    created = call_action("package_create", **dataset_dict)

    # "Old York" only appears as the city of the second contact address.
    search_result = call_action("package_search", q="Old York")
    first_hit = search_result["results"][0]

    assert first_hit["id"] == created["id"]
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
scheming_groups=ckanext.scheming.plugins:SchemingGroupsPlugin
scheming_organizations=ckanext.scheming.plugins:SchemingOrganizationsPlugin
scheming_nerf_index=ckanext.scheming.plugins:SchemingNerfIndexPlugin
scheming_subfields_index=ckanext.scheming.plugins:SchemingSubfieldsIndexPlugin
scheming_test_subclass=ckanext.scheming.tests.plugins:SchemingTestSubclass
scheming_test_plugin=ckanext.scheming.tests.plugins:SchemingTestSchemaPlugin
scheming_test_validation=ckanext.scheming.tests.plugins:SchemingTestValidationPlugin
Expand Down
2 changes: 1 addition & 1 deletion test.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ port = 5000
use = config:../../src/ckan/test-core.ini

ckan.plugins = scheming_datasets scheming_groups scheming_organizations
scheming_test_plugin scheming_nerf_index
scheming_test_plugin scheming_subfields_index
scheming.dataset_schemas = ckanext.scheming:ckan_dataset.yaml
ckanext.scheming.tests:test_schema.json
ckanext.scheming.tests:test_subfields.yaml
Expand Down

0 comments on commit 156033f

Please sign in to comment.