From 950d9a4401a1db4da48a4d98c06469a7a058fafc Mon Sep 17 00:00:00 2001 From: mubaldino Date: Wed, 14 Oct 2020 17:50:49 -0400 Subject: [PATCH] Issue #59: country and location bias --- .../opensextant/processing/Parameters.java | 11 +++ .../extractors/geo/PlaceGeocoder.java | 19 ++++- .../geo/rules/LocationChooserRule.java | 72 +++++++++++++++++-- .../xlayer/server/TaggerResource.java | 27 ++++++- .../xlayer/server/xgeo/XponentsGeotagger.java | 69 ++++++++---------- 5 files changed, 151 insertions(+), 47 deletions(-) diff --git a/Core/src/main/java/org/opensextant/processing/Parameters.java b/Core/src/main/java/org/opensextant/processing/Parameters.java index fc43dcd2..9dd6b996 100644 --- a/Core/src/main/java/org/opensextant/processing/Parameters.java +++ b/Core/src/main/java/org/opensextant/processing/Parameters.java @@ -17,7 +17,9 @@ package org.opensextant.processing; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Set; import org.joda.time.format.DateTimeFormat; @@ -81,6 +83,15 @@ public class Parameters extends java.util.Properties { public String outputFile = null; private Set formats = new HashSet(); + + /** + * A way of relaying arbitrary geographic filters to an extraction routine indicating that useful answers for + * disambiguation for tie-breakers come from these cues. + * + * "countries" = [c1, c2, c3, ...] + * "geohashes" = [g1, g2, g3, ...] + */ + public HashMap> preferredGeography = new HashMap<>(); /** You the caller must explicitly set isdefault = false; * forcing you to actually look at these parameters. 
diff --git a/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java b/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java index 6e30752f..1b925e0a 100644 --- a/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java +++ b/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java @@ -441,6 +441,16 @@ private void reset() { private boolean geocode = true; private boolean tagOnly = !geocode; + + /** + * See {@link #extract(TextInput, Parameters)} below. + * This is the default extraction routine. If you need to tune extraction call extract( input, parameters ) + */ + @Override + public List extract(TextInput input) throws ExtractionException { + return extract(input, null); + } + /** * Extractor.extract() calls first XCoord to get coordinates, then PlacenameMatcher In the end you * have all geo entities ranked and scored. @@ -462,10 +472,13 @@ private void reset() { * @return TextMatch instances which are all PlaceCandidates. * @throws ExtractionException on err */ - @Override - public List extract(TextInput input) throws ExtractionException { + public List extract(TextInput input, Parameters jobParams) throws ExtractionException { long t1 = System.currentTimeMillis(); reset(); + + if (jobParams != null) { + this.setAllowLowerCase(jobParams.tag_lowercase); + } List matches = new ArrayList(); List coordinates = null; @@ -522,7 +535,7 @@ public List extract(TextInput input) throws ExtractionException { // Last rule: score, choose, add confidence. // chooser.setTextCase(input.isLower ? 
GeocodeRule.LOWERCASE : 0); - chooser.evaluate(candidates); + chooser.evaluate(candidates, jobParams); if (provinceNameSetter != null) { provinceNameSetter.evaluate(candidates); } diff --git a/src/main/java/org/opensextant/extractors/geo/rules/LocationChooserRule.java b/src/main/java/org/opensextant/extractors/geo/rules/LocationChooserRule.java index 7a4a8459..b38b518a 100644 --- a/src/main/java/org/opensextant/extractors/geo/rules/LocationChooserRule.java +++ b/src/main/java/org/opensextant/extractors/geo/rules/LocationChooserRule.java @@ -1,6 +1,7 @@ package org.opensextant.extractors.geo.rules; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; @@ -11,6 +12,7 @@ import org.opensextant.extractors.geo.PlaceCount; import org.opensextant.extractors.geo.PlaceEvidence; import org.opensextant.extractors.geo.PlaceGeocoder; +import org.opensextant.processing.Parameters; import org.opensextant.util.GeodeticUtility; /** @@ -34,6 +36,8 @@ public class LocationChooserRule extends GeocodeRule { private Map boundaryContext = null; private Map namespace = new HashMap<>(); private HashMap inferredCountries = new HashMap<>(); + private HashSet preferredCountries = new HashSet<>(); + private HashSet preferredLocations = new HashSet<>(); private int textCase = 0; @@ -53,12 +57,19 @@ public void reset() { documentCandidates.clear(); namespace.clear(); inferredCountries.clear(); + preferredCountries.clear(); + preferredLocations.clear(); + } + + @Override + public void evaluate(List names) { + evaluate(names, (Parameters) null); } /** * Walk the entire list. 
*/ - public void evaluate(List names) { + public void evaluate(List names, Parameters preferences) { // INPUTS: // histogram of country mentions @@ -71,6 +82,16 @@ public void evaluate(List names) { // countryContext = countryObserver.countryMentionCount(); boundaryContext = boundaryObserver.placeMentionCount(); + // + // PREFS: + if (preferences != null) { + if (preferences.preferredGeography.containsKey("countries")) { + preferredCountries.addAll(preferences.preferredGeography.get("countries")); + } + if (preferences.preferredGeography.containsKey("geohashes")) { + preferredLocations.addAll(preferences.preferredGeography.get("geohashes")); + } + } /* TODO: DEBUG through location chooser using histograms * of found and resolved place metadata. @@ -190,6 +211,13 @@ private void debuggingHistograms(List names) { */ private static final int GLOBAL_POINTS = 5; + /** + * Preferred Country or Location -- when user supplies the context that may be missing.... We accept + * that and weight such preference higher. + */ + public static String PREF_COUNTRY = "PreferredCountry"; + public static String PREF_LOCATION = "PreferredLocation"; + /** * Yet unchosen location. Consider given evidence first, creating some weight there, then * introducing innate properties of possible locations, thereby amplifying the differences in the @@ -199,7 +227,30 @@ private void debuggingHistograms(List names) { @Override public void evaluate(PlaceCandidate name, Place geo) { + // With "preferred geography" we can influence in a subtle fashion ambiguous mentions, e.g., + // If known geography is Ohio and we see mentions of Springfield without other context, we can + // nudge choice of Springfield, OH as such. Such as with a preferred location (geohash). 
+ + if (preferredCountries != null && !preferredCountries.isEmpty()) { + if (preferredCountries.contains(geo.getCountryCode())) { + // Get a half-point for being within the country + name.incrementPlaceScore(geo, 0.5); + name.addRule(PREF_COUNTRY); + } + } + if (preferredLocations != null && !preferredLocations.isEmpty()) { + for (String gh : preferredLocations) { + if (geo.getGeohash().startsWith(gh)) { + // Increment a full point for being within the geohash. Note geohash length of 4 or more chars is reasonably good resolution. + name.incrementPlaceScore(geo, 1.0); + name.addRule(PREF_LOCATION); + } + } + } + if (boundaryContext.isEmpty() && countryContext.isEmpty()) { + // So without context, there is nothing more we can do to influence the connection between + // the one named place and the candidate location return; } @@ -275,8 +326,8 @@ public void evaluate(PlaceCandidate name, Place geo) { public static final int MATCHCONF_NAME_REGION = 75; /** - * Absolute Confidence: Unique name in gazetteer. - * Confidence is high, however this needs to be tempered by the number of gazetteers, coverage, and diversity + * Absolute Confidence: Unique name in gazetteer. Confidence is high, however this needs to be + * tempered by the number of gazetteers, coverage, and diversity */ public static final int MATCHCONF_ONE_LOC = 70; @@ -309,6 +360,12 @@ public void evaluate(PlaceCandidate name, Place geo) { */ public static final int MATCHCONF_QUALIFIER_LOWERCASE = -15; + /** + * A subtle boost for locations that were preferred -- especially helps when there is no inherent + * context and we must rely on the caller's intuition. 
+ */ + public static final int MATCHCONF_PREFERRED = 5; + private static boolean isShort(int matchLen) { return matchLen <= NonsenseFilter.GENERIC_ONE_WORD; } @@ -380,7 +437,7 @@ public void assessConfidence(PlaceCandidate pc) { if (fc != null) { featWeight = fc.factor; } - points = (int)((0.75 * points) + (0.25 * points * featWeight)); + points = (int) ((0.75 * points) + (0.25 * points * featWeight)); // Any of these may occur. //====================== @@ -457,6 +514,13 @@ public void assessConfidence(PlaceCandidate pc) { points += pc.getLength() - 4; } + if (pc.hasRule(PREF_COUNTRY)) { + points += MATCHCONF_PREFERRED; + } + if (pc.hasRule(PREF_LOCATION)) { + points += MATCHCONF_PREFERRED; + } + pc.setConfidence(points); } diff --git a/src/main/java/org/opensextant/xlayer/server/TaggerResource.java b/src/main/java/org/opensextant/xlayer/server/TaggerResource.java index 2c22965c..e2a475dd 100644 --- a/src/main/java/org/opensextant/xlayer/server/TaggerResource.java +++ b/src/main/java/org/opensextant/xlayer/server/TaggerResource.java @@ -2,11 +2,15 @@ import static org.apache.commons.lang3.StringUtils.isNotBlank; +import java.util.ArrayList; import java.util.HashSet; +import java.util.Iterator; +import java.util.List; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; +import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.opensextant.data.TextInput; @@ -138,6 +142,20 @@ protected void resetParameters(Parameters job) { job.tag_patterns = false; job.addOutputFormat("json"); } + + /** + * Copies each element of the JSON array into a list of strings. + * @param a JSONArray whose elements are cast to String + * @return the array's elements, in iteration order + */ + protected List fromArray(JSONArray a){ + ArrayList strings = new ArrayList<>(); + Iterator iter = a.iterator(); + while (iter.hasNext()) { + strings.add((String)iter.next()); + } + return strings; + } /** * @@ -181,7 +199,14 @@ protected Parameters fromRequest(JSONObject inputs) throws JSONException { job.tag_lowercase = opts.contains("lowercase"); 
job.resolve_localities = opts.contains("revgeo") || opts.contains("resolve_localities"); } - + // + // Geographic filters + if (inputs.has("preferred_countries")) { + job.preferredGeography.put("countries", fromArray(inputs.getJSONArray("preferred_countries"))); + } + if (inputs.has("preferred_locations")) { + job.preferredGeography.put("geohashes", fromArray(inputs.getJSONArray("preferred_locations"))); + } if (job.clean_input || job.tag_lowercase) { job.isdefault = false; } diff --git a/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java b/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java index c958ee63..3673bcac 100644 --- a/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java +++ b/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java @@ -36,7 +36,7 @@ public XponentsGeotagger() { } /** - * get Xponents Exxtractor object from global attributes. + * get Xponents Extractor object from global attributes. */ public Extractor getExtractor(String xid) { Object X = this.getApplication().getContext().getAttributes().get(xid); @@ -60,27 +60,22 @@ public Extractor getExtractor(String xid) { } /** - * Contract: - * docid optional; 'text' | 'doc-list' required. - * command: cmd=ping sends back a simple response - * - * text = UTF-8 encoded text - * docid = user's provided document ID - * doc-list = An array of text - * - * cmd=ping = report status. - * - * Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...',...] } - * The entire array must be parsable in memory as a single, traversible JSON object. - * We make no assumption about one-JSON object per line or anything about line-endings as separators. - * - * - * @param params - * the params - * @return the representation - * @throws JSONException - * the JSON exception - */ + * Contract: docid optional; 'text' | 'doc-list' required. 
command: cmd=ping sends back a simple + * response + * + * text = UTF-8 encoded text docid = user's provided document ID doc-list = An array of text + * + * cmd=ping = report status. + * + * Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...',...] } The + * entire array must be parsable in memory as a single, traversable JSON object. We make no + * assumption about one-JSON object per line or anything about line-endings as separators. + * + * + * @param params JSON parameters per REST API: docid, text, lang, features, options, and preferred_* + * @return the representation + * @throws JSONException the JSON exception + */ @Post("application/json;charset=utf-8") public Representation processForm(JsonRepresentation params) throws JSONException { org.json.JSONObject json = params.getJsonObject(); @@ -100,16 +95,14 @@ public Representation processForm(JsonRepresentation params) throws JSONExceptio } /** - * HTTP GET -- vanilla. Do not use in production, unless you have really small data packages. - * This is useful for testing. Partial contract: - * - * miscellany: 'cmd' = 'ping' |... other commands. - * processing: 'docid' = ?, 'text' = ? - * - * @param params - * the params - * @return the representation - */ + * HTTP GET -- vanilla. Do not use in production, unless you have really small data packages. This + * is useful for testing. Partial contract: + * + * miscellany: 'cmd' = 'ping' |... other commands. processing: 'docid' = ?, 'text' = ? + * + * @param params JSON parameters. 
see process() + * @return the representation + */ @Get public Representation processGet(Representation params) { Form inputs = getRequest().getResourceRef().getQueryAsForm(); @@ -140,10 +133,8 @@ public Representation process(TextInput input, Parameters jobParams) { try { if (prodMode) { PlaceGeocoder xgeo = (PlaceGeocoder) getExtractor("xgeo"); - xgeo.setAllowLowerCase(jobParams.tag_lowercase); + List matches = xgeo.extract(input, jobParams); - List matches = xgeo.extract(input); - if (jobParams.tag_patterns) { XTemporal xt = (XTemporal) getExtractor("xtemp"); matches.addAll(xt.extract(input)); @@ -169,7 +160,7 @@ public Representation process(TextInput input, Parameters jobParams) { /** * Format matches as JSON * - * @param matches items to format + * @param matches items to format * @param jobParams parameters * @return formatted json * @throws JSONException on format error @@ -184,9 +175,9 @@ private Representation format(List matches, Parameters jobParams) thr } /** - * @param params parameters - * @param variousMatches matches to filter - */ + * @param params parameters + * @param variousMatches matches to filter + */ public void filter(List variousMatches, Parameters params) { // Determine what looks useful. Filter out things not worth // saving at all in data store.