From d0d93f358bb376000e2ea362f627666a98a83c88 Mon Sep 17 00:00:00 2001 From: funderburkjim Date: Sat, 6 Feb 2021 15:33:31 -0500 Subject: [PATCH] tweak simple search. For non-default searches, if the given spelling matches, show only the given spelling (and not alternates) --- simple-search/v1.1/getword_list_1.0_main.php | 25 ++- simple-search/v1.1/list-0.2s_rw.php | 15 +- simple-search/v1.1/simple_search.php | 184 +++++++++++++++---- 3 files changed, 182 insertions(+), 42 deletions(-) diff --git a/simple-search/v1.1/getword_list_1.0_main.php b/simple-search/v1.1/getword_list_1.0_main.php index e3b80a2..a1a306e 100644 --- a/simple-search/v1.1/getword_list_1.0_main.php +++ b/simple-search/v1.1/getword_list_1.0_main.php @@ -138,6 +138,7 @@ function getword_list_processone() { // reset xml, and status // 11-01-2017. Extra flag to indicate whether this is the user's input word $ans1['user_key_flag'] = ($dictheadword == $keyparmin_slp1); + #dbgprint(true,"getword: dictheadword = $dictheadword, keyparmin_slp1 = $keyparmin_slp1\n"); # accent-adjustment #require_once($dirpfx . "accent_adjust.php"); $dictinfo = $getParms->dictinfo; @@ -161,7 +162,10 @@ function getword_list_processone() { #dbgprint(true,"getword_list_1.0_main: #result=" . count($result) . "\n"); $result1a = order_by_wf($result,$wfreqs); $result1 = put_user_word_first($result1a); -$ans['result']=$result1; +// 02-05-2021: Return ONLY user result if inputparmin is not default, +// and certain other conditions +$result2 = restrict_to_user_word($result1,$inputparmin); +$ans['result']=$result2; $ru3 = microtime(true); //getrusage(); $utime = $ru3 - $ru2; @@ -169,7 +173,23 @@ function getword_list_processone() { return $ans; } // end of getword_list_processone - +function restrict_to_user_word($result1,$inputparmin){ + if (count($result1) == 0) { + return $result1; + } + if ($inputparmin == 'default') { + return $result1; + } + $ans1 = $result1[0]; + if (! $ans1['user_key_flag']) { + return $result1; + } + // Now, the user's spelling was found, and user specified a non-default + // spelling. + // Return ONLY the user's spelling + $result2 = array($ans1); + return $result2; +} function init_word_frequency() { # word_frequency_adj.txt removes duplicates from word_frequency.txt # see readme.org in ../v0.1 @@ -205,6 +225,7 @@ function wf_cmp($a,$b) { return ($a['wf'] > $b['wf']) ? -1 : 1; #return ($a['wf'] > $b['wf']) ? 1 : -1; } + function put_user_word_first($result) { $iuser = -1; $i = 0; diff --git a/simple-search/v1.1/list-0.2s_rw.php b/simple-search/v1.1/list-0.2s_rw.php index ea139d0..3da15a7 100644 --- a/simple-search/v1.1/list-0.2s_rw.php +++ b/simple-search/v1.1/list-0.2s_rw.php @@ -252,7 +252,7 @@ test.key = find_word; test.url = "getword_list_1.0.php"; // 01-22-2021 test.input = $('#input_simple').val(); // - console.log('list-0.2s_rw: simpleDisplay: test.input=',test.input); + //console.log('list-0.2s_rw: simpleDisplay: test.input=',test.input); test.output = $('#output').val(); //'deva'; test.dict = $('#dict').val();; // Blank out result area @@ -287,11 +287,22 @@ } listDisplay(dicthw); } - +displayOption2_error = function(json) { + console.log('displayOption2 error: json=',json); + var html = `

Programming error in simple search at displayOption2
+ Please notify programmer Funderburk of the error, with your inputs
+ at the email at bottom of Sanskrit-lexicon home page.` + $('#simpleinfo').html(html); +} displayOption2 = function(json) { // console.log('list-0.2s_rw.php: displayOption2'); // console.log('json=',json); + // 02-05-2021. Prepare for errors (e.g. non-json results) var results = json['result']; + if (! results) { // results may be undefined + displayOption2_error(); + return; + } var dict = json['dict']; var nresults = results.length; var nfound = nresults; diff --git a/simple-search/v1.1/simple_search.php b/simple-search/v1.1/simple_search.php index 11552b2..1731ddd 100644 --- a/simple-search/v1.1/simple_search.php +++ b/simple-search/v1.1/simple_search.php @@ -6,9 +6,13 @@ Also, revise dalnorm.normalize_key Add 't,tt' transition rule. called by getword_list_1.0_main.php, which uses public variables -user_keyin +user_keyin // in slp1 user_keyin_norm normkeys +status : 200 (OK), 404 (not found), + 201 (found by partialmatches only) 02-05-2021. False start. not used +02-05-2021. partialmatches not found useful. Code is deactivated but present. + */ require_once('get_parent_dirpfx.php'); $dirpfx = get_parent_dirpfx("simple-search"); @@ -18,37 +22,57 @@ class Simple_Search{ - public $transitionTable = [ // slp1 + public $transitionTable_default = [ + // The spellings are in slp1, but they are to applied when + // the user input spelling is default ["a","A"], ["i","I"], ["u","U"], - // ["r","f","F","ri","ar","ru","rI"], + ["o","O"], + ["e","E"], ["r","f","F","ri","ar","ru","rI","R","RI"], - ["l","x","X","lri",], + ["l","x","X","lri"], ["h","H"], - // ["M","n","R","Y","N","m"], # 01-24-2021. replaced by below - ["n","R","Y","N"], + ["n","Y","N","m","R","M"], ["S","z","s","zh","sh"], ["b","v"], ["k","K"], ["g","G"], ["c","C","Ch","ch","cC"], - // ["c","C","cC"], ["j","J"], + ["jy","jY"], ["w","W","t","T"], ["q","Q","d","D"], - // ["p","P","f"], ["p","P","f","b","B"], ["b","B","v","V"], ["t","tt"] -]; + ]; + public $transitionTable_slp1 = [ + // The spellings are in slp1. Apply these when + // the user input spelling is NOT default + ["a","A"], + ["i","I"], + ["u","U"], + ["o","O"], + ["e","E"], + ["r","f","F","ri","ar","ru","rI"], # Not R, since slp1 R is cerebral nasal + ["l","x","X","lri"], + ["n","R","Y","N","M","m"], + ["S","z","s"], + ["k","K"], + ["g","G"], + ["c","C","cC"], + ["j","J"], + ["jy","jY"], + ["w","W","t","T"], + ["q","Q","d","D"], + ["p","P","b","B"], + ["b","B","v"], + ["t","tt"] + ]; public $keysin,$keys,$normkeys; public $searchdict,$input_simple; public $dbg; - #public $ngram2_check; - #public $ngram2beg_check; // beginning 2-gram - #public $ngram3_check; - #public $ngram3beg_check; // beginning 2-gram public $dict; // 20170725 public $badprefs; // prefixes known to be non-valid for this search /* input00 is the 'input_simple' from list-0.2s_rw.php. It is the @@ -59,6 +83,11 @@ public function __construct($keyin00,$input00,$dict) { $this->dbg = false; dbgprint($this->dbg,"simple_search: construct: $keyin00, $input00, $dict\n"); $this->input_simple = $input00; + if ($this->input_simple == 'default') { + $this->transitionTable = $this->transitionTable_default; + }else { + $this->transitionTable = $this->transitionTable_slp1; + } $this->dict = strtolower($dict); # hwnorm1c copied from awork/sanhw1/ to ../hwnorm1/ @@ -68,8 +97,9 @@ public function __construct($keyin00,$input00,$dict) { // searchdict is associative array which is modified by doVariant // It's keys are the different variants $this->searchdict = array(); + $this->partialmatches = array(); $keyin0 = $this->convert_nonascii($keyin00); - dbgprint($this->dbg,"Simple_construct: keyin0 = $keyin0\n"); + dbgprint($this->dbg,"Simple_construct: keyin00 = $keyin00, keyin0 = $keyin0\n"); $alternates0 = [$keyin0]; $normkey = $this->dalnorm->normalize($keyin0); if (!in_array($normkey,$alternates0)) { @@ -87,7 +117,52 @@ public function __construct($keyin00,$input00,$dict) { $this->badprefs = []; $this->doVariant("",$keyin); } - // add grammar variants to searchdict. + if (count($this->searchdict) == 0) { // number of keys in searchdict + // add grammar variants to searchdict. + $this->add_grammar_variants(); + } else { // Always: e.g. veda has also vedana + // $this->add_grammar_variants(); + } + // Linearize searchdict keys into keysin array + $this->keysin = []; + foreach($this->searchdict as $k=>$v){ + $this->keysin[] = $k; + } + // Generate distinct normalized keys + $this->generate_normkeys(); + $nfound = count($this->normkeys); + dbgprint($this->dbg,"number of normalized keys found = $nfound\n"); + if ($nfound > 0 ) { + $this->status = 200; // ok + }else { + // try to use partialmatches + $this->generate_normkeys_from_partialmatches(); + if (count($this->normkeys) > 0 ) { + $this->status = 201; // almost ok + } else { + $this->status = 404; // not found + } + } + dbgprint($this->dbg,"status set to {$this->status}\n"); + $this->normalize_keyin($keyin0); + // close dalnorm connection + $this->dalnorm->close(); + if ($this->dbg) { + $this->dbgprint_api_values(); + } + } + public function dbgprint_api_values() { + dbgprint(true,"user_keyin = {$this->user_keyin}\n"); + dbgprint(true,"user_keyin_norm = {$this->user_keyin_norm}\n"); + $keys = $this->normkeys; + $nkeys = count($keys); + dbgprint(true,"length of normkeys = $nkeys\n"); + for ($i=0;$isearchdict as $k=>$v){ $galts = $this->grammar_variants($k); @@ -99,31 +174,27 @@ public function __construct($keyin00,$input00,$dict) { foreach($galtsall as $keyina) { $this->searchdict_add($keyina); } - - // Linearize searchdict keys into keysin array - $this->keysin = []; - foreach($this->searchdict as $k=>$v){ - $this->keysin[] = $k; } - // Generate distinct normalized keys - $this->generate_normkeys(); - //$this->user_key_first($keyin0); // 11-01-2017. not used. see comment below - // provide keyin_norm for user - $this->user_keyin = transcoder_processString($keyin0,"hk","slp1"); + public function normalize_keyin($keyin0) { + // keyin0 has been transcoded to slp1, based on input_simple + $this->user_keyin = $keyin0; $this->user_keyin_norm = $this->dalnorm->normalize($this->user_keyin); - // close dalnorm connection - $this->dalnorm->close(); - if ($this->dbg) { - $keys = $this->normkeys; - $nkeys = count($keys); - for ($i=0;$iinput_simple == 'default') { + // don't change + $this->user_keyin = $keyin0; + $this->user_keyin_norm = $keyin0; + return; } + $tranin = $this->input_simple; + $this->user_keyin = transcoder_processString($keyin0,$tranin,"slp1"); + $this->user_keyin_norm = $this->dalnorm->normalize($this->user_keyin); + */ } public function generate_normkeys() { // normkeys is the array that the caller sees. It contains - // normalized spellings that 'solve' the search + // normalized spellings that solve the search $this->normkeys = []; $dbg=false; #true @@ -138,7 +209,19 @@ public function generate_normkeys() { } } } - + public function generate_normkeys_from_partialmatches() { + // 02-05-2021. Attempt appears unuseful. + $this->normkeys = []; + return; + $max = 5; + $npartial = count($this->partialmatches); + $dbg=false; + dbgprint($dbg,"number of partial matches = $npartial\n"); + foreach($this->partialmatches as $key=>$val) { + dbgprint($dbg,"partialmatch key = $key\n"); + } + return; + } public function searchdict_add_basic($k0) { // add $k0 to searchdict PROVIDED it is in hwnorm1c for SOME dictionary $k = $this->dalnorm->normalize($k0); @@ -151,11 +234,14 @@ public function searchdict_add_basic($k0) { } } public function searchdict_add_norm($k0) { + $this->searchdict_add_basic($k0); + /* $k = $this->dalnorm->normalize($k0); if (!isset($this->searchdict[$k])) { dbgprint($this->dbg,"$k searchdict_add_norm\n"); $this->searchdict[$k]=true; } + */ } public function searchdict_add($k) { $alts1 = $this->generate_alternate_endings(array($k)); @@ -198,13 +284,16 @@ public function doVariant($pref,$word) { $this->searchdict_add($pref); return; // done } - if (!$this->ngramValidate($pref)) { + $val = $this->ngramValidate($pref); // false or a key that matches LIKE + if ($val == false) { // '$pref' is not validated , return with no further analysis // add $pref to the bad ones $this->badprefs[$pref]=true; # debug to disregard badprefs dbgprint($this->dbg,"doVariant bad pref 1: $pref ($word)\n"); return; } + $key = $val; // a normalized slp1 key of hwnorm1c + $this->partialmatches[$key] = true; dbgprint($this->dbg,"doVariant finding variants of $pref + $word\n"); // Find variants for the beinning of $word // $varCharsData is an array. @@ -385,6 +474,10 @@ public function grammar_variants_skd($word) { // examples uzas -> uzA $word1 = substr($word,0,-2) . 'A'; $ans[] = $word1; + }else if (preg_match('|in$|',$word)) { + // example guRin -> guRi + $word1 = substr($word,0,-2) . 'I'; + $ans[] = $word1; } return $ans; } @@ -409,7 +502,11 @@ public function ngramValidate($word) { if ($nmatches == 0) { return false; } else { - return true; + // extract the matching key + $rec = $matches[0]; // ($key,$data from hwnorm1c) + $key = $rec[0]; + return $key; + //return true; } } catch (Exception $e) { return true; @@ -450,7 +547,7 @@ public function convert_nonascii($wordin) { return $wordin2; } // step 2. Assume $input_simple == default - + // detect devanagari by converting to slp1 $wordin1 = transcoder_processString($wordin,'deva','slp1'); if ($wordin1 != $wordin) { @@ -461,10 +558,21 @@ public function convert_nonascii($wordin) { // $wordin might have letters with diacritics. // We will lower-case the string first. Try to handle diacritics. $wordin0 = mb_strtolower($wordin, 'UTF-8'); + // some letters are not in 'roman', but are in 'slp1' + $wordin0 = $this->clean_default($wordin0); // converting from 'roman' $wordin1 = transcoder_processString($wordin0,'roman','slp1'); $wordin2 = $this->clean_slp1($wordin1); return $wordin2; } + public function clean_default($word) { + // may need further adjustment. + // $word is in default spelling, but in lower case without diacritics + // prepare for conversion to slp1 + $word1 = preg_replace('|w|','v',$word); + $word1 = preg_replace('|f|','p',$word1); + $word1 = preg_replace('|x|','z',$word1); # e.g. xenophobe + return $word1; + } } ?>