diff --git a/composer.json b/composer.json index b190dda..d58a45c 100644 --- a/composer.json +++ b/composer.json @@ -11,7 +11,7 @@ ], "require": { "php": ">=7.3", - "voku/portable-utf8": "^5.4|^6.0" + "joomla/string": ">=2.0.1" }, "require-dev":{ "phpunit/phpunit": "^9.0" diff --git a/src/Stemmer/Catalan.php b/src/Stemmer/Catalan.php index d52e4fc..8a5c7d3 100644 --- a/src/Stemmer/Catalan.php +++ b/src/Stemmer/Catalan.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -86,12 +86,7 @@ class Catalan extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // Catalan stemmer does not use Rv $this->r1(); @@ -127,7 +122,7 @@ private function step0() { if (($position = $this->search(static::$attached_pronoun)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -146,7 +141,7 @@ private function step1a() // delete if in R2 if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -162,11 +157,11 @@ private function step1a() // atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes // ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos // ient otes ots - // + // // delete if in R1 if (($position = $this->search(self::$standard_suffix_1a)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -241,7 +236,7 @@ private function step1b() // delete if in R1 if (($position = $this->search(static::$verb_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -251,7 +246,7 @@ private function step1b() // delete if in R2 if (($position = $this->search(['ando'])) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -270,7 +265,7 @@ private function step2() // delete if in R1 if (($position = $this->search(static::$residual_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -294,7 +289,7 @@ private function step2() */ private function finish() { - $this->word = UTF8::str_replace( + $this->word = str_replace( ['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'], ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'], $this->word diff --git a/src/Stemmer/Danish.php b/src/Stemmer/Danish.php index c539fdb..5fc7507 100644 --- a/src/Stemmer/Danish.php +++ b/src/Stemmer/Danish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Danish extends Stem */ public function stem($word): string { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word): string // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -56,7 +51,7 @@ public function stem($word): string */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å')); } @@ -74,14 +69,14 @@ private function step1() 'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds', 'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -97,7 +92,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -108,14 +103,14 @@ private function step3() { // If the word ends igst, remove the final st. if ($this->search(array('igst')) !== false) { - $this->word = UTF8::substr($this->word, 0, -2); + $this->word = StringHelper::substr($this->word, 0, -2); } // Search for the longest among the following suffixes in R1, and perform the action indicated. // ig lig elig els // delete, and then repeat step 2 if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->step2(); return true; } @@ -123,7 +118,7 @@ private function step3() // løst // replace with løs if ($this->searchIfInR1(array('løst')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -133,19 +128,19 @@ private function step3() */ private function step4() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if (!$this->inR1(($length-1))) { return false; } - $lastLetter = UTF8::substr($this->word, -1, 1); + $lastLetter = StringHelper::substr($this->word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $beforeLastLetter = UTF8::substr($this->word, -2, 1); + $beforeLastLetter = StringHelper::substr($this->word, -2, 1); if ($lastLetter == $beforeLastLetter) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } diff --git a/src/Stemmer/Dutch.php b/src/Stemmer/Dutch.php index fc7c1af..6a2b563 100644 --- a/src/Stemmer/Dutch.php +++ b/src/Stemmer/Dutch.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,15 +22,10 @@ class Dutch extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, remove all umlaut and acute accents. - $this->word = UTF8::str_replace( + $this->word = str_replace( array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), $this->word); @@ -50,7 +45,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -71,7 +66,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return !in_array($lastLetter, array_merge(self::$vowels, array('j'))); } @@ -82,12 +77,12 @@ private function hasValidSEnding($word) */ private function hasValidEnEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $threeLastLetters = UTF8::substr($word, -3, 3); + $threeLastLetters = StringHelper::substr($word, -3, 3); if ($threeLastLetters == 'gem') { return false; } @@ -100,7 +95,7 @@ private function hasValidEnEnding($word) private function unDoubling() { if ($this->search(array('kk', 'dd', 'tt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -123,7 +118,7 @@ private function step1() // delete if in R1 and preceded by a valid en-ending, and then undouble the ending if ( ($position = $this->search(array('ene', 'en'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -136,7 +131,7 @@ private function step1() // delete if in R1 and preceded by a valid s-ending if ( ($position = $this->search(array('se', 's'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -155,9 +150,9 @@ private function step2() { if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR1($position)) { - $letter = UTF8::substr($this->word, -2, 1); + $letter = StringHelper::substr($this->word, -2, 1); if (!in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->unDoubling(); return true; @@ -176,13 +171,13 @@ private function step3a() { if ( ($position = $this->search(array('heid'))) !== false) { if ($this->inR2($position)) { - $letter = UTF8::substr($this->word, -5, 1); + $letter = StringHelper::substr($this->word, -5, 1); if ($letter !== 'c') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position = $this->search(array('en'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -206,12 +201,12 @@ private function step3b($removedE) // if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending if ( ($position = $this->search(array('end', 'ing'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) { - $letter = UTF8::substr($this->word, -3, 1); + $letter = StringHelper::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } else { $this->unDoubling(); @@ -226,9 +221,9 @@ private function step3b($removedE) // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig'))) !== false) { if ($this->inR2($position)) { - $letter = UTF8::substr($this->word, -3, 1); + $letter = StringHelper::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return true; @@ -238,7 +233,7 @@ private function step3b($removedE) // delete if in R2, and then repeat step 2 if ( ($position = $this->search(array('lijk'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->step2(); } return true; @@ -248,7 +243,7 @@ private function step3b($removedE) // delete if in R2 if ( ($position = $this->search(array('baar'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -257,7 +252,7 @@ private function step3b($removedE) // delete if in R2 and if step 2 actually removed an e if ( ($position = $this->search(array('bar'))) !== false) { if ($this->inR2($position) && $removedE) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -273,25 +268,25 @@ private function step3b($removedE) private function step4() { // D is a non-vowel other than I - $d = UTF8::substr($this->word, -1, 1); + $d = StringHelper::substr($this->word, -1, 1); if (in_array($d, array_merge(self::$vowels, array('I')))) { return false; } // V is double a, e, o or u - $v = UTF8::substr($this->word, -3, 2); + $v = StringHelper::substr($this->word, -3, 2); if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) { return false; } - $singleV = UTF8::substr($v, 0, 1); + $singleV = StringHelper::substr($v, 0, 1); // C is a non-vowel - $c = UTF8::substr($this->word, -4, 1); + $c = StringHelper::substr($this->word, -4, 1); if (in_array($c, self::$vowels)) { return false; } - $this->word = UTF8::substr($this->word, 0, -4); + $this->word = StringHelper::substr($this->word, 0, -4); $this->word .= $c . $singleV .$d; } @@ -301,6 +296,6 @@ private function step4() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word); + $this->word = str_replace(array('I', 'Y'), array('i', 'y'), $this->word); } } diff --git a/src/Stemmer/English.php b/src/Stemmer/English.php index fe5f186..f0e1f2c 100644 --- a/src/Stemmer/English.php +++ b/src/Stemmer/English.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * English Porter 2 @@ -27,16 +27,11 @@ class English extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - if (Utf8::strlen($word) < 3) { + if (StringHelper::strlen($word) < 3) { return $word; } - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // exceptions if (null !== ($word = $this->exception1())) { @@ -47,9 +42,9 @@ public function stem($word) $this->plainVowels = implode('', self::$vowels); // Remove initial ', if present. - $first = UTF8::substr($this->word, 0, 1); + $first = StringHelper::substr($this->word, 0, 1); if ($first == "'") { - $this->word = UTF8::substr($this->word, 1); + $this->word = StringHelper::substr($this->word, 1); } // Set initial y, or y after a vowel, to Y @@ -88,7 +83,7 @@ public function stem($word) private function step0() { if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -123,10 +118,10 @@ private function step1a() // delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it) if ( ($position = $this->search(array('s'))) !== false) { for ($i=0; $i<$position-1; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -157,16 +152,16 @@ private function step1b() // if the word is short, add e (so hop -> hope) if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) { for ($i=0; $i<$position; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ($this->search(array('at', 'bl', 'iz')) !== false) { $this->word .= 'e'; } elseif ( ($position2 = $this->search(self::$doubles)) !== false) { - $this->word = UTF8::substr($this->word, 0, ($position2+1)); + $this->word = StringHelper::substr($this->word, 0, ($position2+1)); } elseif ($this->isShort()) { $this->word .= 'e'; @@ -188,7 +183,7 @@ private function step1c() { // replace suffix y or Y by i if preceded by a non-vowel // which is not the first letter of the word (so cry -> cri, by -> by, say -> say) - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($length < 3) { return true; @@ -196,7 +191,7 @@ private function step1c() if ( ($position = $this->search(array('y', 'Y'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word); @@ -323,7 +318,7 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'l') { $this->word = preg_replace('#(ogi)$#u', 'og', $this->word); @@ -338,10 +333,10 @@ private function step2() if ($this->inR1($position)) { // a letter for you - $letter = UTF8::substr($this->word, ($position-1), 1); + $letter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($letter, self::$liEnding)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -383,13 +378,13 @@ private function step3() // ful ness: delete if ( ($position = $this->searchIfInR1(array('ful', 'ness'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // ative*: delete if in R2 if ( (($position = $this->searchIfInR1(array('ative'))) !== false) && ($this->inR2($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -409,7 +404,7 @@ private function step4() 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -418,10 +413,10 @@ private function step4() // delete if preceded by s or t if ( ($position = $this->searchIfInR2(array('ion'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 's' || $letter == 't') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -440,11 +435,11 @@ private function step5() // delete if in R2, or in R1 and not preceded by a short syllable if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { if ( (! $this->searchShortSyllabe(-4, 3)) && (! $this->searchShortSyllabe(-3, 2)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -455,10 +450,10 @@ private function step5() // delete if in R2 and preceded by l if ( ($position = $this->searchIfInR2(array('l'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'l') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -469,21 +464,21 @@ private function step5() private function finish() { - $this->word = UTF8::str_replace('Y', 'y', $this->word); + $this->word = str_replace('Y', 'y', $this->word); } private function exceptionR1() { - if (Utf8::strpos($this->word, 'gener') === 0) { - $this->r1 = UTF8::substr($this->word, 5); + if (StringHelper::strpos($this->word, 'gener') === 0) { + $this->r1 = StringHelper::substr($this->word, 5); $this->r1Index = 5; - } elseif (Utf8::strpos($this->word, 'commun') === 0) { - $this->r1 = UTF8::substr($this->word, 6); + } elseif (StringHelper::strpos($this->word, 'commun') === 0) { + $this->r1 = StringHelper::substr($this->word, 6); $this->r1Index = 6; - } elseif (Utf8::strpos($this->word, 'arsen') === 0) { - $this->r1 = UTF8::substr($this->word, 5); + } elseif (StringHelper::strpos($this->word, 'arsen') === 0) { + $this->r1 = StringHelper::substr($this->word, 5); $this->r1Index = 5; } } @@ -554,7 +549,7 @@ private function exception2() */ private function isShort() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) ); } @@ -567,7 +562,7 @@ private function isShort() */ private function searchShortSyllabe($from, $nbLetters) { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($from < 0) { $from = $length + $from; @@ -581,8 +576,8 @@ private function searchShortSyllabe($from, $nbLetters) return false; } - $first = UTF8::substr($this->word, $from, 1); - $second = UTF8::substr($this->word, ($from+1), 1); + $first = StringHelper::substr($this->word, $from, 1); + $second = StringHelper::substr($this->word, ($from+1), 1); if ($nbLetters == 2) { if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) { @@ -590,7 +585,7 @@ private function searchShortSyllabe($from, $nbLetters) } } - $third = UTF8::substr($this->word, ($from+2), 1); + $third = StringHelper::substr($this->word, ($from+2), 1); if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) { diff --git a/src/Stemmer/Finnish.php b/src/Stemmer/Finnish.php index 25539b2..c6487b5 100644 --- a/src/Stemmer/Finnish.php +++ b/src/Stemmer/Finnish.php @@ -6,7 +6,7 @@ */ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * Finnish Snowball Stemmer. @@ -38,12 +38,7 @@ class Finnish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (! UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = Utf8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R1 and R2 are then defined in the usual way $this->r1(); @@ -74,10 +69,10 @@ private function step1() // (a) kin kaan kään ko kö han hän pa pä // delete if preceded by n, t or a vowel if (($position = $this->searchIfInR1(array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($lastLetter, array_merge(['t', 'n'], self::$vowels))) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -89,7 +84,7 @@ private function step1() // delete if in R2 if (($position = $this->searchIfInR1(array('sti'))) !== false) { if ($this->inR2($position)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -111,10 +106,10 @@ private function step2() // si // delete if not preceded by k if (($position = $this->searchIfInR1(array('si'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter !== 'k') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -124,7 +119,7 @@ private function step2() // ni // delete if (($position = $this->searchIfInR1(array('ni'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); // if preceded by kse, replace with ksi if ( ($position = $this->search(array('kse'))) !== false) { $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word); @@ -137,7 +132,7 @@ private function step2() // nsa nsä mme nne // delete if (($position = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -146,9 +141,9 @@ private function step2() // an // delete if preceded by one of ta ssa sta lla lta na if (($position = $this->searchIfInR1(array('an'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - $lastThreeLetters = Utf8::substr($word, -3, 3); - $lastTwoLetters = Utf8::substr($word, -2, 2); + $word = StringHelper::substr($this->word, 0, $position); + $lastThreeLetters = StringHelper::substr($word, -3, 3); + $lastTwoLetters = StringHelper::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssa', 'sta', 'lla', 'lta'), true) || in_array($lastTwoLetters, array('na', 'ta'), true)) { $this->word = $word; $this->r1(); @@ -160,9 +155,9 @@ private function step2() // än // delete if preceded by one of tä ssä stä llä ltä nä if (($position = $this->searchIfInR1(array('än'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - $lastThreeLetters = Utf8::substr($word, -3, 3); - $lastTwoLetters = Utf8::substr($word, -2, 2); + $word = StringHelper::substr($this->word, 0, $position); + $lastThreeLetters = StringHelper::substr($word, -3, 3); + $lastTwoLetters = StringHelper::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssä', 'stä', 'llä', 'ltä'), true) || in_array($lastTwoLetters, array('nä', 'tä'), true)) { $this->word = $word; $this->r1(); @@ -174,9 +169,9 @@ private function step2() // en // delete if preceded by one of lle ine if (($position = $this->searchIfInR1(array('en'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - if (Utf8::strlen($this->word) > 4) { - $lastThreeLetters = Utf8::substr($this->word, -5, 3); + $word = StringHelper::substr($this->word, 0, $position); + if (StringHelper::strlen($this->word) > 4) { + $lastThreeLetters = StringHelper::substr($this->word, -5, 3); if (in_array($lastThreeLetters, array('lle', 'ine'), true)) { $this->word = $word; $this->r1(); @@ -204,9 +199,9 @@ private function step3() continue; } if (($position = $this->searchIfInR1(array('h' . $vowel . 'n'))) !== false) { - $lastLetter = Utf8::substr($this->word, $position-1, 1); + $lastLetter = StringHelper::substr($this->word, $position-1, 1); if ($lastLetter === $vowel) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -218,11 +213,11 @@ private function step3() // siin den tten // delete if preceded by Vi if (($position = $this->searchIfInR1(array('siin', 'den', 'tten'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter === 'i') { - $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); + $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); if (in_array($nextLastLetter, self::$restrictedVowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -234,10 +229,10 @@ private function step3() // seen // delete if preceded by LV if (($position = $this->searchIfInR1(array('seen'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -248,10 +243,10 @@ private function step3() // tta ttä // delete if preceded by e if (($position = $this->searchIfInR1(array('tta', 'ttä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter === 'e') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -262,7 +257,7 @@ private function step3() // ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine // delete if (($position = $this->searchIfInR1(array('ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä', 'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -272,11 +267,11 @@ private function step3() // a ä // delete if preceded by cv if (($position = $this->searchIfInR1(array('a', 'ä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); - $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); if (in_array($lastLetter, self::$vowels, true) && in_array($nextLastLetter, self::$consonants, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -287,12 +282,12 @@ private function step3() // n // delete, and if preceded by LV or ie, delete the last vowel if (($position = $this->searchIfInR1(array('n'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true) || $lastLetters === 'ie') { - $this->word = Utf8::substr($this->word, 0, $position-1); + $this->word = StringHelper::substr($this->word, 0, $position-1); } else { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } $this->r1(); $this->r2(); @@ -314,9 +309,9 @@ private function step4() // mpi mpa mpä mmi mma mmä // delete if not preceded by po if (($position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if ($lastLetters !== 'po') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -326,7 +321,7 @@ private function step4() // impi impa impä immi imma immä eja ejä // delete if (($position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -347,27 +342,27 @@ private function step5() { if ($this->_removedInStep3) { if (($position = $this->searchIfInR1(array('i', 'j'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; } } else { if (($position = $this->searchIfInR1(array('t'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($lastLetter, self::$vowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); if (($position2 = $this->searchIfInR2(array('imma'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; } elseif (($position2 = $this->searchIfInR2(array('mma'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position2-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position2-2), 2); if ($lastLetters !== 'po') { - $this->word = Utf8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; @@ -390,35 +385,35 @@ private function step6() // a) If R1 ends LV // delete the last letter if (($position = $this->searchIfInR1(self::$longVowels)) !== false) { - $this->word = Utf8::substr($this->word, 0, $position+1); + $this->word = StringHelper::substr($this->word, 0, $position+1); $this->r1(); $this->r2(); } // b) If R1 ends cX, c a consonant and X one of a ä e i, // delete the last letter - $lastLetter = Utf8::substr($this->r1, -1, 1); - $secondToLastLetter = Utf8::substr($this->r1, -2, 1); + $lastLetter = StringHelper::substr($this->r1, -1, 1); + $secondToLastLetter = StringHelper::substr($this->r1, -2, 1); if (in_array($secondToLastLetter, self::$consonants, true) && in_array($lastLetter, array('a', 'e', 'i', 'ä'))) { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // c) If R1 ends oj or uj // delete the last letter - $twoLastLetters = Utf8::substr($this->r1, -2, 2); + $twoLastLetters = StringHelper::substr($this->r1, -2, 2); if (in_array($twoLastLetters, array('oj', 'uj'))) { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // d) If R1 ends jo // delete the last letter - $twoLastLetters = Utf8::substr($this->r1, -2, 2); + $twoLastLetters = StringHelper::substr($this->r1, -2, 2); if ($twoLastLetters === 'jo') { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } @@ -427,15 +422,15 @@ private function step6() // vowels, remove the last consonant (so eläkk -> eläk, // aatonaatto -> aatonaato) $endVowels = ''; - for ($i = Utf8::strlen($this->word) - 1; $i > 0; $i--) { - $letter = Utf8::substr($this->word, $i, 1); + for ($i = StringHelper::strlen($this->word) - 1; $i > 0; $i--) { + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels, true)) { $endVowels = $letter . $endVowels; } else { // check for double consonant - $prevLetter = Utf8::substr($this->word, $i-1, 1); + $prevLetter = StringHelper::substr($this->word, $i-1, 1); if ($prevLetter === $letter) { - $this->word = Utf8::substr($this->word, 0, $i) . $endVowels; + $this->word = StringHelper::substr($this->word, 0, $i) . $endVowels; } break; } diff --git a/src/Stemmer/French.php b/src/Stemmer/French.php index 8e1ee96..2bc53ca 100644 --- a/src/Stemmer/French.php +++ b/src/Stemmer/French.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class French extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -96,7 +91,7 @@ private function step1() // delete if in R2 if ( ($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 3; } @@ -106,10 +101,10 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU if ( ($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -150,9 +145,9 @@ private function step1() if ( ($position = $this->search(array('issements', 'issement'))) != false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return 3; @@ -168,20 +163,20 @@ private function step1() // delete if in RV if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, } elseif ( ($position = $this->search(array('eus'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(eus)$#u', 'eux', $this->word); @@ -189,7 +184,7 @@ private function step1() // if preceded by abl or iqU, delete if in R2, otherwise, } elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); // if preceded by ièr or Ièr, replace by i if in RV } elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) { @@ -207,13 +202,13 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, delete if in R2, else replace by abl, otherwise, if ( ($position = $this->search(array('abil'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(abil)$#u', 'abl', $this->word); } @@ -221,14 +216,14 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU, otherwise, } elseif ( ($position = $this->search(array('ic'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } // if preceded by iv, delete if in R2 } elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 3; @@ -240,15 +235,15 @@ private function step1() if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -278,7 +273,7 @@ private function step1() // delete if in R2, else replace by eux if in R1 if ( ($position = $this->search(array('euses', 'euse'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word); @@ -309,9 +304,9 @@ private function step1() // delete if preceded by a vowel in RV if ( ($position = $this->search(array('ments', 'ment'))) != false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 2; @@ -337,9 +332,9 @@ private function step2a() 'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (!in_array($letter, self::$vowels)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -360,7 +355,7 @@ private function step2b() 'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez', 'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -373,12 +368,12 @@ private function step2b() 'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && ($letter == 'e') ) { - $this->word = UTF8::substr($this->word, 0, $before); + $this->word = StringHelper::substr($this->word, 0, $before); } else { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -388,7 +383,7 @@ private function step2b() // delete if in R2 if ( ($position = $this->searchIfInRv(array('ions'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -413,7 +408,7 @@ private function step4() { //If the word ends s, not preceded by a, i, o, u, è or s, delete it. if (preg_match('#[^aiouès]s$#', $this->word)) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } // In the rest of step 4, all tests are confined to the RV region. @@ -421,9 +416,9 @@ private function step4() // delete if in R2 and preceded by s or t if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -438,7 +433,7 @@ private function step4() // e // delete if ( ($this->searchIfInRv(array('e'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } @@ -446,7 +441,7 @@ private function step4() // if preceded by gu, delete if ( ($position = $this->searchIfInRv(array('guë'))) !== false) { if ($this->inRv($position+2)) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } } @@ -461,7 +456,7 @@ private function step4() private function step5() { if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -480,7 +475,7 @@ private function step6() */ private function finish() { - $this->word = UTF8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); + $this->word = str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); } /** @@ -491,7 +486,7 @@ private function finish() */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -501,28 +496,28 @@ protected function rv() } // If the word begins with two vowels, RV is the region after the third letter - $first = UTF8::substr($this->word, 0, 1); - $second = UTF8::substr($this->word, 1, 1); + $first = StringHelper::substr($this->word, 0, 1); + $second = StringHelper::substr($this->word, 1, 1); if ( (in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) ) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } // (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) - $begin3 = UTF8::substr($this->word, 0, 3); + $begin3 = StringHelper::substr($this->word, 0, 3); if (in_array($begin3, array('par', 'col', 'tap'))) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } // otherwise the region after the first vowel not at the beginning of the word, for ($i=1; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = UTF8::substr($this->word, ($i + 1)); + $this->rv = StringHelper::substr($this->word, ($i + 1)); $this->rvIndex = $i + 1; return true; } diff --git a/src/Stemmer/German.php b/src/Stemmer/German.php index 4dc81a3..11dc733 100644 --- a/src/Stemmer/German.php +++ b/src/Stemmer/German.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -26,17 +26,12 @@ class German extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - $this->plainVowels = implode('', self::$vowels); - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, replace ß by ss - $this->word = UTF8::str_replace('ß', 'ss', $this->word); + $this->word = str_replace('ß', 'ss', $this->word); // put u and y between vowels into upper case $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); @@ -49,7 +44,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } $this->step1(); @@ -68,7 +63,7 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('em', 'ern', 'er'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -76,11 +71,11 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('es', 'en', 'e'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); //If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s if ($this->search(array('niss')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } return true; @@ -90,10 +85,10 @@ private function step1() if ( ($position = $this->search(array('s'))) !== false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (in_array($letter, self::$sEndings)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return true; @@ -111,7 +106,7 @@ private function step2() // delete if in R1 if ( ($position = $this->search(array('en', 'er', 'est'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -122,10 +117,10 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; if ($before >= 3) { - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (in_array($letter, self::$stEndings)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } @@ -144,15 +139,15 @@ private function step3() // if preceded by ig, delete if in R2 and not preceded by e if ( ($position = $this->search(array('end', 'ung'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('ig'))) !== false) { $before = $position2 - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( ($this->inR2($position2)) && ($letter != 'e') ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -162,10 +157,10 @@ private function step3() // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig', 'ik', 'isch'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( ($this->inR2($position)) && ($letter != 'e') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -175,12 +170,12 @@ private function step3() // if preceded by er or en, delete if in R1 if ( ($position = $this->search(array('lich', 'heit'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('er', 'en'))) !== false) { if ($this->inR1($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -191,12 +186,12 @@ private function step3() // if preceded by lich or ig, delete if in R2 if ( ($position = $this->search(array('keit'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('lich', 'ig'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -211,6 +206,6 @@ private function step3() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); + $this->word = str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); } } diff --git a/src/Stemmer/Italian.php b/src/Stemmer/Italian.php index bb09dee..4bb2004 100644 --- a/src/Stemmer/Italian.php +++ b/src/Stemmer/Italian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,17 +22,12 @@ class Italian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - $this->plainVowels = implode('', self::$vowels); - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, replace all acute accents by grave accents. - $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); + $this->word = str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then $this->word = preg_replace('#([q])u#u', '$1U', $this->word); @@ -72,7 +67,7 @@ private function step0() 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'))) !== false) { - $suffixe = UTF8::substr($this->word, $position); + $suffixe = StringHelper::substr($this->word, $position); // following one of (in RV) // a @@ -82,7 +77,7 @@ private function step0() }, $a); // In case of (a) the suffix is deleted if ($this->searchIfInRv($a) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } //b @@ -112,19 +107,19 @@ private function step1() // if preceded by os, ic or abil, delete if in R2 if ( ($position = $this->search(array('amente'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -137,7 +132,7 @@ private function step1() ))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -147,11 +142,11 @@ private function step1() // if preceded by ic, delete if in R2 if ( ($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } } @@ -189,7 +184,7 @@ private function step1() // delete if in RV if ( ($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -199,11 +194,11 @@ private function step1() // if preceded by abil, ic or iv, delete if in R2 if ( ($position = $this->search(array('ità'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -213,13 +208,13 @@ private function step1() // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2) if ( ($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } } return true; @@ -243,7 +238,7 @@ private function step2() 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva', 'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -254,10 +249,10 @@ private function step2() private function step3a() { if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); if ($this->searchIfInRv(array('i')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } @@ -284,6 +279,6 @@ private function step3b() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Norwegian.php b/src/Stemmer/Norwegian.php index b44b722..627a578 100644 --- a/src/Stemmer/Norwegian.php +++ b/src/Stemmer/Norwegian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Norwegian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -56,12 +51,12 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); if (in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'))) { return true; } if ($lastLetter == 'k') { - $beforeLetter = UTF8::substr($word, -2, 1); + $beforeLetter = StringHelper::substr($word, -2, 1); if (!in_array($beforeLetter, self::$vowels)) { return true; } @@ -88,14 +83,14 @@ private function step1() 'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane', 'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -110,7 +105,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('dt', 'vt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -124,7 +119,7 @@ private function step3() if ( ($position = $this->searchIfInR1(array( 'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } diff --git a/src/Stemmer/Portuguese.php b/src/Stemmer/Portuguese.php index c71cc59..c5f3aae 100644 --- a/src/Stemmer/Portuguese.php +++ b/src/Stemmer/Portuguese.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,14 +22,9 @@ class Portuguese extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); - $this->word = UTF8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); + $this->word = str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); $this->rv(); $this->r1(); @@ -66,7 +61,7 @@ private function step1() 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -106,19 +101,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -130,12 +125,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by ante, avel or ível, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -147,12 +142,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -164,12 +159,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -180,7 +175,7 @@ private function step1() if ($this->inRv($position)) { $before = $position -1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'e') { $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word); @@ -213,7 +208,7 @@ private function step2() 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou', ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } return false; @@ -227,10 +222,10 @@ private function step3() { // Delete suffix i if in RV and preceded by c if ($this->searchIfInRv(array('i')) !== false) { - $letter = UTF8::substr($this->word, -2, 1); + $letter = StringHelper::substr($this->word, -2, 1); if ($letter == 'c') { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } @@ -244,7 +239,7 @@ private function step4() { // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it if ( ($position = $this->searchIfInRv(array('os', 'a', 'i', 'o','á', 'í', 'ó'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } return false; @@ -257,11 +252,11 @@ private function step5() { // If the word ends with one of "e é ê" in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, delete the u (or i). if ($this->searchIfInRv(array('e', 'é', 'ê')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); if ( ($position2 = $this->search(array('gu', 'ci'))) !== false) { if ($this->inRv(($position2+1))) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } return true; @@ -278,6 +273,6 @@ private function step5() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); + $this->word = str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); } } diff --git a/src/Stemmer/Romanian.php b/src/Stemmer/Romanian.php index 5da8744..87047dc 100644 --- a/src/Stemmer/Romanian.php +++ b/src/Stemmer/Romanian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Romanian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -73,7 +68,7 @@ private function step0() // delete if ( ($position = $this->search(array('ul', 'ului'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -109,7 +104,7 @@ private function step0() // replace with i if not preceded by ab if ( ($position = $this->search(array('ile'))) !== false) { if ($this->inR1($position)) { - $before = UTF8::substr($this->word, ($position-2), 2); + $before = StringHelper::substr($this->word, ($position-2), 2); if ($before != 'ab') { $this->word = preg_replace('#(ile)$#u', 'i', $this->word); @@ -226,7 +221,7 @@ private function step2() 'at', 'os', 'iv', 'ut', 'it', 'ic' ))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -236,9 +231,9 @@ private function step2() if ( ($position = $this->search(array('iune', 'iuni'))) !== false) { if ($this->inR2($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'ţ') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->word = preg_replace('#(ţ)$#u', 't', $this->word); } } @@ -282,10 +277,10 @@ private function step3() if ($this->inRv($position)) { $before = $position - 1; if ($this->inRv($before)) { - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } @@ -301,7 +296,7 @@ private function step3() 'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im' ))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -315,7 +310,7 @@ private function step4() // Search for the longest among the suffixes "a e i ie ă " and, if it is in RV, delete it. if ( ($position = $this->search(array('a', 'ie', 'e', 'i', 'ă'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -329,6 +324,6 @@ private function step4() private function finish() { // Turn I, U back into i, u - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Russian.php b/src/Stemmer/Russian.php index cd18dbf..3949a45 100644 --- a/src/Stemmer/Russian.php +++ b/src/Stemmer/Russian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -56,12 +56,7 @@ class Russian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -88,7 +83,7 @@ private function step1() // group 1 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -96,7 +91,7 @@ private function step1() // group 2 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[1])) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -104,7 +99,7 @@ private function step1() // Otherwise try and remove a REFLEXIVE ending if ( ($position = $this->searchIfInRv(self::$reflexive)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -112,18 +107,18 @@ private function step1() // As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. if ( ($position = $this->searchIfInRv(self::$adjective)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(self::$participle[0])) !== false) { if ( ($this->inRv($position2)) && ($this->checkGroup1($position2)) ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } if ( ($position2 = $this->search(self::$participle[1])) !== false) { if ($this->inRv($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } @@ -134,21 +129,21 @@ private function step1() if ( ($position = $this->searchIfInRv(self::$verb[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$verb[1])) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$noun)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -163,7 +158,7 @@ private function step2() { if ( ($position = $this->searchIfInRv(array('и'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -178,7 +173,7 @@ private function step3() { if ( ($position = $this->searchIfInRv(self::$derivational)) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -192,18 +187,18 @@ private function step4() { // (2) if the word ends with a SUPERLATIVE ending, remove it if ( ($position = $this->searchIfInRv(self::$superlative)) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // (1) Undouble н (n) if ( ($position = $this->searchIfInRv(array('нн'))) !== false) { - $this->word = UTF8::substr($this->word, 0, ($position+1)); + $this->word = StringHelper::substr($this->word, 0, ($position+1)); return true; } // (3) if the word ends ь (') (soft sign) remove it if ( ($position = $this->searchIfInRv(array('ь'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -213,15 +208,15 @@ private function step4() */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; for ($i=0; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); $this->rvIndex = $i + 1; return true; } @@ -242,7 +237,7 @@ private function checkGroup1($position) return false; } - $letter = UTF8::substr($this->word, ($position - 1), 1); + $letter = StringHelper::substr($this->word, ($position - 1), 1); if ($letter == 'а' || $letter == 'я') { return true; diff --git a/src/Stemmer/Spanish.php b/src/Stemmer/Spanish.php index 4f6f2c8..b83c040 100644 --- a/src/Stemmer/Spanish.php +++ b/src/Stemmer/Spanish.php @@ -2,7 +2,8 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; +use Wamania\Snowball\Transliterate; /** * @@ -22,12 +23,7 @@ class Spanish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->rv(); $this->r1(); @@ -71,7 +67,7 @@ public function stem($word) private function step0() { if ( ($position = $this->searchIfInRv(array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' ))) != false) { - $suffixe = UTF8::substr($this->word, $position); + $suffixe = StringHelper::substr($this->word, $position); // a $a = array('iéndo', 'ándo', 'ár', 'ér', 'ír'); @@ -80,11 +76,11 @@ private function step0() }, $a); if ( ($position2 = $this->searchIfInRv($a)) !== false) { - $suffixe2 = UTF8::substr($this->word, $position2); - $suffixe2 = UTF8::to_utf8(UTF8::to_ascii($suffixe2)); // unaccent - $this->word = UTF8::substr($this->word, 0, $position2); + $suffixe2 = StringHelper::substr($this->word, $position2); + $suffixe2 = Transliterate::utf8_latin_to_ascii($suffixe2); // unaccent + $this->word = StringHelper::substr($this->word, 0, $position2); $this->word .= $suffixe2; - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -95,15 +91,15 @@ private function step0() }, $b); if ( ($position2 = $this->searchIfInRv($b)) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // c if ( ($position2 = $this->searchIfInRv(array('yendo' . $suffixe))) != false) { - $before = UTF8::substr($this->word, ($position2-1), 1); + $before = StringHelper::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -125,7 +121,7 @@ private function step1() 'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -137,11 +133,11 @@ private function step1() 'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('ic')))) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -181,19 +177,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -205,12 +201,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by ante, able or ible, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -222,12 +218,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -239,12 +235,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -262,9 +258,9 @@ private function step2a() if ( ($position = $this->searchIfInRv(array( 'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'))) != false) { - $before = UTF8::substr($this->word, ($position-1), 1); + $before = StringHelper::substr($this->word, ($position-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -289,17 +285,17 @@ private function step2b() 'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará', 'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an' ))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // en es éis emos // delete, and if preceded by gu delete the u (the gu need not be in RV) if ( ($position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('gu'))) != false) { - $this->word = UTF8::substr($this->word, 0, ($position2+1)); + $this->word = StringHelper::substr($this->word, 0, ($position2+1)); } @@ -316,19 +312,19 @@ private function step3() // os a o á í ó // delete if in RV if ( ($position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // e é // delete if in RV, and if preceded by gu with the u in RV delete the u if ( ($position = $this->searchIfInRv(array('e', 'é'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInRv(array('u'))) != false) { - $before = UTF8::substr($this->word, ($position2-1), 1); + $before = StringHelper::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'g') ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } @@ -343,6 +339,6 @@ private function step3() */ private function finish() { - $this->word = UTF8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); + $this->word = str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); } } diff --git a/src/Stemmer/Stem.php b/src/Stemmer/Stem.php index 0c6f148..1ce7274 100644 --- a/src/Stemmer/Stem.php +++ b/src/Stemmer/Stem.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; abstract class Stem implements Stemmer { @@ -94,12 +94,12 @@ protected function searchIfInR2($suffixes) protected function search($suffixes, $offset = 0) { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($offset > $length) { return false; } foreach ($suffixes as $suffixe) { - if ( (($position = UTF8::strrpos($this->word, $suffixe, $offset)) !== false) && ((Utf8::strlen($suffixe)+$position) == $length) ) { + if ( (($position = StringHelper::strrpos($this->word, $suffixe, $offset)) !== false) && ((StringHelper::strlen($suffixe)+$position) == $length) ) { return $position; } } @@ -134,7 +134,7 @@ protected function r2() */ protected function rx($in) { - $length = UTF8::strlen($in); + $length = StringHelper::strlen($in); // defaults $value = ''; @@ -143,7 +143,7 @@ protected function rx($in) // we search all vowels $vowels = array(); for ($i=0; $i<$length; $i++) { - $letter = UTF8::substr($in, $i, 1); + $letter = StringHelper::substr($in, $i, 1); if (in_array($letter, static::$vowels)) { $vowels[] = $i; } @@ -152,11 +152,11 @@ protected function rx($in) // search the non-vowel following a vowel foreach ($vowels as $position) { $after = $position + 1; - $letter = UTF8::substr($in, $after, 1); + $letter = StringHelper::substr($in, $after, 1); if (! in_array($letter, static::$vowels)) { $index = $after + 1; - $value = UTF8::substr($in, ($after+1)); + $value = StringHelper::substr($in, ($after+1)); break; } @@ -175,7 +175,7 @@ protected function rx($in) */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -184,16 +184,16 @@ protected function rv() return true; } - $first = UTF8::substr($this->word, 0, 1); - $second = UTF8::substr($this->word, 1, 1); + $first = StringHelper::substr($this->word, 0, 1); + $second = StringHelper::substr($this->word, 1, 1); // If the second letter is a consonant, RV is the region after the next following vowel, if (!in_array($second, static::$vowels)) { for ($i=2; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); return true; } } @@ -202,10 +202,10 @@ protected function rv() // or if the first two letters are vowels, RV is the region after the next consonant, if ( (in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { for ($i=2; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (! in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); return true; } } @@ -213,7 +213,7 @@ protected function rv() // and otherwise (consonant-vowel case) RV is the region after the third letter. if ( (! in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } diff --git a/src/Stemmer/Swedish.php b/src/Stemmer/Swedish.php index 32352ef..ed8103c 100644 --- a/src/Stemmer/Swedish.php +++ b/src/Stemmer/Swedish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Swedish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -55,7 +50,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y')); } @@ -74,14 +69,14 @@ private function step1() 'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het', 'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -96,7 +91,7 @@ private function step2() { // dd gd nn dt gt kt tt if ($this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -109,21 +104,21 @@ private function step3() // lig ig els // delete if ( ($position = $this->searchIfInR1(array('lig', 'ig', 'els'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // löst // replace with lös if ( ($this->searchIfInR1(array('löst'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } // fullt // replace with full if ( ($this->searchIfInR1(array('fullt'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } } diff --git a/src/StemmerFactory.php b/src/StemmerFactory.php index d60a8c6..b8c487a 100644 --- a/src/StemmerFactory.php +++ b/src/StemmerFactory.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball; -use voku\helper\UTF8; +use Joomla\String\StringHelper; use Wamania\Snowball\Stemmer\Catalan; use Wamania\Snowball\Stemmer\Danish; use Wamania\Snowball\Stemmer\Dutch; @@ -43,7 +43,7 @@ class StemmerFactory */ public static function create(string $code): Stemmer { - $code = UTF8::strtolower($code); + $code = StringHelper::strtolower($code); foreach (self::LANGS as $classname => $isoCodes) { if (in_array($code, $isoCodes)) { diff --git a/src/Transliterate.php b/src/Transliterate.php new file mode 100644 index 0000000..3399f6b --- /dev/null +++ b/src/Transliterate.php @@ -0,0 +1,253 @@ + 'a', + 'ô' => 'o', + 'ď' => 'd', + 'ḟ' => 'f', + 'ë' => 'e', + 'š' => 's', + 'ơ' => 'o', + 'ß' => 'ss', + 'ă' => 'a', + 'ř' => 'r', + 'ț' => 't', + 'ň' => 'n', + 'ā' => 'a', + 'ķ' => 'k', + 'ŝ' => 's', + 'ỳ' => 'y', + 'ņ' => 'n', + 'ĺ' => 'l', + 'ħ' => 'h', + 'ṗ' => 'p', + 'ó' => 'o', + 'ú' => 'u', + 'ě' => 'e', + 'é' => 'e', + 'ç' => 'c', + 'ẁ' => 'w', + 'ċ' => 'c', + 'õ' => 'o', + 'ṡ' => 's', + 'ø' => 'o', + 'ģ' => 'g', + 'ŧ' => 't', + 'ș' => 's', + 'ė' => 'e', + 'ĉ' => 'c', + 'ś' => 's', + 'î' => 'i', + 'ű' => 'u', + 'ć' => 'c', + 'ę' => 'e', + 'ŵ' => 'w', + 'ṫ' => 't', + 'ū' => 'u', + 'č' => 'c', + 'ö' => 'oe', + 'è' => 'e', + 'ŷ' => 'y', + 'ą' => 'a', + 'ł' => 'l', + 'ų' => 'u', + 'ů' => 'u', + 'ş' => 's', + 'ğ' => 'g', + 'ļ' => 'l', + 'ƒ' => 'f', + 'ž' => 'z', + 'ẃ' => 'w', + 'ḃ' => 'b', + 'å' => 'a', + 'ì' => 'i', + 'ï' => 'i', + 'ḋ' => 'd', + 'ť' => 't', + 'ŗ' => 'r', + 'ä' => 'ae', + 'í' => 'i', + 'ŕ' => 'r', + 'ê' => 'e', + 'ü' => 'ue', + 'ò' => 'o', + 'ē' => 'e', + 'ñ' => 'n', + 'ń' => 'n', + 'ĥ' => 'h', + 'ĝ' => 'g', + 'đ' => 'd', + 'ĵ' => 'j', + 'ÿ' => 'y', + 'ũ' => 'u', + 'ŭ' => 'u', + 'ư' => 'u', + 'ţ' => 't', + 'ý' => 'y', + 'ő' => 'o', + 'â' => 'a', + 'ľ' => 'l', + 'ẅ' => 'w', + 'ż' => 'z', + 'ī' => 'i', + 'ã' => 'a', + 'ġ' => 'g', + 'ṁ' => 'm', + 'ō' => 'o', + 'ĩ' => 'i', + 'ù' => 'u', + 'į' => 'i', + 'ź' => 'z', + 'á' => 'a', + 'û' => 'u', + 'þ' => 'th', + 'ð' => 'dh', + 'æ' => 'ae', + 'µ' => 'u', + 'ĕ' => 'e', + 'œ' => 'oe', + ]; + } + + $string = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $string); + } + + if ($case >= 0) { + if (\is_null($UTF8_UPPER_ACCENTS)) { + $UTF8_UPPER_ACCENTS = [ + 'À' => 'A', + 'Ô' => 'O', + 'Ď' => 'D', + 'Ḟ' => 'F', + 'Ë' => 'E', + 'Š' => 'S', + 'Ơ' => 'O', + 'Ă' => 'A', + 'Ř' => 'R', + 'Ț' => 'T', + 'Ň' => 'N', + 'Ā' => 'A', + 'Ķ' => 'K', + 'Ŝ' => 'S', + 'Ỳ' => 'Y', + 'Ņ' => 'N', + 'Ĺ' => 'L', + 'Ħ' => 'H', + 'Ṗ' => 'P', + 'Ó' => 'O', + 'Ú' => 'U', + 'Ě' => 'E', + 'É' => 'E', + 'Ç' => 'C', + 'Ẁ' => 'W', + 'Ċ' => 'C', + 'Õ' => 'O', + 'Ṡ' => 'S', + 'Ø' => 'O', + 'Ģ' => 'G', + 'Ŧ' => 'T', + 'Ș' => 'S', + 'Ė' => 'E', + 'Ĉ' => 'C', + 'Ś' => 'S', + 'Î' => 'I', + 'Ű' => 'U', + 'Ć' => 'C', + 'Ę' => 'E', + 'Ŵ' => 'W', + 'Ṫ' => 'T', + 'Ū' => 'U', + 'Č' => 'C', + 'Ö' => 'Oe', + 'È' => 'E', + 'Ŷ' => 'Y', + 'Ą' => 'A', + 'Ł' => 'L', + 'Ų' => 'U', + 'Ů' => 'U', + 'Ş' => 'S', + 'Ğ' => 'G', + 'Ļ' => 'L', + 'Ƒ' => 'F', + 'Ž' => 'Z', + 'Ẃ' => 'W', + 'Ḃ' => 'B', + 'Å' => 'A', + 'Ì' => 'I', + 'Ï' => 'I', + 'Ḋ' => 'D', + 'Ť' => 'T', + 'Ŗ' => 'R', + 'Ä' => 'Ae', + 'Í' => 'I', + 'Ŕ' => 'R', + 'Ê' => 'E', + 'Ü' => 'Ue', + 'Ò' => 'O', + 'Ē' => 'E', + 'Ñ' => 'N', + 'Ń' => 'N', + 'Ĥ' => 'H', + 'Ĝ' => 'G', + 'Đ' => 'D', + 'Ĵ' => 'J', + 'Ÿ' => 'Y', + 'Ũ' => 'U', + 'Ŭ' => 'U', + 'Ư' => 'U', + 'Ţ' => 'T', + 'Ý' => 'Y', + 'Ő' => 'O', + 'Â' => 'A', + 'Ľ' => 'L', + 'Ẅ' => 'W', + 'Ż' => 'Z', + 'Ī' => 'I', + 'Ã' => 'A', + 'Ġ' => 'G', + 'Ṁ' => 'M', + 'Ō' => 'O', + 'Ĩ' => 'I', + 'Ù' => 'U', + 'Į' => 'I', + 'Ź' => 'Z', + 'Á' => 'A', + 'Û' => 'U', + 'Þ' => 'Th', + 'Ð' => 'Dh', + 'Æ' => 'Ae', + 'Ĕ' => 'E', + 'Œ' => 'Oe', + ]; + } + + $string = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $string); + } + + return $string; + } +}