From 251778d7be9ec5136df1f914a70bea2288253f6a Mon Sep 17 00:00:00 2001 From: Roman Parpalak Date: Wed, 21 Aug 2024 13:26:10 +0300 Subject: [PATCH] Updated search query cleaning algorithm to be closer to indexing cleaning algorithm. --- .../Entity/Metadata/SentenceCollection.php | 6 +-- src/S2/Rose/Entity/Query.php | 54 +++++++++---------- src/S2/Rose/Snippet/WordsByStemsExtractor.php | 7 +-- tests/unit/Rose/Entity/QueryTest.php | 43 +++++++++++++++ 4 files changed, 76 insertions(+), 34 deletions(-) diff --git a/src/S2/Rose/Entity/Metadata/SentenceCollection.php b/src/S2/Rose/Entity/Metadata/SentenceCollection.php index a6f450e..e2a5c33 100644 --- a/src/S2/Rose/Entity/Metadata/SentenceCollection.php +++ b/src/S2/Rose/Entity/Metadata/SentenceCollection.php @@ -101,10 +101,10 @@ private function buildWordsInfo(): void */ public static function breakIntoWords(string $content): array { - // Replace comma as decimal separator to dot - $content = preg_replace('/[\s()]\d+\K,(?=\d+(?:[\s()]|\.\s))/', '.', $content); + // Replace decimal separator: ',' -> '.' 
+ $content = preg_replace('#(?:^|[\s()])-?\d+\K,(?=\d+(?:$|[\s()]|\.\s))#', '.', $content); - // We allow letters, digits and some punctuation: ".,-" + // We allow letters, digits and some punctuation: ".,-^_" $content = str_replace(',', ', ', $content); $content = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $content); $content = mb_strtolower($content); diff --git a/src/S2/Rose/Entity/Query.php b/src/S2/Rose/Entity/Query.php index fd5d2ff..325d181 100644 --- a/src/S2/Rose/Entity/Query.php +++ b/src/S2/Rose/Entity/Query.php @@ -115,46 +115,44 @@ public function valueToArray() // Normalize $content = str_replace(['«', '»', '“', '”', '‘', '’'], '"', $content); - $content = str_replace(['---', '--', '–', '−'], '—', $content); + $content = str_replace('−', '-', $content); // Replace minus sign with a hyphen + $content = str_replace(['---', '–', '−'], '—', $content); // Normalize dashes $content = preg_replace('#,\\s+,#u', ',,', $content); $content = preg_replace('#[^\\-\\p{L}0-9^_.,()";?!…:—]+#iu', ' ', $content); - $content = preg_replace('#\\n+#', ' ', $content); - $content = preg_replace('#\\s+#u', ' ', $content); $content = mb_strtolower($content); - $content = preg_replace('#(,+)#u', '\\1 ', $content); + // Replace decimal separators: ',' -> '.' 
+ $content = preg_replace('#(?<=^|\\s)(\\-?\\d+),(\\d+)(?=\\s|$)#u', '\\1.\\2', $content); - $content = preg_replace('#[ |\\/]+#', ' ', $content); - - $words = explode(' ', $content); - foreach ($words as $k => $v) { - // Separate special chars from the letter combination - if (strlen($v) > 1) { - foreach (['—', '^', '(', ')', '"', ':', '?', '!'] as $specialChar) { - if (mb_substr($v, 0, 1) == $specialChar || mb_substr($v, -1) == $specialChar) { - $words[$k] = str_replace($specialChar, '', $v); - $words[] = $specialChar; - } - } + // Separate special chars at the beginning of the word + while (true) { + $content = preg_replace('#(?:^|\\s)\K([—^()"?:!])(?=[^\s])#u', '\\1 ', $content, -1, $count); + if ($count === 0) { + break; } + } - // Separate hyphen from the letter combination - if (strlen($v) > 1 && (substr($v, 0, 1) == '-' || substr($v, -1) == '-')) { - $words[$k] = str_replace('-', '', $v); - $words[] = '-'; + // Separate special chars at the end of the word + while (true) { + $content = preg_replace('#(?<=[^\s])([—^()"?:!])(?=\\s|$)#u', ' \\1', $content, -1, $count); + if ($count === 0) { + break; } + } - // Replace 'ё' inside words - if (false !== strpos($v, 'ё') && $v != 'ё') { - $words[$k] = str_replace('ё', 'е', $v); - } + // Separate groups of commas + $content = preg_replace('#(,+)#u', ' \\1 ', $content); - // Remove ',' - if (preg_match('#^[^,]+,$#u', $v) || preg_match('#^,[^,]+$#u', $v)) { - $words[$k] = str_replace(',', '', $v); - $words[] = ','; + $words = preg_split('#\\s+#', $content); + foreach ($words as $k => &$v) { + // Replace 'ё' inside words + if ($v !== 'ё' && false !== strpos($v, 'ё')) { + $v = str_replace('ё', 'е', $v); } } + unset($v); + + $words = array_unique($words); StringHelper::removeLongWords($words); diff --git a/src/S2/Rose/Snippet/WordsByStemsExtractor.php b/src/S2/Rose/Snippet/WordsByStemsExtractor.php index 3f9ca45..16d6dda 100755 --- a/src/S2/Rose/Snippet/WordsByStemsExtractor.php +++ 
b/src/S2/Rose/Snippet/WordsByStemsExtractor.php @@ -1,8 +1,8 @@ $word !== ''), [$text]); } } diff --git a/tests/unit/Rose/Entity/QueryTest.php b/tests/unit/Rose/Entity/QueryTest.php index 4b7fc8c..eed74c9 100644 --- a/tests/unit/Rose/Entity/QueryTest.php +++ b/tests/unit/Rose/Entity/QueryTest.php @@ -11,11 +11,13 @@ /** * @group entity + * @group query */ class QueryTest extends Unit { public function testFilterInput(): void { + // Tests for splitting strings by special delimiters $this->assertEquals([1, 2], (new Query('1|||2'))->valueToArray()); $this->assertEquals([1, 2], (new Query('1\\\\\\2'))->valueToArray()); $this->assertEquals(['a', 'b'], (new Query('a/b'))->valueToArray()); @@ -23,5 +25,46 @@ public function testFilterInput(): void $this->assertEquals(['..'], (new Query('..'))->valueToArray()); $this->assertEquals(['...'], (new Query('...'))->valueToArray()); $this->assertEquals(['a..b'], (new Query('a..b'))->valueToArray()); + + // Tests for replacing numbers + $this->assertEquals(['1.2'], (new Query('1,2'))->valueToArray()); + // $this->assertEquals(['-1.2'], (new Query('-1,2'))->valueToArray()); + $this->assertEquals(['1.2'], (new Query('1.2'))->valueToArray()); + + // Tests for replacing typographic quotes + $this->assertEquals(['"', 'text'], (new Query('«text»'))->valueToArray()); + $this->assertEquals(['"', 'text'], (new Query('“text”'))->valueToArray()); + + // Tests for replacing dashes + $this->assertEquals(['a--b'], (new Query('a--b'))->valueToArray()); + $this->assertEquals(['a—b'], (new Query('a---b'))->valueToArray()); // --- to mdash + $this->assertEquals(['a—b'], (new Query('a–b'))->valueToArray()); // ndash to mdash + $this->assertEquals(['a-b'], (new Query('a−b'))->valueToArray()); // Minus to hyphen + + // Test for replacing line breaks and extra spaces + $this->assertEquals(['a', 'b'], (new Query("a\n\nb"))->valueToArray()); + $this->assertEquals(['a', 'b'], (new Query("a \t b"))->valueToArray()); + + // Tests for separating special 
characters + $this->assertEquals(['a!b'], (new Query('a!b'))->valueToArray()); + $this->assertEquals(['!', 'ab'], (new Query('!ab'))->valueToArray()); + $this->assertEquals(['!', 'a!b'], (new Query('!a!b'))->valueToArray()); + $this->assertEquals(['(', 'word', ')'], (new Query('(word)'))->valueToArray()); + $this->assertEquals(['mysql', '--all-databases'], (new Query('mysql --all-databases'))->valueToArray()); + + // Test for replacing "ё" with "е" + $this->assertEquals(['ё', 'полет', 'field'], (new Query('ё полёт field'))->valueToArray()); + + // Tests for handling commas + $this->assertEquals(['a', ',', 'b'], (new Query('a,b'))->valueToArray()); + $this->assertEquals(['a', ',,', 'b'], (new Query('a,,b'))->valueToArray()); + $this->assertEquals(['a', ',,,', 'b'], (new Query('a,,,b'))->valueToArray()); + + // Tests for removing long words + $this->assertEquals(['a', 'c'], (new Query('a ' . str_repeat('b', 101) . ' c'))->valueToArray()); + + // Tests for compatibility of multiple rules + $this->assertEquals(['a—b', '"', 'text'], (new Query('a–b «text»'))->valueToArray()); + $this->assertEquals(['a', ',', 'b'], (new Query(" a, \n b "))->valueToArray()); } }