Skip to content

Commit

Permalink
Updated search query cleaning algorithm to be closer to indexing cleaning algorithm.
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Aug 21, 2024
1 parent 468d5e7 commit 251778d
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 34 deletions.
6 changes: 3 additions & 3 deletions src/S2/Rose/Entity/Metadata/SentenceCollection.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ private function buildWordsInfo(): void
*/
public static function breakIntoWords(string $content): array
{
// Replace comma as decimal separator to dot
$content = preg_replace('/[\s()]\d+\K,(?=\d+(?:[\s()]|\.\s))/', '.', $content);
// Replace decimal separator: ',' -> '.'
$content = preg_replace('#(?:^|[\s()])-?\d+\K,(?=\d+(?:$|[\s()]|\.\s))#', '.', $content);

// We allow letters, digits and some punctuation: ".,-"
// We allow letters, digits and some punctuation: ".,-^_"
$content = str_replace(',', ', ', $content);
$content = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $content);
$content = mb_strtolower($content);
Expand Down
54 changes: 26 additions & 28 deletions src/S2/Rose/Entity/Query.php
Original file line number Diff line number Diff line change
Expand Up @@ -115,46 +115,44 @@ public function valueToArray()

// Normalize
$content = str_replace(['«', '»', '', '', '', ''], '"', $content);
$content = str_replace(['---', '--', '', ''], '', $content);
$content = str_replace('', '-', $content); // Replace minus sign to a hyphen
$content = str_replace(['---', '', ''], '', $content); // Normalize dashes
$content = preg_replace('#,\\s+,#u', ',,', $content);
$content = preg_replace('#[^\\-\\p{L}0-9^_.,()";?!…:—]+#iu', ' ', $content);
$content = preg_replace('#\\n+#', ' ', $content);
$content = preg_replace('#\\s+#u', ' ', $content);
$content = mb_strtolower($content);

$content = preg_replace('#(,+)#u', '\\1 ', $content);
// Replace decimal separators: ',' -> '.'
$content = preg_replace('#(?<=^|\\s)(\\-?\\d+),(\\d+)(?=\\s|$)#u', '\\1.\\2', $content);

$content = preg_replace('#[ |\\/]+#', ' ', $content);

$words = explode(' ', $content);
foreach ($words as $k => $v) {
// Separate special chars from the letter combination
if (strlen($v) > 1) {
foreach (['', '^', '(', ')', '"', ':', '?', '!'] as $specialChar) {
if (mb_substr($v, 0, 1) == $specialChar || mb_substr($v, -1) == $specialChar) {
$words[$k] = str_replace($specialChar, '', $v);
$words[] = $specialChar;
}
}
// Separate special chars at the beginning of the word
while (true) {
$content = preg_replace('#(?:^|\\s)\K([—^()"?:!])(?=[^\s])#u', '\\1 ', $content, -1, $count);
if ($count === 0) {
break;
}
}

// Separate hyphen from the letter combination
if (strlen($v) > 1 && (substr($v, 0, 1) == '-' || substr($v, -1) == '-')) {
$words[$k] = str_replace('-', '', $v);
$words[] = '-';
// Separate special chars at the end of the word
while (true) {
$content = preg_replace('#(?<=[^\s])([—^()"?:!])(?=\\s|$)#u', ' \\1', $content, -1, $count);
if ($count === 0) {
break;
}
}

// Replace 'ё' inside words
if (false !== strpos($v, 'ё') && $v != 'ё') {
$words[$k] = str_replace('ё', 'е', $v);
}
// Separate groups of commas
$content = preg_replace('#(,+)#u', ' \\1 ', $content);

// Remove ','
if (preg_match('#^[^,]+,$#u', $v) || preg_match('#^,[^,]+$#u', $v)) {
$words[$k] = str_replace(',', '', $v);
$words[] = ',';
$words = preg_split('#\\s+#', $content);
foreach ($words as $k => &$v) {
// Replace 'ё' inside words
if ($v !== 'ё' && false !== strpos($v, 'ё')) {
$v = str_replace('ё', 'е', $v);
}
}
unset($v);

$words = array_unique($words);

StringHelper::removeLongWords($words);

Expand Down
7 changes: 4 additions & 3 deletions src/S2/Rose/Snippet/WordsByStemsExtractor.php
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<?php
/**
* @copyright 2024 Roman Parpalak
* @license MIT
* @package Rose
* @license MIT
* @package Rose
*/

declare(strict_types=1);
Expand Down Expand Up @@ -85,6 +85,7 @@ public function extract(string $text): array
* check each fragment for a match with the searched stem.
*
* @param string $text
*
* @return string[]
*/
private function getWords(string $text): array
Expand All @@ -93,6 +94,6 @@ private function getWords(string $text): array
return [$text];
}

return array_merge(explode('-', $text), [$text]);
return array_merge(array_filter(explode('-', $text), static fn(string $word) => $word !== ''), [$text]);
}
}
43 changes: 43 additions & 0 deletions tests/unit/Rose/Entity/QueryTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,60 @@

/**
 * Checks how a raw search string is turned into a list of words by Query::valueToArray().
 *
 * @group entity
 * @group query
 */
class QueryTest extends Unit
{
    /**
     * Runs a series of raw queries through Query and compares every resulting
     * word list with the expected one. Cases are grouped by the cleaning rule
     * they exercise; the first mismatch fails the test.
     */
    public function testFilterInput(): void
    {
        // Delimiters: pipes, backslashes, slashes and spaces split words; dots do not
        $this->assertQueryWords([1, 2], '1|||2');
        $this->assertQueryWords([1, 2], '1\\\\\\2');
        $this->assertQueryWords(['a', 'b'], 'a/b');
        $this->assertQueryWords(['a', 'b'], ' a b ');
        $this->assertQueryWords(['..'], '..');
        $this->assertQueryWords(['...'], '...');
        $this->assertQueryWords(['a..b'], 'a..b');

        // Numbers: a decimal comma becomes a dot
        $this->assertQueryWords(['1.2'], '1,2');
        // Disabled case, not currently asserted: '-1,2' => ['-1.2']
        $this->assertQueryWords(['1.2'], '1.2');

        // Typographic quotes are reported as a plain double quote
        $this->assertQueryWords(['"', 'text'], '«text»');
        $this->assertQueryWords(['"', 'text'], '“text”');

        // Dashes
        $this->assertQueryWords(['a--b'], 'a--b');
        $this->assertQueryWords(['a—b'], 'a---b'); // three hyphens => mdash
        $this->assertQueryWords(['a—b'], 'a–b');   // ndash => mdash
        $this->assertQueryWords(['a-b'], 'a−b');   // minus sign => hyphen

        // Line breaks and runs of whitespace act as a single separator
        $this->assertQueryWords(['a', 'b'], "a\n\nb");
        $this->assertQueryWords(['a', 'b'], "a \t b");

        // Special characters are detached at word edges only, never inside a word
        $this->assertQueryWords(['a!b'], 'a!b');
        $this->assertQueryWords(['!', 'ab'], '!ab');
        $this->assertQueryWords(['!', 'a!b'], '!a!b');
        $this->assertQueryWords(['(', 'word', ')'], '(word)');
        $this->assertQueryWords(['mysql', '--all-databases'], 'mysql --all-databases');

        // "ё" is replaced by "е" inside words, but kept when it is the whole word
        $this->assertQueryWords(['ё', 'полет', 'field'], 'ё полёт field');

        // Commas: each group of commas becomes a token of its own
        $this->assertQueryWords(['a', ',', 'b'], 'a,b');
        $this->assertQueryWords(['a', ',,', 'b'], 'a,,b');
        $this->assertQueryWords(['a', ',,,', 'b'], 'a,,,b');

        // Overly long words are dropped
        $this->assertQueryWords(['a', 'c'], 'a ' . str_repeat('b', 101) . ' c');

        // Several rules applied to the same input
        $this->assertQueryWords(['a—b', '"', 'text'], 'a–b «text»');
        $this->assertQueryWords(['a', ',', 'b'], " a, \n b ");
    }

    /**
     * Builds a Query from the raw string and asserts that its word list equals the expected one.
     *
     * @param array  $expected Words expected from valueToArray()
     * @param string $rawQuery Search string as a user would type it
     */
    private function assertQueryWords(array $expected, string $rawQuery): void
    {
        $query = new Query($rawQuery);

        $this->assertEquals($expected, $query->valueToArray());
    }
}

0 comments on commit 251778d

Please sign in to comment.