Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add auto-embedding fields #56

Merged
merged 1 commit into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions docs/02_Mapping.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ biblioverse_typesense:
facet: true
- name: owners
type: string[]

```

Configuring the mapping involves the following functionalities:
Expand All @@ -29,6 +30,7 @@ Limitation:
* Only entities with a single identifier are supported. PRs are welcome to support composite keys.



# Types

Types such as `string` or `string[]` are the same as the Typesense types. You can see the full list of types in the [Typesense documentation](https://typesense.org/docs/0.21.0/api/collections.html#schema-fields).
Expand Down Expand Up @@ -59,3 +61,47 @@ Config
entity_attribute: typesenseAddress
```

# Auto embedding fields

You can use the Typesense auto embedding fields to generate embeddings.

```yaml
embedding:
name: embedding
type: float[]
index: true
embed:
from: ["field_to_embed"]
model_config:
model_name: "..."
api_key: '...'
url: "..."
```

You can refer to the [Typesense](https://typesense.org/docs/27.1/api/vector-search.html#index-embeddings) documentation for the details of the configuration.

Here is an example of a local Ollama setup that generates embeddings from the "summary" and "tags" fields with the `nomic-embed-text` model.
- Please note: `numDim` should be specified and should correspond to your model's specifications.
- You need to prefix your model name with `openai/` if you are not using Typesense's own embedding engine.

```yaml
embedding:
name: embedding
type: float[]
index: true
mapped: false
numDim: 768
embed:
from: ["tags", "summary"]
model_config:
model_name: "openai/nomic-embed-text"
api_key: '<key>'
url: "http://localhost:11434/"
```


## Mapped attribute

Unless you want to store embeddings in your database, you can specify `mapped: false` in the configuration; the embeddings will then live only in Typesense.

In this case, we recommend excluding these fields from being retrieved in the query.
17 changes: 17 additions & 0 deletions src/AbstractBiblioverseTypesenseBundle.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ private function addCollectionsConfig(ArrayNodeDefinition $arrayNodeDefinition):
->booleanNode('facet')
->defaultNull()
->end()
->booleanNode('mapped')
->defaultTrue()
->end()
->scalarNode('entity_attribute')
->defaultNull()
->end()
Expand Down Expand Up @@ -123,6 +126,20 @@ private function addCollectionsConfig(ArrayNodeDefinition $arrayNodeDefinition):
->scalarNode('vecDist')
->defaultNull()
->end()
->arrayNode('embed')
->children()
->arrayNode('from')
->defaultValue([])->scalarPrototype()->end()
->end()
->arrayNode('model_config')
->children()
->scalarNode('model_name')->isRequired()->end()
->scalarNode('api_key')->defaultNull()->end()
->scalarNode('url')->defaultNull()->end()
->end()
->end()
->end()
->end()
->end()
->end()
->end()
Expand Down
4 changes: 4 additions & 0 deletions src/Mapper/Entity/EntityTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ public function transform(object $entity): array
$data = [];

foreach ($this->mappingGenerator->getMapping()->getFields() as $fieldMapping) {
if ($fieldMapping->isMapped() === false) {
continue;
}

$fieldName = $fieldMapping->getEntityAttribute() ?? $fieldMapping->getName();
$value = $this->valueExtractor->getValue($entity, $fieldName);

Expand Down
28 changes: 26 additions & 2 deletions src/Mapper/Fields/FieldMapping.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,18 @@
* 'locale'?:string|null,
* 'reference'?:string|null,
* 'entity_attribute'?:string|null,
* 'vecDist'?:string|null
* 'vecDist'?:string|null,
* 'embed'?:null|FieldMappingEmbedArray,
* 'mapped'?:bool
* }
* @phpstan-type FieldMappingEmbedArray array{
ragusa87 marked this conversation as resolved.
Show resolved Hide resolved
* 'from':string[],
* 'model_config'?:FieldMappingEmbedModelValueArray,
* }
* @phpstan-type FieldMappingEmbedModelValueArray array{
* 'model_name':string,
* 'api_key'?:string,
* 'url'?:string
* }
*/
class FieldMapping implements FieldMappingInterface
Expand All @@ -30,6 +41,9 @@ class FieldMapping implements FieldMappingInterface

public ?string $entityAttribute = null;

/**
* @param FieldMappingEmbedArray|null $embed
*/
public function __construct(
public string $name,
DataTypeEnum|string $type,
Expand All @@ -46,6 +60,8 @@ public function __construct(
public ?string $locale = null,
public ?string $reference = null,
public ?string $vecDist = null,
public ?array $embed = null,
public bool $mapped = true,
) {
$this->type = $type instanceof DataTypeEnum ? $type->value : $type;
}
Expand All @@ -60,7 +76,7 @@ public function toArray(): array
'infix' => $this->infix,
'locale' => $this->locale,
'name' => $this->name,
'num_dim' => $this->numDim,
'num_dims' => $this->numDim,
'optional' => $this->optional,
'range_index' => $this->rangeIndex,
'reference' => $this->reference,
Expand All @@ -69,6 +85,7 @@ public function toArray(): array
'store' => $this->store,
'type' => $this->type,
'vec_dist' => $this->vecDist,
'embed' => $this->embed,
], fn ($value) => $value !== null);
}

Expand All @@ -82,6 +99,11 @@ public function getName(): string
return $this->name;
}

/**
 * Returns the `mapped` flag this field mapping was constructed with (defaults to true).
 *
 * EntityTransformer::transform() skips every field for which this returns false,
 * so no value is read from the entity for such a field.
 */
public function isMapped(): bool
{
return $this->mapped;
}

public function getEntityAttribute(): ?string
{
return $this->entityAttribute;
Expand All @@ -108,6 +130,8 @@ public static function fromArray(array $config): self
$config['locale'] ?? null,
$config['reference'] ?? null,
$config['vecDist'] ?? null,
$config['embed'] ?? null,
$config['mapped'] ?? true,
);

$result->entityAttribute = $config['entity_attribute'] ?? null;
Expand Down
2 changes: 2 additions & 0 deletions src/Mapper/Fields/FieldMappingInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,6 @@ public function getName(): string;
public function getEntityAttribute(): ?string;

public function isOptional(): bool;

public function isMapped(): bool;
}
4 changes: 2 additions & 2 deletions src/Query/SearchQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public function __construct(
// After this line, we use alphabetical order for the parameters
?array $stopwords = null,
bool|array|null $prefix = null,
private readonly ?VectorQueryInterface $vectorQuery = null,
private readonly string|VectorQueryInterface|null $vectorQuery = null,
private readonly ?VoiceQueryInterface $voiceQuery = null,
private readonly ?bool $enableHighlightV1 = null,
private readonly ?bool $enableLazyFilter = null,
Expand Down Expand Up @@ -197,7 +197,7 @@ public function toArray(): array
'synonym_prefix' => $this->synonymPrefix,
'text_match_type' => $this->textMatchType,
'typo_tokens_threshold' => $this->typoTokensThreshold,
'vector_query' => $this->vectorQuery?->toArray(),
'vector_query' => $this->vectorQuery instanceof VectorQueryInterface ? (string) $this->vectorQuery : $this->vectorQuery,
'voice_query' => $this->voiceQuery instanceof VoiceQueryInterface ? (string) $this->voiceQuery : null,
], fn (mixed $value): bool => !is_null($value));
}
Expand Down
48 changes: 42 additions & 6 deletions src/Query/VectorQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,59 @@ class VectorQuery implements VectorQueryInterface
{
/**
* @param array<float> $queryVector e.g., [0.1, 0.2, 0.3]
* @param string[] $queries
*/
public function __construct(
private readonly string $fieldName,
private readonly array $queryVector, // e.g., [0.1, 0.2, 0.3]
private readonly int $numCandidates, // e.g., 100
private readonly ?float $weight = null, // Optional weight
private readonly ?int $numCandidates = null, // e.g., 100
private readonly ?int $k = null,
private readonly ?string $id = null,
private readonly ?float $weights = null, // Optional weight
private readonly ?float $alpha = null,
private readonly ?float $distanceThreshold = null,
private readonly ?int $flatSearchCutoff = null,
private readonly ?array $queries = null,
private readonly ?int $ef = null,
) {
}

/**
* @return array<string,mixed>
* @return array<string,int|string|float|float[]|string[]|bool>
*/
public function toArray(): array
private function toArray(): array
{
return array_filter([
'query_vector' => $this->queryVector,
'alpha' => $this->alpha,
'id' => $this->id,
'k' => $this->k,
'num_candidates' => $this->numCandidates,
'weight' => $this->weight,
'query_vector' => $this->queryVector,
'distance_threshold' => $this->distanceThreshold,
'queries' => $this->queries,
'query_weights' => $this->weights,
'flat_search_cutoff' => $this->flatSearchCutoff,
'ef' => $this->ef,
], fn ($value): bool => !is_null($value));
}

/**
 * Renders this query in Typesense's `vector_query` string syntax:
 * `fieldName:([v1, v2, ...], option:value, ...)`.
 *
 * Only the options returned by toArray() are emitted (it filters out null values),
 * in the order toArray() returns them.
 *
 * List-valued options are wrapped in square brackets, e.g. `queries:[smart phone, apple ipad]`.
 * Without the brackets, the comma-separated items would be indistinguishable from
 * the comma that separates one option from the next.
 */
public function __toString(): string
{
    $options = $this->toArray();
    // The vector itself is always rendered first, so it must not be repeated as an option.
    unset($options['query_vector']);

    $parts = ['['.implode(', ', $this->queryVector).']'];
    foreach ($options as $name => $value) {
        if (is_array($value)) {
            $value = '['.implode(', ', $value).']';
        }
        $parts[] = "$name:$value";
    }

    return $this->fieldName.':('.implode(', ', $parts).')';
}
}
6 changes: 1 addition & 5 deletions src/Query/VectorQueryInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@

namespace Biblioverse\TypesenseBundle\Query;

interface VectorQueryInterface
interface VectorQueryInterface extends \Stringable
{
/**
* @return array<string,mixed>
*/
public function toArray(): array;
}
16 changes: 16 additions & 0 deletions tests/Query/VectorQueryTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Biblioverse\TypesenseBundle\Tests\Query;

use Biblioverse\TypesenseBundle\Query\VectorQuery;
use PHPUnit\Framework\TestCase;

class VectorQueryTest extends TestCase
{
    /**
     * Converting a VectorQuery to a string yields the field name, then the
     * bracketed vector, then each provided option as `name:value`.
     */
    public function testToString(): void
    {
        $query = new VectorQuery(
            'embedding',
            queryVector: [0.96826, 0.94, 0.39557, 0.306488],
            k: 100,
            flatSearchCutoff: 20,
        );
        $expected = 'embedding:([0.96826, 0.94, 0.39557, 0.306488], k:100, flat_search_cutoff:20)';

        self::assertSame($expected, (string) $query);
    }
}
Loading