Skip to content

Commit

Permalink
Merge pull request #239 from clulab/kwalcock/vectors
Browse files Browse the repository at this point in the history
Add vectors to elasticsearch
  • Loading branch information
kwalcock authored Feb 15, 2024
2 parents c9ddb0f + 2a132f8 commit 7d5a51d
Show file tree
Hide file tree
Showing 10 changed files with 172 additions and 12 deletions.
47 changes: 47 additions & 0 deletions belief_pipeline/es_vector_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from argparse import ArgumentParser
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

class VectorSearcher:
    """Nearest-neighbor search over sentence vectors stored in elasticsearch.

    Query text is embedded with a SentenceTransformer model and matched
    against the dense-vector field "chatVector" using a knn query.
    """

    def __init__(self, url, username, password, model_name, index="habitus2"):
        """Connect to elasticsearch at url and load the embedding model.

        The index parameter generalizes the previously hard-coded index
        name; the default preserves the original behavior.
        """
        self.elasticsearch = Elasticsearch(url, basic_auth=(username, password))
        self.sentence_transformer = SentenceTransformer(model_name)
        self.index = index

    def search(self, k, text):
        """Return the k nearest neighbors of text as (id, score) tuples."""
        # This vector is assumed to be normalized (the schema declares
        # dot_product similarity, which requires unit-length vectors).
        vector = self.sentence_transformer.encode(text).tolist()
        query = {
            "field": "chatVector",
            "query_vector": vector,
            "k": k,
            "num_candidates": k
        }
        # The maximum value for size is limited by the index's index.max_result_window.
        # If more results are desired, they need to be paged.
        result = self.elasticsearch.search(index=self.index, knn=query, source=False, from_=0, size=k)
        hits = result.body["hits"]["hits"]
        return [(hit["_id"], hit["_score"]) for hit in hits]

def run(username, password, k, text):
    """Search the production elasticsearch index for text and print the top k hits."""
    vector_searcher = VectorSearcher(
        "https://elasticsearch.habitus.clulab.org/",
        username,
        password,
        "all-MiniLM-L6-v2"
    )
    matches = vector_searcher.search(k, text)
    for rank, (doc_id, score) in enumerate(matches):
        print(rank, doc_id, score)

def get_args():
    """Parse command-line arguments for the vector search script.

    Returns:
        (username, password, k, text) where k is an int.  Previously k was
        left as a string even though it is used as a number ("k" and "size")
        in the knn query; type=int fixes that and rejects non-numeric input
        at parse time.
    """
    argument_parser = ArgumentParser()
    argument_parser.add_argument("-u", "--username", required=True, help="elasticsearch username")
    argument_parser.add_argument("-p", "--password", required=True, help="elasticsearch password")
    argument_parser.add_argument("-k", "--k", required=True, type=int, help="number of nearest neighbors")
    argument_parser.add_argument("-t", "--text", required=True, help="text to be matched")
    args = argument_parser.parse_args()
    return args.username, args.password, args.k, args.text

if __name__ == "__main__":
    # Command-line entry point: parse credentials and query, then search.
    run(*get_args())
25 changes: 25 additions & 0 deletions belief_pipeline/vector_input_stage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pandas import DataFrame
from pipeline import InputStage

import pandas

class VectorInputStage(InputStage):
    """Input stage that reads belief-pipeline sentences from a TSV file."""

    def __init__(self, file_name: str) -> None:
        super().__init__(".")
        # Path of the TSV file to load in run().
        self.file_name = file_name

    def mk_data_frame(self, file_name: str) -> DataFrame:
        """Read file_name into a DataFrame with explicit column dtypes.

        Bug fix: the file_name parameter was previously ignored in favor of
        self.file_name, which made the argument misleading.  Only empty
        strings are treated as NA (keep_default_na=False), so values like
        "NA" or "null" in the text survive as literal strings.
        """
        data_frame = pandas.read_csv(file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype={
            "url": str,
            "sentenceIndex": int,
            "sentence": str,
            "belief": bool,
            "sent_locs": str,
            "context_locs": str
        })
        return data_frame

    def run(self) -> DataFrame:
        """Produce the pipeline's input DataFrame."""
        data_frame = self.mk_data_frame(self.file_name)
        # data_frame = data_frame[0:1000] # TODO: remove
        return data_frame
28 changes: 28 additions & 0 deletions belief_pipeline/vector_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from argparse import ArgumentParser
from pandas_output_stage import PandasOutputStage
from pipeline import Pipeline
from vector_input_stage import VectorInputStage
from vector_vector_stage import VectorVectorStage
from typing import Tuple


def get_in_and_out() -> Tuple[str, str]:
    """Read the input and output file names from the command line."""
    parser = ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="input file name")
    parser.add_argument("-o", "--output", required=True, help="output file name")
    parsed = parser.parse_args()
    return parsed.input, parsed.output

if __name__ == "__main__":
    # Hard-coded paths for the uganda-mining corpus; the commented call
    # below would take them from the command line instead.
    vector_model_name: str = "all-MiniLM-L6-v2"
    input_file_name: str = "../corpora/uganda-mining/uganda-2.tsv"
    output_file_name: str = "../corpora/uganda-mining/uganda-2-vectors.tsv"
    # input_file_name, output_file_name = get_in_and_out()
    input_stage = VectorInputStage(input_file_name)
    inner_stages = [VectorVectorStage(vector_model_name)]
    output_stage = PandasOutputStage(output_file_name)
    pipeline = Pipeline(input_stage, inner_stages, output_stage)
    pipeline.run()
28 changes: 28 additions & 0 deletions belief_pipeline/vector_vector_stage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from datasets import Dataset, DatasetDict
from pandas import DataFrame
from pipeline import InnerStage
from sentence_transformers import SentenceTransformer

import numpy
import torch

class VectorVectorStage(InnerStage):
    """Pipeline stage that adds a sentence-embedding "vector" column."""

    def __init__(self, model_name: str) -> None:
        super().__init__()
        self.sentence_transformer = SentenceTransformer(model_name)

    def encode(self, index, sentence):
        """Embed one sentence and serialize it as a comma-separated string."""
        # Progress indicator: one line per sentence processed.
        print(index)
        values = self.sentence_transformer.encode(sentence)
        return ", ".join(str(value) for value in values)

    def mk_vectors(self, data_frame: DataFrame):
        """Encode every sentence in the DataFrame, preserving row order."""
        return [
            self.encode(index, sentence)
            for index, sentence in enumerate(data_frame["sentence"])
        ]

    def run(self, data_frame: DataFrame) -> DataFrame:
        """Add the "vector" column and return the (mutated) DataFrame."""
        data_frame["vector"] = self.mk_vectors(data_frame)
        return data_frame
3 changes: 2 additions & 1 deletion elasticsearch/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ PUT habitus
{
"settings": {
"index.query.default_field": "sentence",
"index.number_of_replicas": 0
"index.number_of_replicas": 0,
"index.max_result_window": 10000
}
}
5 changes: 5 additions & 0 deletions elasticsearch/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
"sentence": {
"type": "text"
},
"chatVector": {
"type": "dense_vector",
"dims": 384,
"similarity": "dot_product"
},
"causalRelations": {
"type": "nested",
"properties": {
Expand Down
2 changes: 1 addition & 1 deletion elasticsearch/search.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ GET habitus/_search
GET habitus/_search
{
"query": {
"match": { "region", "uganda" }
"match": { "region": "uganda" }
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ case class DatasetRecord(
prevLocations: Array[Location],
prevDistanceOpt: Option[Int],
nextLocations: Array[Location],
nextDistanceOpt: Option[Int]
nextDistanceOpt: Option[Int],
vector: Array[Float]
) {

// See advice at https://discuss.elastic.co/t/ways-to-build-json-doc-in-es8-java-api-client/314459.
Expand Down Expand Up @@ -109,6 +110,7 @@ case class DatasetRecord(
if (nextLocations.nonEmpty)
map.put("nextLocations", nextLocations.map(_.serialize()))
nextDistanceOpt.foreach { nextDistance => map.put("nextDistance", nextDistance.asInstanceOf[JInteger]) }
map.put("chatVector", vector)
map
}
}
Expand Down Expand Up @@ -250,7 +252,8 @@ object DatasetRecord {
latLonOpt = Some(LatLon(15f, 16f))
)
),
nextDistanceOpt = Some(5)
nextDistanceOpt = Some(5),
vector = Array(0f, 1f, 2f)
)

datasetRecord
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ object Step2InputEidos2 extends App with Logging {
belief: Boolean,
sentimentScoreOpt: Option[Float],
sentenceLocations: Array[Location],
contextLocations: Array[Location]
contextLocations: Array[Location],
vector: Array[Float]
)

implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
val contextWindow = 3
val baseDirectory = "../corpora/uganda-mining"
val inputFilename = "../corpora/uganda-mining/uganda-2.tsv"
val credentialsFilename = "../credentials/credentials.properties"
val inputFilename = "../corpora/uganda-mining/uganda-2-vectors.tsv"
val credentialsFilename = "../credentials/elasticsearch-credentials.properties"
val deserializer = new JLDDeserializer()
val url = new URL("http://localhost:9200")
// val url = new URL("https://elasticsearch.keithalcock.com")
val indexName = "habitus"
val indexName = "habitus2"
val datasetName = "uganda-mining.tsv"
val regionName = "uganda"
val alreadyNormalized = true

def jsonFileToJsonld(jsonFile: File): File =
FileEditor(jsonFile).setExt("jsonld").get
Expand Down Expand Up @@ -152,6 +154,24 @@ object Step2InputEidos2 extends App with Logging {
}
}

def parseVector(vectorString: String): Array[Float] = {
  // Parse a comma-separated list of floats as written by the Python
  // vector_vector_stage ("v1, v2, ...").  Robustness fix: an empty or
  // blank string now yields an empty array instead of letting
  // "".toFloat throw a NumberFormatException.
  if (vectorString.trim.isEmpty) Array.empty[Float]
  else vectorString.split(", ").map(_.toFloat)
}

def normalize(floats: Array[Float]): Array[Float] = {
  // If the vectors are already unit length (alreadyNormalized), pass them
  // through untouched; otherwise divide by the Euclidean norm.  NOTE(review):
  // the index's dot_product similarity presumably requires unit vectors —
  // a zero vector here would produce NaNs; confirm inputs are non-zero.
  if (alreadyNormalized) floats
  else {
    val sumOfSquares = floats.map(float => float * float).sum
    val divisor = math.sqrt(sumOfSquares)
    floats.map(float => (float / divisor).toFloat)
  }
}

def getCausalRelations(causalMentionGroup: Seq[Mention]): Array[CausalRelation] = {
val causalRelations = causalMentionGroup.zipWithIndex.map { case (causalMention, causalIndex) =>
val causalAttributeCounts = mentionToAttributeCounts(causalMention)
Expand Down Expand Up @@ -235,14 +255,15 @@ object Step2InputEidos2 extends App with Logging {
val tsvReader = new TsvReader()

lines.map { line =>
val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString) = tsvReader.readln(line, 7)
val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString, vectorString) = tsvReader.readln(line, 8)
val sentenceIndex = sentenceIndexString.toInt
val belief = beliefString == "True"
val sentimentScoreOpt = if (sentimentScore.isEmpty) None else Some(sentimentScore.toFloat)
val sentenceLocations = parseLocations(sentenceLocationsString)
val contextLocations = parseLocations(contextLocationsString)
val vector = normalize(parseVector(vectorString))

(url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations)
(url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
}.toMap
}
val restClient = Elasticsearch.mkRestClient(url, credentialsFilename)
Expand Down Expand Up @@ -305,7 +326,8 @@ object Step2InputEidos2 extends App with Logging {
prevLocations,
prevDistanceOpt,
nextLocations,
nextDistanceOpt
nextDistanceOpt,
tsvRecord.vector
)

elasticsearchIndexClient.index(datasetRecord)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,8 @@ object Step2InputEidos2 extends App with Logging {
prevLocations,
prevDistanceOpt,
nextLocations,
nextDistanceOpt
nextDistanceOpt,
null // TODO
)

runIndex(connection, datasetRecord)
Expand Down

0 comments on commit 7d5a51d

Please sign in to comment.