From f796b243252b576dd8ad2dd69fcb68a7aa7fc18d Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 09:44:21 -0700 Subject: [PATCH 1/7] "Calculate" vectors --- belief_pipeline/vector_input_stage.py | 22 ++++++++++++++++++++ belief_pipeline/vector_main.py | 28 ++++++++++++++++++++++++++ belief_pipeline/vector_vector_stage.py | 28 ++++++++++++++++++++++++++ elasticsearch/schema.json | 5 +++++ 4 files changed, 83 insertions(+) create mode 100644 belief_pipeline/vector_input_stage.py create mode 100644 belief_pipeline/vector_main.py create mode 100644 belief_pipeline/vector_vector_stage.py diff --git a/belief_pipeline/vector_input_stage.py b/belief_pipeline/vector_input_stage.py new file mode 100644 index 00000000..a6b232e7 --- /dev/null +++ b/belief_pipeline/vector_input_stage.py @@ -0,0 +1,22 @@ +from pandas import DataFrame +from pipeline import InputStage + +import pandas + +class VectorInputStage(InputStage): + def __init__(self, file_name: str) -> None: + super().__init__(".") + self.file_name = file_name + + def mk_data_frame(self, file_name: str) -> DataFrame: + data_frame = pandas.read_csv(self.file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype={ + "url": str, + "sentenceIndex": int, + "sentence": str + }) + return data_frame + + def run(self) -> DataFrame: + data_frame = self.mk_data_frame(self.file_name) + # data_frame = data_frame[0:1000] # TODO: remove + return data_frame diff --git a/belief_pipeline/vector_main.py b/belief_pipeline/vector_main.py new file mode 100644 index 00000000..4bbe85d9 --- /dev/null +++ b/belief_pipeline/vector_main.py @@ -0,0 +1,28 @@ +from argparse import ArgumentParser +from pandas_output_stage import PandasOutputStage +from pipeline import Pipeline +from vector_input_stage import VectorInputStage +from vector_vector_stage import VectorVectorStage +from typing import Tuple + + +def get_in_and_out() -> Tuple[str, str]: + argument_parser = ArgumentParser() + argument_parser.add_argument("-i", "--input", required=True, help="input file name") + argument_parser.add_argument("-o", "--output", required=True, help="output file name") + args = argument_parser.parse_args() + return args.input, args.output + +if __name__ == "__main__": + vector_model_name: str = "all-MiniLM-L6-v2" + input_file_name: str = "../corpora/uganda-mining/uganda.tsv" + output_file_name: str = "../corpora/uganda-mining/uganda-vectors.tsv" + # input_file_name, output_file_name = get_in_and_out() + pipeline = Pipeline( + VectorInputStage(input_file_name), + [ + VectorVectorStage(vector_model_name) + ], + PandasOutputStage(output_file_name) + ) + pipeline.run() diff --git a/belief_pipeline/vector_vector_stage.py b/belief_pipeline/vector_vector_stage.py new file mode 100644 index 00000000..36befab0 --- /dev/null +++ b/belief_pipeline/vector_vector_stage.py @@ -0,0 +1,28 @@ +from datasets import Dataset, DatasetDict +from pandas import DataFrame +from pipeline import InnerStage +from sentence_transformers import SentenceTransformer + +import numpy +import torch + +class VectorVectorStage(InnerStage): + def __init__(self, model_name: str) -> None: + super().__init__() + self.sentence_transformer = SentenceTransformer(model_name) + + def encode(self, index, sentence): + print(index) + vector = self.sentence_transformer.encode(sentence) + vector_strings = [str(value) for value in vector] + vector_string = ", ".join(vector_strings) + return vector_string + + def mk_vectors(self, data_frame: DataFrame): + vectors = [self.encode(index, sentence) for index, sentence in enumerate(data_frame["sentence"])] + return vectors + + def run(self, data_frame: DataFrame) -> DataFrame: + vectors = self.mk_vectors(data_frame) + data_frame["vector"] = vectors + return data_frame diff --git a/elasticsearch/schema.json b/elasticsearch/schema.json index b6b63e5a..23ab2420 100644 --- a/elasticsearch/schema.json +++ b/elasticsearch/schema.json @@ -30,6 +30,11 @@ "sentence": { "type": "text" }, + "chatVector": { + "type": "dense_vector", + "dims": 384, + "similarity": "l2_norm" + }, "causalRelations": { "type": "nested", "properties": { From 65931060e11f06a471e92d17e51749f0f0b1f811 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 11:47:03 -0700 Subject: [PATCH 2/7] Include vector search --- belief_pipeline/vector_input_stage.py | 5 ++- belief_pipeline/vector_main.py | 4 +-- elasticsearch/schema.json | 2 +- elasticsearch/search.json | 2 +- .../elasticsearch/data/DatasetRecord.scala | 7 ++-- .../apps/elasticsearch/Step2InputEidos2.scala | 36 +++++++++++++++---- .../habitus/apps/mysql/Step2InputEidos2.scala | 3 +- 7 files changed, 44 insertions(+), 15 deletions(-) diff --git a/belief_pipeline/vector_input_stage.py b/belief_pipeline/vector_input_stage.py index a6b232e7..5c692a18 100644 --- a/belief_pipeline/vector_input_stage.py +++ b/belief_pipeline/vector_input_stage.py @@ -12,7 +12,10 @@ def mk_data_frame(self, file_name: str) -> DataFrame: data_frame = pandas.read_csv(self.file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype={ "url": str, "sentenceIndex": int, - "sentence": str + "sentence": str, + "belief": bool, + "sent_locs": str, + "context_locs": str }) return data_frame diff --git a/belief_pipeline/vector_main.py b/belief_pipeline/vector_main.py index 4bbe85d9..496053f8 100644 --- a/belief_pipeline/vector_main.py +++ b/belief_pipeline/vector_main.py @@ -15,8 +15,8 @@ def get_in_and_out() -> Tuple[str, str]: if __name__ == "__main__": vector_model_name: str = "all-MiniLM-L6-v2" - input_file_name: str = "../corpora/uganda-mining/uganda.tsv" - output_file_name: str = "../corpora/uganda-mining/uganda-vectors.tsv" + input_file_name: str = "../corpora/uganda-mining/uganda-2.tsv" + output_file_name: str = "../corpora/uganda-mining/uganda-2-vectors.tsv" # input_file_name, output_file_name = get_in_and_out() pipeline = Pipeline( VectorInputStage(input_file_name), diff --git a/elasticsearch/schema.json b/elasticsearch/schema.json index 23ab2420..bb5d66c0 100644 --- a/elasticsearch/schema.json +++ b/elasticsearch/schema.json @@ -33,7 +33,7 @@ "chatVector": { "type": "dense_vector", "dims": 384, - "similarity": "l2_norm" + "similarity": "dot_product" }, "causalRelations": { "type": "nested", diff --git a/elasticsearch/search.json b/elasticsearch/search.json index 3b662308..27605a90 100644 --- a/elasticsearch/search.json +++ b/elasticsearch/search.json @@ -3,7 +3,7 @@ GET habitus/_search GET habitus/_search { "query": { - "match": { "region", "uganda" } + "match": { "region": "uganda" } } } diff --git a/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala b/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala index a97b4cd6..02123d4a 100644 --- a/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala +++ b/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala @@ -73,7 +73,8 @@ case class DatasetRecord( prevLocations: Array[Location], prevDistanceOpt: Option[Int], nextLocations: Array[Location], - nextDistanceOpt: Option[Int] + nextDistanceOpt: Option[Int], + vector: Array[Float] ) { // See advice at https://discuss.elastic.co/t/ways-to-build-json-doc-in-es8-java-api-client/314459. @@ -109,6 +110,7 @@ case class DatasetRecord( if (nextLocations.nonEmpty) map.put("nextLocations", nextLocations.map(_.serialize())) nextDistanceOpt.foreach { nextDistance => map.put("nextDistance", nextDistance.asInstanceOf[JInteger]) } + map.put("chatVector", vector) map } } @@ -250,7 +252,8 @@ object DatasetRecord { latLonOpt = Some(LatLon(15f, 16f)) ) ), - nextDistanceOpt = Some(5) + nextDistanceOpt = Some(5), + vector = Array(0f, 1f, 2f) ) datasetRecord diff --git a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala index b940d909..34c31481 100644 --- a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala +++ b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala @@ -27,20 +27,22 @@ object Step2InputEidos2 extends App with Logging { belief: Boolean, sentimentScoreOpt: Option[Float], sentenceLocations: Array[Location], - contextLocations: Array[Location] + contextLocations: Array[Location], + vector: Array[Float] ) implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats val contextWindow = 3 val baseDirectory = "../corpora/uganda-mining" - val inputFilename = "../corpora/uganda-mining/uganda-2.tsv" - val credentialsFilename = "../credentials/credentials.properties" + val inputFilename = "../corpora/uganda-mining/uganda-2-vectors.tsv" + val credentialsFilename = "../credentials/elasticsearch-credentials.properties" val deserializer = new JLDDeserializer() val url = new URL("http://localhost:9200") // val url = new URL("https://elasticsearch.keithalcock.com") - val indexName = "habitus" + val indexName = "habitus2" val datasetName = "uganda-mining.tsv" val regionName = "uganda" + val alreadyNormalized = true def jsonFileToJsonld(jsonFile: File): File = FileEditor(jsonFile).setExt("jsonld").get @@ -152,6 +154,24 @@ object Step2InputEidos2 extends App with Logging { } } + def parseVector(vectorString: String): Array[Float] = { + val values = vectorString.split(", ") + val floats = values.map(_.toFloat) + + floats + } + + def normalize(floats: Array[Float]): Array[Float] = { + if (alreadyNormalized) floats + else { + val sumSquare = floats.foldLeft(0f) { case (sum, float) => sum + float * float } + val divisor = math.sqrt(sumSquare) + val normalized = floats.map { float => (float / divisor).toFloat } + + normalized + } + } + def getCausalRelations(causalMentionGroup: Seq[Mention]): Array[CausalRelation] = { val causalRelations = causalMentionGroup.zipWithIndex.map { case (causalMention, causalIndex) => val causalAttributeCounts = mentionToAttributeCounts(causalMention) @@ -235,14 +255,15 @@ object Step2InputEidos2 extends App with Logging { val tsvReader = new TsvReader() lines.map { line => - val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString) = tsvReader.readln(line, 7) + val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString, vectorString) = tsvReader.readln(line, 8) val sentenceIndex = sentenceIndexString.toInt val belief = beliefString == "True" val sentimentScoreOpt = if (sentimentScore.isEmpty) None else Some(sentimentScore.toFloat) val sentenceLocations = parseLocations(sentenceLocationsString) val contextLocations = parseLocations(contextLocationsString) + val vector = normalize(parseVector(vectorString)) - (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations) + (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector) }.toMap } val restClient = Elasticsearch.mkRestClient(url, credentialsFilename) @@ -305,7 +326,8 @@ object Step2InputEidos2 extends App with Logging { prevLocations, prevDistanceOpt, nextLocations, - nextDistanceOpt + nextDistanceOpt, + tsvRecord.vector ) elasticsearchIndexClient.index(datasetRecord) diff --git a/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala b/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala index 85045ab6..38f84d8c 100644 --- a/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala +++ b/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala @@ -550,7 +550,8 @@ object Step2InputEidos2 extends App with Logging { prevLocations, prevDistanceOpt, nextLocations, - nextDistanceOpt + nextDistanceOpt, + null // TODO ) runIndex(connection, datasetRecord) From e9e7560631a0afe1cde7f8fe5039765ca0ab3c4a Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 13:28:46 -0700 Subject: [PATCH 3/7] Try to query with python --- belief_pipeline/es_vector_search.py | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 belief_pipeline/es_vector_search.py diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py new file mode 100644 index 00000000..390c7ba0 --- /dev/null +++ b/belief_pipeline/es_vector_search.py @@ -0,0 +1,42 @@ +from argparse import ArgumentParser +from elasticsearch import Elasticsearch +from sentence_transformers import SentenceTransformer + +class VectorSearcher(): + def __init__(self, url, username, password, model_name): + super().__init__() + self.elasticsearch = Elasticsearch(url, basic_auth=(username, password)) + self.sentence_transformer = SentenceTransformer(model_name) + + def search(self, k, text): + index = "habitus2" + # This vector is assumed to be normalized. + vector = self.sentence_transformer.encode(text).tolist() + query = { + "query_vector": vector, + "k": k, + "num_candidates": k + } + print(query) + result = self.elasticsearch.knn_search(index=index, knn=query, source=False) + print(result) + +def run(username, password, k, text): + url = "http://localhost:9200/" + model_name = "all-MiniLM-L6-v2" + vector_searcher = VectorSearcher(url, username, password, model_name) + matches = vector_searcher.search(k, text) + return matches + +def get_args(): + argument_parser = ArgumentParser() + argument_parser.add_argument("-u", "--username", required=True, help="elasticsearch username") + argument_parser.add_argument("-p", "--password", required=True, help="elasticsearch password") + argument_parser.add_argument("-k", "--k", required=True, help="number of nearest neighbors") + argument_parser.add_argument("-t", "--text", required=True, help="text to be matched") + args = argument_parser.parse_args() + return args.username, args.password, args.k, args.text + +if __name__ == "__main__": + username, password, k, text = get_args() + run(username, password, k, text) From 416f948f2a1eb307408dbd961a98ab5533a9c9a6 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 13:45:47 -0700 Subject: [PATCH 4/7] Get working but with deprecation --- belief_pipeline/es_vector_search.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py index 390c7ba0..5671d5be 100644 --- a/belief_pipeline/es_vector_search.py +++ b/belief_pipeline/es_vector_search.py @@ -13,20 +13,22 @@ def search(self, k, text): # This vector is assumed to be normalized. vector = self.sentence_transformer.encode(text).tolist() query = { + "field": "chatVector", "query_vector": vector, "k": k, "num_candidates": k } - print(query) result = self.elasticsearch.knn_search(index=index, knn=query, source=False) + ids_and_scores = [(hit._id, hit._score) for hit in result["hits"]["hits"]] print(result) + return ids_and_scores def run(username, password, k, text): url = "http://localhost:9200/" model_name = "all-MiniLM-L6-v2" vector_searcher = VectorSearcher(url, username, password, model_name) - matches = vector_searcher.search(k, text) - return matches + ids_and_scores = vector_searcher.search(k, text) + print(ids_and_scores) def get_args(): argument_parser = ArgumentParser() From 117910a4204f3d7114fb6d29d54b66961a3adedc Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 13:59:36 -0700 Subject: [PATCH 5/7] Make it work once --- belief_pipeline/es_vector_search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py index 5671d5be..26e130d5 100644 --- a/belief_pipeline/es_vector_search.py +++ b/belief_pipeline/es_vector_search.py @@ -18,8 +18,9 @@ def search(self, k, text): "k": k, "num_candidates": k } - result = self.elasticsearch.knn_search(index=index, knn=query, source=False) - ids_and_scores = [(hit._id, hit._score) for hit in result["hits"]["hits"]] + result = self.elasticsearch.search(index=index, knn=query, source=False) + hits = result.body["hits"]["hits"] + ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits] print(result) return ids_and_scores From 9084cb5e42592c8a38b3fe95069ce67cfd855a5e Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 14:12:54 -0700 Subject: [PATCH 6/7] Get more results --- belief_pipeline/es_vector_search.py | 3 ++- elasticsearch/index.json | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py index 26e130d5..54d7a104 100644 --- a/belief_pipeline/es_vector_search.py +++ b/belief_pipeline/es_vector_search.py @@ -18,7 +18,8 @@ def search(self, k, text): "k": k, "num_candidates": k } - result = self.elasticsearch.search(index=index, knn=query, source=False) + # The maximum value for size is limited by the index's index.max_result_window. + result = self.elasticsearch.search(index=index, knn=query, source=False, from_=0, size=k) hits = result.body["hits"]["hits"] ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits] print(result) diff --git a/elasticsearch/index.json b/elasticsearch/index.json index dec2d586..8b045b99 100644 --- a/elasticsearch/index.json +++ b/elasticsearch/index.json @@ -2,6 +2,7 @@ PUT habitus { "settings": { "index.query.default_field": "sentence", - "index.number_of_replicas": 0 + "index.number_of_replicas": 0, + "index.max_result_window": 10000 } } From 2a132f8bcabdb597699c934916c4172add7bd590 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 15 Feb 2024 15:23:16 -0700 Subject: [PATCH 7/7] Update search program --- belief_pipeline/es_vector_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py index 54d7a104..8eeab204 100644 --- a/belief_pipeline/es_vector_search.py +++ b/belief_pipeline/es_vector_search.py @@ -19,18 +19,19 @@ def search(self, k, text): "num_candidates": k } # The maximum value for size is limited by the index's index.max_result_window. + # If more results are desired, they need to be paged. result = self.elasticsearch.search(index=index, knn=query, source=False, from_=0, size=k) hits = result.body["hits"]["hits"] ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits] - print(result) return ids_and_scores def run(username, password, k, text): - url = "http://localhost:9200/" + url = "https://elasticsearch.habitus.clulab.org/" model_name = "all-MiniLM-L6-v2" vector_searcher = VectorSearcher(url, username, password, model_name) ids_and_scores = vector_searcher.search(k, text) - print(ids_and_scores) + for index, (id, score) in enumerate(ids_and_scores): + print(index, id, score) def get_args(): argument_parser = ArgumentParser()