From f796b243252b576dd8ad2dd69fcb68a7aa7fc18d Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 09:44:21 -0700
Subject: [PATCH 1/7] "Calculate" vectors

---
 belief_pipeline/vector_input_stage.py  | 22 ++++++++++++++++++++
 belief_pipeline/vector_main.py         | 28 ++++++++++++++++++++++++++
 belief_pipeline/vector_vector_stage.py | 28 ++++++++++++++++++++++++++
 elasticsearch/schema.json              |  5 +++++
 4 files changed, 83 insertions(+)
 create mode 100644 belief_pipeline/vector_input_stage.py
 create mode 100644 belief_pipeline/vector_main.py
 create mode 100644 belief_pipeline/vector_vector_stage.py

diff --git a/belief_pipeline/vector_input_stage.py b/belief_pipeline/vector_input_stage.py
new file mode 100644
index 00000000..a6b232e7
--- /dev/null
+++ b/belief_pipeline/vector_input_stage.py
@@ -0,0 +1,22 @@
+from pandas import DataFrame
+from pipeline import InputStage
+
+import pandas
+
+class VectorInputStage(InputStage):
+    def __init__(self, file_name: str) -> None:
+        super().__init__(".")
+        self.file_name = file_name
+
+    def mk_data_frame(self, file_name: str) -> DataFrame:
+        data_frame = pandas.read_csv(self.file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype={
+            "url": str,
+            "sentenceIndex": int,
+            "sentence": str
+        })
+        return data_frame
+
+    def run(self) -> DataFrame:
+        data_frame = self.mk_data_frame(self.file_name)
+        # data_frame = data_frame[0:1000] # TODO: remove
+        return data_frame
diff --git a/belief_pipeline/vector_main.py b/belief_pipeline/vector_main.py
new file mode 100644
index 00000000..4bbe85d9
--- /dev/null
+++ b/belief_pipeline/vector_main.py
@@ -0,0 +1,28 @@
+from argparse import ArgumentParser
+from pandas_output_stage import PandasOutputStage
+from pipeline import Pipeline
+from vector_input_stage import VectorInputStage
+from vector_vector_stage import VectorVectorStage
+from typing import Tuple
+
+
+def get_in_and_out() -> Tuple[str, str]:
+    argument_parser = ArgumentParser()
+    argument_parser.add_argument("-i", "--input", required=True, help="input file name")
+    argument_parser.add_argument("-o", "--output", required=True, help="output file name")
+    args = argument_parser.parse_args()
+    return args.input, args.output
+
+if __name__ == "__main__":
+    vector_model_name: str = "all-MiniLM-L6-v2"
+    input_file_name: str = "../corpora/uganda-mining/uganda.tsv"
+    output_file_name: str = "../corpora/uganda-mining/uganda-vectors.tsv"
+    # input_file_name, output_file_name = get_in_and_out()
+    pipeline = Pipeline(
+        VectorInputStage(input_file_name),
+        [
+            VectorVectorStage(vector_model_name)
+        ],
+        PandasOutputStage(output_file_name)
+    )
+    pipeline.run()
diff --git a/belief_pipeline/vector_vector_stage.py b/belief_pipeline/vector_vector_stage.py
new file mode 100644
index 00000000..36befab0
--- /dev/null
+++ b/belief_pipeline/vector_vector_stage.py
@@ -0,0 +1,28 @@
+from datasets import Dataset, DatasetDict
+from pandas import DataFrame
+from pipeline import InnerStage
+from sentence_transformers import SentenceTransformer
+
+import numpy
+import torch
+
+class VectorVectorStage(InnerStage):
+    def __init__(self, model_name: str) -> None:
+        super().__init__()
+        self.sentence_transformer = SentenceTransformer(model_name)
+
+    def encode(self, index, sentence):
+        print(index)
+        vector = self.sentence_transformer.encode(sentence)
+        vector_strings = [str(value) for value in vector]
+        vector_string = ", ".join(vector_strings)
+        return vector_string
+
+    def mk_vectors(self, data_frame: DataFrame):
+        vectors = [self.encode(index, sentence) for index, sentence in enumerate(data_frame["sentence"])]
+        return vectors
+
+    def run(self, data_frame: DataFrame) -> DataFrame:
+        vectors = self.mk_vectors(data_frame)
+        data_frame["vector"] = vectors
+        return data_frame
diff --git a/elasticsearch/schema.json b/elasticsearch/schema.json
index b6b63e5a..23ab2420 100644
--- a/elasticsearch/schema.json
+++ b/elasticsearch/schema.json
@@ -30,6 +30,11 @@
     "sentence": {
       "type": "text"
     },
+    "chatVector": {
+      "type": "dense_vector",
+      "dims": 384,
+      "similarity": "l2_norm"
+    },
     "causalRelations": {
       "type": "nested",
       "properties": {

From 65931060e11f06a471e92d17e51749f0f0b1f811 Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 11:47:03 -0700
Subject: [PATCH 2/7] Include vector search

---
 belief_pipeline/vector_input_stage.py         |  5 ++-
 belief_pipeline/vector_main.py                |  4 +--
 elasticsearch/schema.json                     |  2 +-
 elasticsearch/search.json                     |  2 +-
 .../elasticsearch/data/DatasetRecord.scala    |  7 ++--
 .../apps/elasticsearch/Step2InputEidos2.scala | 36 +++++++++++++++----
 .../habitus/apps/mysql/Step2InputEidos2.scala |  3 +-
 7 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/belief_pipeline/vector_input_stage.py b/belief_pipeline/vector_input_stage.py
index a6b232e7..5c692a18 100644
--- a/belief_pipeline/vector_input_stage.py
+++ b/belief_pipeline/vector_input_stage.py
@@ -12,7 +12,10 @@ def mk_data_frame(self, file_name: str) -> DataFrame:
         data_frame = pandas.read_csv(self.file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype={
             "url": str,
             "sentenceIndex": int,
-            "sentence": str
+            "sentence": str,
+            "belief": bool,
+            "sent_locs": str,
+            "context_locs": str
         })
         return data_frame
 
diff --git a/belief_pipeline/vector_main.py b/belief_pipeline/vector_main.py
index 4bbe85d9..496053f8 100644
--- a/belief_pipeline/vector_main.py
+++ b/belief_pipeline/vector_main.py
@@ -15,8 +15,8 @@ def get_in_and_out() -> Tuple[str, str]:
 
 if __name__ == "__main__":
     vector_model_name: str = "all-MiniLM-L6-v2"
-    input_file_name: str = "../corpora/uganda-mining/uganda.tsv"
-    output_file_name: str = "../corpora/uganda-mining/uganda-vectors.tsv"
+    input_file_name: str = "../corpora/uganda-mining/uganda-2.tsv"
+    output_file_name: str = "../corpora/uganda-mining/uganda-2-vectors.tsv"
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         VectorInputStage(input_file_name),
diff --git a/elasticsearch/schema.json b/elasticsearch/schema.json
index 23ab2420..bb5d66c0 100644
--- a/elasticsearch/schema.json
+++ b/elasticsearch/schema.json
@@ -33,7 +33,7 @@
     "chatVector": {
       "type": "dense_vector",
       "dims": 384,
-      "similarity": "l2_norm"
+      "similarity": "dot_product"
     },
     "causalRelations": {
       "type": "nested",
diff --git a/elasticsearch/search.json b/elasticsearch/search.json
index 3b662308..27605a90 100644
--- a/elasticsearch/search.json
+++ b/elasticsearch/search.json
@@ -3,7 +3,7 @@ GET habitus/_search
 GET habitus/_search
 {
   "query": {
-    "match": { "region", "uganda" }
+    "match": { "region":  "uganda" }
   }
 }
 
diff --git a/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala b/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala
index a97b4cd6..02123d4a 100644
--- a/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala
+++ b/elasticsearch/src/main/scala/org/clulab/habitus/elasticsearch/data/DatasetRecord.scala
@@ -73,7 +73,8 @@ case class DatasetRecord(
   prevLocations: Array[Location],
   prevDistanceOpt: Option[Int],
   nextLocations: Array[Location],
-  nextDistanceOpt: Option[Int]
+  nextDistanceOpt: Option[Int],
+  vector: Array[Float]
 ) {
 
   // See advice at https://discuss.elastic.co/t/ways-to-build-json-doc-in-es8-java-api-client/314459.
@@ -109,6 +110,7 @@ case class DatasetRecord(
     if (nextLocations.nonEmpty)
       map.put("nextLocations", nextLocations.map(_.serialize()))
     nextDistanceOpt.foreach { nextDistance => map.put("nextDistance", nextDistance.asInstanceOf[JInteger]) }
+    map.put("chatVector", vector)
     map
   }
 }
@@ -250,7 +252,8 @@ object DatasetRecord {
           latLonOpt = Some(LatLon(15f, 16f))
         )
       ),
-      nextDistanceOpt = Some(5)
+      nextDistanceOpt = Some(5),
+      vector = Array(0f, 1f, 2f)
     )
 
     datasetRecord
diff --git a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala
index b940d909..34c31481 100644
--- a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala
+++ b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2.scala
@@ -27,20 +27,22 @@ object Step2InputEidos2 extends App with Logging {
     belief: Boolean,
     sentimentScoreOpt: Option[Float],
     sentenceLocations: Array[Location],
-    contextLocations: Array[Location]
+    contextLocations: Array[Location],
+    vector: Array[Float]
   )
 
   implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
   val contextWindow = 3
   val baseDirectory = "../corpora/uganda-mining"
-  val inputFilename = "../corpora/uganda-mining/uganda-2.tsv"
-  val credentialsFilename = "../credentials/credentials.properties"
+  val inputFilename = "../corpora/uganda-mining/uganda-2-vectors.tsv"
+  val credentialsFilename = "../credentials/elasticsearch-credentials.properties"
   val deserializer = new JLDDeserializer()
   val url = new URL("http://localhost:9200")
   // val url = new URL("https://elasticsearch.keithalcock.com")
-  val indexName = "habitus"
+  val indexName = "habitus2"
   val datasetName = "uganda-mining.tsv"
   val regionName = "uganda"
+  val alreadyNormalized = true
 
   def jsonFileToJsonld(jsonFile: File): File =
       FileEditor(jsonFile).setExt("jsonld").get
@@ -152,6 +154,24 @@ object Step2InputEidos2 extends App with Logging {
     }
   }
 
+  def parseVector(vectorString: String): Array[Float] = {
+    val values = vectorString.split(", ")
+    val floats = values.map(_.toFloat)
+
+    floats
+  }
+
+  def normalize(floats: Array[Float]): Array[Float] = {
+    if (alreadyNormalized) floats
+    else {
+      val sumSquare = floats.foldLeft(0f) { case (sum, float) => sum + float * float }
+      val divisor = math.sqrt(sumSquare)
+      val normalized = floats.map { float => (float / divisor).toFloat }
+
+      normalized
+    }
+  }
+
   def getCausalRelations(causalMentionGroup: Seq[Mention]): Array[CausalRelation] = {
     val causalRelations = causalMentionGroup.zipWithIndex.map { case (causalMention, causalIndex) =>
       val causalAttributeCounts = mentionToAttributeCounts(causalMention)
@@ -235,14 +255,15 @@ object Step2InputEidos2 extends App with Logging {
     val tsvReader = new TsvReader()
 
     lines.map { line =>
-      val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString) = tsvReader.readln(line, 7)
+      val Array(url, sentenceIndexString, sentence, beliefString, sentimentScore, sentenceLocationsString, contextLocationsString, vectorString) = tsvReader.readln(line, 8)
       val sentenceIndex = sentenceIndexString.toInt
       val belief = beliefString == "True"
       val sentimentScoreOpt = if (sentimentScore.isEmpty) None else Some(sentimentScore.toFloat)
       val sentenceLocations = parseLocations(sentenceLocationsString)
       val contextLocations = parseLocations(contextLocationsString)
+      val vector = normalize(parseVector(vectorString))
 
-      (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations)
+      (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
     }.toMap
   }
   val restClient = Elasticsearch.mkRestClient(url, credentialsFilename)
@@ -305,7 +326,8 @@ object Step2InputEidos2 extends App with Logging {
             prevLocations,
             prevDistanceOpt,
             nextLocations,
-            nextDistanceOpt
+            nextDistanceOpt,
+            tsvRecord.vector
           )
 
           elasticsearchIndexClient.index(datasetRecord)
diff --git a/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala b/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala
index 85045ab6..38f84d8c 100644
--- a/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala
+++ b/src/main/scala/org/clulab/habitus/apps/mysql/Step2InputEidos2.scala
@@ -550,7 +550,8 @@ object Step2InputEidos2 extends App with Logging {
             prevLocations,
             prevDistanceOpt,
             nextLocations,
-            nextDistanceOpt
+            nextDistanceOpt,
+            null // TODO
           )
 
           runIndex(connection, datasetRecord)

From e9e7560631a0afe1cde7f8fe5039765ca0ab3c4a Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 13:28:46 -0700
Subject: [PATCH 3/7] Try to query with python

---
 belief_pipeline/es_vector_search.py | 42 +++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 belief_pipeline/es_vector_search.py

diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py
new file mode 100644
index 00000000..390c7ba0
--- /dev/null
+++ b/belief_pipeline/es_vector_search.py
@@ -0,0 +1,42 @@
+from argparse import ArgumentParser
+from elasticsearch import Elasticsearch
+from sentence_transformers import SentenceTransformer
+
+class VectorSearcher():
+    def __init__(self, url, username, password, model_name):
+        super().__init__()
+        self.elasticsearch = Elasticsearch(url, basic_auth=(username, password))
+        self.sentence_transformer = SentenceTransformer(model_name)
+
+    def search(self, k, text):
+        index = "habitus2"
+        # This vector is assumed to be normalized.
+        vector = self.sentence_transformer.encode(text).tolist()
+        query = {
+            "query_vector": vector,
+            "k": k,
+            "num_candidates": k
+        }
+        print(query)
+        result = self.elasticsearch.knn_search(index=index, knn=query, source=False)
+        print(result)
+
+def run(username, password, k, text):
+    url = "http://localhost:9200/"
+    model_name = "all-MiniLM-L6-v2"
+    vector_searcher = VectorSearcher(url, username, password, model_name)
+    matches = vector_searcher.search(k, text)
+    return matches
+
+def get_args():
+    argument_parser = ArgumentParser()
+    argument_parser.add_argument("-u", "--username", required=True, help="elasticsearch username")
+    argument_parser.add_argument("-p", "--password", required=True, help="elasticsearch password")
+    argument_parser.add_argument("-k", "--k",        required=True, help="number of nearest neighbors")
+    argument_parser.add_argument("-t", "--text",     required=True, help="text to be matched")
+    args = argument_parser.parse_args()
+    return args.username, args.password, args.k, args.text
+
+if __name__ == "__main__":
+    username, password, k, text = get_args()
+    run(username, password, k, text)

From 416f948f2a1eb307408dbd961a98ab5533a9c9a6 Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 13:45:47 -0700
Subject: [PATCH 4/7] Get working but with deprecation

---
 belief_pipeline/es_vector_search.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py
index 390c7ba0..5671d5be 100644
--- a/belief_pipeline/es_vector_search.py
+++ b/belief_pipeline/es_vector_search.py
@@ -13,20 +13,22 @@ def search(self, k, text):
         # This vector is assumed to be normalized.
         vector = self.sentence_transformer.encode(text).tolist()
         query = {
+            "field": "chatVector",
             "query_vector": vector,
             "k": k,
             "num_candidates": k
         }
-        print(query)
         result = self.elasticsearch.knn_search(index=index, knn=query, source=False)
+        ids_and_scores = [(hit._id, hit._score) for hit in result["hits"]["hits"]]
         print(result)
+        return ids_and_scores
 
 def run(username, password, k, text):
     url = "http://localhost:9200/"
     model_name = "all-MiniLM-L6-v2"
     vector_searcher = VectorSearcher(url, username, password, model_name)
-    matches = vector_searcher.search(k, text)
-    return matches
+    ids_and_scores = vector_searcher.search(k, text)
+    print(ids_and_scores)
 
 def get_args():
     argument_parser = ArgumentParser()

From 117910a4204f3d7114fb6d29d54b66961a3adedc Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 13:59:36 -0700
Subject: [PATCH 5/7] Make it work once

---
 belief_pipeline/es_vector_search.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py
index 5671d5be..26e130d5 100644
--- a/belief_pipeline/es_vector_search.py
+++ b/belief_pipeline/es_vector_search.py
@@ -18,8 +18,9 @@ def search(self, k, text):
             "k": k,
             "num_candidates": k
         }
-        result = self.elasticsearch.knn_search(index=index, knn=query, source=False)
-        ids_and_scores = [(hit._id, hit._score) for hit in result["hits"]["hits"]]
+        result = self.elasticsearch.search(index=index, knn=query, source=False)
+        hits = result.body["hits"]["hits"]
+        ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits]
         print(result)
         return ids_and_scores
 

From 9084cb5e42592c8a38b3fe95069ce67cfd855a5e Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 14:12:54 -0700
Subject: [PATCH 6/7] Get more results

---
 belief_pipeline/es_vector_search.py | 3 ++-
 elasticsearch/index.json            | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py
index 26e130d5..54d7a104 100644
--- a/belief_pipeline/es_vector_search.py
+++ b/belief_pipeline/es_vector_search.py
@@ -18,7 +18,8 @@ def search(self, k, text):
             "k": k,
             "num_candidates": k
         }
-        result = self.elasticsearch.search(index=index, knn=query, source=False)
+        # The maximum value for size is limited by the index's index.max_result_window.
+        result = self.elasticsearch.search(index=index, knn=query, source=False, from_=0, size=k)
         hits = result.body["hits"]["hits"]
         ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits]
         print(result)
diff --git a/elasticsearch/index.json b/elasticsearch/index.json
index dec2d586..8b045b99 100644
--- a/elasticsearch/index.json
+++ b/elasticsearch/index.json
@@ -2,6 +2,7 @@ PUT habitus
 {
   "settings": {
     "index.query.default_field": "sentence",
-    "index.number_of_replicas": 0
+    "index.number_of_replicas": 0,
+    "index.max_result_window": 10000
   }
 }

From 2a132f8bcabdb597699c934916c4172add7bd590 Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Thu, 15 Feb 2024 15:23:16 -0700
Subject: [PATCH 7/7] Update search program

---
 belief_pipeline/es_vector_search.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/belief_pipeline/es_vector_search.py b/belief_pipeline/es_vector_search.py
index 54d7a104..8eeab204 100644
--- a/belief_pipeline/es_vector_search.py
+++ b/belief_pipeline/es_vector_search.py
@@ -19,18 +19,19 @@ def search(self, k, text):
             "num_candidates": k
         }
         # The maximum value for size is limited by the index's index.max_result_window.
+        # If more results are desired, they need to be paged.
         result = self.elasticsearch.search(index=index, knn=query, source=False, from_=0, size=k)
         hits = result.body["hits"]["hits"]
         ids_and_scores = [(hit["_id"], hit["_score"]) for hit in hits]
-        print(result)
         return ids_and_scores
 
 def run(username, password, k, text):
-    url = "http://localhost:9200/"
+    url = "https://elasticsearch.habitus.clulab.org/"
     model_name = "all-MiniLM-L6-v2"
     vector_searcher = VectorSearcher(url, username, password, model_name)
     ids_and_scores = vector_searcher.search(k, text)
-    print(ids_and_scores)
+    for index, (id, score) in enumerate(ids_and_scores):
+        print(index, id, score)
 
 def get_args():
     argument_parser = ArgumentParser()