From 558d24b9ed445b52d604f68c79ef4fb0f05b6a59 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Wed, 17 Apr 2024 21:57:40 -0700 Subject: [PATCH] Get fixed ghana into elasticsearch --- .../elasticsearch/CheckNormalization.scala | 401 ++++++++++++++++++ .../Step2InputEidos2GhanaApp.scala | 9 +- 2 files changed, 407 insertions(+), 3 deletions(-) create mode 100644 src/main/scala/org/clulab/habitus/apps/elasticsearch/CheckNormalization.scala diff --git a/src/main/scala/org/clulab/habitus/apps/elasticsearch/CheckNormalization.scala b/src/main/scala/org/clulab/habitus/apps/elasticsearch/CheckNormalization.scala new file mode 100644 index 00000000..e9bcfb88 --- /dev/null +++ b/src/main/scala/org/clulab/habitus/apps/elasticsearch/CheckNormalization.scala @@ -0,0 +1,401 @@ +package org.clulab.habitus.apps.elasticsearch + +object CheckNormalization extends App { + val array = Array( + -0.0015579222235828638, + 0.046756383031606674, + 0.03320016711950302, + -0.003020971082150936, + 0.0031890878453850746, + 0.07345180213451385, + 0.07176708430051804, + -0.053697001188993454, + 0.11290593445301056, + 0.06155059486627579, + -0.01636478491127491, + 0.07482271641492844, + -0.032704152166843414, + 0.0402897410094738, + -0.012704286724328995, + -0.004845243413001299, + -0.09643524885177612, + -0.011723821982741356, + -0.023252692073583603, + 0.05469772219657898, + -0.06684316694736481, + -0.021528715267777443, + 0.038941144943237305, + -0.004839417990297079, + -0.04832024499773979, + 0.008616921491920948, + 0.017459064722061157, + -0.03391644358634949, + 0.036823373287916183, + -0.07160962373018265, + -0.009660018607974052, + -0.04252755269408226, + 0.09882999211549759, + 0.025910520926117897, + -0.05211717262864113, + 0.0018940576119348407, + 0.024215228855609894, + -0.10456541925668716, + 0.016564765945076942, + -0.07093624025583267, + -0.0024302618112415075, + 0.03633902966976166, + -0.04824983701109886, + 0.04362930729985237, + -0.042049430310726166, + 0.014537685550749302, + 
0.06504721194505692, + -0.03202000632882118, + -0.06448596715927124, + -0.048266805708408356, + -0.017485490068793297, + -0.0032008986454457045, + -0.005561065394431353, + -0.06149498000741005, + -0.029259851202368736, + 0.060640692710876465, + 0.03184633329510689, + -0.06468986719846725, + 0.03661811351776123, + -0.0930897668004036, + -0.010137617588043213, + 0.002633979544043541, + -0.0413396880030632, + 0.05606316402554512, + 0.012267524376511574, + 0.05634245648980141, + 0.0437580868601799, + 0.04945199564099312, + -0.09611742198467255, + 0.018299633637070656, + 0.016747035086154938, + 0.011103530414402485, + 0.04583752900362015, + 0.014348058961331844, + 0.02056422084569931, + -0.026701027527451515, + 0.05929544195532799, + -0.06241913139820099, + -0.02180054597556591, + 0.04562368616461754, + -0.024286307394504547, + -0.006579102482646704, + -0.012273433618247509, + -0.008675801567733288, + -0.045255374163389206, + -0.057757288217544556, + 0.0014844542602077127, + 0.02721400372684002, + 0.10567346960306168, + 0.00754966726526618, + -0.09799190610647202, + -0.12815727293491364, + 0.09550009667873383, + 0.010418944992125034, + -0.04483690485358238, + 0.08029047399759293, + -0.07828550785779953, + -0.030102428048849106, + 0.01035892590880394, + -0.014414415694773197, + 0.0007517688209190965, + -0.028540072962641716, + -0.06680746376514435, + 0.02819976769387722, + 0.027425967156887054, + -0.03616849705576897, + -0.06783357262611389, + -0.07953125238418579, + 0.014817715622484684, + 0.033664170652627945, + -0.04960232600569725, + -0.03968366980552673, + 0.023404616862535477, + 0.1120513528585434, + 0.046903908252716064, + -0.00901854783296585, + -0.04798654094338417, + -0.03273267671465874, + 0.00010927081893896684, + -0.11984733492136002, + -0.03290942311286926, + 0.05302047356963158, + 0.023003309965133667, + 0.045232001692056656, + -0.1082877442240715, + -0.046023353934288025, + 0.027949819341301918, + -4.119866723290199e-33, + 0.00031087713432498276, + 
-0.0016192914918065071, + 0.09158345311880112, + -0.07603964954614639, + 0.01911413110792637, + 0.033009015023708344, + -0.03291640058159828, + 0.007210110779851675, + -0.01503248792141676, + -0.00293876975774765, + -0.037623703479766846, + -0.023089319467544556, + 0.05592803284525871, + 0.025874679908156395, + 0.00766947353258729, + 0.0770609974861145, + -0.07413242012262344, + 0.06507354974746704, + 0.11962081491947174, + 0.02433842420578003, + 0.04900417849421501, + 0.05460724979639053, + -0.013277449645102024, + -0.06834614276885986, + -0.054614875465631485, + -0.10490445047616959, + -0.0032958975061774254, + 0.04371365159749985, + -0.035524897277355194, + 0.0025288083124905825, + 0.033540256321430206, + 0.02391907013952732, + 0.03639453649520874, + -0.014536550268530846, + 0.018761046230793, + -0.0327012874186039, + -0.14558155834674835, + 0.05389149859547615, + -0.0708320364356041, + -0.03836232051253319, + -0.02713063731789589, + -0.010462294332683086, + -0.021144958212971687, + 0.005797258578240871, + -0.006066432222723961, + -0.04190671816468239, + 0.005722514819353819, + -0.023609325289726257, + -0.09120525419712067, + 0.03493587300181389, + -0.012430357746779919, + 0.028762690722942352, + 0.030471889302134514, + -0.046362582594156265, + -0.03013031929731369, + 0.0630410686135292, + 0.02926339954137802, + 0.007419428322464228, + -0.08147356659173965, + -0.08009079843759537, + -0.01680009625852108, + -0.12305328249931335, + -0.008430072106420994, + 0.027577022090554237, + 0.018169518560171127, + 0.06823283433914185, + -0.01518191583454609, + -0.09498748928308487, + -0.03164917975664139, + -0.012673617340624332, + -0.0480959489941597, + 0.023772235959768295, + -0.05389649420976639, + 0.004763883538544178, + -0.05179118737578392, + 0.02978488989174366, + -0.012040773406624794, + -0.06166530400514603, + 0.007390553597360849, + 0.031318988651037216, + -0.09958219528198242, + -0.03343256935477257, + -0.006592493504285812, + 0.04770427569746971, + 
0.16033877432346344, + -0.024765027686953545, + 0.05747982859611511, + -0.005864410661160946, + 0.10003767907619476, + 0.05300826579332352, + 0.00949875358492136, + 0.002831380581483245, + 0.008360794745385647, + -0.02915245108306408, + -0.11185413599014282, + 4.486779285142129e-34, + -0.045940227806568146, + -0.08407846093177795, + 0.03423440083861351, + 0.07183706760406494, + 0.020899394527077675, + 0.007741469424217939, + -0.07030516117811203, + -0.07291296124458313, + 0.04642238840460777, + 0.0010646618902683258, + -0.017027176916599274, + -0.0135109294205904, + 0.10059256106615067, + 0.0569571889936924, + 0.010150260291993618, + -0.07594190537929535, + -0.028875211253762245, + -0.03427863493561745, + -0.02567903697490692, + -0.09967402368783951, + 0.0004475240421015769, + -0.004533726722002029, + -0.04992523044347763, + -0.02833687700331211, + 0.0006708775763399899, + 0.026494652032852173, + -0.004937449470162392, + -0.022294918075203896, + -0.0495247021317482, + -0.0020540449768304825, + -0.02598644234240055, + 0.0880211889743805, + -0.10105810314416885, + -0.05003293976187706, + 0.08124736696481705, + -0.004971758462488651, + 0.03704409673810005, + 0.04700229689478874, + -0.010697813704609871, + -0.010109737515449524, + -0.08215630799531937, + 0.02897896245121956, + -0.06760874390602112, + 0.017348986119031906, + -0.030160333961248398, + -0.02800305187702179, + 0.02412205934524536, + 0.031049657613039017, + 0.013042393140494823, + 0.014396020211279392, + -0.07982484251260757, + 0.036464888602495193, + -0.017003972083330154, + -0.003932759165763855, + 0.11292926222085953, + 0.02497628889977932, + 0.06409800797700882, + -0.02159368060529232, + 0.18334132432937622, + 0.03776063770055771, + 0.024333423003554344, + 0.04160475358366966, + -0.028717149049043655, + 0.06133951619267464, + 0.03998758643865585, + 0.028546229004859924, + -0.005514279939234257, + 0.050185803323984146, + 0.04346054792404175, + 0.022028308361768723, + 0.035842910408973694, + 
-0.05380842834711075, + -0.08251427114009857, + -0.0069565558806061745, + -0.0053558493964374065, + -0.006351212505251169, + -0.01991822011768818, + -0.0035090188030153513, + -0.03189520537853241, + 0.005409338977187872, + 0.08887185156345367, + -0.037761908024549484, + 0.05145766958594322, + 0.024534715339541435, + -0.03210631385445595, + -0.09907735139131546, + -0.020710552111268044, + -0.014916516840457916, + 0.09412135183811188, + 0.03694682568311691, + -0.017638299614191055, + 0.021621214225888252, + 0.020279984921216965, + -0.052365679293870926, + -0.07922355830669403, + -3.364035450204028e-08, + -0.00912963692098856, + 0.0036144601181149483, + 0.01748746447265148, + 0.09101580083370209, + 0.0017105248989537358, + 0.03147963806986809, + 0.0719507560133934, + -0.00534447655081749, + -0.07605189085006714, + 0.036317747086286545, + 0.04098399728536606, + 0.033913735300302505, + 0.037917107343673706, + 0.028197139501571655, + 0.05604205280542374, + 0.004918874241411686, + 0.03770577535033226, + -0.07595308125019073, + -0.022489286959171295, + 0.014293287880718708, + 0.08106958121061325, + 0.04238438978791237, + 0.011312066577374935, + -0.01426590234041214, + 0.01736888848245144, + -0.01597782038152218, + -0.020427869632840157, + 0.025408847257494926, + -0.04223146662116051, + 0.041443221271038055, + 0.0591445229947567, + 0.02523469738662243, + -0.09725356847047806, + -0.01898801513016224, + 0.027847114950418472, + -0.04803319275379181, + 0.026944655925035477, + 0.02424442209303379, + 0.04505226016044617, + 0.017642047256231308, + 0.06782802939414978, + 0.07919913530349731, + 0.010018542408943176, + 0.06952426582574844, + -0.044443923979997635, + 0.04577431455254555, + -0.03933382034301758, + 0.04271325096487999, + -0.05939020961523056, + 0.010933411307632923, + -0.06958586722612381, + 0.035891421139240265, + 0.022889476269483566, + 0.02230827324092388, + 0.01698937825858593, + 0.01690031588077545, + 0.022693175822496414, + 0.003974761813879013, + 
-0.06939641386270523,
+    -0.06184966489672661,
+    0.125400573015213,
+    0.0036680810153484344,
+    0.002990757580846548,
+    -0.1475633680820465
+  ).map(_.toFloat)
+  /** Returns a new array holding `floats` scaled to unit Euclidean (L2) length.
+    * An empty or all-zero input has no direction, so it is returned as an unchanged copy (not NaNs). */
+  def normalize(floats: Array[Float]): Array[Float] = {
+    val sumSquare = floats.foldLeft(0f) { case (sum, float) => sum + float * float }
+    val divisor = math.sqrt(sumSquare)
+    // Dividing by a zero norm would turn every element into NaN (0f / 0.0), so guard against it.
+    if (divisor == 0d) floats.clone() else floats.map { float => (float / divisor).toFloat }
+  }
+  // Join the elements explicitly: println on an Array prints only its JVM identity (e.g. [F@1b6d3586).
+  val normalized = normalize(array)
+  println(normalized.mkString("Array(", ", ", ")"))
+}
diff --git a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2GhanaApp.scala b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2GhanaApp.scala index fb2c8130..b9898e22 100644 --- a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2GhanaApp.scala +++ b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos2GhanaApp.scala @@ -40,9 +40,9 @@ object Step2InputEidos2GhanaApp extends App with Logging { val credentialsFilename = "../credentials/elasticsearch-credentials.properties" val deserializer = new JLDDeserializer() val url = new URL("http://localhost:9200") - // val url = new URL("https://elasticsearch.keithalcock.com") +// val url = new URL("https://elasticsearch.keithalcock.com") // val indexName = "habitus4" - val indexName = "dataset55k4" + val indexName = "dataset55ka4" val datasetName = "dataset55k.tsv" val regionName = "ghana" val alreadyNormalized = true @@ -301,7 +301,10 @@ object Step2InputEidos2GhanaApp extends App with Logging { sentences.zipWithIndex.foreach { case (sentence, sentenceIndex) => val causal = causalMentionGroups.contains(sentenceIndex) val cleanText = getSentenceText(document, sentence) - val tsvRecord = localTsvRecord + // This localTsvRecord is the first sentence of the document. + // We need to find the sentenceIndexth sentence instead.
+ // val tsvRecord = localTsvRecord + val tsvRecord = urlSentenceIndexToTsvRecordMap(url, sentenceIndex) val contextBefore = sentences .slice(sentenceIndex - contextWindow, sentenceIndex) .map(getSentenceText(document, _))