
Commit

init
arjunsk committed Feb 15, 2024
1 parent d79f2b5 commit aeaf781
Showing 20 changed files with 348 additions and 1,296 deletions.
94 changes: 25 additions & 69 deletions README.md
@@ -6,9 +6,7 @@

This is a simple implementation of the [Elkan's Kmeans](https://cdn.aaai.org/ICML/2003/ICML03-022.pdf)
-algorithm in Go. The library also contains [Kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B),
-[Lloyd's kmeans](https://en.wikipedia.org/wiki/K-means_clustering#Standard_algorithm_(naive_k-means)) and
-[Simple Random Sampling](https://en.wikipedia.org/wiki/Simple_random_sample) algorithms.
+algorithm in Go.

### Installing

@@ -24,94 +22,52 @@ package main
import (
"fmt"
"github.com/arjunsk/kmeans"
"github.com/arjunsk/kmeans/elkans"
)

func main() {
-vectors := [][]float64{
+vectorList := [][]float64{
{1, 2, 3, 4},
{0, 3, 4, 1},
{0, 9, 3, 1},
{0, 8, 4, 4},
{130, 200, 343, 224},
{100, 200, 300, 400},
{300, 400, 200, 110},
{1, 2, 4, 5},
{1, 2, 4, 5},
{1, 2, 3, 4},
{1, 2, 4, 5},
{1, 2, 4, 5},
{10, 2, 4, 5},
{10, 3, 4, 5},
{10, 5, 4, 5},
{10, 2, 4, 5},
{10, 3, 4, 5},
{10, 5, 4, 5},
}

-clusterer, err := kmeans.NewCluster(kmeans.ELKAN, vectors, 2)
+clusterer, err := elkans.NewKMeans(vectorList, 2,
+500, 0.5,
+kmeans.L2Distance, kmeans.KmeansPlusPlus, false)
if err != nil {
panic(err)
}

-clusters, err := clusterer.Cluster()
+centroids, err := clusterer.Cluster()
if err != nil {
panic(err)
}

-for _, cluster := range clusters {
-fmt.Println(cluster.Center())
+for _, centroid := range centroids {
+fmt.Println(centroid)
}
-// Output:
-// [1 2 3 4]
-// [130 200 343 224]

+/*
+[1 2 3.6666666666666665 4.666666666666666]
+[10 3.333333333333333 4 5]
+*/
}
```

### FAQ
<details>
<summary> Read More </summary>

#### Why not Kmeans++ initialization in Elkan's?

The default setting of Elkan's Kmeans is to use [random initialization](/initializer/random.go)
instead of [Kmeans++ initialization](/initializer/kmeans_plus_plus.go).

Based on the excerpt below from a
[FAISS discussion](https://github.com/facebookresearch/faiss/issues/268#issuecomment-348184505),
the extra computation of Kmeans++ initialization is not worth the negligible savings in large-scale use cases.

> Scikitlearn uses k-means++ initialization by default (you can also use random points), which is good in the specific
> corner-case you consider. It should actually give you a perfect result even without any iteration with high
> probability, because the kind of evaluation you consider is exactly what k-means++ has been designed to better handle.
> We have not implemented it in Faiss, because with our former Yael library, which implements both k-means++ and regular
> random initialization, we observed that the overhead computational cost was not worth the saving (negligible) in all
> large-scale settings we have considered.

#### When should you consider sub-sampling?

As mentioned [here](https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids),
when the number of vectors is large, it is recommended to use sub-sampling.


> When applying k-means algorithm to cluster n points to k centroids, there are several cases:
>
> - n < k: this raises an exception with an assertion because we cannot do anything meaningful
> - n < min_points_per_centroid * k: this produces the warning above. It means that usually there are too few points to
>   reliably estimate the centroids. This may still be ok if the dataset to index is as small as the training set.
> - n < max_points_per_centroid * k: comfort zone
> - n > max_points_per_centroid * k: there are too many points, making k-means unnecessarily slow. Then the training set
>   is sampled.
>
> The parameters {min,max}_points_per_centroids (39 and 256 by default) belong to the ClusteringParameters structure.

#### What could be your sample size?
- [Apache Sedona](https://github.com/apache/sedona/blob/06e7d679ff979a4f052e0afe5df0b303bf8d70fb/spark/common/src/main/java/org/apache/sedona/core/utils/RDDSampleUtils.java#L36C10-L36C10) uses the following sampling rule.

> Number of partitions (ie K) cannot exceed half the number of records.
>
> Returns total number of records if it is < 1000. Otherwise, returns 1% of the total number
> of records or 2x number of partitions whichever is larger. Never returns a
> number > Integer.MAX_VALUE.

  The 2x factor could be scaled by the vector dimension (here the vectors are geo-coordinates). For example, with
  1000-dimensional vectors the sample size could be max(1% of total vectors, 1000 × k).

- Based on FAISS, the sample size could be `max_points_per_centroid * k` if `n > max_points_per_centroid * k`.



-#### What should be the ideal K?
+#### What should be the ideal Centroids Count?
Based on the recommendations from the [PGVector](https://github.com/pgvector/pgvector/tree/master#ivfflat) IVFFlat index,
the ideal K should

14 changes: 7 additions & 7 deletions elkans/clusterer.go
@@ -16,8 +16,8 @@ package elkans

import (
"github.com/arjunsk/kmeans"
"github.com/arjunsk/kmeans/moarray"
"github.com/arjunsk/kmeans/moerr"
moarray2 "github.com/arjunsk/kmeans/utils/moarray"
"github.com/arjunsk/kmeans/utils/moerr"
"gonum.org/v1/gonum/mat"
"math"
"math/rand"
@@ -87,7 +87,7 @@ func NewKMeans(vectors [][]float64, clusterCnt,
return nil, err
}

-gonumVectors, err := moarray.ToGonumVectors[float64](vectors...)
+gonumVectors, err := moarray2.ToGonumVectors[float64](vectors...)
if err != nil {
return nil, err
}
@@ -153,11 +153,11 @@ func (km *ElkanClusterer) InitCentroids() error {
// Cluster returns the final centroids and the error if any.
func (km *ElkanClusterer) Cluster() ([][]float64, error) {
if km.normalize {
-moarray.NormalizeGonumVectors(km.vectorList)
+moarray2.NormalizeGonumVectors(km.vectorList)
}

if km.vectorCnt == km.clusterCnt {
-return moarray.ToMoArrays[float64](km.vectorList), nil
+return moarray2.ToMoArrays[float64](km.vectorList), nil
}

err := km.InitCentroids() // step 0.1
@@ -172,7 +172,7 @@ func (km *ElkanClusterer) Cluster() ([][]float64, error) {
return nil, err
}

-return moarray.ToMoArrays[float64](res), nil
+return moarray2.ToMoArrays[float64](res), nil
}

func (km *ElkanClusterer) elkansCluster() ([]*mat.VecDense, error) {
@@ -384,7 +384,7 @@ func (km *ElkanClusterer) recalculateCentroids() []*mat.VecDense {

// normalize the random vector
if km.normalize {
-moarray.NormalizeGonumVector(newCentroids[c])
+moarray2.NormalizeGonumVector(newCentroids[c])
}
} else {
// find the mean of the cluster members
4 changes: 2 additions & 2 deletions elkans/clusterer_test.go
@@ -16,8 +16,8 @@ package elkans

import (
"github.com/arjunsk/kmeans"
"github.com/arjunsk/kmeans/assertx"
"github.com/arjunsk/kmeans/moarray"
"github.com/arjunsk/kmeans/utils/assertx"
"github.com/arjunsk/kmeans/utils/moarray"
"reflect"
"testing"
)
63 changes: 32 additions & 31 deletions elkans/distance_func.go
@@ -16,8 +16,9 @@ package elkans

import (
"github.com/arjunsk/kmeans"
"github.com/arjunsk/kmeans/moerr"
"github.com/arjunsk/kmeans/utils/moerr"
"gonum.org/v1/gonum/mat"
"math"
)

// L2Distance is used for L2Distance distance in Euclidean Kmeans.
@@ -27,34 +28,34 @@ func L2Distance(v1, v2 *mat.VecDense) float64 {
return mat.Norm(diff, 2)
}

-//// SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans.
-//// NOTE: spherical distance between two points on a sphere is equal to the
-//// angular distance between the two points, scaled by pi.
-//// Refs:
-//// https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version
-//func SphericalDistance(v1, v2 *mat.VecDense) float64 {
-// // Compute the dot product of the two vectors.
-// // The dot product of two vectors is a measure of their similarity,
-// // and it can be used to calculate the angle between them.
-// dp := mat.Dot(v1, v2)
-//
-// // Prevent NaN with acos with loss of precision.
-// if dp > 1.0 {
-// dp = 1.0
-// } else if dp < -1.0 {
-// dp = -1.0
-// }
-//
-// theta := math.Acos(dp)
-//
-// //To scale the result to the range [0, 1], we divide by Pi.
-// return theta / math.Pi
-//
-// // NOTE:
-// // Cosine distance is a measure of the similarity between two vectors. [Not satisfy triangle inequality]
-// // Angular distance is a measure of the angular separation between two points. [Satisfy triangle inequality]
-// // Spherical distance is a measure of the spatial separation between two points on a sphere. [Satisfy triangle inequality]
-//}
+// SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans.
+// NOTE: spherical distance between two points on a sphere is equal to the
+// angular distance between the two points, scaled by pi.
+// Refs:
+// https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version
+func SphericalDistance(v1, v2 *mat.VecDense) float64 {
+// Compute the dot product of the two vectors.
+// The dot product of two vectors is a measure of their similarity,
+// and it can be used to calculate the angle between them.
+dp := mat.Dot(v1, v2)
+
+// Prevent NaN with acos with loss of precision.
+if dp > 1.0 {
+dp = 1.0
+} else if dp < -1.0 {
+dp = -1.0
+}
+
+theta := math.Acos(dp)
+
+// To scale the result to the range [0, 1], we divide by Pi.
+return theta / math.Pi
+
+// NOTE:
+// Cosine distance is a measure of the similarity between two vectors. [Not satisfy triangle inequality]
+// Angular distance is a measure of the angular separation between two points. [Satisfy triangle inequality]
+// Spherical distance is a measure of the spatial separation between two points on a sphere. [Satisfy triangle inequality]
+}

// resolveDistanceFn returns the distance function corresponding to the distance type
// Distance function should satisfy triangle inequality.
@@ -66,8 +67,8 @@ func resolveDistanceFn(distType kmeans.DistanceType) (kmeans.DistanceFunction, e
switch distType {
case kmeans.L2Distance:
distanceFunction = L2Distance
-//case kmeans.InnerProduct, kmeans.CosineDistance:
-// distanceFunction = SphericalDistance
+case kmeans.InnerProduct, kmeans.CosineDistance:
+distanceFunction = SphericalDistance
default:
return nil, moerr.NewInternalErrorNoCtx("invalid distance type")
}
8 changes: 4 additions & 4 deletions elkans/distance_func_bench_test.go
@@ -15,7 +15,7 @@
package elkans

import (
"github.com/arjunsk/kmeans/moarray"
moarray2 "github.com/arjunsk/kmeans/utils/moarray"
"gonum.org/v1/gonum/mat"
"math/rand"
"testing"
@@ -43,7 +43,7 @@ func Benchmark_L2Distance(b *testing.B) {
b.ResetTimer()

for i := 0; i < b.N; i++ {
-_, _ = moarray.NormalizeL2[float64](v1[i])
+_, _ = moarray2.NormalizeL2[float64](v1[i])
}
})

@@ -52,8 +52,8 @@
b.ResetTimer()

for i := 0; i < b.N; i++ {
-v21, _ := moarray.NormalizeL2[float64](v2[i])
-_ = L2Distance(v1[i], moarray.ToGonumVector(v21))
+v21, _ := moarray2.NormalizeL2[float64](v2[i])
+_ = L2Distance(v1[i], moarray2.ToGonumVector(v21))
}
})

