From 5e132c08279b19640dc24c4cdd4932e3ad0b5ddc Mon Sep 17 00:00:00 2001 From: Badrul Chowdhury Date: Tue, 5 Dec 2023 10:23:27 -0800 Subject: [PATCH] [SYSTEMDS-3652] Adding support for removing duplicate rows to the unique() builtin function (#1949) This patch adds support for removing duplicate rows to the unique() builtin function. Currently, only CP is supported; support for SPARK will be added in a subsequent patch. Furthermore, we will explore possible optimizations of the naive implementation using bitmaps in a subsequent patch. --- .../runtime/matrix/data/LibMatrixSketch.java | 62 +++++++++++++- .../test/functions/unique/UniqueRow.java | 80 +++++++++++++++++++ .../scripts/functions/unique/uniqueRow.dml | 24 ++++++ 3 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java create mode 100644 src/test/scripts/functions/unique/uniqueRow.dml diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java index 43a4c1bc5a4..29089078a89 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java @@ -22,14 +22,15 @@ import org.apache.commons.lang3.NotImplementedException; import org.apache.sysds.common.Types; +import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; public class LibMatrixSketch { public static MatrixBlock getUniqueValues(MatrixBlock blkIn, Types.Direction dir) { - //similar to R's unique, this operation takes a matrix and computes the - //unique values (or rows in case of multiple column inputs) + // similar to R's unique, this operation takes a matrix and computes the unique values + // (or rows in case of multiple column inputs) int rlen = blkIn.getNumRows(); int clen = blkIn.getNumColumns(); @@ -57,6 +58,63 @@ public static MatrixBlock 
getUniqueValues(MatrixBlock blkIn, Types.Direction dir break; case Row: + ArrayList retainedRows = new ArrayList<>(); + + for (int i=0; i