rapidsai · quasiben · Jun 4, 2019 · Jun 4, 2019 · Jun 4, 2019 · Jun 4, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - PR #1828 JSON Reader: add suport for bool8 columns
 - PR #1665 Add the point-in-polygon GIS function
 - PR #1863 Series and Dataframe methods for all and any
+- PR #1917 Adds an index hashing method
 
 ## Improvements
 - PR #1538 Replacing LesserRTTI with inequality_comparator

@@ -23,6 +23,7 @@
 from cudf.comm.serialize import register_distributed_serializer
 
 import cudf.bindings.copying as cpp_copying
+import cudf.bindings.hash as cpp_hash
 
 
 class Index(object):
@@ -530,6 +531,31 @@ def find_label_range(self, first, last):
             end += 1
         return begin, end
 
+    def hash_index(self):
+        """Hash the values of this index and return them as a new Series.
+
+        Returns
+        -------
+        Series
+            The absolute value of the hash computed for each index entry.
+            The output buffer is allocated with ``len(self)`` int32
+            elements, i.e. one slot per index entry.
+        """
+        # Imported inside the method rather than at module level
+        # (presumably to avoid a circular import -- confirm).
+        from cudf.dataframe.series import Series
+
+        # No seed values are handed to the hashing routine.
+        initial_hash_values = None
+        # Pre-allocate the int32 device buffer handed to hash_columns as
+        # its result column.
+        buf = Buffer(rmm.device_array(len(self), dtype=np.int32))
+        result = NumericalColumn(data=buf, dtype=buf.dtype)
+
+        _hash = cpp_hash.hash_columns([self.as_column()],
+                                      result, initial_hash_values)
+
+        sr = Series(_hash)
+
+        # hash_columns produces negative values, so take the absolute
+        # value; this can probably switch to np.uint32 once it is
+        # supported by libcudf.
+        # NOTE(review): the most negative int32 has no positive int32
+        # counterpart, so abs() may leave that single value negative --
+        # confirm how Series.__abs__ handles it.
+        return abs(sr)
+
 
 class DatetimeIndex(GenericIndex):
     # TODO this constructor should take a timezone or something to be

@@ -123,6 +123,24 @@ def test_categorical_index():
     assert_eq(pdf.index, gdf2.index)
 
 
+@pytest.mark.parametrize('index_name', [
+    'num_idx',
+    'cat_idx',
+])
+def test_hashing_index(index_name):
+    pdf = pd.DataFrame()
+    pdf['num_idx'] = [1, 2, 3, 1]
+    pdf['cat_idx'] = pd.Categorical(['a', 'b', 'c', 'a'])
+    gdf = DataFrame.from_pandas(pdf)
+    sr = gdf.set_index(index_name).index.hash_index()
+
+    # values are always positive for modulo calculation
+    assert_eq(sr, sr[sr > 0])
+    assert len(sr) == len(pdf[index_name])
+    assert sr.iloc[0] == sr.iloc[-1]
+    assert len(sr.unique()) == len(sr) - 1
+
+
 def test_pandas_as_index():
     # Define Pandas Indexes
     pdf_int_index = pd.Int64Index([1, 2, 3, 4, 5])