Changes and bug fixes to support shared clusters in DBR 14.2 #248

Merged
64 commits merged on Feb 22, 2024
Commits
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
3094e96
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 13, 2023
3bf6e9b
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 17, 2023
caaff18
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
87d5c50
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
4536794
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 19, 2023
eba6193
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 21, 2023
c4fdc3b
wip
ronanstokes-db May 9, 2023
8734b19
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db May 30, 2023
f063235
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jun 28, 2023
b9fb552
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 1, 2023
3eb15f4
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 11, 2023
c85f915
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 13, 2023
e53f8fe
changes for release
ronanstokes-db Jul 13, 2023
4259cac
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 21, 2023
fef24fa
example notebook
ronanstokes-db Oct 3, 2023
35e2de4
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Oct 5, 2023
ce1a12d
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Oct 5, 2023
63c8e70
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Dec 13, 2023
3ac33c0
updates to handle shared spark session restrictions
ronanstokes-db Dec 18, 2023
c5036a6
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
e735643
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
ce83fe8
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
81310b5
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
842bf7a
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
4da31ac
Merge branch 'master' into feature_shared_support
ronanstokes-db Dec 18, 2023
9513736
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
080778c
Merge branch 'feature_shared_support' of https://github.com/databrick…
ronanstokes-db Dec 18, 2023
aff2310
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
dc49472
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
24087d3
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
090b124
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
65aae5a
changes per code review
ronanstokes-db Jan 16, 2024
1c418f0
Doc updates 032223 (#180)
ronanstokes-db Mar 25, 2023
0eec26a
Feature v34 (#201)
ronanstokes-db Apr 7, 2023
240ef41
Feature generate from existing data (#163)
ronanstokes-db Apr 9, 2023
383da23
Remove calls to root logger. (#205)
MarvinSchenkel Apr 13, 2023
c95a995
Release v34post1 (#206)
ronanstokes-db Apr 13, 2023
9b48095
Fix doc typos and minor clarification (#207)
ronanstokes-db Apr 18, 2023
4ce97cd
Feature issue 209 (#210)
ronanstokes-db Apr 18, 2023
571e4f2
Release 0v34post2 (#211)
ronanstokes-db Apr 19, 2023
318fcfc
Feature html formatting (#208)
ronanstokes-db Apr 20, 2023
f935c63
wip
ronanstokes-db May 9, 2023
c01baa3
Build fixes (#213)
ronanstokes-db May 9, 2023
bc2bd57
Feature doc change generating text (#218)
ronanstokes-db Jun 15, 2023
b2e09ca
Feature build update (#220)
ronanstokes-db Jun 28, 2023
9b0847b
Feature struct changes (#219)
ronanstokes-db Jul 8, 2023
1dc6606
Feature additional docs (#222)
ronanstokes-db Jul 12, 2023
0de8665
changes for release
ronanstokes-db Jul 13, 2023
7dca740
example notebook
ronanstokes-db Oct 3, 2023
fe6865e
Feature readme updates - updates readme to note compatible Unity Cata…
ronanstokes-db Oct 5, 2023
23d5db9
Feature add codeowners (#238)
ronanstokes-db Oct 5, 2023
aa772f0
Test assign (#239)
nfx Nov 22, 2023
c830d20
updates to handle shared spark session restrictions
ronanstokes-db Dec 18, 2023
a482861
Update LICENSE (#246)
nfx Dec 18, 2023
ed989ed
updates to handle shared sparkSession
ronanstokes-db Dec 18, 2023
a59df0e
changes per code review
ronanstokes-db Jan 16, 2024
3c55687
Merge branch 'feature_shared_support' of https://github.com/databrick…
ronanstokes-db Feb 17, 2024
64ed5de
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Feb 17, 2024
22925fd
Merge branch 'master' into feature_shared_support
ronanstokes-db Feb 17, 2024
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -3,10 +3,10 @@
## Change History
All notable changes to the Databricks Labs Data Generator will be documented in this file.



#### Changed
* Updated readme to include details on which versions of the Databricks runtime support Unity Catalog `shared` access mode.
* Updated code to use a default parallelism of 200 when using a shared Spark session and the `sparkContext` is not accessible
* Updated code to use Spark's SQL function `element_at` instead of array indexing, due to incompatibility with shared clusters


### Version 0.3.5
2 changes: 1 addition & 1 deletion dbldatagen/__init__.py
@@ -26,7 +26,7 @@
from .data_generator import DataGenerator
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION, \
- INFER_DATATYPE
+ INFER_DATATYPE, SPARK_DEFAULT_PARALLELISM
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
json_value_from_path, system_time_millis
2 changes: 1 addition & 1 deletion dbldatagen/column_generation_spec.py
@@ -1085,7 +1085,7 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
.astype(self.datatype))

if self.values is not None:
- new_def = array([lit(x) for x in self.values])[new_def.astype(IntegerType())]
+ new_def = F.element_at(F.array([F.lit(x) for x in self.values]), new_def.astype(IntegerType()) + 1)
elif type(self.datatype) is StringType and self.expr is None:
new_def = self._applyPrefixSuffixExpressions(self.prefix, self.suffix, new_def)

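For context on the column_generation_spec.py change above: `element_at` uses 1-based indexing, which is why the new expression adds `+ 1` to the computed index. Below is a minimal standalone sketch (not part of the PR; the value list and column names are illustrative) comparing the two lookup styles.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

values = ["alpha", "beta", "gamma"]                      # illustrative values list
df = spark.range(6).withColumnRenamed("id", "idx")
arr = F.array([F.lit(v) for v in values])

# Old style: 0-based lookup via Column indexing; reported in this PR as incompatible with shared clusters
old_style = arr[(F.col("idx") % 3).cast("int")]

# New style: element_at is 1-based, so the computed 0-based index needs "+ 1"
new_style = F.element_at(arr, (F.col("idx") % 3).cast("int") + 1)

# Both expressions select the same values from the array
df.select("idx", old_style.alias("old_style"), new_style.alias("new_style")).show()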
24 changes: 21 additions & 3 deletions dbldatagen/data_generator.py
@@ -16,7 +16,7 @@
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \
DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION, \
OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, \
- INFER_DATATYPE
+ INFER_DATATYPE, SPARK_DEFAULT_PARALLELISM
from .html_utils import HtmlUtils
from .schema_parser import SchemaParser
from .spark_singleton import SparkSingleton
@@ -50,6 +50,9 @@ class DataGenerator:
it is recommended that you use a different name for the seed column - for example `_id`.

This may be specified by setting the `seedColumnName` attribute to `_id`

Note: in a shared Spark session, the `sparkContext` attribute is not available, so the default parallelism is set to 200.
We recommend passing an explicit value for `partitions` in this case.
"""

# class vars
@@ -97,9 +100,8 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
# if the active Spark session is stopped, you may end up with a valid SparkSession object but the underlying
# SparkContext will be invalid
assert sparkSession is not None, "Spark session not initialized"
- assert sparkSession.sparkContext is not None, "Expecting spark session to have valid sparkContext"

- self.partitions = partitions if partitions is not None else sparkSession.sparkContext.defaultParallelism
+ self.partitions = partitions if partitions is not None else self._getDefaultSparkParallelism(sparkSession)

# check for old versions of args
if "starting_id" in kwargs:
@@ -239,6 +241,22 @@ def _setupLogger(self):
else:
self.logger.setLevel(logging.WARNING)

@staticmethod
def _getDefaultSparkParallelism(sparkSession):
"""Get the default parallelism for a spark session, if spark session supports getting the sparkContext
:param sparkSession: spark session
:return: default parallelism
"""
try:
if sparkSession.sparkContext is not None:
return sparkSession.sparkContext.defaultParallelism
else:
return SPARK_DEFAULT_PARALLELISM
except Exception as err: # pylint: disable=broad-exception-caught
err_msg = f"Error getting default parallelism, using default setting of {SPARK_DEFAULT_PARALLELISM}"
logging.warning(err_msg)
return SPARK_DEFAULT_PARALLELISM

@classmethod
def useSeed(cls, seedVal):
""" set seed for random number generation
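A brief usage sketch of the recommendation in the new docstring note: on a shared cluster, pass `partitions` explicitly rather than relying on the 200-partition fallback. This is not part of the PR; the column names, row counts, and partition count below are hypothetical.

import dbldatagen as dg
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()   # on Databricks, the ambient `spark` session would be used

# Without an explicit `partitions` value, a shared Spark session (no usable sparkContext)
# makes the generator fall back to SPARK_DEFAULT_PARALLELISM (200).
spec = (dg.DataGenerator(spark, name="example_data", rows=1_000_000, partitions=8)
        .withColumn("device_id", "long", minValue=1, maxValue=100_000)
        .withColumn("reading", "double", minValue=0.0, maxValue=100.0, random=True))

df = spec.build()
df.show(5)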
5 changes: 4 additions & 1 deletion dbldatagen/datagen_constants.py
@@ -42,4 +42,7 @@
OPTION_RANDOM_SEED_METHOD = "randomSeedMethod"
OPTION_RANDOM_SEED = "randomSeed"

- INFER_DATATYPE = "__infer__"
+ INFER_DATATYPE = "__infer__"

# default parallelism when sparkContext is not available
SPARK_DEFAULT_PARALLELISM = 200
82 changes: 82 additions & 0 deletions tests/test_shared_env.py
@@ -0,0 +1,82 @@
import logging
from unittest.mock import Mock, PropertyMock

import pytest
import dbldatagen as dg


@pytest.fixture(scope="class")
def setupLogging():
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)


class TestSharedEnv:
"""Tests to simulate testing under a Unity Catalog shared environment. In a Unity Catalog shared environment with
the 14.x versions of the Databricks runtime, the sparkSession object does not support use of the sparkContext
attribute to get the default parallelism. In this case, we want to catch errors and return a default of
200 as the default number of partitions. This is the same as the default parallelism in many versions of Spark.


"""
SMALL_ROW_COUNT = 100000
COLUMN_COUNT = 10

@pytest.fixture(scope="class")
def sparkSession(self, setupLogging):
spark = dg.SparkSingleton.getLocalInstance("unit tests")
return spark

@pytest.fixture(scope="class")
def sharedSparkSession(self, setupLogging):
spark = Mock(wraps=dg.SparkSingleton.getLocalInstance("unit tests"))
del spark.sparkContext
return spark

@pytest.fixture(scope="class")
def sparkSessionNullContext(self, setupLogging):

class MockSparkSession:
def __init__(self):
self.sparkContext = None

spark = MockSparkSession()
return spark

def test_getDefaultParallelism(self, sparkSession):
"""Test that the default parallelism is returned when the sparkSession object supports use of the
sparkContext attribute to get the default parallelism.

:param sparkSession: The sparkSession object to use for the test.
"""
defaultParallelism = dg.DataGenerator._getDefaultSparkParallelism(sparkSession)
assert defaultParallelism == sparkSession.sparkContext.defaultParallelism

def test_getSharedDefaultParallelism(self, sharedSparkSession):
"""Test that the default parallelism is returned when the sparkSession object supports use of the
sparkContext attribute to get the default parallelism, but that a constant is return when the `sparkContext`
attribute is not available.
"""
defaultParallelism = dg.DataGenerator._getDefaultSparkParallelism(sharedSparkSession)
assert defaultParallelism == dg.SPARK_DEFAULT_PARALLELISM

def test_getNullContextDefaultParallelism(self, sparkSessionNullContext):
"""Test that the default parallelism is returned when the sparkSession object supports use of the
sparkContext attribute to get the default parallelism.

:param sparkSession: The sparkSession object to use for the test.
"""
defaultParallelism = dg.DataGenerator._getDefaultSparkParallelism(sparkSessionNullContext)
assert defaultParallelism == dg.SPARK_DEFAULT_PARALLELISM

def test_mocked_shared_session1(self, sharedSparkSession):
# validate that accessing the sparkContext on the shared spark session raises an exception
with pytest.raises(Exception) as excinfo:
context = sharedSparkSession.sparkContext

assert "sparkContext" in str(excinfo.value)

def test_null_context_spark_session(self, sparkSessionNullContext):
# validate that the sparkContext attribute on the null-context spark session is None
context = sparkSessionNullContext.sparkContext
assert context is None
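As an aside on the mocking technique used by the sharedSparkSession fixture above: deleting an attribute from a unittest.mock.Mock makes subsequent access raise AttributeError, which is how these tests simulate a DBR 14.x shared cluster that hides sparkContext. A small standalone illustration follows; the FakeSession class is invented for the example and is not part of the PR.

from unittest.mock import Mock

class FakeSession:
    sparkContext = "pretend-context"     # stands in for a real SparkContext

shared = Mock(wraps=FakeSession())
del shared.sparkContext                  # attribute access now raises AttributeError

try:
    _ = shared.sparkContext
except AttributeError as err:
    print(f"access failed as expected: {err}")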