Merge pull request #2577 from julijonas/fix-database-quoting

Fix spark database double-quoting
moj-analytical-services · Jan 22, 2025 · cdd52a3 · cdd52a3
2 parents 471f1ed + c2c3929
commit cdd52a3
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 2 deletions.
diff --git a/splink/internals/spark/database_api.py b/splink/internals/spark/database_api.py
@@ -155,9 +155,14 @@ def _set_splink_datastore(self, catalog, database):
         # be stored. The filter will remove none, so if catalog is not provided and
         # spark version is < 3.3.0 we will use the default catalog.
         self.splink_data_store = ".".join(
-            [f"`{x}`" for x in [catalog, database] if x is not None]
+            [self._quote_if_needed(x) for x in [catalog, database] if x is not None]
         )
 
+    def _quote_if_needed(self, identifier):
+        if identifier.startswith("`") and identifier.endswith("`"):
+            return identifier
+        return f"`{identifier}`"
+
     def _register_udfs_from_jar(self):
         # TODO: this should check if these are already registered and skip if so
         # to cut down on warnings

diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py
@@ -70,14 +70,14 @@ def spark_csv_read(x):
 
     completeness_chart(df_spark, spark_api)
 
+    spark.sql("USE DATABASE `1111`")
     linker = Linker(
         df_spark,
         settings,
         SparkAPI(
             spark_session=spark,
             break_lineage_method="checkpoint",
             num_partitions_on_repartition=2,
-            database="1111",
         ),
     )