Skip to content

Commit

Permalink
Merge pull request #2577 from julijonas/fix-database-quoting
Browse files Browse the repository at this point in the history
Fix spark database double-quoting
  • Loading branch information
RobinL authored Jan 22, 2025
2 parents 471f1ed + c2c3929 commit cdd52a3
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
7 changes: 6 additions & 1 deletion splink/internals/spark/database_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,14 @@ def _set_splink_datastore(self, catalog, database):
# be stored. The filter will remove none, so if catalog is not provided and
# spark version is < 3.3.0 we will use the default catalog.
self.splink_data_store = ".".join(
[f"`{x}`" for x in [catalog, database] if x is not None]
[self._quote_if_needed(x) for x in [catalog, database] if x is not None]
)

def _quote_if_needed(self, identifier):
if identifier.startswith("`") and identifier.endswith("`"):
return identifier
return f"`{identifier}`"

def _register_udfs_from_jar(self):
# TODO: this should check if these are already registered and skip if so
# to cut down on warnings
Expand Down
2 changes: 1 addition & 1 deletion tests/test_full_example_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,14 @@ def spark_csv_read(x):

completeness_chart(df_spark, spark_api)

spark.sql("USE DATABASE `1111`")
linker = Linker(
df_spark,
settings,
SparkAPI(
spark_session=spark,
break_lineage_method="checkpoint",
num_partitions_on_repartition=2,
database="1111",
),
)

Expand Down

0 comments on commit cdd52a3

Please sign in to comment.