Snowflake integration #12

Open · wants to merge 9 commits into master
57 changes: 36 additions & 21 deletions .devcontainer/devcontainer.json
@@ -1,31 +1,46 @@
{
"name": "Fugue Development Environment",
"image": "fugueproject/devenv:0.7.7",
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.linting.enabled": true,
"python.linting.pylintEnabled": true,
"python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
"python.formatting.blackPath": "/usr/local/py-utils/bin/black",
"python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
"python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
"python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
"python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
"python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
"python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
"python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
"image": "mcr.microsoft.com/vscode/devcontainers/python:3.10",
"customizations": {
"vscode": {
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.defaultInterpreterPath": "/usr/local/bin/python",
"editor.defaultFormatter": "ms-python.black-formatter",
"isort.interpreter": [
"/usr/local/bin/python"
],
"flake8.interpreter": [
"/usr/local/bin/python"
],
"pylint.interpreter": [
"/usr/local/bin/python"
],
"black-formatter.interpreter": [
"/usr/local/bin/python"
]
},
"extensions": [
"ms-python.python",
"ms-python.isort",
"ms-python.flake8",
"ms-python.pylint",
"ms-python.mypy",
"ms-python.black-formatter",
"GitHub.copilot",
"njpwerner.autodocstring"
]
}
},
"extensions": [
"ms-python.python",
"ms-python.isort",
"njpwerner.autodocstring"
],
"forwardPorts": [
8888
],
"postCreateCommand": "make devenv",
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/java:1": {
"version": "11"
}
}
}
3 changes: 3 additions & 0 deletions Makefile
@@ -42,6 +42,9 @@ trinodocker:
testtrino:
python3 -bb -m pytest tests/fugue_trino --cov=fugue_trino

testsf:
python3 -bb -m pytest tests/fugue_snowflake --cov=fugue_snowflake

lab:
mkdir -p tmp
pip install .
9 changes: 3 additions & 6 deletions fugue_bigquery/execution_engine.py
@@ -112,14 +112,11 @@ def join(
key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
_filter = _df2.native[key_schema.names]
on_fields = [_df1.native[k] == _filter[k] for k in key_schema]
suffixes = dict(lname="", rname="{name}" + _JOIN_RIGHT_SUFFIX) # noqa
if how.lower() in ["semi", "left_semi"]:
tb = _df1.native.inner_join(
_filter, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
)
tb = _df1.native.inner_join(_filter, on_fields, **suffixes)
else:
tb = _df1.native.left_join(
_filter, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
)
tb = _df1.native.left_join(_filter, on_fields, **suffixes)
tb = tb[tb[key_schema.names[0] + _JOIN_RIGHT_SUFFIX].isnull()]
return self.to_df(tb[end_schema.names], schema=end_schema)

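
This change to the BigQuery engine appears to swap the older ibis suffixes=(...) join argument for the newer lname/rname keywords; the semi/anti join pattern itself is unchanged: a semi join is an inner join on the key columns, and an anti join is a left join followed by a filter on a null right-side key. A minimal sketch of that pattern with in-memory ibis tables (table, column, and suffix names below are illustrative, not taken from this PR):

import ibis

t1 = ibis.memtable({"a": [1, 2, 3], "b": ["x", "y", "z"]})
t2 = ibis.memtable({"a": [1, 2]})

# semi join: keep t1 rows whose key exists in t2
semi = t1.inner_join(t2, t1.a == t2.a, lname="", rname="{name}_r")[["a", "b"]]

# anti join: left join, then keep rows where the right-side key is null
joined = t1.left_join(t2, t1.a == t2.a, lname="", rname="{name}_r")
anti = joined[joined["a_r"].isnull()][["a", "b"]]
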
3 changes: 3 additions & 0 deletions fugue_snowflake/__init__.py
@@ -1,3 +1,6 @@
# flake8: noqa

from ._constants import FUGUE_SF_LOGGER
from .client import SnowflakeClient
from .dataframe import SnowflakeDataFrame
from .execution_engine import SnowflakeExecutionEngine
36 changes: 34 additions & 2 deletions fugue_snowflake/_constants.py
@@ -1,2 +1,34 @@
FUGUE_SNOWFLAKE_CONF_CREDENTIALS_ENV = None
FUGUE_SNOWFLAKE_CONF_ACCOUNT = None
import logging
import os
from typing import Any, Dict

from triad import ParamDict

_FUGUE_SF_ENV_PREFIX = "FUGUE_SF_"
_FUGUE_SF_CONF_PREFIX = "fugue.sf."

FUGUE_SF_CONF_PACKAGES = "fugue.sf.packages"
FUGUE_SF_CONF_IMPORTS = "fugue.sf.imports"
FUGUE_SF_CONF_CASE_SENSITIVE = "fugue.sf.case_sensitive"

FUGUE_SF_LOGGER = logging.getLogger("fugue_snowflake")


def get_client_init_params(conf: Any) -> Dict[str, Any]:
_conf = ParamDict(conf)
return dict( # noqa: C408
account=_get_value(_conf, "account"),
user=_get_value(_conf, "user"),
password=_get_value(_conf, "password"),
warehouse=_get_value(_conf, "warehouse"),
database=_get_value(_conf, "database"),
schema=_get_value(_conf, "schema"),
)


def _get_value(conf: ParamDict, name: str) -> Any:
if _FUGUE_SF_CONF_PREFIX + name in conf:
return conf.get_or_throw(_FUGUE_SF_CONF_PREFIX + name, str)
if _FUGUE_SF_ENV_PREFIX + name.upper() in os.environ:
return os.environ[_FUGUE_SF_ENV_PREFIX + name.upper()]
return None
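
The helper above resolves each Snowflake connection parameter from the Fugue config (keys with the fugue.sf. prefix) first and falls back to environment variables (FUGUE_SF_ prefix), returning None when neither is set. A minimal usage sketch with placeholder values, assuming no other FUGUE_SF_* variables are set in the environment:

import os

from fugue_snowflake._constants import get_client_init_params

os.environ["FUGUE_SF_ACCOUNT"] = "my_account"  # resolved via the env-var prefix
conf = {"fugue.sf.user": "my_user", "fugue.sf.warehouse": "my_wh"}

params = get_client_init_params(conf)
# {"account": "my_account", "user": "my_user", "password": None,
#  "warehouse": "my_wh", "database": None, "schema": None}
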
256 changes: 256 additions & 0 deletions fugue_snowflake/_utils.py
@@ -0,0 +1,256 @@
import re
from importlib.metadata import version as get_version
from typing import Any, Dict, Iterable, List, Optional, Set
from uuid import uuid4
import ibis.expr.datatypes as dt
import pyarrow as pa
from fugue_ibis import IbisSchema, IbisTable
from fugue_ibis._utils import ibis_to_pa_type
from ibis.backends.snowflake import Backend
from snowflake.connector.constants import FIELD_TYPES
from snowflake.connector.result_batch import ResultBatch
from triad import Schema
from triad.utils.pyarrow import (
get_alter_func,
parse_json_columns,
replace_types_in_table,
)

_PA_TYPE_TO_SF_TYPE: Dict[pa.DataType, str] = {
pa.string(): "STRING",
pa.bool_(): "BOOLEAN",
pa.int8(): "BYTEINT",
pa.int16(): "TINYINT",
pa.int32(): "SMALLINT",
pa.int64(): "BIGINT",
pa.uint8(): "INT",
pa.uint16(): "INT",
pa.uint32(): "INT",
pa.uint64(): "INT",
pa.float16(): "FLOAT",
pa.float32(): "FLOAT",
pa.float64(): "FLOAT",
pa.date32(): "DATE",
pa.binary(): "BINARY",
}


def quote_name(name: str) -> str:
quote = '"'
return quote + name.replace(quote, quote + quote) + quote


def unquote_name(name: str) -> str:
name = (
name.replace('""', "<DOUBLE_QUOTE>")
.replace('"', "")
.replace("<DOUBLE_QUOTE>", '"')
)
return name


def normalize_name(name: str) -> str:
if name.startswith('"') and name.endswith('"'):
return name
return name.upper()


def parse_table_name(name: str, normalize: bool = False) -> List[str]:
res: List[str] = []
start, p = 0, 0
while p < len(name):
if name[p] == '"':
p += 1
while p < len(name):
if name[p] == '"':
if p + 1 < len(name) and name[p + 1] == '"':
p += 1
else:
break
p += 1
p += 1
elif name[p] == ".":
res.append(name[start:p])
start = p + 1
p += 1
else:
p += 1
if start < len(name):
res.append(name[start:])
if normalize:
return [normalize_name(x) for x in res]
return res


def to_schema(schema: IbisSchema) -> Schema:
fields: List[Any] = []
for n, t in zip(schema.names, schema.types):
if _ibis_has_json(t):
fields.append((n, pa.string()))
else:
fields.append((n, ibis_to_pa_type(t)))
return Schema(fields)


def _ibis_has_json(tp: dt.DataType) -> bool:
if isinstance(tp, dt.Array):
return _ibis_has_json(tp.value_type)
if isinstance(tp, dt.Struct):
for t in tp.types:
if _ibis_has_json(t):
return True
if isinstance(tp, dt.Map):
return _ibis_has_json(tp.value_type)
return isinstance(tp, dt.JSON)


def pa_type_to_snowflake_type_str(tp: pa.DataType) -> str:
if tp in _PA_TYPE_TO_SF_TYPE:
return _PA_TYPE_TO_SF_TYPE[tp]
if pa.types.is_timestamp(tp):
if tp.tz is not None:
return "TIMESTAMP_TZ"
return "TIMESTAMP_NTZ"
if pa.types.is_decimal(tp):
return f"DECIMAL({tp.precision},{tp.scale})"
if pa.types.is_list(tp):
# itp = pa_type_to_snowflake_type_str(tp.value_type)
# return f"ARRAY({itp})"
return "ARRAY"
if pa.types.is_struct(tp):
# fields = []
# for f in tp:
# fields.append(
# f"{quote_name(f.name)} {pa_type_to_snowflake_type_str(f.type)}"
# )
# return f"OBJECT({', '.join(fields)})"
return "OBJECT"
if pa.types.is_map(tp):
# ktp = pa_type_to_snowflake_type_str(tp.key_type)
# vtp = pa_type_to_snowflake_type_str(tp.item_type)
# return f"MAP({ktp}, {vtp})"
return "MAP"
raise NotImplementedError(f"Unsupported type {tp}")


def fix_snowflake_arrow_result(result: pa.Table) -> pa.Table:
return replace_types_in_table(
result,
[
(lambda tp: pa.types.is_integer(tp), pa.int64()),
(lambda tp: pa.types.is_floating(tp), pa.float64()),
(
lambda tp: pa.types.is_decimal(tp)
and tp.precision == 38
and tp.scale == 0,
pa.int64(),
),
(lambda tp: pa.types.is_date64(tp), pa.date32()),
# (
# lambda tp: pa.types.is_timestamp(tp)
# and tp.tz is None
# and tp != TRIAD_DEFAULT_TIMESTAMP,
# TRIAD_DEFAULT_TIMESTAMP,
# ),
],
)


def is_sf_ibis_table(df: Any) -> bool:
if not isinstance(df, IbisTable):
return False
try:
return isinstance(df._find_backend(), Backend)
except Exception: # pragma: no cover
return False


def to_snowflake_schema(schema: Any) -> str:
_s = schema if isinstance(schema, Schema) else Schema(schema)
fields = []
for f in _s.fields:
fields.append(f"{quote_name(f.name)} {pa_type_to_snowflake_type_str(f.type)}")
return ", ".join(fields)


def get_arrow_from_batches(
batches: Optional[List[ResultBatch]],
query_output_schema: Schema,
schema: Any = None,
infer_nested_types: bool = False,
) -> pa.Table:
output_schema_has_nested = False
if batches is None or len(batches) == 0:
if schema is not None:
return (
schema if isinstance(schema, Schema) else Schema(schema)
).create_empty_arrow_table()
return query_output_schema.create_empty_arrow_table()

def _batches_to_arrow(_batches: List[ResultBatch]) -> Iterable[pa.Table]:
has_result = False
for batch in _batches:
adf = batch.to_arrow()
if adf.num_rows == 0:
continue
func = get_alter_func(adf.schema, query_output_schema.pa_schema, safe=True)
has_result = True
yield func(adf)

if not has_result:
yield query_output_schema.create_empty_arrow_table()

adf = pa.concat_tables(_batches_to_arrow(batches))

nested_cols = _get_nested_columns(batches[0])
if schema is not None:
_schema = schema if isinstance(schema, Schema) else Schema(schema)
output_schema_has_nested = any(pa.types.is_nested(tp) for tp in _schema.types)
if (output_schema_has_nested or infer_nested_types) and len(nested_cols) > 0:
adf = parse_json_columns(adf, nested_cols)
if schema is not None:
func = get_alter_func(adf.schema, _schema.pa_schema, safe=True)
adf = func(adf)
return adf


def _get_nested_columns(batch: ResultBatch) -> List[str]:
res: List[str] = []
for meta in batch.schema:
f = FIELD_TYPES[meta.type_code]
if f.name in ["OBJECT", "ARRAY", "MAP", "VARIANT"]:
res.append(meta.name)
return res


def _get_batch_arrow_schema(batch: ResultBatch) -> pa.Schema:
fields = [
pa.field(s.name, FIELD_TYPES[s.type_code].pa_type()) for s in batch.schema
]
return pa.schema(fields)


def temp_rand_str() -> str:
return ("temp_" + str(uuid4()).split("-")[0]).upper()


def build_package_list(packages: Iterable[str]) -> List[str]:
ps: Set[str] = set()
for p in packages:
ps.add(p)
continue
try:
if "=" in p or "<" in p or ">" in p:
ps.add(p)
else:
ps.add(p + "==" + get_version(p))
except Exception: # pragma: no cover
ps.add(p)
return list(ps)


def is_select_query(s: str) -> bool:
return (
re.match(r"^\s*select\s", s, re.IGNORECASE) is not None
or re.match(r"^\s*with\s", s, re.IGNORECASE) is not None
)
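
Several of the helpers above are pure functions that are easy to exercise in isolation. A minimal sketch of how they behave; the expected values in the comments are inferred from the code in this diff, not from running it:

from triad import Schema

from fugue_snowflake._utils import (
    is_select_query,
    normalize_name,
    parse_table_name,
    quote_name,
    to_snowflake_schema,
)

# identifier handling: quoting doubles embedded quotes; normalization
# upper-cases unquoted names and leaves quoted names untouched
quote_name('my "odd" col')                               # '"my ""odd"" col"'
normalize_name("my_table")                               # 'MY_TABLE'
normalize_name('"my_table"')                             # '"my_table"'
parse_table_name('db."My Schema".tbl', normalize=True)   # ['DB', '"My Schema"', 'TBL']

# schema conversion: a triad Schema becomes Snowflake column definitions
to_snowflake_schema(Schema("a:int,b:str,c:[int]"))       # '"a" SMALLINT, "b" STRING, "c" ARRAY'

# query detection: SELECT and WITH statements count as select queries
is_select_query("  SELECT * FROM t")                     # True
is_select_query("WITH x AS (SELECT 1) SELECT * FROM x")  # True
is_select_query("CREATE TABLE t (a INT)")                # False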