Snowflake integration #12

Open · wants to merge 9 commits into master
57 changes: 36 additions & 21 deletions .devcontainer/devcontainer.json
@@ -1,31 +1,46 @@
{
"name": "Fugue Development Environment",
"image": "fugueproject/devenv:0.7.7",
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.linting.enabled": true,
"python.linting.pylintEnabled": true,
"python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
"python.formatting.blackPath": "/usr/local/py-utils/bin/black",
"python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
"python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
"python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
"python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
"python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
"python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
"python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
"image": "mcr.microsoft.com/vscode/devcontainers/python:3.10",
"customizations": {
"vscode": {
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.defaultInterpreterPath": "/usr/local/bin/python",
"editor.defaultFormatter": "ms-python.black-formatter",
"isort.interpreter": [
"/usr/local/bin/python"
],
"flake8.interpreter": [
"/usr/local/bin/python"
],
"pylint.interpreter": [
"/usr/local/bin/python"
],
"black-formatter.interpreter": [
"/usr/local/bin/python"
]
},
"extensions": [
"ms-python.python",
"ms-python.isort",
"ms-python.flake8",
"ms-python.pylint",
"ms-python.mypy",
"ms-python.black-formatter",
"GitHub.copilot",
"njpwerner.autodocstring"
]
}
},
"extensions": [
"ms-python.python",
"ms-python.isort",
"njpwerner.autodocstring"
],
"forwardPorts": [
8888
],
"postCreateCommand": "make devenv",
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/java:1": {
"version": "11"
}
}
}
3 changes: 3 additions & 0 deletions Makefile
@@ -42,6 +42,9 @@ trinodocker:
testtrino:
python3 -bb -m pytest tests/fugue_trino --cov=fugue_trino

testsf:
python3 -bb -m pytest tests/fugue_snowflake --cov=fugue_snowflake

lab:
mkdir -p tmp
pip install .
9 changes: 3 additions & 6 deletions fugue_bigquery/execution_engine.py
@@ -112,14 +112,11 @@ def join(
key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
_filter = _df2.native[key_schema.names]
on_fields = [_df1.native[k] == _filter[k] for k in key_schema]
suffixes = dict(lname="", rname="{name}" + _JOIN_RIGHT_SUFFIX) # noqa
if how.lower() in ["semi", "left_semi"]:
tb = _df1.native.inner_join(
_filter, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
)
tb = _df1.native.inner_join(_filter, on_fields, **suffixes)
else:
tb = _df1.native.left_join(
_filter, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
)
tb = _df1.native.left_join(_filter, on_fields, **suffixes)
tb = tb[tb[key_schema.names[0] + _JOIN_RIGHT_SUFFIX].isnull()]
return self.to_df(tb[end_schema.names], schema=end_schema)

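
This change to the BigQuery engine appears to swap the older ibis suffixes=(...) join argument for the newer lname/rname keywords; the semi/anti join pattern itself is unchanged: a semi join is an inner join on the key columns, and an anti join is a left join followed by a filter on a null right-side key. A minimal sketch of that pattern with in-memory ibis tables (table, column, and suffix names below are illustrative, not taken from this PR):

import ibis

t1 = ibis.memtable({"a": [1, 2, 3], "b": ["x", "y", "z"]})
t2 = ibis.memtable({"a": [1, 2]})

# semi join: keep t1 rows whose key exists in t2
semi = t1.inner_join(t2, t1.a == t2.a, lname="", rname="{name}_r")[["a", "b"]]

# anti join: left join, then keep rows where the right-side key is null
joined = t1.left_join(t2, t1.a == t2.a, lname="", rname="{name}_r")
anti = joined[joined["a_r"].isnull()][["a", "b"]]
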
3 changes: 3 additions & 0 deletions fugue_snowflake/__init__.py
@@ -1,3 +1,6 @@
# flake8: noqa

from ._constants import FUGUE_SF_LOGGER
from .client import SnowflakeClient
from .dataframe import SnowflakeDataFrame
from .execution_engine import SnowflakeExecutionEngine
36 changes: 34 additions & 2 deletions fugue_snowflake/_constants.py
@@ -1,2 +1,34 @@
FUGUE_SNOWFLAKE_CONF_CREDENTIALS_ENV = None
FUGUE_SNOWFLAKE_CONF_ACCOUNT = None
import logging
import os
from typing import Any, Dict

from triad import ParamDict

_FUGUE_SF_ENV_PREFIX = "FUGUE_SF_"
_FUGUE_SF_CONF_PREFIX = "fugue.sf."

FUGUE_SF_CONF_PACKAGES = "fugue.sf.packages"
FUGUE_SF_CONF_IMPORTS = "fugue.sf.imports"
FUGUE_SF_CONF_CASE_SENSITIVE = "fugue.sf.case_sensitive"

FUGUE_SF_LOGGER = logging.getLogger("fugue_snowflake")


def get_client_init_params(conf: Any) -> Dict[str, Any]:
_conf = ParamDict(conf)
return dict( # noqa: C408
account=_get_value(_conf, "account"),
user=_get_value(_conf, "user"),
password=_get_value(_conf, "password"),
warehouse=_get_value(_conf, "warehouse"),
database=_get_value(_conf, "database"),
schema=_get_value(_conf, "schema"),
)


def _get_value(conf: ParamDict, name: str) -> Any:
if _FUGUE_SF_CONF_PREFIX + name in conf:
return conf.get_or_throw(_FUGUE_SF_CONF_PREFIX + name, str)
if _FUGUE_SF_ENV_PREFIX + name.upper() in os.environ:
return os.environ[_FUGUE_SF_ENV_PREFIX + name.upper()]
return None
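
The helper above resolves each Snowflake connection parameter from the Fugue config (keys with the fugue.sf. prefix) first and falls back to environment variables (FUGUE_SF_ prefix), returning None when neither is set. A minimal usage sketch with placeholder values, assuming no other FUGUE_SF_* variables are set in the environment:

import os

from fugue_snowflake._constants import get_client_init_params

os.environ["FUGUE_SF_ACCOUNT"] = "my_account"  # resolved via the env-var prefix
conf = {"fugue.sf.user": "my_user", "fugue.sf.warehouse": "my_wh"}

params = get_client_init_params(conf)
# {"account": "my_account", "user": "my_user", "password": None,
#  "warehouse": "my_wh", "database": None, "schema": None}
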
256 changes: 256 additions & 0 deletions fugue_snowflake/_utils.py
@@ -0,0 +1,256 @@
import re
from importlib.metadata import version as get_version
from typing import Any, Dict, Iterable, List, Optional, Set
from uuid import uuid4
import ibis.expr.datatypes as dt
import pyarrow as pa
from fugue_ibis import IbisSchema, IbisTable
from fugue_ibis._utils import ibis_to_pa_type
from ibis.backends.snowflake import Backend
from snowflake.connector.constants import FIELD_TYPES
from snowflake.connector.result_batch import ResultBatch
from triad import Schema
from triad.utils.pyarrow import (
get_alter_func,
parse_json_columns,
replace_types_in_table,
)

_PA_TYPE_TO_SF_TYPE: Dict[pa.DataType, str] = {
pa.string(): "STRING",
pa.bool_(): "BOOLEAN",
pa.int8(): "BYTEINT",
pa.int16(): "TINYINT",
pa.int32(): "SMALLINT",
pa.int64(): "BIGINT",
pa.uint8(): "INT",
pa.uint16(): "INT",
pa.uint32(): "INT",
pa.uint64(): "INT",
pa.float16(): "FLOAT",
pa.float32(): "FLOAT",
pa.float64(): "FLOAT",
pa.date32(): "DATE",
pa.binary(): "BINARY",
}


def quote_name(name: str) -> str:
quote = '"'
return quote + name.replace(quote, quote + quote) + quote


def unquote_name(name: str) -> str:
name = (
name.replace('""', "<DOUBLE_QUOTE>")
.replace('"', "")
.replace("<DOUBLE_QUOTE>", '"')
)
return name


def normalize_name(name: str) -> str:
if name.startswith('"') and name.endswith('"'):
return name
return name.upper()


def parse_table_name(name: str, normalize: bool = False) -> List[str]:
res: List[str] = []
start, p = 0, 0
while p < len(name):
if name[p] == '"':
p += 1
while p < len(name):
if name[p] == '"':
if p + 1 < len(name) and name[p + 1] == '"':
p += 1
else:
break
p += 1
p += 1
elif name[p] == ".":
res.append(name[start:p])
start = p + 1
p += 1
else:
p += 1
if start < len(name):
res.append(name[start:])
if normalize:
return [normalize_name(x) for x in res]
return res


def to_schema(schema: IbisSchema) -> Schema:
fields: List[Any] = []
for n, t in zip(schema.names, schema.types):
if _ibis_has_json(t):
fields.append((n, pa.string()))
else:
fields.append((n, ibis_to_pa_type(t)))
return Schema(fields)


def _ibis_has_json(tp: dt.DataType) -> bool:
if isinstance(tp, dt.Array):
return _ibis_has_json(tp.value_type)
if isinstance(tp, dt.Struct):
for t in tp.types:
if _ibis_has_json(t):
return True
if isinstance(tp, dt.Map):
return _ibis_has_json(tp.value_type)
return isinstance(tp, dt.JSON)


def pa_type_to_snowflake_type_str(tp: pa.DataType) -> str:
if tp in _PA_TYPE_TO_SF_TYPE:
return _PA_TYPE_TO_SF_TYPE[tp]
if pa.types.is_timestamp(tp):
if tp.tz is not None:
return "TIMESTAMP_TZ"
return "TIMESTAMP_NTZ"
if pa.types.is_decimal(tp):
return f"DECIMAL({tp.precision},{tp.scale})"
if pa.types.is_list(tp):
# itp = pa_type_to_snowflake_type_str(tp.value_type)
# return f"ARRAY({itp})"
return "ARRAY"
if pa.types.is_struct(tp):
# fields = []
# for f in tp:
# fields.append(
# f"{quote_name(f.name)} {pa_type_to_snowflake_type_str(f.type)}"
# )
# return f"OBJECT({', '.join(fields)})"
return "OBJECT"
if pa.types.is_map(tp):
# ktp = pa_type_to_snowflake_type_str(tp.key_type)
# vtp = pa_type_to_snowflake_type_str(tp.item_type)
# return f"MAP({ktp}, {vtp})"
return "MAP"
raise NotImplementedError(f"Unsupported type {tp}")


def fix_snowflake_arrow_result(result: pa.Table) -> pa.Table:
return replace_types_in_table(
result,
[
(lambda tp: pa.types.is_integer(tp), pa.int64()),
(lambda tp: pa.types.is_floating(tp), pa.float64()),
(
lambda tp: pa.types.is_decimal(tp)
and tp.precision == 38
and tp.scale == 0,
pa.int64(),
),
(lambda tp: pa.types.is_date64(tp), pa.date32()),
# (
# lambda tp: pa.types.is_timestamp(tp)
# and tp.tz is None
# and tp != TRIAD_DEFAULT_TIMESTAMP,
# TRIAD_DEFAULT_TIMESTAMP,
# ),
],
)


def is_sf_ibis_table(df: Any) -> bool:
if not isinstance(df, IbisTable):
return False
try:
return isinstance(df._find_backend(), Backend)
except Exception: # pragma: no cover
return False


def to_snowflake_schema(schema: Any) -> str:
_s = schema if isinstance(schema, Schema) else Schema(schema)
fields = []
for f in _s.fields:
fields.append(f"{quote_name(f.name)} {pa_type_to_snowflake_type_str(f.type)}")
return ", ".join(fields)


def get_arrow_from_batches(
batches: Optional[List[ResultBatch]],
query_output_schema: Schema,
schema: Any = None,
infer_nested_types: bool = False,
) -> pa.Table:
output_schema_has_nested = False
if batches is None or len(batches) == 0:
if schema is not None:
return (
schema if isinstance(schema, Schema) else Schema(schema)
).create_empty_arrow_table()
return query_output_schema.create_empty_arrow_table()

def _batches_to_arrow(_batches: List[ResultBatch]) -> Iterable[pa.Table]:
has_result = False
for batch in _batches:
adf = batch.to_arrow()
if adf.num_rows == 0:
continue
func = get_alter_func(adf.schema, query_output_schema.pa_schema, safe=True)
has_result = True
yield func(adf)

if not has_result:
yield query_output_schema.create_empty_arrow_table()

adf = pa.concat_tables(_batches_to_arrow(batches))

nested_cols = _get_nested_columns(batches[0])
if schema is not None:
_schema = schema if isinstance(schema, Schema) else Schema(schema)
output_schema_has_nested = any(pa.types.is_nested(tp) for tp in _schema.types)
if (output_schema_has_nested or infer_nested_types) and len(nested_cols) > 0:
adf = parse_json_columns(adf, nested_cols)
if schema is not None:
func = get_alter_func(adf.schema, _schema.pa_schema, safe=True)
adf = func(adf)
return adf


def _get_nested_columns(batch: ResultBatch) -> List[str]:
res: List[str] = []
for meta in batch.schema:
f = FIELD_TYPES[meta.type_code]
if f.name in ["OBJECT", "ARRAY", "MAP", "VARIANT"]:
res.append(meta.name)
return res


def _get_batch_arrow_schema(batch: ResultBatch) -> pa.Schema:
fields = [
pa.field(s.name, FIELD_TYPES[s.type_code].pa_type()) for s in batch.schema
]
return pa.schema(fields)


def temp_rand_str() -> str:
return ("temp_" + str(uuid4()).split("-")[0]).upper()


def build_package_list(packages: Iterable[str]) -> List[str]:
ps: Set[str] = set()
for p in packages:
ps.add(p)
continue
try:
if "=" in p or "<" in p or ">" in p:
ps.add(p)
else:
ps.add(p + "==" + get_version(p))
except Exception: # pragma: no cover
ps.add(p)
return list(ps)


def is_select_query(s: str) -> bool:
return (
re.match(r"^\s*select\s", s, re.IGNORECASE) is not None
or re.match(r"^\s*with\s", s, re.IGNORECASE) is not None
)
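
Several of the helpers above are pure functions that are easy to exercise in isolation. A minimal sketch of how they behave; the expected values in the comments are inferred from the code in this diff, not from running it:

from triad import Schema

from fugue_snowflake._utils import (
    is_select_query,
    normalize_name,
    parse_table_name,
    quote_name,
    to_snowflake_schema,
)

# identifier handling: quoting doubles embedded quotes; normalization
# upper-cases unquoted names and leaves quoted names untouched
quote_name('my "odd" col')                               # '"my ""odd"" col"'
normalize_name("my_table")                               # 'MY_TABLE'
normalize_name('"my_table"')                             # '"my_table"'
parse_table_name('db."My Schema".tbl', normalize=True)   # ['DB', '"My Schema"', 'TBL']

# schema conversion: a triad Schema becomes Snowflake column definitions
to_snowflake_schema(Schema("a:int,b:str,c:[int]"))       # '"a" SMALLINT, "b" STRING, "c" ARRAY'

# query detection: SELECT and WITH statements count as select queries
is_select_query("  SELECT * FROM t")                     # True
is_select_query("WITH x AS (SELECT 1) SELECT * FROM x")  # True
is_select_query("CREATE TABLE t (a INT)")                # False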