
Commit

fix: to_gbq uses default_type for ambiguous array types and struct field types
tswast committed Dec 12, 2024
1 parent cc90edd commit 5c5a04b
Showing 3 changed files with 57 additions and 16 deletions.
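In short, schema inference now falls back to a caller-supplied default type whenever no type can be detected from the data. Below is a minimal sketch of the new behavior (not part of the commit), using the internal helper pandas_gbq.gbq._generate_bq_schema that the updated tests exercise; the DataFrame mirrors the report in issue 836, and passing default_type="STRING" explicitly is only an assumption for illustration.

import pandas
import pandas_gbq.gbq

df = pandas.DataFrame(
    {
        "id": [0, 1],
        # Every "state" value is None, so no type can be inferred from the data.
        "positions": [{"state": None}, {"state": None}],
    }
)

# With this change, the all-null sub-field falls back to the requested default
# type instead of being left without a BigQuery type.
bq_schema = pandas_gbq.gbq._generate_bq_schema(df, default_type="STRING")
print(bq_schema)
# Expected shape, per the new test case below: "positions" is a RECORD whose
# nullable "state" sub-field gets the default type, STRING.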
24 changes: 15 additions & 9 deletions pandas_gbq/schema/pandas_to_bigquery.py
@@ -103,7 +103,7 @@ def dataframe_to_bigquery_fields(
 
         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)
 
         if bq_field:
             bq_schema_out.append(bq_field)
@@ -114,7 +114,7 @@ def dataframe_to_bigquery_fields(
             arrow_value = pyarrow.array(values)
             bq_field = (
                 pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                    column, arrow_value.type
+                    column, arrow_value.type, default_type=default_type,
                 )
             )
 
@@ -164,7 +164,11 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
     return None
 
 
-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
+def value_to_bigquery_field(name, value, default_type=None) -> Optional[schema.SchemaField]:
+    # There are no non-null values, so assume the default type.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
@@ -188,29 +192,31 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
     return None
 
 
-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(name, values, default_type="STRING") -> Optional[schema.SchemaField]:
     value = pandas_gbq.core.pandas.first_valid(values)
 
-    # All NULL, type not determinable.
+    # All NULL, type not determinable by this method. Return None so we can try
+    # some other methods.
     if value is None:
         return None
 
-    field = value_to_bigquery_field(name, value)
+    field = value_to_bigquery_field(name, value, default_type=default_type)
     if field is not None:
         return field
 
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
    if isinstance(value, collections.abc.Iterable) and not isinstance(
        value, collections.abc.Mapping
    ):
        # It could be that this value contains all None or is empty, so get the
        # first non-None value we can find.
        valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)
 
        if field is not None:
            return schema.SchemaField(name, field.field_type, mode="REPEATED")
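For the plain-array fallback specifically, here is a rough usage sketch (an illustration, assuming the module path and helper names shown in the diff above): a column whose list elements are all None now resolves to a REPEATED field of the default type.

import pandas

from pandas_gbq.schema.pandas_to_bigquery import values_to_bigquery_field

# Every element of every list is None, so no element type can be detected.
values = pandas.Series([[None, None], [None]])
field = values_to_bigquery_field("list_of_null", values, default_type="STRING")

# first_array_valid() finds no non-None element, so value_to_bigquery_field()
# is handed None, returns the default type, and the list wrapper adds REPEATED.
assert field.field_type == "STRING"
assert field.mode == "REPEATED"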
10 changes: 8 additions & 2 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -37,7 +37,13 @@
 }
 
 
-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(name, type_, default_type="STRING") -> Optional[schema.SchemaField]:
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -59,7 +65,7 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type, default_type=default_type))
 
         return schema.SchemaField(name, "RECORD", fields=inner_fields)
 
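On the pyarrow side the same fallback applies recursively to struct sub-fields. An illustrative sketch, assuming pyarrow is installed and the module path shown above:

import pyarrow

from pandas_gbq.schema.pyarrow_to_bigquery import arrow_type_to_bigquery_field

# A struct whose only sub-field carries the arrow null type, as in issue 836.
struct_type = pyarrow.struct([("state", pyarrow.null())])
field = arrow_type_to_bigquery_field("positions", struct_type, default_type="STRING")

print(field.field_type)            # RECORD
print(field.fields[0].name)        # state
print(field.fields[0].field_type)  # STRING, because the arrow type is null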
39 changes: 34 additions & 5 deletions tests/unit/test_schema.py
@@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
     [
         pytest.param(
             pandas.DataFrame(data={"col1": [object()]}),
-            {"fields": [{"name": "col1", "type": "STRING"}]},
+            {"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
             id="default-type-fails-pyarrow-conversion",
         ),
         (
@@ -182,13 +182,15 @@ def test_schema_is_subset_fails_if_not_subset():
                         else "object",
                     ),
                     "list_of_struct": pandas.Series(
-                        [[], [{"test": "abc"}], []],
+                        [[], [{"test": 123.0}], []],
                         dtype=pandas.ArrowDtype(
-                            pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
+                            pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
                         )
                         if hasattr(pandas, "ArrowDtype")
                         else "object",
                     ),
+                    "list_of_unknown": [[], [], []],
+                    "list_of_null": [[None, None], [None], [None, None]],
                 }
             ),
             {
@@ -200,17 +202,44 @@ def test_schema_is_subset_fails_if_not_subset():
                         "type": "RECORD",
                         "mode": "REPEATED",
                         "fields": [
-                            {"name": "test", "type": "STRING", "mode": "NULLABLE"},
+                            {"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
                         ],
                     },
+                    # Use DEFAULT_TYPE because there are no values to detect a type.
+                    {"name": "list_of_unknown", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
+                    {"name": "list_of_null", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
                 ],
             },
             id="array",
         ),
+        pytest.param(
+            # If a struct contains only nulls in a sub-field, use the default
+            # type for subfields without a type we can determine.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/836
+            pandas.DataFrame(
+                {
+                    "id": [0, 1],
+                    "positions": [{"state": None}, {"state": None}],
+                },
+            ),
+            {
+                "fields": [
+                    {"name": "id", "type": "INTEGER"},
+                    {
+                        "name": "positions",
+                        "type": "RECORD",
+                        "fields": [
+                            {"name": "state", "type": "DEFAULT_TYPE", "mode": "NULLABLE"},
+                        ],
+                    },
+                ],
+            },
+            id="issue832-null-struct-field",
+        ),
     ],
 )
 def test_generate_bq_schema(dataframe, expected_schema):
-    schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
+    schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
 
     # NULLABLE is the default mode.
     for field in expected_schema["fields"]:
