
Commit

fix: to_gbq uses default_type for ambiguous array types and struct field types
tswast committed Dec 12, 2024
1 parent cc90edd commit 5c5a04b
Showing 3 changed files with 57 additions and 16 deletions.
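In short, schema inference now falls back to a caller-supplied default type whenever no type can be detected from the data. Below is a minimal sketch of the new behavior (not part of the commit), using the internal helper pandas_gbq.gbq._generate_bq_schema that the updated tests exercise; the DataFrame mirrors the report in issue 836, and passing default_type="STRING" explicitly is only an assumption for illustration.

import pandas
import pandas_gbq.gbq

df = pandas.DataFrame(
    {
        "id": [0, 1],
        # Every "state" value is None, so no type can be inferred from the data.
        "positions": [{"state": None}, {"state": None}],
    }
)

# With this change, the all-null sub-field falls back to the requested default
# type instead of being left without a BigQuery type.
bq_schema = pandas_gbq.gbq._generate_bq_schema(df, default_type="STRING")
print(bq_schema)
# Expected shape, per the new test case below: "positions" is a RECORD whose
# nullable "state" sub-field gets the default type, STRING.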
24 changes: 15 additions & 9 deletions pandas_gbq/schema/pandas_to_bigquery.py
@@ -103,7 +103,7 @@ def dataframe_to_bigquery_fields(
 
         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)
 
         if bq_field:
             bq_schema_out.append(bq_field)
@@ -114,7 +114,7 @@ def dataframe_to_bigquery_fields(
             arrow_value = pyarrow.array(values)
             bq_field = (
                 pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                    column, arrow_value.type
+                    column, arrow_value.type, default_type=default_type,
                 )
             )
 
@@ -164,7 +164,11 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
     return None
 
 
-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
+def value_to_bigquery_field(name, value, default_type=None) -> Optional[schema.SchemaField]:
+    # There are no non-null values, so assume the default type.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
@@ -188,29 +192,31 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
     return None
 
 
-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(name, values, default_type="STRING") -> Optional[schema.SchemaField]:
     value = pandas_gbq.core.pandas.first_valid(values)
 
-    # All NULL, type not determinable.
+    # All NULL, type not determinable by this method. Return None so we can try
+    # some other methods.
     if value is None:
         return None
 
-    field = value_to_bigquery_field(name, value)
+    field = value_to_bigquery_field(name, value, default_type=default_type)
     if field is not None:
         return field
 
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
    if isinstance(value, collections.abc.Iterable) and not isinstance(
        value, collections.abc.Mapping
    ):
        # It could be that this value contains all None or is empty, so get the
        # first non-None value we can find.
        valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)
 
        if field is not None:
            return schema.SchemaField(name, field.field_type, mode="REPEATED")
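For the plain-array fallback specifically, here is a rough usage sketch (an illustration, assuming the module path and helper names shown in the diff above): a column whose list elements are all None now resolves to a REPEATED field of the default type.

import pandas

from pandas_gbq.schema.pandas_to_bigquery import values_to_bigquery_field

# Every element of every list is None, so no element type can be detected.
values = pandas.Series([[None, None], [None]])
field = values_to_bigquery_field("list_of_null", values, default_type="STRING")

# first_array_valid() finds no non-None element, so value_to_bigquery_field()
# is handed None, returns the default type, and the list wrapper adds REPEATED.
assert field.field_type == "STRING"
assert field.mode == "REPEATED"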
10 changes: 8 additions & 2 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -37,7 +37,13 @@
 }
 
 
-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(name, type_, default_type="STRING") -> Optional[schema.SchemaField]:
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -59,7 +65,7 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type, default_type=default_type))
 
         return schema.SchemaField(name, "RECORD", fields=inner_fields)
 
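On the pyarrow side the same fallback applies recursively to struct sub-fields. An illustrative sketch, assuming pyarrow is installed and the module path shown above:

import pyarrow

from pandas_gbq.schema.pyarrow_to_bigquery import arrow_type_to_bigquery_field

# A struct whose only sub-field carries the arrow null type, as in issue 836.
struct_type = pyarrow.struct([("state", pyarrow.null())])
field = arrow_type_to_bigquery_field("positions", struct_type, default_type="STRING")

print(field.field_type)            # RECORD
print(field.fields[0].name)        # state
print(field.fields[0].field_type)  # STRING, because the arrow type is null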
39 changes: 34 additions & 5 deletions tests/unit/test_schema.py
@@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
     [
         pytest.param(
             pandas.DataFrame(data={"col1": [object()]}),
-            {"fields": [{"name": "col1", "type": "STRING"}]},
+            {"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
             id="default-type-fails-pyarrow-conversion",
         ),
         (
@@ -182,13 +182,15 @@ def test_schema_is_subset_fails_if_not_subset():
                         else "object",
                     ),
                     "list_of_struct": pandas.Series(
-                        [[], [{"test": "abc"}], []],
+                        [[], [{"test": 123.0}], []],
                         dtype=pandas.ArrowDtype(
-                            pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
+                            pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
                         )
                         if hasattr(pandas, "ArrowDtype")
                         else "object",
                     ),
+                    "list_of_unknown": [[], [], []],
+                    "list_of_null": [[None, None], [None], [None, None]],
                 }
             ),
             {
@@ -200,17 +202,44 @@ def test_schema_is_subset_fails_if_not_subset():
                         "type": "RECORD",
                         "mode": "REPEATED",
                         "fields": [
-                            {"name": "test", "type": "STRING", "mode": "NULLABLE"},
+                            {"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
                         ],
                     },
+                    # Use DEFAULT_TYPE because there are no values to detect a type.
+                    {"name": "list_of_unknown", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
+                    {"name": "list_of_null", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
                 ],
             },
             id="array",
         ),
+        pytest.param(
+            # If a struct contains only nulls in a sub-field, use the default
+            # type for subfields without a type we can determine.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/836
+            pandas.DataFrame(
+                {
+                    "id": [0, 1],
+                    "positions": [{"state": None}, {"state": None}],
+                },
+            ),
+            {
+                "fields": [
+                    {"name": "id", "type": "INTEGER"},
+                    {
+                        "name": "positions",
+                        "type": "RECORD",
+                        "fields": [
+                            {"name": "state", "type": "DEFAULT_TYPE", "mode": "NULLABLE"},
+                        ],
+                    },
+                ],
+            },
+            id="issue832-null-struct-field",
+        ),
     ],
 )
 def test_generate_bq_schema(dataframe, expected_schema):
-    schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
+    schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
 
     # NULLABLE is the default mode.
     for field in expected_schema["fields"]:
