diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py
index 5a979a12..acea660e 100644
--- a/pandas_gbq/schema/pandas_to_bigquery.py
+++ b/pandas_gbq/schema/pandas_to_bigquery.py
@@ -103,7 +103,7 @@ def dataframe_to_bigquery_fields(
         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)
 
         if bq_field:
             bq_schema_out.append(bq_field)
 
@@ -114,7 +114,7 @@ def dataframe_to_bigquery_fields(
             arrow_value = pyarrow.array(values)
             bq_field = (
                 pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                    column, arrow_value.type
+                    column, arrow_value.type, default_type=default_type,
                 )
             )
 
@@ -164,7 +164,11 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
     return None
 
 
-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
+def value_to_bigquery_field(name, value, default_type=None) -> Optional[schema.SchemaField]:
+    # There are no non-null values, so assume the default type.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
@@ -188,29 +192,31 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
     return None
 
 
-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(name, values, default_type="STRING") -> Optional[schema.SchemaField]:
     value = pandas_gbq.core.pandas.first_valid(values)
 
-    # All NULL, type not determinable.
+    # All NULL, type not determinable by this method. Return None so we can try
+    # some other methods.
    if value is None:
        return None
 
-    field = value_to_bigquery_field(name, value)
+    field = value_to_bigquery_field(name, value, default_type=default_type)
     if field is not None:
         return field
 
     if isinstance(value, str):
         return schema.SchemaField(name, "STRING")
 
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
     if isinstance(value, collections.abc.Iterable) and not isinstance(
         value, collections.abc.Mapping
     ):
         # It could be that this value contains all None or is empty, so get the
         # first non-None value we can find.
         valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)
         if field is not None:
             return schema.SchemaField(name, field.field_type, mode="REPEATED")
 
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
index da1a1ce8..97af2f8b 100644
--- a/pandas_gbq/schema/pyarrow_to_bigquery.py
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -37,7 +37,13 @@
 }
 
 
-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(name, type_, default_type="STRING") -> Optional[schema.SchemaField]:
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -59,7 +65,7 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type, default_type=default_type))
         return schema.SchemaField(name, "RECORD", fields=inner_fields)
 
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 48e8862a..54f6cf39 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
     [
         pytest.param(
             pandas.DataFrame(data={"col1": [object()]}),
-            {"fields": [{"name": "col1", "type": "STRING"}]},
+            {"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
             id="default-type-fails-pyarrow-conversion",
         ),
         (
@@ -182,13 +182,15 @@ def test_schema_is_subset_fails_if_not_subset():
                         else "object",
                     ),
                     "list_of_struct": pandas.Series(
-                        [[], [{"test": "abc"}], []],
+                        [[], [{"test": 123.0}], []],
                         dtype=pandas.ArrowDtype(
-                            pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
+                            pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
                         )
                         if hasattr(pandas, "ArrowDtype")
                         else "object",
                     ),
+                    "list_of_unknown": [[], [], []],
+                    "list_of_null": [[None, None], [None], [None, None]],
                 }
             ),
             {
@@ -200,17 +202,44 @@ def test_schema_is_subset_fails_if_not_subset():
                         "type": "RECORD",
                         "mode": "REPEATED",
                         "fields": [
-                            {"name": "test", "type": "STRING", "mode": "NULLABLE"},
+                            {"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
                         ],
                     },
+                    # Use DEFAULT_TYPE because there are no values to detect a type.
+                    {"name": "list_of_unknown", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
+                    {"name": "list_of_null", "type": "DEFAULT_TYPE", "mode": "REPEATED"},
                 ],
             },
             id="array",
         ),
+        pytest.param(
+            # If a struct contains only nulls in a sub-field, use the default
+            # type for sub-fields without a type we can determine.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/836
+            pandas.DataFrame(
+                {
+                    "id": [0, 1],
+                    "positions": [{"state": None}, {"state": None}],
+                },
+            ),
+            {
+                "fields": [
+                    {"name": "id", "type": "INTEGER"},
+                    {
+                        "name": "positions",
+                        "type": "RECORD",
+                        "fields": [
+                            {"name": "state", "type": "DEFAULT_TYPE", "mode": "NULLABLE"},
+                        ],
+                    },
+                ],
+            },
+            id="issue836-null-struct-field",
+        ),
     ],
 )
 def test_generate_bq_schema(dataframe, expected_schema):
-    schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
+    schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
 
     # NULLABLE is the default mode.
     for field in expected_schema["fields"]:
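Reviewer note (not part of the patch): a minimal sketch of the fallback this change introduces, for trying it locally. It assumes a pandas-gbq checkout with this change applied; `pandas_gbq.gbq._generate_bq_schema` is a private helper and `"DEFAULT_TYPE"` is just the placeholder string used by the unit test above, so treat this as illustration only.

```python
import pandas
import pyarrow

import pandas_gbq.gbq
from pandas_gbq.schema import pyarrow_to_bigquery

# At the pyarrow level, a null-typed field now maps to the caller-supplied
# default type instead of falling through with no BigQuery type at all.
field = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
    "state", pyarrow.null(), default_type="STRING"
)
print(field.field_type)  # STRING

# End to end: "positions.state" contains only nulls, so neither value
# sampling nor pyarrow can infer a type, and the default type is used for
# that sub-field (previously schema generation could not determine a type
# for it; see issue 836).
dataframe = pandas.DataFrame(
    {
        "id": [0, 1],
        "positions": [{"state": None}, {"state": None}],
    }
)
schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
print(schema)
# Expected shape, per the new test case: "id" maps to INTEGER, and
# "positions" becomes a RECORD whose "state" sub-field gets "DEFAULT_TYPE".
```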