diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index e32cf41d344e..5644dc29d9d4 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -357,6 +357,7 @@ sql.stats.histogram_collection.enabled boolean true histogram collection mode ap sql.stats.histogram_samples.count integer 0 number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size. application sql.stats.multi_column_collection.enabled boolean true multi-column statistics collection mode application sql.stats.non_default_columns.min_retention_period duration 24h0m0s minimum retention period for table statistics collected on non-default columns application +sql.stats.non_indexed_json_histograms.enabled boolean false set to true to collect table statistics histograms on non-indexed JSON columns application sql.stats.persisted_rows.max integer 1000000 maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins application sql.stats.post_events.enabled boolean false if set, an event is logged for every CREATE STATISTICS job application sql.stats.response.max integer 20000 the maximum number of statements and transaction stats returned in a CombinedStatements request application diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 1424396bbf76..8461cba84f75 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -312,6 +312,7 @@
sql.stats.histogram_samples.count
integer0number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size.Serverless/Dedicated/Self-Hosted
sql.stats.multi_column_collection.enabled
booleantruemulti-column statistics collection modeServerless/Dedicated/Self-Hosted
sql.stats.non_default_columns.min_retention_period
duration24h0m0sminimum retention period for table statistics collected on non-default columnsServerless/Dedicated/Self-Hosted +
sql.stats.non_indexed_json_histograms.enabled
booleanfalseset to true to collect table statistics histograms on non-indexed JSON columnsServerless/Dedicated/Self-Hosted
sql.stats.persisted_rows.max
integer1000000maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction beginsServerless/Dedicated/Self-Hosted
sql.stats.post_events.enabled
booleanfalseif set, an event is logged for every CREATE STATISTICS jobServerless/Dedicated/Self-Hosted
sql.stats.response.max
integer20000the maximum number of statements and transaction stats returned in a CombinedStatements requestServerless/Dedicated/Self-Hosted diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go index 28c1780f147f..44021dcd614c 100644 --- a/pkg/sql/create_stats.go +++ b/pkg/sql/create_stats.go @@ -63,6 +63,18 @@ var statsOnVirtualCols = settings.RegisterBoolSetting( true, settings.WithPublic) +// Collecting histograms on non-indexed JSON columns can require a lot of memory +// when the JSON values are large. This is true even when only two histogram +// buckets are generated because we still sample many JSON values which exist in +// memory for the duration of the stats collection job. By default, we do not +// collect histograms for non-indexed JSON columns. +var nonIndexJSONHistograms = settings.RegisterBoolSetting( + settings.ApplicationLevel, + "sql.stats.non_indexed_json_histograms.enabled", + "set to true to collect table statistics histograms on non-indexed JSON columns", + false, + settings.WithPublic) + const nonIndexColHistogramBuckets = 2 // StubTableStats generates "stub" statistics for a table which are missing @@ -72,8 +84,10 @@ func StubTableStats( desc catalog.TableDescriptor, name string, ) ([]*stats.TableStatisticProto, error) { colStats, err := createStatsDefaultColumns( - context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */ - false /* partialStats */, nonIndexColHistogramBuckets, nil, /* evalCtx */ + context.Background(), desc, + false /* virtColEnabled */, false, /* multiColEnabled */ + false /* nonIndexJSONHistograms */, false, /* partialStats */ + nonIndexColHistogramBuckets, nil, /* evalCtx */ ) if err != nil { return nil, err @@ -256,6 +270,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro tableDesc, virtColEnabled, multiColEnabled, + nonIndexJSONHistograms.Get(n.p.ExecCfg().SV()), n.Options.UsingExtremes, defaultHistogramBuckets, n.p.EvalContext(), @@ -375,6 +390,9 @@ const maxNonIndexCols = 100 // predicate expressions are also likely to appear in query filters, so stats // are collected for those columns as well. // +// If nonIndexJsonHistograms is true, 2-bucket histograms are collected for +// non-indexed JSON columns. +// // If partialStats is true, we only collect statistics on single columns that // are prefixes of forward indexes, and skip over partial, sharded, and // implicitly partitioned indexes. Partial statistic creation only supports @@ -386,7 +404,7 @@ const maxNonIndexCols = 100 func createStatsDefaultColumns( ctx context.Context, desc catalog.TableDescriptor, - virtColEnabled, multiColEnabled, partialStats bool, + virtColEnabled, multiColEnabled, nonIndexJSONHistograms, partialStats bool, defaultHistogramBuckets uint32, evalCtx *eval.Context, ) ([]jobspb.CreateStatsDetails_ColStat, error) { @@ -663,9 +681,13 @@ func createStatsDefaultColumns( if col.GetType().Family() == types.BoolFamily || col.GetType().Family() == types.EnumFamily { maxHistBuckets = defaultHistogramBuckets } + hasHistogram := !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()) + if col.GetType().Family() == types.JsonFamily { + hasHistogram = nonIndexJSONHistograms + } colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{ ColumnIDs: colIDs, - HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()), + HasHistogram: hasHistogram, HistogramMaxBuckets: maxHistBuckets, }) nonIdxCols++ diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index 6fff2c28062d..ff9c42d5e5b1 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -1546,7 +1546,7 @@ ORDER BY statistics_name column_names row_count null_count has_histogram s {a} 3 0 true s {b} 3 0 true -s {j} 3 0 true +s {j} 3 0 false s {rowid} 3 0 true statement ok @@ -2858,7 +2858,7 @@ statistics_name column_names row_count distinct_count null_count has_histog j1 {a} 5 5 0 true j1 {b,e} 5 5 1 false j1 {b} 5 5 1 true -j1 {c} 5 5 1 true +j1 {c} 5 5 1 false j1 {d} 5 5 1 true j1 {e} 5 5 1 true j1 {f} 5 5 1 true @@ -2983,7 +2983,7 @@ j4 {a} 5 5 0 j4 {b,e} 5 5 1 false j4 {b} 5 5 1 true j4 {crdb_internal_idx_expr} 5 5 1 true -j4 {c} 5 5 1 true +j4 {c} 5 5 1 false j4 {d} 5 5 1 true j4 {e} 5 5 1 true j4 {f} 5 5 1 true diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats index 0d7ea19cd2b0..611194e3887e 100644 --- a/pkg/sql/logictest/testdata/logic_test/stats +++ b/pkg/sql/logictest/testdata/logic_test/stats @@ -64,3 +64,45 @@ BEGIN; ALTER TYPE greeting ADD VALUE 'hey'; SELECT * FROM t122312 WHERE g = 'hi'; COMMIT; + +# Regression test related to #139381. Do not collect histograms on non-indexed +# JSON columns by default. +statement ok +CREATE TABLE t139381 ( + k INT PRIMARY KEY, + j JSON, + v STRING AS (j->>'name') VIRTUAL, + INDEX (v) +) + +statement ok +INSERT INTO t139381 +SELECT i, ('{"name": "name_' || i || '", "data": "abcdefghij"}')::JSONB +FROM (VALUES (1), (2)) v(i) + +statement ok +ANALYZE t139381 + +query TT rowsort +SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected') +FROM [SHOW STATISTICS FOR TABLE t139381] +---- +{k} histogram_collected +{j} no_histogram_collected +{v} histogram_collected + +# Histograms are collected on non-indexed JSON columns when the cluster setting +# is enabled. +statement ok +SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = true + +statement ok +ANALYZE t139381 + +query TT rowsort +SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected') +FROM [SHOW STATISTICS FOR TABLE t139381] +---- +{k} histogram_collected +{j} histogram_collected +{v} histogram_collected diff --git a/pkg/sql/opt/exec/execbuilder/testdata/stats b/pkg/sql/opt/exec/execbuilder/testdata/stats index 9b28a9261de5..c2d8fdb6dfb3 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/stats +++ b/pkg/sql/opt/exec/execbuilder/testdata/stats @@ -352,8 +352,6 @@ limit │ ├── columns: j:1 │ ├── immutable │ ├── stats: [rows=1, distinct(1)=1, null(1)=1] - │ │ histogram(1)= 0 1 - │ │ <--- NULL │ ├── cost: 23.775 │ ├── fd: ()-->(1) │ ├── limit hint: 1.00 @@ -361,8 +359,6 @@ limit │ ├── scan tj │ │ ├── columns: j:1 │ │ ├── stats: [rows=5, distinct(1)=4, null(1)=1] - │ │ │ histogram(1)= 0 1 0 1 2 1 - │ │ │ <--- NULL --- '1' --- '{}' │ │ ├── cost: 23.695 │ │ ├── limit hint: 5.00 │ │ ├── distribution: test