diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
index e32cf41d344e..5644dc29d9d4 100644
--- a/docs/generated/settings/settings-for-tenants.txt
+++ b/docs/generated/settings/settings-for-tenants.txt
@@ -357,6 +357,7 @@ sql.stats.histogram_collection.enabled boolean true histogram collection mode ap
sql.stats.histogram_samples.count integer 0 number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size. application
sql.stats.multi_column_collection.enabled boolean true multi-column statistics collection mode application
sql.stats.non_default_columns.min_retention_period duration 24h0m0s minimum retention period for table statistics collected on non-default columns application
+sql.stats.non_indexed_json_histograms.enabled boolean false set to true to collect table statistics histograms on non-indexed JSON columns application
sql.stats.persisted_rows.max integer 1000000 maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins application
sql.stats.post_events.enabled boolean false if set, an event is logged for every CREATE STATISTICS job application
sql.stats.response.max integer 20000 the maximum number of statements and transaction stats returned in a CombinedStatements request application
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index 1424396bbf76..8461cba84f75 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -312,6 +312,7 @@
sql.stats.histogram_samples.count
| integer | 0 | number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size. | Serverless/Dedicated/Self-Hosted |
sql.stats.multi_column_collection.enabled
| boolean | true | multi-column statistics collection mode | Serverless/Dedicated/Self-Hosted |
sql.stats.non_default_columns.min_retention_period
| duration | 24h0m0s | minimum retention period for table statistics collected on non-default columns | Serverless/Dedicated/Self-Hosted |
+sql.stats.non_indexed_json_histograms.enabled
| boolean | false | set to true to collect table statistics histograms on non-indexed JSON columns | Serverless/Dedicated/Self-Hosted |
sql.stats.persisted_rows.max
| integer | 1000000 | maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins | Serverless/Dedicated/Self-Hosted |
sql.stats.post_events.enabled
| boolean | false | if set, an event is logged for every CREATE STATISTICS job | Serverless/Dedicated/Self-Hosted |
sql.stats.response.max
| integer | 20000 | the maximum number of statements and transaction stats returned in a CombinedStatements request | Serverless/Dedicated/Self-Hosted |
diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go
index 28c1780f147f..44021dcd614c 100644
--- a/pkg/sql/create_stats.go
+++ b/pkg/sql/create_stats.go
@@ -63,6 +63,18 @@ var statsOnVirtualCols = settings.RegisterBoolSetting(
true,
settings.WithPublic)
+// Collecting histograms on non-indexed JSON columns can require a lot of memory
+// when the JSON values are large. This is true even when only two histogram
+// buckets are generated because we still sample many JSON values which exist in
+// memory for the duration of the stats collection job. By default, we do not
+// collect histograms for non-indexed JSON columns.
+var nonIndexJSONHistograms = settings.RegisterBoolSetting(
+ settings.ApplicationLevel,
+ "sql.stats.non_indexed_json_histograms.enabled",
+ "set to true to collect table statistics histograms on non-indexed JSON columns",
+ false,
+ settings.WithPublic)
+
const nonIndexColHistogramBuckets = 2
// StubTableStats generates "stub" statistics for a table which are missing
@@ -72,8 +84,10 @@ func StubTableStats(
desc catalog.TableDescriptor, name string,
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(
- context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */
- false /* partialStats */, nonIndexColHistogramBuckets, nil, /* evalCtx */
+ context.Background(), desc,
+ false /* virtColEnabled */, false, /* multiColEnabled */
+ false /* nonIndexJSONHistograms */, false, /* partialStats */
+ nonIndexColHistogramBuckets, nil, /* evalCtx */
)
if err != nil {
return nil, err
@@ -256,6 +270,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
tableDesc,
virtColEnabled,
multiColEnabled,
+ nonIndexJSONHistograms.Get(n.p.ExecCfg().SV()),
n.Options.UsingExtremes,
defaultHistogramBuckets,
n.p.EvalContext(),
@@ -375,6 +390,9 @@ const maxNonIndexCols = 100
// predicate expressions are also likely to appear in query filters, so stats
// are collected for those columns as well.
//
+// If nonIndexJsonHistograms is true, 2-bucket histograms are collected for
+// non-indexed JSON columns.
+//
// If partialStats is true, we only collect statistics on single columns that
// are prefixes of forward indexes, and skip over partial, sharded, and
// implicitly partitioned indexes. Partial statistic creation only supports
@@ -386,7 +404,7 @@ const maxNonIndexCols = 100
func createStatsDefaultColumns(
ctx context.Context,
desc catalog.TableDescriptor,
- virtColEnabled, multiColEnabled, partialStats bool,
+ virtColEnabled, multiColEnabled, nonIndexJSONHistograms, partialStats bool,
defaultHistogramBuckets uint32,
evalCtx *eval.Context,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
@@ -663,9 +681,13 @@ func createStatsDefaultColumns(
if col.GetType().Family() == types.BoolFamily || col.GetType().Family() == types.EnumFamily {
maxHistBuckets = defaultHistogramBuckets
}
+ hasHistogram := !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType())
+ if col.GetType().Family() == types.JsonFamily {
+ hasHistogram = nonIndexJSONHistograms
+ }
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: colIDs,
- HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()),
+ HasHistogram: hasHistogram,
HistogramMaxBuckets: maxHistBuckets,
})
nonIdxCols++
diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats
index 6fff2c28062d..ff9c42d5e5b1 100644
--- a/pkg/sql/logictest/testdata/logic_test/distsql_stats
+++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats
@@ -1546,7 +1546,7 @@ ORDER BY
statistics_name column_names row_count null_count has_histogram
s {a} 3 0 true
s {b} 3 0 true
-s {j} 3 0 true
+s {j} 3 0 false
s {rowid} 3 0 true
statement ok
@@ -2858,7 +2858,7 @@ statistics_name column_names row_count distinct_count null_count has_histog
j1 {a} 5 5 0 true
j1 {b,e} 5 5 1 false
j1 {b} 5 5 1 true
-j1 {c} 5 5 1 true
+j1 {c} 5 5 1 false
j1 {d} 5 5 1 true
j1 {e} 5 5 1 true
j1 {f} 5 5 1 true
@@ -2983,7 +2983,7 @@ j4 {a} 5 5 0
j4 {b,e} 5 5 1 false
j4 {b} 5 5 1 true
j4 {crdb_internal_idx_expr} 5 5 1 true
-j4 {c} 5 5 1 true
+j4 {c} 5 5 1 false
j4 {d} 5 5 1 true
j4 {e} 5 5 1 true
j4 {f} 5 5 1 true
diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats
index 0d7ea19cd2b0..611194e3887e 100644
--- a/pkg/sql/logictest/testdata/logic_test/stats
+++ b/pkg/sql/logictest/testdata/logic_test/stats
@@ -64,3 +64,45 @@ BEGIN;
ALTER TYPE greeting ADD VALUE 'hey';
SELECT * FROM t122312 WHERE g = 'hi';
COMMIT;
+
+# Regression test related to #139381. Do not collect histograms on non-indexed
+# JSON columns by default.
+statement ok
+CREATE TABLE t139381 (
+ k INT PRIMARY KEY,
+ j JSON,
+ v STRING AS (j->>'name') VIRTUAL,
+ INDEX (v)
+)
+
+statement ok
+INSERT INTO t139381
+SELECT i, ('{"name": "name_' || i || '", "data": "abcdefghij"}')::JSONB
+FROM (VALUES (1), (2)) v(i)
+
+statement ok
+ANALYZE t139381
+
+query TT rowsort
+SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
+FROM [SHOW STATISTICS FOR TABLE t139381]
+----
+{k} histogram_collected
+{j} no_histogram_collected
+{v} histogram_collected
+
+# Histograms are collected on non-indexed JSON columns when the cluster setting
+# is enabled.
+statement ok
+SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = true
+
+statement ok
+ANALYZE t139381
+
+query TT rowsort
+SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
+FROM [SHOW STATISTICS FOR TABLE t139381]
+----
+{k} histogram_collected
+{j} histogram_collected
+{v} histogram_collected
diff --git a/pkg/sql/opt/exec/execbuilder/testdata/stats b/pkg/sql/opt/exec/execbuilder/testdata/stats
index 9b28a9261de5..c2d8fdb6dfb3 100644
--- a/pkg/sql/opt/exec/execbuilder/testdata/stats
+++ b/pkg/sql/opt/exec/execbuilder/testdata/stats
@@ -352,8 +352,6 @@ limit
│ ├── columns: j:1
│ ├── immutable
│ ├── stats: [rows=1, distinct(1)=1, null(1)=1]
- │ │ histogram(1)= 0 1
- │ │ <--- NULL
│ ├── cost: 23.775
│ ├── fd: ()-->(1)
│ ├── limit hint: 1.00
@@ -361,8 +359,6 @@ limit
│ ├── scan tj
│ │ ├── columns: j:1
│ │ ├── stats: [rows=5, distinct(1)=4, null(1)=1]
- │ │ │ histogram(1)= 0 1 0 1 2 1
- │ │ │ <--- NULL --- '1' --- '{}'
│ │ ├── cost: 23.695
│ │ ├── limit hint: 5.00
│ │ ├── distribution: test