Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-24.3: sql: do not collect histograms for non-indexed JSON columns #139897

Merged
merged 1 commit into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ sql.stats.histogram_collection.enabled boolean true histogram collection mode ap
sql.stats.histogram_samples.count integer 0 number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size. application
sql.stats.multi_column_collection.enabled boolean true multi-column statistics collection mode application
sql.stats.non_default_columns.min_retention_period duration 24h0m0s minimum retention period for table statistics collected on non-default columns application
sql.stats.non_indexed_json_histograms.enabled boolean true set to true to collect table statistics histograms on non-indexed JSON columns application
sql.stats.persisted_rows.max integer 1000000 maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins application
sql.stats.post_events.enabled boolean false if set, an event is logged for every CREATE STATISTICS job application
sql.stats.response.max integer 20000 the maximum number of statements and transaction stats returned in a CombinedStatements request application
Expand Down
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@
<tr><td><div id="setting-sql-stats-histogram-samples-count" class="anchored"><code>sql.stats.histogram_samples.count</code></div></td><td>integer</td><td><code>0</code></td><td>number of rows sampled for histogram construction during table statistics collection. Not setting this or setting a value of 0 means that a reasonable sample size will be automatically picked based on the table size.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-multi-column-collection-enabled" class="anchored"><code>sql.stats.multi_column_collection.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>multi-column statistics collection mode</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-non-default-columns-min-retention-period" class="anchored"><code>sql.stats.non_default_columns.min_retention_period</code></div></td><td>duration</td><td><code>24h0m0s</code></td><td>minimum retention period for table statistics collected on non-default columns</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-non-indexed-json-histograms-enabled" class="anchored"><code>sql.stats.non_indexed_json_histograms.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to collect table statistics histograms on non-indexed JSON columns</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-persisted-rows-max" class="anchored"><code>sql.stats.persisted_rows.max</code></div></td><td>integer</td><td><code>1000000</code></td><td>maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-post-events-enabled" class="anchored"><code>sql.stats.post_events.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if set, an event is logged for every CREATE STATISTICS job</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-response-max" class="anchored"><code>sql.stats.response.max</code></div></td><td>integer</td><td><code>20000</code></td><td>the maximum number of statements and transaction stats returned in a CombinedStatements request</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
Expand Down
30 changes: 26 additions & 4 deletions pkg/sql/create_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,18 @@ var statsOnVirtualCols = settings.RegisterBoolSetting(
true,
settings.WithPublic)

// nonIndexJSONHistograms is the cluster setting that controls whether table
// statistics collection builds histograms for non-indexed JSON columns.
//
// Collecting histograms on non-indexed JSON columns can require a lot of memory
// when the JSON values are large. This is true even when only two histogram
// buckets are generated because we still sample many JSON values which exist in
// memory for the duration of the stats collection job. The setting defaults to
// true, so histograms are still collected for non-indexed JSON columns unless
// it is explicitly set to false.
var nonIndexJSONHistograms = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"sql.stats.non_indexed_json_histograms.enabled",
"set to true to collect table statistics histograms on non-indexed JSON columns",
true,
settings.WithPublic)

// nonIndexColHistogramBuckets is the histogram bucket count that StubTableStats
// passes to createStatsDefaultColumns as defaultHistogramBuckets.
// NOTE(review): the name suggests it is also the bucket cap for histograms on
// non-indexed columns in general — confirm against the rest of the file.
const nonIndexColHistogramBuckets = 2

// StubTableStats generates "stub" statistics for a table which are missing
Expand All @@ -72,8 +84,10 @@ func StubTableStats(
desc catalog.TableDescriptor, name string,
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(
context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */
false /* partialStats */, nonIndexColHistogramBuckets, nil, /* evalCtx */
context.Background(), desc,
false /* virtColEnabled */, false, /* multiColEnabled */
false /* nonIndexJSONHistograms */, false, /* partialStats */
nonIndexColHistogramBuckets, nil, /* evalCtx */
)
if err != nil {
return nil, err
Expand Down Expand Up @@ -255,6 +269,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
tableDesc,
virtColEnabled,
multiColEnabled,
nonIndexJSONHistograms.Get(n.p.ExecCfg().SV()),
n.Options.UsingExtremes,
defaultHistogramBuckets,
n.p.EvalContext(),
Expand Down Expand Up @@ -374,6 +389,9 @@ const maxNonIndexCols = 100
// predicate expressions are also likely to appear in query filters, so stats
// are collected for those columns as well.
//
// If nonIndexJSONHistograms is true, 2-bucket histograms are collected for
// non-indexed JSON columns. If it is false, no histograms are collected for
// non-indexed JSON columns.
//
// If partialStats is true, we only collect statistics on single columns that
// are prefixes of forward indexes, and skip over partial, sharded, and
// implicitly partitioned indexes. Partial statistic creation only supports
Expand All @@ -385,7 +403,7 @@ const maxNonIndexCols = 100
func createStatsDefaultColumns(
ctx context.Context,
desc catalog.TableDescriptor,
virtColEnabled, multiColEnabled, partialStats bool,
virtColEnabled, multiColEnabled, nonIndexJSONHistograms, partialStats bool,
defaultHistogramBuckets uint32,
evalCtx *eval.Context,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
Expand Down Expand Up @@ -662,9 +680,13 @@ func createStatsDefaultColumns(
if col.GetType().Family() == types.BoolFamily || col.GetType().Family() == types.EnumFamily {
maxHistBuckets = defaultHistogramBuckets
}
hasHistogram := !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType())
if col.GetType().Family() == types.JsonFamily {
hasHistogram = nonIndexJSONHistograms
}
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: colIDs,
HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()),
HasHistogram: hasHistogram,
HistogramMaxBuckets: maxHistBuckets,
})
nonIdxCols++
Expand Down
45 changes: 45 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/stats
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,48 @@ BEGIN;
ALTER TYPE greeting ADD VALUE 'hey';
SELECT * FROM t122312 WHERE g = 'hi';
COMMIT;

# Regression test related to #139381. Do not collect histograms on non-indexed
# JSON columns when sql.stats.non_indexed_json_histograms.enabled is false.
statement ok
CREATE TABLE t139381 (
k INT PRIMARY KEY,
j JSON,
v STRING AS (j->>'name') VIRTUAL,
INDEX (v)
)

statement ok
SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = false

statement ok
INSERT INTO t139381
SELECT i, ('{"name": "name_' || i || '", "data": "abcdefghij"}')::JSONB
FROM (VALUES (1), (2)) v(i)

statement ok
ANALYZE t139381

query TT rowsort
SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
FROM [SHOW STATISTICS FOR TABLE t139381]
----
{k} histogram_collected
{j} no_histogram_collected
{v} histogram_collected

# Histograms are collected on non-indexed JSON columns when the cluster setting
# is enabled.
statement ok
SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = true

statement ok
ANALYZE t139381

query TT rowsort
SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
FROM [SHOW STATISTICS FOR TABLE t139381]
----
{k} histogram_collected
{j} histogram_collected
{v} histogram_collected