Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ff58736

Browse files
author
Nikita Glukhov
committed
Remove "length" stats, move "array_length" and "avg_array_length" stats to parent path, add "object_length" stats
1 parent ff3482c commit ff58736

File tree

3 files changed

+108
-88
lines changed

3 files changed

+108
-88
lines changed

src/backend/utils/adt/jsonb_selfuncs.c

Lines changed: 86 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -153,19 +153,19 @@ jsonStatsRelease(JsonStats data)
153153
}
154154

155155
/*
156-
* jsonPathStatsGetSpecialStats
157-
* Extract statistics of given type for JSON path.
158-
*
159-
* XXX This does not really extract any stats, it merely allocates the struct?
156+
* jsonPathStatsAllocSpecialStats
157+
* Allocate a copy of JsonPathStats for accessing special (length etc.)
158+
* stats for a given JSON path.
160159
*/
161160
static JsonPathStats
162-
jsonPathStatsGetSpecialStats(JsonPathStats pstats, JsonPathStatsType type)
161+
jsonPathStatsAllocSpecialStats(JsonPathStats pstats, JsonPathStatsType type)
163162
{
164163
JsonPathStats stats;
165164

166165
if (!pstats)
167166
return NULL;
168167

168+
/* copy and replace stats type */
169169
stats = palloc(sizeof(*stats));
170170
*stats = *pstats;
171171
stats->type = type;
@@ -174,35 +174,39 @@ jsonPathStatsGetSpecialStats(JsonPathStats pstats, JsonPathStatsType type)
174174
}
175175

176176
/*
177-
* jsonPathStatsGetLengthStats
178-
* Extract statistics of lengths (for arrays or objects) for the path.
177+
* jsonPathStatsGetArrayLengthStats
178+
* Extract statistics of array lengths for the path.
179179
*/
180180
JsonPathStats
181-
jsonPathStatsGetLengthStats(JsonPathStats pstats)
181+
jsonPathStatsGetArrayLengthStats(JsonPathStats pstats)
182182
{
183183
/*
184-
* The length statistics is relevant only for values that are objects or
185-
* arrays. So if we observed no such values, we know there can't be such
184+
* The array length statistics is relevant only for values that are arrays.
185+
* So if we observed no such values, we know there can't be such
186186
* statistics and so we simply return NULL.
187187
*/
188-
if (jsonPathStatsGetTypeFreq(pstats, jbvObject, 0.0) <= 0.0 &&
189-
jsonPathStatsGetTypeFreq(pstats, jbvArray, 0.0) <= 0.0)
188+
if (jsonPathStatsGetTypeFreq(pstats, jbvArray, 0.0) <= 0.0)
190189
return NULL;
191190

192-
return jsonPathStatsGetSpecialStats(pstats, JsonPathStatsLength);
191+
return jsonPathStatsAllocSpecialStats(pstats, JsonPathStatsArrayLength);
193192
}
194193

195194
/*
196-
* jsonPathStatsGetArrayLengthStats
197-
* Extract statistics of lengths for arrays.
198-
*
199-
* XXX Why doesn't this do jsonPathStatsGetTypeFreq check similar to what
200-
* jsonPathStatsGetLengthStats does?
195+
* jsonPathStatsGetObjectLengthStats
196+
* Extract statistics of object length for the path.
201197
*/
202-
static JsonPathStats
203-
jsonPathStatsGetArrayLengthStats(JsonPathStats pstats)
198+
JsonPathStats
199+
jsonPathStatsGetObjectLengthStats(JsonPathStats pstats)
204200
{
205-
return jsonPathStatsGetSpecialStats(pstats, JsonPathStatsArrayLength);
201+
/*
202+
* The object length statistics is relevant only for values that are objects.
203+
* So if we observed no such values, we know there can't be such
204+
* statistics and so we simply return NULL.
205+
*/
206+
if (jsonPathStatsGetTypeFreq(pstats, jbvObject, 0.0) <= 0.0)
207+
return NULL;
208+
209+
return jsonPathStatsAllocSpecialStats(pstats, JsonPathStatsObjectLength);
206210
}
207211

208212
/*
@@ -474,15 +478,20 @@ jsonStatsGetPath(JsonStats jsdata, Datum *path, int pathlen, float4 *nullfrac)
474478
else
475479
{
476480
/* Find array index stats */
477-
float4 arrfreq;
478-
479481
/* FIXME consider object key "index" also */
480-
pstats = jsonPathStatsGetSubpath(pstats, NULL);
481-
sel *= jsonPathStatsGetArrayIndexSelectivity(pstats, index);
482-
arrfreq = jsonPathStatsGetFreq(pstats, 0.0);
482+
JsonPathStats arrstats = jsonPathStatsGetSubpath(pstats, NULL);
483+
484+
if (arrstats)
485+
{
486+
float4 arrfreq = jsonPathStatsGetFreq(pstats, 0.0);
483487

484-
if (arrfreq > 0.0)
485-
sel /= arrfreq;
488+
sel *= jsonPathStatsGetArrayIndexSelectivity(pstats, index);
489+
490+
if (arrfreq > 0.0)
491+
sel /= arrfreq;
492+
}
493+
494+
pstats = arrstats;
486495
}
487496

488497
pfree(key);
@@ -702,7 +711,8 @@ jsonPathStatsExtractData(JsonPathStats pstats, JsonStatType stattype,
702711
case JsonStatJsonb:
703712
case JsonStatJsonbWithoutSubpaths:
704713
key = pstats->type == JsonPathStatsArrayLength ? "array_length" :
705-
pstats->type == JsonPathStatsLength ? "length" : "json";
714+
pstats->type == JsonPathStatsObjectLength ? "object_length" :
715+
"json";
706716
type = JSONBOID;
707717
eqop = JsonbEqOperator;
708718
ltop = JsonbLtOperator;
@@ -846,30 +856,24 @@ jsonPathStatsGetTypeFreq(JsonPathStats pstats, JsonbValueType type,
846856
/*
847857
* When dealing with (object/array) length stats, we only really care about
848858
* objects and arrays.
859+
*
860+
* Lengths are always numeric, so simply return 0 if the frequency
861+
* of non-numeric values is requested.
849862
*/
850-
if (pstats->type == JsonPathStatsLength)
863+
if (pstats->type == JsonPathStatsArrayLength)
851864
{
852-
/*
853-
* Array/object length is always numeric, so simply return 0 if
854-
* requested non-numeric frequency.
855-
*/
856865
if (type != jbvNumeric)
857866
return 0.0;
858867

859-
return jsonPathStatsGetFloat(pstats, "freq_array", defaultfreq) +
860-
jsonPathStatsGetFloat(pstats, "freq_object", defaultfreq);
868+
return jsonPathStatsGetFloat(pstats, "freq_array", defaultfreq);
861869
}
862870

863-
if (pstats->type == JsonPathStatsArrayLength)
871+
if (pstats->type == JsonPathStatsObjectLength)
864872
{
865-
/*
866-
* Array length is always numeric, so simply return 0 if requested
867-
* non-numeric frequency.
868-
*/
869873
if (type != jbvNumeric)
870874
return 0.0;
871875

872-
return jsonPathStatsGetFreq(pstats, defaultfreq);
876+
return jsonPathStatsGetFloat(pstats, "freq_object", defaultfreq);
873877
}
874878

875879
/* Which JSON type are we interested in? Pick the right freq_type key. */
@@ -955,29 +959,33 @@ static HeapTuple
955959
jsonStatsGetArrayIndexStatsTuple(JsonStats jsdata, JsonStatType type, int32 index)
956960
{
957961
/* Extract statistics for root array elements */
958-
JsonPathStats pstats = jsonStatsGetRootArrayPath(jsdata);
962+
JsonPathStats arrstats = jsonStatsGetRootArrayPath(jsdata);
963+
JsonPathStats rootstats;
959964
Selectivity index_sel;
960965

961-
if (!pstats)
966+
if (!arrstats)
962967
return NULL;
963968

964969
/* Compute relative selectivity of 'EXISTS($[index])' */
965-
index_sel = jsonPathStatsGetArrayIndexSelectivity(pstats, index);
966-
index_sel /= jsonPathStatsGetFreq(pstats, 0.0);
970+
rootstats = jsonStatsGetRootPath(jsdata);
971+
index_sel = jsonPathStatsGetArrayIndexSelectivity(rootstats, index);
972+
index_sel /= jsonPathStatsGetFreq(arrstats, 0.0);
967973

968974
/* Form pg_statistics tuple, taking into account array index selectivity */
969-
return jsonPathStatsFormTuple(pstats, type, 1.0 - index_sel);
975+
return jsonPathStatsFormTuple(arrstats, type, 1.0 - index_sel);
970976
}
971977

972978
/*
973979
* jsonStatsGetPathFreq
974980
* Return frequency of a path (fraction of documents containing it).
975981
*/
976982
static float4
977-
jsonStatsGetPathFreq(JsonStats jsdata, Datum *path, int pathlen)
983+
jsonStatsGetPathFreq(JsonStats jsdata, Datum *path, int pathlen,
984+
bool try_array_indexes)
978985
{
979986
float4 nullfrac;
980-
JsonPathStats pstats = jsonStatsGetPath(jsdata, path, pathlen, &nullfrac);
987+
JsonPathStats pstats = jsonStatsGetPath(jsdata, path, pathlen,
988+
try_array_indexes, &nullfrac);
981989
float4 freq = (1.0 - nullfrac) * jsonPathStatsGetFreq(pstats, 0.0);
982990

983991
CLAMP_PROBABILITY(freq);
@@ -1192,14 +1200,14 @@ static void
11921200
jsonAccumulateSubPathSelectivity(Selectivity subpath_abs_sel,
11931201
Selectivity path_freq,
11941202
Selectivity *path_relative_sel,
1195-
bool is_array_accessor,
1196-
JsonPathStats path_stats)
1203+
JsonPathStats array_path_stats)
11971204
{
11981205
Selectivity sel = subpath_abs_sel / path_freq; /* relative selectivity */
11991206

12001207
/* XXX Try to take into account array length */
1201-
if (is_array_accessor)
1202-
sel = 1.0 - pow(1.0 - sel, jsonPathStatsGetAvgArraySize(path_stats));
1208+
if (array_path_stats)
1209+
sel = 1.0 - pow(1.0 - sel,
1210+
jsonPathStatsGetAvgArraySize(array_path_stats));
12031211

12041212
/* Accumulate selectivity of subpath into parent path */
12051213
*path_relative_sel *= sel;
@@ -1299,6 +1307,14 @@ jsonSelectivityContains(JsonStats stats, Jsonb *jb)
12991307
JsonPathStats pstats;
13001308
Selectivity freq;
13011309

1310+
/*
1311+
* First, find stats for the parent path if needed, it will be
1312+
* used in jsonAccumulateSubPathSelectivity().
1313+
*/
1314+
if (!path->stats)
1315+
path->stats = jsonStatsFindPath(stats, pathstr.data,
1316+
pathstr.len);
1317+
13021318
/* Append path string entry for array elements, get stats. */
13031319
jsonPathAppendEntry(&pathstr, NULL);
13041320
pstats = jsonStatsFindPath(stats, pathstr.data, pathstr.len);
@@ -1336,8 +1352,8 @@ jsonSelectivityContains(JsonStats stats, Jsonb *jb)
13361352
/* Accumulate selectivity into parent path */
13371353
jsonAccumulateSubPathSelectivity(abs_sel, path->freq,
13381354
&path->sel,
1339-
path->is_array_accesor,
1340-
path->stats);
1355+
path->is_array_accesor ?
1356+
path->parent->stats : NULL);
13411357
break;
13421358
}
13431359

@@ -1358,22 +1374,28 @@ jsonSelectivityContains(JsonStats stats, Jsonb *jb)
13581374
case WJB_ELEM:
13591375
{
13601376
/*
1361-
* Extract statistics for path. Arrays elements shares the
1377+
* Extract statistics for a path. Array elements share the
13621378
* same statistics that were extracted in WJB_BEGIN_ARRAY.
13631379
*/
13641380
JsonPathStats pstats = r == WJB_ELEM ? path->stats :
13651381
jsonStatsFindPath(stats, pathstr.data, pathstr.len);
1366-
/* Make scalar jsonb datum */
1367-
Datum scalar = JsonbPGetDatum(JsonbValueToJsonb(&v));
1368-
/* Absolute selectivity of 'path == scalar' */
1369-
Selectivity abs_sel = jsonSelectivity(pstats, scalar,
1370-
JsonbEqOperator);
1382+
Selectivity abs_sel; /* Absolute selectivity of 'path == scalar' */
1383+
1384+
if (pstats)
1385+
{
1386+
/* Make scalar jsonb datum and compute selectivity */
1387+
Datum scalar = JsonbPGetDatum(JsonbValueToJsonb(&v));
1388+
1389+
abs_sel = jsonSelectivity(pstats, scalar, JsonbEqOperator);
1390+
}
1391+
else
1392+
abs_sel = 0.0;
13711393

13721394
/* Accumulate selectivity into parent path */
13731395
jsonAccumulateSubPathSelectivity(abs_sel, path->freq,
13741396
&path->sel,
1375-
path->is_array_accesor,
1376-
path->stats);
1397+
path->is_array_accesor ?
1398+
path->parent->stats : NULL);
13771399
break;
13781400
}
13791401

@@ -1417,7 +1439,7 @@ jsonSelectivityExists(JsonStats stats, Datum key)
14171439
arrstats = jsonStatsGetRootArrayPath(stats);
14181440
arraysel = jsonSelectivity(arrstats, jbkey, JsonbEqOperator);
14191441
arraysel = 1.0 - pow(1.0 - arraysel,
1420-
jsonPathStatsGetAvgArraySize(arrstats));
1442+
jsonPathStatsGetAvgArraySize(rootstats));
14211443

14221444
sel = keysel + scalarsel + arraysel;
14231445
CLAMP_PROBABILITY(sel);

src/backend/utils/adt/jsonb_typanalyze.c

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ typedef struct JsonValueStats
136136
JsonScalarStats numerics; /* stats for JSON numerics */
137137
#endif
138138

139-
JsonScalarStats lens; /* stats of object/array lengths */
140139
JsonScalarStats arrlens; /* stats of array lengths */
140+
JsonScalarStats objlens; /* stats of object lengths */
141141

142142
int nnulls; /* number of JSON null values */
143143
int ntrue; /* number of JSON true values */
@@ -146,6 +146,9 @@ typedef struct JsonValueStats
146146
int narrays; /* number of JSON arrays */
147147
int nstrings; /* number of JSON strings */
148148
int nnumerics; /* number of JSON numerics */
149+
150+
int64 narrelems; /* total number of array elements
151+
* (for avg. array length) */
149152
} JsonValueStats;
150153

151154
/* Main structure for analyzed JSON path */
@@ -377,20 +380,20 @@ jsonAnalyzeJsonValue(JsonAnalyzeContext *ctx, JsonValueStats *vstats,
377380
case jbvBinary:
378381
if (JsonContainerIsObject(jv->val.binary.data))
379382
{
380-
uint32 size = JsonContainerSize(jv->val.binary.data);
383+
uint32 size = JsonContainerSize(jv->val.binary.data);
381384

382385
value = DatumGetInt32(size);
383386
vstats->nobjects++;
384-
JsonValuesAppend(&vstats->lens.values, value, ctx->target);
387+
JsonValuesAppend(&vstats->objlens.values, value, ctx->target);
385388
}
386389
else if (JsonContainerIsArray(jv->val.binary.data))
387390
{
388-
uint32 size = JsonContainerSize(jv->val.binary.data);
391+
uint32 size = JsonContainerSize(jv->val.binary.data);
389392

390393
value = DatumGetInt32(size);
391394
vstats->narrays++;
392395
JsonValuesAppend(&vstats->arrlens.values, value, ctx->target);
393-
JsonValuesAppend(&vstats->lens.values, value, ctx->target);
396+
vstats->narrelems += size;
394397
}
395398
break;
396399

@@ -812,24 +815,18 @@ jsonAnalyzeBuildPathStats(JsonPathAnlStats *pstats)
812815
vstats->jsons.values.count);
813816

814817
/*
815-
* XXX not sure why we keep length and array length stats at this level.
816-
* Aren't those covered by the per-column stats? We certainly have
817-
* frequencies for array elements etc.
818+
* We keep array length stats here for queries like jsonpath '$.size() > 5'.
819+
* Object length stats can be useful for other query languages.
818820
*/
819-
if (pstats->vstats.lens.values.count)
820-
jsonAnalyzeMakeScalarStats(&ps, "length", &vstats->lens.stats);
821+
if (vstats->arrlens.values.count)
822+
jsonAnalyzeMakeScalarStats(&ps, "array_length", &vstats->arrlens.stats);
821823

822-
if (JsonPathEntryIsArray(&pstats->path))
823-
{
824-
JsonPathAnlStats *parent = (JsonPathAnlStats *) pstats->path.parent;
824+
if (vstats->objlens.values.count)
825+
jsonAnalyzeMakeScalarStats(&ps, "object_length", &vstats->objlens.stats);
825826

827+
if (vstats->narrays)
826828
pushJsonbKeyValueFloat(&ps, &val, "avg_array_length",
827-
(float4) vstats->jsons.values.count /
828-
parent->vstats.narrays);
829-
830-
jsonAnalyzeMakeScalarStats(&ps, "array_length",
831-
&parent->vstats.arrlens.stats);
832-
}
829+
(float4) vstats->narrelems / vstats->narrays);
833830

834831
if (full)
835832
{
@@ -895,12 +892,12 @@ jsonAnalyzePath(JsonAnalyzeContext *ctx, JsonPathAnlStats *pstats)
895892
* Array lengths and object lengths. We divide counts by the total number of json
896893
* values to compute correct nullfrac (i.e. not all jsons have lengths).
897894
*/
898-
jsonAnalyzePathValues(ctx, &vstats->lens, INT4OID,
899-
pstats->freq * vstats->lens.values.count /
900-
vstats->jsons.values.count);
901895
jsonAnalyzePathValues(ctx, &vstats->arrlens, INT4OID,
902896
pstats->freq * vstats->arrlens.values.count /
903897
vstats->jsons.values.count);
898+
jsonAnalyzePathValues(ctx, &vstats->objlens, INT4OID,
899+
pstats->freq * vstats->objlens.values.count /
900+
vstats->jsons.values.count);
904901

905902
#ifdef JSON_ANALYZE_SCALARS
906903
/* stats for values of string/numeric types only */

0 commit comments

Comments
 (0)