Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: statistical functions should return null when provided a vector of only null values #5606

Merged
merged 19 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 112 additions & 11 deletions engine/function/src/templates/Numeric.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import java.util.Arrays;
import static io.deephaven.base.CompareUtils.compare;
import static io.deephaven.util.QueryConstants.*;
import static io.deephaven.function.Basic.*;
import static io.deephaven.function.Sort.*;
import static io.deephaven.function.Cast.castDouble;

/**
Expand Down Expand Up @@ -364,6 +365,7 @@ public class Numeric {

double sum = 0;
double count = 0;
long nullCount = 0;

try ( final ${pt.vectorIterator} vi = values.iterator() ) {
while ( vi.hasNext() ) {
Expand All @@ -374,10 +376,16 @@ public class Numeric {
if (!isNull(c)) {
sum += c;
count++;
} else {
nullCount++;
}
}
}

if (nullCount == values.size()) {
return NULL_DOUBLE;
}

return sum / count;
}

Expand Down Expand Up @@ -418,6 +426,7 @@ public class Numeric {

double sum = 0;
double count = 0;
long nullCount = 0;

try ( final ${pt.vectorIterator} vi = values.iterator() ) {
while ( vi.hasNext() ) {
Expand All @@ -431,10 +440,16 @@ public class Numeric {
if (!isNull(c)) {
sum += Math.abs(c);
count++;
} else {
nullCount++;
}
}
}

if (nullCount == values.size()) {
return NULL_DOUBLE;
}

return sum / count;
}

Expand Down Expand Up @@ -485,6 +500,7 @@ public class Numeric {
double sum = 0;
double sum2 = 0;
long count = 0;
long nullCount = 0;
try ( final ${pt.vectorIterator} vi = values.iterator() ) {
while ( vi.hasNext() ) {
final ${pt.primitive} c = vi.${pt.iteratorNext}();
Expand All @@ -495,10 +511,16 @@ public class Numeric {
sum += (double)c;
sum2 += (double)c * (double)c;
count++;
} else {
nullCount++;
}
}
}

if (nullCount == values.size()) {
return NULL_DOUBLE;
}

// Return NaN if overflow or too few values to compute variance.
if (count <= 1 || Double.isInfinite(sum) || Double.isInfinite(sum2)) {
return Double.NaN;
Expand Down Expand Up @@ -596,6 +618,7 @@ public class Numeric {
double sum2 = 0;
double count = 0;
double count2 = 0;
long nullCount = 0;

try (
final ${pt.vectorIterator} vi = values.iterator();
Expand All @@ -615,10 +638,16 @@ public class Numeric {
sum2 += w * c * c;
count += w;
count2 += w * w;
} else {
nullCount++;
}
}
}

if (nullCount == values.size()) {
return NULL_DOUBLE;
}

// Return NaN if overflow or too few values to compute variance.
if (count <= 1 || Double.isInfinite(sum) || Double.isInfinite(sum2)) {
return Double.NaN;
Expand Down Expand Up @@ -868,6 +897,11 @@ public class Numeric {
throw new IllegalArgumentException("Incompatible input sizes: " + values.size() + ", " + weights.size());
}

final double s = wstd(values, weights);
if (s == NULL_DOUBLE) {
return NULL_DOUBLE;
}

// see https://stats.stackexchange.com/questions/25895/computing-standard-error-in-weighted-mean-estimation
double sumw = 0;
double sumw2 = 0;
Expand All @@ -887,7 +921,6 @@ public class Numeric {
}
}

final double s = wstd(values, weights);
return s == NULL_DOUBLE ? NULL_DOUBLE : s * Math.sqrt(sumw2/sumw/sumw);
}

Expand Down Expand Up @@ -996,7 +1029,15 @@ public class Numeric {
}

final double a = wavg(values, weights);
if (a == NULL_DOUBLE) {
return NULL_DOUBLE;
}

final double s = wste(values, weights);
if (s == NULL_DOUBLE) {
return NULL_DOUBLE;
}

return a / s;
}

Expand Down Expand Up @@ -1226,13 +1267,38 @@ public class Numeric {
int n = values.intSize("median");

if (n == 0) {
return Double.NaN;
return NULL_DOUBLE;
} else {
${pt.primitive}[] copy = values.copyToArray();
Arrays.sort(copy);
if (n % 2 == 0)
return 0.5 * (copy[n / 2 - 1] + copy[n / 2]);
else return copy[n / 2];
// NULL values sorted to beginning of the array.
${pt.primitive}[] copy = sort(values.toArray());
Copy link
Member

@rcaudy rcaudy Jun 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not ideal. The implementation of sort is sub-optimal:

  1. We make one toArray() call internally.
  2. Then we make an array of boxed values.
  3. Then we sort using a comparator.
  4. Then we make a third copy wherein we unbox the values.

Sorting on the vector at least let's us skip one of the copies.

We should write a separate ticket for updating the sort kernels in io.deephaven.engine.table.impl.sort to have a version that does not bother with a valuesToPermute chunk, and use that version for the implementation of sort. This way we make exactly one copyToArray() and we're done. Note copyToArray(), not toArray(): we mustn't sort our backing array.


// Determine if there are any NULL in the array.
int nullCount = 0;
for (int i = 0; i < n; i++) {
if (isNull(copy[i])) {
nullCount++;
} else {
break;
}
}

if (nullCount == 0) {
// No NULL, so we can just compute the median and return.
if (n % 2 == 0)
return 0.5 * (copy[n / 2 - 1] + copy[n / 2]);
else return copy[n / 2];
} else if (nullCount < n) {
// Some NULL, reduce the count and compute the median of the non-null values.
n -= nullCount;
if (n % 2 == 0) {
int index = n / 2;
return 0.5 * (copy[n / 2 - 1 + nullCount] + copy[n / 2 + nullCount]);
}
else return copy[n / 2 + nullCount];
} else {
// All values are NULL.
return NULL_DOUBLE;
}
}
}

Expand Down Expand Up @@ -1268,11 +1334,32 @@ public class Numeric {
}

int n = values.intSize("percentile");
${pt.primitive}[] copy = values.copyToArray();
Arrays.sort(copy);
// NULL values sorted to beginning of the array.
${pt.primitive}[] copy = sort(values.toArray());

int idx = (int) Math.round(percentile * (n - 1));
return copy[idx];
// Determine if there are any NULL in the array.
int nullCount = 0;
for (int i = 0; i < n; i++) {
if (isNull(copy[i])) {
nullCount++;
} else {
break;
}
}

if (nullCount == 0) {
// No NULL, so we can just compute the index and return.
int idx = (int) Math.round(percentile * (n - 1));
return copy[idx];
} else if (nullCount < n) {
// Some NULL, reduce the count and compute the median of the non-null values.
n -= nullCount;
int idx = (int) Math.round(percentile * (n - 1));
return copy[idx + nullCount];
} else {
// All values are NULL.
return ${pt.null};
}
}


Expand Down Expand Up @@ -1344,6 +1431,7 @@ public class Numeric {
double sum1 = 0;
double sum01 = 0;
double count = 0;
long nullCount = 0;

try (
final ${pt.vectorIterator} v0i = values0.iterator();
Expand All @@ -1364,10 +1452,16 @@ public class Numeric {
sum1 += v1;
sum01 += v0 * v1;
count++;
} else {
nullCount++;
}
}
}

if (nullCount == values0.size()) {
return NULL_DOUBLE;
}

return sum01 / count - sum0 * sum1 / count / count;
}

Expand Down Expand Up @@ -1438,6 +1532,7 @@ public class Numeric {
double sum1Sq = 0;
double sum01 = 0;
double count = 0;
long nullCount = 0;

try (
final ${pt.vectorIterator} v0i = values0.iterator();
Expand All @@ -1460,10 +1555,16 @@ public class Numeric {
sum1Sq += v1 * v1;
sum01 += v0 * v1;
count++;
} else {
nullCount++;
}
}
}

if (nullCount == values0.size()) {
return NULL_DOUBLE;
}

double cov = sum01 / count - sum0 * sum1 / count / count;
double var0 = sum0Sq / count - sum0 * sum0 / count / count;
double var1 = sum1Sq / count - sum1 * sum1 / count / count;
Expand Down
Loading
Loading