From f5d925842f84cc81f35be15f3a54e213eab19ba5 Mon Sep 17 00:00:00 2001 From: Nikolaus Schuetz Date: Fri, 26 Jun 2026 11:10:08 -0700 Subject: [PATCH] fix: make binningUdf and maxBins optional in distinct/histogram checks hasNumberOfDistinctValues and hasHistogramValues forced callers to pass binningUdf and maxBins, although Deequ's Scala API marks both as optional with defaults (binningUdf=None, maxBins=Histogram.MaximumAllowedDetailBins). Default to the Scala-side defaults via the compiler-generated <method>$default$N accessors (e.g. hasNumberOfDistinctValues$default$3) when the args are omitted, mirroring the existing satisfies() pattern. Existing positional callers remain backward compatible. Fixes #81 --- pydeequ/checks.py | 14 ++++++++++---- tests/test_checks.py | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pydeequ/checks.py b/pydeequ/checks.py index 87280c3..b9b6b3f 100644 --- a/pydeequ/checks.py +++ b/pydeequ/checks.py @@ -303,32 +303,38 @@ def hasUniqueValueRatio(self, columns, assertion, hint=None): self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint, self._jvm.scala.Option.apply(None)) return self - def hasNumberOfDistinctValues(self, column, assertion, binningUdf, maxBins, hint=None): + def hasNumberOfDistinctValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None): """Creates a constraint that asserts on the number of distinct values a column has. :param str column: Column in Data Frame to run the assertion on. :param lambda assertion: A function that accepts an int or float parameter. - :param lambda binningUDF: An optional binning function. + :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning). :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N. + Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins). :param str hint: A hint that states why a constraint could have failed. 
:return: hasNumberOfDistinctValues self: A Check object that asserts distinctness in the column. """ assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) + binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$3")() + maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$4")() hint = self._jvm.scala.Option.apply(hint) self._Check = self._Check.hasNumberOfDistinctValues(column, assertion_func, binningUdf, maxBins, hint) return self - def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None): + def hasHistogramValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None): """Creates a constraint that asserts on column's value distribution. :param str column: Column in Data Frame to run the assertion on. :param lambda assertion: A function that accepts an int or float parameter as a distribution input parameter. - :param str binningUDF: An optional binning function. + :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning). :param str maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N. + Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins). :param str hint: A hint that states why a constraint could have failed. :return: hasHistogramValues self: A Check object that asserts the column's value distribution in the column. 
""" assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) + binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasHistogramValues$default$3")() + maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasHistogramValues$default$4")() hint = self._jvm.scala.Option.apply(hint) self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint) return self diff --git a/tests/test_checks.py b/tests/test_checks.py index 878257b..692d4b4 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -1072,6 +1072,12 @@ def test_hasNumberOfDistinctValues(self): [Row(constraint_status="Success")], ) + def test_hasNumberOfDistinctValues_without_binning_args(self): + # Issue #81: binningUdf and maxBins must be optional + check = Check(self.spark, CheckLevel.Warning, "test hasNumberOfDistinctValues optional") + result = self.run_check(check.hasNumberOfDistinctValues("b", lambda x: x == 3)) + self.assertEqual(result, [Row(constraint_status="Success")]) + def test_isPrimaryKey(self): check = Check(self.spark, CheckLevel.Warning, "test isPrimaryKey") check = ( @@ -1107,6 +1113,27 @@ def _parse_dv(dv): result = self.run_check(check.hasHistogramValues("b", assertion_func, None, 3)) self.assertEqual(result, [Row(constraint_status="Success")]) + def test_hasHistogramValues_without_binning_args(self): + # Issue #81: binningUdf and maxBins must be optional + def assertion_func(x): + def _parse_dv(dv): + return dv.absolute(), dv.ratio() + + distribution_values = scala_map_to_dict(self.spark._jvm, x.values()) + dv1 = _parse_dv(distribution_values["1"]) + dv2 = _parse_dv(distribution_values["2"]) + dv3 = _parse_dv(distribution_values["3"]) + return ( + len(distribution_values) == 3 + and dv1 == (1, 1 / 3) + and dv2 == (1, 1 / 3) + and dv3 == (1, 1 / 3) + ) + + check = Check(self.spark, CheckLevel.Warning, "test hasHistogramValues optional") + result = 
self.run_check(check.hasHistogramValues("b", assertion_func)) + self.assertEqual(result, [Row(constraint_status="Success")]) + def test_kllSketchSatisfies(self): def assertion_func(x): bucket0 = x.apply(0)