diff --git a/pydeequ/checks.py b/pydeequ/checks.py
index 87280c3..b9b6b3f 100644
--- a/pydeequ/checks.py
+++ b/pydeequ/checks.py
@@ -303,32 +303,42 @@ def hasUniqueValueRatio(self, columns, assertion, hint=None):
         self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint, self._jvm.scala.Option.apply(None))
         return self
 
-    def hasNumberOfDistinctValues(self, column, assertion, binningUdf, maxBins, hint=None):
+    def hasNumberOfDistinctValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
         """Creates a constraint that asserts on the number of distinct values a column has.
 
         :param str column: Column in Data Frame to run the assertion on.
         :param lambda assertion: A function that accepts an int or float parameter.
-        :param lambda binningUDF: An optional binning function.
+        :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
         :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
+            Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasNumberOfDistinctValues self: A Check object that asserts distinctness in the column.
         """
         assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
+        # Deequ's Scala defaults are reached through the compiler-generated `<method>$default$<N>`
+        # accessors, so an omitted argument resolves to the JVM-side default value.
+        binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$3")()
+        maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$4")()
         hint = self._jvm.scala.Option.apply(hint)
         self._Check = self._Check.hasNumberOfDistinctValues(column, assertion_func, binningUdf, maxBins, hint)
         return self
 
-    def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None):
+    def hasHistogramValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
         """Creates a constraint that asserts on column's value distribution.
 
         :param str column: Column in Data Frame to run the assertion on.
         :param lambda assertion: A function that accepts an int or float parameter as a distribution input parameter.
-        :param str binningUDF: An optional binning function.
-        :param str maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
+        :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
+        :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
+            Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasHistogramValues self: A Check object that asserts the column's value distribution in the column.
         """
         assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
+        # Deequ's Scala defaults are reached through the compiler-generated `<method>$default$<N>`
+        # accessors, so an omitted argument resolves to the JVM-side default value.
+        binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasHistogramValues$default$3")()
+        maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasHistogramValues$default$4")()
         hint = self._jvm.scala.Option.apply(hint)
         self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint)
         return self
diff --git a/tests/test_checks.py b/tests/test_checks.py
index 878257b..692d4b4 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -1072,6 +1072,12 @@ def test_hasNumberOfDistinctValues(self):
             [Row(constraint_status="Success")],
         )
 
+    def test_hasNumberOfDistinctValues_without_binning_args(self):
+        # Issue #81: binningUdf and maxBins must be optional
+        check = Check(self.spark, CheckLevel.Warning, "test hasNumberOfDistinctValues optional")
+        result = self.run_check(check.hasNumberOfDistinctValues("b", lambda x: x == 3))
+        self.assertEqual(result, [Row(constraint_status="Success")])
+
     def test_isPrimaryKey(self):
         check = Check(self.spark, CheckLevel.Warning, "test isPrimaryKey")
         check = (
@@ -1107,6 +1113,27 @@ def _parse_dv(dv):
         result = self.run_check(check.hasHistogramValues("b", assertion_func, None, 3))
         self.assertEqual(result, [Row(constraint_status="Success")])
 
+    def test_hasHistogramValues_without_binning_args(self):
+        # Issue #81: binningUdf and maxBins must be optional
+        def assertion_func(x):
+            def _parse_dv(dv):
+                return dv.absolute(), dv.ratio()
+
+            distribution_values = scala_map_to_dict(self.spark._jvm, x.values())
+            dv1 = _parse_dv(distribution_values["1"])
+            dv2 = _parse_dv(distribution_values["2"])
+            dv3 = _parse_dv(distribution_values["3"])
+            return (
+                len(distribution_values) == 3
+                and dv1 == (1, 1 / 3)
+                and dv2 == (1, 1 / 3)
+                and dv3 == (1, 1 / 3)
+            )
+
+        check = Check(self.spark, CheckLevel.Warning, "test hasHistogramValues optional")
+        result = self.run_check(check.hasHistogramValues("b", assertion_func))
+        self.assertEqual(result, [Row(constraint_status="Success")])
+
     def test_kllSketchSatisfies(self):
         def assertion_func(x):
             bucket0 = x.apply(0)