Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions pydeequ/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,32 +303,38 @@ def hasUniqueValueRatio(self, columns, assertion, hint=None):
self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint, self._jvm.scala.Option.apply(None))
return self

def hasNumberOfDistinctValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
    """Creates a constraint that asserts on the number of distinct values a column has.

    :param str column: Column in Data Frame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
    :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
            Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasNumberOfDistinctValues self: A Check object that asserts distinctness in the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)

    # When the caller omits an argument, ask the JVM Check object for its own
    # default through the "<method>$default$<position>" accessor (positions are
    # 1-based: binningUdf is the 3rd argument, maxBins the 4th) instead of
    # duplicating the default values on the Python side.
    # NOTE(review): a caller-supplied binningUdf is forwarded unchanged; confirm
    # it already has the type the JVM method expects.
    if binningUdf is None:
        binningUdf = getattr(self._Check, "hasNumberOfDistinctValues$default$3")()
    if maxBins is None:
        maxBins = getattr(self._Check, "hasNumberOfDistinctValues$default$4")()

    # The hint is always passed as a scala Option, including when it is None.
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasNumberOfDistinctValues(column, assertion_func, binningUdf, maxBins, hint)
    # Return self so constraints can be chained fluently.
    return self

def hasHistogramValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
    """Creates a constraint that asserts on column's value distribution.

    :param str column: Column in Data Frame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter as a distribution input parameter.
    :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
    :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
            Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasHistogramValues self: A Check object that asserts the column's value distribution in the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)

    # When the caller omits an argument, ask the JVM Check object for its own
    # default through the "<method>$default$<position>" accessor (positions are
    # 1-based: binningUdf is the 3rd argument, maxBins the 4th) instead of
    # duplicating the default values on the Python side.
    # NOTE(review): a caller-supplied binningUdf is forwarded unchanged; confirm
    # it already has the type the JVM method expects.
    if binningUdf is None:
        binningUdf = getattr(self._Check, "hasHistogramValues$default$3")()
    if maxBins is None:
        maxBins = getattr(self._Check, "hasHistogramValues$default$4")()

    # The hint is always passed as a scala Option, including when it is None.
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint)
    # Return self so constraints can be chained fluently.
    return self
Expand Down
27 changes: 27 additions & 0 deletions tests/test_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,12 @@ def test_hasNumberOfDistinctValues(self):
[Row(constraint_status="Success")],
)

def test_hasNumberOfDistinctValues_without_binning_args(self):
    """Issue #81: hasNumberOfDistinctValues must work when binningUdf and maxBins are omitted.

    Builds the constraint with only the column and the assertion, and expects
    the ``x == 3`` assertion on column "b" to report Success.
    """
    unbinned_check = Check(
        self.spark, CheckLevel.Warning, "test hasNumberOfDistinctValues optional"
    ).hasNumberOfDistinctValues("b", lambda x: x == 3)
    self.assertEqual(self.run_check(unbinned_check), [Row(constraint_status="Success")])

def test_isPrimaryKey(self):
check = Check(self.spark, CheckLevel.Warning, "test isPrimaryKey")
check = (
Expand Down Expand Up @@ -1107,6 +1113,27 @@ def _parse_dv(dv):
result = self.run_check(check.hasHistogramValues("b", assertion_func, None, 3))
self.assertEqual(result, [Row(constraint_status="Success")])

def test_hasHistogramValues_without_binning_args(self):
    """Issue #81: hasHistogramValues must work when binningUdf and maxBins are omitted.

    The assertion expects exactly three histogram entries, keyed "1", "2" and
    "3", each with an absolute count of 1 and a ratio of 1/3.
    """

    def assertion_func(x):
        def _as_pair(entry):
            # Reduce a distribution entry to an (absolute, ratio) tuple.
            return entry.absolute(), entry.ratio()

        histogram = scala_map_to_dict(self.spark._jvm, x.values())
        # All three keys are looked up before the size check, so a missing key
        # raises instead of quietly evaluating to False.
        pairs = [_as_pair(histogram[key]) for key in ("1", "2", "3")]
        expected_pair = (1, 1 / 3)
        return len(histogram) == 3 and all(pair == expected_pair for pair in pairs)

    unbinned_check = Check(self.spark, CheckLevel.Warning, "test hasHistogramValues optional")
    outcome = self.run_check(unbinned_check.hasHistogramValues("b", assertion_func))
    self.assertEqual(outcome, [Row(constraint_status="Success")])

def test_kllSketchSatisfies(self):
def assertion_func(x):
bucket0 = x.apply(0)
Expand Down
Loading