From f5d925842f84cc81f35be15f3a54e213eab19ba5 Mon Sep 17 00:00:00 2001
From: Nikolaus Schuetz <nikolauspschuetz@gmail.com>
Date: Fri, 26 Jun 2026 11:10:08 -0700
Subject: [PATCH] fix: make binningUdf and maxBins optional in
 distinct/histogram checks

hasNumberOfDistinctValues and hasHistogramValues forced callers to pass
binningUdf and maxBins, although Deequ's Scala API marks both as optional
with defaults (binningUdf=None, maxBins=Histogram.MaximumAllowedDetailBins).
When the args are omitted, fall back to the Scala-side defaults via the
compiler-generated <method>$default$N accessors (e.g.
hasNumberOfDistinctValues$default$3), mirroring the existing satisfies()
pattern. Existing positional callers remain backward compatible.

Fixes #81
---
 pydeequ/checks.py    | 14 ++++++++++----
 tests/test_checks.py | 27 +++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/pydeequ/checks.py b/pydeequ/checks.py
index 87280c3..b9b6b3f 100644
--- a/pydeequ/checks.py
+++ b/pydeequ/checks.py
@@ -303,32 +303,38 @@ def hasUniqueValueRatio(self, columns, assertion, hint=None):
         self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint, self._jvm.scala.Option.apply(None))
         return self
 
-    def hasNumberOfDistinctValues(self, column, assertion, binningUdf, maxBins, hint=None):
+    def hasNumberOfDistinctValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
         """Creates a constraint that asserts on the number of distinct values a column has.
 
         :param str column: Column in Data Frame to run the assertion on.
         :param lambda assertion: A function that accepts an int or float parameter.
-        :param lambda binningUDF: An optional binning function.
+        :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
         :param int maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
+                Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasNumberOfDistinctValues self: A Check object that asserts distinctness in the column.
         """
         assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
+        binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$3")()
+        maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasNumberOfDistinctValues$default$4")()
         hint = self._jvm.scala.Option.apply(hint)
         self._Check = self._Check.hasNumberOfDistinctValues(column, assertion_func, binningUdf, maxBins, hint)
         return self
 
-    def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None):
+    def hasHistogramValues(self, column, assertion, binningUdf=None, maxBins=None, hint=None):
         """Creates a constraint that asserts on column's value distribution.
 
         :param str column: Column in Data Frame to run the assertion on.
         :param lambda assertion: A function that accepts an int or float parameter as a distribution input parameter.
-        :param str binningUDF: An optional binning function.
+        :param lambda binningUdf: An optional binning function. Defaults to Deequ's default (no binning).
         :param str maxBins: Histogram details is only provided for N column values with top counts. MaxBins sets the N.
+                Defaults to Deequ's default (Histogram.MaximumAllowedDetailBins).
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasHistogramValues self: A Check object that asserts the column's value distribution in the column.
         """
         assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
+        binningUdf = binningUdf if binningUdf is not None else getattr(self._Check, "hasHistogramValues$default$3")()
+        maxBins = maxBins if maxBins is not None else getattr(self._Check, "hasHistogramValues$default$4")()
         hint = self._jvm.scala.Option.apply(hint)
         self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint)
         return self
diff --git a/tests/test_checks.py b/tests/test_checks.py
index 878257b..692d4b4 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -1072,6 +1072,12 @@ def test_hasNumberOfDistinctValues(self):
             [Row(constraint_status="Success")],
         )
 
+    def test_hasNumberOfDistinctValues_without_binning_args(self):
+        # Issue #81: binningUdf and maxBins must be optional
+        check = Check(self.spark, CheckLevel.Warning, "test hasNumberOfDistinctValues optional")
+        result = self.run_check(check.hasNumberOfDistinctValues("b", lambda x: x == 3))
+        self.assertEqual(result, [Row(constraint_status="Success")])
+
     def test_isPrimaryKey(self):
         check = Check(self.spark, CheckLevel.Warning, "test isPrimaryKey")
         check = (
@@ -1107,6 +1113,27 @@ def _parse_dv(dv):
         result = self.run_check(check.hasHistogramValues("b", assertion_func, None, 3))
         self.assertEqual(result, [Row(constraint_status="Success")])
 
+    def test_hasHistogramValues_without_binning_args(self):
+        # Issue #81: binningUdf and maxBins must be optional
+        def assertion_func(x):
+            def _parse_dv(dv):
+                return dv.absolute(), dv.ratio()
+
+            distribution_values = scala_map_to_dict(self.spark._jvm, x.values())
+            dv1 = _parse_dv(distribution_values["1"])
+            dv2 = _parse_dv(distribution_values["2"])
+            dv3 = _parse_dv(distribution_values["3"])
+            return (
+                len(distribution_values) == 3
+                and dv1 == (1, 1 / 3)
+                and dv2 == (1, 1 / 3)
+                and dv3 == (1, 1 / 3)
+            )
+
+        check = Check(self.spark, CheckLevel.Warning, "test hasHistogramValues optional")
+        result = self.run_check(check.hasHistogramValues("b", assertion_func))
+        self.assertEqual(result, [Row(constraint_status="Success")])
+
     def test_kllSketchSatisfies(self):
         def assertion_func(x):
             bucket0 = x.apply(0)