From ed58976164786cb2d7e83b4c875125c80c4c8f8a Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Sat, 25 Apr 2026 13:42:35 -0700
Subject: [PATCH] Add memory guard to emerging_hotspots public API (#1274)

The numpy and cupy backends each materialise three full (T, H, W) cubes
(float32 input copy, gi_zscore float32, gi_bin int8) plus small H*W
temporaries with no memory check. A (100, 20000, 20000) input projected
to about 480 GB.

Add _available_memory_bytes() and _check_memory() helpers and call the
guard from the public API for non-dask inputs. Dask paths process
per-chunk via map_blocks/map_overlap and do not need the cube guard.

Same pattern as #1231, #1240, #1263, #1264.
---
 .claude/sweep-security-state.json         |  7 ++
 xrspatial/emerging_hotspots.py            | 58 ++++++++++++++++
 xrspatial/tests/test_emerging_hotspots.py | 84 +++++++++++++++++++++++
 3 files changed, 149 insertions(+)

diff --git a/.claude/sweep-security-state.json b/.claude/sweep-security-state.json
index 38a23f13..d0fe4a22 100644
--- a/.claude/sweep-security-state.json
+++ b/.claude/sweep-security-state.json
@@ -153,6 +153,13 @@
       "severity_max": null,
       "categories_found": [],
       "notes": "Clean. Small (271 LOC) module computing 3x3 second-derivative stencil. Cat 1: only single output buffer matching input shape (np.empty at line 37, cupy.empty at line 101) -- bounded by caller, per audit guidance not a finding. Cat 2: _cpu numba kernel uses range(1, rows-1)/range(1, cols-1) with simple (y, x) indices; no flat indexing or queue arrays; numba range loops produce int64. Cat 3: division by cellsize*cellsize on line 44 -- cellsize comes from get_dataarray_resolution() (raster property, not user-direct); cellsize=0 is unrealistic and would produce inf consistently across backends. NaN inputs propagate correctly through float arithmetic. Cat 4: _run_gpu (line 79-86) has full bounds guard via 'i + di <= out.shape[0] - 1 and j + dj <= out.shape[1] - 1' which guarantees i < shape[0] and j < shape[1] before the out[i, j] write; no shared memory; out is pre-filled with NaN at line 102 so threads outside the guard correctly leave NaN. Cat 5: no file I/O. Cat 6: curvature() calls _validate_raster at line 253; all four backend paths explicitly cast to float32 (lines 51, 62, 97, 112) so dtype is normalized before any computation; tests cover int32/int64/uint32/uint64/float32/float64 across numpy/cupy/dask+numpy/dask+cupy."
+    },
+    "emerging_hotspots": {
+      "last_inspected": "2026-04-25",
+      "issue": 1274,
+      "severity_max": "HIGH",
+      "categories_found": [1],
+      "notes": "HIGH (fixed #1274): emerging_hotspots() public API only validated ndim and shape[0] >= 2. The numpy and cupy backends each materialised three full (T, H, W) cubes (a float32 input copy, gi_zscore float32, gi_bin int8) plus H*W temporaries with no memory check; a (100, 20000, 20000) input projected to ~480 GB. Fixed by adding _available_memory_bytes()/_check_memory(n_times, ny, nx) (12 bytes per cube cell budget) and calling it from the public API for non-dask inputs. Dask paths skip the guard because their map_blocks/map_overlap chunk functions do not materialise the full cube. MEDIUM (unfixed, Cat 6): public API does not call _validate_raster() so non-numeric dtypes fail later with a confusing error rather than a clean TypeError. No GPU kernels in this module (uses convolve_2d). No file I/O. Cat 3 statistical paths are robust: _mann_kendall_statistic_numpy guards var_s <= 0 before sqrt, both numpy and cupy backends raise ZeroDivisionError on global_std == 0, and _mk_pvalue handles z==0 explicitly."
     }
   }
 }
diff --git a/xrspatial/emerging_hotspots.py b/xrspatial/emerging_hotspots.py
index e51abde8..02c83552 100644
--- a/xrspatial/emerging_hotspots.py
+++ b/xrspatial/emerging_hotspots.py
@@ -57,6 +57,57 @@
 _MK_ALPHA = 0.05  # significance level for Mann-Kendall trend test
 
 
+# ---------------------------------------------------------------------------
+# Memory guard
+# ---------------------------------------------------------------------------
+
+# Peak working-memory footprint per cell of the (T, H, W) cube on the
+# numpy/cupy backends:
+#   data.astype(float32) copy : 4
+#   gi_zscore (float32)       : 4
+#   gi_bin (int8)              : 1
+# Plus 2-D temporaries (H*W) for convolved scratch, category, trend_z,
+# trend_p, which are negligible relative to the cube for realistic T.
+# Round up to 12 bytes per cube cell to cover small temporaries.
+_BYTES_PER_CUBE_CELL = 12
+
+
+def _available_memory_bytes():
+    """Best-effort estimate of available memory in bytes."""
+    try:
+        with open('/proc/meminfo', 'r') as f:
+            for line in f:
+                if line.startswith('MemAvailable:'):
+                    return int(line.split()[1]) * 1024
+    except (OSError, ValueError, IndexError):
+        pass
+    try:
+        import psutil
+        return psutil.virtual_memory().available
+    except (ImportError, AttributeError):
+        pass
+    return 2 * 1024 ** 3
+
+
+def _check_memory(n_times, ny, nx):
+    """Raise MemoryError if the (T, H, W) working buffers would exceed 50% RAM.
+
+    The numpy and cupy backends each materialise three full-cube arrays
+    (a float32 input copy, gi_zscore float32, gi_bin int8) plus small
+    H*W temporaries.  Budget ~12 bytes per cube cell.
+    """
+    required = int(n_times) * int(ny) * int(nx) * _BYTES_PER_CUBE_CELL
+    available = _available_memory_bytes()
+    if required > 0.5 * available:
+        raise MemoryError(
+            f"emerging_hotspots on a ({n_times}, {ny}, {nx}) cube requires "
+            f"~{required / 1e9:.1f} GB of working memory but only "
+            f"~{available / 1e9:.1f} GB is available.  "
+            f"Use a smaller raster or pass a dask-backed DataArray for "
+            f"out-of-core processing."
+        )
+
+
 # ---------------------------------------------------------------------------
 # Mann-Kendall helpers (Numba-JIT for use inside pixel loops)
 # ---------------------------------------------------------------------------
@@ -695,6 +746,13 @@ def emerging_hotspots(raster, kernel, boundary='nan'):
 
     _validate_boundary(boundary)
 
+    # Memory guard for numpy/cupy backends only.  The dask backends
+    # process per-chunk via map_blocks/map_overlap and do not need a
+    # whole-cube guard.
+    if da is None or not isinstance(raster.data, da.Array):
+        n_times, ny, nx = raster.shape
+        _check_memory(n_times, ny, nx)
+
     mapper = ArrayTypeFunctionMapping(
         numpy_func=partial(_emerging_hotspots_numpy, boundary=boundary),
         cupy_func=partial(_emerging_hotspots_cupy, boundary=boundary),
diff --git a/xrspatial/tests/test_emerging_hotspots.py b/xrspatial/tests/test_emerging_hotspots.py
index 14439d15..e6955a22 100644
--- a/xrspatial/tests/test_emerging_hotspots.py
+++ b/xrspatial/tests/test_emerging_hotspots.py
@@ -457,3 +457,87 @@ def test_dask_cupy_matches_numpy(self):
                 dc_vals, ds_np[var].values, atol=1e-5,
                 err_msg=f"mismatch in {var}",
             )
+
+
+# ---------------------------------------------------------------------------
+# Memory guard (issue #1274)
+# ---------------------------------------------------------------------------
+
+class TestMemoryGuard:
+    """Memory guard on the public emerging_hotspots() API (#1274).
+
+    The numpy and cupy backends materialise three full ``(T, H, W)``
+    cubes (a float32 input copy, gi_zscore float32, gi_bin int8) plus
+    small H*W temporaries.  The public API estimates this footprint and
+    raises ``MemoryError`` before the first allocation when it would
+    exceed 50% of available RAM.  Dask-backed inputs are not guarded
+    because their map_blocks/map_overlap path processes per-chunk.
+    """
+
+    def _stub_available(self, monkeypatch, n_bytes):
+        """Pin _available_memory_bytes to a fixed return value."""
+        import sys
+        mod = sys.modules['xrspatial.emerging_hotspots']
+        monkeypatch.setattr(mod, '_available_memory_bytes', lambda: n_bytes)
+
+    def test_numpy_raises_when_budget_exceeded(self, monkeypatch):
+        """numpy backend should refuse a cube too large for memory."""
+        # A (5, 100, 100) cube needs ~600 KB; with 1 KB "available" the
+        # guard must trip.
+        self._stub_available(monkeypatch, 1024)
+        rng = np.random.default_rng(0)
+        data = rng.standard_normal((5, 100, 100)).astype('f4')
+        raster = _make_raster(data)
+        with pytest.raises(MemoryError, match=r"emerging_hotspots"):
+            emerging_hotspots(raster, _kernel_3x3())
+
+    def test_numpy_passes_when_budget_ample(self, monkeypatch):
+        """Small inputs should pass the guard with normal RAM headroom."""
+        self._stub_available(monkeypatch, 64 * 1024 ** 3)
+        rng = np.random.default_rng(0)
+        data = rng.standard_normal((5, 20, 20)).astype('f4')
+        raster = _make_raster(data)
+        ds = emerging_hotspots(raster, _kernel_3x3())
+        assert ds['category'].shape == (20, 20)
+
+    def test_error_message_mentions_shape_and_gb(self, monkeypatch):
+        """Error message should surface the cube shape and projected size."""
+        self._stub_available(monkeypatch, 1024)
+        rng = np.random.default_rng(0)
+        data = rng.standard_normal((4, 50, 50)).astype('f4')
+        raster = _make_raster(data)
+        with pytest.raises(MemoryError, match=r"\(4, 50, 50\)"):
+            emerging_hotspots(raster, _kernel_3x3())
+
+    def test_cupy_raises_when_budget_exceeded(self, monkeypatch):
+        """CuPy backend honours the same in-RAM guard."""
+        cp = pytest.importorskip("cupy")
+        self._stub_available(monkeypatch, 1024)
+        rng = np.random.default_rng(0)
+        data = cp.asarray(
+            rng.standard_normal((5, 100, 100)).astype('f4')
+        )
+        raster = _make_raster(data)
+        with pytest.raises(MemoryError, match=r"emerging_hotspots"):
+            emerging_hotspots(raster, _kernel_3x3())
+
+    def test_dask_skips_in_ram_guard(self, monkeypatch):
+        """Dask-backed inputs should not be blocked by the in-RAM guard.
+
+        The numpy/cupy guard targets backends that materialise the full
+        cube; dask paths process per-chunk via map_blocks/map_overlap.
+        """
+        da_mod = pytest.importorskip("dask.array")
+        # Even with 1 byte "available", the dask path should pass the
+        # public-API check because the guard is skipped for dask inputs.
+        self._stub_available(monkeypatch, 1)
+        rng = np.random.default_rng(0)
+        data = da_mod.from_array(
+            rng.standard_normal((5, 20, 20)).astype('f4'),
+            chunks=(5, 10, 10),
+        )
+        raster = _make_raster(data)
+        ds = emerging_hotspots(raster, _kernel_3x3())
+        # Materialise the lazy result to confirm the full pipeline runs.
+        ds = ds.compute()
+        assert ds['category'].shape == (20, 20)