diff --git a/.claude/sweep-security-state.json b/.claude/sweep-security-state.json index 3525b199..23c8bd9d 100644 --- a/.claude/sweep-security-state.json +++ b/.claude/sweep-security-state.json @@ -167,6 +167,13 @@ "severity_max": "MEDIUM", "categories_found": [6], "notes": "MEDIUM (fixed #1271): the five public functions sobel_x, sobel_y, laplacian, prewitt_x, prewitt_y did not call _validate_raster on agg. Non-DataArray inputs raised AttributeError from agg.data and wrong-ndim DataArrays failed inside numba/cupy with confusing errors instead of clean TypeError/ValueError. Numerical correctness was unaffected because convolve_2d._promote_float casts integer dtypes to float32 before the kernel runs. Fixed by adding _validate_raster(agg, func_name=..., name='agg') at the top of each function. No CRITICAL/HIGH findings: convolve_2d enforces 3x3 odd kernels and 2D agg.data, allocations match input shape, no CUDA kernels owned by this module, no file I/O." + }, + "emerging_hotspots": { + "last_inspected": "2026-04-25", + "issue": 1274, + "severity_max": "HIGH", + "categories_found": [1], + "notes": "HIGH (fixed #1274): emerging_hotspots() public API only validated ndim and shape[0] >= 2. The numpy and cupy backends each materialised three full (T, H, W) cubes (a float32 input copy, gi_zscore float32, gi_bin int8) plus H*W temporaries with no memory check; a (100, 20000, 20000) input projected to ~480 GB. Fixed by adding _available_memory_bytes()/_check_memory(n_times, ny, nx) (12 bytes per cube cell budget) and calling it from the public API for non-dask inputs. Dask paths skip the guard because their map_blocks/map_overlap chunk functions do not materialise the full cube. MEDIUM (unfixed, Cat 6): public API does not call _validate_raster() so non-numeric dtypes fail later with a confusing error rather than a clean TypeError. No GPU kernels in this module (uses convolve_2d). No file I/O. 
# ---------------------------------------------------------------------------
# Memory guard
# ---------------------------------------------------------------------------

# Peak working-memory footprint per cell of the (T, H, W) cube on the
# numpy/cupy backends:
#   data.astype(float32) copy : 4
#   gi_zscore (float32)       : 4
#   gi_bin (int8)             : 1
# Plus 2-D temporaries (H*W) for convolved scratch, category, trend_z,
# trend_p, which are negligible relative to the cube for realistic T.
# Round up to 12 bytes per cube cell to cover small temporaries.
_BYTES_PER_CUBE_CELL = 12

# Largest share of the reported available memory that the working buffers
# may claim before the guard refuses to run.
_MAX_MEMORY_FRACTION = 0.5

# Figure returned when neither /proc/meminfo nor psutil can report the
# available memory.
_FALLBACK_AVAILABLE_BYTES = 2 * 1024 ** 3


def _available_memory_bytes():
    """Return a best-effort estimate of available memory in bytes.

    Sources are tried in order:

    1. the ``MemAvailable`` entry of ``/proc/meminfo`` (reported in kB,
       hence the multiplication by 1024),
    2. ``psutil.virtual_memory().available`` when psutil is importable,
    3. a fixed 2 GiB fallback.

    Failures of the first two sources are swallowed on purpose: this is a
    heuristic probe and must never be the reason a computation fails.
    """
    try:
        with open('/proc/meminfo', 'r') as meminfo:
            for entry in meminfo:
                if entry.startswith('MemAvailable:'):
                    return int(entry.split()[1]) * 1024
    except (OSError, ValueError, IndexError):
        # File missing/unreadable or in an unexpected format: fall through.
        pass

    try:
        import psutil
        return psutil.virtual_memory().available
    except (ImportError, AttributeError):
        pass

    return _FALLBACK_AVAILABLE_BYTES


def _check_memory(n_times, ny, nx):
    """Raise MemoryError if the (T, H, W) working buffers exceed the budget.

    The numpy and cupy backends each materialise three full-cube arrays
    (a float32 input copy, gi_zscore float32, gi_bin int8) plus small
    H*W temporaries, budgeted at ``_BYTES_PER_CUBE_CELL`` bytes per cube
    cell.  The budget is ``_MAX_MEMORY_FRACTION`` of whatever
    ``_available_memory_bytes()`` reports.

    Parameters
    ----------
    n_times, ny, nx : int
        Extents of the input cube along the time, y and x axes.

    Raises
    ------
    MemoryError
        If the projected footprint is larger than the budget.  The
        message states the requirement, the budget and the available
        memory so the three numbers are never contradictory.
    """
    # NOTE(review): the probe reads host RAM; for cupy inputs the buffers
    # presumably live in device memory -- confirm whether a GPU-side
    # check is wanted as well.
    required = int(n_times) * int(ny) * int(nx) * _BYTES_PER_CUBE_CELL
    available = _available_memory_bytes()
    budget = _MAX_MEMORY_FRACTION * available
    if required <= budget:
        return

    raise MemoryError(
        f"emerging_hotspots on a ({n_times}, {ny}, {nx}) cube requires "
        f"~{required / 1e9:.1f} GB of working memory, which exceeds the "
        f"~{budget / 1e9:.1f} GB budget ({_MAX_MEMORY_FRACTION:.0%} of the "
        f"~{available / 1e9:.1f} GB available). "
        f"Use a smaller raster or pass a dask-backed DataArray for "
        f"out-of-core processing."
    )
class TestMemoryGuard:
    """Exercise the memory guard on the public emerging_hotspots() API (#1274).

    The numpy and cupy backends materialise three full ``(T, H, W)``
    cubes (a float32 input copy, gi_zscore float32, gi_bin int8) plus
    small H*W temporaries.  The public API estimates this footprint and
    raises ``MemoryError`` before the first allocation when it would
    exceed 50% of available RAM.  Dask-backed inputs are not guarded
    because their map_blocks/map_overlap path processes per-chunk.
    """

    @staticmethod
    def _seeded_cube(shape):
        """Return reproducible float32 standard-normal noise of ``shape``."""
        return np.random.default_rng(0).standard_normal(shape).astype('f4')

    def _stub_available(self, monkeypatch, n_bytes):
        """Make _available_memory_bytes report ``n_bytes`` for this test."""
        import sys

        # The module object is fetched from sys.modules rather than by
        # attribute access on the package.
        # NOTE(review): presumably because the package attribute of the
        # same name is the function, not the module -- confirm.
        target_module = sys.modules['xrspatial.emerging_hotspots']
        monkeypatch.setattr(
            target_module, '_available_memory_bytes', lambda: n_bytes
        )

    def test_numpy_raises_when_budget_exceeded(self, monkeypatch):
        """numpy backend should refuse a cube too large for memory."""
        # A (5, 100, 100) cube needs ~600 KB; with 1 KB "available" the
        # guard must trip.
        self._stub_available(monkeypatch, 1024)
        raster = _make_raster(self._seeded_cube((5, 100, 100)))
        with pytest.raises(MemoryError, match=r"emerging_hotspots"):
            emerging_hotspots(raster, _kernel_3x3())

    def test_numpy_passes_when_budget_ample(self, monkeypatch):
        """Small inputs should pass the guard with normal RAM headroom."""
        self._stub_available(monkeypatch, 64 * 1024 ** 3)
        raster = _make_raster(self._seeded_cube((5, 20, 20)))
        result = emerging_hotspots(raster, _kernel_3x3())
        assert result['category'].shape == (20, 20)

    def test_error_message_mentions_shape_and_gb(self, monkeypatch):
        """Error message should surface the cube shape and projected size."""
        self._stub_available(monkeypatch, 1024)
        raster = _make_raster(self._seeded_cube((4, 50, 50)))
        with pytest.raises(MemoryError, match=r"\(4, 50, 50\)"):
            emerging_hotspots(raster, _kernel_3x3())

    def test_cupy_raises_when_budget_exceeded(self, monkeypatch):
        """CuPy backend honours the same in-RAM guard."""
        cp = pytest.importorskip("cupy")
        self._stub_available(monkeypatch, 1024)
        device_cube = cp.asarray(self._seeded_cube((5, 100, 100)))
        raster = _make_raster(device_cube)
        with pytest.raises(MemoryError, match=r"emerging_hotspots"):
            emerging_hotspots(raster, _kernel_3x3())

    def test_dask_skips_in_ram_guard(self, monkeypatch):
        """Dask-backed inputs should not be blocked by the in-RAM guard.

        The numpy/cupy guard targets backends that materialise the full
        cube; dask paths process per-chunk via map_blocks/map_overlap.
        """
        da_mod = pytest.importorskip("dask.array")
        # Even with 1 byte "available", the dask path should pass the
        # public-API check because the guard is skipped for dask inputs.
        self._stub_available(monkeypatch, 1)
        lazy_cube = da_mod.from_array(
            self._seeded_cube((5, 20, 20)),
            chunks=(5, 10, 10),
        )
        raster = _make_raster(lazy_cube)
        result = emerging_hotspots(raster, _kernel_3x3())
        # Materialise the lazy result to confirm the full pipeline runs.
        result = result.compute()
        assert result['category'].shape == (20, 20)