diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 2632ed98e5..e6930fc195 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -437,17 +437,11 @@ async def parse_sitemap( up to the specified maximum depth. """ # Set default options - default_timeout = timedelta(seconds=30) - if options: - emit_nested_sitemaps = options['emit_nested_sitemaps'] - max_depth = options['max_depth'] - sitemap_retries = options['sitemap_retries'] - timeout = options.get('timeout', default_timeout) - else: - emit_nested_sitemaps = False - max_depth = float('inf') - sitemap_retries = 3 - timeout = default_timeout + options = options or {} + emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) + max_depth = options.get('max_depth', float('inf')) + sitemap_retries = options.get('sitemap_retries', 3) + timeout = options.get('timeout', timedelta(seconds=30)) # Setup working state sources = list(initial_sources) diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 5f2005ca16..9e6fd795b3 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -1,12 +1,12 @@ import base64 import gzip -from datetime import datetime +from datetime import datetime, timedelta from typing import Any from unittest.mock import AsyncMock, MagicMock from yarl import URL -from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap +from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap from crawlee.http_clients._base import HttpClient, HttpResponse BASIC_SITEMAP = """ @@ -267,6 +267,14 @@ async def test_sitemap_from_string() -> None: assert set(sitemap.urls) == BASIC_RESULTS +async def test_parse_sitemap_with_partial_options() -> None: + """Test that missing keys in partial `ParseSitemapOptions` fall back to defaults.""" + options = ParseSitemapOptions(timeout=timedelta(seconds=10)) + items = [item async for item in parse_sitemap([{'type': 'raw', 'content': BASIC_SITEMAP}], options=options)] + + assert {item.loc for item in items} == BASIC_RESULTS + + async def test_discover_sitemap_from_robots_txt() -> None: """Sitemap URL found in robots.txt is yielded.""" robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'