Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions src/crawlee/_utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,17 +437,11 @@ async def parse_sitemap(
up to the specified maximum depth.
"""
# Set default options
default_timeout = timedelta(seconds=30)
if options:
emit_nested_sitemaps = options['emit_nested_sitemaps']
max_depth = options['max_depth']
sitemap_retries = options['sitemap_retries']
timeout = options.get('timeout', default_timeout)
else:
emit_nested_sitemaps = False
max_depth = float('inf')
sitemap_retries = 3
timeout = default_timeout
options = options or {}
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
max_depth = options.get('max_depth', float('inf'))
sitemap_retries = options.get('sitemap_retries', 3)
timeout = options.get('timeout', timedelta(seconds=30))

# Setup working state
sources = list(initial_sources)
Expand Down
12 changes: 10 additions & 2 deletions tests/unit/_utils/test_sitemap.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import base64
import gzip
from datetime import datetime
from datetime import datetime, timedelta
from typing import Any
from unittest.mock import AsyncMock, MagicMock

from yarl import URL

from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
from crawlee.http_clients._base import HttpClient, HttpResponse

BASIC_SITEMAP = """
Expand Down Expand Up @@ -267,6 +267,14 @@ async def test_sitemap_from_string() -> None:
assert set(sitemap.urls) == BASIC_RESULTS


async def test_parse_sitemap_with_partial_options() -> None:
    """Parsing succeeds with a partial `ParseSitemapOptions`; omitted keys fall back to defaults."""
    # Only `timeout` is supplied on purpose - every other option key is left out.
    partial_options = ParseSitemapOptions(timeout=timedelta(seconds=10))

    # Gather the location of every entry yielded for the raw in-memory sitemap.
    found_locs = set()
    async for entry in parse_sitemap([{'type': 'raw', 'content': BASIC_SITEMAP}], options=partial_options):
        found_locs.add(entry.loc)

    assert found_locs == BASIC_RESULTS


async def test_discover_sitemap_from_robots_txt() -> None:
"""Sitemap URL found in robots.txt is yielded."""
robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'
Expand Down
Loading