Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions src/crawlee/_utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,9 @@ async def _fetch_and_process_sitemap(

sitemap_url = source['url']

try:
while retries_left > 0:
retries_left -= 1
while retries_left > 0:
retries_left -= 1
try:
async with http_client.stream(
sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout
) as response:
Expand Down Expand Up @@ -372,12 +372,15 @@ async def _fetch_and_process_sitemap(
yield result
finally:
parser.close()
break
break

except Exception as e:
if retries_left > 0:
logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}')
await asyncio.sleep(1) # Brief pause before retry
except Exception as e:
if retries_left > 0:
logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}')
await asyncio.sleep(1) # Brief pause before retry
else:
logger.exception(f'Failed to fetch sitemap {sitemap_url}, no retries left.')
raise


class Sitemap:
Expand Down
55 changes: 54 additions & 1 deletion tests/unit/_utils/test_sitemap.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import base64
import gzip
from contextlib import asynccontextmanager
from datetime import datetime, timedelta
from typing import Any
from typing import TYPE_CHECKING, Any, cast
from unittest.mock import AsyncMock, MagicMock

import pytest
from yarl import URL

from crawlee._utils.sitemap import (
Expand All @@ -17,6 +19,9 @@
)
from crawlee.http_clients._base import HttpClient, HttpResponse

if TYPE_CHECKING:
from collections.abc import AsyncIterator

BASIC_SITEMAP = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
Expand Down Expand Up @@ -73,6 +78,30 @@ async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
return client


def _make_flaky_stream_client(body: bytes, *, fail_times: int) -> tuple[AsyncMock, list[int]]:
    """Build a mock client whose `stream` raises `ConnectionError` on its first `fail_times` calls.

    Each call to `stream` appends its 1-based attempt number to the returned list, so a test can
    count how many attempts were made. Calls past the failing ones yield a mocked response whose
    `read_stream` produces `body` as a single chunk.
    """
    attempts: list[int] = []

    async def read_stream() -> 'AsyncIterator[bytes]':
        # The whole payload is delivered in one chunk.
        yield body

    @asynccontextmanager
    async def stream(_url: str, **_kwargs: Any) -> 'AsyncIterator[HttpResponse]':
        attempt_no = len(attempts) + 1
        attempts.append(attempt_no)

        # Simulate a network failure until the configured number of attempts has been used up.
        if attempt_no <= fail_times:
            raise ConnectionError(f'Network error on attempt {attempt_no}')

        fake_response = MagicMock(spec=HttpResponse)
        fake_response.headers = {'content-type': 'application/xml; charset=utf-8'}
        fake_response.read_stream = read_stream
        yield cast('HttpResponse', fake_response)

    mock_client = AsyncMock(spec=HttpClient)
    mock_client.stream = stream
    return mock_client, attempts


def compress_gzip(data: str) -> bytes:
    """Encode `data` to bytes with the default codec and return its gzip-compressed form."""
    raw = data.encode()
    return gzip.compress(raw)
Expand Down Expand Up @@ -275,6 +304,30 @@ async def test_sitemap_from_string() -> None:
assert set(sitemap.urls) == BASIC_RESULTS


async def test_sitemap_fetch_retries_on_transient_error() -> None:
    """A fetch that fails twice with a network error is retried and succeeds on the third attempt."""
    flaky_client, calls = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=2)

    found_locs = set()
    async for entry in parse_sitemap(
        [{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}],
        flaky_client,
    ):
        found_locs.add(entry.loc)

    # Two failed attempts plus the one that finally succeeds.
    assert len(calls) == 3
    assert found_locs == BASIC_RESULTS


async def test_sitemap_fetch_raises_after_retries_exhausted() -> None:
    """A fetch error that never goes away propagates to the caller after the retries run out."""
    failing_client, calls = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=10)

    with pytest.raises(ConnectionError):
        # Drain the generator; the error is expected to surface while iterating.
        async for _ in parse_sitemap(
            [{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}],
            failing_client,
        ):
            pass

    # The client is expected to be hit exactly three times before giving up.
    assert len(calls) == 3


async def test_parse_sitemap_with_partial_options() -> None:
"""Test that missing keys in partial `ParseSitemapOptions` fall back to defaults."""
options = ParseSitemapOptions(timeout=timedelta(seconds=10))
Expand Down
Loading