From 2016d669a93481f30bc107af937ef42729cc0b46 Mon Sep 17 00:00:00 2001 From: Yurii Chukhlib Date: Sat, 17 Jan 2026 11:17:28 +0100 Subject: [PATCH] fix: Respect <base> tag for relative link resolution in html2text Fixes #1680 The HTML2Text class was ignoring the <base> tag, causing relative links to be resolved against the page URL instead of the base URL specified in the href attribute. Added <base> tag handling in both HTML2Text and CustomHTML2Text to update self.baseurl when the <base> tag is encountered, ensuring proper link resolution according to HTML standards. Co-Authored-By: Claude --- crawl4ai/html2text/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py index ca15b4534..7f6e788b3 100644 --- a/crawl4ai/html2text/__init__.py +++ b/crawl4ai/html2text/__init__.py @@ -316,6 +316,12 @@ def handle_tag( if self.tag_callback(self, tag, attrs, start) is True: return + # Handle <base> tag to update base URL for relative links + if tag == "base" and start: + href = attrs.get("href") + if href: + self.baseurl = href + # first thing inside the anchor tag is another tag # that produces some output if ( @@ -1069,6 +1075,15 @@ def update_params(self, **kwargs): setattr(self, key, value) def handle_tag(self, tag, attrs, start): + # Handle <base> tag to update base URL for relative links + # Must be handled before preserved tags since <base> is in <head> + if tag == "base" and start: + href = attrs.get("href") if attrs else None + if href: + self.baseurl = href + # Also let parent class handle it + return super().handle_tag(tag, attrs, start) + # Handle preserved tags if tag in self.preserve_tags: if start: