diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py index ca15b4534..7f6e788b3 100644 --- a/crawl4ai/html2text/__init__.py +++ b/crawl4ai/html2text/__init__.py @@ -316,6 +316,12 @@ def handle_tag( if self.tag_callback(self, tag, attrs, start) is True: return + # Handle <base> tag to update base URL for relative links + if tag == "base" and start: + href = attrs.get("href") + if href: + self.baseurl = href + # first thing inside the anchor tag is another tag # that produces some output if ( @@ -1069,6 +1075,15 @@ def update_params(self, **kwargs): setattr(self, key, value) def handle_tag(self, tag, attrs, start): + # Handle <base> tag to update base URL for relative links + # Must be handled before preserved tags since <base> is in <head> + if tag == "base" and start: + href = attrs.get("href") if attrs else None + if href: + self.baseurl = href + # Also let parent class handle it + return super().handle_tag(tag, attrs, start) + # Handle preserved tags if tag in self.preserve_tags: if start: