Commits

David Larlet committed f21d2a4

Deal with anchors and relative images' paths

  • Participants
  • Parent commits 6e1e795

Comments (0)

Files changed (1)

         The content doesn't contain `<body>` tags to be directly
         embeddable in the template and rendered as is.
         """
-        # Base URL construction to deal with relative links
-        parsed_url = urlparse.urlparse(url)
-        base_url = "{scheme}://{netloc}".format(
-            scheme=parsed_url.scheme,
-            netloc=parsed_url.netloc
-        )
-
         # Retrieves the resource and turns it into a Readability doc
         response = requests.get(url)
         response.encoding = guess_encoding(response)
         document = BrowserDocument(response.text)
-        document.html = self.rewrite_links(document._html(), base_url)
+        document.html = self.rewrite_links(document._html(), url)
 
         # The short title is more concise and readable
         title = document.short_title()
         content = document.summary(html_partial=True)
         return title, content
 
-    def rewrite_links(self, html, base_url):
+    def rewrite_links(self, html, current_url):
         """Returns transformed HTML with proxied and absolute links.
 
-        All links and images are turned to absolute for rendering,
-        links are proxied to be able to browse content from peer to peer.
+        All links and images are made absolute for rendering, except
+        same-page anchors; other links are proxied so that content can
+        be browsed from peer to peer.
         """
+        # Extract base and relative URLs to deal with distant images
+        parsed_url = urlparse.urlparse(current_url)
+        base_url = "{scheme}://{netloc}".format(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc
+        )
+        relative_url = "{base_url}{relative_path}".format(
+            base_url=base_url,
+            relative_path=parsed_url.path.endswith('/') and parsed_url.path \
+                            or "/".join(parsed_url.path.split("/")[:-1]) + "/"
+        )
+
         for element, attribute, link, position in html.iterlinks():
             link = link.strip()
-            if link.startswith("/"):  # Deal with relative links
-                link = base_url + link
-
+            # Deal with relative links
+            if not link.startswith(("http", "//")):
+                link = "{img_path}{original_link}".format(
+                    img_path=link.startswith("/") and base_url or relative_url,
+                    original_link=link
+                )
             if attribute == "src":  # Do not proxy images' URLs
                 element.attrib[attribute] = link
             else:
-                element.attrib[attribute] = self.prepend_proxy_url(link)
+                # Deal with relative anchors
+                if link.startswith(current_url + "#"):
+                    link = "#{anchor}".format(anchor=link.split("#")[1])
+                else:
+                    link = self.prepend_proxy_url(link)
+                element.attrib[attribute] = link
         return html