Stefan Saasen committed 0ee0077

Use Hpricot to extract elements from the HTML document easily

Comments (0)

Files changed (4)

         <jruby-complete.version>1.6.7</jruby-complete.version>
 
         <!-- Add the names of the Rubygems you need to the required.gems property: -->
-        <required.gems>sinatra jruby-rack</required.gems>
+        <required.gems>sinatra jruby-rack hpricot</required.gems>
         <gem.path>${basedir}/src/main/rubygems</gem.path>
         <gem.spec.index>${basedir}/target/classes/META-INF/gemspec.index</gem.spec.index>
         <gem.options>--no-rdoc --no-ri --install-dir=${gem.path}</gem.options>

src/main/java/com/atlassian/plugins/polyglot/jrubyexample/servlet/HttpResponseInformation.java

 public interface HttpResponseInformation {
     Map<String,String> getHttpHeaders();
     List<String> getBodyLines();
+    List<String> getHeadlines();
+    String getTitle();
 }

src/main/resources/templates/rurl.vm

         <div class="hero-unit">
             <h1>Make an HTTP Request</h1>
             <p>
-                Make an HTTP request and view the response headers.
+                Make an HTTP request and view the response headers and the document structure.
             </p>
         </div>
     #end
 $header.key: $header.value
 #end
 </pre>
+
+        <h3>Document Structure</h3>
+<pre>
+Title: $!responseInformation.title
+
+Headlines:
+#foreach($headline in $responseInformation.headlines)
+$headline
+#end
+</pre>
+
         <h3>Response Body</h3>
 <pre>
 #foreach($header in $responseInformation.bodyLines)

src/main/ruby/rurl.rb

 require 'rubygems'
 require 'open-uri'
+require 'hpricot'
 require 'java'
 
 class RurlResponse
   include Java::com.atlassian.plugins.polyglot.jrubyexample.servlet.HttpResponseInformation
-  attr_accessor :http_headers, :body_lines
-  def initialize(http_headers, body_lines)
-    @http_headers, @body_lines = http_headers, body_lines
+  attr_accessor :http_headers, :body_lines, :title, :headlines
+  def initialize(http_headers, body_lines, title, headlines)
+    @http_headers, @body_lines, @title, @headlines = http_headers, body_lines, title, headlines
   end
 end
 
 class Rurl
   def curl(url)
     open(url) do |f|
-      RurlResponse.new(f.meta, f.each_line.to_a[1..16])
+      # Raw body as array
+      body = f.each_line.to_a
+
+      # Extract a few elements
+      doc = Hpricot(body.join)
+      headlines = (doc/:h1).concat((doc/:h2)).map {|e| e.innerText }.compact.map {|text| text.strip }
+      title = (doc/'//head/title').text
+
+      RurlResponse.new(f.meta, body, title, headlines)
     end
   end
 end
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.