Commits

Stefan Saasen committed 0ee0077

Use Hpricot to extract elements from the HTML document easily

  • Participants
  • Parent commits a992a22

Comments (0)

Files changed (4)

         <jruby-complete.version>1.6.7</jruby-complete.version>
 
         <!-- Add the names of the Rubygems you need to the required.gems property: -->
-        <required.gems>sinatra jruby-rack</required.gems>
+        <required.gems>sinatra jruby-rack hpricot</required.gems>
         <gem.path>${basedir}/src/main/rubygems</gem.path>
         <gem.spec.index>${basedir}/target/classes/META-INF/gemspec.index</gem.spec.index>
         <gem.options>--no-rdoc --no-ri --install-dir=${gem.path}</gem.options>

File src/main/java/com/atlassian/plugins/polyglot/jrubyexample/servlet/HttpResponseInformation.java

 public interface HttpResponseInformation {
     Map<String,String> getHttpHeaders();
     List<String> getBodyLines();
+    List<String> getHeadlines();
+    String getTitle();
 }

File src/main/resources/templates/rurl.vm

         <div class="hero-unit">
             <h1>Make an HTTP Request</h1>
             <p>
-                Make an HTTP request and view the response headers.
+                Make an HTTP request and view the response headers and the document structure.
             </p>
         </div>
     #end
 $header.key: $header.value
 #end
 </pre>
+
+        <h3>Document Structure</h3>
+<pre>
+Title: $!responseInformation.title
+
+Headlines:
+#foreach($headline in $responseInformation.headlines)
+$headline
+#end
+</pre>
+
         <h3>Response Body</h3>
 <pre>
 #foreach($header in $responseInformation.bodyLines)

File src/main/ruby/rurl.rb

 require 'rubygems'
 require 'open-uri'
+require 'hpricot'
 require 'java'
 
 class RurlResponse
   include Java::com.atlassian.plugins.polyglot.jrubyexample.servlet.HttpResponseInformation
-  attr_accessor :http_headers, :body_lines
-  def initialize(http_headers, body_lines)
-    @http_headers, @body_lines = http_headers, body_lines
+  attr_accessor :http_headers, :body_lines, :title, :headlines
+  def initialize(http_headers, body_lines, title, headlines)
+    @http_headers, @body_lines, @title, @headlines = http_headers, body_lines, title, headlines
   end
 end
 
 class Rurl
   def curl(url)
     open(url) do |f|
-      RurlResponse.new(f.meta, f.each_line.to_a[1..16])
+      # Raw body as array
+      body = f.each_line.to_a
+
+      # Extract a few elements
+      doc = Hpricot(body.join)
+      headlines = (doc/:h1).concat((doc/:h2)).map {|e| e.innerText }.compact.map {|text| text.strip }
+      title = (doc/'//head/title').text
+
+      RurlResponse.new(f.meta, body, title, headlines)
     end
   end
 end