1. Stefan Saasen
  2. atlassian-jruby-example-plugin


Stefan Saasen  committed 0ee0077

Use Hpricot to extract elements from the HTML document easily

  • Participants
  • Parent commits a992a22
  • Branches master

Comments (0)

Files changed (4)

File pom.xml

View file
  • Ignore whitespace
         <!-- Add the names of the Rubygems you need to the required.gems property: -->
-        <required.gems>sinatra jruby-rack</required.gems>
+        <required.gems>sinatra jruby-rack hpricot</required.gems>
         <gem.options>--no-rdoc --no-ri --install-dir=${gem.path}</gem.options>

File src/main/java/com/atlassian/plugins/polyglot/jrubyexample/servlet/HttpResponseInformation.java

View file
  • Ignore whitespace
 public interface HttpResponseInformation {
     Map<String,String> getHttpHeaders();
     List<String> getBodyLines();
+    List<String> getHeadlines();
+    String getTitle();

File src/main/resources/templates/rurl.vm

View file
  • Ignore whitespace
         <div class="hero-unit">
             <h1>Make an HTTP Request</h1>
-                Make an HTTP request and view the response headers.
+                Make an HTTP request and view the response headers and the document structure.
 $header.key: $header.value
+        <h3>Document Structure</h3>
+Title: $!responseInformation.title
+#foreach($headline in $responseInformation.headlines)
         <h3>Response Body</h3>
 #foreach($header in $responseInformation.bodyLines)

File src/main/ruby/rurl.rb

View file
  • Ignore whitespace
 require 'rubygems'
 require 'open-uri'
+require 'hpricot'
 require 'java'
 class RurlResponse
   include Java::com.atlassian.plugins.polyglot.jrubyexample.servlet.HttpResponseInformation
-  attr_accessor :http_headers, :body_lines
-  def initialize(http_headers, body_lines)
-    @http_headers, @body_lines = http_headers, body_lines
+  attr_accessor :http_headers, :body_lines, :title, :headlines
+  def initialize(http_headers, body_lines, title, headlines)
+    @http_headers, @body_lines, @title, @headlines = http_headers, body_lines, title, headlines
 class Rurl
   def curl(url)
     open(url) do |f|
-      RurlResponse.new(f.meta, f.each_line.to_a[1..16])
+      # Raw body as array
+      body = f.each_line.to_a
+      # Extract a few elements
+      doc = Hpricot(body.join)
+      headlines = (doc/:h1).concat((doc/:h2)).map {|e| e.innerText }.compact.map {|text| text.strip }
+      title = (doc/'//head/title').text
+      RurlResponse.new(f.meta, body, title, headlines)