Micha Kops avatar Micha Kops committed 640cbd3

jsoup screen-scraping tutorial sources added

Comments (0)

Files changed (3)

jsoup-tutorial/pom.xml

+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.hascode.samples</groupId>
+  <artifactId>jsoup-example</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <!-- jsoup is the HTML parsing library imported by the tutorial classes (org.jsoup.*). -->
+  <dependencies>
+  	<dependency>
+  		<groupId>org.jsoup</groupId>
+  		<artifactId>jsoup</artifactId>
+  		<version>1.6.1</version>
+  	</dependency>
+  </dependencies>
+</project>

jsoup-tutorial/src/main/java/com/hascode/samples/jsoup/FragmentParser.java

+package com.hascode.samples.jsoup;
+
+import java.io.IOException;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+/**
+ * Demonstrates parsing an HTML body fragment with jsoup and reading the
+ * attributes and text of elements selected from it.
+ */
+public class FragmentParser {
+	/**
+	 * Parses a hard-coded breadcrumb fragment and prints the class of the
+	 * wrapping div together with the text and target of its first two links.
+	 *
+	 * @param args
+	 *            command line arguments (unused)
+	 * @throws IOException
+	 *             declared in the signature; the fragment itself is parsed
+	 *             from an in-memory string
+	 */
+	public static void main(final String[] args) throws IOException {
+		// The fragment is assembled as a single expression so the markup
+		// can be read top to bottom.
+		final String htmlFragment = "<div class=\"breadcrumb\">"
+				+ "<ul><li><a href=\"/\">Home</a></li>"
+				+ "<li><a href=\"#cat1\">Category 1</a></li>"
+				+ "</ul></div>";
+
+		final Document doc = Jsoup.parseBodyFragment(htmlFragment);
+		final Element div = doc.body().select("div").first();
+		final Element a1 = div.select("ul a").first();
+		final Element a2 = div.select("ul a").get(1);
+
+		System.out.println(String.format("The div has the class '%s'",
+				div.attr("class")));
+		System.out.println(String.format(
+				"The first link in the breadcrumb has the text '%s' and links to '%s'.",
+				a1.text(), a1.attr("href")));
+		System.out.println(String.format(
+				"The second link in the breadcrumb has the text '%s' and links to '%s'",
+				a2.text(), a2.attr("href")));
+	}
+}

jsoup-tutorial/src/main/java/com/hascode/samples/jsoup/WebScraper.java

+package com.hascode.samples.jsoup;
+
+import java.io.IOException;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+/**
+ * Fetches the www.hascode.com start page with jsoup and prints its title
+ * along with details of the latest article found on it.
+ */
+public class WebScraper {
+	/**
+	 * Downloads the page, then prints the page title, the latest article's
+	 * heading and URL, and the editorial line selected from the page.
+	 *
+	 * @param args
+	 *            command line arguments (unused)
+	 * @throws IOException
+	 *             if fetching the page fails
+	 */
+	public static void main(final String[] args) throws IOException {
+		Document doc = Jsoup.connect("http://www.hascode.com/")
+				.userAgent("Mozilla").timeout(6000).get();
+
+		String title = doc.title(); // parsing the page's title
+		System.out.println("The title of www.hascode.com is: " + title);
+
+		// The selector may match more than one heading link. Elements.text()
+		// joins the text of all matches while Elements.attr() reads only the
+		// first one that carries the attribute, so heading and URL are both
+		// taken from the first match to keep them describing the same link.
+		Elements headings = doc.select("h2 > a");
+		String latestHeading = "";
+		String latestUrl = "";
+		if (!headings.isEmpty()) {
+			latestHeading = headings.first().text();
+			latestUrl = headings.first().attr("href");
+		}
+		System.out.println("The latest article is: " + latestHeading);
+		System.out.println("The article's URL is: " + latestUrl);
+
+		// NOTE(review): text() combines every matched <small> element; confirm
+		// against the live page whether only the first one is wanted here.
+		Elements editorial = doc.select("div.BlockContent-body small");
+		System.out.println("The article was created: " + editorial.text());
+	}
+
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.