Commits

Micha Kops committed e561ae3 Draft

Content extraction from different formats and content type detection examples added.

Comments (0)

Files changed (10)

src/main/java/com/hascode/tutorial/AutoDetectionExample.java

+package com.hascode.tutorial;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class AutoDetectionExample {
+	public static void main(final String[] args) throws IOException,
+			SAXException, TikaException {
+		Parser parser = new AutoDetectParser();
+
+		System.out.println("------------ Parsing a PDF:");
+		extractFromFile(parser, "/demo.pdf");
+
+		System.out.println("------------ Parsing an Office Document:");
+		extractFromFile(parser, "/demo.docx");
+
+		System.out.println("------------ Parsing a Spreadsheet:");
+		extractFromFile(parser, "/demo.xlsx");
+
+		System.out.println("------------ Parsing a Presentation:");
+		extractFromFile(parser, "/demo.odp");
+
+		System.out.println("------------ Parsing a PNG:");
+		extractFromFile(parser, "/demo.png");
+
+		System.out.println("------------ Parsing a Video/AVI:");
+		extractFromFile(parser, "/demo.avi");
+
+		System.out.println("------------ Parsing a MP3:");
+		extractFromFile(parser, "/demo.mp3");
+
+		System.out.println("------------ Parsing a Java Class:");
+		extractFromFile(parser,
+				"/com/hascode/tutorial/ConcretePDFExtractor.class");
+
+		System.out.println("------------ Parsing a HTML File:");
+		extractFromFile(parser, "/demo.html");
+	}
+
+	private static void extractFromFile(final Parser parser,
+			final String fileName) throws IOException, SAXException,
+			TikaException {
+		long start = System.currentTimeMillis();
+		BodyContentHandler handler = new BodyContentHandler(10000000);
+		Metadata metadata = new Metadata();
+		InputStream content = AutoDetectionExample.class
+				.getResourceAsStream(fileName);
+		parser.parse(content, handler, metadata, new ParseContext());
+		for (String name : metadata.names()) {
+			System.out.println(name + ":\t" + metadata.get(name));
+		}
+		System.out.println(String.format(
+				"------------ Processing took %s millis\n\n",
+				System.currentTimeMillis() - start));
+	}
+}

src/main/java/com/hascode/tutorial/ContentDetectionExample.java

+package com.hascode.tutorial;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Prints the media type that a Tika {@link DefaultDetector} reports for each
+ * of a fixed list of sample classpath resources.
+ */
+public class ContentDetectionExample {
+	/** Detector shared by all samples instead of being rebuilt per call. */
+	private static final Detector DETECTOR = new DefaultDetector();
+
+	/** Classpath resources to run detection on, in output order. */
+	private static final String[] SAMPLES = { "/demo.pdf", "/demo.docx",
+			"/demo.xlsx", "/demo.odp", "/demo.png", "/demo.avi", "/demo.mp3",
+			"/com/hascode/tutorial/ConcretePDFExtractor.class", "/demo.html" };
+
+	/**
+	 * Runs detection on every entry of {@link #SAMPLES}.
+	 *
+	 * @param args
+	 *            ignored
+	 * @throws IOException
+	 *             if a sample resource is missing or cannot be read
+	 */
+	public static void main(final String[] args) throws IOException {
+		for (String sample : SAMPLES) {
+			detectContentFromFile(sample);
+		}
+	}
+
+	/**
+	 * Detects and prints the media type of one classpath resource.
+	 *
+	 * @param fileName
+	 *            resource name, resolved via
+	 *            {@link Class#getResourceAsStream(String)} on this class
+	 * @throws IOException
+	 *             if the resource does not exist or cannot be read
+	 */
+	private static void detectContentFromFile(final String fileName)
+			throws IOException {
+		InputStream raw = ContentDetectionExample.class
+				.getResourceAsStream(fileName);
+		// getResourceAsStream signals a missing resource with null; report it
+		// instead of letting the detector run without any content.
+		if (raw == null) {
+			throw new IOException("Resource not found on classpath: "
+					+ fileName);
+		}
+		// Detector.detect expects a stream that supports mark/reset so it can
+		// peek at the leading bytes; classpath streams do not guarantee that,
+		// so wrap the stream in a BufferedInputStream.
+		InputStream content = new BufferedInputStream(raw);
+		try {
+			MediaType type = DETECTOR.detect(content, new Metadata());
+			System.out.println(String.format(
+					"detected media type for given file %s: %s", fileName,
+					type.toString()));
+		} finally {
+			// Closing the wrapper also closes the underlying resource stream.
+			content.close();
+		}
+	}
+
+}

src/main/resources/demo.avi

Binary file added.

src/main/resources/demo.docx

Binary file added.

src/main/resources/demo.html

+<html>
+<head>
+<!-- Sample page for the Tika examples: AutoDetectionExample and
+     ContentDetectionExample load it from the classpath as /demo.html.
+     The head carries plain HTML meta tags followed by the same information
+     as Dublin Core (DC.*) meta tags. All void elements are written in the
+     self-closing form so the markup stays well-formed. -->
+<title>hasCode.com Sample Page</title>
+<meta http-equiv="content-type" content="text/html;charset=UTF-8" />
+<meta name="author" content="Micha Kops" />
+<meta name="description"
+	content="A sample HTML file for my Tika Tutorial" />
+<meta name="keywords"
+	content="tika, java, programming, content extraction, tutorials" />
+<meta name="date" content="2012-11-30T06:30:00-01:00" />
+<meta name="DC.title" content="hasCode.com Sample Page" />
+<meta name="DC.creator" content="Micha Kops" />
+<meta name="DC.subject" content="Tika Tutorial" />
+<meta name="DC.description"
+	content="A sample HTML file for my Tika Tutorial" />
+<meta name="DC.publisher" content="hasCode.com" />
+<meta name="DC.contributor" content="Micha Kops" />
+<meta name="DC.date" content="2012-11-30T06:30:00-01:00"
+	scheme="DCTERMS.W3CDTF" />
+<meta name="DC.type" content="Text" scheme="DCTERMS.DCMIType" />
+<meta name="DC.format" content="text/html" scheme="DCTERMS.IMT" />
+<meta name="DC.language" content="en" scheme="DCTERMS.RFC3066" />
+<meta name="DC.relation" content="http://dublincore.org/"
+	scheme="DCTERMS.URI" />
+</head>
+<body>
+	<h1>hasCode.com</h1>
+	<div>Now with an improved layout.</div>
+</body>
+</html>

src/main/resources/demo.mp3

Binary file added.

src/main/resources/demo.odp

Binary file added.

src/main/resources/demo.pdf

Binary file added.

src/main/resources/demo.png

Added
New image

src/main/resources/demo.xlsx

Binary file added.