Commits

Micha Kops  committed 5f80140 Draft

Initial import.

  • Participants

Comments (0)

Files changed (4)

+.settings
+.project
+.classpath
+target
+# Apache Tika Examples
+
+Some examples of content extraction from different file formats using Apache Tika.
+
+Please feel free to visit [my blog] for the full tutorial.
+
+---
+
+**2012 Micha Kops / hasCode.com**
+
+   [my blog]:http://www.hascode.com
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>com.hascode.tutorial</groupId>
+	<artifactId>tika-samples</artifactId>
+	<version>0.0.1</version>
+	
+	<properties>
+		<tika.version>1.2</tika.version>
+	</properties>
+	
+	<dependencies>
+		<dependency>
+			<groupId>org.apache.tika</groupId>
+			<artifactId>tika-core</artifactId>
+			<version>${tika.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.tika</groupId>
+			<artifactId>tika-parsers</artifactId>
+			<version>${tika.version}</version>
+		</dependency>
+	</dependencies>
+</project>

File src/main/java/com/hascode/tutorial/ConcretePDFExtractor.java

+package com.hascode.tutorial;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Demo that parses the classpath resource {@code /demo.pdf} with Tika's
+ * {@link PDFParser} and prints every extracted metadata entry to standard
+ * output, one {@code name:<TAB>value} pair per line.
+ */
+public class ConcretePDFExtractor {
+	/** Classpath location of the sample document that gets parsed. */
+	private static final String DEMO_RESOURCE = "/demo.pdf";
+
+	/** Write limit (in characters) handed to the body content handler. */
+	private static final int WRITE_LIMIT = 10000000;
+
+	/**
+	 * Parses the demo PDF and dumps its metadata to {@code System.out}.
+	 *
+	 * @param args
+	 *            command line arguments; not used
+	 * @throws IOException
+	 *             if the demo resource is missing from the classpath or the
+	 *             stream cannot be read or closed
+	 * @throws SAXException
+	 *             if the content handler fails while receiving parse events
+	 * @throws TikaException
+	 *             if the document cannot be parsed
+	 */
+	public static void main(final String[] args) throws IOException,
+			SAXException, TikaException {
+		InputStream content = ConcretePDFExtractor.class
+				.getResourceAsStream(DEMO_RESOURCE);
+		if (content == null) {
+			// getResourceAsStream reports a missing resource by returning
+			// null; fail with a clear message instead of handing null to
+			// the parser.
+			throw new IOException("Classpath resource not found: "
+					+ DEMO_RESOURCE);
+		}
+		try {
+			Parser parser = new PDFParser();
+			BodyContentHandler handler = new BodyContentHandler(WRITE_LIMIT);
+			Metadata metadata = new Metadata();
+			parser.parse(content, handler, metadata, new ParseContext());
+			for (String name : metadata.names()) {
+				System.out.println(name + ":\t" + metadata.get(name));
+			}
+		} finally {
+			// The parser consumes the stream but leaves closing it to the
+			// caller.
+			content.close();
+		}
+	}
+}