Micha Kops avatar Micha Kops committed 785e3d0 Draft

Content extraction examples added.

Comments (0)

Files changed (1)

src/main/java/com/hascode/tutorial/ContentExtraction.java

+package com.hascode.tutorial;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class ContentExtraction {
+	public static void main(final String[] args) throws IOException,
+			SAXException, TikaException {
+		Parser parser = new AutoDetectParser();
+
+		System.out
+				.println("------------ Extracting the content from an Office Document:");
+		extractContentFromFile(parser, "/demo.docx");
+
+		System.out
+				.println("------------ Extracting the content from a Spreadsheet:");
+		extractContentFromFile(parser, "/demo.xlsx");
+
+		System.out
+				.println("------------ Extracting the content from a Presentation:");
+		extractContentFromFile(parser, "/demo.odp");
+
+		System.out.println("------------ Extracting the content from a MP3:");
+		extractContentFromFile(parser, "/demo.mp3");
+
+		System.out
+				.println("------------ Extracting the content from a Java Class:");
+		extractContentFromFile(parser,
+				"/com/hascode/tutorial/ConcretePDFExtractor.class");
+
+		System.out
+				.println("------------ Extracting the content from a HTML File:");
+		extractContentFromFile(parser, "/demo.html");
+	}
+
+	private static void extractContentFromFile(final Parser parser,
+			final String fileName) throws IOException, SAXException,
+			TikaException {
+		BodyContentHandler handler = new BodyContentHandler(10000000);
+		Metadata metadata = new Metadata();
+		InputStream content = AutoDetectionExample.class
+				.getResourceAsStream(fileName);
+		parser.parse(content, handler, metadata, new ParseContext());
+		System.out.println(handler.toString());
+	}
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.