Steffen Dienst committed a0cfd1a

initial commit


Files changed (6)

.gitignore

+pom.xml
+*jar
+/lib/
+/classes/
+.lein-deps-sum
+.classpath
+.settings/
+.project
+/bin/
+*war
+target/
+checkouts/

README.md

+# pdf-index
+
+A simple app that indexes the full text of all PDFs within one or more directories and allows searching the index for terms.
+It uses the great PDF extraction library [PDFTextStream from SnowTide](http://snowtide.com).
+
+## Usage
+* Build with: `lein uberjar`
+* Create an index with: `java -jar pdf-index.jar --index <index directory> <pdf directories>+`
+* Search for any term with: `java -jar pdf-index.jar --search <index directory> <search term>+`
+
+**Beware**: You **MUST NOT** redistribute the resulting jar file (a restriction of the PDF library, see [License](http://snowtide.com/buy)).
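
For example, with hypothetical directory names (and assuming the uberjar built by `lein uberjar` has been renamed to `pdf-index.jar`):

```
java -jar pdf-index.jar --index ./my-index ~/papers ~/books
java -jar pdf-index.jar --search ./my-index clojure lucene
```
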
+## License
+
+Copyright © 2012 Steffen Dienst
+
+Distributed under the Eclipse Public License, the same as Clojure.

project.clj

+(defproject pdf-index "0.1.0-SNAPSHOT"
+  :description "FIXME: write description"
+  :url "http://example.com/FIXME"
+  :license {:name "Eclipse Public License"
+            :url "http://www.eclipse.org/legal/epl-v10.html"}
+  :dependencies [[com.snowtide/pdftextstream "2.6.0"]
+                 [org.apache.lucene/lucene-core "2.2.0"]]
+  :repositories [["snowtide-releases" {:url "http://maven.snowtide.com/releases"
+                                      :snapshots false}]]
+  :java-source-paths ["src/main/java"]
+  :omit-source true
+  :main pdfindex.Main
+  )
+

src/main/java/pdfindex/CreateIndex.java

+package pdfindex;
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.LockObtainFailedException;
+
+import com.snowtide.pdf.PDFTextStream;
+import com.snowtide.pdf.lucene.DocumentFactoryConfig;
+import com.snowtide.pdf.lucene.PDFDocumentFactory;
+ 
+public class CreateIndex {
+    /**
+     * Simple method that adds the contents of the provided PDF document to the
+     * Lucene index via an already-open Lucene IndexWriter.
+     */
+    public static void addPDFToIndex (IndexWriter openIndex, File pdfFile)
+            throws IOException {
+        // create and configure new DocumentFactoryConfig instance
+        DocumentFactoryConfig config = new DocumentFactoryConfig();
+        // set the name to be used for the main body of text extracted from the
+        // PDF file, and set it to not be stored, but to be tokenized and indexed
+        config.setMainTextFieldName("body_text");
+        config.setTextSettings(false, true, true);
+        // only copy the PDF metadata attributes into Lucene Document instances
+        // produced by PDFDocumentFactory that we explicitly map
+        // via DocumentFactoryConfig.setFieldName()
+        config.setCopyAllPDFAttrs(false);
+        // cause PDF metadata attribute values to be stored, tokenized, and indexed
+        config.setPDFAttrSettings(true, true, true);
+        // Explicitly set the names that should be used for the fields that are
+        // created in the Lucene Document instance -- otherwise, default PDF
+        // names will be used that will likely not be picked up when the index
+        // is searched.
+        // For example, the default name for the modification date
+        // field in PDF files is 'ModDate', but our example Lucene index stores
+        // the modification dates of Documents with the name 'mod_date'. The
+        // third setFieldName() call below establishes the correct mapping.
+        config.setFieldName(PDFTextStream.ATTR_AUTHOR, "creator");
+        config.setFieldName(PDFTextStream.ATTR_TITLE, "title");
+        //config.setFieldName(PDFTextStream.ATTR_CREATION_DATE, "creation_date");
+        //config.setFieldName(PDFTextStream.ATTR_MOD_DATE, "mod_date");
+        // actually generate the Lucene Document instance from the PDF file
+        // using the configuration we've just built, and add the Document to the
+        // Lucene index
+        Document doc = PDFDocumentFactory.buildPDFDocument(pdfFile, config);
+        doc.add(new Field("filename", pdfFile.getAbsoluteFile().toString(), Field.Store.YES, Field.Index.TOKENIZED));
+        openIndex.addDocument(doc);
+    }
+
+	public static void main(String[] args) throws CorruptIndexException,
+			LockObtainFailedException, IOException {
+		if (args.length < 2) {
+			System.err.println("Call with <index directory> <pdf directories>+");
+		} else {
+			IndexWriter iw = new IndexWriter(args[0], new StandardAnalyzer());
+			// index every PDF directory given after the index directory
+			for (int i = 1; i < args.length; i++) {
+				addAllPdfsIn(new File(args[i]), iw);
+			}
+			iw.flush();
+			iw.close();
+		}
+	}
+	private static void addAllPdfsIn(File directory, IndexWriter iw) throws IOException {
+		File[] files = directory.listFiles();
+		if (files == null) // not a directory or not readable
+			return;
+		for (File f : files) {
+			if (f.isDirectory())
+				addAllPdfsIn(f, iw);
+			else if (f.getName().toLowerCase().endsWith(".pdf")) {
+				System.out.println("Indexing pdf: " + f);
+				addPDFToIndex(iw, f);
+			}
+		}
+	}
+}
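
The explicit field names chosen above (`body_text`, `title`, `creator`, `filename`) are what make field-scoped queries possible later on. A minimal sketch, not part of this commit: it assumes the Lucene 2.x API declared in project.clj, a hypothetical index directory `my-index`, and a made-up class name `FieldQueryExample`.

```java
package pdfindex;

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;

public class FieldQueryExample {
    public static void main(String[] args) throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher("my-index");
        // unqualified terms search the main text field; "title:" and "creator:"
        // only resolve because setFieldName() mapped the PDF attributes to them
        QueryParser parser = new QueryParser("body_text", new StandardAnalyzer());
        Hits hits = searcher.search(parser.parse("title:lucene AND creator:smith"));
        for (int i = 0; i < hits.length(); i++) {
            System.out.println(hits.doc(i).get("filename"));
        }
        searcher.close();
    }
}
```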

src/main/java/pdfindex/Main.java

+package pdfindex;
+import java.io.IOException;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.store.LockObtainFailedException;
+
+
+public class Main {
+
+	/**
+	 * @param args
+	 * @throws IOException 
+	 * @throws LockObtainFailedException 
+	 * @throws CorruptIndexException 
+	 * @throws ParseException 
+	 */
+	public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
+		if(args.length<3)
+			System.err.println("Call with: [--index|--search] <index directory> (<pdf directory>+|<query term>+)");
+		else {
+			String[] newArgs = new String[args.length-1];
+			System.arraycopy(args, 1, newArgs, 0, newArgs.length);
+			
+			if ("--index".equals(args[0]))
+				CreateIndex.main(newArgs);
+			else if ("--search".equals(args[0]))
+				Search.main(newArgs);
+			else
+				System.err.println("Unknown mode: " + args[0]);
+		}
+	}
+
+}

src/main/java/pdfindex/Search.java

+package pdfindex;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryParser.MultiFieldQueryParser;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+
+public class Search {
+  private IndexSearcher searcher = null;
+  private QueryParser parser = null;
+
+  public Search(String indexDir) throws IOException {
+    searcher = new IndexSearcher(indexDir);
+    parser = new MultiFieldQueryParser(new String[]{"body_text","filename"}, new StandardAnalyzer());
+  }
+
+  public Hits performSearch(String queryString) 
+  throws IOException, ParseException {
+    Query query = parser.parse(queryString);
+    Hits hits = searcher.search(query);
+    return hits;
+  }
+
+	public static void main(String[] args) throws IOException, ParseException {
+		if (args.length < 2)
+			System.err.println("Call with <index dir> <query term>+");
+		else {
+			Search s = new Search(args[0]);
+			// join all query terms into one query string
+			StringBuilder queryString = new StringBuilder(args[1]);
+			for (int i = 2; i < args.length; i++)
+				queryString.append(' ').append(args[i]);
+			Hits hits = s.performSearch(queryString.toString());
+			for (int i = 0; i < hits.length(); i++) {
+				System.out.println(hits.doc(i).getField("filename")
+						.stringValue());
+			}
+		}
+
+	}
+  
+}