Commits

Sara Magliacane committed fb7fe60

converting to standard maven directory structure

  • Participants
  • Parent commits ac67ca0

Comments (0)

Files changed (84)

File src/main/java/nl/vu/recoprov/CompletePipeline.java

+package nl.vu.recoprov;
+
+/**
+ * The complete pipeline class calls all the components of the system in the right order.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import nl.vu.recoprov.ProvDMtranslator;
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
+import nl.vu.recoprov.signaldetectors.ImageSimilaritySignal;
+import nl.vu.recoprov.signaldetectors.LuceneInverseSimilarity;
+import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
+import nl.vu.recoprov.signaldetectors.MatchTitleInContentSignal;
+import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
+import nl.vu.recoprov.signalfilters.BackwardTemporalFilter;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+import nl.vu.recoprov.utils.ConfigurationReader;
+
+
+public class CompletePipeline {
+
+	private String currentDir;
+	private Boolean connectToInternet = true;
+	private String[] params = null;
+	
+	/** Creates a pipeline that keeps the field defaults (online mode, no directory, no parameters). */
+	public CompletePipeline() {}
+
+	/**
+	 * Creates a pipeline with an explicit online/offline switch.
+	 * @param online true to fetch the files from Dropbox, false to read a local directory
+	 */
+	public CompletePipeline(Boolean online) {
+		this(online, null, null);
+	}
+
+	/**
+	 * Creates a pipeline for a given directory.
+	 * @param online true to fetch the files from Dropbox, false to read a local directory
+	 * @param dir the directory (or Dropbox folder) to process
+	 */
+	public CompletePipeline(Boolean online, String dir) {
+		this(online, dir, null);
+	}
+
+	/**
+	 * Creates a fully configured pipeline.
+	 * @param online true to fetch the files from Dropbox, false to read a local directory
+	 * @param dir the directory (or Dropbox folder) to process
+	 * @param params extra parameters handed to the content reader
+	 */
+	public CompletePipeline(Boolean online, String dir, String[] params) {
+		this.connectToInternet = online;
+		this.currentDir = dir;
+		this.params = params;
+	}
+	
+	/**
+	 * Command-line entry point: lets the configuration reader set up the
+	 * pipeline, runs every phase in order, then prints, saves and translates
+	 * the resulting graph.
+	 * @param args command-line arguments (not used)
+	 * @throws Exception if any phase fails
+	 */
+	public static void main(String[] args) throws Exception {
+		CompletePipeline pipeline = new CompletePipeline();
+		new ConfigurationReader().readParameters(pipeline);
+
+		DependencyGraph graph = new DependencyGraph();
+		graph = pipeline.initDependencyGraph();
+		graph = pipeline.loadMetadaAndIndexes(graph);
+		graph = pipeline.computeSignals(graph);
+		graph = pipeline.filterSignals(graph);
+		graph = pipeline.aggregateSignals(graph);
+
+		System.out.println(graph);
+		pipeline.writeToFile(graph);
+		pipeline.translateToPROVDM(graph);
+	}
+
+
+
+	/**
+	 * Hands the graph to a fresh {@link ProvDMtranslator}; the string it
+	 * returns is discarded.
+	 * @param depGraph the graph to translate
+	 */
+	private void translateToPROVDM(DependencyGraph depGraph) {
+		ProvDMtranslator translator = new ProvDMtranslator();
+		translator.translate(depGraph);
+	}
+
+	/**
+	 * Phase 1: builds the initial graph, either from Dropbox (online) or from
+	 * the local directory (offline). In online mode the current directory is
+	 * switched to the temporary download directory afterwards.
+	 * @return the populated graph
+	 * @throws Exception if the Dropbox client fails
+	 */
+	public DependencyGraph initDependencyGraph() throws Exception {
+		DependencyGraph graph = new DependencyGraph();
+		DropboxClient dropbox = new DropboxClient();
+
+		if (!connectToInternet) {
+			graph = dropbox.getAllRevsOffline(new File(currentDir), graph);
+		} else {
+			if (!dropbox.isLinked()) {
+				dropbox.linkToAccount();
+			}
+			graph = dropbox.getAllRevs(graph, currentDir);
+			currentDir = ConfigurationDefaults.TEMPDIR;
+		}
+
+		System.out.println("PHASE 1 completed: Initialization of the DependencyGraph");
+		return graph;
+	}
+
+	/**
+	 * Phases 2 and 3: reads the files with the Tika reader, indexes them and
+	 * finally extracts the images.
+	 * @param depGraph the graph produced by the initialization phase
+	 * @return the graph after all three readers have processed it
+	 */
+	public DependencyGraph loadMetadaAndIndexes(DependencyGraph depGraph) {
+		// content analysis (Apache Tika)
+		TikaReader contentReader = new TikaReader(currentDir);
+		DependencyGraph graph = contentReader.read(depGraph, params);
+
+		// TODO: stripping of tags
+
+		// content index (Apache Lucene)
+		graph = indexFiles(graph);
+
+		// image extraction
+		return ImageReader.read(currentDir, graph);
+	}
+
+	/**
+	 * Indexes the graph using the pipeline's current directory.
+	 * @param depGraph the graph to index
+	 * @return the indexed graph
+	 */
+	public DependencyGraph indexFiles(DependencyGraph depGraph){
+		final String dir = currentDir;
+		return indexFiles(depGraph, dir);
+	}
+	
+	/**
+	 * Indexes the graph with a {@link LuceneIndexer} created for the given directory.
+	 * @param depGraph the graph to index
+	 * @param dir the root path handed to the indexer
+	 * @return the indexed graph
+	 */
+	public DependencyGraph indexFiles(DependencyGraph depGraph, String dir){
+		return new LuceneIndexer(dir).indexFiles(depGraph);
+	}
+	
+	public DependencyGraph computeSignals(DependencyGraph depGraph) {
+		// SIGNALS
+		depGraph = new LuceneSimilaritySignal().computeSignal(depGraph);
+		depGraph = new LuceneInverseSimilarity().computeSignal(depGraph);
+		// use lucene for better similarity - overlap of words
+		depGraph = new MetadataSimilaritySignal().computeSignal(depGraph);
+		depGraph = new MatchTitleInContentSignal().computeSignal(depGraph);
+
+		// TODO: compare nouns, verbs, named entities
+		// preprocessing get rid of the tags
+
+		// TODO: transitive reduction after transitive closure?
+
+		depGraph = new ImageSimilaritySignal().computeSignal(depGraph);
+
+		// depGraph = new DiffSignal().computeSignal(depGraph);
+		return depGraph;
+	}
+
+	public DependencyGraph filterSignals(DependencyGraph depGraph) {
+		// FILTERS
+		depGraph = new BackwardTemporalFilter().filterSignals(depGraph);
+		// Doesn't work completely, better the standard dot transitive reduction
+		// depGraph = new TransitiveReductionFilter().filterSignals(depGraph);
+
+		return depGraph;
+	}
+	
+	
+	public DependencyGraph aggregateSignals(DependencyGraph depGraph) {
+		depGraph = new WeightedSumAggregator().aggregateSignals(depGraph);
+		return depGraph;
+	}
+
+	/**
+	 * Writes the textual form of the graph to the results file
+	 * ({@code ConfigurationDefaults.RESULTS}). I/O errors are reported on
+	 * stderr and otherwise ignored.
+	 * @param depGraph the graph to write
+	 */
+	public void writeToFile(DependencyGraph depGraph){
+		File file = new File(ConfigurationDefaults.RESULTS);
+		BufferedWriter writer = null;
+		try {
+			writer = new BufferedWriter(new FileWriter(file));
+			writer.write(depGraph.toString());
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			// close in finally so the file handle is released even when write() fails
+			if (writer != null) {
+				try {
+					writer.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+	
+
+	/** @return the directory the pipeline currently works on */
+	public String getCurrentDir(){
+		return currentDir;
+	}
+	
+	/** @param dir the directory the pipeline should work on */
+	public void setCurrentDir(String dir){
+		currentDir = dir;
+	}
+	
+	/** @param online true to fetch the files from Dropbox, false to read the local directory */
+	public void setConnectToInternet(Boolean online){
+		connectToInternet = online;
+	}
+	
+}

File src/main/java/nl/vu/recoprov/DropboxClient.java

+package nl.vu.recoprov;
+
+/**
+ * Dropbox client based on the example client in the SDK
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+import com.dropbox.client2.DropboxAPI;
+import com.dropbox.client2.DropboxAPI.Entry;
+import com.dropbox.client2.exception.DropboxException;
+import com.dropbox.client2.session.AccessTokenPair;
+import com.dropbox.client2.session.AppKeyPair;
+import com.dropbox.client2.session.Session;
+import com.dropbox.client2.session.WebAuthSession;
+
+public class DropboxClient extends SearchCache {
+
+	private String APPKEY;
+	private String SECRET;
+
+	/**
+	 * Get permission to access a person's Dropbox folder.
+	 * Prints the authorization URL, waits for the user to press ENTER and
+	 * then stores the app key and access token in the state file.
+	 * @throws DropboxException if the Dropbox session cannot be established
+	 */
+	public void linkToAccount() throws DropboxException {
+		
+		File f = new File(ConfigurationDefaults.STATE_FILE);
+		if (!f.exists()) {
+			try {
+				f.createNewFile();
+			} catch (IOException e) {
+				System.err.println("Could not create state file: " + ConfigurationDefaults.STATE_FILE);
+				e.printStackTrace();
+			}
+		}
+
+		// Always load the app key: it used to be read only when the state file
+		// was missing, leaving a null key pair whenever the file already existed.
+		AppKeyPair appKeyPair = readSecretFile(ConfigurationDefaults.SECRET_FILE);
+
+		// Make the user log in and authorize us.
+		WebAuthSession was = new WebAuthSession(appKeyPair,
+				Session.AccessType.APP_FOLDER);
+		WebAuthSession.WebAuthInfo info = was.getAuthInfo();
+		System.out.println("1. Go to: " + info.url);
+		System.out.println("2. Allow access to this app.");
+		System.out.println("3. Press ENTER.");
+
+		try {
+			while (System.in.read() != '\n') {
+			}
+		} catch (IOException ex) {
+			System.err.println("I/O error: " + ex.getMessage());
+			System.exit(1);
+			throw new RuntimeException();
+		}
+
+		// This will fail if the user didn't visit the above URL and hit
+		// 'Allow'.
+		was.retrieveWebAccessToken(info.requestTokenPair);
+		AccessTokenPair accessToken = was.getAccessTokenPair();
+		System.out.println("Link successful.");
+
+		// Save state
+		State state = new State(appKeyPair, accessToken, new Content.Folder());
+		state.save(ConfigurationDefaults.STATE_FILE);
+
+	}
+
+	/**
+	 * Creates a web authentication session for this application, creating
+	 * the state file first if it does not exist yet.
+	 * @return a new, not yet authorized session
+	 * @throws DropboxException if the session cannot be created
+	 */
+	public WebAuthSession getDropboxApproval() throws DropboxException {
+		File f = new File(ConfigurationDefaults.STATE_FILE);
+		if (!f.exists()) {
+			try {
+				f.createNewFile();
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		}
+
+		// Load the key pair unconditionally: APPKEY/SECRET used to stay null
+		// whenever the state file already existed.
+		AppKeyPair appKeyPair = readSecretFile(ConfigurationDefaults.SECRET_FILE);
+
+		System.out.println("Creating web auth session");
+		WebAuthSession was = new WebAuthSession(appKeyPair,
+				Session.AccessType.APP_FOLDER);
+		return was;
+	}
+
+	/**
+	 * Retrieves the access token for the given session and stores it,
+	 * together with the app key pair, in the state file.
+	 * @param was the web authentication session
+	 * @throws Exception if the token cannot be retrieved or the state cannot be saved
+	 */
+	public void saveRequest(WebAuthSession was) throws Exception {
+		was.retrieveWebAccessToken(was.getAuthInfo().requestTokenPair);
+		AccessTokenPair token = was.getAccessTokenPair();
+		System.out.println("Link successful.");
+
+		// Save state
+		State linkedState = new State(new AppKeyPair(APPKEY, SECRET), token,
+				new Content.Folder());
+		linkedState.save(ConfigurationDefaults.STATE_FILE);
+	}
+
+	/**
+	 * Read the file containing the APPKEY and SECRET.
+	 * Lines of the form {@code APPKEY=value} and {@code SECRET=value} update
+	 * the corresponding fields; all other lines are ignored. A read error is
+	 * reported and the values read so far are used.
+	 * @param filename path of the secret file
+	 * @return the pair APPKEY and SECRET
+	 */
+	private AppKeyPair readSecretFile(String filename) {
+		System.out.println("Reading secret file: " + filename);
+		final String keyPrefix = "APPKEY=";
+		final String secretPrefix = "SECRET=";
+
+		BufferedReader reader = null;
+		try {
+			reader = new BufferedReader(
+					new InputStreamReader(new FileInputStream(new File(filename))));
+
+			String line;
+			while ((line = reader.readLine()) != null) {
+				// Take everything after the prefix: split("=")[1] threw on an
+				// empty value and cut off values that contain '='.
+				if (line.startsWith(keyPrefix)) {
+					APPKEY = line.substring(keyPrefix.length()).trim();
+				} else if (line.startsWith(secretPrefix)) {
+					SECRET = line.substring(secretPrefix.length()).trim();
+				}
+			}
+		} catch (final IOException e) {
+			System.out.println("Error processing secret file: " + filename);
+		} finally {
+			// release the file handle on every path
+			if (reader != null) {
+				try {
+					reader.close();
+				} catch (IOException e) {
+					System.out.println("Error processing secret file: " + filename);
+				}
+			}
+		}
+
+		return new AppKeyPair(APPKEY, SECRET);
+	}
+
+	public DependencyGraph getAllRevsOffline(File dir, DependencyGraph depGraph)
+			throws DropboxException {
+
+		for (File f : dir.listFiles()) {
+
+			if (ConfigurationDefaults.ignoreFile(f.getAbsolutePath())) {
+				continue;
+			}
+
+			if (f.isDirectory()) {
+				depGraph.putAll(getAllRevsOffline(f, depGraph));
+				continue;
+
+			}
+
+			DependencyNode d = new DependencyNode(depGraph);
+			d.setCompleteFilepath(f.getAbsolutePath());
+			depGraph.put(d.getCompleteFilepath(), d);
+
+		}
+
+		return depGraph;
+
+	}
+
+	/**
+	 * Get all revisions of the files for the complete Dropbox
+	 * (delegates to the folder variant with an empty folder path).
+	 * @param depGraph the graph to add the nodes to
+	 * @return the graph returned by the folder variant
+	 * @throws DropboxException if the Dropbox session fails
+	 */
+	public DependencyGraph getAllRevs(DependencyGraph depGraph)
+			throws DropboxException {
+		return getAllRevs(depGraph, "");
+	}
+
+	/**
+	 * Get all revisions of the files for a specific Dropbox folder.
+	 * The revisions are downloaded into the temporary directory and a node
+	 * is added to the graph for every non-empty, non-ignored revision.
+	 * @param depGraph the graph to add the nodes to
+	 * @param dropboxFolder the Dropbox folder to read ("" for the root)
+	 * @return the same graph, with one node per downloaded revision
+	 * @throws DropboxException if the Dropbox session fails
+	 */
+	public DependencyGraph getAllRevs(DependencyGraph depGraph,
+			String dropboxFolder) throws DropboxException {
+
+		// Load state.
+		State state = State.load(ConfigurationDefaults.STATE_FILE);
+
+		// Connect to Dropbox.
+		WebAuthSession session = new WebAuthSession(state.appKey,
+				WebAuthSession.AccessType.DROPBOX);
+		session.setAccessTokenPair(state.accessToken);
+		DropboxAPI<?> client = new DropboxAPI<WebAuthSession>(session);
+
+		// make dirs
+		File dir = new File(ConfigurationDefaults.TEMPDIR);
+		dir.delete();
+		dir.mkdir();
+
+		File dropboxDir = new File(dir.getAbsolutePath() + dropboxFolder);
+		dropboxDir.mkdirs();
+
+		List<Entry> result = getRevisions(client, dropboxFolder, dropboxDir);
+		if (result == null) {
+			// the metadata request failed (already reported); nothing to add
+			return depGraph;
+		}
+
+		// copy all the files to the temporary folder
+		copyToTempFolder(client, result, dir, dropboxFolder);
+
+		for (Entry e : result) {
+
+			if (ConfigurationDefaults.ignoreFile(e.path)) {
+				continue;
+			}
+
+			// empty revisions are not downloaded either
+			if (e.bytes == 0) {
+				continue;
+			}
+
+			DependencyNode d = new DependencyNode(depGraph);
+			d.setDropboxEntry(e);
+			d.setCompleteFilepath(createTempFilename(dir, e, dropboxFolder));
+			depGraph.put(d.getCompleteFilepath(), d);
+
+		}
+
+		return depGraph;
+	}
+
+	/**
+	 * Downloads every non-empty, non-ignored revision into the temporary
+	 * directory. A failed download is reported and skipped.
+	 * @param client the Dropbox API client
+	 * @param result the revisions to download
+	 * @param dir the temporary root directory
+	 * @param dropboxFolder the Dropbox folder the revisions come from
+	 */
+	private void copyToTempFolder(DropboxAPI client, List<Entry> result,
+			File dir, String dropboxFolder) {
+
+		// copy files
+		for (Entry e : result) {
+
+			if (ConfigurationDefaults.ignoreFile(e.path)) {
+				continue;
+			}
+			
+			if (e.bytes == 0) {
+				continue;
+			}
+
+			FileOutputStream fout = null;
+			try {
+				String filename = createTempFilename(dir, e, dropboxFolder);
+				fout = new FileOutputStream(filename);
+				client.getFile(e.path, e.rev, fout, null);
+			} catch (Exception e1) {
+				e1.printStackTrace();
+			} finally {
+				// close here so a failed download does not leak the file handle
+				if (fout != null) {
+					try {
+						fout.close();
+					} catch (IOException e2) {
+						e2.printStackTrace();
+					}
+				}
+			}
+
+		}
+	}
+
+	/**
+	 * Collects the revisions of every file below a Dropbox path, recursing
+	 * into sub-folders. When a local directory is given, the folder
+	 * structure is mirrored below it.
+	 * @param client the Dropbox API client
+	 * @param filepath the Dropbox path to list
+	 * @param dir local directory mirroring filepath, or null to skip mirroring
+	 * @return the collected revisions; empty (never null) when the listing fails
+	 */
+	private List<Entry> getRevisions(DropboxAPI client, String filepath,
+			File dir) {
+
+		List<Entry> revisions = new ArrayList<Entry>();
+
+		Entry e;
+		try {
+			e = client.metadata(filepath, 0, null, true, null);
+		} catch (DropboxException e2) {
+			e2.printStackTrace();
+			// an empty list instead of null keeps callers free of null checks
+			return revisions;
+		}
+
+		if (e == null || e.contents == null) {
+			// nothing listed for this path
+			return revisions;
+		}
+
+		for (Entry file : e.contents) {
+			String filename = file.path;
+			List<Entry> temp = null;
+
+			if (!file.isDir) {
+				try {
+					temp = client.revisions(filename, 0);
+					System.out.println("Received Dropbox metadata for file: "
+							+ filename);
+				} catch (DropboxException e1) {
+					e1.printStackTrace();
+				}
+			} else {
+				File newdir = null;
+				if (dir != null) {
+					newdir = new File(dir.getAbsolutePath() + "/"
+							+ file.fileName());
+					newdir.mkdirs();
+				}
+				temp = getRevisions(client, filename, newdir);
+			}
+
+			if (temp != null) {
+				revisions.addAll(temp);
+			}
+		}
+
+		return revisions;
+	}
+
+	/**
+	 * Checks whether a stored state exists and its session reports itself as linked.
+	 * @return false when there is no state file, otherwise the session's link status
+	 * @throws Exception if the state cannot be loaded
+	 */
+	public Boolean isLinked() throws Exception {
+		System.out.println("Checking linkage");
+
+		if (!new File(ConfigurationDefaults.STATE_FILE).exists()) {
+			return false;
+		}
+
+		// Rebuild the session from the saved state.
+		State saved = State.load(ConfigurationDefaults.STATE_FILE);
+		WebAuthSession session = new WebAuthSession(saved.appKey,
+				WebAuthSession.AccessType.APP_FOLDER);
+		session.setAccessTokenPair(saved.accessToken);
+
+		return session.isLinked();
+	}
+
+	public static String createTempFilename(File dir, Entry e,
+			String dropboxFolder) {
+		
+		String temp;
+		
+		if (!e.path.contains(".")){
+			temp = dir.getAbsolutePath()
+					+ e.path + "_" + e.rev;
+		}
+		else{
+			temp = dir.getAbsolutePath()
+					+ e.path.substring(0, e.path.lastIndexOf(".")) + "_" + e.rev
+					+ e.path.substring(e.path.lastIndexOf("."), e.path.length());
+			// System.out.println(temp);
+		}
+		return temp;
+	}
+
+	/**
+	 * Debugging aid: prints the file name and, on a second line, the
+	 * revision, modification date, size, MIME type and path of every entry.
+	 * @param result the entries to print
+	 */
+	private static void printMetadata(List<Entry> result) {
+		for (Entry entry : result) {
+			String details = entry.rev + " " + entry.modified + " "
+					+ entry.bytes + " bytes " + entry.mimeType + " " + entry.path;
+			System.out.println(entry.fileName() + "\n" + details);
+		}
+	}
+
+}

File src/main/java/nl/vu/recoprov/ImageReader.java

+package nl.vu.recoprov;
+
+import java.awt.Color;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+
+import org.apache.sanselan.*;
+import org.apache.sanselan.common.IImageMetadata;
+
+import net.semanticmetadata.lire.DocumentBuilder;
+import net.semanticmetadata.lire.DocumentBuilderFactory;
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+
+
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
+
+
+public class ImageReader {
+	
+	static  int counter = 0;
+	static String directoryname;
+
+
+	
+	/**
+	 * Remembers the directory name, makes sure the image output directory
+	 * exists and then processes the graph.
+	 * @param dirname the directory the files live in
+	 * @param depNodeMap the graph to scan for images
+	 * @return the graph after image extraction
+	 */
+	public static DependencyGraph  read(String dirname,  DependencyGraph depNodeMap ){
+		directoryname = dirname;
+		new File(ConfigurationDefaults.IMAGE_DIRECTORY).mkdir();
+		return read(new File(dirname), depNodeMap);
+	}
+
+	public static DependencyGraph  read(File dir, DependencyGraph input ){
+
+		DocumentBuilder docbuilder = DocumentBuilderFactory.getDefaultDocumentBuilder();
+		
+		File indexDir = new File(ConfigurationDefaults.IMAGE_INDEX_DIR);
+		//IMAGE_INDEX_DIR = indexDir.getAbsolutePath();
+
+		
+		if (!indexDir.exists()) {
+			indexDir.mkdir();
+		}	
+		else {
+			try {
+				FileUtils.deleteDirectory(indexDir);
+			} catch (IOException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		}
+		
+		FSDirectory store = null;
+		try {
+			store = SimpleFSDirectory.open(indexDir);
+			store.clearLock(ConfigurationDefaults.IMAGE_INDEX_DIR);
+		} catch (IOException e2) {
+			// TODO Auto-generated catch block
+			e2.printStackTrace();
+		}
+		
+		Analyzer analyzer = new EnglishAnalyzer(ConfigurationDefaults.LUCENE_VERSION);
+		IndexWriterConfig config = new IndexWriterConfig(ConfigurationDefaults.LUCENE_VERSION, analyzer );
+		
+		IndexWriter writer = null;
+		try {
+			writer = new IndexWriter(store, config);
+		} catch (CorruptIndexException e1) {
+			// TODO Auto-generated catch block
+			e1.printStackTrace();
+		} catch (LockObtainFailedException e1) {
+			// TODO Auto-generated catch block
+			e1.printStackTrace();
+		} catch (IOException e1) {
+			// TODO Auto-generated catch block
+			e1.printStackTrace();
+		}
+		
+
+		for (String depname: input.keySet()){
+			try {
+      		
+				DependencyNode d = input.get(depname);
+				
+				
+				if (ConfigurationDefaults.ignoreFile(depname)){
+					continue;
+				}
+
+				
+				if(d.getMimeType().contains("image")){
+					
+					File imgfile = new File(d.getCompleteFilepath());
+					
+					BufferedImage img = Sanselan.getBufferedImage(imgfile);
+					IImageMetadata metadata = Sanselan.getMetadata(imgfile);
+										
+					//TODO: check if still problems with tiff also with Sanselan
+					
+//					// trying to get the tiffs to work
+//					BufferedImage bimg = new BufferedImage(img.getWidth(), img.getHeight(), BufferedImage.TYPE_INT_RGB);
+//				    bimg.createGraphics().drawRenderedImage(img, null);
+					
+					
+					if(img == null){
+						continue;
+						
+//						img = findSubstituteImage(d.getCompleteFilepath().replace("tiff", "pdf"));
+					}
+					
+					
+					d = addImageToIndex(d, img, metadata, docbuilder, writer);
+					input.put(depname, d);
+					
+				}
+	    		
+				if(d.getMimeType().contains("pdf")){
+	    			
+					
+					input = readImagesFromPDF( input,  d,  depname,  docbuilder,  writer); 
+				}
+				
+	      	
+
+			} catch (Exception e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+      	
+		}
+		
+		try {
+			writer.close();
+			store.close();
+		} catch (CorruptIndexException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+    	
+
+		counter = 0;
+		return input;
+	}
+
+	/**
+	 * Extracts the images of every page of a PDF node and adds them to the
+	 * index. A file that cannot be loaded as PDF is silently skipped.
+	 * @param input the graph
+	 * @param d the node whose file is read
+	 * @param depname the key of the node in the graph
+	 * @param docbuilder builder for the index documents
+	 * @param writer the index writer
+	 * @return the graph, updated with the images that were found
+	 * @throws IOException if reading the page resources or closing the document fails
+	 */
+	public static DependencyGraph readImagesFromPDF(DependencyGraph input, DependencyNode d, String depname, DocumentBuilder docbuilder, IndexWriter writer) throws IOException{
+		File f = new File(d.getCompleteFilepath());
+		PDDocument document = null;
+		try{
+			document = PDDocument.load(f);
+		}catch(IOException e){
+			return input;
+		}
+
+		try {
+			List<PDPage> pages = document.getDocumentCatalog().getAllPages();
+
+			for (PDPage page: pages){
+				PDResources resources = page.getResources();
+				input = elaborateResources( resources,  input,  d,  depname,  docbuilder,  writer);
+			}
+		} finally {
+			// release the document even when a page fails
+			document.close();
+		}
+
+		return input;
+	}
+	
+	/**
+	 * Adds every image of a PDF resource dictionary to the index and
+	 * recurses into form XObjects, which can contain images of their own.
+	 * @param resources the resources to scan; null is treated as "no images"
+	 * @param input the graph
+	 * @param d the node the resources belong to
+	 * @param depname the key of the node in the graph
+	 * @param docbuilder builder for the index documents
+	 * @param writer the index writer
+	 * @return the graph, updated with the images that were found
+	 * @throws IOException if the resources cannot be read or indexing fails
+	 */
+	public static DependencyGraph elaborateResources(PDResources resources, DependencyGraph input, DependencyNode d, String depname, DocumentBuilder docbuilder, IndexWriter writer) throws IOException{
+		
+		// pages and forms without a resource dictionary have nothing to scan
+		if (resources == null) {
+			return input;
+		}
+		
+		Map<String,PDXObjectImage> images = resources.getImages();
+		if( images !=null && !images.isEmpty() ){
+			
+			for (PDXObjectImage image: images.values()){
+
+				PDMetadata metadata = image.getMetadata();
+				
+				BufferedImage imagebuf = null;
+				
+				try{
+					imagebuf = image.getRGBImage();
+				}catch (Throwable e){
+					// an undecodable image is reported and skipped
+					e.printStackTrace();
+					continue;
+				}
+				
+				if(imagebuf != null){
+					d = addImageToIndex(d, imagebuf, metadata, docbuilder, writer);
+					input.put(depname, d);
+				}
+
+			}
+
+			images.clear();
+			
+		 }
+		
+		Map<String, PDXObject> xobjects = resources.getXObjects();
+		if (xobjects == null) {
+			return input;
+		}
+		
+		//has to be recursive (images inside resources)
+		for (PDXObject xobject: xobjects.values()){
+			if( xobject instanceof PDXObjectForm){
+				PDXObjectForm form = (PDXObjectForm) xobject;
+				input = elaborateResources( form.getResources(),  input,  d,  depname,  docbuilder,  writer);
+			}
+		}
+		
+		return input;
+		
+	}
+	
+
+	
+
+	/**
+	 * Stores an image as a numbered PNG in the image directory, adds it to
+	 * the index and records it in the node's metadata. Completely white
+	 * images are ignored.
+	 * @param d the node the image belongs to
+	 * @param image the image to store
+	 * @param metadata metadata object recorded together with the image name
+	 * @param docbuilder builder for the index document
+	 * @param writer the index writer
+	 * @return the node that was passed in
+	 * @throws IOException if the image cannot be written or indexed
+	 */
+	public static DependencyNode addImageToIndex(DependencyNode d, BufferedImage image, Object metadata, DocumentBuilder docbuilder, IndexWriter writer) throws IOException{
+		
+		if (isImageWhite (image)){
+			return d;
+		}
+		
+		final String pngName = ConfigurationDefaults.IMAGE_DIRECTORY + counter++ + ".png" ;
+		final String sourcePath = d.getCompleteFilepath();
+		System.out.println( "Adding image: " + pngName  +  " " + sourcePath );
+
+		File pngFile = new File(pngName);
+		try {
+			Sanselan.writeImage(image, pngFile, ImageFormat.IMAGE_FORMAT_PNG, null);
+		} catch (ImageWriteException e) {
+			// the image is still indexed even when the PNG could not be written
+			e.printStackTrace();
+		}
+
+		Document indexDoc = docbuilder.createDocument(image, pngName);
+		indexDoc.add(new StringField("name", pngName, Field.Store.YES));
+		indexDoc.add(new StringField("originalFilename", d.getCompleteFilepath() , Field.Store.YES));
+		writer.addDocument(indexDoc);
+		
+		image.flush();
+
+		d.getMetadata().addImages(pngName, metadata);
+		return d;
+	}
+	
+	
+	/**
+	 * Check if the image is empty (completely white), 
+	 * so we don't add it to the index.
+	 * Every pixel is inspected, including the last row and column, and the
+	 * scan stops at the first pixel that is not opaque white.
+	 * @param img the image to inspect
+	 * @return true = image is completely white
+	 */
+	public static Boolean isImageWhite(BufferedImage img){
+		final int width = img.getWidth();
+		final int height = img.getHeight();
+		final int whiteColor = Color.WHITE.getRGB();
+		
+		for (int i = 0; i < width; i++){
+			for (int j = 0; j < height; j++){
+				int rgb;
+				try{
+					rgb = img.getRGB(i,j);
+				}catch(Exception e){
+					// a pixel that cannot be read is skipped
+					continue;
+				}
+				
+				if (rgb != whiteColor) {
+					return false;
+				}
+			}
+		}
+		
+		return true;
+	}
+}
+

File src/main/java/nl/vu/recoprov/LuceneIndexer.java

+package nl.vu.recoprov;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Set;
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+import nl.vu.recoprov.utils.CustomAnalyzer;
+import nl.vu.recoprov.utils.CustomFileReader;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.SimpleFSDirectory;
+
+
+public class LuceneIndexer {
+
+
+	private String rootpath;
+	private Boolean cleanupIndex = false;
+	
+	/**
+	 * Creates an indexer for the given root path.
+	 * @param rootpath root path of the indexed files; only used in error messages
+	 */
+	public LuceneIndexer(String rootpath) {
+		this.rootpath = rootpath;
+	}
+
+	public DependencyGraph indexFiles(DependencyGraph input) {
+		
+		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		
+		if (!indexDir.exists())
+			input = createIndex(input);
+		else {
+			
+			if (cleanupIndex){
+				try {
+					FileUtils.deleteDirectory(indexDir);
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+		
+
+		input = assignLuceneNumbers(input);
+		return input;
+
+	}
+
+
+			
+	public DependencyGraph createIndex(DependencyGraph input) {
+		int count = 0;
+
+		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		ConfigurationDefaults.INDEX_DIR = indexDir.getAbsolutePath();
+
+
+		try {
+			FSDirectory store = SimpleFSDirectory.open(indexDir);
+
+			Analyzer analyzer = new CustomAnalyzer(ConfigurationDefaults.LUCENE_VERSION);
+//			Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
+//			fieldAnalyzers.put("raw-contents", new KeywordAnalyzer());
+//
+//			PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
+//					analyzer, fieldAnalyzers);
+
+			IndexWriterConfig config = new IndexWriterConfig(ConfigurationDefaults.LUCENE_VERSION,
+					analyzer);
+
+			IndexWriter writer = new IndexWriter(store, config);
+			Set<String> names = input.keySet();
+
+			for (String name : names) {
+
+				if (ConfigurationDefaults.ignoreFile(name)) {
+					continue;
+				}
+
+				System.out.println(count + ": adding " + name);
+				count++;
+
+				DependencyNode node = input.get(name);
+				StringBuffer content = new StringBuffer();
+
+				if ((node.getContent() != null)
+						&& (!node.getContent().isEmpty())) {
+					CustomFileReader contentFile = new CustomFileReader(
+							node.getContent());
+
+					while (true) {
+						String line = contentFile.readLine();
+						if (line == null)
+							break;
+						content = new StringBuffer(content + line);
+					}
+
+					contentFile.close();
+
+				}
+
+				Document doc = new Document();
+				doc.add(new StringField("name", node.getCompleteFilepath(),Field.Store.YES));
+				doc.add(new StringField("path", node.getCompleteFilepath(),
+						Field.Store.YES));
+				if ((node.getContent() != null)
+						&& (!node.getContent().isEmpty())) {
+					doc.add(new TextField("contents", new String(content),
+							Field.Store.YES));
+					doc.add(new TextField("raw-contents", new String(content),
+							Field.Store.YES));
+				} else {
+					System.out.println("No content in file "
+							+ node.getCompleteFilepath());
+				}
+				writer.addDocument(doc);
+
+			}
+
+			writer.close();
+			store.close();
+
+			return input;
+
+		} catch (IOException e) {
+			System.out
+					.println("IOException: Indexing of files failed for directory: "
+							+ rootpath);
+			e.printStackTrace();
+
+			return input;
+		}
+	}
+
+	/**
+	 * Stores, in every node, the number of its document in the index.
+	 * Index documents without a name or without a matching node (for
+	 * example from a reused, older index) are skipped.
+	 * @param input the graph whose nodes are updated
+	 * @return the graph that was passed in
+	 */
+	public DependencyGraph assignLuceneNumbers(DependencyGraph input) {
+
+		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		FSDirectory store = null;
+		IndexReader reader = null;
+		try {
+			store = SimpleFSDirectory.open(indexDir);
+			reader = DirectoryReader.open(store);
+			IndexSearcher searcher = new IndexSearcher(reader);
+
+			// NOTE(review): numbering 0..numDocs-1 assumes an index without
+			// deleted documents - confirm if documents are ever deleted.
+			int numdocs = reader.numDocs();
+
+			for (int i = 0; i < numdocs; i++) {
+				Document doc = searcher.doc(i);
+				if (doc.getField("name") == null) {
+					continue;
+				}
+				String key = doc.getField("name").stringValue();
+
+				DependencyNode d = input.get(key);
+				if (d == null) {
+					// the index knows a file that the graph does not
+					continue;
+				}
+
+				d.setLuceneDocNumber(i);
+				input.put(d.getCompleteFilepath(), d);
+			}
+
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			// release reader and store on every path, including failures
+			try {
+				try {
+					if (reader != null) {
+						reader.close();
+					}
+				} finally {
+					if (store != null) {
+						store.close();
+					}
+				}
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		}
+
+		return input;
+
+	}
+	
+}
+	
+

File src/main/java/nl/vu/recoprov/ProvDMtranslator.java

+package nl.vu.recoprov;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+
+import javax.xml.namespace.QName;
+
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
+import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
+import nl.vu.recoprov.utils.TransitiveClosure;
+
+import org.openprovenance.prov.dot.ProvToDot;
+import org.openprovenance.prov.xml.Activity;
+import org.openprovenance.prov.xml.Agent;
+import org.openprovenance.prov.xml.Document;
+import org.openprovenance.prov.xml.Entity;
+import org.openprovenance.prov.xml.EntityRef;
+import org.openprovenance.prov.xml.InternationalizedString;
+import org.openprovenance.prov.xml.ProvFactory;
+import org.openprovenance.prov.xml.SpecializationOf;
+import org.openprovenance.prov.xml.Statement;
+import org.openprovenance.prov.xml.WasDerivedFrom;
+//import org.openprovenance.prov.rdf.RdfConstructor;
+
+// prov to dot uses the XML representation
+
+//import org.openprovenance.prov.rdf.Entity;
+//import org.openprovenance.prov.rdf.Derivation;
+//import org.openprovenance.prov.rdf.Revision;
+//import org.openprovenance.prov.rdf.EntityInvolvement;
+//import org.openprovenance.prov.rdf.TimeInstant;
+
+/// track the time in which an entity was generated
+
+
+// further annotation is in Notes - an entity can be linked to a note by hasAnnotation
+
+// entity attributes - [attr1=val] prov:type="document'
+// derivation attributes - prov:type = "physical transform" wasDerivedFrom
+// types of derivation:
+// revision - newer and older   wasRevisionOf
+// quotation - quote(partial copy), original     wasQuotedFrom
+// original source  - derived (entity), source   hadOriginalSource
+
+// derivation is a particular form of Trace - tracedTo - entity, ancestor
+
+// specialization is a more constrained entity  specializationOf
+
+// attrib - prov:label, prov:type, prov:value (score)
+
+
+public class ProvDMtranslator {
+
+	private HashMap<Integer, Entity> listOfAvailableEntities = new HashMap<Integer, Entity> ();
+	private Collection<Statement> listOfAvailableRelations = new ArrayList<Statement> ();
+	private boolean useTred = false;
+	
+		/**
+		 * Translates the graph, passing "graph.gv" as the graph file name,
+		 * and logs the start and end of the call.
+		 * @param input the graph to translate
+		 * @return the result of the two-argument translation
+		 */
+		public String translate(DependencyGraph input){
+			System.out.println("Got called...");
+			final String translated = translate(input, "graph.gv");
+			System.out.println("Got someting back");
+			return translated;
+		}
+	 
+	 
+	public String translate(DependencyGraph input,String graphfilename) {
+		System.out.println("Translate to PROVDM.");
+		ProvFactory factory = new ProvFactory();
+		Document container;
+
+		for (String name: input.keySet()){
+			
+			// for each node take the edges
+			DependencyNode d = input.get(name);
+			
+			EntityRef originEntity = getEntityRefFromDependencyNode(factory, d);
+			
+			ArrayList<LabelledEdge> edgearray = input.getAllEdges(d.getLuceneDocNumber());
+			
+			if(edgearray == null)
+				continue;
+
+			
+			for(LabelledEdge edge: edgearray){
+				
+				double score = edge.getScore();
+				if(score <= 0.0)
+					continue;
+				
+				if(edge.getLabel().equals(WeightedSumAggregator.FINAL_SCORE) || edge.getLabel().equals(TransitiveClosure.INFERRED)){
+					DependencyNode d2 = input.get(edge.getId());
+					EntityRef generatedEntity = getEntityRefFromDependencyNode(factory, d2);
+					
+					String scoreString = ""+score;
+					
+					WasDerivedFrom derivationRelation = factory.newWasDerivedFrom(scoreString, generatedEntity, originEntity);
+//					derivationRelation.setGeneratedEntity(generatedEntity);
+//					derivationRelation.setUsedEntity(originEntity);
+					
+				
+					derivationRelation.getType().add(WeightedSumAggregator.FINAL_SCORE);
+					listOfAvailableRelations.add(derivationRelation);}
+				else if (edge.getLabel().equals(MetadataSimilaritySignal.REVISION_SIMILARITY)){
+					DependencyNode d2 = input.get(edge.getId());
+					EntityRef generatedEntity = getEntityRefFromDependencyNode(factory, d2);
+				
+					
+					SpecializationOf revisionRelation = new SpecializationOf();
+//					revisionRelation.setSpecializedEntity(generatedEntity);
+//					revisionRelation.setGeneralEntity(originEntity);
+
+					
+					//TODO: we remove for graph purposes
+				
+					
+					listOfAvailableRelations.add(revisionRelation);
+					}
+				
+			}	
+		}
+		
+		System.out.println("Going to build a dot file");
+		
+		ProvToDot provtodot = new ProvToDot();
+				
+		container = factory.newDocument( new LinkedList<Activity>(), listOfAvailableEntities.values(),   new LinkedList<Agent>(), listOfAvailableRelations);
+		try {
+			provtodot.convert(container, new File(graphfilename));
+			
+			//System.out.println(graphfilename);
+			
+			//Transitive reduction
+			if (useTred) {
+				String cmd = "tred " + graphfilename;
+				BufferedWriter fout = new BufferedWriter(new FileWriter("Tred"
+						+ graphfilename));
+
+				Runtime run = Runtime.getRuntime();
+				Process pr = run.exec(cmd);
+				pr.waitFor();
+				BufferedReader buf = new BufferedReader(new InputStreamReader(
+						pr.getInputStream()));
+				String line = "";
+				String out = "";
+				while ((line = buf.readLine()) != null) {
+					System.out.println(line);
+					out = out + line + "\n";
+					fout.write(line);
+				}
+
+				fout.flush();
+				fout.close();
+				buf.close();
+				pr.destroy();
+
+			}
+			String svg = "didn't work";
+		    // svg = convertToSVG("graph2.gv");
+			
+
+			listOfAvailableEntities = new HashMap<Integer, Entity> ();
+			listOfAvailableRelations = new ArrayList<Statement> ();
+			
+			return svg;
+
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+	
+		
+		return "didn't work";
+	}
+	
+	
+	public String convertToSVG(String filename) throws Exception
+	{
+		String cmd = "dot -Tsvg " + filename  ;
+		
+		Runtime run = Runtime.getRuntime();
+		Process pr = run.exec(cmd);
+		pr.waitFor();
+		BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream())); 
+		String line = ""; 
+		String out = "";
+		while ((line=buf.readLine())!=null) { 
+				System.out.println(line); 
+				out = out + line + "\n";
+				
+		} 
+
+		buf.close();
+		return out;
+	}
+	
+	public  EntityRef getEntityRefFromDependencyNode(ProvFactory factory, DependencyNode d){
+		EntityRef ref = new EntityRef();
+		
+		Entity entity = listOfAvailableEntities.get(d.getLuceneDocNumber());
+		if(entity == null){
+			entity = factory.newEntity(""+d.getLuceneDocNumber(), ""+d.getLuceneDocNumber());
+			//entity.setId(new QName(d.getCompleteFilepath()));
+			entity.getType().add("document");
+			//entity.getAny().add(d.getCompleteFilepath());
+			listOfAvailableEntities.put(d.getLuceneDocNumber(), entity);
+		}		
+		
+		ref.setRef(entity.getId());
+		return ref;
+	}
+		
+
+}

File src/main/java/nl/vu/recoprov/SearchCache.java

+package nl.vu.recoprov;
+
+import com.dropbox.client2.exception.DropboxException;
+import com.dropbox.client2.session.Session;
+import com.dropbox.client2.session.WebAuthSession;
+import com.dropbox.client2.session.AppKeyPair;
+import com.dropbox.client2.session.AccessTokenPair;
+import com.dropbox.client2.DropboxAPI;
+import com.dropbox.client2.DropboxAPI.DeltaEntry;
+
+
+import com.dropbox.client2.jsonextract.*;
+
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+public class SearchCache
+{
+    
+
+    public static void main(String[] args)
+        throws DropboxException
+    {
+        if (args.length == 0) {
+            printUsage(System.out);
+            throw die();
+        }
+
+        String command = args[0];
+        if (command.equals("link")) {
+            doLink(args);
+        }
+        else if (command.equals("update")) {
+            doUpdate(args);
+        }
+        else if (command.equals("find")) {
+            doFind(args);
+        }
+        else if (command.equals("reset")) {
+            doReset(args);
+        }
+        else {
+            System.err.println("ERROR: Unknown command: \"" + command + "\"");
+            System.err.println("Run with no arguments for help.");
+            throw die();
+        }
+    }
+    
+   
+    
+
+    /**
+     * Handles the "link" command: runs the web authorization flow for the app
+     * key and secret given in args[1] and args[2], then saves a fresh state
+     * (keys, access token, empty tree) to the state file.
+     *
+     * @param args the full command line; must have exactly three elements
+     * @throws DropboxException if a Dropbox call fails
+     */
+    private static void doLink(String[] args)
+        throws DropboxException
+    {
+        if (args.length != 3) {
+            throw die("ERROR: \"link\" takes exactly two arguments.");
+        }
+
+        AppKeyPair appKeyPair = new AppKeyPair(args[1], args[2]);
+        WebAuthSession was = new WebAuthSession(appKeyPair, Session.AccessType.APP_FOLDER);
+
+        // Make the user log in and authorize us.
+        WebAuthSession.WebAuthInfo info = was.getAuthInfo();
+        System.out.println("1. Go to: " + info.url);
+        System.out.println("2. Allow access to this app.");
+        System.out.println("3. Press ENTER.");
+
+        try {
+            // read() returns -1 at end of stream. Without that check a closed
+            // or exhausted stdin would make this loop spin forever.
+            int c;
+            do {
+                c = System.in.read();
+            } while (c != '\n' && c != -1);
+        }
+        catch (IOException ex) {
+            throw die("I/O error: " + ex.getMessage());
+        }
+
+        // This will fail if the user didn't visit the above URL and hit 'Allow'.
+        was.retrieveWebAccessToken(info.requestTokenPair);
+        AccessTokenPair accessToken = was.getAccessTokenPair();
+        System.out.println("Link successful.");
+
+        // Save state
+        State state = new State(appKeyPair, accessToken, new Content.Folder());
+        state.save(ConfigurationDefaults.STATE_FILE);
+    }
+    
+   
+
+    /**
+     * Handles the "update" command: pulls delta pages from Dropbox, applies
+     * them to the cached tree and saves the state if anything changed.
+     *
+     * @param args the full command line; args[1], when present, limits the
+     *        number of delta pages fetched (a negative value means no limit)
+     * @throws DropboxException if a Dropbox call fails
+     */
+    private static void doUpdate(String[] args)
+        throws DropboxException
+    {
+        int pageLimit;
+        if (args.length == 2) {
+            try {
+                pageLimit = Integer.parseInt(args[1]);
+            }
+            catch (NumberFormatException ex) {
+                // Report bad input like every other argument error instead of
+                // letting the exception escape as a stack trace.
+                throw die("ERROR: \"update\" page limit must be an integer: \"" + args[1] + "\"");
+            }
+        }
+        else if (args.length == 1) {
+            pageLimit = -1;
+        }
+        else {
+            throw die("ERROR: \"update\" takes either zero or one arguments.");
+        }
+
+        // Load state.
+        State state = State.load(ConfigurationDefaults.STATE_FILE);
+
+        // Connect to Dropbox.
+        WebAuthSession session = new WebAuthSession(state.appKey, WebAuthSession.AccessType.APP_FOLDER);
+        session.setAccessTokenPair(state.accessToken);
+        DropboxAPI<?> client = new DropboxAPI<WebAuthSession>(session);
+
+        int pageNum = 0;
+        boolean changed = false;
+        String cursor = state.cursor;
+        while (pageLimit < 0 || (pageNum < pageLimit)) {
+            // Get /delta results from Dropbox
+            DropboxAPI.DeltaPage<DropboxAPI.Entry> page = client.delta(cursor);
+            pageNum++;
+            if (page.reset) {
+                // The page asks for a reset: drop everything cached so far.
+                state.tree.children.clear();
+                changed = true;
+            }
+            // Apply the entries one by one.
+            for (DeltaEntry<DropboxAPI.Entry> e : page.entries) {
+                applyDelta(state.tree, e);
+                changed = true;
+            }
+            cursor = page.cursor;
+            if (!page.hasMore) break;
+        }
+
+        // Save state.
+        if (changed) {
+            state.cursor = cursor;
+            state.save(ConfigurationDefaults.STATE_FILE);
+        }
+        else {
+            System.out.println("No updates.");
+        }
+    }
+
+    private static void printUsage(PrintStream out)
+    {
+        out.println("Usage:");
+        out.println("    ./run link <app-key> <secret>  Link a user's account to the given app.");
+        out.println("    ./run update                   Update cache to the latest on Dropbox.");
+        out.println("    ./run update <num>             Update cache, limit to <num> pages of updates.");
+        out.println("    ./run find <term>              Search cache for <term> (case-sensitive).");
+        out.println("    ./run find                     Display entire cache.");
+        out.println("    ./run reset                    Delete the cache.");
+    }
+
+    private static RuntimeException die(String message)
+    {
+        System.err.println(message);
+        return die();
+    }
+
+    private static RuntimeException die()
+    {
+        System.exit(1);
+        return new RuntimeException();
+    }
+
+    // ------------------------------------------------------------------------
+    // Apply delta entries to the tree.
+
+    private static void applyDelta(Content.Folder parent, DeltaEntry<DropboxAPI.Entry> e)
+    {
+        Path path = Path.parse(e.lcPath);
+        DropboxAPI.Entry md = e.metadata;
+
+        if (md != null) {
+            System.out.println("+ " + e.lcPath);
+            // Traverse down the tree until we find the parent of the entry we
+            // want to add.  Create any missing folders along the way.
+            for (String b : path.branch) {
+                Node n = getOrCreateChild(parent, b);
+                if (n.content instanceof Content.Folder) {
+                    parent = (Content.Folder) n.content;
+                } else {
+                    // No folder here, automatically create an empty one.
+                    n.content = parent = new Content.Folder();
+                }
+            }
+
+            // Create the file/folder here.
+            Node n = getOrCreateChild(parent, path.leaf);
+            n.path = md.path;  // Save the un-lower-cased path.
+            if (md.isDir) {
+                // Only create an empty folder if there isn't one there already.
+                if (!(n.content instanceof Content.Folder)) {
+                    n.content = new Content.Folder();
+                }
+            }
+            else {
+                n.content = new Content.File(md.size, md.modified);
+            }
+        }
+        else {
+            System.out.println("- " + e.lcPath);
+            // Traverse down the tree until we find the parent of the entry we
+            // want to delete.
+            boolean missingParent = false;
+            for (String b : path.branch) {
+                Node n = parent.children.get(b);
+                if (n != null && n.content instanceof Content.Folder) {
+                    parent = (Content.Folder) n.content;
+                } else {
+                    // If one of the parent folders is missing, then we're done.
+                    missingParent = true;
+                    break;
+                }
+            }
+
+            if (!missingParent) {
+                parent.children.remove(path.leaf);
+            }
+        }
+    }
+
+    /**
+     * Returns the child stored under the given name, first inserting an empty
+     * node (no path, no content) if the folder has none.
+     */
+    private static Node getOrCreateChild(Content.Folder folder, String lowercaseName)
+    {
+        Node existing = folder.children.get(lowercaseName);
+        if (existing != null) {
+            return existing;
+        }
+        Node created = new Node(null, null);
+        folder.children.put(lowercaseName, created);
+        return created;
+    }
+
+    /**
+     * Represent a path as a list of ancestors and a leaf name.
+     *
+     * For example, "/a/b/c" -> Path(["a", "b"], "c")
+     */
+    public static final class Path
+    {
+        /** Names of the ancestor folders, outermost first. */
+        public final String[] branch;
+        /** Name of the last path component. */
+        public final String leaf;
+
+        public Path(String[] branch, String leaf)
+        {
+            assert branch != null;
+            assert leaf != null;
+            this.branch = branch;
+            this.leaf = leaf;
+        }
+
+        /**
+         * Splits an absolute, "/"-separated path into its ancestors and leaf.
+         *
+         * @param s the path; must start with "/" and name at least one component
+         * @return the parsed path
+         * @throws IllegalArgumentException if s is null, does not start with
+         *         "/", or has no component after the leading "/" (for example
+         *         "/" itself)
+         */
+        public static Path parse(String s)
+        {
+            // Checked explicitly rather than with assert: with assertions
+            // disabled a relative path would silently lose its first component.
+            if (s == null || !s.startsWith("/")) {
+                throw new IllegalArgumentException("Path must start with \"/\": " + s);
+            }
+
+            String[] parts = s.split("/");
+            // parts[0] is the empty string before the leading "/". split()
+            // drops trailing empty strings, so "/" yields an empty array,
+            // which would otherwise lead to a negative array size below.
+            if (parts.length < 2) {
+                throw new IllegalArgumentException("Path has no leaf component: \"" + s + "\"");
+            }
+
+            String[] branch = new String[parts.length-2];
+            System.arraycopy(parts, 1, branch, 0, branch.length);
+            String leaf = parts[parts.length-1];
+            return new Path(branch, leaf);
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // Search through the tree.
+
+    private static void doFind(String[] args)
+        throws DropboxException
+    {
+        String term;
+        if (args.length == 1) {
+            term = "";
+        }
+        else if (args.length == 2) {
+            term = args[1];
+        }
+        else {
+            throw die("ERROR: \"find\" takes either zero or one arguments");
+        }
+
+        // Load cached state.
+        State state = State.load(ConfigurationDefaults.STATE_FILE);
+
+        ArrayList<String> results = new ArrayList<String>();
+        searchTree(results, state.tree, term);
+        for (String r : results) {
+            System.out.println(r);
+        }
+        if (results.isEmpty()) {
+            System.out.println("[No matches.]");
+        }
+    }
+
+    private static void searchTree(ArrayList<String> results, Content.Folder tree, String term)
+    {
+        for (Map.Entry<String,Node> child : tree.children.entrySet()) {
+            Node n = child.getValue();
+            String path = n.path;
+            if (path != null && path.contains(term)) {
+                if (n.content instanceof Content.Folder) {
+                    results.add(path);
+                }
+                else if (n.content instanceof Content.File) {
+                    Content.File f = (Content.File) n.content;
+                    results.add(path + " (" + f.size + ", " + f.lastModified + ")");
+                }
+                else {
+                    throw new AssertionError("bad type: " + n.content);
+                }
+            }
+            // Recurse on children.
+            if (n.content instanceof Content.Folder) {
+                Content.Folder f = (Content.Folder) n.content;
+                searchTree(results, f, term);
+            }
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // Reset state
+
+    private static void doReset(String[] args)
+        throws DropboxException
+    {
+        if (args.length != 1) {
+            throw die("ERROR: \"reset\" takes no arguments");
+        }
+
+        // Load state.
+        State state = State.load(ConfigurationDefaults.STATE_FILE);
+
+        // Clear state.
+        state.tree.children.clear();
+        state.cursor = null;
+
+        // Save state back.
+        state.save(ConfigurationDefaults.STATE_FILE);
+    }
+
+    // ------------------------------------------------------------------------
+    // State model (load+save to JSON)
+
+    public static final class State
+    {
+        public final AppKeyPair appKey;
+        public final AccessTokenPair accessToken;
+        public final Content.Folder tree;
+
+        public State(AppKeyPair appKey, AccessTokenPair accessToken, Content.Folder tree)
+        {
+            this.appKey = appKey;
+            this.accessToken = accessToken;
+            this.tree = tree;
+        }
+
+        public String cursor;
+
+        public void save(String fileName)
+        {
+            JSONObject jstate = new JSONObject();
+
+            // Convert app key
+            JSONArray japp = new JSONArray();
+            japp.add(appKey.key);
+            japp.add(appKey.secret);
+            jstate.put("app_key", japp);
+
+            // Convert access token
+            JSONArray jaccess = new JSONArray();
+            jaccess.add(accessToken.key);
+            jaccess.add(accessToken.secret);
+            jstate.put("access_token", jaccess);
+
+            // Convert tree
+            JSONObject jtree = tree.toJson();
+            jstate.put("tree", jtree);
+
+            // Convert cursor, if present.
+            if (cursor != null) {
+                jstate.put("cursor", cursor);
+            }
+
+            try {
+                FileWriter fout = new FileWriter(fileName);
+                try {
+                    jstate.writeJSONString(fout);
+                }
+                finally {
+                    fout.close();
+                }
+            }
+            catch (IOException ex) {
+                throw die("ERROR: unable to save to state file \"" + fileName + "\": " + ex.getMessage());
+            }
+        }
+
+        public static State load(String fileName)
+        {
+            JsonThing j;
+            try {
+                FileReader fin = new FileReader(fileName);
+                try {
+                    j = new JsonThing(new JSONParser().parse(fin));
+                } catch (ParseException ex) {
+                    throw die("ERROR: State file \"" + fileName + "\" isn't valid JSON: " + ex.getMessage());
+                } finally {
+                    fin.close();
+                }
+            }
+            catch (IOException ex) {
+                throw die("ERROR: unable to load state file \"" + fileName + "\": " + ex.getMessage());
+            }
+
+            try {
+                JsonMap jm = j.expectMap();
+
+                JsonList japp = jm.get("app_key").expectList();
+                AppKeyPair appKey = new AppKeyPair(japp.get(0).expectString(), japp.get(1).expectString());
+
+                JsonList jaccess = jm.get("access_token").expectList();
+                AccessTokenPair accessToken = new AccessTokenPair(jaccess.get(0).expectString(), jaccess.get(1).expectString());
+
+                JsonMap jtree = jm.get("tree").expectMap();
+                Content.Folder tree = Content.Folder.fromJson(jtree);
+
+                State state = new State(appKey, accessToken, tree);
+
+                JsonThing jcursor = jm.getMaybe("cursor");
+                if (jcursor != null) {
+                    state.cursor = jcursor.expectString();
+                }
+
+                return state;
+            }
+            catch (JsonExtractionException ex) {
+                throw die ("ERROR: State file has incorrect structure: " + ex.getMessage());
+            }
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // We represent our local cache as a tree of 'Node' objects.
+    /**
+     * One entry of the cached tree: the entry's original path plus its
+     * content. Serialized as a two-element JSON array [path, content].
+     */
+    public static final class Node
+    {
+        /**
+         * The original path of the file.  We track this separately because
+         * Folder.children only contains lower-cased names.
+         */
+        public String path;
+
+        /**
+         * The node content (either Content.File or Content.Folder)
+         */
+        public Content content;
+
+        public Node(String path, Content content)
+        {
+            this.path = path;
+            this.content = content;
+        }
+
+        /** Returns [path, content-as-JSON]. */
+        public final JSONArray toJson()
+        {
+            final Object serializedContent = content.toJson();
+            JSONArray pair = new JSONArray();
+            pair.add(path);
+            pair.add(serializedContent);
+            return pair;
+        }
+
+        /**
+         * Rebuilds a node from the array written by toJson(). A list in the
+         * content slot is read as a file, a map as a folder; anything else is
+         * rejected.
+         */
+        public static Node fromJson(JsonThing t)
+            throws JsonExtractionException
+        {
+            JsonList parts = t.expectList();
+            String storedPath = parts.get(0).expectStringOrNull();
+            JsonThing rawContent = parts.get(1);
+
+            if (rawContent.isList()) {
+                return new Node(storedPath, Content.File.fromJson(rawContent.expectList()));
+            }
+            if (rawContent.isMap()) {
+                return new Node(storedPath, Content.Folder.fromJson(rawContent.expectMap()));
+            }
+            throw rawContent.unexpected();
+        }
+    }
+
+    /**
+     * What a tree node holds: either a Folder (a map of child nodes) or a
+     * File (size and modification stamp).
+     */
+    public static abstract class Content
+    {
+        /** Returns the JSON form of this content. */
+        public abstract Object toJson();
+
+        /** A folder; serialized as a JSON object mapping child name to child node. */
+        public static final class Folder extends Content
+        {
+            public final HashMap<String,Node> children = new HashMap<String,Node>();
+
+            @Override
+            public JSONObject toJson()
+            {
+                JSONObject serialized = new JSONObject();
+                for (String childName : children.keySet()) {
+                    serialized.put(childName, children.get(childName).toJson());
+                }
+                return serialized;
+            }
+
+            public static Folder fromJson(JsonMap j)
+                throws JsonExtractionException
+            {
+                Folder restored = new Folder();
+                for (Map.Entry<String,JsonThing> member : j) {
+                    Node child = Node.fromJson(member.getValue());
+                    restored.children.put(member.getKey(), child);
+                }
+                return restored;
+            }
+        }
+
+        /** A file; serialized as the JSON array [size, lastModified]. */
+        public static final class File extends Content
+        {
+            public final String size;
+            public final String lastModified;
+
+            public File(String size, String lastModified)
+            {
+                this.size = size;
+                this.lastModified = lastModified;
+            }
+
+            @Override
+            public JSONArray toJson()
+            {
+                JSONArray serialized = new JSONArray();
+                serialized.add(size);
+                serialized.add(lastModified);
+                return serialized;
+            }
+
+            public static File fromJson(JsonList l)
+                throws JsonExtractionException
+            {
+                String storedSize = l.get(0).expectString();
+                String storedModified = l.get(1).expectString();
+                return new File(storedSize, storedModified);
+            }
+        }
+    }
+
+}

File src/main/java/nl/vu/recoprov/TikaReader.java

+package nl.vu.recoprov;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.sql.Date;
+