Sara Magliacane avatar Sara Magliacane committed 5b86056

re-adding the custom analyzer

Comments (0)

Files changed (31)

dropbox-java-sdk-1.3/src/com/dropbox/client2/DropboxAPI.java

 
         String url_path = "/copy_ref/" + session.getAccessType() + sourcePath;
 
-        @SuppressWarnings("unchecked")
         Object result = RESTUtility.request(RequestMethod.GET,
                 session.getAPIServer(), url_path, VERSION, params, session);
 

dropbox-java-sdk-1.3/src/com/dropbox/client2/jsonextract/JsonExtractionException.java

 
 public final class JsonExtractionException extends Exception {
 
-    public JsonExtractionException(String path, String message, Object value) {
+	private static final long serialVersionUID = 1L;
+
+	public JsonExtractionException(String path, String message, Object value) {
         super((path == null ? "" : path + ": ") + message +
               (value == null ? "" : ": " + summarizeValue(value)));
     }
             return buf.toString();
         }
         else if (value instanceof java.util.List) {
+            @SuppressWarnings("rawtypes")
             List<?> list = (List) value;
             if (list.isEmpty()) {
                 return "[]";

dropbox-java-sdk-1.3/src/com/dropbox/client2/jsonextract/JsonThing.java

         super(internal, null);
     }
 
-    private static final java.util.HashMap<Class,String> TypeNames = new java.util.HashMap<Class,String>();
+    @SuppressWarnings("rawtypes")
+	private static final java.util.HashMap<Class,String> TypeNames = new java.util.HashMap<Class,String>();
     static {
         TypeNames.put(String.class, "a string");
         TypeNames.put(Number.class, "a number");

dropbox-java-sdk-1.3/src/com/dropbox/client2/session/AppKeyPair.java

  */
 public final class AppKeyPair extends TokenPair {
 
-    public AppKeyPair(String key, String secret) {
+	private static final long serialVersionUID = -5526503075188547139L;
+
+	public AppKeyPair(String key, String secret) {
         super(key, secret);
     }
 }

dropbox-java-sdk-1.3/src/com/dropbox/client2/session/TokenPair.java

 public abstract class TokenPair implements Serializable
 {
 
-    /**
+	private static final long serialVersionUID = 1L;
+
+	/**
      * The "key" portion of the pair.  For example, the "consumer key",
      * "request token", or "access token".  Will never contain the "|"
      * character.
 	<version>4.2.1</version>
 </dependency>
 <dependency>
+	<groupId>org.apache.solr</groupId>
+	<artifactId>solr-core</artifactId>
+	<version>4.2.1</version>
+</dependency>
+<dependency>
+	<groupId>org.apache.solr</groupId>
+	<artifactId>solr-analysis-extras</artifactId>
+	<version>4.2.1</version>
+</dependency>            
+<dependency>
 	<groupId>org.apache.commons</groupId>
 	<artifactId>commons-io</artifactId>
 	<version>1.3.2</version>

src/nl/vu/recoprov/CompletePipeline.java

 package nl.vu.recoprov;
 
+/**
+ * The complete pipeline class calls all the components of the system in the right order.
+ */
 
 import java.io.BufferedWriter;
 import java.io.File;
-
 import java.io.FileWriter;
 import java.io.IOException;
-
 import nl.vu.recoprov.ProvDMtranslator;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 
 		// System.out.println(depGraph.toCSVString());
 
-		// AGGREGATE
 		depGraph = pipeline.aggregateSignals(depGraph);
 
 		System.out.println(depGraph);
 		pipeline.writeToFile(depGraph);
 
 
-		// OUTPUT DOT
 		pipeline.translateToPROVDM(depGraph);
 		
 

src/nl/vu/recoprov/DropboxClient.java

 package nl.vu.recoprov;
 
+/**
+ * Dropbox client based on the example client in the SDK
+ */
+
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
-
 import com.dropbox.client2.DropboxAPI;
 import com.dropbox.client2.DropboxAPI.Entry;
 import com.dropbox.client2.exception.DropboxException;
 
 public class DropboxClient extends SearchCache {
 
+	private String APPKEY;
+	private String SECRET;
 
-	private static ArrayList<String> directories = new ArrayList<String>();
-
-	public String APPKEY;
-	public String SECRET;
-
+	/**
+	 * Get permission to access a person's Dropbox folder.
+	 * @throws DropboxException
+	 */
 	public void linkToAccount() throws DropboxException {
+		
 		File f = new File(ConfigurationDefaults.STATE_FILE);
 		AppKeyPair appKeyPair = null;
 
 				f.createNewFile();
 				appKeyPair = readSecretFile(ConfigurationDefaults.SECRET_FILE);
 			} catch (IOException e) {
-				// TODO Auto-generated catch block
+				System.err.println("Could not read secret file: " + ConfigurationDefaults.SECRET_FILE);
 				e.printStackTrace();
 			}
 
 		state.save(ConfigurationDefaults.STATE_FILE);
 	}
 
+	/**
+	 * Read the file containing the APPKEY and SECRET
+	 * @param filename
+	 * @return the pair APPKEY and SECRET
+	 */
 	private AppKeyPair readSecretFile(String filename) {
 		System.out.println("Reading secret file: " + filename);
 		final File file = new File(filename);
 
 	}
 
+	/**
+	 * Get all reviews of the files for the complete Dropbox 
+	 * @param depGraph
+	 * @return
+	 * @throws DropboxException
+	 */
 	public DependencyGraph getAllRevs(DependencyGraph depGraph)
 			throws DropboxException {
 		return getAllRevs(depGraph, "");
 	}
 
+	/**
+	 * Get all revisions of the files for a specific Dropbox folder
+	 * @param depGraph
+	 * @param dropboxFolder
+	 * @return
+	 * @throws DropboxException
+	 */
 	public DependencyGraph getAllRevs(DependencyGraph depGraph,
 			String dropboxFolder) throws DropboxException {
 
 		return temp;
 	}
 
+	/**
+	 * Used for debugging purposes - prints out the metadata
+	 * @param result
+	 */
 	private static void printMetadata(List<Entry> result) {
 		for (Entry e : result) {
 			System.out.println(e.fileName() + "\n" + e.rev + " " + e.modified

src/nl/vu/recoprov/ImageReader.java

 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
 			e2.printStackTrace();
 		}
 		
-		Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_42);
-		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, analyzer );
+		Analyzer analyzer = new EnglishAnalyzer(ConfigurationDefaults.LUCENE_VERSION);
+		IndexWriterConfig config = new IndexWriterConfig(ConfigurationDefaults.LUCENE_VERSION, analyzer );
 		
 		IndexWriter writer = null;
 		try {
 		try{
 			document = PDDocument.load(f);
 		}catch(IOException e){
-			
-			if (document != null)
-				document.close();
-			
+				
 			return input;
 		}
 		List<PDPage> pages = document.getDocumentCatalog().getAllPages();
 		File newfile = new File(name );
 		//image.write2file( newfile );
 		try {
-			Sanselan.writeImage(image, newfile, ImageFormat.IMAGE_FORMAT_PNG, new HashMap());
+			Sanselan.writeImage(image, newfile, ImageFormat.IMAGE_FORMAT_PNG, null);
 		} catch (ImageWriteException e) {
 			// TODO Auto-generated catch block
 			e.printStackTrace();
 		}
 
 		Document doc = docbuilder.createDocument(image, name);
-		doc.add(new Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
-		doc.add(new Field("originalFilename", d.getCompleteFilepath() , Field.Store.YES, Field.Index.NOT_ANALYZED));
+		doc.add(new StringField("name", name, Field.Store.YES));
+		doc.add(new StringField("originalFilename", d.getCompleteFilepath() , Field.Store.YES));
 		writer.addDocument(doc);
 		
 		image.flush();
 		return d;
 	}
 	
+	
+	/**
+	 * Check if the image is empty (completely white), 
+	 * so we don't add it to the index.
+	 * @param img
+	 * @return true = image is completely white
+	 */
 	public static Boolean isImageWhite(BufferedImage img){
 		int width = img.getWidth()-1;
 		int height = img.getHeight()-1;
 		Boolean white = true;
 		int whiteColor = Color.WHITE.getRGB();
-		int blackColor = Color.BLACK.getRGB();
 		
 		for (int i = 0; i < width; i++){
 			for (int j = 0; j < height; j++){

src/nl/vu/recoprov/LuceneIndexer.java

 
 import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.Set;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
+import nl.vu.recoprov.utils.CustomAnalyzer;
 import nl.vu.recoprov.utils.CustomFileReader;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
+
 
 public class LuceneIndexer {
 
 		try {
 			FSDirectory store = SimpleFSDirectory.open(indexDir);
 
-			Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_42);
+			Analyzer analyzer = new CustomAnalyzer(ConfigurationDefaults.LUCENE_VERSION);
 //			Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
 //			fieldAnalyzers.put("raw-contents", new KeywordAnalyzer());
 //
 //			PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(
 //					analyzer, fieldAnalyzers);
 
-			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42,
+			IndexWriterConfig config = new IndexWriterConfig(ConfigurationDefaults.LUCENE_VERSION,
 					analyzer);
 
 			IndexWriter writer = new IndexWriter(store, config);
 				}
 
 				Document doc = new Document();
-				doc.add(new Field("name", node.getCompleteFilepath(),
-						Field.Store.YES, Field.Index.NOT_ANALYZED));
-				doc.add(new Field("path", node.getCompleteFilepath(),
-						Field.Store.YES, Field.Index.NOT_ANALYZED));
+				doc.add(new StringField("name", node.getCompleteFilepath(),Field.Store.YES));
+				doc.add(new StringField("path", node.getCompleteFilepath(),
+						Field.Store.YES));
 				if ((node.getContent() != null)
 						&& (!node.getContent().isEmpty())) {
-					doc.add(new Field("contents", new String(content),
-							Field.Store.YES, Field.Index.ANALYZED,
-							Field.TermVector.WITH_POSITIONS_OFFSETS));
-					doc.add(new Field("raw-contents", new String(content),
-							Field.Store.YES, Field.Index.ANALYZED,
-							Field.TermVector.WITH_POSITIONS_OFFSETS));
+					doc.add(new TextField("contents", new String(content),
+							Field.Store.YES));
+					doc.add(new TextField("raw-contents", new String(content),
+							Field.Store.YES));
 				} else {
 					System.out.println("No content in file "
 							+ node.getCompleteFilepath());
 		try {
 			store = SimpleFSDirectory.open(indexDir);
 
-			IndexReader reader = IndexReader.open(store);
+			IndexReader reader = DirectoryReader.open(store);
 			IndexSearcher searcher = new IndexSearcher(reader);
 			int numdocs = reader.numDocs();
 

src/nl/vu/recoprov/ProvDMtranslator.java

 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import org.openprovenance.prov.dot.ProvToDot;
 import org.openprovenance.prov.xml.Activity;
 import org.openprovenance.prov.xml.Agent;
-import org.openprovenance.prov.xml.Attribute;
 import org.openprovenance.prov.xml.Document;
 import org.openprovenance.prov.xml.Entity;
 import org.openprovenance.prov.xml.EntityRef;
 	 
 	 
 	public String translate(DependencyGraph input,String graphfilename) {
-		System.out.println("Got called 2...");
+		System.out.println("Translate to PROVDM.");
 		ProvFactory factory = new ProvFactory();
 		Document container;
 
 			//Transitive reduction
 			
 			String cmd = "tred "+ graphfilename ;
-			BufferedWriter fout = new BufferedWriter(new FileWriter("graph2.gv"));
+			BufferedWriter fout = new BufferedWriter(new FileWriter("graphTred.gv"));
 		
 			Runtime run = Runtime.getRuntime();
 			Process pr = run.exec(cmd);

src/nl/vu/recoprov/SearchCache.java

 import com.dropbox.client2.session.AccessTokenPair;
 import com.dropbox.client2.DropboxAPI;
 import com.dropbox.client2.DropboxAPI.DeltaEntry;
-import com.dropbox.client2.DropboxAPI.Entry;
+
 
 import com.dropbox.client2.jsonextract.*;
 
 import java.io.*;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 
 public class SearchCache

src/nl/vu/recoprov/TikaReader.java

 package nl.vu.recoprov;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileWriter;

src/nl/vu/recoprov/abstractclasses/SignalAggregator.java

 package nl.vu.recoprov.abstractclasses;
-
+/**
+ * Abstract class for the signal aggregation phase
+ */
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 
 public abstract class SignalAggregator {

src/nl/vu/recoprov/abstractclasses/SignalDetector.java

 package nl.vu.recoprov.abstractclasses;
-
+/**
+ * Abstract class for the signal detection phase 
+ */
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 
 public abstract class SignalDetector {

src/nl/vu/recoprov/abstractclasses/SignalFilterer.java

 package nl.vu.recoprov.abstractclasses;
-
+/**
+ * Abstract class for the signal filter phase
+ */
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 
 public abstract class SignalFilterer {

src/nl/vu/recoprov/baseclasses/DependencyGraph.java

 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
-import java.util.TreeSet;
-
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 import nl.vu.recoprov.utils.TransitiveClosure;
 
 /**
  * A class that represents just a graph with several labelled edges and attributes for nodes.
  * TODO: find something standard
- * @author saramagliacane
+ *
  *
  */
 

src/nl/vu/recoprov/baseclasses/DependencyNode.java

 package nl.vu.recoprov.baseclasses;
 
 
-import java.util.Arrays;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;
-
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.tika.metadata.Metadata;
-
 import com.dropbox.client2.DropboxAPI.Entry;
 
 

src/nl/vu/recoprov/baseclasses/RecoMetadata.java

 		if(this.get(this.CREATION_DATE) != null){
 			String date = this.get(this.CREATION_DATE);
 			
-			SimpleDateFormat formatter, FORMATTER;
+			SimpleDateFormat formatter;
 			formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
 
 			Date val = null;
 	
 	public void setFSModified(Date modified){
 		this.fsmodified = modified;
-		this.set(Property.externalDate(this.FILESYSTEM_LASTMODIFIED), modified);
+		this.set(Property.externalDate(RecoMetadata.FILESYSTEM_LASTMODIFIED), modified);
 	}
 	
 	public String getDropboxPath(){

src/nl/vu/recoprov/experiments/CorpusGeneratorProvDM.java

 
 import java.io.File;
 import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Date;
 import java.util.GregorianCalendar;
 import java.util.HashMap;
 import java.util.Hashtable;
 import nl.vu.recoprov.CompletePipeline;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
-import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
 
 
 import org.openprovenance.prov.xml.Activity;
 import org.openprovenance.prov.xml.ActivityRef;
 import org.openprovenance.prov.xml.Agent;
-import org.openprovenance.prov.xml.AgentRef;
 import org.openprovenance.prov.xml.Document;
 import org.openprovenance.prov.xml.Entity;
 import org.openprovenance.prov.xml.EntityRef;
-import org.openprovenance.prov.xml.NamedBundle;
 import org.openprovenance.prov.xml.ProvFactory;
-import org.openprovenance.prov.xml.SpecializationOf;
 import org.openprovenance.prov.xml.Statement;
 import org.openprovenance.prov.xml.Used;
 import org.openprovenance.prov.xml.WasAssociatedWith;
-import org.openprovenance.prov.xml.WasDerivedFrom;
 import org.openprovenance.prov.xml.WasGeneratedBy;
 //import org.openprovenance.prov.rdf.RdfConstructor;
 
-
-import com.dropbox.client2.exception.DropboxException;
-
 // prov to dot uses the XML representation
 
 //import org.openprovenance.prov.rdf.Entity;

src/nl/vu/recoprov/experiments/Experiment1.java

 package nl.vu.recoprov.experiments;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.io.InputStreamReader;
-
 import nl.vu.recoprov.CompletePipeline;
-import nl.vu.recoprov.LuceneIndexer;
 import nl.vu.recoprov.ProvDMtranslator;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
-import nl.vu.recoprov.signaldetectors.DiffSignal;
 import nl.vu.recoprov.signaldetectors.ImageSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.LuceneInverseSimilarity;
 import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.MatchTitleInContentSignal;
 import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
 import nl.vu.recoprov.signalfilters.BackwardTemporalFilter;
-import nl.vu.recoprov.signalfilters.TransitiveReductionFilter;
 import nl.vu.recoprov.utils.TransitiveClosure;
 
 import org.openprovenance.prov.xml.ProvFactory;
 
-import com.dropbox.client2.exception.DropboxException;
-
 public class Experiment1 {
 	
 	public static String dirTemp = "/Users/saramagliacane/Dropbox/reconstructing-dataprep-provenance/";

src/nl/vu/recoprov/experiments/Experiment2.java

 package nl.vu.recoprov.experiments;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.io.InputStreamReader;
-
 import nl.vu.recoprov.CompletePipeline;
-import nl.vu.recoprov.LuceneIndexer;
 import nl.vu.recoprov.ProvDMtranslator;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
-import nl.vu.recoprov.signaldetectors.DiffSignal;
 import nl.vu.recoprov.signaldetectors.ImageSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.LuceneInverseSimilarity;
 import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.MatchTitleInContentSignal;
 import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
 import nl.vu.recoprov.signalfilters.BackwardTemporalFilter;
-import nl.vu.recoprov.signalfilters.TransitiveReductionFilter;
 import nl.vu.recoprov.utils.TransitiveClosure;
 
 import org.openprovenance.prov.xml.ProvFactory;
 
-import com.dropbox.client2.exception.DropboxException;
-
 public class Experiment2 {
 	
 	public final static String dirTemp = "/Users/saramagliacane/Documents/workspace/recoprov/temp/Data2Semantics/Philips-Elsevier-Usecase/IDSA Qs/";

src/nl/vu/recoprov/experiments/PROVReader.java

 import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedHashMap;
-import java.util.LinkedList;
 import java.util.List;
 
 import nl.vu.recoprov.CompletePipeline;
 import org.openprovenance.prov.dot.ProvToDot;
 import org.openprovenance.prov.json.Converter;
 import org.openprovenance.prov.xml.Activity;
-import org.openprovenance.prov.xml.Agent;
 import org.openprovenance.prov.xml.Document;
 import org.openprovenance.prov.xml.Entity;
 import org.openprovenance.prov.xml.ProvFactory;
-import org.openprovenance.prov.xml.Statement;
 import org.openprovenance.prov.xml.StatementOrBundle;
 import org.openprovenance.prov.xml.WasDerivedFrom;
 
 	private  DependencyGraph depGraph;
 	private  String dir ;
 	private  String jsonfile ;
+	private Document provdoc = new Document();
 	
 	private final static String suspiciousFolder = "susp/";
 	private final static String sourceFolder = "src/";
 
 		readJSON(jsonfile, factory);
 
-		// convertToDot(factory);
+		convertToDot(factory);
 
 		return depGraph;
 
 		}
 
 	}
+	
+	public void readPROVN(String provnfile, ProvFactory factory) {
+		Converter conv = new Converter();
+
+		File sourcedir = new File(dir, sourceFolder);
+		File suspdir = new File(dir, suspiciousFolder);
+		
+		try {
+			provdoc = conv.readDocument(provnfile);
+		} catch (Exception e) {
+			// ignore: a malformed or unreadable provenance file is skipped
+		}
+		List<StatementOrBundle> provlist = provdoc
+				.getEntityAndActivityAndWasGeneratedBy();
+		for (StatementOrBundle s : provlist) {
+			// assume are all statements
+			// if (s instanceof Entity){
+			// String entityName = ((Entity) s).getId().toString();
+			// entityName = entityName.replace(namespace, "");
+			// System.out.println("Entity: " + entityName);
+			// }
+			//
+			// if (s instanceof Activity){
+			// String actName = ((Activity) s).getId().toString();
+			// actName = actName.replace(namespace, "");
+			// System.out.println("Activity: " + actName);
+			// }
+
+			// if (s instanceof WasGeneratedBy){
+			// String actName = ((WasGeneratedBy) s).getId().toString();
+			// actName = actName.replace(namespace, "");
+			// System.out.println("WasGeneratedBy: " + actName);
+			// }
+
+			if (s instanceof WasDerivedFrom) {
+				String used = ((WasDerivedFrom) s).getUsedEntity().getRef()
+						.getLocalPart();
+				String generated = ((WasDerivedFrom) s).getGeneratedEntity()
+						.getRef().getLocalPart();
+				// System.out.println("WasDerivedFrom: " + used + " ->"+
+				// generated);
+
+				File usedFile = new File(sourcedir, used);
+				File genFile = new File(suspdir, generated);
+				
+				depGraph.addEdge(depGraph.get(genFile.getAbsolutePath()),
+						depGraph.get(usedFile.getAbsolutePath()),
+						WeightedSumAggregator.FINAL_SCORE, 1.0);
+			}
+
+		}
+
+	}
+
+
 
 
 	
 
 	public void readJSON(String jsonfile, ProvFactory factory) {
 		Converter conv = new Converter();
-		Document doc = new Document();
+
 		File sourcedir = new File(dir, sourceFolder);
 		File suspdir = new File(dir, suspiciousFolder);
 		
 		try {
-			doc = conv.readDocument(jsonfile);
+			provdoc = conv.readDocument(jsonfile);
 		} catch (Exception e) {
 			// ignore: a malformed or unreadable provenance file is skipped
 		}
-		List<StatementOrBundle> provlist = doc
+		List<StatementOrBundle> provlist = provdoc
 				.getEntityAndActivityAndWasGeneratedBy();
 		for (StatementOrBundle s : provlist) {
 			// assume are all statements
 	public void convertToDot(ProvFactory factory) {
 
 		ProvToDot provtodot = new ProvToDot();
-		Document container;
-		container = factory.newDocument(listOfAvailableActivities.values(),
-				listOfAvailableEntities.values(), new LinkedList<Agent>(),
-				new LinkedList<Statement>());
+//		Document container;
+//		container = factory.newDocument(listOfAvailableActivities.values(),
+//				listOfAvailableEntities.values(), new LinkedList<Agent>(),
+//				new LinkedList<Statement>());
 
 		try {
 		
-			provtodot.convert(container, new File("graphCorpus.gv"));
+			provtodot.convert(provdoc, new File("graphCorpus.gv"));
 		
 		} catch (FileNotFoundException e) {
 

src/nl/vu/recoprov/signaldetectors/DiffSignal.java

 //
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyNode;
+//import nl.vu.recoprov.baseclasses.DependencyNode;
 
 
 public class DiffSignal extends SignalDetector {

src/nl/vu/recoprov/signaldetectors/ImageSimilaritySignal.java

 
 import java.awt.image.BufferedImage;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 
-import javax.imageio.ImageIO;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.sanselan.ImageReadException;
 import org.apache.sanselan.Sanselan;
 
 import net.semanticmetadata.lire.ImageSearchHits;
 import net.semanticmetadata.lire.ImageSearcher;
 import net.semanticmetadata.lire.ImageSearcherFactory;
-import nl.vu.recoprov.CompletePipeline;
-import nl.vu.recoprov.ImageReader;
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 		try {
 			store = SimpleFSDirectory.open(new File(ConfigurationDefaults.IMAGE_INDEX_DIR));
 
-		IndexReader reader = IndexReader.open(store);
+		IndexReader reader = DirectoryReader.open(store);
 		ImageSearcher searcher = ImageSearcherFactory.createDefaultSearcher();
 
 		

src/nl/vu/recoprov/signaldetectors/LuceneInverseSimilarity.java

 
 import java.util.ArrayList;
 import java.util.Date;
-import java.util.HashMap;
-
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;

src/nl/vu/recoprov/signaldetectors/LuceneSimilaritySignal.java

 
 import java.io.File;
 import java.io.IOException;
-import java.util.Map;
-import java.util.TreeMap;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.en.EnglishAnalyzer;
+
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 
-import nl.vu.recoprov.LuceneIndexer;
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
+import nl.vu.recoprov.utils.CustomAnalyzer;
 
 
 public class LuceneSimilaritySignal extends SignalDetector {
 			String queryString) throws IOException, ParseException {
 
 		return searchForString("contents", searcher, queryString,
-				new EnglishAnalyzer(Version.LUCENE_42));
+				new CustomAnalyzer(ConfigurationDefaults.LUCENE_VERSION));
 
 	}
 	
 
 		BooleanQuery.setMaxClauseCount(1000000);
 
-		QueryParser parser = new QueryParser(Version.LUCENE_42, fieldname,
+		QueryParser parser = new QueryParser(ConfigurationDefaults.LUCENE_VERSION, fieldname,
 				analyzer);
 		Query query = parser.parse(queryString);
 
 			return new ScoreDoc[0];
 
 		
-		QueryParser parser = new QueryParser(Version.LUCENE_42, fieldname,
+		QueryParser parser = new QueryParser(ConfigurationDefaults.LUCENE_VERSION, fieldname,
 				analyzer);
 		Query query = parser.parse(queryString);
 

src/nl/vu/recoprov/signaldetectors/MatchTitleInContentSignal.java

 import java.io.File;
 
 
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.SimpleFSDirectory;
+
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.RecoMetadata;
 			//indexDir.mkdir();
 			
 			FSDirectory store = SimpleFSDirectory.open(indexDir);
-			IndexReader reader = IndexReader.open(store);
+			IndexReader reader = DirectoryReader.open(store);
 			IndexSearcher searcher = new IndexSearcher(reader);
 			int numdocs = reader.numDocs();
 			
 				//better to have an original index (no strange changes)
 				
 				ScoreDoc[] hits = null;
-				//hits = LuceneSimilaritySignal.searchForPhrase("contents", searcher, key, new EnglishAnalyzer());
+				hits = LuceneSimilaritySignal.searchForPhrase("contents", searcher, key, new EnglishAnalyzer(ConfigurationDefaults.LUCENE_VERSION));
 
 				
 				

src/nl/vu/recoprov/utils/ConfigurationDefaults.java

 import java.io.File;
 import java.util.ArrayList;
 
+import org.apache.lucene.util.Version;
+
 public class ConfigurationDefaults {
 	
 	public static final String CONFIG_FILE = "config.txt";
 	 */
 	public static String RELATIVE_INDEX_DIR = "lucene_index/";
 	public static String INDEX_DIR = "lucene_index/";
+	public static final Version LUCENE_VERSION = Version.LUCENE_42;
 	
 	
 	/**

src/nl/vu/recoprov/utils/CustomAnalyzer.java

+package nl.vu.recoprov.utils;
+
+
+/*
+ *  Modified version of StandardAnalyzer (cannot extend since it's final and wrapping it is a tedious job).
+ *  Includes Porter Stemmer
+ */
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopAnalyzer;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.TrimFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+/**
+ * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
+ * LowerCaseFilter} and {@link StopFilter}, using a list of
+ * English stop words.
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CustomAnalyzer:
+ * <ul>
+ *   <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
+ *        from their combining characters. If you use a previous version number,
+ *        you get the exact broken behavior for backwards compatibility.
+ *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
+ *        and StopFilter correctly handles Unicode 4.0 supplementary characters
+ *        in stopwords.  {@link ClassicTokenizer} and {@link ClassicAnalyzer} 
+ *        are the pre-3.1 implementations of StandardTokenizer and
+ *        StandardAnalyzer (on which this class is based).
+ *   <li> As of 2.9, StopFilter preserves position increments
+ *   <li> As of 2.4, Tokens incorrectly identified as acronyms
+ *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ * </ul>
+ */
+public final class CustomAnalyzer extends StopwordAnalyzerBase {
+
+  /** Default maximum allowed token length. */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  /** Tokens shorter than this are discarded by the LengthFilter in the chain. */
+  private static final int MIN_TOKEN_LENGTH = 3;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Specifies whether deprecated acronyms should be replaced with HOST type.
+   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}.
+   * NOTE(review): retained for parity with StandardAnalyzer but never read in
+   * createComponents; StandardTokenizer applies this behavior internally based
+   * on the Version it is constructed with.
+   */
+  private final boolean replaceInvalidAcronym;
+
+  /** An unmodifiable set containing some common English words that are usually not
+  useful for searching. */
+  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; 
+
+  /** Builds an analyzer with the given stop words.
+   * @param matchVersion Lucene version to match (see <a href="#version">above</a>)
+   * @param stopWords stop words */
+  public CustomAnalyzer(Version matchVersion, Set<?> stopWords) {
+    // BUG FIX: the original called super(matchVersion) and silently dropped
+    // stopWords, so the StopFilter built in createComponents always ran with
+    // an empty stop set. Forward the set to the base class instead, copied
+    // into the CharArraySet form the base constructor expects.
+    super(matchVersion, CharArraySet.copy(matchVersion, stopWords));
+    replaceInvalidAcronym = matchVersion.onOrAfter(ConfigurationDefaults.LUCENE_VERSION);
+  }
+
+  /** Builds an analyzer with the default stop words ({@link
+   * #STOP_WORDS_SET}).
+   * @param matchVersion Lucene version to match (see <a href="#version">above</a>)
+   */
+  public CustomAnalyzer(Version matchVersion) {
+    this(matchVersion, STOP_WORDS_SET);
+  }
+
+  /** Builds an analyzer with the stop words from the given file.
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion Lucene version to match (see <a href="#version">above</a>)
+   * @param stopwords File to read stop words from (decoded as UTF-8)
+   * @throws IOException if the file cannot be opened or read */
+  public CustomAnalyzer(Version matchVersion, File stopwords) throws IOException {
+    this(matchVersion, WordlistLoader.getWordSet(IOUtils.getDecodingReader(stopwords,
+        IOUtils.CHARSET_UTF_8), matchVersion));
+  }
+
+  /** Builds an analyzer with the stop words from the given reader.
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion Lucene version to match (see <a href="#version">above</a>)
+   * @param stopwords Reader to read stop words from
+   * @throws IOException if the reader cannot be read */
+  public CustomAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
+  }
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  @Override
+  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
+    src.setMaxTokenLength(maxTokenLength);
+
+    TokenStream tok = new StandardFilter(matchVersion, src);
+
+    // Differences from StandardAnalyzer: fold diacritics to ASCII, trim
+    // whitespace, and drop tokens shorter than MIN_TOKEN_LENGTH or longer
+    // than DEFAULT_MAX_TOKEN_LENGTH characters.
+    tok = new ASCIIFoldingFilter(tok);
+    // NOTE(review): both factories are used without init(...)/argument maps;
+    // confirm they work with defaults in this Lucene version.
+    tok = (new TrimFilterFactory()).create(tok);
+    tok = new LengthFilter(true, tok, MIN_TOKEN_LENGTH, DEFAULT_MAX_TOKEN_LENGTH);
+
+    tok = new LowerCaseFilter(matchVersion, tok);
+    tok = new StopFilter(matchVersion, tok, stopwords);
+    // only difference
+    //tok = new PorterStemFilter(tok);
+
+    tok = (new WordDelimiterFilterFactory()).create(tok);
+
+    // BUG FIX: restore the components override (the commented-out version used
+    // the obsolete 3.x reset(Reader) hook) so that a later setMaxTokenLength
+    // call is re-applied when the cached components are reused on a new reader,
+    // as the setMaxTokenLength javadoc promises.
+    return new TokenStreamComponents(src, tok) {
+      @Override
+      protected void setReader(final Reader reader) throws IOException {
+        src.setMaxTokenLength(CustomAnalyzer.this.maxTokenLength);
+        super.setReader(reader);
+      }
+    };
+  }
+}

src/nl/vu/recoprov/utils/CustomFileReader.java

 package nl.vu.recoprov.utils;
-
+/**
+ * Custom file reader that extends BufferedReader to accept filenames as Strings
+ * and Files directly.
+ */
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.