Commits

Sara Magliacane committed f866cb6

adding a switch to switch off the new features

Comments (0)

Files changed (12)

src/main/java/nl/vu/recoprov/CompletePipeline.java

 import java.io.FileWriter;
 import java.io.IOException;
 
+import org.apache.lucene.search.ScoreDoc;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 
 import nl.vu.recoprov.ProvDMtranslator;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 import nl.vu.recoprov.signaldetectors.CompressionDistanceSignal;
 import nl.vu.recoprov.signaldetectors.ImageSimilaritySignal;
 		connectToInternet = online;
 	}
 	
+	/**
+	 * Runs the sentence-level graph creation and then stores, on the nodes of
+	 * {@code input}, the Lucene ids of the indexed sentences. Does nothing when
+	 * {@code ConfigurationDefaults.useNewFeatures} is switched off.
+	 *
+	 * @param input graph whose nodes receive the sentence ids
+	 * @throws IOException propagated from the sentence graph creation
+	 * @throws DropboxException propagated from the sentence graph creation
+	 */
+	public void createDependencyGraphForSentences(DependencyGraph input) throws IOException, DropboxException{
+		// feature switch: leave the graph untouched when the new features are off
+		if (!ConfigurationDefaults.useNewFeatures) {
+			return;
+		}
+
+		createDependencyGraphForSentences();
+		SentenceSplitter.setLuceneIdForSentence(input);
+	}
 	
-	public DependencyGraph createDependencyGraphForSentences() throws IOException, DropboxException{
+	public DependencyGraph createDependencyGraphForSentences() throws IOException, DropboxException{		
+		if (ConfigurationDefaults.useNewFeatures == false) {
+			return new DependencyGraph();
+		}
+		
 		File sentenceDir = new File (SentenceSplitter.SENTENCE_DIRECTORY);
 		if (!sentenceDir.exists()){
 			initDependencyGraph();

src/main/java/nl/vu/recoprov/LuceneIndexer.java

 
 public class LuceneIndexer {
 
-	private Logger logger;
+	private static Logger logger;
 	private Boolean cleanupIndex = false;
 	private String indexParentDir = "";
 	private SentenceSplitter sentenceSplitter = new SentenceSplitter();
 					+ node.getCompleteFilepath());
 		}
 		
+		logger.info("Document {} created", node.getCompleteFilepath());
+		
 		return doc;
 		
 	}
 		return writer;
 
 	}
+	
+	/**
+	 * Opens a Lucene {@link IndexReader} on the index stored in the given directory.
+	 *
+	 * @param indexDirFilename path of the directory that holds the index
+	 * @return the opened reader, or {@code null} when the directory or the index
+	 *         could not be opened; callers must check for {@code null}
+	 */
+	public static IndexReader createIndexReader(String indexDirFilename) {
+		File indexDir = new File(indexDirFilename);
+
+		FSDirectory store = null;
+		try {
+			store = SimpleFSDirectory.open(indexDir);
+			return DirectoryReader.open(store);
+		} catch (IOException e1) {
+			// hand the exception to the logger so the stack trace lands in the log
+			// rather than on stderr
+			// NOTE(review): logger is a static field; confirm it is initialised
+			// before this static method can be reached
+			logger.error("Lucene Directory failing: {}", indexDir.getName(), e1);
+
+			// the directory was opened but the reader was not: release it again
+			if (store != null) {
+				try {
+					store.close();
+				} catch (IOException ignored) {
+					// nothing more can be done, the failure above is already logged
+				}
+			}
+			return null;
+		}
+	}
 
 
 }

src/main/java/nl/vu/recoprov/SentenceSplitter.java

 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.SimpleFSDirectory;
+
+import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
 
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 			else{			
 				File file = new File(filename);
 				File outfile = new File(SENTENCE_DIRECTORY + file.getName()+"_"+counter);
-				d.addLuceneIdSentences(d.getId()+counter);
-				
+
 				if (!outfile.exists()){
 				
 					try {
 		for (String string: content.split(" "))
 			System.out.println(string);
 	}
+	
+	/**
+	 * Searches the sentence index for documents whose "name" field matches the
+	 * sentence directory path combined with the name of the given file.
+	 *
+	 * @param filename path of the file whose sentences are looked up
+	 * @param d node the file belongs to (currently not used by the lookup)
+	 * @return the hits of the search; an empty array when the index cannot be
+	 *         opened, the query cannot be parsed or the search fails
+	 */
+	public static ScoreDoc[] getLuceneIdForSentence(StringBuffer filename, DependencyNode d){
+		IndexReader reader  = LuceneIndexer.createIndexReader(SENTENCE_DIRECTORY + ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		if (reader == null) {
+			// createIndexReader signals failure with null; there is nothing to search
+			return new ScoreDoc[0];
+		}
+
+		try {
+			IndexSearcher searcher = new IndexSearcher(reader);
+			File file = new File(filename.toString());
+			File dir = new File(SENTENCE_DIRECTORY);
+			String querystring = dir.getAbsolutePath()+ "*"+ file.getName()+"*";
+
+			QueryParser parser = new QueryParser(
+					ConfigurationDefaults.LUCENE_VERSION, "name", new KeywordAnalyzer());
+			// NOTE(review): escape() also escapes the two '*' added above, so they are
+			// matched literally and not as wildcards - confirm this is intended
+			Query query = parser.parse(QueryParser.escape(querystring));
+
+			TopScoreDocCollector collector = TopScoreDocCollector.create(
+					ConfigurationDefaults.LUCENE_MAX_NUMBER_DOCS, true);
+			searcher.search(query, collector);
+			return collector.topDocs().scoreDocs;
+		} catch (ParseException e) {
+			e.printStackTrace();
+			return new ScoreDoc[0];
+		} catch (IOException e) {
+			e.printStackTrace();
+			return new ScoreDoc[0];
+		} finally {
+			// the hits only carry ids and scores, so the reader can be released here
+			try {
+				reader.close();
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		}
+	}
+
+	/**
+	 * Walks over the documents of the sentence index and registers each
+	 * document id on the node of {@code input} it belongs to. The node is found
+	 * through the part of the document's "name" field that lies between the
+	 * last "/" and the last "_".
+	 *
+	 * Documents that cannot be read, have no usable name or match no node are
+	 * skipped.
+	 *
+	 * @param input graph whose nodes receive the sentence ids
+	 */
+	public static void setLuceneIdForSentence(DependencyGraph input) {
+		IndexReader reader  = LuceneIndexer.createIndexReader(SENTENCE_DIRECTORY + ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		if (reader == null) {
+			// createIndexReader signals failure with null
+			System.err.println("Sentence index could not be opened, no sentence ids set");
+			return;
+		}
+
+		try {
+			IndexSearcher searcher = new IndexSearcher(reader);
+
+			// NOTE(review): ids 0..numDocs()-1 only cover every document when the
+			// index has no deletions - confirm, otherwise iterate up to maxDoc()
+			int numdocs = reader.numDocs();
+			for (int i = 0; i < numdocs; i++) {
+				Document doc;
+				try {
+					doc = searcher.doc(i);
+				} catch (IOException e) {
+					e.printStackTrace();
+					continue;
+				}
+
+				String name = doc.get("name");
+				if (name == null) {
+					System.err.println("Sentence document " + i + " has no name field, skipped");
+					continue;
+				}
+
+				int start = name.lastIndexOf("/") + 1;
+				int end = name.lastIndexOf("_");
+				if (end <= start) {
+					// no "_" suffix after the file name: substring would throw or
+					// yield an empty key that matches any node
+					System.err.println("Unexpected sentence document name, skipped: " + name);
+					continue;
+				}
+
+				DependencyNode node = input.getSimilar(name.substring(start, end));
+				if (node == null) {
+					System.err.println("No node found for sentence document, skipped: " + name);
+					continue;
+				}
+				node.addLuceneIdSentences(i);
+			}
+		} finally {
+			try {
+				reader.close();
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		}
+	}
 
 }

src/main/java/nl/vu/recoprov/abstractclasses/SignalFilterer.java

 	public DependencyGraph filterGraph (DependencyGraph input){
 		for (String name : input.keySet()) {
 			DependencyNode d = input.get(name);
-			filterNode(input, d);			
+			input = filterNode(input, d);			
 		}	
 		return input;
 	}

src/main/java/nl/vu/recoprov/baseclasses/DependencyGraph.java

 
 		return recovered;
 	}
+
+
+
+	/**
+	 * Looks a node up by key, falling back to a substring match on the keys.
+	 *
+	 * @param key exact key, or a fragment contained in a key
+	 * @return the node stored under {@code key}; otherwise the node of the first
+	 *         key (in key set iteration order) that contains {@code key};
+	 *         {@code null} when neither exists
+	 */
+	public DependencyNode getSimilar(String key) {
+		DependencyNode exact = this.get(key);
+		if (exact != null) {
+			return exact;
+		}
+
+		// no exact entry: settle for the first key that contains the requested one
+		for (String candidate : this.keySet()) {
+			if (!candidate.contains(key)) {
+				continue;
+			}
+			return this.get(candidate);
+		}
+
+		return null;
+	}
 	
 }

src/main/java/nl/vu/recoprov/baseclasses/DependencyNode.java

 			temp.append("Contentfilename: ");
 			temp.append(content);
 		}
+		
+		if (this.luceneIdSentences != null) {
+			temp.append("LuceneId: ");
+			temp.append(luceneIdSentences);
+		}
 
 		temp.append("\n");
 		return temp.toString();

src/main/java/nl/vu/recoprov/experiments/Experiment3.java

 public class Experiment3 {
 
 	private static String dirfile = "pan12-detailed-comparison-training-corpus/";
-
 	private static String jsonfile = "pan.json";
 
 	private static DependencyGraph baselineGraph = null;
 		FileWriter writer = createFileResultsWriter();
 		DependencyGraph depGraphLucene = createLuceneGraph(pipeline);
 		DependencyGraph depGraph = createReferenceGraph(pipeline);
-		DependencyGraph depGraphSentencesLucene = createLuceneGraphForSentences(pipeline, depGraphLucene);
-		experimentWithThresholds(pipeline, writer, depGraph, depGraphLucene, depGraphSentencesLucene);
+		//DependencyGraph depGraphSentencesLucene = createLuceneGraphForSentences(pipeline, depGraphLucene);
+		experimentWithThresholds(pipeline, writer, depGraph, depGraphLucene, null);
 		
 		
 		writer.flush();
 						"LuceneMorePAN2Filters", writer);
 				
 				
-				// dependency graph with lucene sentences
-				
-				DependencyGraph depGraphSentencesLuceneThreshold = depGraphSentencesLucene
-						.copyGraph();
-				pipeline.filterLuceneThreshold(depGraphSentencesLuceneThreshold);
-				pipeline.aggregateSignals(depGraphSentencesLuceneThreshold);
-				writeResults(pipeline, depGraph, depGraphSentencesLuceneThreshold, "LuceneSentences", writer);
+//				// dependency graph with lucene sentences
+//				
+//				DependencyGraph depGraphSentencesLuceneThreshold = depGraphSentencesLucene
+//						.copyGraph();
+//				pipeline.filterLuceneThreshold(depGraphSentencesLuceneThreshold);
+//				pipeline.aggregateSignals(depGraphSentencesLuceneThreshold);
+//				writeResults(pipeline, depGraph, depGraphSentencesLuceneThreshold, "LuceneSentences", writer);
 
 
 			} catch (Exception e) {
 		if (depGraph1 != null)
 			return depGraph1;
 
-		PROVReader provreader = new PROVReader(dirfile, jsonfile);
+		PROVReader provreader = new PROVReader(dirfile, jsonfile, "src/", "susp/");
 		try {
-			depGraph1 = provreader.generatePANDepGraph();
+			depGraph1 = provreader.generateDepGraph();
 			pipeline.translateToPROVDM(depGraph1, "graphCorpus.gv");
 			depGraph1.serialize("depGraphReference.ser");
 			logger.info("Reference graph created.");
 		return depGraphLuceneCompression;
 	}
 	
-	
-	private static DependencyGraph createDependencyGraphForSentences (CompletePipeline pipeline) {
+	@Deprecated
+	private static DependencyGraph createDependencyGraphForSentences (CompletePipeline pipeline, DependencyGraph input) {
 		
-		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphSentences.ser");
-
-		if (depGraph1 != null)
-			return depGraph1;
+//		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphSentences.ser");
+//
+//		if (depGraph1 != null)
+//			return depGraph1;
 
 		try {
-			depGraph1 = pipeline.createDependencyGraphForSentences();
-			depGraph1.serialize("depGraphSentences.ser");
+			pipeline.createDependencyGraphForSentences(input);
+//			input.serialize("depGraphSentences.ser");
 			logger.info("depGraphSentences graph created.");
 		} catch (Exception e) {
 			e.printStackTrace();
 			logger.error("Could not create depGraphSentences graph.");
 		}
 
-		return depGraph1;
+		return input;
 	}
 
 	
-	
+	@Deprecated
 	private static DependencyGraph createLuceneGraphForSentences (CompletePipeline pipeline, DependencyGraph input) {
 		
 		logger.info("depGraphSentencesLucene search started.");
 
 		if (depGraph1 != null)
 			return depGraph1;
-
+		
 		depGraph1 = input.copyGraph();
 		
 		try {
-			createDependencyGraphForSentences(pipeline);
+			createDependencyGraphForSentences(pipeline, depGraph1);
 			new HomogeneityFilter().filterSignals(depGraph1);
 			//pipeline.aggregateSignals(depGraph1);
-			depGraph1.serialize("depGraphSentencesLucene.ser");
+			input.serialize("depGraphSentencesLucene.ser");
 			logger.info("depGraphSentencesLucene graph created.");
 		} catch (Exception e) {
 			e.printStackTrace();

src/main/java/nl/vu/recoprov/experiments/PROVReader.java

 public class PROVReader {
 
 	private static HashMap<String, Entity> listOfAvailableEntities = new HashMap<String, Entity> ();
-	private static HashMap<String, Object> listOfAvailableRelations =  new HashMap<String, Object> ();
-	private static  HashMap<String,Activity> listOfAvailableActivities = new  LinkedHashMap<String, Activity> ();
-	private static int counter = 0;
+//	private static HashMap<String, Object> listOfAvailableRelations =  new HashMap<String, Object> ();
+//	private static  HashMap<String,Activity> listOfAvailableActivities = new  LinkedHashMap<String, Activity> ();
+//	private static int counter = 0;
 	private  Logger logger;
 	
-	private  String dir ;
-	private  String jsonfile ;
+	private String dir;
+	private String jsonfile;
+	private String src = "";
+	private String susp = "";
 	private Document provdoc = new Document();
 	
-	private final static String suspiciousFolder = "susp/";
-	private final static String sourceFolder = "src/";
+
 	
 
 	public PROVReader(String dir, String jsonfile){
 				.getLogger("nl.vu.recoprov.experiments.PROVReader");
 	}
 	
+	/**
+	 * Creates a reader with explicit sub-folder names.
+	 *
+	 * @param dir base directory of the corpus
+	 * @param jsonfile PROV description in JSON that is read
+	 * @param src sub-folder of {@code dir} in which the used files are resolved
+	 * @param susp sub-folder of {@code dir} in which the generated files are resolved
+	 */
+	public PROVReader(String dir, String jsonfile, String src, String susp){
+		logger = LoggerFactory.getLogger("nl.vu.recoprov.experiments.PROVReader");
+
+		this.dir = dir;
+		this.jsonfile = jsonfile;
+
+		// folder names are resolved against dir when the JSON is read
+		this.src = src;
+		this.susp = susp;
+	}
 	
-	public DependencyGraph generatePANDepGraph() throws DropboxException, IOException {
+	public DependencyGraph generateDepGraph() throws DropboxException, IOException {
 
 		logger.info("Reading PROV description of folder: {} in JSON {}: ", dir, jsonfile);
 		
 	}
 		
 	
-	public ProvFactory initFactory() {
+	private ProvFactory initFactory() {
 		ProvFactory factory = ProvFactory.getFactory();
 		Hashtable<String, String> namespace = new Hashtable<String, String>();	
 		namespace.put("_", "");
 	
 
 	
-	public void createEntityFromDepGraph(ProvFactory factory,
+	private void createEntityFromDepGraph(ProvFactory factory,
 			DependencyGraph depGraph) {
 		for (String name : depGraph.keySet()) {
 			// create an entity for each node in the dependency graph
 	}
 
 
-	public void createEntity(ProvFactory factory, DependencyNode d) {
+	private void createEntity(ProvFactory factory, DependencyNode d) {
 
 		String name = d.getCompleteFilepath();
 		// name = name.replace(dir, "");
 
 
 
-	public void readJSON(String jsonfile, DependencyGraph depGraph, ProvFactory factory) {
+	private void readJSON(String jsonfile, DependencyGraph depGraph, ProvFactory factory) {
 		Converter conv = new Converter();
 
-		File sourcedir = new File(dir, sourceFolder);
-		File suspdir = new File(dir, suspiciousFolder);
+		File sourcedir = new File(dir, src);
+		File suspdir = new File(dir, susp);
 		
 		try {
 			provdoc = conv.readDocument(jsonfile);
 				File usedFile = new File(sourcedir, used);
 				File genFile = new File(suspdir, generated);
 				
-				depGraph.addEdge(depGraph.get(genFile.getAbsolutePath()),
-						depGraph.get(usedFile.getAbsolutePath()),
+				depGraph.addEdge(depGraph.get(usedFile.getAbsolutePath()),depGraph.get(genFile.getAbsolutePath()),
 						WeightedSumAggregator.FINAL_SCORE, 1.0);
 			}
 
 
 
 
-	public void convertToDot(ProvFactory factory) {
+	private void convertToDot(ProvFactory factory) {
 
 		ProvToDot provtodot = new ProvToDot();
 //		Document container;

src/main/java/nl/vu/recoprov/signaldetectors/CompressionDistanceSignal.java

 				DependencyNode d2 = input.get(name2);
 				long size2 = computeCompressionSizePerNode(d2);
 				long size12 = computeCompressionSize(d1, d2);
+				long size21 = computeCompressionSize(d2, d1);
 
-				if (size1 != 0 && size2 != 0 && size12!=0) {
+				if (size1 != 0 && size2 != 0 && size12!=0 && size21!=0) {
+					double score1 =  ((double) size12  - size1)/ (double) size2;
+					double score2 =  ((double) size21  - size2)/ (double) size1;
 					double score =  1 - ((double) size12  - Math.min(size1,size2))/ (double) Math.max(size1,size2);
-					input.addEdge(d1, d2, COMPRESSION_DISTANCE, score);
+					
+					if (score1 > score2)
+						input.addEdge( d2, d1, COMPRESSION_DISTANCE, score);
+					if (score2 > score1)
+						input.addEdge(d1, d2, COMPRESSION_DISTANCE, score);
 
 				}
 			}

src/main/java/nl/vu/recoprov/signaldetectors/LuceneSimilaritySignal.java

 				ConfigurationDefaults.LUCENE_VERSION, fieldname, analyzer);
 		Query query;
 		try {
-			query = parser.parse(queryString.toString());
+			query = parser.parse(QueryParser.escape(queryString.toString()));
 		} catch (ParseException e) {
 			logger.error("Parser exception for queryString: {} ",
-					queryString.substring(0, 100));
+					queryString.substring(0, 5));
 			e.printStackTrace();
 			return new ScoreDoc[0];
 		}
 			// ConfigurationDefaults.LUCENE_THRESHOLD)
 			// continue;
 
-			if (hits[j].doc == i)
+			if (hits[j].doc == i){
 				continue;
-
+			}
 			// d.addDepNodeSimilarity("lucene_similarity", d2,
 			// hits[j].score);
-			input.addEdge(i, hits[j].doc, LUCENE_SIMILARITY, hits[j].score);
+			
+			double score = hits[j].score/hits[i].score;
+			input.addEdge(i, hits[j].doc, LUCENE_SIMILARITY, score);
 
 		}
 	}

src/main/java/nl/vu/recoprov/signalfilters/HomogeneityFilter.java

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import nl.vu.recoprov.LuceneIndexer;
 import nl.vu.recoprov.SentenceSplitter;
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 
 public class HomogeneityFilter extends SignalFilterer {
 
+	private static final float THRESHOLD = 0.1f;
 	private Logger logger;
 	
 	public HomogeneityFilter(){
 		if (edgearray == null)
 			return input;
 		
-		IndexReader reader = createIndexReader(SentenceSplitter.SENTENCE_DIRECTORY
+		IndexReader reader = LuceneIndexer.createIndexReader(SentenceSplitter.SENTENCE_DIRECTORY
 				+ ConfigurationDefaults.RELATIVE_INDEX_DIR);
 		IndexSearcher searcher = new IndexSearcher(reader);
 
 		for (LabelledEdge edge : edgearray) {
 			int value = compareTwoNodes(input, d, input.get(edge.getId()), reader, searcher);
 			if (value == 1){
-				logger.trace("Removing edge {} -> {}", d.getId(), edge.getId());
+				logger.info("Removing edge {} -> {}", d.getId(), edge.getId());
 				input.removeEdge(d.getId(), edge.getId());
 			}
 			if (value == -1){
-				logger.trace("Removing edge {} -> {}", edge.getId(), d.getId());
+				logger.info("Removing edge {} -> {}", edge.getId(), d.getId());
 				input.removeEdge(edge.getId(), d.getId());
 			}
 		}
 		double score1 = 0.0;
 		double score2 = 0.0;
 		
-		// for each sentence in the original node
+
+		// for each sentence that is very similar in both docs
 		for (Integer i : luceneIdSentences1) {
 			ScoreDoc[] hits = LuceneSimilaritySignal.computeSimilarity( reader,  searcher,  input,  i);
 			for (ScoreDoc scoredoc: hits){
+				
+				if(scoredoc.doc == i){
+					continue;
+				}
+				
+				if(scoredoc.score < THRESHOLD){
+					// they are ordered, so others will be even worse
+					break;
+				}
+				
 				// check how many of the sentences are similar in the first
 				if(luceneIdSentences1.contains(scoredoc.doc)){
 					score1 += scoredoc.score;
 			}
 		}
 		
-		logger.trace("Comparing two nodes: {} -> {}; scores: {} {}",
+		//normalize by the number of sentences
+		score1 =  score1/(luceneIdSentences1.size()+1);
+		score2 =  score1/(luceneIdSentences2.size()+1);
+		
+		logger.info("Comparing two nodes: {} -> {}; scores: {} {}",
 				d1.getId(), d2.getId(), score1, score2);
 		
 		if (score1 > score2){
 
 	}
 	
-	private IndexReader createIndexReader(String indexDirFilename) {
-		File indexDir = new File(indexDirFilename);
 
-		FSDirectory store;
-		IndexReader reader;
-		try {
-			store = SimpleFSDirectory.open(indexDir);
-			reader = DirectoryReader.open(store);
-		} catch (IOException e1) {
-			logger.error("Lucene Directory failing: {}", indexDir.getName());
-			e1.printStackTrace();
-			return null;
-		}
-
-		return reader;
-	}
 }

src/main/java/nl/vu/recoprov/utils/ConfigurationDefaults.java

 	
 	public static final String CONFIG_FILE = "config.txt";
 	
+	public static final Boolean useNewFeatures = false;
+	
 	/**
 	 * Dropbox configuration defaults
 	 */