1. Sara Magliacane
  2. recoprov

Commits

Sara Magliacane  committed b50112f

adding top-k edges filter

  • Participants
  • Parent commits 2562eb1
  • Branches master

Comments (0)

Files changed (13)

File src/main/java/nl/vu/recoprov/CompletePipeline.java

View file
  • Ignore whitespace
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 import nl.vu.recoprov.signaldetectors.ImageSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.LuceneInverseSimilarity;
+import nl.vu.recoprov.signaldetectors.LuceneMoreLikeThisSignal;
 import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
 import nl.vu.recoprov.signaldetectors.MatchTitleInContentSignal;
 import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
 import nl.vu.recoprov.signalfilters.LuceneThresholdFilter;
 import nl.vu.recoprov.signalfilters.PlagiarismCorpusSpecificFilter;
 import nl.vu.recoprov.signalfilters.TextContainmentFilter;
+import nl.vu.recoprov.signalfilters.TopKEdges;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
 import nl.vu.recoprov.utils.ConfigurationReader;
 
 	private MetadataSimilaritySignal metadataSimilaritySignal = new MetadataSimilaritySignal();
 	private MatchTitleInContentSignal matchTitleInContentSignal = new MatchTitleInContentSignal();
 	private ImageSimilaritySignal imageSimilaritySignal = new ImageSimilaritySignal();
+	private LuceneMoreLikeThisSignal luceneMoreLikeThisSignal = new LuceneMoreLikeThisSignal();
+	private TopKEdges topKEdges = new TopKEdges();
 	
 	private TikaReader tika;
 	
 		luceneSimilaritySignal.computeSignal(depGraph);
 	}
 	
+	public void luceneMoreLikeThisSignal(DependencyGraph depGraph){ // thin wrapper around the luceneMoreLikeThisSignal field (which shares this method's name)
+		luceneMoreLikeThisSignal.computeSignal(depGraph); // hands depGraph to the detector; presumably the graph is updated in place -- confirm in LuceneMoreLikeThisSignal
+	}
+	
 	/**
 	 * Passes the given graph to the {@code computeSignal} method of the
 	 * {@code luceneInverseSimilarity} member.
 	 *
 	 * @param depGraph graph handed to the detector
 	 */
 	public void luceneInverseSimilarity(DependencyGraph depGraph) {
 		luceneInverseSimilarity.computeSignal(depGraph);
 	}
 		plagiarismCorpusSpecificFilter.filterSignals(depGraph);
 	}
 	
+	public void filterTopKEdges(DependencyGraph input){ // overload that reuses the shared topKEdges field, built with TopKEdges' no-arg constructor
+		topKEdges.filterSignals(input); // label and k used here are whatever the no-arg TopKEdges chooses (not visible in this file)
+	}
+	
+	public void filterTopKEdges(DependencyGraph input, String label){ // overload with an explicit label; presumably the edge label the filter works on -- confirm in TopKEdges
+		new TopKEdges(label).filterSignals(input); // builds a fresh filter per call; the shared topKEdges field is not touched
+	}
+	
+	public void filterTopKEdges(DependencyGraph input, String label, int k ){ // overload with explicit label and k; presumably k is the number of edges kept -- confirm in TopKEdges
+		new TopKEdges(label, k).filterSignals(input); // builds a fresh filter per call; the shared topKEdges field is not touched
+	}
+	
 	public void filterSignals(DependencyGraph depGraph) {
 		filterBackWards(depGraph);
 		filterLuceneThreshold (depGraph);

File src/main/java/nl/vu/recoprov/LuceneIndexer.java

View file
  • Ignore whitespace
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 		doc.add(new StringField("path", node.getCompleteFilepath(),
 				Field.Store.YES));
 		if (content != null && content.length() !=  0) {
-			doc.add(new TextField("contents", new String(content),
-					Field.Store.YES));
+			
+			FieldType type = new FieldType();
+			type.setIndexed(true);
+			type.setStored(true);
+			type.setStoreTermVectors(true);
+			Field field = new Field("contents", new String(content), type);
+			doc.add(field);
+
 		} 
 		else {
 			System.out.println("No content in file "

File src/main/java/nl/vu/recoprov/ProvDMtranslator.java

View file
  • Ignore whitespace
 import java.util.LinkedList;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 import nl.vu.recoprov.signaldetectors.MetadataSimilaritySignal;
 import nl.vu.recoprov.utils.TransitiveClosure;

File src/main/java/nl/vu/recoprov/baseclasses/DependencyGraph.java

View file
  • Ignore whitespace
 		return this.incidendencyMatrix.get(starting_node);
 	}
 	
-	public ArrayList<LabelledEdge> getAllAggregatedEdges(Integer starting_node){
+	public ArrayList<LabelledEdge> getAllAggregatedEdges(Integer starting_node) { // convenience form of getAllEdges with the label fixed to WeightedSumAggregator.FINAL_SCORE
+		return getAllEdges(starting_node, // node whose edge list is looked up
+				WeightedSumAggregator.FINAL_SCORE); // label handed to getAllEdges as its filter
+	}
+	
+	public ArrayList<LabelledEdge> getAllEdges(Integer starting_node, String label){
 		ArrayList<LabelledEdge> edges = new ArrayList<LabelledEdge>();
 		
 		if(!this.incidendencyMatrix.containsKey(starting_node))
 			return edges;
 		
 		for(LabelledEdge e: this.incidendencyMatrix.get(starting_node)){
-			if (e.getLabel().equals(WeightedSumAggregator.FINAL_SCORE))
+			if (e.getLabel().equals(label))
 				edges.add(e);
 			else 
 				continue;
 		
 	}
 	
-	public class LabelledEdge implements Serializable {
-		/**
-		 * 
-		 */
-		private static final long serialVersionUID = 1L;
-		private int id;
-		private String label ;
-		private double score = 0.0;
-		
-		public LabelledEdge(Integer i, String s, double score){
-			setId(i);
-			setLabel(s);
-			this.score = score;
-		}
-
-		public double getScore() {
-			return score;
-		}
-
-		public void setScore(double score) {
-			this.score = score;
-		}
-
-		public int getId() {
-			return id;
-		}
-
-		public void setId(int id) {
-			this.id = id;
-		}
-
-		public String getLabel() {
-			return label;
-		}
-
-		public void setLabel(String label) {
-			this.label = label;
-		}
-		
-		public String toString(){
-			return label + " -> " + id + " [ score: "+score+ "]";
-		}
-		
-
-		
-	}
+	
 	
 	
 	public class SimilarGraphResults implements Serializable{

File src/main/java/nl/vu/recoprov/experiments/Experiment3.java

View file
  • Ignore whitespace
 		FileWriter writer = createFileResultsWriter();
 		DependencyGraph depGraphLucene = createLuceneGraph(pipeline);
 		DependencyGraph depGraph = createReferenceGraph(pipeline);
+		DependencyGraph depGraphLuceneMore = createLuceneMoreGraph(pipeline);
+
+		
 
 		// useful for small experiments
 		double[] thresholds = { 0.01, 0.05, 0.1, 0.2 };
 				pipeline.filterTextContainment(depGraph1);
 				pipeline.filterPlagiarismCorpus(depGraph1);
 				pipeline.aggregateSignals(depGraph1);
+				pipeline.filterTopKEdges(depGraph1);
 				writeResults(pipeline, depGraph, depGraph1,
 						"LucenePAN2Filters", writer);
+				
+				
+				
+				
+				DependencyGraph depGraphLuceneMoreThreshold = depGraphLuceneMore
+						.copyGraph();
+				pipeline.filterLuceneThreshold(depGraphLuceneMoreThreshold);
+
+				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
+				pipeline.aggregateSignals(depGraph1);
+				writeResults(pipeline, depGraph, depGraph1, "LuceneMore", writer);
+
+				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
+				pipeline.filterTextContainment(depGraph1);
+				pipeline.aggregateSignals(depGraph1);
+				writeResults(pipeline, depGraph, depGraph1, "LuceneMoreFilter",
+						writer);
+
+				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
+				pipeline.filterPlagiarismCorpus(depGraph1);
+				pipeline.aggregateSignals(depGraph1);
+				writeResults(pipeline, depGraph, depGraph1, "LuceneMorePANFilter",
+						writer);
+
+				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
+				pipeline.filterTextContainment(depGraph1);
+				pipeline.filterPlagiarismCorpus(depGraph1);
+				pipeline.aggregateSignals(depGraph1);
+				writeResults(pipeline, depGraph, depGraph1,
+						"LuceneMorePAN2Filters", writer);
 
 			} catch (Exception e) {
 				e.printStackTrace();
 		}
 		// trying with Lucene More Like This
 
-		// DependencyGraph depGraphLuceneMore = createGraph();
-		// new LuceneMoreLikeThisSignal().computeSignal(depGraphLuceneMore);
-		// logger.info("LuceneMoreLikeThis graph created.");
+
 		//
 		// for (double threshold : thresholds) {
 		//
 		}
 		return depGraphLucene;
 	}
+	
+	
+	public static DependencyGraph createLuceneMoreGraph(CompletePipeline pipeline) // returns a graph carrying the Lucene more-like-this signal, reusing a serialized copy when one loads
+			throws Exception { // any failure while computing the signal is rethrown to the caller
+		DependencyGraph depGraphLuceneMore = DependencyGraph.deserializeDependencyGraph("depGraphLuceneMore.ser"); // try the on-disk copy first
+
+		if (depGraphLuceneMore == null) { // a null result is treated as "no usable cached graph"
+			depGraphLuceneMore = createGraph(); // start from a freshly created graph
+
+			try {
+				pipeline.luceneMoreLikeThisSignal(depGraphLuceneMore); // compute the signal on the new graph
+				logger.info("Lucene More graph created.");
+			} catch (Exception e) {
+				e.printStackTrace(); // NOTE(review): the logger call below omits the exception, so the stack trace is only printed here
+				logger.error("Could not create Lucene More graph.");
+				throw e; // propagate; the serialize call below is skipped on failure
+			}
+
+			depGraphLuceneMore.serialize("depGraphLuceneMore.ser"); // store the result under the same file name that is read above
+		}
+		return depGraphLuceneMore;
+	}
 
 	public static DependencyGraph createReferenceGraph(CompletePipeline pipeline)
 			throws Exception {

File src/main/java/nl/vu/recoprov/signalaggregators/WeightedSumAggregator.java

View file
  • Ignore whitespace
 import nl.vu.recoprov.abstractclasses.SignalAggregator;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 
 public class WeightedSumAggregator extends SignalAggregator {
 
 				if (count == 0)
 					continue;
 				else {
-					LabelledEdge newedge = new DependencyGraph().new LabelledEdge(
+					LabelledEdge newedge = new LabelledEdge(
 							edge.getId(), FINAL_SCORE, finalScore / count);
 
 					HashMap<Integer, LabelledEdge> temp = addEdgesArray.get(d

File src/main/java/nl/vu/recoprov/signaldetectors/LuceneInverseSimilarity.java

View file
  • Ignore whitespace
 import java.util.Date;
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.baseclasses.RecoMetadata;
 import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;

File src/main/java/nl/vu/recoprov/signaldetectors/LuceneSimilaritySignal.java

View file
  • Ignore whitespace
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanQuery;
 
 		for (int i = 0; i < numdocs; i++) {
 			Terms tfvs;
-			try {
+				try {
 				tfvs = reader.getTermVector(i, "contents");
 			} catch (IOException e) {
 				logger.error(

File src/main/java/nl/vu/recoprov/signalfilters/BackwardTemporalFilter.java

View file
  • Ignore whitespace
 import java.util.Date;
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.baseclasses.RecoMetadata;
 

File src/main/java/nl/vu/recoprov/signalfilters/LuceneThresholdFilter.java

View file
  • Ignore whitespace
 import java.util.Date;
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.baseclasses.RecoMetadata;
 import nl.vu.recoprov.signaldetectors.LuceneMoreLikeThisSignal;

File src/main/java/nl/vu/recoprov/signalfilters/PlagiarismCorpusSpecificFilter.java

View file
  • Ignore whitespace
 import java.util.ArrayList;
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 
 

File src/main/java/nl/vu/recoprov/signalfilters/TextContainmentFilter.java

View file
  • Ignore whitespace
 import java.util.Date;
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.baseclasses.DependencyNode;
 import nl.vu.recoprov.baseclasses.RecoMetadata;
 import nl.vu.recoprov.signaldetectors.CompressionDistanceSignal;

File src/main/java/nl/vu/recoprov/utils/TransitiveClosure.java

View file
  • Ignore whitespace
 import nl.vu.recoprov.abstractclasses.SignalFilterer;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
-import nl.vu.recoprov.baseclasses.DependencyGraph.LabelledEdge;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
 import nl.vu.recoprov.signalaggregators.WeightedSumAggregator;
 
 public class TransitiveClosure extends SignalAggregator{