Sara Magliacane committed 08eae2f

adding homogeneity filter
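
A rough sketch of the new flow this commit introduces (method names from the diff below; the surrounding setup is assumed, not shown here):

	// 1. LuceneIndexer now splits each indexed document into k-sentence chunk
	//    files and records their Lucene ids on the DependencyNode.
	// 2. Experiment3 builds a sentence-level graph from the document-level one.
	// 3. HomogeneityFilter prunes the weaker direction of each similarity edge
	//    by comparing per-chunk self-similarity against cross-similarity.
	DependencyGraph sentenceGraph = depGraphLucene.copyGraph();
	new HomogeneityFilter().filterSignals(sentenceGraph);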


Files changed (8)

src/main/java/nl/vu/recoprov/LuceneIndexer.java

 	private Logger logger;
 	private Boolean cleanupIndex = false;
 	private String indexParentDir = "";
+	private SentenceSplitter sentenceSplitter = new SentenceSplitter();
 	
 	public LuceneIndexer() {
 		this("");
 			StringBuffer content = readContentFile(node);
 			Document doc = createLuceneDocument(node, content);
 			writer.addDocument(doc);
-
 		}
 
 		writer.close();
 			Document doc = searcher.doc(i);
 			String key = doc.getField("name").stringValue();
 			DependencyNode d = input.get(key);
-
 			d.setLuceneDocNumber(i);
+			
+			// split sentences		
+			if (!key.contains(SentenceSplitter.SENTENCE_DIRECTORY))
+				sentenceSplitter.createFileForKNgrams(d, key, readContentFile(d).toString());
+			
 			input.put(d.getCompleteFilepath(), d);
 		}
 

src/main/java/nl/vu/recoprov/SentenceSplitter.java

 import java.io.IOException;
 import java.io.InputStream;
 
+import nl.vu.recoprov.baseclasses.DependencyNode;
+
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 
 	private SentenceDetectorME sentenceDetector = null;
 
 	public static final String SENTENCE_DIRECTORY = "sentences/";
-	public static final int DEFAULT_K = 100;
+	public static final int DEFAULT_K = 1000;
 	
 	public SentenceSplitter() {
 		initialize();
 	}
 
 	// may be better to use standard Lucene Shingles
-	public void createFileForKNgrams(String filename, String content){
+	public void createFileForKNgrams(DependencyNode d, String filename, String content){
 		int counter = 0;
 		int k = 0;
 		StringBuffer buffer = new StringBuffer();
 			else{			
 				File file = new File(filename);
 				File outfile = new File(SENTENCE_DIRECTORY + file.getName()+"_"+counter);
+				d.addLuceneIdSentences(d.getId()+counter);
 				
 				if (!outfile.exists()){
 				

src/main/java/nl/vu/recoprov/TikaReader.java

 	private AutoDetectParser parser = new AutoDetectParser();
 	private ParseContext context = new ParseContext();
 	private String contentdirname = ConfigurationDefaults.CONTENTDIR;
-	private SentenceSplitter sentenceSplitter = new SentenceSplitter();
 	private Logger logger;
 
 	public TikaReader(String dir) {
 				d.getMetadata().setFSModified(new Date(f.lastModified()));
 				d.getMetadata().setFSSize(f.length());
 				
-				// split sentences		
-				if (!contentFilename.contains(SentenceSplitter.SENTENCE_DIRECTORY))
-					sentenceSplitter.createFileForKNgrams(contentFilename, textHandler.toString());
 				
 
 				if (f.getParentFile().isDirectory()) {

src/main/java/nl/vu/recoprov/baseclasses/DependencyNode.java

 package nl.vu.recoprov.baseclasses;
 
 import java.io.Serializable;
+import java.util.ArrayList;
 import java.util.Set;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.tika.metadata.Metadata;
 	//private ScoreDoc[] luceneSimilarity = null;
 	private String content = null;
 	private RecoMetadata recoMetadata = new RecoMetadata();
+	
+	private ArrayList<Integer> luceneIdSentences = new ArrayList<Integer>();
 
 	public DependencyNode(DependencyGraph d) {
 		setDepGraph(d);
 		//d.setLuceneSimilarity(getLuceneSimilarity());
 		d.setLuceneDocNumber(getLuceneDocNumber());
 		d.setMetadata(getMetadata());
+		d.setLuceneIdSentences(getLuceneIdSentences());
 		return d;
 	}
 	
 		return temp.toString();
 	}
 
+	public ArrayList<Integer> getLuceneIdSentences() {
+		return luceneIdSentences;
+	}
+
+	public void setLuceneIdSentences(ArrayList<Integer> luceneIdSentences) {
+		this.luceneIdSentences = luceneIdSentences;
+	}
+	
+	public void addLuceneIdSentences(Integer luceneid) {
+		this.luceneIdSentences.add(luceneid);
+	}
+
+
 
 }
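
The chunk ids stored here follow the scheme set in SentenceSplitter above: the chunk counter is added to the node id. A toy illustration, assuming a node whose id is 40:

	// If d.getId() == 40 and the document splits into three chunks,
	// the node ends up tracking sentence-chunk ids [40, 41, 42].
	for (int counter = 0; counter < 3; counter++) {
		d.addLuceneIdSentences(d.getId() + counter);
	}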

src/main/java/nl/vu/recoprov/experiments/Experiment3.java

 import nl.vu.recoprov.CompletePipeline;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.signaldetectors.CompressionDistanceSignal;
+import nl.vu.recoprov.signalfilters.HomogeneityFilter;
 import nl.vu.recoprov.utils.ConfigurationDefaults;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 		FileWriter writer = createFileResultsWriter();
 		DependencyGraph depGraphLucene = createLuceneGraph(pipeline);
 		DependencyGraph depGraph = createReferenceGraph(pipeline);
-		//experimentWithThresholds(pipeline, writer depGraph);
-		DependencyGraph depGraphSentencesLucene = createLuceneGraphForSentences(pipeline);
-		writer.append(depGraphSentencesLucene.toString());
+		DependencyGraph depGraphSentencesLucene = createLuceneGraphForSentences(pipeline, depGraphLucene);
+		experimentWithThresholds(pipeline, writer, depGraph, depGraphLucene, depGraphSentencesLucene);
 		
-		//
 		
 		writer.flush();
 		writer.close();
 
 	}
 		
-	public static void experimentWithThresholds(CompletePipeline pipeline, FileWriter writer , DependencyGraph depGraph) throws Exception{
-
-
+	public static void experimentWithThresholds(CompletePipeline pipeline,
+			FileWriter writer, DependencyGraph depGraph,
+			DependencyGraph depGraphLucene, DependencyGraph depGraphSentencesLucene) throws Exception {
 
 		DependencyGraph depGraphLuceneMore = createLuceneMoreGraph(pipeline);
 		
 		//add compression
 		//DependencyGraph depGraphLuceneCompression = createCompressionGraph(pipeline);
 	
-		double[] thresholds = { 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
-				0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
-				0.95 };
+		double[] thresholds = { 0.0, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
+				0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7 };
 
 		for (double threshold : thresholds) {
 			try {
 				DependencyGraph depGraphLuceneThreshold = depGraphLucene
 						.copyGraph();
 				pipeline.filterLuceneThreshold(depGraphLuceneThreshold);
+				
 
 				DependencyGraph depGraph1 = depGraphLuceneThreshold.copyGraph();
 				pipeline.aggregateSignals(depGraph1);
 				pipeline.aggregateSignals(depGraph1);
 				writeResults(pipeline, depGraph, depGraph1,
 						"LuceneMorePAN2Filters", writer);
+				
+				
+				// dependency graph with lucene sentences
+				
+				DependencyGraph depGraphSentencesLuceneThreshold = depGraphSentencesLucene
+						.copyGraph();
+				pipeline.filterLuceneThreshold(depGraphSentencesLuceneThreshold);
+				pipeline.aggregateSignals(depGraphSentencesLuceneThreshold);
+				writeResults(pipeline, depGraph, depGraphSentencesLuceneThreshold, "LuceneSentences", writer);
+
 
 			} catch (Exception e) {
 				e.printStackTrace();
 
 	
 	
-	private static DependencyGraph createLuceneGraphForSentences (CompletePipeline pipeline) {
+	private static DependencyGraph createLuceneGraphForSentences(CompletePipeline pipeline, DependencyGraph input) {
 		
+		logger.info("Loading or creating depGraphSentencesLucene.");
 		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphSentencesLucene.ser");
 
 		if (depGraph1 != null)
 			return depGraph1;
 
+		depGraph1 = input.copyGraph();
+		
 		try {
-			depGraph1 = createDependencyGraphForSentences(pipeline);
-			pipeline.luceneSimilaritySignal(depGraph1, "sentences/lucene_index/");
-			pipeline.aggregateSignals(depGraph1);
+			createDependencyGraphForSentences(pipeline);
+			new HomogeneityFilter().filterSignals(depGraph1);
+			//pipeline.aggregateSignals(depGraph1);
 			depGraph1.serialize("depGraphSentencesLucene.ser");
 			logger.info("depGraphSentencesLucene graph created.");
 		} catch (Exception e) {
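
Each sweep iteration deliberately works on fresh copies, so one threshold's pruning never leaks into the next. The per-threshold pattern for the new sentence graph, condensed (how filterLuceneThreshold picks up the current threshold is not visible in this hunk):

	for (double threshold : thresholds) {
		// copy, prune, aggregate, log one results row; the base graph stays intact
		DependencyGraph g = depGraphSentencesLucene.copyGraph();
		pipeline.filterLuceneThreshold(g);
		pipeline.aggregateSignals(g);
		writeResults(pipeline, depGraph, g, "LuceneSentences", writer);
	}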

src/main/java/nl/vu/recoprov/signaldetectors/LuceneSimilaritySignal.java

 			+ "_inverse";
 
 	private String indexDirFilename;
-	private Logger logger;
+	private static Logger logger = LoggerFactory
+			.getLogger(LuceneSimilaritySignal.class);
 
 	public LuceneSimilaritySignal() {
 		this(ConfigurationDefaults.INDEX_DIR);
 		int numdocs = reader.numDocs();
 
 		for (int i = 0; i < numdocs; i++) {
-			Terms tfvs;
-				try {
-				tfvs = reader.getTermVector(i, "contents");
-			} catch (IOException e) {
-				logger.error(
-						"IOException in retrieving term vector for doc {}", i);
-				e.printStackTrace();
-				continue;
-			}
-
-			ScoreDoc[] hits = new ScoreDoc[0];
-			StringBuffer querystring = null;
-
-			if (tfvs == null) {
-				logger.error("No term vector for doc {}", i);
-			} else {
-				logger.info("LuceneSimilarity: working on doc {} ", i);
-				TermsEnum tenum;
-				try {
-					tenum = tfvs.iterator(null);
-				} catch (IOException e) {
-					logger.error("No iterator for term vector for doc {}", i);
-					e.printStackTrace();
-					continue;
-				}
-
-				try {
-					while (tenum.next() != null) {
-
-						if (querystring == null) {
-							querystring = new StringBuffer(tenum.term()
-									.utf8ToString());
-						} else {
-							querystring.append(" OR ");
-							querystring.append(tenum.term().utf8ToString());
-						}
-
-					}
-				} catch (IOException e) {
-					logger.error("Iterator exception for term vector: doc {}",
-							i);
-					e.printStackTrace();
-					continue;
-				}
-
-				hits = searchForString(searcher, querystring);
-
-			}
-
-			Document doc;
-			try {
-				doc = searcher.doc(i);
-			} catch (IOException e) {
-				logger.error("Searcher exception for doc {}", i);
-				e.printStackTrace();
-				continue;
-			}
-
-			String key = doc.getField("name").stringValue();
-
-			for (int j = 0; j < hits.length; j++) {
-
-				// if (hits[j].score <
-				// ConfigurationDefaults.LUCENE_THRESHOLD)
-				// continue;
-
-				if (hits[j].doc == i)
-					continue;
-
-				// d.addDepNodeSimilarity("lucene_similarity", d2,
-				// hits[j].score);
-				input.addEdge(i, hits[j].doc, LUCENE_SIMILARITY, hits[j].score);
-
-			}
+			ScoreDoc[] hits = computeSimilarity(reader, searcher, input, i);
+			addEdges(searcher, input, i, hits);
 		}
 		
 		logger.info("Signal computation completed successfully.");
 
 	}
 
-	public ScoreDoc[] searchForString(IndexSearcher searcher,
+	public static ScoreDoc[] searchForString(IndexSearcher searcher,
 			StringBuffer queryString) {
 
 		return searchForString("contents", searcher, queryString,
 
 	}
 
-	public ScoreDoc[] searchForString(String fieldname, IndexSearcher searcher,
+	public static ScoreDoc[] searchForString(String fieldname, IndexSearcher searcher,
 			StringBuffer queryString, Analyzer analyzer) {
 
 		if (queryString.equals(""))
 		return str.matches(".*[a-zA-Z]+.*");
 	}
 
+	
+	public static ScoreDoc[] computeSimilarity(IndexReader reader, IndexSearcher searcher, DependencyGraph input, int i) {
+		Terms tfvs;
+		ScoreDoc[] hits = new ScoreDoc[0];
+		StringBuffer querystring = null;
+		try {
+			tfvs = reader.getTermVector(i, "contents");
+		} catch (IOException e) {
+			logger.error("IOException in retrieving term vector for doc {}", i);
+			e.printStackTrace();
+			return hits;
+		}
+
+		if (tfvs == null) {
+			logger.error("No term vector for doc {}", i);
+		} else {
+			logger.trace("LuceneSimilarity: working on doc {} ", i);
+			TermsEnum tenum;
+			try {
+				tenum = tfvs.iterator(null);
+			} catch (IOException e) {
+				logger.error("No iterator for term vector for doc {}", i);
+				e.printStackTrace();
+				return hits;
+			}
+
+			try {
+				while (tenum.next() != null) {
+
+					if (querystring == null) {
+						querystring = new StringBuffer(tenum.term()
+								.utf8ToString());
+					} else {
+						querystring.append(" OR ");
+						querystring.append(tenum.term().utf8ToString());
+					}
+
+				}
+			} catch (IOException e) {
+				logger.error("Iterator exception for term vector: doc {}", i);
+				e.printStackTrace();
+				return hits;
+			}
+
+			hits = searchForString(searcher, querystring);
+		}
+		return hits;
+	}
+
+	private void addEdges(IndexSearcher searcher, DependencyGraph input, int i,
+			ScoreDoc[] hits) {
+		Document doc;
+		try {
+			doc = searcher.doc(i);
+		} catch (IOException e) {
+			logger.error("Searcher exception for doc {}", i);
+			e.printStackTrace();
+			return;
+		}
+
+		String key = doc.getField("name").stringValue();
+
+		for (int j = 0; j < hits.length; j++) {
+
+			// if (hits[j].score <
+			// ConfigurationDefaults.LUCENE_THRESHOLD)
+			// continue;
+
+			if (hits[j].doc == i)
+				continue;
+
+			// d.addDepNodeSimilarity("lucene_similarity", d2,
+			// hits[j].score);
+			input.addEdge(i, hits[j].doc, LUCENE_SIMILARITY, hits[j].score);
+
+		}
+	}
 }
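
For reference, the extracted computeSimilarity is essentially a hand-rolled more-like-this: it ORs together every term of the document's own term vector and runs that as a query. A toy version of the query assembly (the terms are illustrative):

	// Builds "provenance OR lucene OR similarity" the same way the
	// while (tenum.next() != null) loop above does.
	String[] terms = { "provenance", "lucene", "similarity" };
	StringBuffer querystring = new StringBuffer();
	for (String term : terms) {
		if (querystring.length() > 0)
			querystring.append(" OR ");
		querystring.append(term);
	}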

src/main/java/nl/vu/recoprov/signalfilters/HomogeneityFilter.java

+package nl.vu.recoprov.signalfilters;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import nl.vu.recoprov.SentenceSplitter;
+import nl.vu.recoprov.abstractclasses.SignalFilterer;
+import nl.vu.recoprov.baseclasses.DependencyGraph;
+import nl.vu.recoprov.baseclasses.LabelledEdge;
+import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.signaldetectors.LuceneSimilaritySignal;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
+
+public class HomogeneityFilter extends SignalFilterer {
+
+	private Logger logger;
+	
+	public HomogeneityFilter(){
+		logger = LoggerFactory
+				.getLogger("nl.vu.recoprov.signalfilters.HomogeneityFilter");
+	}
+	
+	@Override
+	public DependencyGraph filterNode(DependencyGraph input, DependencyNode d) {
+		// for each outgoing edge and its target node
+
+		if (d == null) {
+			logger.error("filterNode called with a null node");
+			return input;
+		}
+
+		// get all of its outgoing edges
+		ArrayList<LabelledEdge> edgearray = input.getAllEdges(d
+				.getLuceneDocNumber());
+
+		if (edgearray == null)
+			return input;
+		
+		IndexReader reader = createIndexReader(SentenceSplitter.SENTENCE_DIRECTORY
+				+ ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		if (reader == null)
+			return input;
+		IndexSearcher searcher = new IndexSearcher(reader);
+
+		// for each of its outgoing edges
+		for (LabelledEdge edge : edgearray) {
+			int value = compareTwoNodes(input, d, input.get(edge.getId()), reader, searcher);
+			if (value == 1){
+				logger.trace("Removing edge {} -> {}", d.getId(), edge.getId());
+				input.removeEdge(d.getId(), edge.getId());
+			}
+			if (value == -1){
+				logger.trace("Removing edge {} -> {}", edge.getId(), d.getId());
+				input.removeEdge(edge.getId(), d.getId());
+			}
+		}
+
+		try {
+			reader.close();
+		} catch (IOException e) {
+			logger.error("Could not close the sentence index reader");
+		}
+
+		return input;
+	}
+
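+	/**
+	 * Runs every sentence chunk of d1 as a similarity query and sums the
+	 * scores of hits landing in d1's own chunks (score1, homogeneity)
+	 * versus d2's chunks (score2, cross-similarity). Returns +1 when
+	 * score1 > score2 (caller removes d1 -> d2), -1 when score1 < score2
+	 * (caller removes d2 -> d1), and 0 on a tie.
+	 */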
+	private int compareTwoNodes(DependencyGraph input, DependencyNode d1,
+			DependencyNode d2, IndexReader reader, IndexSearcher searcher) {
+		
+		ArrayList<Integer> luceneIdSentences1 = d1.getLuceneIdSentences();
+		ArrayList<Integer> luceneIdSentences2 = d2.getLuceneIdSentences();
+		double score1 = 0.0;
+		double score2 = 0.0;
+		
+		// for each sentence in the original node
+		for (Integer i : luceneIdSentences1) {
+			ScoreDoc[] hits = LuceneSimilaritySignal.computeSimilarity(reader, searcher, input, i);
+			for (ScoreDoc scoredoc : hits) {
+				// sum similarity of hits that land back in d1's own chunks
+				if (luceneIdSentences1.contains(scoredoc.doc)) {
+					score1 += scoredoc.score;
+				}
+				// and of hits that land in d2's chunks
+				if (luceneIdSentences2.contains(scoredoc.doc)) {
+					score2 += scoredoc.score;
+				}
+			}
+		}
+		
+		logger.trace("Comparing two nodes: {} -> {}; scores: {} {}",
+				d1.getId(), d2.getId(), score1, score2);
+		
+		if (score1 > score2){
+			return +1;
+		}
+		
+		if (score1 < score2){
+			return -1;
+		}
+		
+		return 0;
+
+	}
+	
+	private IndexReader createIndexReader(String indexDirFilename) {
+		File indexDir = new File(indexDirFilename);
+
+		FSDirectory store;
+		IndexReader reader;
+		try {
+			store = SimpleFSDirectory.open(indexDir);
+			reader = DirectoryReader.open(store);
+		} catch (IOException e1) {
+			logger.error("Lucene Directory failing: {}", indexDir.getName());
+			e1.printStackTrace();
+			return null;
+		}
+
+		return reader;
+	}
+}
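
A minimal usage sketch, mirroring the Experiment3 call above (the serialized graph name is the one used there; any pre-built graph with Lucene similarity edges and per-node chunk ids would do):

	DependencyGraph graph = DependencyGraph
			.deserializeDependencyGraph("depGraphSentencesLucene.ser");
	if (graph != null)
		new HomogeneityFilter().filterSignals(graph);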

src/main/java/nl/vu/recoprov/utils/ConfigurationDefaults.java

 	/**
 	 * Lucene indexer defaults
 	 */
-	public static String RELATIVE_INDEX_DIR = "lucene_index/";
+	public static final String RELATIVE_INDEX_DIR = "lucene_index/";
 	public static String INDEX_DIR = "lucene_index/";
 	public static final Version LUCENE_VERSION = Version.LUCENE_42;
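
Making RELATIVE_INDEX_DIR final matters here because HomogeneityFilter now derives the sentence-index path from it; the composition works out as:

	// "sentences/" + "lucene_index/" -> "sentences/lucene_index/",
	// the same path Experiment3 previously passed as a literal.
	String sentenceIndex = SentenceSplitter.SENTENCE_DIRECTORY
			+ ConfigurationDefaults.RELATIVE_INDEX_DIR;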
 	