Commits

Sara Magliacane committed d2be735

Debugging before the weekend.

  • Participants
  • Parent commits 3e84561

Comments (0)

Files changed (6)

File src/main/java/nl/vu/recoprov/CompletePipeline.java

 	private ProvDMtranslator ProvDMtranslator = new ProvDMtranslator();
 	private DropboxClient client = new DropboxClient();
 	private BackwardTemporalFilter backwardTemporalFilter = new BackwardTemporalFilter();
-	private LuceneIndexer indexer = new LuceneIndexer();
 	private LuceneSimilaritySignal luceneSimilaritySignal = new LuceneSimilaritySignal();
 	private LuceneInverseSimilarity luceneInverseSimilarity = new LuceneInverseSimilarity();
 	private MetadataSimilaritySignal metadataSimilaritySignal = new MetadataSimilaritySignal();
 	private CompressionDistanceSignal compressionDistanceSignal = new CompressionDistanceSignal();
 	
 	private TikaReader tika;
+	private LuceneIndexer indexer;
 	
 	public CompletePipeline() {
 		this (false, "", null);
 		logger = LoggerFactory
 				.getLogger("nl.vu.recoprov.CompletePipeline");
 		tika = new TikaReader(currentDir);
+		indexer = new LuceneIndexer();
 	}
 	
 	public static void main(String[] args) throws Exception {
 	}
 	
 	public void loadMetadaAndIndexes(DependencyGraph depGraph) throws IOException {
-		tika.read(depGraph, params);
+		tikaRead(depGraph);
 		indexFiles(depGraph);
-
 		depGraph = ImageReader.read(currentDir, depGraph);
-
+	}
+	
+	public void tikaRead(DependencyGraph depGraph){
+		tika.read(depGraph, params);
 	}
 
 	public void indexFiles(DependencyGraph depGraph) throws IOException{
 		luceneSimilaritySignal.computeSignal(depGraph);
 	}
 	
+	public void luceneSimilaritySignal(DependencyGraph depGraph, String indexDir){
+		new LuceneSimilaritySignal(indexDir).computeSignal(depGraph);
+	}
+	
+	
 	public void luceneMoreLikeThisSignal(DependencyGraph depGraph){
 		luceneMoreLikeThisSignal.computeSignal(depGraph);
 	}
 		connectToInternet = online;
 	}
 	
+	
+	public DependencyGraph createDependencyGraphForSentences() throws IOException, DropboxException{
+		File sentenceDir = new File (SentenceSplitter.SENTENCE_DIRECTORY);
+		if (!sentenceDir.exists()){
+			initDependencyGraph();
+		}
+		
+		CompletePipeline pipeline = new CompletePipeline(false, SentenceSplitter.SENTENCE_DIRECTORY);		
+		DependencyGraph depGraph = pipeline.getDropboxMetadata();
+		pipeline.tikaRead(depGraph);
+		LuceneIndexer luceneIndexer = new LuceneIndexer(SentenceSplitter.SENTENCE_DIRECTORY);
+		luceneIndexer.indexFiles(depGraph);
+		
+		return depGraph;
 
+	}
+	
 	
 }

File src/main/java/nl/vu/recoprov/LuceneIndexer.java

 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.SimpleFSDirectory;
+import org.slf4j.LoggerFactory;
+import org.slf4j.Logger;
+
+import com.dropbox.client2.exception.DropboxException;
+
 
 /**
  * Create the Lucene index for the files in the DependencyGraph.
 
 public class LuceneIndexer {
 
+	private Logger logger;
 	private Boolean cleanupIndex = false;
+	private String indexParentDir = "";
+	
+	public LuceneIndexer() {
+		this("");
+	}
+	
+	public LuceneIndexer( String indexParentDir){
+		this.indexParentDir = indexParentDir;
+		logger = LoggerFactory
+				.getLogger("nl.vu.recoprov.signaldetector.LuceneIndexer");
+	}
 
+	
 	public void indexFiles(DependencyGraph input) throws IOException {
 
-		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		File indexDir;
+
+		if (indexParentDir.equals(""))
+			indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		else
+			indexDir = new File(indexParentDir,
+					ConfigurationDefaults.RELATIVE_INDEX_DIR);
 
 		if (!indexDir.exists()) {
 			// if index directory doesn't exist, create the index
 		
 		int count = 0;
 
-		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		File indexDir;
+
+		if (indexParentDir.equals(""))
+			indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		else
+			indexDir = new File(indexParentDir,
+					ConfigurationDefaults.RELATIVE_INDEX_DIR);
+
 		ConfigurationDefaults.INDEX_DIR = indexDir.getAbsolutePath();
 		
 		FSDirectory store = SimpleFSDirectory.open(indexDir);
 				continue;
 			}
 
-			System.out.println(count++ + ": adding " + name);
+			logger.info(count++ + ": adding " + name);
 
 			DependencyNode node = input.get(name);
 			StringBuffer content = readContentFile(node);
 
 	private void assignLuceneNumbers(DependencyGraph input) throws IOException {
 
-		File indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		File indexDir;
+
+		if (indexParentDir.equals(""))
+			indexDir = new File(ConfigurationDefaults.RELATIVE_INDEX_DIR);
+		else
+			indexDir = new File(indexParentDir,
+					ConfigurationDefaults.RELATIVE_INDEX_DIR);
+
 		FSDirectory store;
 
 		store = SimpleFSDirectory.open(indexDir);
 		return writer;
 
 	}
-	
+
 
 }
 	

File src/main/java/nl/vu/recoprov/SentenceSplitter.java

 			}
 			
 			else{			
-				try {
-					writer = new FileWriter(new File(SENTENCE_DIRECTORY + filename+"_"+counter));
-					writer.append(buffer);
-				} catch (IOException e) {
-					e.printStackTrace();
-				} finally{
-					if (writer != null)
-						try{
-							writer.close();
-						} catch (IOException e){
-						}
-				}
+				File file = new File(filename);
+				File outfile = new File(SENTENCE_DIRECTORY + file.getName()+"_"+counter);
 				
+				if (!outfile.exists()){
+				
+					try {
+						writer = new FileWriter(outfile);
+						writer.append(buffer);
+					} catch (IOException e) {
+						e.printStackTrace();
+					} finally{
+						if (writer != null)
+							try{
+								writer.close();
+							} catch (IOException e){
+							}
+					}
+				}
 				buffer = new StringBuffer();
 				k = 0;
 				counter++;

File src/main/java/nl/vu/recoprov/TikaReader.java

 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 
 
 	private ParseContext context = new ParseContext();
 	private String contentdirname = ConfigurationDefaults.CONTENTDIR;
 	private SentenceSplitter sentenceSplitter = new SentenceSplitter();
+	private Logger logger;
 
 	public TikaReader(String dir) {
 		currentDir = dir;
 		tika.setMaxStringLength(-1);
 		context.set(Parser.class, parser);
-		
-		// make content dir - where you store the contents of the files
-		File contentdir = new File(contentdirname);
-		contentdir.mkdir();
+		logger = LoggerFactory
+				.getLogger("nl.vu.recoprov.TikaReader");
 	}
 
 	public DependencyGraph read(DependencyGraph depNodeMap, String[] params) {
 	public DependencyGraph read(String dirname, DependencyGraph depNodeMap,
 			String[] params) {
 
+		// make content dir - where you store the contents of the files
+		File contentdir = new File(contentdirname);
+		if (!contentdir.exists())
+			contentdir.mkdir();
+		
 		File dir = new File(dirname);
 
 		for (String filename : dir.list()) {
 					continue;
 				}
 
-				if (ConfigurationDefaults.checkBlackList(filename, params)) {
+				if (ConfigurationDefaults.isInBlackList(filename, params)) {
 					continue;
 				}
 
 
 				parser.parse(tin, textHandler, metadata, context);
 
-				File contentFile = new File(contentdirname, filename);
-				String contentFilename = contentFile.getAbsolutePath();
-				FileWriter out = new FileWriter(contentFile);		
-				out.write(textHandler.toString());
-				out.close();
-				
-				// split sentences		
-				sentenceSplitter.createFileForKNgrams(filename, textHandler.toString());
-
 				tin.close();
 
 				String filepath = f.getAbsolutePath();
 				DependencyNode d = depNodeMap.get(filepath);
 
 				if (d == null) {
-					System.out.println("DependencyNode not found: " + filepath);
-					// TODO: do not continue
+					logger.error("DependencyNode not found: {}", filepath);
+					continue;
 				}
+				else{
+					logger.trace("Processing dependency node: {}", filepath);
+				}
+				
+				String contentFilename = filepath;
 
+				// if they are not only txt files, copy text content
+				if (!mimeType.contains("text/plain")){
+				
+					File contentFile = new File(contentdirname, filename +".txt");
+					contentFilename = contentFile.getAbsolutePath();
+					FileWriter out = new FileWriter(contentFile);		
+					out.write(textHandler.toString());
+					out.close();
+				}
+				
+				//otherwise refer to the files itself
+				d.setContent(contentFilename);		
+				
 				metadata.add(Metadata.CONTENT_TYPE, mimeType);
 				d.setMetadata(metadata);
-				d.setContent(contentFilename);
 				d.getMetadata().setFSModified(new Date(f.lastModified()));
 				d.getMetadata().setFSSize(f.length());
+				
+				// split sentences		
+				if (!contentFilename.contains(SentenceSplitter.SENTENCE_DIRECTORY))
+					sentenceSplitter.createFileForKNgrams(contentFilename, textHandler.toString());
+				
 
 				if (f.getParentFile().isDirectory()) {
 					d.getMetadata().setFsDirModified(

File src/main/java/nl/vu/recoprov/experiments/Experiment3.java

 	public static final String RESULTS_DIRECTORY = "results/";
 	
 	public static void main(String[] args) throws Exception {
-
-		dirfile = "sentences/";
+		
+		// pass the directory and the json of the reference graph
 		
 		logger = LoggerFactory
 				.getLogger("nl.vu.recoprov.experiments.Experiment3");
 
-		if (!checkInitialParameters()) {
+		if (!checkInitialParameters(args)) {
 			System.exit(0);
 		}
 		
 		
 		// experimenting to get perfect recall
 		ConfigurationDefaults.LUCENE_MAX_NUMBER_DOCS = 6000;
-
 		CompletePipeline pipeline = new CompletePipeline(false, dirfile);
+		
 		FileWriter writer = createFileResultsWriter();
+		DependencyGraph depGraphSentencesLucene = createLuceneGraphForSentences(pipeline);
+		writer.append(depGraphSentencesLucene.toString());
+		
+		//DependencyGraph depGraph = createReferenceGraph(pipeline);
+		//experimentWithThresholds(pipeline, writer depGraph);
+		
+		writer.flush();
+		writer.close();
+
+	}
+		
+	public static void experimentWithThresholds(CompletePipeline pipeline, FileWriter writer , DependencyGraph depGraph) throws Exception{
+
+
 		DependencyGraph depGraphLucene = createLuceneGraph(pipeline);
-		DependencyGraph depGraph = createReferenceGraph(pipeline);
 		DependencyGraph depGraphLuceneMore = createLuceneMoreGraph(pipeline);
 		
 		//add compression
 				throw e;
 			}
 		}
-		
-		writer.flush();
-		writer.close();
+	
 
 	}
 
 	
 	
 
-	public static Boolean checkInitialParameters() {
-		logger.info("Starting experiment 3: Plagiarism Detection Corpus");
+	public static Boolean checkInitialParameters(String [] args) {
+
+		
+		if (args.length >= 1) {
+			dirfile = args [0];
+		}
+		
+		if (args.length >= 2){
+			jsonfile = args [1];
+		}
+		
+		logger.info("Starting experiment: {} {}", dirfile, jsonfile);
 
 		File dir = new File(dirfile);
 
 
 		return depGraphLuceneCompression;
 	}
+	
+	
+	private static DependencyGraph createDependencyGraphForSentences (CompletePipeline pipeline) {
+		
+		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphSentences.ser");
+
+		if (depGraph1 != null)
+			return depGraph1;
+
+		try {
+			depGraph1 = pipeline.createDependencyGraphForSentences();
+			depGraph1.serialize("depGraphSentences.ser");
+			logger.info("depGraphSentences graph created.");
+		} catch (Exception e) {
+			e.printStackTrace();
+			logger.error("Could not create depGraphSentences graph.");
+		}
+
+		return depGraph1;
+	}
+
+	
+	
+	private static DependencyGraph createLuceneGraphForSentences (CompletePipeline pipeline) {
+		
+		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphSentencesLucene.ser");
+
+		if (depGraph1 != null)
+			return depGraph1;
+
+		try {
+			depGraph1 = createDependencyGraphForSentences(pipeline);
+			pipeline.luceneSimilaritySignal(depGraph1, "sentences/lucene_index/");
+			pipeline.aggregateSignals(depGraph1);
+			depGraph1.serialize("depGraphSentencesLucene.ser");
+			logger.info("depGraphSentencesLucene graph created.");
+		} catch (Exception e) {
+			e.printStackTrace();
+			logger.error("Could not create depGraphSentencesLucene graph.");
+		}
+
+		return depGraph1;
+	}
 
 }

File src/main/java/nl/vu/recoprov/utils/ConfigurationDefaults.java

 
 	}
 
-	public static boolean checkBlackList(String filename,
+	public static boolean isInBlackList(String filename,
 			String[] params) {
-		if (params == null)
-			return true;
-		
-		for (String s: params){
-			if (filename.contains(s))
-				return true;
+		if (params != null){
+			for (String s: params){
+				if (filename.contains(s))
+					return true;
+			}
 		}
 		
-		
 		if (filename.contains(ConfigurationDefaults.RELATIVE_INDEX_DIR))
 			return true;