Anonymous avatar Anonymous committed 55a47b9

added URL reading

Comments (0)

Files changed (2)

+====
+    Copyright (C) 2013 pm286 <peter.murray.rust@googlemail.com>
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+            http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+====
+
+==Overview==
+I work on tasks when there is a need; I'd be very happy for volunteers. Assuming
+we split into PDF2SVG and Fonts, here are a few issues:
+
+==Refactoring==
+* keep the coupling between PDF2SVG and Fonts as low as possible
+* expose the heuristics for resolving non-standard codepoints
+
+==PDF2SVG==
+* page sizes - all work so far has been on "A4" or similar
+* colours - most work has been monochrome
+* rotated sections (is it possible to rotate these back into landscape?)
+* reading from streams
+* highlighting of uncertain characters in text.
+
+==FONTS==
+* interpretation of glyph vectors. Can characters be recognized from their outline?
+  This would be a fun project for those who like puzzles.
+* manual creation of codepointSets for non-standard fonts (generally not fun).
+* recognition of font families and styles through regexes (e.g. CMMI9 is
+  CML with italic (I think) and size 9)
+* GUI for creating font resources 

src/main/java/org/xmlcml/pdf2svg/PDF2SVGConverter.java

 package org.xmlcml.pdf2svg;
 
 import java.io.File;
-
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.util.PDFStreamEngine;
-import org.xmlcml.font.NonStandardFontManager;
 import org.xmlcml.font.CodePointSet;
 import org.xmlcml.font.FontFamilySet;
+import org.xmlcml.font.NonStandardFontManager;
 import org.xmlcml.graphics.svg.SVGSVG;
 import org.xmlcml.pdf2svg.log.XMLLogger;
 import org.xmlcml.pdf2svg.util.MenuSystem;
 	public static final String PASSWORD = "-password";
 	public static final String PUB = "-pub";
 
+	public static final String HTTP = "http";
 	private static final int DEFAULT_MAX_PAGE = 200;
 
 
 						LOGGLYPHS, EXITONERR, DEBUG_CHAR_CODE, DEBUG_CHAR_NAME, DEBUG_FONT_NAME);
 	}
 
+
+	private void openPDFURL(String urlString) throws Exception {
+		URL url = new URL(urlString);
+		InputStream is = url.openStream();
+		page2svgConverter = new PDFPage2SVGConverter();
+		LOG.debug("URL "+ urlString);
+		readDocument(is);
+		openAndProcess((File)null, url);
+	}
+
 	private void openPDFFile(File file) throws Exception {
 
 		svgPageList = null;
 		LOG.debug("PDF "+ file.getCanonicalPath());
 		readDocument(file, useNonSeqParser, PDFpassword);
 
+		openAndProcess(file, (URL) null);
+
+		document.close();
+	}
+
+	private void openAndProcess(File file, URL url) {
 		@SuppressWarnings("unchecked")
 		List<PDPage> pages = document.getDocumentCatalog().getAllPages();
 
 
 		System.out.print(" .. pages "+pr.toString()+" ("+pages.size()+") "); 
 
-		String basename = file.getName().toLowerCase();
-		if (basename.endsWith(PDF)) {
-			basename = basename.substring(0, basename.length() - 4);
+		String basename = null;
+		if (file != null) {
+			basename = file.getName().toLowerCase();
+			if (basename.endsWith(PDF)) {
+				basename = basename.substring(0, basename.length() - 4);
+			}
+		} else {
+			basename = "target"; // change later
 		}
 
 		createOutputDirectory(basename);
 			reportNewFontFamilyNames();
 			writeHTMLSystem(outfileList);
 		}
-
-		document.close();
 	}
 
 	private void addPageToPageList() {
 
 	}
 
-	private void readDocument(InputStream inputStream, boolean useNonSeqParser, String password) throws IOException {
-		if (useNonSeqParser) {
-//			document = PDDocument.loadNonSeq(is, null, password);
-		} else {
-			document = PDDocument.load(inputStream);
-			if (document.isEncrypted()) {
-				try {
-					document.decrypt(password);
-				} catch (InvalidPasswordException e) {
-					System.err
-							.printf("Error: The document in inputSstream is encrypted (use '-password' option).%n");
-					return;
-				} catch (CryptographyException e) {
-					System.err
-							.printf("Error: Failed to decrypt document in inputStream");
-					return;
-				}
-			}
-		}
+	private void readDocument(InputStream inputStream) throws IOException {
+		document = PDDocument.load(inputStream);
+//			if (document.isEncrypted()) {
+//				try {
+//					document.decrypt(password);
+//				} catch (InvalidPasswordException e) {
+//					System.err
+//							.printf("Error: The document in inputSstream is encrypted (use '-password' option).%n");
+//					return;
+//				} catch (CryptographyException e) {
+//					System.err
+//							.printf("Error: Failed to decrypt document in inputStream");
+//					return;
+//				}
+//			}
 
 	}
 
 
 		for (String filename : fileList) {
 			try {
-				readFileOrDirectory(filename);
+				readFileOrDirectoryOrURL(filename);
 			} catch (Exception e) {
 				e.printStackTrace();
-				System.err.printf("Cannot parse PDF '" + filename + "':" + e);
+				System.err.printf("Cannot parse PDF '" + filename + "':" + e+"\n");
 				if (exitOnError) {
 					return false;
 				}
 		}
 	}
 
-	private void readFileOrDirectory(String filename) {
-		File file = new File(filename);
-		if (!file.exists()) {
-			throw new RuntimeException("File does not exist: " + filename);
-		}
-		if (file.isDirectory()) {
-			File[] pdfFiles = file.listFiles(new FilenameFilter() {
-				public boolean accept(File dir, String filename) {
-					return filename.endsWith(PDF);
-				}
-			});
-			if (pdfFiles != null) {
-				for (File pdf : pdfFiles) {
-					try {
-						openPDFFile(pdf);
-					} catch (Exception e) {
-						LOG.error("Failed to convert file: "+pdf+", skipping", e);
+	private void readFileOrDirectoryOrURL(String filename) {
+		if (filename == null) {
+			throw new RuntimeException("No input filename");
+		} else if (filename.startsWith(HTTP)) {
+			try {
+				openPDFURL(filename);
+			} catch (Exception e) {
+				LOG.error("Failed to convert URL: "+filename, e);
+			}
+		} else {
+			File file = new File(filename);
+			if (!file.exists()) {
+				throw new RuntimeException("File does not exist: " + filename);
+			}
+			if (file.isDirectory()) {
+				File[] pdfFiles = file.listFiles(new FilenameFilter() {
+					public boolean accept(File dir, String filename) {
+						return filename.endsWith(PDF);
+					}
+				});
+				if (pdfFiles != null) {
+					for (File pdf : pdfFiles) {
+						try {
+							openPDFFile(pdf);
+						} catch (Exception e) {
+							LOG.error("Failed to convert file: "+pdf+", skipping", e);
+						}
 					}
 				}
-			}
-		} else {
-			try {
-				openPDFFile(file);
-			} catch (Exception e) {
-				throw new RuntimeException("Cannot convert file: "+file, e);
+			} else {
+				try {
+					openPDFFile(file);
+				} catch (Exception e) {
+					throw new RuntimeException("Cannot convert file: "+file, e);
+				}
 			}
 		}
-		
 	}
 	
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.