Anonymous avatar Anonymous committed 80be5d4 Draft

partial refactoring of charCode, textPosition; added regression test

Comments (0)

Files changed (6)

 			<artifactId>levigo-jbig2-imageio</artifactId>
 			<version>1.3</version>
 		</dependency>
+		
+		<dependency>
+			<!-- only needed on the test classpath -->
+			<groupId>org.xml-cml</groupId>
+			<artifactId>jumbo-testutil</artifactId>
+			<version>1.1-SNAPSHOT</version>
+			<scope>test</scope>
+		</dependency>
 	</dependencies>
 	
 	<build>

src/main/java/org/xmlcml/pdf2svg/CodePointSet.java

 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import nu.xom.Builder;
 import nu.xom.Element;
 		return codePointsElement;
 		
 	}
+	
+	/**
+	 * Returns the keys of {@code codePointByUnicodeNameMap}.
+	 *
+	 * @return a read-only view of the key set, so callers cannot remove
+	 *         entries from the underlying map through it
+	 */
+	public Set<String> getUnicodeNames() {
+		return java.util.Collections.unmodifiableSet(codePointByUnicodeNameMap.keySet());
+	}
+
+	/**
+	 * Returns the keys of {@code codePointByNameMap}.
+	 *
+	 * @return a read-only view of the key set, so callers cannot remove
+	 *         entries from the underlying map through it
+	 */
+	public Set<String> getNames() {
+		return java.util.Collections.unmodifiableSet(codePointByNameMap.keySet());
+	}
 
 	public int size() {
 		return codePointByDecimalMap.size();

src/main/java/org/xmlcml/pdf2svg/PDFPage2SVGConverter.java

 //	private Composite composite;
 //	private Paint paint;
 	private PDGraphicsState graphicsState;
-	private Matrix textPos;
+	private Matrix testMatrix;
 	private PDFont pdFont;
 
 	private String fontFamilyName;
 	private FontFamily fontFamily;
 
 	private HashMap<String, Integer> integerByClipStringMap;
+	private SVGElement defs1;
+	private boolean reportedEncodingError = false;
+	private TextPosition textPosition;
 
-	private SVGElement defs1;
-
-	private boolean reportedEncodingError = false;;
-	
+	/** Char code of the character being processed; assigned in getCharCodeAndSetEncodingAndCharname(). */
+	private int charCode;
 
 	public PDFPage2SVGConverter() throws IOException {
 		super();
 
 	@Override
 	protected void processTextPosition(TextPosition textPosition) {
-
+		this.textPosition = textPosition;
 		charname = null;
 		charWasLogged = false;
 
 
 		setAndProcessFontNameAndFamilyName();
 
-		int charCode = getCharCodeAndSetEncodingAndCharname(textPosition);
+		int charCode = getCharCodeAndSetEncodingAndCharname();
 
 		SVGText svgText = new SVGText();
 		
 		if (pdf2svgConverter.useXMLLogger) {
 			pdf2svgConverter.xmlLogger.newFont(amiFont);
 			if (pdf2svgConverter.xmlLoggerLogGlyphs) {
-				captureAndIndexGlyphVector(textPosition, charCode);
+				captureAndIndexGlyphVector(charCode);
 			}
 		}
 
-		createAndReOrientateTextPosition(textPosition, svgText);
+		createAndReOrientateTextPosition(svgText);
 
 		svgText.setFontWeight(amiFont.getFontWeight());
 
 		LOG.trace("Fn: "+fontName+"; Ff: "+fontFamilyName+"; "+textContent+"; "+charCode+"; "+charname);
 
 		float width = getCharacterWidth(pdFont, textContent);
-		addContentAndAttributesToSVGText(textPosition, svgText, width, charCode);
+		addContentAndAttributesToSVGText(svgText, width, charCode);
 
 		svg.appendChild(svgText);
 	}
 		}
 		fontFamilyName = amiFont.getFontFamilyName();
 		fontFamily = amiFontManager.getFontFamily(fontFamilyName);
-//		if (fontFamily.getCodePointSet() == null && amiFont.isSymbol()) {
-//			throw new RuntimeException("Symbol font ("+fontFamilyName+") needs codePointSet");
-//		}
-//		checkPublisherFontFamily();
 	}
 
-//	private void checkPublisherFontFamily() {
-//		Publisher publisher = pdf2svgConverter.getPublisher();
-//		if (publisher != null) {
-//			if (!publisher.containsFontFamilyName(fontFamilyName))  {
-//				publisher.addFontFamilyName(fontFamilyName);
-//				LOG.trace("added fontFamilyName "+fontFamilyName);
-//			} else {
-//				LOG.trace("already in publisher fontFamilySet "+fontFamilyName);
-//			}
-//		}
-//	}
-
-	private int getCharCodeAndSetEncodingAndCharname(TextPosition textPosition) {
+	private int getCharCodeAndSetEncodingAndCharname() {
 
 		encoding = amiFont.getEncoding();
 		int[] codePoints = textPosition.getCodePoints();
 		LOG.trace("codePoints: "+(codePoints == null ? null : codePoints.length));
-		int charCode = -1;
+		charCode = -1;
 		if (encoding == null) {
 			if (codePoints != null) {
 				charCode = codePoints[0];
 				reportedEncodingError = true;
 			}
 		} else {
-//			if (encoding instanceof DictionaryEncoding) {
-				getCharnameThroughEncoding(charCode);
-//			}
+			getCharnameThroughEncoding(charCode);
 		}
 
 		return charCode;
 		}
 	}
 
-	private void addContentAndAttributesToSVGText(TextPosition textPosition, SVGText svgText,
-			float width, int charCode) {
+	private void addContentAndAttributesToSVGText(SVGText svgText, float width, int charCode) {
 		try {
 			svgText.setText(textPosition.getCharacter());
 		} catch (RuntimeException e) {
 			// drops here if cannot encode as XML character
-			annotateUnusualCharacters(textPosition, svgText);
+			annotateUnusualCharacters(svgText);
 		}
 		
 		getFontSizeAndSetNotZeroRotations(svgText);
 		if (amiFont.isBold() != null && amiFont.isBold()) {
 			svgText.setFontWeight("bold");
 		}
-		addCodePointToHighPoints(textPosition);
+		addCodePointToHighPoints();
 		if ("Symbol".equals(svgText.getFontFamily())) {
 			svgText.setFontFamily("Symbol-X"); // to stop browsers misbehaving
 		}
 		}
 	}
 
-	private void captureAndIndexGlyphVector(TextPosition text, int charCode) {
+	private void captureAndIndexGlyphVector(int charCode) {
 		String key = charname;
 		if (key == null) {
 			key = "" + charCode;
 		if (pathString == null) {
 			ensurePageSize();
 			PDFGraphics2D graphics = new PDFGraphics2D(amiFont);
-			Matrix textPos = text.getTextPos().copy();
+			Matrix textPos = textPosition.getTextPos().copy();
 			float x = textPos.getXPosition();
 			// the 0,0-reference has to be moved from the lower left (PDF) to
 			// the upper left (AWT-graphics)
 			// transformation
 			// we should remove it from the parameter list in the long run
 			try {
-				pdFont.drawString(text.getCharacter(), text.getCodePoints(),
+				pdFont.drawString(textPosition.getCharacter(), textPosition.getCodePoints(),
 						graphics, 1, at, x, y);
 			} catch (IOException e) {
 				throw new RuntimeException("font.drawString", e);
 		}
 	}
 
-	private int addCodePointToHighPoints(TextPosition text) {
+	private int addCodePointToHighPoints() {
 		pdf2svgConverter.ensureCodePointSets();
-		int charCode = text.getCharacter().charAt(0);
+		int charCode = textPosition.getCharacter().charAt(0);
 		if (charCode > 255) {
 			if (pdf2svgConverter.knownCodePointSet.containsKey((Integer)charCode)) {
 				// known
 		return width;
 	}
 
-	private void annotateUnusualCharacters(TextPosition text, SVGText svgText) {
-		char cc = text.getCharacter().charAt(0);
-		String s = AMIFontManager.BADCHAR_S+(int)cc+AMIFontManager.BADCHAR_E;
+	private void annotateUnusualCharacters(SVGText svgText) {
+//		char cc = textPosition.getCharacter().charAt(0);
+		String s = AMIFontManager.BADCHAR_S+(int)charCode+AMIFontManager.BADCHAR_E;
 		if (pdf2svgConverter.useXMLLogger && !charWasLogged) {
-			pdf2svgConverter.xmlLogger.newCharacter(fontName, fontFamilyName, charname, cc);
+			pdf2svgConverter.xmlLogger.newCharacter(fontName, fontFamilyName, charname, charCode);
 			charWasLogged = true;
 		}
 		else
 			LOG.debug(s+" "+fontName+" ("+fontSubType+") charname: "+charname);
-		s = ""+(char)(BADCHAR+Math.min(9, cc));
+		s = ""+(char)(BADCHAR+Math.min(9, charCode));
 		svgText.setText(s);
 		svgText.setStroke("red");
 		svgText.setFill("red");
 
 	private double getFontSizeAndSetNotZeroRotations(SVGText svgText) {
 		// attempts to see if matrices were scaling glyphs - apparently not.
-		AffineTransform at = textPos.createAffineTransform();
+		AffineTransform at = testMatrix.createAffineTransform();
 //		double atScaleX = at.getScaleX();
 //		double atScaleY = at.getScaleY();
 //		double atScaleRatio = atScaleX/atScaleY;
 
 	/** changes coordinates because AWT and SVG use top-left origin while PDF uses bottom left
 	 * 
-	 * @param text
+	 * Uses the textPosition field (set in processTextPosition) rather than a parameter.
+	 *
 	 * @param svgText
 	 */
-	private void createAndReOrientateTextPosition(TextPosition text, SVGText svgText) {
+	private void createAndReOrientateTextPosition(SVGText svgText) {
 		ensurePageSize();
-		textPos = text.getTextPos().copy();
-		float x = textPos.getXPosition();
+		testMatrix = textPosition.getTextPos().copy();
+		float x = testMatrix.getXPosition();
 		// the 0,0-reference has to be moved from the lower left (PDF) to
 		// the upper left (AWT-graphics)
-		float y = pageSize.height - textPos.getYPosition();
+		float y = pageSize.height - testMatrix.getYPosition();
 		// Set translation to 0,0. We only need the scaling and shearing
-		textPos.setValue(2, 0, 0);
-		textPos.setValue(2, 1, 0);
+		testMatrix.setValue(2, 0, 0);
+		testMatrix.setValue(2, 1, 0);
 		// because of the moved 0,0-reference, we have to shear in the
 		// opposite direction
-		textPos.setValue(0, 1, (-1) * textPos.getValue(0, 1));
-		textPos.setValue(1, 0, (-1) * textPos.getValue(1, 0));
+		testMatrix.setValue(0, 1, (-1) * testMatrix.getValue(0, 1));
+		testMatrix.setValue(1, 0, (-1) * testMatrix.getValue(1, 0));
 		currentXY = new Real2(x, y);
 		svgText.setXY(currentXY);
 	}

src/main/java/org/xmlcml/pdf2svg/util/XMLLogger.java

 		page.appendChild(character);
 	}
 
+	/**
+	 * Records an exception on the current page as an {@code <exception>} element
+	 * holding one {@code <stackTrace>} child per stack frame.
+	 * The exception's class name is stored in a {@code class} attribute so the
+	 * log shows what was thrown, not only where it was thrown.
+	 *
+	 * @param e the exception to log
+	 * @throws RuntimeException if there is no current file or page
+	 */
+	public void newException(Exception e) {
+		if (file == null || page == null) {
+			throw new RuntimeException("no current PDF file or page!");
+		}
+
+		Element exceptionElement = new Element("exception");
+		// class names are always legal XML text; the message is not added because
+		// XOM rejects illegal XML characters
+		exceptionElement.addAttribute(new nu.xom.Attribute("class", e.getClass().getName()));
+		for (StackTraceElement ste : e.getStackTrace()) {
+			Element frameElement = new Element("stackTrace");
+			frameElement.appendChild(ste.toString());
+			exceptionElement.appendChild(frameElement);
+		}
+		page.appendChild(exceptionElement);
+	}
+
 	public void writeXMLFile(OutputStream outputStream) {
 		Document doc = new Document(root);
 		try {

src/test/java/org/xmlcml/pdf2svg/SamplesForTest.java

 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/misc/", "../pdfs/misc/");
 //      new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/npg/", "../pdfs/npg/");
 		// OK
+		
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BMCBioinformatics", "../pdfs/pdfsByJournal/BMCBioinformatics");
+		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/test", "../pdfs/test");
+		
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/ActaPalaeontologicaPolonica", "../pdfs/pdfsByJournal/ActaPalaeontologicaPolonica");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/ActaZoologica", "../pdfs/pdfsByJournal/ActaZoologica");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/AmericanJournalBotany", "../pdfs/pdfsByJournal/AmericanJournalBotany");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BiologicalReviews", "../pdfs/pdfsByJournal/BiologicalReviews");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BiologyLetters", "../pdfs/pdfsByJournal/BiologyLetters");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BiologicalReviews", "../pdfs/pdfsByJournal/BiologicalReviews");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BiologyPhilosophy", "../pdfs/pdfsByJournal/BiologyPhilosophy");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BulletinAmericanMuseumNaturalHistory", "../pdfs/pdfsByJournal/BulletinAmericanMuseumNaturalHistory");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/CanadianJournalEarthSciences", "../pdfs/pdfsByJournal/CanadianJournalEarthSciences");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Cladistics", "../pdfs/pdfsByJournal/Cladistics");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Comptes Rendus Palevol", "../pdfs/pdfsByJournal/Comptes Rendus Palevol");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Copeia", "../pdfs/pdfsByJournal/Copeia");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Cretaceous Research", "../pdfs/pdfsByJournal/CretaceousResearch");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/EarthEnvironmentalScienceTransactionsRoyalSocietyEdinburgh", "../pdfs/pdfsByJournal/EarthEnvironmentalScienceTransactionsRoyalSocietyEdinburgh");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Evolution", "../pdfs/pdfsByJournal/Evolution");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/EvolutionaryBiology", "../pdfs/pdfsByJournal/EvolutionaryBiology");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Exs", "../pdfs/pdfsByJournal/Exs");
-		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/FungalBiology", "../pdfs/pdfsByJournal/FungalBiology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BiologyPhilosophy", "../pdfs/pdfsByJournal/BiologyPhilosophy");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/BulletinAmericanMuseumNaturalHistory", "../pdfs/pdfsByJournal/BulletinAmericanMuseumNaturalHistory");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/CanadianJournalEarthSciences", "../pdfs/pdfsByJournal/CanadianJournalEarthSciences");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Cladistics", "../pdfs/pdfsByJournal/Cladistics");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Comptes Rendus Palevol", "../pdfs/pdfsByJournal/Comptes Rendus Palevol");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Copeia", "../pdfs/pdfsByJournal/Copeia");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Cretaceous Research", "../pdfs/pdfsByJournal/CretaceousResearch");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/EarthEnvironmentalScienceTransactionsRoyalSocietyEdinburgh", "../pdfs/pdfsByJournal/EarthEnvironmentalScienceTransactionsRoyalSocietyEdinburgh");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Evolution", "../pdfs/pdfsByJournal/Evolution");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/EvolutionaryBiology", "../pdfs/pdfsByJournal/EvolutionaryBiology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Exs", "../pdfs/pdfsByJournal/Exs");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/FungalBiology", "../pdfs/pdfsByJournal/FungalBiology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Geobios", "../pdfs/pdfsByJournal/Geobios");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/Geodiversitas", "../pdfs/pdfsByJournal/Geodiversitas");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/HerpetologicalMonographs", "../pdfs/pdfsByJournal/HerpetologicalMonographs");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/IchthyologicalResearch", "../pdfs/pdfsByJournal/IchthyologicalResearch");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/InvertebrateBiology", "../pdfs/pdfsByJournal/InvertebrateBiology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalBiogeography", "../pdfs/pdfsByJournal/JournalBiogeography");
+		
+		
+		
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalEvolutionaryBiology", "../pdfs/pdfsByJournal/JournalEvolutionaryBiology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalHumanEvolution", "../pdfs/pdfsByJournal/JournalHumanEvolution");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalMammalianEvolution", "../pdfs/pdfsByJournal/JournalMammalianEvolution");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalMolluscanStudies", "../pdfs/pdfsByJournal/JournalMolluscanStudies");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalPaleontology", "../pdfs/pdfsByJournal/JournalPaleontology");
+//		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/pdfsByJournal/JournalSystematicPalaeontology", "../pdfs/pdfsByJournal/JournalSystematicPalaeontology");
 			
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/plosone/", "../pdfs/plosone/");
 //		new PDF2SVGConverter().run("-logger", "-infofiles", "-logglyphs", "-outdir", "target/taylorfrancis/", "../pdfs/taylorfrancis/");

src/test/java/org/xmlcml/pdf2svg/SemiTest.java

 
 /** Not really tests.
  * Run over a large number of different PDFs to gather information and
- * a vague hope of cataching regressions
+ * in the vague hope of catching regressions.
  * Run manually
  * 
  * @author pm286
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.