Commits

Anonymous committed 61d3378 Draft

line/text analyzer

Comments (0)

Files changed (50)

src/main/java/org/xmlcml/graphics/control/CommandElement.java

 import org.xmlcml.graphics.control.document.FontManagerElement;
 import org.xmlcml.graphics.control.document.PageIteratorElement;
 import org.xmlcml.graphics.control.document.PageSelectorElement;
-import org.xmlcml.graphics.control.page.PlotAnalyzerElement;
+import org.xmlcml.graphics.control.page.ChunkAnalyzerElement;
 import org.xmlcml.graphics.control.page.BoxDrawerElement;
 import org.xmlcml.graphics.control.page.BoxProcessorElement;
 import org.xmlcml.graphics.control.page.ElementDeleterElement;
 		} else if (tag.equals(DocumentWriterElement.TAG)) {
 			newElement = new DocumentWriterElement();
 			
-		} else if (tag.equals(PlotAnalyzerElement.TAG)) {
-			newElement = new PlotAnalyzerElement();
+		} else if (tag.equals(ChunkAnalyzerElement.TAG)) {
+			newElement = new ChunkAnalyzerElement();
 		} else if (tag.equals(BoxDrawerElement.TAG)) {
 			newElement = new BoxDrawerElement();
 		} else if (tag.equals(BoxProcessorElement.TAG)) {

src/main/java/org/xmlcml/graphics/control/document/DocumentPageRunnerAction.java

 import org.xmlcml.cml.base.CMLConstants;
 import org.xmlcml.graphics.control.AbstractActionElement;
 import org.xmlcml.graphics.control.page.PageAction;
+import org.xmlcml.graphics.control.page.PageAnalyzer;
 import org.xmlcml.graphics.control.page.PageAnalyzerAction;
 import org.xmlcml.graphics.control.page.PageAnalyzerElement;
 import org.xmlcml.graphics.control.page.PageSelector;
 import org.xmlcml.graphics.pdf2svg.PConstants;
-import org.xmlcml.graphics.pdf2svg.PageAnalyzer;
 import org.xmlcml.graphics.svg.SVGSVG;
 
 public class DocumentPageRunnerAction extends DocumentAction {

src/main/java/org/xmlcml/graphics/control/page/Axis.java

 		this.boxThickness = boxThickness;
 		this.boxLengthExtension = boxLengthExtension;
 		texts = SVGUtil.getQuerySVGElements(container, ".//svg:text");
-		LOG.debug("TEXTS "+texts.size());
+		LOG.debug("AXIS TEXTS "+texts.size());
 		Real2Range textBox = getTextBox(complexLine.getBackbone());
 		BoxEdge edge = (LineOrientation.HORIZONTAL.equals(getOrientation())) ? BoxEdge.XMIN : BoxEdge.YMIN;
 		List<SVGElement> sortedTexts = BoundingBoxManager.getElementsSortedByEdge(texts, edge);
 		LOG.debug("BB "+boundedTexts.size());
 		List<SVGText> horizontalTexts = getTexts(boundedTexts, LineOrientation.HORIZONTAL);
 		for (SVGText horizontalText : horizontalTexts) {
-			LOG.trace("HOR "+horizontalText.getValue());
+			LOG.debug("HOR "+horizontalText.getValue());
 		}
 		CMLUtil.outputQuietly(container, new File("target/axis1HorizontalText.svg"), 1);
 		processScaleValuesAndScaleTitle(horizontalTexts);
 
 	private void TransformArrayFromPixelsToScale() {
 		getOrientation();
-		SVGElement parent = (SVGElement) complexLine.getBackbone().getParent().getParent();
-		List<SVGElement> polylineElements = SVGUtil.getQuerySVGElements(parent, "./svg:g/svg:polyline");
-		List<SVGPolyline> polylines = SVGPolyline.extractPolylines(polylineElements);
-		ensureTickmarks();
-		LOG.debug(("POLYLINES "+polylines.size()));
-		for (SVGPolyline polyline : polylines) {
-			LOG.debug("Polyline "+polyline.getBoundingBox());
-			Real2Array polylineCoords = polyline.getReal2Array();
-			RealArray polylineAxisPixelCoords = (LineOrientation.HORIZONTAL.equals(orientation)) ?
-					polylineCoords.getXArray() : polylineCoords.getYArray();
-			RealArray polylineValueCoords = polylineAxisPixelCoords.createScaledArrayToRange(
-				lowestMajorTickCoordInPixels, highestMajorTickCoordInPixels, lowestTickMarkValue, highestTickMarkValue);
-			Double range = polylineValueCoords.getRange().getRange();
-			int places = (int) Math.max(0, 6 - (Math.log10(range)-0.5));
-			polylineValueCoords.format(places);
-			LOG.debug("SCALED "+polylineValueCoords);
-
+		SVGElement parentSVG = (SVGElement)complexLine.getBackbone().getParent();
+		if (parentSVG == null) {
+			LOG.debug("NULL SVG PARENT");
+		} else {
+			LOG.debug("++++ OK SVG PARENT");
+			SVGElement parent = (SVGElement) parentSVG.getParent();
+			List<SVGElement> polylineElements = SVGUtil.getQuerySVGElements(parent, "./svg:g/svg:polyline");
+			List<SVGPolyline> polylines = SVGPolyline.extractPolylines(polylineElements);
+			ensureTickmarks();
+			LOG.debug(("POLYLINES "+polylines.size()));
+			for (SVGPolyline polyline : polylines) {
+				LOG.debug("Polyline "+polyline.getBoundingBox());
+				Real2Array polylineCoords = polyline.getReal2Array();
+				RealArray polylineAxisPixelCoords = (LineOrientation.HORIZONTAL.equals(orientation)) ?
+						polylineCoords.getXArray() : polylineCoords.getYArray();
+				RealArray polylineValueCoords = polylineAxisPixelCoords.createScaledArrayToRange(
+					lowestMajorTickCoordInPixels, highestMajorTickCoordInPixels, lowestTickMarkValue, highestTickMarkValue);
+				Double range = polylineValueCoords.getRange().getRange();
+				int places = (int) Math.max(0, 6 - (Math.log10(range)-0.5));
+				polylineValueCoords.format(places);
+				LOG.debug("SCALED "+polylineValueCoords);
+	
+			}
+			LOG.trace("POLY "+polylines.size()+ " ... "+complexLine.getBackbone().getBoundingBox());
 		}
-		LOG.trace("POLY "+polylines.size()+ " ... "+complexLine.getBackbone().getBoundingBox());
 	}
 
 	private void ensureTickmarks() {

src/main/java/org/xmlcml/graphics/control/page/AxisAnalyzer.java

 import org.xmlcml.graphics.paths.ComplexLine;
 import org.xmlcml.graphics.paths.ComplexLine.CombType;
 import org.xmlcml.graphics.paths.ComplexLine.LineOrientation;
-import org.xmlcml.graphics.pdf2svg.PageAnalyzer;
 import org.xmlcml.graphics.svg.SVGElement;
 import org.xmlcml.graphics.svg.SVGLine;
 

src/main/java/org/xmlcml/graphics/control/page/BoxProcessorAction.java

 import org.apache.log4j.Logger;
 import org.xmlcml.graphics.control.AbstractActionElement;
 import org.xmlcml.graphics.paths.PathAnalyzer;
-import org.xmlcml.graphics.pdf2svg.Chunk;
-import org.xmlcml.graphics.pdf2svg.ChunkStyle;
 import org.xmlcml.graphics.svg.SVGElement;
 import org.xmlcml.graphics.svg.SVGUtil;
 

src/main/java/org/xmlcml/graphics/control/page/Chunk.java

+package org.xmlcml.graphics.control.page;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import nu.xom.Attribute;
+import nu.xom.Element;
+import nu.xom.Elements;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.xmlcml.cml.base.CMLUtil;
+import org.xmlcml.euclid.Real;
+import org.xmlcml.euclid.Real2;
+import org.xmlcml.euclid.Real2Range;
+import org.xmlcml.euclid.RealRange;
+import org.xmlcml.graphics.pdf2svg.BoundingBoxManager;
+import org.xmlcml.graphics.pdf2svg.StyleManager;
+import org.xmlcml.graphics.pdf2svg.BoundingBoxManager.BoxEdge;
+import org.xmlcml.graphics.svg.SVGElement;
+import org.xmlcml.graphics.svg.SVGG;
+import org.xmlcml.graphics.svg.SVGLine;
+import org.xmlcml.graphics.svg.SVGPath;
+import org.xmlcml.graphics.svg.SVGPolyline;
+import org.xmlcml.graphics.svg.SVGSVG;
+import org.xmlcml.graphics.svg.SVGText;
+import org.xmlcml.graphics.svg.SVGUtil;
+
+/** a chunk of the page, managed as a <g> element
+ * 
+ * @author pm286
+ *
+ */
+public class Chunk extends SVGG {
+	private static final String CLIP_PATH = "clipPath";
+	private static Logger LOG = Logger.getLogger(Chunk.class);
+	static {
+		LOG.setLevel(Level.DEBUG);
+	}
+	
+	/** roles that can be attached to a chunk (via its ChunkStyle).
+	 * 
+	 * NOTE(review): "suscript" is presumably shorthand for sub/superscript; it is
+	 * the role tested by isScript() - confirm before renaming.
+	 */
+	public enum ChunkRole {
+		abstractSubsection,
+		additionalFile,
+		additionalMaterial,
+		author,
+		authorContribution,
+		authorSuperscript,
+		authors,
+		blueBanner,
+		bodyText,
+		copyright,
+		correspondence,
+		doiBox,
+		figureCaption,
+		graphicsStrokes,
+		heading,
+		icon,
+		image,
+		mainTitle,
+		pageBiblio,
+		referenceEntry,
+		subheading,
+		suscript,
+		tableBody,
+		tableCaption,
+		tableHeading,
+	}
+	
+	private static final String SWEEP_BOX = "sweepBox";
+	private static final String ROLE = "role";
+	public static final String MIN = "MIN";
+	public static final String EDGE = "edge";
+	private static final String NONE = "none";
+	private static final String NULL_CLIP = "nullClip";
+	public static final String CHUNK_STYLE = "chunkStyle";
+	private static final double TOLER = 0.01;
+	private static final double BOX_EQUALITY = 0.5;
+	private static final double TOLER1 = 1.0;
+	
+	private List<SVGElement> elementList = new ArrayList<SVGElement>();
+	private List<Real2Range> emptyBoxList;
+	protected BoundingBoxManager boundingBoxManager;
+	private BoxEdge edge;
+	protected ChunkStyle chunkStyle;
+	private Set<Class<?>> svgClassSet;
+	private PageChunkSplitter chunkAnalyzer;
+	protected StyleManager styleManager;
+
+	/** creates an empty chunk.
+	 * 
+	 * No PageChunkSplitter is attached and no BoundingBoxManager is created here.
+	 */
+	public Chunk() {
+		super();
+	}
+	
+	/** creates an empty chunk attached to a splitter.
+	 * 
+	 * @param chunkAnalyzer splitter stored for later use (may be null, see Chunk(SVGElement))
+	 */
+	public Chunk(PageChunkSplitter chunkAnalyzer) {
+		this.chunkAnalyzer = chunkAnalyzer;
+		// a manager is always available for chunks created through this constructor
+		ensureBoundingBoxManager();
+	}
+
+	/** lazily creates the BoundingBoxManager; an existing manager is left untouched.
+	 */
+	private void ensureBoundingBoxManager() {
+		if (boundingBoxManager != null) {
+			return;
+		}
+		boundingBoxManager = new BoundingBoxManager();
+	}
+	
+	/** creates a chunk from the descendants of element, without a splitter.
+	 * 
+	 * @param element element whose descendants populate the element list
+	 */
+	public Chunk(SVGElement element) {
+		this(null, element);
+	}
+		
+	/** creates a chunk attached to a splitter and populated from element.
+	 * 
+	 * @param chunkAnalyzer splitter stored for later use
+	 * @param element element whose descendants populate the element list and bounding boxes
+	 */
+	public Chunk(PageChunkSplitter chunkAnalyzer, SVGElement element) {
+		this(chunkAnalyzer);
+		createElementListAndCalculateBoundingBoxes(element);
+	}
+
+	/** rebuilds the element list and bounding boxes from this chunk's own descendants.
+	 * 
+	 * The chunk's own bounding box field is cleared afterwards.
+	 */
+	public void createElementListAndCalculateBoundingBoxes() {
+		createElementListAndCalculateBoundingBoxes(this);
+		this.boundingBox = null;
+	}
+
+	/** replaces the element list with the descendants of element and registers their bounding boxes.
+	 * 
+	 * The query excludes svg:svg, svg:g and anything that is, or lies inside, svg:defs.
+	 * 
+	 * @param element root of the subtree to query
+	 */
+	public void createElementListAndCalculateBoundingBoxes(SVGElement element) {
+		final String nonGroupingDescendants =
+				".//svg:*[not(self::svg:svg or self::svg:g or self::*[ancestor-or-self::svg:defs])]";
+		elementList = SVGUtil.getQuerySVGElements(element, nonGroupingDescendants);
+		calculateBoundingBoxes();
+	}
+	
+	/** adds the bounding box of every listed element to the BoundingBoxManager.
+	 * 
+	 * @return the manager's bounding box list after the additions
+	 */
+	private List<Real2Range> calculateBoundingBoxes() {
+		ensureBoundingBoxManager();
+		int serial = 0;
+		for (SVGElement current : elementList) {
+			LOG.trace("pre "+current.getClass());
+			Real2Range currentBox = current.getBoundingBox();
+			LOG.trace("BB "+serial);
+			serial++;
+			boundingBoxManager.add(currentBox);
+		}
+		return boundingBoxManager.getBBoxList();
+	}
+
+	/** asks the BoundingBoxManager for the list of empty boxes along edge and caches it.
+	 * 
+	 * @param edge edge along which empty boxes are computed; also remembered in this chunk
+	 * @return the freshly created list of empty boxes
+	 */
+	public List<Real2Range> createEmptyBoxList(BoxEdge edge) {
+		this.edge = edge;
+		// chunks built with the no-arg constructor have no manager yet
+		ensureBoundingBoxManager();
+		emptyBoxList = boundingBoxManager.createEmptyBoxList(edge);
+		return emptyBoxList;
+	}
+	
+	/** splits the elements of this chunk into sub-chunks separated by empty boxes.
+	 * 
+	 * Elements are sorted along edge and walked in parallel with the empty boxes.
+	 * An element whose minimum coordinate lies before the maximum of the current 
+	 * empty box is added to the current sub-chunk; otherwise the sub-chunk is closed 
+	 * and the walk advances to the next empty box that is at least chunkWidth wide 
+	 * (or to the last box if none qualifies). Every new sub-chunk is appended as a 
+	 * child of this chunk.
+	 * 
+	 * @param chunkWidth minimum extent of a separating empty box
+	 * @param edge edge used for sorting elements and measuring boxes
+	 * @return the new sub-chunks; empty if there are no empty boxes or no elements
+	 */
+	public List<Chunk> splitIntoChunks(Double chunkWidth, BoxEdge edge) {
+		ensureEmptyBoxList(edge);
+		SVGUtil.setBoundingBoxCached(elementList, true);
+		long time0 = System.currentTimeMillis();
+		elementList = BoundingBoxManager.getElementsSortedByEdge(elementList, edge);
+		LOG.trace("sort edge: "+(System.currentTimeMillis()-time0));
+		List<Chunk> chunkList = new ArrayList<Chunk>();
+		// must be tested before addTerminatingEmptyBox(), which appends to emptyBoxList
+		if (emptyBoxList == null) {
+			return chunkList;
+		}
+		addTerminatingEmptyBox(1.5*chunkWidth, edge);
+		Iterator<Real2Range> boxIterator = emptyBoxList.iterator();
+		Iterator<SVGElement> elementIterator = elementList.iterator();
+		// either list may be empty; an unguarded next() would throw NoSuchElementException
+		Real2Range box = boxIterator.hasNext() ? boxIterator.next() : null;
+		SVGElement element = elementIterator.hasNext() ? elementIterator.next() : null;
+		Chunk chunk = null;
+		int count = 0;
+		while (element != null) {
+			count++;
+			if (box == null || elementLagsBehindBox(edge, box, element)) {
+				time0 = System.currentTimeMillis();
+				if (chunk == null) {
+					chunk = makeChunk(chunkWidth, edge, PageAnalyzer.DECIMAL_PLACES, count);
+					chunkList.add(chunk);
+					this.appendChild(chunk);
+				}
+				chunk.add(element);
+				element = elementIterator.hasNext() ? elementIterator.next() : null;
+				LOG.trace("addChunk/element: "+(System.currentTimeMillis()-time0));
+			} else {
+				// element starts at or beyond the current empty box: close the current 
+				// sub-chunk and advance to the next sufficiently large empty box
+				chunk = null;
+				box = null;
+				while (boxIterator.hasNext()) {
+					box = boxIterator.next();
+					if (boxLargeEnough(chunkWidth, edge, box)) {
+						break;
+					}
+				}
+			}
+		}
+		LOG.trace("iterations: "+count+" loop count time: "+(System.currentTimeMillis()-time0));
+		for (Chunk chunk0 : chunkList) {
+			chunk0.setBoundingBoxAttribute(PageAnalyzer.DECIMAL_PLACES);
+		}
+		LOG.trace("reformat chunkList: "+chunkList.size()+"/"+(System.currentTimeMillis()-time0));
+		return chunkList;
+	}
+	
+	/** creates an empty sub-chunk sharing this chunk's splitter and labels it.
+	 * 
+	 * @param chunkWidth recorded in the "width" attribute
+	 * @param edge recorded in the "edge" attribute
+	 * @param decimalPlaces precision passed to setBoundingBoxAttribute
+	 * @param count serial used in the title ("chunk" + count)
+	 * @return the new chunk (not yet attached to any parent)
+	 */
+	private Chunk makeChunk(Double chunkWidth, BoxEdge edge, Integer decimalPlaces, int count) {
+		Chunk subChunk = new Chunk(chunkAnalyzer);
+		subChunk.setBoundingBoxCached(true);
+		subChunk.setBoundingBoxAttribute(decimalPlaces);
+		Attribute edgeAttribute = new Attribute(EDGE, String.valueOf(edge));
+		subChunk.addAttribute(edgeAttribute);
+		Attribute widthAttribute = new Attribute("width", String.valueOf(chunkWidth));
+		subChunk.addAttribute(widthAttribute);
+		subChunk.setTitle("chunk"+count);
+		return subChunk;
+	}
+
+	/** appends an artificial empty box just beyond the last element to emptyBoxList.
+	 * 
+	 * The box starts at the maximum coordinate of the last element (y for YMIN, 
+	 * x for XMIN), extends chunkWidth along that axis and copies the other range 
+	 * from the last element. Nothing is added when there are no elements or the 
+	 * last element has no bounding box.
+	 * 
+	 * @param chunkWidth extent of the terminating box along the edge axis
+	 * @param edge YMIN or XMIN
+	 * @return the box that was added, or null if none was added
+	 * @throws RuntimeException if edge is neither YMIN nor XMIN
+	 */
+	private Real2Range addTerminatingEmptyBox(double chunkWidth, BoxEdge edge) {
+		Real2Range bbox = null;
+		if (elementList.size() > 0) {
+			Real2Range lastR2R = elementList.get(elementList.size()-1).getBoundingBox();
+			if (lastR2R == null) {
+				// nothing to anchor the terminating box to
+				return null;
+			}
+			if (BoxEdge.YMIN.equals(edge)) {
+				double cc = lastR2R.getYRange().getMax();
+				bbox = new Real2Range(lastR2R.getXRange(), new RealRange(cc, cc+chunkWidth));
+			} else if (BoxEdge.XMIN.equals(edge)) {
+				double cc = lastR2R.getXRange().getMax();
+				bbox = new Real2Range(new RealRange(cc, cc+chunkWidth), lastR2R.getYRange());
+			} else {
+				throw new RuntimeException("unsupported edge: "+edge);
+			}
+			emptyBoxList.add(bbox);
+		}
+		return bbox;
+	}
+
+	/** tests whether element starts before the far end of box along edge.
+	 * 
+	 * @param edge selects the x or y range
+	 * @param box empty box; null means every (non-null) element lags
+	 * @param element element whose minimum coordinate is compared
+	 * @return true if box is null, or the element's minimum is less than the box's maximum; 
+	 * false if no range can be obtained from box for this edge
+	 */
+	private boolean elementLagsBehindBox(BoxEdge edge, Real2Range box, SVGElement element) {
+		if (box == null && element != null) {
+			return true;
+		}
+		element.setBoundingBoxCached(true);
+		double elementMin = getRange(element.getBoundingBox(), edge).getMin();
+		RealRange boxRange = getRange(box, edge);
+		if (boxRange == null) {
+			return false;
+		}
+		double boxMax = boxRange.getMax();
+		return elementMin < boxMax;
+	}
+
+	/** tests whether emptyBox extends at least chunkWidth along edge.
+	 * 
+	 * @return false if no range can be obtained for emptyBox and edge
+	 */
+	private boolean boxLargeEnough(Double chunkWidth, BoxEdge edge, Real2Range emptyBox) {
+		RealRange extent = getRange(emptyBox, edge);
+		if (extent == null) {
+			return false;
+		}
+		return extent.getRange() >= chunkWidth;
+	}
+
+	/** creates emptyBoxList from the BoundingBoxManager if it has not been created yet.
+	 * 
+	 * @param edge edge passed to the manager when the list has to be created
+	 */
+	private void ensureEmptyBoxList(BoxEdge edge) {
+		if (emptyBoxList != null) {
+			return;
+		}
+		ensureBoundingBoxManager();
+		emptyBoxList = boundingBoxManager.createEmptyBoxList(edge);
+	}
+
+	/** returns the element list, querying all svg descendants of this chunk if the list is null.
+	 * 
+	 * @return the (possibly freshly queried) element list
+	 */
+	public List<SVGElement> getElementList() {
+		if (elementList == null) {
+			// unlike createElementListAndCalculateBoundingBoxes() this query does not filter out svg:g etc.
+			elementList = SVGUtil.getQuerySVGElements(this, ".//svg:*");
+		}
+		return elementList;
+	}
+	
+	/** discards the current element list and re-queries it via getElementList().
+	 * 
+	 * @return the element list field after the refresh
+	 */
+	public List<SVGElement> refreshElementList() {
+		this.elementList = null;
+		this.getElementList();
+		return this.elementList;
+	}
+
+	/** selects the range of box that corresponds to edge.
+	 * 
+	 * @param box box to read; may be null
+	 * @param edge YMIN/YMAX select the y range, XMIN/XMAX the x range
+	 * @return the selected range, or null if box is null or edge is none of the four
+	 */
+	private RealRange getRange(Real2Range box, BoxEdge edge) {
+		if (box == null) {
+			return null;
+		}
+		boolean alongY = BoxEdge.YMIN.equals(edge) || BoxEdge.YMAX.equals(edge);
+		if (alongY) {
+			return box.getYRange();
+		}
+		boolean alongX = BoxEdge.XMIN.equals(edge) || BoxEdge.XMAX.equals(edge);
+		return alongX ? box.getXRange() : null;
+	}
+
+	/** adds element to this chunk's element list and bounding box manager.
+	 * 
+	 * If this chunk is attached to a parent, the element is also moved in the 
+	 * XML tree to become a child of this chunk.
+	 * 
+	 * @param element element to add
+	 */
+	private void add(SVGElement element) {
+		elementList.add(element);
+		if (this.getParent() != null) {
+			element.detach();
+			this.appendChild(element);
+		}
+		// chunks built with the no-arg constructor have no manager yet
+		ensureBoundingBoxManager();
+		boundingBoxManager.add(element.getBoundingBox());
+	}
+	
+//	private void add(WordSequence subLine) {
+//		elementList.add(subLine);
+//	}
+	
+	public Real2Range getBoundingBox() {
+		if (boundingBoxNeedsUpdating()) {
+			if (elementList != null && elementList.size() > 0) {
+				boundingBox = new Real2Range();
+				for (SVGElement element : elementList) {
+					if (element != null) {
+						boundingBox = boundingBox.plus(element.getBoundingBox());
+					}
+				}
+			}
+		}
+		return boundingBox;
+	}
+		
+	/** concatenates the values of all listed elements, each followed by a newline.
+	 * 
+	 * @return the concatenated string (empty if there are no elements)
+	 */
+	public String getStringValue() {
+		StringBuilder text = new StringBuilder();
+		for (int index = 0; index < elementList.size(); index++) {
+			String value = elementList.get(index).getValue();
+			text.append(value).append("\n");
+		}
+		return text.toString();
+	}
+
+
+	/** splits the elements into runs of consecutive elements sharing a physical style.
+	 * 
+	 * Elements without the physical style attribute are skipped. A new sub-chunk 
+	 * (appended as child of this chunk and tagged with the style) is started whenever 
+	 * the style differs from that of the previous styled element.
+	 * 
+	 * @param pageAnalyzer supplies the style manager and the splitter for new chunks
+	 * @return the new sub-chunks in document order
+	 */
+	List<Chunk> splitByPhysicalStyle(PageAnalyzer pageAnalyzer) {
+		ensureStyleManager(pageAnalyzer);
+		PageChunkSplitter splitter = pageAnalyzer.getChunkAnalyzer();
+		List<Chunk> styleChunks = new ArrayList<Chunk>();
+		String previousStyle = null;
+		Chunk currentChunk = null;
+		int index = 0;
+		while (index < elementList.size()) {
+			SVGElement candidate = elementList.get(index);
+			index++;
+			String style = candidate.getAttributeValue(StyleManager.PHYSICAL_STYLE);
+			if (style == null) {
+				LOG.trace("null physicalStyle for: "+candidate.toXML());
+				continue;
+			}
+			boolean styleChanged = !style.equals(previousStyle);
+			if (styleChanged) {
+				currentChunk = new Chunk(splitter);
+				addPhysicalStyle(currentChunk, style);
+				styleChunks.add(currentChunk);
+				this.appendChild(currentChunk);
+				previousStyle = style;
+			}
+			currentChunk.add(candidate);
+		}
+		return styleChunks;
+	}
+
+	/** records styleName on chunk as its physical style attribute.
+	 */
+	private void addPhysicalStyle(Chunk chunk, String styleName) {
+		chunk.addAttribute(new Attribute(StyleManager.PHYSICAL_STYLE, styleName));
+	}
+	
+	/** fetches the style manager from pageAnalyzer the first time it is needed.
+	 * 
+	 * @param pageAnalyzer only consulted while no style manager is held
+	 * @return the style manager held by this chunk
+	 */
+	protected StyleManager ensureStyleManager(PageAnalyzer pageAnalyzer) {
+		if (styleManager != null) {
+			return styleManager;
+		}
+		styleManager = pageAnalyzer.getStyleManager();
+		return styleManager;
+	}
+
+	/** stores chunkStyle and writes its roles string into the chunkStyle attribute.
+	 * 
+	 * @param chunkStyle style to attach
+	 * @throws RuntimeException if the style has no roles string
+	 */
+	public void setChunkStyle(ChunkStyle chunkStyle) {
+		this.chunkStyle = chunkStyle;
+		String roleString = chunkStyle.getRolesString();
+		if (roleString == null) {
+			throw new RuntimeException("Cannot find roleString: "+chunkStyle);
+		}
+		// use the value that was just validated instead of fetching it again
+		setChunkStyleValue(roleString);
+	}
+	
+	/** writes styleString into the chunkStyle attribute of this element.
+	 */
+	public void setChunkStyleValue(String styleString) {
+		this.addAttribute(new Attribute(CHUNK_STYLE, styleString));
+	}
+
+	/** reads the chunkStyle attribute of this element.
+	 * 
+	 * @return the value returned by getAttributeValue for that attribute
+	 */
+	public String getChunkStyleValue() {
+		return getAttributeValue(CHUNK_STYLE);
+	}
+
+	/** looks up the ChunkStyle matching this chunk's chunkStyle attribute and stores it.
+	 * 
+	 * The attribute value is treated as a role and resolved through the page's style manager.
+	 * 
+	 * @param pageAnalyzer supplies the style manager
+	 * @return the resolved style, or null if the attribute is absent
+	 */
+	public ChunkStyle createChunkStyle(PageAnalyzer pageAnalyzer) {
+		String roleAttribute = getChunkStyleValue();
+		LOG.trace("SA "+roleAttribute);
+		StyleManager pageStyles = pageAnalyzer.getStyleManager();
+		// this is a mess. chunkStyle is actually roles
+		if (roleAttribute == null) {
+			chunkStyle = null;
+		} else {
+			chunkStyle = pageStyles.getStyleByRole(roleAttribute);
+		}
+		return chunkStyle;
+	}
+
+	/** @return the style currently held by this chunk (may be null)
+	 */
+	public ChunkStyle getChunkStyle() {
+		return chunkStyle;
+	}
+
+	/** @return the stroke of the chunk style, or "none" if no style is set
+	 */
+	public String getStroke() {
+		if (chunkStyle == null) {
+			return NONE;
+		}
+		return chunkStyle.getStroke();
+	}
+
+	/** @return the fill of the chunk style, or "none" if no style is set
+	 */
+	public String getFill() {
+		if (chunkStyle == null) {
+			return NONE;
+		}
+		return chunkStyle.getFill();
+	}
+
+	/** developed for debugging
+	 * 
+	 * Draws a box for this chunk on the splitter's SVG page, filled according to 
+	 * the element classes it contains: yellow for text only, blue for paths only, 
+	 * green for text plus paths, magenta for three or more classes (or none), 
+	 * "none" otherwise. Unexpected class mixes are printed to stdout.
+	 * Does nothing beyond the printing when the chunk has no splitter.
+	 */
+	private void annotate() {
+		ensureClassSet();
+		String fill = "none";
+		if (svgClassSet.size() == 1) {
+			if (svgClassSet.contains(SVGText.class)) {
+				fill = "yellow";
+			} else if (svgClassSet.contains(SVGPath.class)) {
+				fill = "blue";
+			} else {
+				
+			}
+		} else if (svgClassSet.size() == 2) {
+			if (svgClassSet.contains(SVGText.class) &&
+				svgClassSet.contains(SVGPath.class)) {
+					fill = "green";
+			} else {
+				for (Class<?> clazz : svgClassSet) {
+					System.out.println("...................................."+clazz.getName());
+				}
+			}
+		} else {
+			fill = "magenta";
+			for (Class<?> clazz : svgClassSet) {
+				System.out.println("...................................."+clazz.getName());
+			}
+		}
+		if (chunkAnalyzer == null) {
+			// chunks created without a splitter have no page to draw on
+			return;
+		}
+		SVGSVG svgPage = chunkAnalyzer.getSVGPage();
+		SVGUtil.drawBoxes(Arrays.asList(this), svgPage, "black", fill, 1., 0.3);
+	}
+	/** rebuilds svgClassSet as the set of runtime classes of the listed elements.
+	 * 
+	 * Despite the name the set is recreated on every call, not only when missing.
+	 */
+	private void ensureClassSet() {
+		svgClassSet = new HashSet<Class<?>>();
+		for (SVGElement element : elementList) {
+			svgClassSet.add(element.getClass());
+		}
+	}
+
+	public Boolean isTextChunk() {
+		ensureClassSet();
+		return svgClassSet != null && svgClassSet.size() == 1 && svgClassSet.contains(SVGText.class);
+	}
+
+	/** marks child paths that lie on the border of this chunk, as they are
+	 * probably a box.
+	 * 
+	 * Each child svg:path that can be converted to a polyline is tested: a closed 
+	 * box equal to the chunk's bounding box (or lying on one of its sides), an 
+	 * unclosed box lying on a side, or a single line lying on a side. Matching 
+	 * paths are replaced in the tree by their polyline, tagged role="sweepBox".
+	 * Nothing is done when the chunk has no bounding box.
+	 */
+	void removeBorders() {
+		List<SVGElement> paths = SVGUtil.getQuerySVGElements(this, "svg:path");
+//		System.out.println("PATHS "+paths.size());
+		Real2Range boundingBox = this.getBoundingBox();
+		if (boundingBox == null) {
+			// an empty chunk has no border to compare against
+			return;
+		}
+		for (SVGElement elem : paths) {
+			SVGPath path = (SVGPath) elem;
+			SVGPolyline polyline = path.createPolyline();
+			if (polyline != null) {
+				polyline.createLineList();
+				boolean addBox = false;
+				if (polyline.isBox(Chunk.TOLER)) {
+					Real2Range polyBox = polyline.getBoundingBox();
+					// are boxes identical??
+					addBox = boundingBox.isEqualTo(polyBox, BOX_EQUALITY);
+					addBox |= Chunk.isSideOfBox(boundingBox, polyline, Chunk.TOLER1);
+				} else if (Chunk.isUnclosedBox(polyline, Chunk.TOLER)) {
+					addBox = Chunk.isSideOfBox(boundingBox, polyline, Chunk.TOLER1);
+				} else if (polyline.createLineList().size() == 1) {    // a line?
+					addBox = Chunk.isSideOfBox(boundingBox, polyline, Chunk.TOLER1);
+				}
+				if (addBox) {
+					polyline.addAttribute(new Attribute(ROLE, SWEEP_BOX));
+					path.getParent().replaceChild(path, polyline);
+				}
+			}
+		}
+	}
+	
+	/** writes this chunk, wrapped in a new svg:svg element, to outputDir/root+i+".svg".
+	 * 
+	 * The chunk is detached from its current parent as a side effect.
+	 * //TODO change this to SVGUtil
+	 * 
+	 * @param outputDir directory to write into
+	 * @param root filename stem
+	 * @param i serial appended to the stem
+	 * @throws IOException if the file cannot be opened, written or closed
+	 */
+	public void writeTo(File outputDir, String root, int i) throws IOException {
+		this.detach();
+		SVGSVG svg = new SVGSVG();
+		svg.appendChild(this);
+		File file = new File(outputDir, root+i+".svg");
+		FileOutputStream fos = new FileOutputStream(file);
+		try {
+			CMLUtil.debug(svg, fos, 1);
+		} finally {
+			// release the file handle even if serialization fails
+			fos.close();
+		}
+		LOG.debug("writing "+file.getAbsolutePath());
+	}
+	/**
+	g chunkStyle="FIGURE" edge="YMIN" width="5.0" box="((60.65999984741211,534.614013671875),(529.3779907226562,536.8779907226562))">
+	<rect x="56.69200134277344" y="88.98600006103516" width="481.89100646972656" height="440.3919906616211" style=" fill : none; stroke : none; stroke-width : 1.0; opacity : 0.3;"/>
+	<g clipPath="clipPath4" chunkStyle="graphicsStrokes">
+	<path style=" fill : none; stroke : black; stroke-width : 0.5;" clip-path="url(#clipPath4)" fill-rule="evenodd" stroke="none" d="M56.693 92.956 L56.75 92.276 C56.987 90.644 58.349 89.282 59.98 89.043 L60.661 88.986 L60.661 89.213 L60.037 89.271 C58.521 89.503 57.249 90.806 56.976 92.276 L56.919 92.956 "/>
+	...
+	<rect x="56.69300079345703" y="88.98600006103516" width="481.89000701904297" height="436.4220199584961" style=" fill : blue; stroke : black; stroke-width : 1.0; opacity : 0.3;"/>
+	</g>
+	<g clipPath="clipPath6" chunkStyle="author">
+	<text style=" stroke : none;" clip-path="url(#clipPath6)" text-rendering="optimizeLegibility" shape-rendering="auto" stroke="none" xml:space="preserve" x="260.80384821" y="110.13322536000001" font-size="5.8749">A</text>
+	<text style=" stroke : none;" clip-path="url(#clipPath6)" text-rendering="optimizeLegibility" shape-rendering="auto" stroke="none" xml:space="preserve" x="264.72240651" y="110.13322536000001" font-size="5.8749">s</text>
+	<rect x="260.80384821" y="104.25832536000001" width="59.25424140000001" height="5.874899999999997" style=" fill : green; stroke : green; stroke-width : 1.0; opacity : 0.5;"/>
+	<rect x="260.80384821" y="104.25832536000001" width="59.25424140000001" height="5.874899999999997" style=" fill : yellow; stroke : black; stroke-width : 1.0; opacity : 0.3;"/>
+	<g name="para">
+	<text style=" stroke : none;" x="260.80384821" y="110.13322536000001" font-size="8.0">Ascophyllum nodosum</text>
+	</g>
+	</g>
+	 */
+	void removeOriginalText() {
+		// select svg:text children of those child groups that also hold a <g name="para">
+		// and detach them from the tree
+		List<SVGElement> originalTexts = 
+				SVGUtil.getQuerySVGElements(this, "svg:g[svg:g[@name='para']]/svg:text");
+		for (int index = 0; index < originalTexts.size(); index++) {
+			originalTexts.get(index).detach();
+		}
+	}
+
+	/** calculates whether a polyline of exactly 3 lines forms a box, aligned with 
+	 * the axes, that has one side missing.
+	 * 
+	 * Only the first and last lines are inspected. Either both are vertical, the 
+	 * end of the first shares its y with the start of the last and the end of the 
+	 * last shares its y with the start of the first; or the same holds with x and 
+	 * y exchanged.
+	 * 
+	 * @param polyline polyline to test
+	 * @param epsilon tolerance in coords
+	 * @return is unclosed rectangle
+	 */
+	public static Boolean isUnclosedBox(SVGPolyline polyline, double epsilon) {
+		List<SVGLine> segments = polyline.createLineList();
+		if (segments.size() != 3) {
+			return false;
+		}
+		SVGLine firstSegment = segments.get(0);
+		SVGLine lastSegment = segments.get(2);
+		Real2 firstStart = firstSegment.getXY(0);
+		Real2 firstEnd = firstSegment.getXY(1);
+		Real2 lastStart = lastSegment.getXY(0);
+		Real2 lastEnd = lastSegment.getXY(1);
+		// vertical
+		boolean verticalSides = 
+			Real.isEqual(firstStart.getX(), firstEnd.getX(), epsilon) &&
+			Real.isEqual(lastStart.getX(), lastEnd.getX(), epsilon) &&
+			Real.isEqual(firstEnd.getY(), lastStart.getY(), epsilon) &&
+			Real.isEqual(lastEnd.getY(), firstStart.getY(), epsilon);
+		if (verticalSides) {
+			return true;
+		}
+		// horizontal
+		boolean horizontalSides = 
+			Real.isEqual(firstStart.getY(), firstEnd.getY(), epsilon) &&
+			Real.isEqual(lastStart.getY(), lastEnd.getY(), epsilon) &&
+			Real.isEqual(firstEnd.getX(), lastStart.getX(), epsilon) &&
+			Real.isEqual(lastEnd.getX(), firstStart.getX(), epsilon);
+		return horizontalSides;
+	}
+
+
+	/** is a very thin element part of the sides of a box?
+	 * 
+	 * The element counts as a side if it is thinner than eps in x and sits on the 
+	 * left or right edge of chunkBox, or thinner than eps in y and sits on the 
+	 * top or bottom edge.
+	 * 
+	 * @param chunkBox box whose sides are tested
+	 * @param element element whose bounding box is compared
+	 * @param eps tolerance for both thinness and position
+	 * @return true if the element lies on a side of chunkBox
+	 */
+	private static boolean isSideOfBox(Real2Range chunkBox, SVGElement element, double eps) {
+		Real2Range elementBox = element.getBoundingBox();
+		RealRange elementX = elementBox.getXRange();
+		RealRange elementY = elementBox.getYRange();
+		RealRange chunkX = chunkBox.getXRange();
+		RealRange chunkY = chunkBox.getYRange();
+		boolean onVerticalSide = isSide(elementX, chunkX, eps);
+		boolean onHorizontalSide = isSide(elementY, chunkY, eps);
+		return onVerticalSide || onHorizontalSide;
+	}
+	
+	/** tests, in one dimension, whether a thin range coincides with an end of chunkRange.
+	 * 
+	 * @param elemRange range of the element; must be narrower than eps to qualify
+	 * @param chunkRange range of the enclosing chunk
+	 * @param eps tolerance
+	 * @return true if the midpoint of elemRange equals the min or max of chunkRange within eps
+	 */
+	private static boolean isSide(RealRange elemRange, RealRange chunkRange, double eps) {
+		if (!(elemRange.getRange() < eps)) {
+			return false;
+		}
+		double midPoint = elemRange.getMidPoint();
+		if (Real.isEqual(chunkRange.getMin(), midPoint, eps)) {
+			return true;
+		}
+		return Real.isEqual(chunkRange.getMax(), midPoint, eps);
+	}
+	
+	/** copies the child elements and attributes of g into this chunk.
+	 * 
+	 * The children are taken from a copy of g, so g itself is not modified. 
+	 * Afterwards the chunkStyle attribute is set to getChunkStyleName().
+	 * 
+	 * @param g element to copy from
+	 */
+	public void copyAttributesAndChildrenFromSVGElement(SVGElement g) {
+		SVGElement gcopy = (SVGElement) g.copy();
+		// NOTE(review): getChildElements() returns elements only, so text nodes
+		// that are direct children of g are presumably not transferred - confirm
+		Elements gcopyChildren = gcopy.getChildElements();
+		for (int i = 0; i < gcopyChildren.size(); i++) {
+			Element child = gcopyChildren.get(i);
+			child.detach();
+			this.appendChild(child);
+		}
+		this.copyAttributes(g);
+		// set after copyAttributes() so that it is applied to the copied attribute set
+		this.setChunkStyleValue(this.getChunkStyleName());
+	}
+	
+	/** name written to the chunkStyle attribute by copyAttributesAndChildrenFromSVGElement();
+	 * override in subclasses
+	 * 
+	 * @return "CHUNK" for this class
+	 */
+	protected String getChunkStyleName() {
+		return "CHUNK";
+	}
+	
+	/** switches bounding box caching on or off for every element in the element list.
+	 * 
+	 * NOTE(review): despite the name, the flag is not set on this chunk itself 
+	 * here - confirm whether that is intended.
+	 * 
+	 * @param cached value passed to setBoundingBoxCached of each element
+	 */
+	void setBoundingBoxCacheForSelfAndDescendants(boolean cached) {
+		Long time0 = System.currentTimeMillis();
+		for (SVGElement element : elementList) {
+			element.setBoundingBoxCached(cached);
+		}
+		LOG.trace("set cache "+(System.currentTimeMillis()-time0));
+	}
+	
+	/** tests whether this chunk's style carries the suscript role.
+	 * 
+	 * @return false if no style is set
+	 */
+	public boolean isScript() {
+		getChunkStyle();
+		if (chunkStyle == null) {
+			return false;
+		}
+		return chunkStyle.containsRole(ChunkRole.suscript);
+	}
+	/** creates a new Chunk holding a copy of element's children and attributes 
+	 * and puts it in element's place in the tree.
+	 * 
+	 * @param element element to replace; its parent is dereferenced without a null check
+	 * @return the new chunk
+	 */
+	public static Chunk createAndReplace(SVGElement element) {
+		Chunk chunk = new Chunk();
+		chunk.copyAttributesAndChildrenFromSVGElement(element);
+		element.getParent().replaceChild(element, chunk);
+		return chunk;
+	}
+	
+	/** fills an existing chunk with a copy of g's children and attributes 
+	 * and puts it in g's place in the tree.
+	 * 
+	 * @param g group to replace; its parent is dereferenced without a null check
+	 * @param chunk chunk (possibly a subclass instance) that receives the content
+	 * @return chunk
+	 */
+	public static Chunk createFromAndReplace(SVGG g, Chunk chunk) {
+		chunk.copyAttributesAndChildrenFromSVGElement(g);
+		g.getParent().replaceChild(g, chunk);
+		return chunk;
+	}
+
+}

src/main/java/org/xmlcml/graphics/control/page/ChunkAnalyzer.java

+package org.xmlcml.graphics.control.page;
+
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.xmlcml.euclid.Real2Array;
+import org.xmlcml.euclid.RealArray;
+import org.xmlcml.graphics.paths.ComplexLine;
+import org.xmlcml.graphics.paths.LineAnalyzer;
+import org.xmlcml.graphics.paths.PolylineAnalyzer;
+import org.xmlcml.graphics.pdf2svg.AbstractSVGAnalyzer;
+import org.xmlcml.graphics.svg.SVGElement;
+import org.xmlcml.graphics.svg.SVGG;
+import org.xmlcml.graphics.svg.SVGLine;
+import org.xmlcml.graphics.svg.SVGPolyline;
+import org.xmlcml.graphics.svg.SVGText;
+import org.xmlcml.graphics.svg.SVGUtil;
+import org.xmlcml.graphics.text.TextAnalyzer;
+import org.xmlcml.graphics.util.GraphUtil;
+
+/** analyzes one svg:g chunk by handing its texts, lines and polylines 
+ * to the corresponding specialised analyzers.
+ */
+public class ChunkAnalyzer extends AbstractSVGAnalyzer {
+
+	private static final Logger LOG = Logger.getLogger(ChunkAnalyzer.class);
+
+	private static final int MIN_LINE_COUNT = 10;
+	public static final int PLACES = 6;
+
+	private List<SVGText> texts;
+	private TextAnalyzer textAnalyzer;
+	private List<SVGLine> lines;
+	private LineAnalyzer lineAnalyzer;
+	private List<SVGPolyline> polylines;
+	private PolylineAnalyzer polylineAnalyzer;
+	private SVGG svgg;
+
+	/** @param pageAnalyzer analyzer of the page; also supplies the SVG page
+	 */
+	public ChunkAnalyzer(PageAnalyzer pageAnalyzer) {
+		this.pageAnalyzer = pageAnalyzer;
+		this.svgPage = pageAnalyzer.getSVGPage();
+	}
+
+	/** runs the text, line and polyline analyses on g, in that order.
+	 * 
+	 * @param g chunk to analyze
+	 */
+	public void analyzeChunk(SVGG g) {
+		this.svgg = g;
+		debugLeaf();
+		analyzeTexts();
+		analyzeLines();
+		analyzePolylines();
+	}
+
+	/** collects all descendant svg:text and, if any, passes them to a new TextAnalyzer. */
+	private void analyzeTexts() {
+		List<SVGElement> textElements = SVGUtil.getQuerySVGElements(svgg, ".//svg:text");
+		texts = SVGText.extractTexts(textElements);
+		if (texts.size() == 0) {
+			return;
+		}
+		textAnalyzer = new TextAnalyzer(pageAnalyzer);
+		textAnalyzer.analyzeTexts(svgg, texts);
+	}
+	
+	/** collects all descendant svg:line and, if any, passes them to a new LineAnalyzer. */
+	private void analyzeLines() {
+		List<SVGElement> lineElements = SVGUtil.getQuerySVGElements(svgg, ".//svg:line");
+		lines = SVGLine.extractLines(lineElements);
+		if (lines.size() == 0) {
+			return;
+		}
+		lineAnalyzer = new LineAnalyzer(pageAnalyzer);
+		lineAnalyzer.analyzeLines(svgg, lines);
+	}
+
+	/** collects all descendant svg:polyline and, if any, passes them to a new PolylineAnalyzer. */
+	private void analyzePolylines() {
+		List<SVGElement> polylineElements = SVGUtil.getQuerySVGElements(svgg, ".//svg:polyline");
+		polylines = SVGPolyline.extractPolylines(polylineElements);
+		if (polylines.size() == 0) {
+			return;
+		}
+		polylineAnalyzer = new PolylineAnalyzer(pageAnalyzer);
+		polylineAnalyzer.analyzePolylines(svgg, polylines);
+	}
+
+	/** logs the number of direct svg:g children and calls debugG() once per child. */
+	private void debugLeaf() {
+		List<SVGElement> childGroups = SVGUtil.getQuerySVGElements(svgg, "./svg:g");
+		LOG.debug("G children: "+childGroups.size());
+		for (int index = 0; index < childGroups.size(); index++) {
+			debugG();
+		}
+	}
+
+	/** debugging hook; currently does nothing. */
+	private void debugG() {
+	}
+
+
+
+}

src/main/java/org/xmlcml/graphics/control/page/ChunkAnalyzerAction.java

+package org.xmlcml.graphics.control.page;
+
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.xmlcml.graphics.control.AbstractActionElement;
+import org.xmlcml.graphics.svg.SVGElement;
+import org.xmlcml.graphics.svg.SVGG;
+import org.xmlcml.graphics.svg.SVGUtil;
+
+public class ChunkAnalyzerAction extends PageAction {
+
+	private final static Logger LOG = Logger.getLogger(ChunkAnalyzerAction.class);
+	
+	
+	public ChunkAnalyzerAction(AbstractActionElement pageActionCommand) {
+		super(pageActionCommand);
+	}
+	
+	@Override
+	public void run() {
+		String xpath = getXPath();
+		if (xpath != null) {
+			List<SVGElement> elements = SVGUtil.getQuerySVGElements(getSVGPage(), xpath);
+			LOG.debug("LEAFS "+elements.size());
+			for (SVGElement element : elements) {
+				if (!(element instanceof SVGG)) {
+					throw new RuntimeException("Must operate on <g> elements");
+				}
+				LOG.debug("*********************ELEMENT "+element.getId());
+				analyzeChunk((SVGG)element);
+			}
+			debugFile("target/chunkAnalyzer1Axes.svg");
+		}
+	}
+	
+	private void analyzeChunk(SVGG svgg) {
+		ChunkAnalyzer chunkAnalyzer = new ChunkAnalyzer(pageAnalyzer);
+		chunkAnalyzer.analyzeChunk(svgg);
+	}
+	
+}

src/main/java/org/xmlcml/graphics/control/page/ChunkAnalyzerElement.java

+package org.xmlcml.graphics.control.page;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import nu.xom.Node;
+import nu.xom.Nodes;
+
+import org.xmlcml.graphics.control.AbstractActionElement;
+import org.xmlcml.graphics.control.CommandElement;
+import org.xmlcml.graphics.pdf2svg.DocumentAnalyzer;
+import org.xmlcml.graphics.svg.SVGSVG;
+
+
+public class ChunkAnalyzerElement extends AbstractActionElement {
+
+	public final static String TAG ="chunkAnalyzer";
+	private static final List<String> ATTNAMES = new ArrayList<String>();
+	
+	/** attribute names
+	 * 
+	 */
+
+	static {
+		ATTNAMES.add(PageActionElement.XPATH);
+	}
+
+	/** constructor
+	 */
+	public ChunkAnalyzerElement() {
+		super(TAG);
+		init();
+	}
+	
+	protected void init() {
+	}
+	
+	/** constructor
+	 */
+	public ChunkAnalyzerElement(CommandElement element) {
+        super(element);
+	}
+	
+    /**
+     * copy node .
+     *
+     * @return Node
+     */
+    public Node copy() {
+        return new ChunkAnalyzerElement(this);
+    }
+
+	/**
+	 * @return tag
+	 */
+	public String getTag() {
+		return TAG;
+	}
+
+	protected List<String> getAttributeNames() {
+		return ATTNAMES;
+	}
+
+	protected List<String> getRequiredAttributeNames() {
+		return Arrays.asList(new String[]{
+				AbstractActionElement.XPATH,
+		});
+	}
+
+
+}

src/main/java/org/xmlcml/graphics/control/page/ChunkStyle.java

+package org.xmlcml.graphics.control.page;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import nu.xom.Element;
+import nu.xom.Nodes;
+
+import org.xmlcml.cml.base.CMLConstants;
+import org.xmlcml.graphics.control.page.Chunk.ChunkRole;
+import org.xmlcml.graphics.svg.SVGPath;
+import org.xmlcml.graphics.svg.SVGText;
+
+/** holds style info for svg:text, svg:path or svg:image
+ * a single style has one path and one optional font
+ * there can be several styles with the same path and/or role
+ * we are still exploring what is allowed
+<styles>
+  <style target="path" role="graphicsStrokes" pages="all">
+    <path d="M0.0 0.0 L595.0 0.0 L595.0 793.0 L0.0 793.0 L0.0 0.0 Z" />
+    <!-- pages 0 2 4 5 6 7 8 12 14 -->
+  </style>
+  <style target="text" role="biblio tableCaption figureCaption" pages="all">
+    <path d="M0.0 0.0 L74.653 0.0 L74.653 99.495 L0.0 99.495 L0.0 0.0 Z" />
+    <!-- pages 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 -->
+    <font size="797" />
+  </style>
+  <style target="text" role="biblio tableCaption figureCaption" pages="all">
+    <path d="M0.0 0.0 L74.653 0.0 L74.653 99.495 L0.0 99.495 L0.0 0.0 Z" />
+    <font size="796" />
+    <!-- pages 1 3 8 9 10 11 12 14 -->
+  </style>
+
+ * 
+ * @author pm286
+ *
+ */
+public class ChunkStyle {
+
+	private static final String FILENAME = "filename";
+	private static final String INVOKE = "invoke";
+	// attributes of style
+	private static final String SIZE = "size";
+	private static final String FONT = "font";
+	private static final String SUBSCRIPTY = "subscripty";
+	private static final String SUPERSCRIPTY = "superscripty";
+	private static final String INTERLINE = "interline";
+	
+	private static final String TEXT = "text";
+	private static final String PATH = "path";
+	private static final String TARGET = "target";
+	private static final String ROLE = "role";
+	private static final String PAGES = "pages";
+	private static final String NAME = "name";
+	private static final String ACTION = "action";
+	private static final String STROKE = "stroke";
+	private static final String BOXCOUNT = "boxCount";
+	private static final String FILL = "fill";
+	private static final String FONT_SIZE = SIZE;
+	private static final String MARGINX = "marginX";
+	private static final String MARGINY = "marginY";
+	static final String REGEX = "regex";
+	private static final String STARTS_WITH = "startsWith";
+	
+
+	// logical components
+//	public static final String BOX = "box";
+	public static final String OUTLINED_BOX = "outlinedBox";
+	public static final String FIGURE_CAPTION = "figureCaption";
+	public static final String FIGURE = "figure";
+	public static final String TRUE = "true";
+	public static final String FIGURE_BODY = "FIGURE_BODY";
+	public static final String TEXT_CHUNK = "TEXT_CHUNK";
+	private static final String XPATH = "xpath";
+	private static final String VARS = "vars";
+	public static final String DELETE = "delete";
+	static final String PAGE_RANGE = "pageRange";
+	public static final String REPLACE_ROUNDED_BOX = "replaceRoundedBox";
+	public static final String SUSCRIPT = "suscript";
+	public static final String MAKE_TEXT_CHUNKS = "makeTextChunks";
+	public static final String DRAW_BOXES = "drawBoxes";
+	public static final String WRITE_FILE = "writeFile";
+	private static final String OPACITY = "opacity";
+	private static final String STROKE_WIDTH = "strokeWidth";
+	
+	Element element;
+	private List<ChunkRole> chunkRoleList;
+
+	public ChunkStyle(Element element) {
+		this.element = (Element) element.copy();
+	}
+	
+	public String getPathDString() {
+		Nodes nodes = element.query("path/@d");
+		return nodes.size() == 1 ? nodes.get(0).getValue() : null;
+	}
+	
+	public Double getInterlineSeparation() {
+		return getDouble(INTERLINE);
+	}
+
+	public Double getSubscriptY() {
+		return getDouble(SUBSCRIPTY);
+	}
+
+	public Double getSuperscriptY() {
+		return getDouble(SUPERSCRIPTY);
+	}
+
+	private Double getDouble(String attName) {
+		String ss = element.getAttributeValue(attName);
+		return (ss == null) ? null : new Double(ss);
+	}
+
+	private Double getDouble(String attName, Double defalt) {
+		Double d = getDouble(attName);
+		return (d == null) ? defalt : d;
+	}
+
+	public Integer getBoxCount() {
+		return getInteger(BOXCOUNT);
+	}
+
+	private Integer getInteger(String attName) {
+		String ss = element.getAttributeValue(attName);
+		return (ss == null) ? null : new Integer(ss);
+	}
+
+	public Double getMarginX() {
+		return getDouble(MARGINX);
+	}
+
+	public Double getMarginY() {
+		return getDouble(MARGINY);
+	}
+
+	public Double getStrokeWidth() {
+		return getDouble(STROKE_WIDTH, 1.0);
+	}
+
+	public Double getOpacity() {
+		return getDouble(OPACITY, 1.0);
+	}
+
+	private String getSingleFontAttributeValue(String attName) {
+		Element font = getSingleFont(); 
+		return (font == null) ? null : font.getAttributeValue(attName);
+	}
+	
+	private Element getSingleFont() {
+		Nodes fonts = element.query(FONT+"[@"+SIZE+"]");
+		return (fonts.size() == 1) ? (Element) fonts.get(0) : null;
+	}
+
+	public Integer getFontSize() {
+		String fontS = getSingleFontAttributeValue(FONT_SIZE);
+		return (fontS == null) ? null : new Integer(fontS);
+	}
+	
+	public String getAllowedPages() {
+		return element.getAttributeValue(PAGES);
+	}
+
+	/**
+	  <style page="1-*" xpath="svg:g[@id='chunk0.0.1']/svg:g/svg:g/svg:g[@name='para']/svg:text" title="pageNumber" 
+		    invoke="afterChunk" action="regex delete" 
+		    regex="Page\s+(\d+)of(\d+)" vars="page pageCount" 
+		    />
+
+	<!--=========== page 0 paths ===================-->
+	  <style page="0" xpath="svg:g[@id='chunk0.0.0']/svg:g/svg:g/svg:g[@name='para']/svg:text" title="pageMetadata" 
+	  invoke="afterChunk" action="regex delete" 
+	    regex="(.*)(BMC.*)(\d\d\d\d), (\d+):(\d+)\s*http\://www.biomedcentral.com/(\d+\-\d+)/(\d+)/(\d+)" 
+	    vars="author journal year issue article doiSuffix issue1 article1"
+	    />
+*/
+	public String getInvoke() {
+		return element.getAttributeValue(INVOKE);
+	}
+	
+	public List<String> getActionList() {
+		return getList(ACTION); 
+	}
+
+	public String getXpath() {
+		return element.getAttributeValue(XPATH); 
+	}
+
+	public List<String> getVarsList() {
+		return getList(VARS); 
+	}
+
+	private List<String> getList(String attName) {
+		String s = element.getAttributeValue(attName);
+		return (s == null) ? new ArrayList<String>() : Arrays.asList(s.split(CMLConstants.S_WHITEREGEX));
+	}
+	
+	public String getRolesString() {
+		String roleString = null;
+		Nodes roles = element.query("@"+ROLE);
+		if (roles.size() == 1) {
+			roleString = roles.get(0).getValue();
+		}
+		return roleString;
+	}
+	
+	public List<ChunkRole> getChunkRoleList() {
+		if (chunkRoleList == null) {
+			chunkRoleList = new ArrayList<ChunkRole>();
+			String roleString = this.getRolesString();
+			if (roleString != null) {
+				String[] roles = roleString.split(" ");
+				for (String role : roles) {
+					ChunkRole chunkRole = ChunkRole.valueOf(role);
+					if (chunkRole == null) {
+						throw new RuntimeException("Cannot find ChunkRole for: "+role);
+					}
+					chunkRoleList.add(chunkRole);
+				}
+			}
+		}
+		return chunkRoleList;
+	}
+	
+	public boolean containsRole(ChunkRole chunkRole) {
+		getChunkRoleList();
+		return chunkRoleList != null && chunkRoleList.contains(chunkRole);
+	}
+	
+	public Class<?> getTargetClass() {
+		Class clazz = null;
+		Nodes targets = element.query("@"+TARGET);
+		if (targets.size() == 1) {
+			String target = targets.get(0).getValue();
+			if (PATH.equals(target)) {
+				clazz = SVGPath.class;
+			} else if (TEXT.equals(target)) {
+				clazz = SVGText.class;
+			}
+		}
+		return clazz;
+	}
+
+	public String getName() {
+		return element.getAttributeValue(NAME);
+	}
+
+	public String getStroke() {
+		return element.getAttributeValue(STROKE);
+	}
+
+	public String getFill() {
+		return element.getAttributeValue(FILL);
+	}
+
+	public String getRegex() {
+		return element.getAttributeValue(REGEX);
+	}
+
+	public String getStartsWith() {
+		return element.getAttributeValue(STARTS_WITH);
+	}
+
+	public String toString() {
+		return element == null ? null : element.toXML();
+	}
+
+	public String getAttribute(String name) {
+		return element.getAttributeValue(name);
+	}
+
+	public String getFilename() {
+		return getAttribute(FILENAME);
+	}
+
+}
+

src/main/java/org/xmlcml/graphics/control/page/PageAction.java

 import org.apache.log4j.Logger;
 import org.xmlcml.cml.base.CMLConstants;
 import org.xmlcml.cml.base.CMLUtil;
+import org.xmlcml.euclid.RealArray;
 import org.xmlcml.graphics.control.AbstractAction;
 import org.xmlcml.graphics.control.AbstractActionElement;
 import org.xmlcml.graphics.pdf2svg.AbstractAnalyzer;
 import org.xmlcml.graphics.pdf2svg.DocumentAnalyzer;
-import org.xmlcml.graphics.pdf2svg.PageAnalyzer;
 import org.xmlcml.graphics.svg.SVGElement;
 import org.xmlcml.graphics.svg.SVGSVG;
 import org.xmlcml.graphics.svg.SVGUtil;

src/main/java/org/xmlcml/graphics/control/page/PageActionFactory.java

 
 		PageAction pageAction = null;
 		if (false) {
-		} else if(command instanceof PlotAnalyzerElement) {
-			pageAction = new PlotAnalyzerAction(command);
+		} else if(command instanceof ChunkAnalyzerElement) {
+			pageAction = new ChunkAnalyzerAction(command);
 		} else if(command instanceof BoxDrawerElement) {
 			pageAction = new BoxDrawerAction(command);
 		} else if(command instanceof BoxProcessorElement) {

src/main/java/org/xmlcml/graphics/control/page/PageAnalyzer.java

+package org.xmlcml.graphics.control.page;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import nu.xom.Attribute;
+import nu.xom.Element;
+
+import org.apache.log4j.Logger;
+import org.xmlcml.cml.base.CMLUtil;
+import org.xmlcml.euclid.Transform2;
+import org.xmlcml.graphics.figure.Figure;
+import org.xmlcml.graphics.figure.FigureAnalyzer;
+import org.xmlcml.graphics.font.FontManager;
+import org.xmlcml.graphics.font.Glyph;
+import org.xmlcml.graphics.font.OutlineFont;
+import org.xmlcml.graphics.paths.PathAnalyzer;
+import org.xmlcml.graphics.pdf2svg.AbstractAnalyzer;
+import org.xmlcml.graphics.pdf2svg.DocumentAnalyzer;
+import org.xmlcml.graphics.pdf2svg.PConstants;
+import org.xmlcml.graphics.pdf2svg.StyleManager;
+import org.xmlcml.graphics.pdf2svg.Table;
+import org.xmlcml.graphics.pdf2svg.TableAnalyzer;
+import org.xmlcml.graphics.svg.SVGClipPath;
+import org.xmlcml.graphics.svg.SVGElement;
+import org.xmlcml.graphics.svg.SVGPath;
+import org.xmlcml.graphics.svg.SVGSVG;
+import org.xmlcml.graphics.svg.SVGText;
+import org.xmlcml.graphics.svg.SVGUtil;
+import org.xmlcml.graphics.text.TextAnalyzer;
+import org.xmlcml.graphics.util.GraphUtil;
+
+import com.google.common.collect.BiMap;
+import com.google.common.collect.HashBiMap;
+import com.google.common.collect.Multimap;
+
+/** the key class controlling the analysis of a page.
+ * 
+ * PageAnalyzer analyzes a page without other context (although its results can be fed back to
+ * DocumentAnalyzer). Note that the name is visually close to PathAnalyzer.
+ * 
+ * It has several sub-analyzers (e.g. 
+ * ChunkAnalyzer, TextAnalyzer, PathAnalyzer, FigureAnalyzer, TableAnalyzer)
+ * which are deployed in sequence (and possibly multiple times) on the single page. The results of
+ * these are left in the transformed page but may also be abstracted and aggregated 
+ * back to DocumentAnalyzer (in this way overall heuristic data can be compiled).
+ * These analyzers all have references to PageAnalyzer.
+ * 
+ * Any "stitching together" of pages will be done by DocumentAnalyzer.
+ * 
+ * @author pm286
+ *
+ */
+public class PageAnalyzer extends AbstractAnalyzer {
+
+	private static final Logger LOG = Logger.getLogger(PageAnalyzer.class);
+	
+	private static final String SHAPE_RENDERING = "shape-rendering";
+	private static final String SPACE = "space";
+	private static final String TEXT_RENDERING = "text-rendering";
+	
+	private static final String AFTER_CHUNK = "afterChunk";
+	private static final String AFTER_TEXT = "afterText";
+	private static final String AFTER_PATH = "afterPath";
+
+	public static final int DECIMAL_PLACES = 3;
+
+	public static final String ROLE = "role";
+	public static final String CHAR = "char";
+	public static final String RECT = "rect";
+	public static final String POLYLINE = "polyline";
+	public static final String PATH = "path";
+	public static final String USE = "use";
+	public static final String FIRST = "first";
+	public static final String NOT_FIRST = "notFirst";
+	public static final String LAST = "last";
+	public static final String REPORTED_PAGE_NUMBER = "reportedPageNumber";
+	private static final String BEFORE_TEXT = "beforeText";
+	public static final String NAME_PREFIX = PConstants.P;
+
+	private List<Glyph> glyphList;
+	private FontManager fontManager;
+	private StyleManager styleManager;
+	private BiMap<String, String> clipPathByIdMap;
+	
+	private SVGSVG svgPage;
+	private int pageNumber;
+	private String pageNumberString;
+	private Integer pageNumberInteger;
+	private List<SVGText> textChunkList;
+	private List<Figure> figureList;
+	private List<Table> tableList;
+
+	/** not sure how many of these are used
+	 */
+	
+	private PathAnalyzer pathAnalyzer;
+	private OutlineFont outlineFont;
+	
+	DocumentAnalyzer documentAnalyzer;
+	private PageClipPathAnalyzer clipPathAnalyzer;
+	private PageFontSizeAnalyzer fontSizeAnalyzer;
+	private PageChunkSplitter chunkAnalyzer;
+	private TextAnalyzer textAnalyzer;
+	private FigureAnalyzer figureAnalyzer;
+	private TableAnalyzer tableAnalyzer;
+
+	public PageAnalyzer() {
+	}
+	
+	public PageAnalyzer(SVGSVG svgPage) {
+		this(null, svgPage);
+	}
+
+	public PageAnalyzer(DocumentAnalyzer documentAnalyzer, SVGSVG svgPage) {
+		this();
+		this.documentAnalyzer = documentAnalyzer;
+		this.svgPage = svgPage;
+	}
+
+	public static void removeUnwantedSVGAttributesAndAddIds(SVGSVG svgPage) {
+		Long time0 = System.currentTimeMillis();
+		List<SVGElement> elements = SVGUtil.getQuerySVGElements(svgPage, "//svg:*");
+		List<Attribute> attributeList = new ArrayList<Attribute>();
+		int attno = 0;
+		for (SVGElement element : elements) {
+			for (int i = 0; i < element.getAttributeCount(); i++) {
+				Attribute attribute = element.getAttribute(i);
+				String name = attribute.getLocalName();
+				if (name.equals(TEXT_RENDERING) ||
+				    name.equals(SHAPE_RENDERING) ||
+					name.equals(SPACE)) {
+					attributeList.add(attribute);
+				}
+			}
+			if (element.getId() == null) {
+				element.setId(element.getLocalName()+(attno++));
+			}
+		}
+
+		for (Attribute attribute : attributeList) {
+			attribute.detach();
+		}
+		LOG.trace("ATTS "+(System.currentTimeMillis()-time0));
+	}
+	
+
+	// ========================= TRANSFERRED ELSEWHERE ======================
+	
+	public PathAnalyzer ensurePathAnalyzer() {
+		if (pathAnalyzer == null) {
+			pathAnalyzer = new PathAnalyzer(this);
+		}
+		return pathAnalyzer;
+	}
+
+	public int getPageNumber() {
+		return pageNumber;
+	}
+
+	public PathAnalyzer getPathAnalyzer() {
+		return pathAnalyzer;
+	}
+
+
+	public TextAnalyzer getTextAnalyzer() {
+		return textAnalyzer;
+	}
+
+	public OutlineFont getOutlineFont() {
+		return outlineFont;
+	}
+
+	/** returns a conventional page number (e.g. at top or bottom of page)
+	 * formally unrelated to the number of pages in the document though normally a constant offset
+	 * @return the number as a string (might be "22a", etc.)
+	 */
+	public String getAuthorPageNumberString() {
+		return pageNumberString;
+	}
+	
+	/** returns a conventional page number (e.g. at top or bottom of page)
+	 * formally unrelated to the number of pages in the document though normally a constant offset
+	 * @return the number as an integer (null if unparsable as such)
+	 */
+	public Integer getAuthorPageNumber() {
+		return pageNumberInteger;
+	}
+	
+	/**
+	 * get chunks of text in PDF order. Requires heuristics to decide what they are
+	 * @return
+	 */
+	public List<SVGText> getTextChunkList() {
+		return textChunkList;
+	}
+	
+	/**
+	 * get chunks of text in PDF order. Requires heuristics to decide what they are
+	 * @return
+	 */
+	public List<Figure> getFigureList() {
+		ensureFigureList();
+		return figureList;
+	}
+	
+	private void ensureFigureList() {
+		if (figureList == null) {
+			figureList = new ArrayList<Figure>();
+		}
+	}
+
+	public List<Glyph> getGlyphList() {
+		return glyphList;
+	}
+
+	public FontManager getFontManager() {
+		return fontManager;
+	}
+
+	public void setFontManager(FontManager fontManager) {
+		this.fontManager = fontManager;
+	}
+	
+	public void analyzeClipPathsAndAddToMap(Multimap<String, PageAnalyzer> clipPathDMap) {
+		analyzeClipPaths();
+		addClipPathsToMap(clipPathDMap);
+	}
+
+	public void analyzeClipPaths() {
+		ensureClipPathAnalyzer();
+		clipPathAnalyzer.analyze();
+	}
+
+	private void ensureClipPathAnalyzer() {
+		if (clipPathAnalyzer == null) {
+			clipPathAnalyzer = new PageClipPathAnalyzer(this);
+		}
+	}
+
+	public void addClipPathsToMap(Multimap<String, PageAnalyzer> clipPathDMap) {
+		List<SVGElement> clipPaths  = this.getPageClipPathAnalyzer().getClipPathList();
+		for (SVGElement clipPath : clipPaths) {
+			SVGPath path = (SVGPath) ((SVGClipPath) clipPath).getChildElements().get(0);
+			String d = path.getDString();
+			LOG.trace(""+this.getPageNumber()+" .. "+d);
+			clipPathDMap.put(d, this);
+		}
+	}
+
+	public void analyzeFontSizesAndAddToMap(Multimap<Integer, PageAnalyzer> fontSizeMap) {
+		analyzeFontSizes();
+		addFontSizesToMap(fontSizeMap);
+	}
+
+	private void analyzeFontSizes() {
+		ensureFontSizeAnalyzer();
+		fontSizeAnalyzer.analyze();
+	}
+
+	public PageFontSizeAnalyzer ensureFontSizeAnalyzer() {
+		if (fontSizeAnalyzer == null) {
+			fontSizeAnalyzer = new PageFontSizeAnalyzer(this);
+		}
+		return fontSizeAnalyzer;
+	}
+
+	public PageChunkSplitter ensureChunkAnalyzer() {
+		if (chunkAnalyzer == null) {
+			chunkAnalyzer = new PageChunkSplitter(this);
+		}
+		return chunkAnalyzer;
+	}
+
+	public TextAnalyzer ensureTextAnalyzer() {
+		if (textAnalyzer == null) {
+			textAnalyzer = new TextAnalyzer(this);
+		}
+		return textAnalyzer;
+	}
+
+	public FigureAnalyzer ensureFigureAnalyzer() {
+		if (figureAnalyzer == null) {
+			figureAnalyzer = new FigureAnalyzer(this);
+		}
+		return figureAnalyzer;
+	}
+
+	public TableAnalyzer ensureTableAnalyzer() {
+		if (tableAnalyzer == null) {
+			tableAnalyzer = new TableAnalyzer(this);
+		}
+		return tableAnalyzer;
+	}
+
+	private void applyBrowserScale() {
+		List<SVGElement> gList = SVGUtil.getQuerySVGElements(svgPage, ".//svg:g[@id='"+PageChunkSplitter.TOP_CHUNK+"']");
+		if (gList.size() != 1) {
+			LOG.error("should have one topChunk G");
+		} else {
+			gList.get(0).setTransform(Transform2.applyScale(0.7));
+		}
+	}
+	
+
+	public void addFontSizesToMap(Multimap<Integer, PageAnalyzer> fontSizeMap) {
+		Multimap<Integer, SVGElement> elementsByFontSize = fontSizeAnalyzer.createMapsForElementsByFontSize();
+		for (Integer fontSize : elementsByFontSize.keySet()) {
+			fontSizeMap.put(fontSize, this);
+		}
+	}
+	
+	public SVGSVG getSVGPage() {
+		return svgPage;
+	}
+
+	public void setSVGPage(SVGSVG svgPage) {
+		this.svgPage = svgPage;
+	}
+
+	public void setPageNumber(int pageNumber) {
+		this.pageNumber = pageNumber;
+	}
+
+	public PageClipPathAnalyzer getPageClipPathAnalyzer() {
+		return clipPathAnalyzer;
+	}
+
+	public PageFontSizeAnalyzer getPageFontSizeAnalyzer() {
+		return fontSizeAnalyzer;
+	}
+
+	public PageChunkSplitter getChunkAnalyzer() {
+		return chunkAnalyzer;
+	}
+
+	public StyleManager getStyleManager() {
+		return documentAnalyzer.getSemanticDocumentAction().getStyleManager();
+	}
+
+	public List<Table> getTableList() {
+		ensureTableList();
+		return tableList;
+	}
+
+	private void ensureTableList() {
+		if (tableList == null) {
+			tableList = new ArrayList<Table>();
+		}
+	}
+
+	/** because the unscaled clipPaths can hide the scaled stuff
+	 * use with care as the defs are used for styles - i.e. only at the end
+	 * 
+	 */
+	public void removeClipPathsForDisplay() {
+		List<SVGElement> clipPathDefs = SVGUtil.getQuerySVGElements(
+				svgPage, "svg:g/svg:defs/svg:clipPath");
+		for (SVGElement def : clipPathDefs) {
+			def.detach();
+		}
+	}
+
+	private static void usage() {
+		System.err.println("Usage: <svgfilein>");
+	}
+	
+	public void setOutlineFont(OutlineFont outlineFont) {
+		this.outlineFont = outlineFont;
+	}
+
+	public boolean islastPage(int pageNumber) {
+		boolean isLastPage = false;
+		Object lastPageS = documentAnalyzer.getValue(DocumentAnalyzer.REPORTED_PAGE_COUNT);
+		if (lastPageS != null && lastPageS instanceof String) {
+			try {
+				Integer lastPage = new Integer((String)lastPageS);
+				isLastPage = lastPage == pageNumber + 1; // we count from ZERO, document counts from 1
+			} catch (Exception e) {
+				throw new RuntimeException("bad page number: "+lastPageS, e);
+			}
+		}
+		return isLastPage;
+	}
+	/** tru if no pages range or pages='first' and page==0
+	 * 
+	 * @param chunkStyle TODO
+	 * @param pageNumber
+	 * @return
+	 */
+	public boolean isAllowedPage(ChunkStyle chunkStyle, int pageNumber) {
+		boolean allowed = false;
+		String range = chunkStyle.element.getAttributeValue(ChunkStyle.PAGE_RANGE);
+		if (range == null) {
+			allowed = true; // no page range, allowed
+		} else if (FIRST.equals(range) && pageNumber == 0) {
+			allowed = true;
+		} else if (NOT_FIRST.equals(range) && pageNumber > 0) {
+			allowed = true;
+		} else if (LAST.equals(range) && islastPage(pageNumber)) {
+			allowed = true;
+		} else {
+			allowed = false;
+		}
+		return allowed;
+	}
+
+	/**
+   <clipPath clipPathUnits="userSpaceOnUse" id="clipPath1">
+    <path d="M0 0 L60.9419 0 L60.9419 81.2217 L0 81.2217 L0 0 Z"/>
+   </clipPath>
+	 */
+	public BiMap<String, String> ensureClipPathByIdMap() {
+		if (clipPathByIdMap == null) {
+			this.clipPathByIdMap = HashBiMap.create();
+			List<SVGElement> clipPaths = SVGUtil.getQuerySVGElements(svgPage, "svg:g/svg:defs/svg:clipPath");
+			for (SVGElement clipPath : clipPaths) {
+				String id = clipPath.getId();
+				String d = clipPath.getChildElements().get(0).getAttributeValue("d");
+				try {
+					clipPathByIdMap.put(id,  d);
+				} catch (IllegalArgumentException iae) {
+					LOG.trace("clip path failure: "+iae);
+				}
+			}
+		}
+		return clipPathByIdMap;
+	}
+
+	public DocumentAnalyzer getDocumentAnalyzer() {
+		return documentAnalyzer;
+	}
+
+	public void setDocumentAnalyzer(DocumentAnalyzer documentAnalyzer) {
+		this.documentAnalyzer = documentAnalyzer;
+	}
+
+	public String getNamePrefix() {
+		return NAME_PREFIX;
+	}
+}

src/main/java/org/xmlcml/graphics/control/page/PageAnalyzerAction.java

 import org.xmlcml.graphics.pdf2svg.AbstractAnalyzer;
 import org.xmlcml.graphics.pdf2svg.DocumentAnalyzer;
 import org.xmlcml.graphics.pdf2svg.PDF2SVGReader;
-import org.xmlcml.graphics.pdf2svg.PageAnalyzer;
 import org.xmlcml.graphics.svg.SVGSVG;
 
 public class PageAnalyzerAction extends AbstractAction {

src/main/java/org/xmlcml/graphics/control/page/PageChunkSplitter.java

+package org.xmlcml.graphics.control.page;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import nu.xom.Attribute;
+import nu.xom.Nodes;
+
+import org.apache.log4j.Logger;
+import org.xmlcml.graphics.pdf2svg.AbstractAnalyzer;
+import org.xmlcml.graphics.pdf2svg.BoundingBoxManager.BoxEdge;
+import org.xmlcml.graphics.svg.SVGElement;
+import org.xmlcml.graphics.svg.SVGSVG;
+import org.xmlcml.graphics.svg.SVGUtil;
+
+/**
+ * page-oriented
+ * 
+ * slices the page up into chunks using continuous whitespace. 
+ * Can then recurse. Most obvious strategy is:
+ * 1 slice Y (i.e. horizontal borders)
+ * 2 slice X on results of 1
+ * 3 slice Y on results of 2
+ * 
+ * determine types of chunk after this and apply different strategies