Commits

petermr  committed e6e3f5d

changed id and artifact

  • Participants

Comments (0)

Files changed (187)

+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java"/>
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources"/>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java"/>
+	<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5"/>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"/>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>chemdraw</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+</projectDescription>

File .settings/org.eclipse.core.resources.prefs

+#Sat Feb 25 17:05:13 GMT 2012
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/main/resources=UTF-8
+encoding//src/test/java=UTF-8
+encoding//src/test/resources=UTF-8
+encoding/<project>=UTF-8

File .settings/org.eclipse.jdt.core.prefs

+#Sat Feb 25 17:05:14 GMT 2012
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
+org.eclipse.jdt.core.compiler.compliance=1.5
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.5

File .settings/org.eclipse.m2e.core.prefs

+#Sat Feb 25 17:05:11 GMT 2012
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	
+	<parent>
+		<groupId>uk.ac.cam.ch.wwmm</groupId>
+		<artifactId>wwmm-parent</artifactId>
+		<version>3</version>
+    </parent>
+    
+	<groupId>org.xml-cml.chemdraw</groupId>
+	<artifactId>chemdraw-converter</artifactId>
+	<version>0.3-SNAPSHOT</version>
+	<name>Chemdraw-converters</name>
+	
+	<description>Converts CDX and CDXML from and to CML</description>
+	<url>http://www.xml-cml.org/</url>
+	<scm />
+	
+	<properties>
+	    <chemdraw.groupId>org.xml-cml.chemdraw</chemdraw.groupId>
+	    
+		<junit.groupId>junit</junit.groupId>
+		<junit.artifactId>junit</junit.artifactId>
+		<junit.version>4.8.2</junit.version>
+		
+  	    <jumbo.groupId>org.xml-cml</jumbo.groupId>
+		<jumbo.version>6.1-SNAPSHOT</jumbo.version>
+		
+		<jumbo-testutil.version>1.1-SNAPSHOT</jumbo-testutil.version>
+	</properties>
+
+	<build>
+		<finalName>chemdraw-converter</finalName>
+		<plugins>
+			<plugin>
+				<artifactId>maven-assembly-plugin</artifactId>
+				<configuration>
+					<descriptorRefs>
+						<descriptorRef>jar-with-dependencies
+						</descriptorRef>
+					</descriptorRefs>
+					<archive>
+						<manifest>
+							<mainClass>org.xmlcml.cml.chemdraw.ChemdrawConverter
+							</mainClass>
+						</manifest>
+					</archive>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<configuration>
+					<source>1.5</source>
+					<target>1.5</target>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-surefire-plugin</artifactId>
+				<configuration>
+					<excludes>
+						<exclude>org/xmlcml/util/TestUtils.java</exclude>
+					</excludes>
+				</configuration>
+			</plugin>			
+		</plugins>
+	</build>
+	
+	<repositories>
+		<repository>
+			<id>wwmm-repo</id>
+			<name>WWMM Maven2</name>
+			<url>https://maven.ch.cam.ac.uk/m2repo</url>
+		</repository>
+	</repositories>
+	
+	<dependencies>
+		<dependency>
+			<groupId>${jumbo.groupId}</groupId>
+			<artifactId>jumbo</artifactId>
+			<version>${jumbo.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>${junit.groupId}</groupId>
+			<artifactId>junit</artifactId>
+			<version>${junit.version}</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>log4j</groupId>
+			<artifactId>log4j</artifactId>
+			<version>1.2.13</version>
+		</dependency>
+      <dependency>
+         <groupId>${jumbo.groupId}</groupId>
+         <artifactId>jumbo-testutil</artifactId>
+			<version>${jumbo-testutil.version}</version>
+         <scope>test</scope>
+      </dependency>
+	</dependencies>
+	
+	<reporting>
+		<plugins>
+			<plugin>
+				<artifactId>maven-project-info-reports-plugin
+				</artifactId>
+				<reportSets>
+					<reportSet>
+						<reports>
+							<report>dependencies</report>
+							<report>project-team</report>
+							<report>license</report>
+							<report>scm</report>
+						</reports>
+					</reportSet>
+				</reportSets>
+			</plugin>
+			<plugin>
+				<artifactId>maven-javadoc-plugin</artifactId>
+			</plugin>
+		</plugins>
+	</reporting>
+	<distributionManagement>
+		<repository>
+			<id>wwmm-dav</id>
+			<name>WWMM</name>
+			<url>dav:http://wwmm.ch.cam.ac.uk/maven2
+			</url>
+		</repository>
+	</distributionManagement>
+</project>

File src/main/java/org/xmlcml/cml/chemdraw/CDX2CDXML.java

+package org.xmlcml.cml.chemdraw;
+
+import java.io.ByteArrayInputStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.xmlcml.cml.base.CMLConstants;
+import org.xmlcml.cml.chemdraw.components.CDXObject;
+import org.xmlcml.cml.chemdraw.components.CDXParser;
+import org.xmlcml.cml.chemdraw.components.ChemdrawRuntimeException;
+
+/**
+ * Converts a binary CDX (ChemDraw) file into a tree of CDX objects
+ * from which CDXML can be produced.
+ * <p>
+ * Relies on the format description on the ChemDraw website; some primitives
+ * (especially pure graphics) may not be fully supported.
+ * <p>
+ * Parsing tolerates some corruption: whenever the parser throws a
+ * {@link ChemdrawRuntimeException}, a run of blocks starting at the block that
+ * contains the parser's current byte count is cut out of the input and the
+ * parse is retried, up to a fixed number of attempts.
+ * <p>
+ * This code is open source under the Artistic License;
+ * see http://www.opensource.org for conditions.
+ *
+ * @author P.Murray-Rust, 2001-2008
+ */
+public class CDX2CDXML {
+
+	static Logger LOG = Logger.getLogger(CDX2CDXML.class);
+	static {
+		LOG.setLevel(Level.DEBUG);
+	}
+	/** number of bytes shown on one row of a hex dump */
+	private static final int ROWSIZE = 16;
+	/** granularity, in bytes, at which input is excised */
+	private static final int BLOCKSIZE = 8 * ROWSIZE;
+	/** maximum number of parse attempts made on one input */
+	private static final int MAX_PARSE_ATTEMPTS = 12;
+	/** number of blocks removed after each failed attempt; the value is empirical */
+	private static final int BLOCKS_PER_EXCISION = 4;
+
+	private CDXObject parsedObject;
+	private CDXParser parser;
+
+	/**
+	 * Creates a converter with a fresh {@link CDXParser}.
+	 */
+	public CDX2CDXML() {
+		init();
+	}
+
+	private void init() {
+		parser = new CDXParser();
+	}
+
+	/**
+	 * Reads the whole stream into memory and parses it as CDX.
+	 *
+	 * @param is stream containing the CDX bytes
+	 * @throws IOException if the stream cannot be read
+	 */
+	public void parseCDX(InputStream is) throws IOException {
+		byte[] bytes = IOUtils.toByteArray(is);
+		this.parseCDX(bytes);
+	}
+
+	/**
+	 * @return the parser used by this converter
+	 */
+	public CDXParser getParser() {
+		return parser;
+	}
+
+	/**
+	 * Parses CDX bytes and stores the parser's result for {@link #getCDXMLObject()}.
+	 *
+	 * @param bytes the CDX content
+	 */
+	public void parseCDX(byte[] bytes) {
+		LOG.trace("bytes: "+bytes.length);
+		parseAllowingForCorruptInput(bytes);
+		parsedObject = parser.getParsedObject();
+	}
+
+	/**
+	 * Parses the bytes, excising blocks and retrying each time the parser
+	 * throws a {@link ChemdrawRuntimeException}.
+	 * Any other exception is rethrown wrapped in a RuntimeException.
+	 * If every attempt fails the method returns normally (as it always has),
+	 * but now logs a warning so the failure is visible.
+	 */
+	private void parseAllowingForCorruptInput(byte[] bytes) {
+		for (int attempt = 0; attempt < MAX_PARSE_ATTEMPTS; attempt++) {
+			try {
+				parser.parseCDX(bytes);
+				return;
+			} catch (ChemdrawRuntimeException cre) {
+				bytes = exciseBlocksUntilNoLongerCorrupt(bytes, cre);
+			} catch (Exception e) {
+				throw new RuntimeException("cannot parse", e);
+			}
+		}
+		LOG.warn("Gave up after "+MAX_PARSE_ATTEMPTS
+				+" parse attempts; parsed object may be missing or incomplete");
+	}
+
+	/**
+	 * Removes {@link #BLOCKS_PER_EXCISION} blocks from the input, starting at
+	 * the block that contains the parser's current byte count.
+	 *
+	 * @param bytes input that failed to parse
+	 * @param cre the failure reported by the parser (used for the log message)
+	 * @return a shortened copy of the input
+	 */
+	private byte[] exciseBlocksUntilNoLongerCorrupt(byte[] bytes,
+			ChemdrawRuntimeException cre) {
+		int byteCount = roundToBlock(parser.getByteCount(), BLOCKSIZE);
+		byte[] shortened = exciseBlock(bytes, byteCount, BLOCKS_PER_EXCISION);
+		LOG.warn("Excised block at byte "+byteCount+" after: "+cre.getMessage());
+		return shortened;
+	}
+
+	/**
+	 * @return the object produced by the most recent parse
+	 */
+	public CDXObject getCDXMLObject() {
+		return parsedObject;
+	}
+
+//===================
+	/** Rounds byteCount down to a multiple of blocksize. */
+	private int roundToBlock(int byteCount, int blocksize) {
+		return (byteCount / blocksize) * blocksize;
+	}
+
+	/**
+	 * Debugging aid: logs hex dumps of the deltaBytes before and the
+	 * deltaBytes after byteCount. Not called from live code at present.
+	 */
+	private void debugBytesBothways(String msg, byte[] bytes, int byteCount, int deltaBytes) {
+		LOG.debug(">>>>>>>>>>"+msg+">>>>>>>>>>>>>>");
+		LOG.debug(debugBytesForward(bytes, byteCount-deltaBytes, deltaBytes));
+		LOG.debug("====================================");
+		LOG.debug(debugBytesForward(bytes, byteCount, deltaBytes));
+		LOG.debug("<<<<<<<<<<"+msg+"<<<<<<<<<<<<<");
+	}
+
+	/**
+	 * Formats deltaBytes bytes, beginning at start, as rows of
+	 * {@link #ROWSIZE} hex values, each row prefixed by its offset.
+	 *
+	 * @throws RuntimeException if start or deltaBytes is not a multiple of ROWSIZE
+	 */
+	private static String debugBytesForward(byte[] bytes, int start, int deltaBytes) {
+		if (start % ROWSIZE != 0) {
+			throw new RuntimeException("bad start "+start);
+		}
+		if (deltaBytes % ROWSIZE != 0) {
+			// report the offending value (the message used to print start here)
+			throw new RuntimeException("bad deltaBytes "+deltaBytes);
+		}
+		StringBuilder sb = new StringBuilder();
+		int nrows = deltaBytes / ROWSIZE;
+		int offset = start;
+		for (int irow = 0; irow < nrows; irow++) {
+			byte[] rowBytes = copyBytes(bytes, offset, ROWSIZE);
+			sb.append(toHexString(offset)+":  ");
+			sb.append(toString(rowBytes));
+			sb.append("\n");
+			offset += ROWSIZE;
+		}
+		return sb.toString();
+	}
+
+	/** Formats bytes as space-separated 2-digit hex, with an extra space after every 8 values. */
+	private static String toString(byte[] rowByte) {
+		StringBuilder sb = new StringBuilder();
+		for (int i = 0; i < rowByte.length; i++) {
+			if (i > 0 && i % 8 == 0) {
+				sb.append(CMLConstants.S_SPACE);
+			}
+			sb.append(toHexString(rowByte[i]));
+			sb.append(CMLConstants.S_SPACE);
+		}
+		return sb.toString();
+	}
+
+	/** Formats a byte as exactly two lower-case hex digits. */
+	private static String toHexString(byte b) {
+		// masking gives the unsigned value, so negative bytes no longer need trimming
+		String s = Integer.toHexString(b & 0xff);
+		return (s.length() == 1) ? "0"+s : s;
+	}
+
+	/**
+	 * Formats an offset as at least four hex digits.
+	 * An 8-digit value keeps only its low four digits; values of 5 to 7
+	 * digits are returned unpadded (unchanged historical behaviour).
+	 */
+	private static String toHexString(int i) {
+		StringBuilder sb = new StringBuilder(Integer.toHexString(i));
+		if (sb.length() == 8) {
+			sb.delete(0, 4);
+		}
+		while (sb.length() < 4) {
+			sb.insert(0, '0');
+		}
+		return sb.toString();
+	}
+
+	/** Copies rowsize bytes beginning at start into a new array. */
+	private static byte[] copyBytes(byte[] bytes, int start, int rowsize) {
+		byte[] rowBytes = new byte[rowsize];
+		System.arraycopy(bytes, start, rowBytes, 0, rowsize);
+		return rowBytes;
+	}
+
+	/**
+	 * Returns a copy of bytes with up to blocksToExcise blocks removed,
+	 * starting at the block that contains byteCount.
+	 * If fewer blocks remain than requested, everything from the start block
+	 * to the end is removed; previously this case failed with an array
+	 * index/size exception.
+	 *
+	 * @throws RuntimeException if bytes.length is not a multiple of BLOCKSIZE,
+	 *         or if less than one whole block remains at the start block
+	 */
+	private static byte[] exciseBlock(byte[] bytes, int byteCount, int blocksToExcise) {
+		int leftover = bytes.length % BLOCKSIZE;
+		if (leftover != 0) {
+			throw new RuntimeException("bytes not multiple of blocksize "+bytes.length+" / "+leftover);
+		}
+		int startBlock = BLOCKSIZE * (byteCount / BLOCKSIZE);
+		if (bytes.length - startBlock < BLOCKSIZE) {
+			throw new RuntimeException("Final block error: "+bytes.length+ " - " +startBlock +" < "+ BLOCKSIZE);
+		}
+		// clamp so that excision near the end of the input cannot overrun it
+		int resume = Math.min(bytes.length, startBlock + blocksToExcise * BLOCKSIZE);
+		byte[] newBytes = new byte[bytes.length - (resume - startBlock)];
+		System.arraycopy(bytes, 0, newBytes, 0, startBlock);
+		System.arraycopy(bytes, resume, newBytes, startBlock, bytes.length - resume);
+		LOG.debug("excised "+startBlock+".."+resume+" of "+bytes.length+"; BYTES "+newBytes.length);
+		return newBytes;
+	}
+
+	/** Removes the single block that contains byteCount. */
+	private static byte[] exciseBlock(byte[] bytes, int byteCount) {
+		return exciseBlock(bytes, byteCount, 1);
+	}
+
+}
+
+
+
+
+

File src/main/java/org/xmlcml/cml/chemdraw/CDXConstants.java

+package org.xmlcml.cml.chemdraw; 
+
+import org.xmlcml.cml.base.CMLConstants;
+import org.xmlcml.cml.chemdraw.components.BoundingBox;
+
+
+/**
+ * Constants shared by the ChemDraw (CDX/CDXML) converters: scale factors,
+ * buffer sizes, the CDX namespace, text-escape delimiters, label heuristics
+ * and attribute names.
+ * 
+ * @author pm286
+ *
+ */
+public interface CDXConstants extends CMLConstants {
+// I think this is the default but it isn't clearly mentioned
+	/** magnification factor (see the note above: believed to be the default) */
+	float MAG = 8;
+	/** scale factor for 2D values */
+	float SCALE2D = 65536;
+	/** scale factor for 3D values. NO IDEA WHETHER THIS WORKS*/
+	float SCALE3D = 65536;
+	/** 2D scale factor multiplied by {@link #MAG} */
+	float SCALE2DMAG = 65536 * MAG;
+
+	/** block size (0x100 = 256) */
+    int BLOCKSIZE = 0x100;   
+
+	/** number of zeros - NOTE(review): what is counted is not visible here; confirm against users */
+    int NZEROS = 16;
+	/** byte size limit - NOTE(review): presumably a maximum buffer size; confirm against users */
+	int BYTESIZE = 1000000;
+	/** maximum nesting depth */
+	int MAXDEPTH = 10; // to avoid recursion in misreads
+
+	/** namespace prefix used for CDX attributes */
+	String CDX_PREFIX = "cdx";
+	/**
+	 * namespace URI bound to {@link #CDX_PREFIX}.
+	 * NOTE(review): the host is "www.xml-cml" with no top-level domain; changing
+	 * it would alter the namespace of emitted documents, so confirm before editing.
+	 */
+	String CDX_NAMESPACE = "http://www.xml-cml/namespaces/cdx";
+	
+	/** left bracket for fonts */
+	String FLBRAK = "[[";
+	/** right bracket for fonts */
+	String FRBRAK = "]]";
+	/** left escape for non-ASCII */
+	String LESCAPE = "{{";
+	/** right escape for non-ASCII */
+	String RESCAPE = "}}";
+	/** left escape in regex */
+	String LESCAPEREGEX = "\\{\\{";
+	/** right escape in regex */
+	String RESCAPEREGEX = "\\}\\}";
+	
+	/** degree (character code 176 between the escape delimiters) */
+	String ESCAPE_DEGREE = LESCAPE+"176"+RESCAPE;
+	/** degree in regex */
+	String REGEX_DEGREE = LESCAPEREGEX+"176"+RESCAPEREGEX;
+	/** name used for temporary text */
+
+	String TEMP_TEXT = "temp_Text";
+	// these will be heuristic
+	/** largest allowed atom label size */
+	int MAX_ATOM_LABEL_FONT_SIZE = 5;
+	/** smallest allowed atom label size (currently equal to the largest) */
+	int MIN_ATOM_LABEL_FONT_SIZE = 5;
+	/** largest allowed molecule label size */
+	int MAX_MOLECULE_LABEL_FONT_SIZE = 13;
+	/** smallest allowed molecule label size */
+	int MIN_MOLECULE_LABEL_FONT_SIZE = 9;
+	/** largest allowed reaction label size */
+	int MAX_REACTION_LABEL_FONT_SIZE = 9;
+	/** smallest allowed reaction label size */
+	int MIN_REACTION_LABEL_FONT_SIZE = 6;
+	/** largest allowed difference between top of molBB and bottom of labelBB */
+	int MAX_MOLECULE_TO_LABEL_YDELTA = 30;
+	/** smallest allowed difference between top of molBB and bottom of labelBB */
+	int MIN_MOLECULE_TO_LABEL_YDELTA = -20;
+	
+	/** name of the bounding box attribute (same as {@code BoundingBox.TAG}) */
+	String ATT_BOUNDING_BOX = BoundingBox.TAG;
+	/** local name of the y-delta attribute */
+	String ATT_YDELTA = "ydelta";
+	/** y-delta attribute name qualified with the CDX prefix */
+	String CDX_YDELTA = CDX_PREFIX+S_COLON+ATT_YDELTA;
+	/** name of the font size attribute */
+	String ATT_FONTSIZE = "size";
+	/** name of the point attribute */
+	String ATT_POINT = "p";
+}

File src/main/java/org/xmlcml/cml/chemdraw/CDXML2CMLProcessor.java

+package org.xmlcml.cml.chemdraw;
+
+import static org.xmlcml.cml.base.CMLConstants.CMLXSD_ID;
+import static org.xmlcml.cml.base.CMLConstants.CML_XPATH;
+import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_BOUNDING_BOX;
+import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_FONTSIZE;
+import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_POINT;
+import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_YDELTA;
+import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_NAMESPACE;
+import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_PREFIX;
+import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_YDELTA;
+import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_ATOM_LABEL_FONT_SIZE;
+import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_MOLECULE_LABEL_FONT_SIZE;
+import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_MOLECULE_TO_LABEL_YDELTA;
+import static org.xmlcml.cml.chemdraw.CDXConstants.MIN_MOLECULE_LABEL_FONT_SIZE;
+import static org.xmlcml.cml.chemdraw.CDXConstants.MIN_MOLECULE_TO_LABEL_YDELTA;
+import static org.xmlcml.cml.chemdraw.CDXConstants.TEMP_TEXT;
+import static org.xmlcml.euclid.EuclidConstants.S_EMPTY;
+import static org.xmlcml.euclid.EuclidConstants.S_SPACE;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import nu.xom.Attribute;
+import nu.xom.Element;
+import nu.xom.Nodes;
+import nu.xom.ParentNode;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.xmlcml.cml.base.CMLConstants;
+import org.xmlcml.cml.base.CMLElement;
+import org.xmlcml.cml.base.CMLUtil;
+import org.xmlcml.cml.base.CMLElement.CoordinateType;
+import org.xmlcml.cml.chemdraw.components.CDXColorTable;
+import org.xmlcml.cml.chemdraw.components.CDXFontTable;
+import org.xmlcml.cml.chemdraw.components.CDXList;
+import org.xmlcml.cml.chemdraw.components.CDXML;
+import org.xmlcml.cml.chemdraw.components.CDXObject;
+import org.xmlcml.cml.chemdraw.components.CDXPage;
+import org.xmlcml.cml.chemdraw.components.CDXReactionStep;
+import org.xmlcml.cml.chemdraw.components.CDXText;
+import org.xmlcml.cml.chemdraw.components.CDXUtil;
+import org.xmlcml.cml.element.CMLAtom;
+import org.xmlcml.cml.element.CMLBond;
+import org.xmlcml.cml.element.CMLBondStereo;
+import org.xmlcml.cml.element.CMLCml;
+import org.xmlcml.cml.element.CMLLabel;
+import org.xmlcml.cml.element.CMLMolecule;
+import org.xmlcml.cml.element.CMLMoleculeList;
+import org.xmlcml.cml.element.CMLReaction;
+import org.xmlcml.cml.element.CMLMolecule.HydrogenControl;
+import org.xmlcml.cml.tools.AtomSetTool;
+import org.xmlcml.cml.tools.GeometryTool;
+import org.xmlcml.cml.tools.MoleculeTool;
+import org.xmlcml.euclid.EuclidRuntimeException;
+import org.xmlcml.euclid.Real2;
+import org.xmlcml.euclid.Real2Range;
+import org.xmlcml.euclid.RealRange;
+import org.xmlcml.euclid.Transform2;
+import org.xmlcml.util.CMLUtilNew;
+
+/**
+ * Attempts to convert a CDXML file to CML.
+ * Since CDX has many graphics primitives that CML does not, some
+ * graphics semantics may be lost. Similarly, CDX does not support various
+ * chemical semantics found in CML, so heuristics are used.
+ * 
+ * @author pm286
+ *
+ */
+public class CDXML2CMLProcessor {
+
+	/** class logger; its level is forced to INFO when the class is loaded */
+	final static Logger LOG = Logger.getLogger(CDXML2CMLProcessor.class);
+	static {
+		LOG.setLevel(Level.INFO);
+	}
+	
+    /** average bond length that molecules are rescaled to */
+    final static double BOND_LENGTH = 2.0;
+    /** root of the converted CDX object tree */
+    private CDXObject rootCDXObject;
+	/** the root object cast to a list, once verified */
+	private CDXList cdxList;
+	/** the page being converted */
+	private CDXObject page;
+    /** the resulting cml element; null until a page has been converted */
+    private CMLCml cmlCml = null;
+    
+    /** whether the molecule clean-up steps run */
+    private boolean cleanMolecules = true;
+    // NOTE(review): no visible code in this part of the file reads 'flatten'
+    private boolean flatten = true;
+    /** whether molecules are rescaled to BOND_LENGTH (otherwise only flipped) */
+    private boolean rescale = true;
+    /** whether CDX attributes are removed from the result */
+    private boolean removeCDXAttributes = true;
+    
+    /**
+     * @return the current value of the removeCDXAttributes flag
+     */
+    public boolean isRemoveCDXAttributes() {
+		return removeCDXAttributes;
+	}
+
+	/**
+	 * Sets the removeCDXAttributes flag.
+	 *
+	 * @param removeCDXAttributes the new value of the flag
+	 */
+	public void setRemoveCDXAttributes(boolean removeCDXAttributes) {
+		this.removeCDXAttributes = removeCDXAttributes;
+	}
+
+	/**
+	 * Creates a processor with the default option values.
+	 */
+	public CDXML2CMLProcessor() {
+    	
+    }
+    
+    public void convertParsedXMLToCML(Element cdxml) {
+    	XMLToCDXMLConverter xml2cdxmlConverter = new XMLToCDXMLConverter();
+        rootCDXObject = (CDXObject) xml2cdxmlConverter.convertToCDXObject(cdxml);
+//   		 <cdxList id="null">
+//   		 <CDXML CreationProgram="ChemDraw 7.0.1" Name="Microsoft Word - C:\Chemistry\_PHD\MyThesis\PHD_Thesis_JHarter2002_v22_5.doc" Magnification="666" WindowPosition="24.9749 51.4484" WindowSize="549.4499 370.1294" PrintMargins="28.3464 28.3464 28.3464 28.3464" ShowAtomQuery="true" ShowAtomStereo="false" ShowAtomNumber="false" ShowBondQuery="true" ShowBondStereo="false" LabelLineHeight="0" CaptionLineHeight="1" ChainAngle="7864320" BondLength="12.1992" BoldWidth="1.8141" LineWidth="0.8503" MarginWidth="1.25" HashSpacing="1.75" CaptionJustification="Left" LabelJustification="Auto" BondSpacing="200" BoundingBox="138.9999 60.9799 393.6269 106.9499" LabelFont="3" LabelFace="96" LabelSize="8" LabelColor="3" CaptionFont="3" CaptionFace="1" CaptionSize="6" CaptionColor="3" id="0">
+//   		  <page BoundingBox="0 0 538.507 785.107" WidthPages="1" HeightPages="1" HeaderPosition="35.9999" FooterPosition="35.9999" id="156">
+//   		   <fragment BoundingBox="138.9999 67.0011 214.1269 91.6999" id="19">
+
+        
+ 		try {
+			if (!(rootCDXObject instanceof CDXList)) {
+ 	        	 throw new RuntimeException("expected cdxList as root element");
+ 	        }
+ 	        cdxList = (CDXList) rootCDXObject;
+ 	        cdxList.setChemDrawConverterRecursively(this);
+ 	        cdxml = null;
+ 		 	if (cdxList.getChildElements().size() == 0) {
+ 		 		LOG.warn("cdxList has no children");
+ 		 	}
+ 		 	for (int j = 0; j < cdxList.getChildElements().size(); j++) {
+	 		 	CDXObject obj = (CDXObject) cdxList.getChildElements().get(j);
+	 		 	if (obj instanceof CDXML) {
+	 		 		cdxml = (CDXML) cdxList.getChildElements().get(0);
+	 		 	} else if ("object".equals(obj.getLocalName())) {
+	 		 		LOG.error("*********** Uninterpreted object in cdxList");
+	 		 	 } else {
+	 		 		 throw new RuntimeException("unexpected child of cdxList: "+obj.getLocalName());
+	 		 	 }
+ 	         }
+ 		 	 if (cdxml == null) {
+ 		 		LOG.error("Cannot find CDXML element");
+ 		 	 } else {
+	 		 	 convertCDXML(cdxml);
+ 		 	 }
+ 		} catch (Exception e) {
+ 			e.printStackTrace();
+ 			LOG.error(e);
+ 		}
+	}
+    
+    /**
+     * @return the cml element built by conversion, or null if none has been built
+     */
+    public CMLElement getCML() {
+    	return cmlCml;
+    }
+    
+	/**
+	 * Locates the single page among the children of a CDXML element and,
+	 * if there is one, converts it to CML.
+	 * Colour and font tables are skipped; nested CDXML and "object" children
+	 * are reported and skipped.
+	 *
+	 * @param cdxml the CDXML element
+	 * @throws RuntimeException if there is more than one page or an
+	 *         unexpected child
+	 */
+	private void convertCDXML(Element cdxml) {
+		page = null;
+		int childCount = cdxml.getChildElements().size();
+		for (int i = 0; i < childCount; i++) {
+			CDXObject child = (CDXObject) cdxml.getChildElements().get(i);
+			if (child instanceof CDXColorTable || child instanceof CDXFontTable) {
+				continue;
+			}
+			if (child instanceof CDXPage) {
+				if (page != null) {
+					throw new RuntimeException("Only one page allowed");
+				}
+				// take the page itself; the code used to take child 0, which is
+				// a colour or font table whenever one precedes the page
+				page = child;
+			} else if (child instanceof CDXML) {
+				LOG.error("**************Cannot process nested CDXML");
+			} else if (child.getLocalName().equals("object")) {
+				LOG.error("Unexpected CDXML child 'object'");
+			} else {
+				throw new RuntimeException("Unexpected CDXML child: "+child.getLocalName());
+			}
+		}
+		// a document without a page is silently ignored
+		if (page != null) {
+			tidyToCML();
+		}
+	}
+    
+	/**
+	 * Builds the cml result from the current page and then tidies it:
+	 * labels, hydrogens, reactions, redundant hierarchy, R groups, optional
+	 * removal of CDX attributes, optional molecule clean-up, flip/rescale,
+	 * id normalisation, grouping of multiple molecules and removal of empty
+	 * groups. The order of the steps is significant and unchanged.
+	 */
+	private void tidyToCML() {
+		// result is a <cml> object
+		cmlCml = new CMLCml();
+		cmlCml.addNamespaceDeclaration(CDX_PREFIX, CDX_NAMESPACE);
+		expandFontInfo(page);
+		// main exploration of content
+		page.process2CML(cmlCml);
+		// tidying
+		CDXML2CMLProcessor.addLabelsToMolecules(cmlCml);
+		addHydrogenAtomsToMolecules();
+		this.processReactionsNew();
+		cleanRedundantHierarchy();
+		transformRGroups();
+		addLabelsToAtoms();
+
+		// honour the flag set through setRemoveCDXAttributes(); it used to be
+		// forced to true here, which made the setter ineffective
+		if (removeCDXAttributes) {
+			removeCDXAttributes();
+		}
+		if (cleanMolecules) {
+			flattenGroupingElements();
+			removeAtomsWithChildrenExcludingLabels();
+			removeAtomsWithoutElementTypeOrCoordinates();
+			cleanExternalConnectionPoints();
+		}
+		flipAndRescaleMolecules();
+		if (LOG.isDebugEnabled()) {
+			CMLUtil.debug(cmlCml, "==cmlCML==");
+		}
+		ensureXMLIds(cmlCml);
+		groupMultipleMolecules();
+		removeDeadCDXObjects();
+	}
+    
+	/**
+	 * Detaches every element whose local name is "group" and whose string
+	 * value is empty.
+	 */
+	private void removeDeadCDXObjects() {
+		Nodes emptyGroups = cmlCml.query("//*[local-name()='group' and (.='')]");
+		int index = 0;
+		while (index < emptyGroups.size()) {
+			emptyGroups.get(index).detach();
+			index++;
+		}
+	}
+
+	/**
+	 * When the cml element has more than one molecule child, moves them all
+	 * into a new moleculeList appended to the cml element.
+	 */
+	private void groupMultipleMolecules() {
+		Nodes childMolecules = cmlCml.query("cml:molecule", CMLConstants.CML_XPATH);
+		if (childMolecules.size() <= 1) {
+			// nothing to group
+			return;
+		}
+		CMLMoleculeList wrapper = new CMLMoleculeList();
+		for (int index = 0; index < childMolecules.size(); index++) {
+			CMLMolecule current = (CMLMolecule) childMolecules.get(index);
+			current.detach();
+			wrapper.addMolecule(current);
+		}
+		cmlCml.appendChild(wrapper);
+	}
+
+	/**
+	 * Passes the value of every id attribute through
+	 * {@code CDXUtil.ensureXMLID} and stores the result back.
+	 *
+	 * @param cmlCml element whose id attributes are rewritten
+	 */
+	private void ensureXMLIds(CMLCml cmlCml) {
+		Nodes idAttributes = cmlCml.query("//@id");
+		for (int index = 0; index < idAttributes.size(); index++) {
+			Attribute id = (Attribute) idAttributes.get(index);
+			id.setValue(CDXUtil.ensureXMLID(id.getValue()));
+		}
+	}
+
+	/**
+	 * Removes each grouping element after handing its children to
+	 * {@code CMLUtilNew.transferChildrenToParent} (presumably moving them up
+	 * to the element's parent - confirm against that helper).
+	 *
+	 * @param nodes the grouping elements to flatten
+	 */
+	private void flattenGroupingElement(Nodes nodes) {
+		int index = 0;
+		while (index < nodes.size()) {
+			CMLElement grouping = (CMLElement) nodes.get(index);
+			CMLUtilNew.transferChildrenToParent(grouping);
+			grouping.detach();
+			index++;
+		}
+	}
+
+     
+     /**
+      * Removes every child element of an atom except label children,
+      * because child elements screw up some CML viewers.
+      * Does nothing when there is no cml result.
+      */
+     private void removeAtomsWithChildrenExcludingLabels() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 // the query selects the CHILDREN of atoms, so they must not be cast to
+    	 // CMLAtom (that cast failed for every child that was not itself an atom)
+    	 Nodes atomChildren = cmlCml.query("//cml:atom/*[not(local-name()='label')]", CML_XPATH);
+    	 for (int i = 0; i < atomChildren.size(); i++) {
+    		 atomChildren.get(i).detach();
+    	 }
+     }
+     
+     /**
+      * Detaches every atom that lacks an elementType, x2 or y2 attribute.
+      * Does nothing when there is no cml result.
+      */
+     private void removeAtomsWithoutElementTypeOrCoordinates() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 Nodes incompleteAtoms = cmlCml.query(
+    			 "//cml:atom[not(@elementType) or not(@x2) or not(@y2)]", CML_XPATH);
+    	 int index = 0;
+    	 while (index < incompleteAtoms.size()) {
+    		 incompleteAtoms.get(index).detach();
+    		 index++;
+    	 }
+     }
+     
+     /**
+      * Applies {@code scale} to every molecule in the cml result.
+      * Does nothing when there is no cml result.
+      */
+     private void flipAndRescaleMolecules() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 Nodes allMolecules = cmlCml.query("//cml:molecule", CML_XPATH);
+    	 for (int index = 0; index < allMolecules.size(); index++) {
+    		 CMLMolecule current = (CMLMolecule) allMolecules.get(index);
+    		 scale(current);
+    	 }
+     }
+     
+     /**
+      * Flips the y-coordinates of a molecule and, when rescaling is enabled,
+      * scales it so that its average 2D bond length becomes BOND_LENGTH.
+      * Only molecules that contain no nested molecule are transformed.
+      * A molecule whose average bond length cannot be obtained is left untouched.
+      *
+      * @param molecule the molecule to transform
+      */
+     private void scale(CMLMolecule molecule) {
+    	 // skip containers: only molecules without descendant molecules are treated
+    	 if (molecule.query(".//cml:molecule", CML_XPATH).size() != 0) {
+    		 return;
+    	 }
+    	 MoleculeTool moleculeTool = MoleculeTool.getOrCreateTool(molecule);
+    	 try {
+    		 double averageBondLength = moleculeTool.getAverageBondLength(CoordinateType.TWOD);
+    		 double scale = 1.0;
+    		 if (rescale) {
+    			 double candidate = BOND_LENGTH / averageBondLength;
+    			 // a zero or NaN bond length would give an infinite or NaN factor
+    			 // and destroy every coordinate; in that case only flip
+    			 if (Double.isNaN(candidate) || Double.isInfinite(candidate)) {
+    				 LOG.debug("Cannot rescale molecule; average bond length "+averageBondLength);
+    			 } else {
+    				 scale = candidate;
+    			 }
+    		 }
+    		 // the negative y scale flips the y-coordinates
+    		 Transform2 transform = new Transform2(
+    				 new double[]{
+    				 scale, 0.0,   0.0,
+    				 0.0,  -scale, 0.0,
+    				 0.0,   0.0,   1.0
+    				 }
+    		 );
+    		 moleculeTool.transform(transform);
+    	 } catch (RuntimeException cmle) {
+    		 // typically no coordinates: leave the molecule as it is
+    		 LOG.debug("Molecule not scaled", cmle);
+    	 }
+     }
+
+     /**
+      * Post-processes every reaction with a ChemDrawReactionConverter and
+      * dumps the result when debug logging is enabled.
+      * Older variant of {@code processReactionsNew}.
+      */
+     private void processReactions() {
+    	 Nodes reactions = cmlCml.query("//cml:reaction", CML_XPATH);
+    	 int index = 0;
+    	 while (index < reactions.size()) {
+    		 CMLReaction current = (CMLReaction) reactions.get(index);
+    		 new ChemDrawReactionConverter(current, cmlCml).processAfterParsing();
+    		 index++;
+    	 }
+    	 if (LOG.isDebugEnabled()) {
+    		 CMLUtil.debug(cmlCml, "cmlCml");
+    	 }
+     }
+     
+     /**
+      * Passes every reaction in the cml result to
+      * {@code CDXReactionStep.processReactionStep}.
+      */
+     private void processReactionsNew() {
+    	 Nodes reactions = cmlCml.query("//cml:reaction", CML_XPATH);
+    	 int index = 0;
+    	 while (index < reactions.size()) {
+    		 CDXReactionStep.processReactionStep((CMLReaction) reactions.get(index));
+    		 index++;
+    	 }
+     }
+     
+     /**
+      * Calls {@code addHydrogens} on every molecule in the cml result.
+      */
+     private void addHydrogenAtomsToMolecules() {
+    	 Nodes allMolecules = cmlCml.query("//cml:molecule", CML_XPATH);
+    	 int index = 0;
+    	 while (index < allMolecules.size()) {
+    		 addHydrogens((CMLMolecule) allMolecules.get(index));
+    		 index++;
+    	 }
+     }
+
+    /**
+     * Adjusts hydrogen counts of a molecule to valency and then adds
+     * calculated 2D coordinates for its hydrogens.
+     *
+     * @param molecule the molecule to update
+     */
+    private void addHydrogens(CMLMolecule molecule) {
+		// step 1: hydrogen counts
+		MoleculeTool.getOrCreateTool(molecule)
+			.adjustHydrogenCountsToValency(HydrogenControl.REPLACE_HYDROGEN_COUNT);
+		// step 2: coordinates for the hydrogens
+		new GeometryTool(molecule).addCalculatedCoordinatesForHydrogens(
+			CoordinateType.TWOD, HydrogenControl.USE_EXPLICIT_HYDROGENS);
+	}
+
+	/**
+	 * "Carbons" with label children are actually not C; guess what they are.
+	 * An atom typed C whose first label is anything other than "C", "C-" or
+	 * "C+" is retyped as "R"; when the label is exactly "R" the label is
+	 * removed as well.
+	 */
+     private void transformRGroups() {
+    	 Nodes labelledCarbons = cmlCml.query("//cml:atom[@elementType='C' and cml:label]", CML_XPATH);
+    	 for (int index = 0; index < labelledCarbons.size(); index++) {
+    		 CMLAtom atom = (CMLAtom) labelledCarbons.get(index);
+    		 CMLLabel label = atom.getLabelElements().get(0);
+    		 String labelS = label.getCMLValue();
+    		 LOG.debug("LAB... "+labelS);
+    		 boolean genuineCarbon =
+    			 labelS.equals("C") || labelS.equals("C-") || labelS.equals("C+");
+    		 if (genuineCarbon) {
+    			 continue;
+    		 }
+    		 atom.setElementType("R");
+    		 if (labelS.equals("R")) {
+    			 // the label carries no further information
+    			 label.detach();
+    		 }
+    	 }
+	 }
+     
+     /**
+      * Moves each top-level label whose CDX font size is at most
+      * MAX_ATOM_LABEL_FONT_SIZE onto the atom nearest to it.
+      * Labels without a font size, or without a nearest atom, stay in place.
+      */
+     private void addLabelsToAtoms() {
+    	 Nodes topLabels = cmlCml.query("/cml:cml/cml:label", CML_XPATH);
+    	 for (int index = 0; index < topLabels.size(); index++) {
+    		 CMLLabel label = (CMLLabel) topLabels.get(index);
+    		 String sizeText = label.getAttributeValue(ATT_FONTSIZE, CDX_NAMESPACE);
+    		 if (sizeText == null) {
+    			 continue;
+    		 }
+    		 if (Integer.parseInt(sizeText) > MAX_ATOM_LABEL_FONT_SIZE) {
+    			 continue;
+    		 }
+    		 CMLAtom target = getNearestAtom(label);
+    		 if (target == null) {
+    			 continue;
+    		 }
+    		 label.detach();
+    		 target.addLabel(label);
+    	 }
+	 }
+
+     /**
+      * Finds the atom, over all molecules, that lies closest to the point
+      * stored in the label's CDX "p" attribute.
+      *
+      * @param label the label to place
+      * @return the closest atom found; null if the label has no point or no
+      *         atom could be found. If an error interrupts the search, the
+      *         best atom found so far is returned.
+      */
+     private CMLAtom getNearestAtom(CMLLabel label) {
+    	 CMLAtom closestAtom = null;
+    	 String p = label.getAttributeValue(ATT_POINT, CDX_NAMESPACE);
+    	 if (p == null) {
+    		 return closestAtom;
+    	 }
+    	 try {
+    		 double[] dd = org.xmlcml.euclid.Util.splitToDoubleArray(p);
+    		 Real2 point = new Real2(dd);
+    		 Nodes moleculeNodes = cmlCml.query("//cml:molecule", CML_XPATH);
+    		 double closestDist = Double.MAX_VALUE;
+    		 for (int i = 0; i < moleculeNodes.size(); i++) {
+    			 CMLMolecule molecule = (CMLMolecule) moleculeNodes.get(i);
+    			 CMLAtom atom = AtomSetTool.getOrCreateTool(molecule).getNearestAtom(point);
+    			 if (atom == null) {
+    				 continue;
+    			 }
+    			 double dist = atom.getXY2().getDistance(point);
+    			 if (dist < closestDist) {
+    				 closestDist = dist;
+    				 closestAtom = atom;
+    			 }
+    		 }
+    	 } catch (Exception e) {
+    		 // best effort, as before, but leave a trace instead of hiding the problem
+    		 LOG.debug("Cannot find nearest atom for label at '"+p+"'", e);
+    	 }
+    	 return closestAtom;
+	 }
+     
+     /**
+      * Simplifies the container structure of the result: flattens
+      * moleculeLists with a single molecule, removes empty moleculeLists,
+      * flattens a lone list, removes empty lists, and finally gathers
+      * multiple top-level molecules under a new moleculeList.
+      *
+      * @throws RuntimeException if there is not exactly one top-level cml element
+      */
+     private void cleanRedundantHierarchy() {
+    	 Nodes found = cmlCml.query("/cml:cml", CML_XPATH);
+    	 if (found.size() != 1) {
+    		 throw new RuntimeException("need exactly one toplevel cml");
+    	 }
+    	 CMLCml top = (CMLCml) found.get(0);
+
+    	 // moleculeList with only 1 molecule: flatten
+    	 flattenGroupingElement(top.query(
+    			 "//cml:moleculeList[count(cml:molecule)=1]", CML_XPATH));
+
+    	 // moleculeList without molecules: remove
+    	 Nodes emptyMoleculeLists = top.query(
+    			 "//cml:moleculeList[count(cml:molecule)=0]", CML_XPATH);
+    	 for (int index = 0; index < emptyMoleculeLists.size(); index++) {
+    		 emptyMoleculeLists.get(index).detach();
+    	 }
+
+    	 // flatten single top-level list (/cml/list)
+    	 // NOTE(review): this path is relative to the top cml element, so it
+    	 // looks for a cml CHILD of it; confirm that is what is intended
+    	 flattenGroupingElement(top.query(
+    			 "cml:cml[count(cml:list)=1 and count(*)=1]/cml:list", CML_XPATH));
+
+    	 // lists without children: remove
+    	 Nodes emptyLists = top.query(
+    			 "//cml:list[count(*)=0]", CML_XPATH);
+    	 for (int index = 0; index < emptyLists.size(); index++) {
+    		 emptyLists.get(index).detach();
+    	 }
+
+    	 // more than one top molecule: put them under a moleculeList
+    	 Nodes topMolecules = top.query("./cml:molecule", CML_XPATH);
+    	 if (topMolecules.size() <= 1) {
+    		 return;
+    	 }
+    	 CMLMoleculeList moleculeList = new CMLMoleculeList();
+    	 top.appendChild(moleculeList);
+    	 for (int index = 0; index < topMolecules.size(); index++) {
+    		 topMolecules.get(index).detach();
+    		 moleculeList.addMolecule((CMLMolecule) topMolecules.get(index));
+    	 }
+     }
+	
+     /** removes ChemDraw pseudo-atoms from the CML.
+      * atoms whose NodeType attribute is "ExternalConnectionPoint" are detached.
+      * each atom whose NodeType is "Fragment" is deleted together with the
+      * first bond that references it, and a new bond with the same order (and
+      * bondStereo, if any) is created between that bond's other atom and the
+      * atom that follows the fragment in its parent.
+      * does nothing when cmlCml is null or contains no molecule.
+      */
+     private void cleanExternalConnectionPoints() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 Nodes molecules = cmlCml.query(
+    			 "//cml:molecule", CML_XPATH);
+    	 if (molecules.size() == 0) {
+    		 return;
+    	 }
+    	 // NOTE(review): bonds are only looked up in the first molecule found,
+    	 // although fragment atoms are collected from the whole tree - confirm
+    	 CMLMolecule molecule = (CMLMolecule) molecules.get(0);
+    	 Nodes atoms = cmlCml.query(
+    			 "//cml:atom[@*[local-name()='NodeType' and .='ExternalConnectionPoint']]", CML_XPATH);
+    	 for (int i = 0; i < atoms.size(); i++) {
+    		 atoms.get(i).detach();
+    	 }
+/* example of a fragment atom followed by its neighbouring atom:
+	   <atom id="a558" elementType="C" cdx:NodeType="Fragment" 
+	       x2="25.035752216501713" y2="26.953796780405963" 
+	       xmlns:cdx="http://www.xml-cml/namespaces/cdx"/>
+	   <atom id="a560" elementType="N" hydrogenCount="1" 
+	     x2="25.075671620016394" y2="27.255753645645697"/>
+*/
+    	 atoms = cmlCml.query(
+    			 "//cml:atom[@*[local-name()='NodeType' and .='Fragment']]", CML_XPATH);
+    	 for (int i = 0; i < atoms.size(); i++) {
+    		 // fragment id
+    		 CMLAtom fragment = (CMLAtom) atoms.get(i);
+    		 String fragmentId = fragment.getAttributeValue(CMLXSD_ID);
+    		 // following sibling; checked explicitly instead of relying on an
+    		 // exception from getChild (or from the cast) to detect its absence
+    		 ParentNode atomArray = fragment.getParent();
+    		 int iatom = atomArray.indexOf(fragment);
+    		 int inext = iatom + 1;
+    		 if (inext >= atomArray.getChildCount() ||
+    				 !(atomArray.getChild(inext) instanceof CMLAtom)) {
+    			 LOG.error("Cannot find neighboring atom "+iatom);
+    			 continue;
+    		 }
+    		 CMLAtom nextAtom = (CMLAtom) atomArray.getChild(inext);
+    		 List<CMLBond> bonds = molecule.getBonds();
+    		 for (CMLBond bond : bonds) {
+    			 String[] atomRefs2 = bond.getAtomRefs2();
+    			 // index (0 or 1) of the non-fragment atom in atomRefs2, -1 if
+    			 // the bond does not involve the fragment
+    			 int otherAtomNo = -1;
+    			 if (atomRefs2[0].equals(fragmentId)) {
+    				 otherAtomNo = 1;
+    			 } else if (atomRefs2[1].equals(fragmentId)) {
+    				 otherAtomNo = 0;
+    			 }
+    			 if (otherAtomNo >= 0) {
+    				 CMLAtom rootAtom = bond.getOtherAtom(fragment);
+    				 String order = bond.getOrder();
+    				 CMLBondStereo bondStereo = bond.getBondStereo();
+    				 molecule.deleteBond(bond);
+    				 molecule.deleteAtom(fragment);
+    				 // keep the surviving atom in the position it had in the old bond
+    				 CMLAtom atom0 = rootAtom;
+    				 CMLAtom atom1 = nextAtom;
+    				 if (otherAtomNo == 1) {
+    					 atom1 = rootAtom;
+    					 atom0 = nextAtom;
+    				 }
+    				 CMLBond newBond = new CMLBond(atom0, atom1);
+    				 newBond.setOrder(order);
+    				 if (bondStereo != null) {
+    					 newBond.addBondStereo(bondStereo);
+    				 }
+    				 molecule.addBond(newBond);
+    				 // the bond list was modified; stop iterating over it
+    				 break;
+    			 }
+    		 }
+    	 }
+     }
+     
+ 	 /** detaches a fixed set of un-namespaced attributes (B, BondCircularOrdering,
+ 	  * BS, Display, Display2, DoublePosition, E, Order, Z) from every element
+ 	  * below cmlCml. does nothing when cmlCml is null.
+ 	  */
+ 	 private void removeCDXAttributes() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 String attributeXPath =
+    			 "//*/@B | " +
+    			 "//*/@BondCircularOrdering | " +
+    			 "//*/@BS | " +
+    			 "//*/@Display | " +
+    			 "//*/@Display2 | " +
+    			 "//*/@DoublePosition | " +
+    			 "//*/@E | " +
+    			 "//*/@Order | " +
+    			 "//*/@Z";
+    	 Nodes attributes = cmlCml.query(attributeXPath, CML_XPATH);
+    	 int count = attributes.size();
+    	 for (int j = 0; j < count; j++) {
+    		 attributes.get(j).detach();
+    	 }
+ 	 }
+     
+     /** passes every moleculeList, and then every list, found below cmlCml to
+      * flattenGroupingElement (child elements screw up some CML viewers).
+      * does nothing when cmlCml is null.
+      */
+     private void flattenGroupingElements() {
+    	 if (cmlCml == null) {
+    		 return;
+    	 }
+    	 Nodes moleculeLists = cmlCml.query("//cml:moleculeList", CML_XPATH);
+    	 flattenGroupingElement(moleculeLists);
+    	 // lists are queried only after the moleculeLists have been processed
+    	 Nodes lists = cmlCml.query("//cml:list", CML_XPATH);
+    	 flattenGroupingElement(lists);
+     }
+
+     
+     /** asks every descendant t element of the page that carries the
+      * TEMP_TEXT attribute to add its font information.
+      * @param page object whose descendants are searched
+      */
+     private void expandFontInfo(CDXObject page) {
+    	 String textXPath = ".//t[@"+TEMP_TEXT+"]";
+    	 Nodes textNodes = page.query(textXPath);
+    	 for (int j = 0; j < textNodes.size(); j++) {
+    		 CDXText text = (CDXText) textNodes.get(j);
+    		 text.addFontInfoFromTempText();
+    	 }
+     }
+
+ 	/** offers the labels of an element to the molecules of its first moleculeList.
+ 	 * only the first cml:moleculeList child of scopeElement is considered; for
+ 	 * each of its molecules whose ref is null or empty, the cml:label children
+ 	 * of scopeElement are passed to moveLabelsToMolecules.
+ 	 * @param scopeElement element whose direct children are searched
+ 	 */
+ 	public static void addLabelsToMolecules(CMLElement scopeElement) {
+ 		// labels are normally "underneath" molecules. Since coordinates run 
+ 		// vertically "downwards" the coord look like:
+ 		// --------------------------------> +X
+ 		// |
+ 		// |    |-----------------|
+ 		// |    |                 |
+ 		// |    |    molecule     |
+ 		// |    |                 |
+ 		// |    |-----------------|
+ 		// |
+ 		// |        |-------|
+ 		// |        | label |
+ 		// |        |-------|
+ 		// |        
+ 		// V 
+ 		// +Y
+ 		
+	//		  <label value="172" BoundingBox="351.2457 97.8999 366.2457 106.9499" id="69"/>
+		Nodes moleculeListNodes = scopeElement.query("cml:moleculeList", CML_XPATH);
+		if (moleculeListNodes.size() > 0) {
+			CMLMoleculeList moleculeList = (CMLMoleculeList) moleculeListNodes.get(0);
+			Nodes labelNodes = scopeElement.query("cml:label", CML_XPATH);
+			for (CMLMolecule molecule : moleculeList.getMoleculeElements()) {
+				String ref = molecule.getRef();
+				// compare content, not identity: an empty ref read from a
+				// document need not be the same String object as S_EMPTY
+				if (ref == null || S_EMPTY.equals(ref)) {
+					CDXML2CMLProcessor.moveLabelsToMolecules(scopeElement, labelNodes, molecule);
+				}
+			}
+		}
+	}
+
+	public static void createMoleculeList(CMLElement element) {
+		Nodes molecules = element.query("cml:molecule", CMLConstants.CML_XPATH);
+		CMLMoleculeList moleculeList = new CMLMoleculeList();
+		element.appendChild(moleculeList);
+		for (int i = 0; i < molecules.size(); i++) {
+			CMLMolecule molecule = (CMLMolecule) molecules.get(i);
+			molecule.detach();
+			moleculeList.addMolecule(molecule);
+		}
+	}
+
+	/** moves labels lying within a vertical window below a molecule into it.
+	 * molecules with role "cdx:fragment" are ignored, as are molecules without
+	 * a bounding box. candidate labels come from getVerticalLabels (window
+	 * 100 / -100, font size limits from the MIN/MAX_MOLECULE_LABEL_FONT_SIZE
+	 * constants) and are ordered by sortLabelsByY; a candidate is attached when
+	 * the gap between the top of its box and the bottom of the molecule's box
+	 * is not outside MIN/MAX_MOLECULE_TO_LABEL_YDELTA.
+	 * @param element passed through to getVerticalLabels
+	 * @param labelNodes candidate label nodes
+	 * @param molecule molecule that receives the labels
+	 * @throws EuclidRuntimeException
+	 * @throws NumberFormatException
+	 */
+	static void moveLabelsToMolecules(CMLElement element, Nodes labelNodes, CMLMolecule molecule) {
+		// only take original molecules
+		if ("cdx:fragment".equals(molecule.getRole())) {
+			return;
+		}
+		Real2Range moleculeBox = CDXML2CMLProcessor.getNormalizedBoundingBox(molecule);
+		if (moleculeBox == null) {
+			LOG.debug("Null bounding box");
+			return;
+		}
+		double moleculeBottom = moleculeBox.getYRange().getMax();
+		List<CMLLabel> candidates = CDXML2CMLProcessor.getVerticalLabels(
+			element, labelNodes, moleculeBox, 100, -100,
+			MIN_MOLECULE_LABEL_FONT_SIZE,
+			MAX_MOLECULE_LABEL_FONT_SIZE
+		);
+		candidates = CDXML2CMLProcessor.sortLabelsByY(candidates);
+		for (CMLLabel candidate : candidates) {
+			Real2Range candidateBox = CDXML2CMLProcessor.getNormalizedBoundingBox(candidate);
+			double gap = candidateBox.getYRange().getMin() - moleculeBottom;
+			boolean outsideWindow =
+				gap < MIN_MOLECULE_TO_LABEL_YDELTA ||
+				gap > MAX_MOLECULE_TO_LABEL_YDELTA;
+			if (!outsideWindow) {
+				candidate.detach();
+				molecule.addLabel(candidate);
+			}
+		}
+	}
+
+	/** orders labels by descending yDelta attribute.
+	 * repeatedly removes the label with the largest ATT_YDELTA value (in the
+	 * CDX namespace) from the input and appends it to the result; labels with
+	 * equal values keep their input order. labels that cannot be ranked (no
+	 * yDelta attribute, or a value that never exceeds -Double.MAX_VALUE) are
+	 * reported through debug() and appended last in their input order, so the
+	 * method always terminates and never puts null into the result.
+	 * the input list is emptied as a side effect.
+	 * @param labels labels to sort; modified (emptied) by this call
+	 * @return list of labels, largest yDelta first
+	 * @throws NumberFormatException if a yDelta attribute is not a number
+	 */
+	protected static List<CMLLabel> sortLabelsByY(List<CMLLabel> labels) throws NumberFormatException {
+		List<CMLLabel> sorted = new ArrayList<CMLLabel>();
+		while (labels.size() > 0) {
+			double largest = -Double.MAX_VALUE;
+			CMLLabel top = null;
+			for (CMLLabel label : labels) {
+				String ys = label.getAttributeValue(ATT_YDELTA, CDX_NAMESPACE);
+				if (ys == null) {
+					continue;
+				}
+				double y = Double.parseDouble(ys);
+				if (y > largest) {
+					largest = y;
+					top = label;
+				}
+			}
+			if (top == null) {
+				// nothing left that can be ranked; without this exit the loop
+				// would spin forever adding null to the result
+				for (CMLLabel label : labels) {
+					label.debug();
+				}
+				sorted.addAll(labels);
+				labels.clear();
+				break;
+			}
+			sorted.add(top);
+			labels.remove(top);
+		}
+		return sorted;
+	}
+
+	/** selects labels that overlap a bounding box horizontally and lie on,
+	 * just above or just below it, tagging each with a yDelta attribute.
+	 * a label is skipped when its font size is outside [minFont, maxFont],
+	 * when it has no bounding box (a warning is logged) or when its X range
+	 * does not intersect the target's. with
+	 * yAbove = target Y min - label Y max and
+	 * yBelow = target Y max - label Y min,
+	 * a label is accepted with yDelta = yAbove when the Y ranges intersect or
+	 * 0 &lt; yAbove &lt; topYDelta, and otherwise with yDelta = yBelow when
+	 * bottomYDelta &lt; yBelow &lt; 0.
+	 * @param parent not used by this method
+	 * @param labelNodes candidate nodes; each must be a CMLLabel
+	 * @param boundingBox target box; must not be null
+	 * @param topYDelta exclusive upper limit for yAbove
+	 * @param bottomYDelta exclusive lower limit for yBelow
+	 * @param minFont smallest accepted font size
+	 * @param maxFont largest accepted font size
+	 * @return accepted labels in the order of labelNodes
+	 * @throws RuntimeException if boundingBox is null or a label has no font size
+	 */
+	protected static List<CMLLabel> getVerticalLabels(
+		Element parent, Nodes labelNodes, Real2Range boundingBox, 
+		double topYDelta, double bottomYDelta, int minFont, int maxFont) {
+		List<CMLLabel> labelList = new ArrayList<CMLLabel>();
+		if (boundingBox == null) {
+			throw new RuntimeException("Null bounding box");
+		}
+		RealRange targetXRange = boundingBox.getXRange();
+		RealRange targetYRange = boundingBox.getYRange();
+		for (int i = 0; i < labelNodes.size(); i++) {
+			CMLLabel label = (CMLLabel) labelNodes.get(i);
+			int fontSize = getFontSize(label);
+			if (fontSize < minFont || fontSize > maxFont) {
+				continue;
+			}
+			Real2Range labelBoundingBox = CDXML2CMLProcessor.getNormalizedBoundingBox(label);
+			if (labelBoundingBox == null) {
+				LOG.warn("NULL LABEL "+label.getCMLValue());
+				continue;
+			}
+			RealRange labelXRange = labelBoundingBox.getXRange();
+			RealRange labelYRange = labelBoundingBox.getYRange();
+			RealRange commonXRange = labelXRange.intersectionWith(targetXRange);
+			RealRange commonYRange = labelYRange.intersectionWith(targetYRange);
+			if (commonXRange == null) {
+				// no horizontal overlap: not a label for this box
+				continue;
+			}
+			double yAbove = targetYRange.getMin() - labelYRange.getMax();
+			double yBelow = targetYRange.getMax() - labelYRange.getMin();
+			if (commonYRange != null || 
+					(yAbove > 0 && yAbove < topYDelta)) {
+				// NOTE(review): a vertically overlapping label is tagged with
+				// yAbove as well - confirm that this is intended
+				label.addAttribute(new Attribute(CDX_YDELTA, CDX_NAMESPACE, ""+yAbove));
+				labelList.add(label);
+			} else if (yBelow < 0 && yBelow > bottomYDelta) {
+				// the overlap case was already handled by the branch above
+				label.addAttribute(new Attribute(CDX_YDELTA, CDX_NAMESPACE, ""+yBelow));
+				labelList.add(label);
+			}
+		}
+		return labelList;
+	}
+	
+	/** reads the "size" attribute (CDX namespace) of a label as an integer.
+	 * @param label label to inspect
+	 * @return the font size
+	 * @throws RuntimeException if the label has no size attribute
+	 */
+	private static int getFontSize(CMLLabel label) {
+		String sizeValue = label.getAttributeValue("size", CDX_NAMESPACE);
+		if (sizeValue == null) {
+			// dump the offending label before giving up
+			label.debug("FONT");
+			throw new RuntimeException("all text should have font size");
+		}
+		return Integer.parseInt(sizeValue);
+	}
+	
+
+	/** get bounding box of an element.
+	 * the ATT_BOUNDING_BOX attribute (CDX namespace) is split on spaces into
+	 * numbers; values 0 and 2 form the X range and values 1 and 3 the Y range.
+	 * both ranges are created with the normalize flag set so that
+	 * xmin < xmax and ymin < ymax.
+	 * @param element element carrying the bounding box attribute
+	 * @return the bounding box, or null if the element has no such attribute
+	 * @throws EuclidRuntimeException
+	 */
+	static Real2Range getNormalizedBoundingBox(CMLElement element) throws EuclidRuntimeException {
+		String boxValue = element.getAttributeValue(ATT_BOUNDING_BOX, CDX_NAMESPACE);
+		if (boxValue == null) {
+			return null;
+		}
+		double[] corners = org.xmlcml.euclid.Util.splitToDoubleArray(boxValue, S_SPACE);
+		RealRange xRange = new RealRange(corners[0], corners[2], true);
+		RealRange yRange = new RealRange(corners[1], corners[3], true);
+		return new Real2Range(xRange, yRange);
+	}
+
+  	/** reports the flatten flag.
+  	 * @return current value of the flatten flag
+  	 */
+  	public boolean isFlatten() {
+  		return this.flatten;
+  	}
+
+  	/** stores the flatten flag.
+  	 * @param flatten new value of the flatten flag
+  	 */
+  	public void setFlatten(boolean flatten) {
+  		this.flatten = flatten;
+  	}
+
+	/** reports the rescale flag.
+	 * @return current value of the rescale flag
+	 */
+	public boolean isRescale() {
+		return this.rescale;
+	}
+
+	/** stores the rescale flag.
+	 * @param rescale new value of the rescale flag
+	 */
+	public void setRescale(boolean rescale) {
+		this.rescale = rescale;
+	}
+
+ 	/** reports the cleanMolecules flag.
+ 	 * @return current value of the cleanMolecules flag
+ 	 */
+ 	public boolean isCleanMolecules() {
+ 		return this.cleanMolecules;
+ 	}
+
+ 	/** stores the cleanMolecules flag.
+ 	 * @param cleanMolecules new value of the cleanMolecules flag
+ 	 */
+ 	public void setCleanMolecules(boolean cleanMolecules) {
+ 		this.cleanMolecules = cleanMolecules;
+ 	}
+ 	
+
+}

File src/main/java/org/xmlcml/cml/chemdraw/CDXRawToCMLCreator.java

+package org.xmlcml.cml.chemdraw;
+
+import org.apache.log4j.Logger;
+import org.xmlcml.cml.element.CMLMolecule;
+
+/** an object to hold CML state in the CDX context.
+*/
+public class CDXRawToCMLCreator implements CDXConstants {
+
+    // log4j logger for this class
+    static Logger LOG = Logger.getLogger(CDXRawToCMLCreator.class);
+
+    // molecule held by this creator; getMolecule() creates one on demand when it is null
+    private CMLMolecule molecule;
+    // returned by getAddCDXAttributes(); its setter is commented out below, so within
+    // the visible part of this class it keeps the default value false
+    private boolean addCDXAttributes;
+    // package-visible scale value, initially 1.0; its accessors are commented out below
+    double scale2 = 1.0;
+
+    /** creates an empty creator; only emits a debug message.
+     */
+    public CDXRawToCMLCreator() {
+        LOG.debug("NEW CDXRAW2CML ");
+    }
+
+//	/**
+//	 * @param cmlNode
+//	 */
+//	private void setCMLParent(Node cmlNode) {
+//        this.cmlParent = cmlNode;
+//    }
+//
+//	private Node getCMLParent() {
+//        return this.cmlParent;
+//    }
+
+	/** replaces the molecule held by this creator.
+	 * @param molecule molecule to hold; may be null
+	 */
+	void setMolecule(CMLMolecule molecule) {
+		this.molecule = molecule;
+	}
+
+	/** returns the molecule held by this creator, creating (and logging) a new
+	 * empty one first if none has been set.
+	 * @return the molecule, never null
+	 */
+	CMLMolecule getMolecule() {
+		if (this.molecule != null) {
+			return this.molecule;
+		}
+		this.molecule = new CMLMolecule();
+		LOG.debug("created new molecule: "+this.molecule.hashCode());
+		return this.molecule;
+	}
+
+//	private void setFlatten(boolean f) {
+//        this.flatten = f;
+//    }
+
+//	private boolean getFlatten() {
+//        return this.flatten;
+//    }
+//
+//	private void setReorient(boolean f) {
+//        this.reorient = f;
+//    }
+//
+//	private boolean getReorient() {
+//        return this.reorient;
+//    }
+
+//	private void setScale2(double s) {
+//        this.scale2 = s;
+//    }
+//
+//	private double getScale2() {
+//        return this.scale2;
+//    }
+
+//	private void setAddCDXAttributes(boolean f) {
+//        this.addCDXAttributes = f;
+//    }
+
+	/** reports the addCDXAttributes flag.
+	 * @return current value of the addCDXAttributes flag
+	 */
+	boolean getAddCDXAttributes() {
+		return addCDXAttributes;
+	}
+
+/* process all molecules which are children of atoms.
+ * convert the meaningful atoms (i.e. not stubs and links)
+ * to normal toplevel atoms.
+
+Typical CDX with fragment groups
+<?xml version="1.0" encoding="UTF-8"?>
+  <molecule>
+      <atomArray>
+        <atom id="a1113" elementType="C" x2="17.0622" y2="-38.0114"/>
+        <atom id="a1114" elementType="C" x2="18.1252" y2="-39.8526"/>
+        <atom id="a1115" elementType="O" hydrogenCount="0" x2="20.1819" y2="-39.3138">
+          <scalar dictRef="cdx:text">O</scalar>
+        </atom>
+        <atom id="a1094" elementType="C" x2="22.226" y2="-39.8526"/>
+        <atom id="a1095" elementType="C" x2="21.1631" y2="-38.0114"/>
+        <atom id="a115" elementType="O" hydrogenCount="0" x2="19.1063" y2="-38.5501">
+          <scalar dictRef="cdx:text">O</scalar>
+        </atom>
+// this is a dummy and will be replaced by its first grandchild atom (a1096)
+        <atom id="a116" elementType="C" x2="17.0622" y2="-35.8855">
+          <scalar dictRef="cdx:text">OMe</scalar>
+          <molecule>
+            <atomArray>
+// the first atom is the link atom; it replaces its grandparent (a116)
+              <atom id="a1096" elementType="O" hydrogenCount="0" x2="18.1368" y2="-36.4999">
+                <scalar dictRef="cdx:text">O</scalar>
+              </atom>
+              <atom id="a122" elementType="C" x2="20.2902" y2="-37.7432"/>
+// the LAST atom seems to be the dummy
+              <atom id="a123" elementType="C" x2="-10.9059" y2="-21.4857"/>
+            </atomArray>
+            <bondArray>
+// the FIRST bond seems to contain the dummy atom as the FIRST atom
+              <bond atomRefs2="a123 a1096" id="a123_a1096" order="1"/>
+              <bond atomRefs2="a1096 a122" id="a1096_a122" order="1"/>
+            </bondArray>
+          </molecule>
+        </atom>
+        <atom id="a126" elementType="C" x2="15.2211" y2="-39.0744"/>
+        <atom id="a127" elementType="C" x2="18.1252" y2="-41.9786">
+          <scalar dictRef="cdx:text">OMe</scalar>
+          <molecule>
+            <atomArray>
+              <atom id="a129" elementType="O" hydrogenCount="0" x2="19.1998" y2="-40.1106">
+                <scalar dictRef="cdx:text">O</scalar>
+              </atom>
+              <atom id="a130" elementType="C" x2="21.3532" y2="-41.3538"/>
+              <atom id="a131" elementType="C" x2="-11.0862" y2="-20.4523"/>
+            </atomArray>
+            <bondArray>
+              <bond atomRefs2="a131 a129" id="a131_a129" order="1"/>
+              <bond atomRefs2="a129 a130" id="a129_a130" order="1"/>
+            </bondArray>
+          </molecule>
+        </atom>
+        <atom id="a134" elementType="C" x2="15.9992" y2="-39.7588"/>
+        <atom id="a135" elementType="C" x2="23.9062" y2="-39.1875"/>
+        <atom id="a136" elementType="C" x2="23.25" y2="-38.4375"/>
+        <atom id="a137" elementType="O" hydrogenCount="1" x2="26.1549" y2="-39.8484">
+          <scalar dictRef="cdx:text">OH</scalar>
+        </atom>
+        <atom id="a138" elementType="C" x2="25.2922" y2="-37.7242">
+          <scalar dictRef="cdx:text">OTBS</scalar>
+          <molecule>
+            <atomArray>
+              <atom id="a6745" elementType="O" hydrogenCount="0" x2="25.2018" y2="-37.5284">
+                <scalar dictRef="cdx:text">O</scalar>
+              </atom>
+              <atom id="a6746" elementType="Si" hydrogenCount="0" x2="27.0421" y2="-38.5909">
+                <scalar dictRef="cdx:text">Si</scalar>
+              </atom>
+              <atom id="a6747" elementType="C" x2="28.1069" y2="-36.7465"/>
+              <atom id="a6748" elementType="C" x2="28.8824" y2="-39.6534"/>
+              <atom id="a6749" elementType="C" x2="25.9819" y2="-40.4271"/>
+              <atom id="a6750" elementType="C" x2="26.2585" y2="-35.6793"/>
+              <atom id="a1169" elementType="C" x2="29.1695" y2="-34.9062"/>
+              <atom id="a6751" elementType="C" x2="29.9391" y2="-37.8043"/>
+// dummy
+              <atom id="a6752" elementType="C" x2="-10.5937" y2="-21.5625"/>
+            </atomArray>
+            <bondArray>
+// dummy - locant
+              <bond atomRefs2="a6752 a6745" id="a6752_a6745" order="1"/>
+              <bond atomRefs2="a6745 a6746" id="a6745_a6746" order="1"/>
+              <bond atomRefs2="a6746 a6747" id="a6746_a6747" order="1"/>
+              <bond atomRefs2="a6746 a6748" id="a6746_a6748" order="1"/>
+              <bond atomRefs2="a6746 a6749" id="a6746_a6749" order="1"/>
+              <bond atomRefs2="a6747 a6750" id="a6747_a6750" order="1"/>
+              <bond atomRefs2="a6747 a1169" id="a6747_a1169" order="1"/>
+              <bond atomRefs2="a6747 a6751" id="a6747_a6751" order="1"/>
+            </bondArray>
+          </molecule>
+        </atom>
+      </atomArray>
+      <bondArray>
+        <bond atomRefs2="a1113 a1114" id="a1113_a1114" order="1"/>
+        <bond atomRefs2="a1114 a1115" id="a1114_a1115" order="1">
+          <bondStereo>H</bondStereo>
+        </bond>
+        <bond atomRefs2="a1115 a1094" id="a1115_a1094" order="1"/>
+        <bond atomRefs2="a1095 a1094" id="a1095_a1094" order="1">
+          <bondStereo>H</bondStereo>
+        </bond>
+        <bond atomRefs2="a1095 a115" id="a1095_a115" order="1"/>
+        <bond atomRefs2="a115 a1113" id="a115_a1113" order="1"/>
+        <bond atomRefs2="a1113 a116" id="a1113_a116" order="1"/>
+        <bond atomRefs2="a1113 a126" id="a1113_a126" order="1"/>
+        <bond atomRefs2="a1114 a127" id="a1114_a127" order="1"/>
+        <bond atomRefs2="a1114 a134" id="a1114_a134" order="1"/>
+        <bond atomRefs2="a1094 a135" id="a1094_a135" order="1"/>
+        <bond atomRefs2="a1095 a136" id="a1095_a136" order="1"/>
+        <bond atomRefs2="a135 a137" id="a135_a137" order="1"/>
+        <bond atomRefs2="a136 a138" id="a136_a138" order="1"/>
+      </bondArray>
+    </molecule>
+
+--*/
+//	private static void expandAtoms(MoleculeTool moleculeTool) {
+//        if (moleculeTool != null) {
+//            moleculeTool.setUpdateAtoms(true);
+//            moleculeTool.getAtoms();
+//            moleculeTool.setUpdateBonds(true);
+//            moleculeTool.getBonds();
+////            CMLAtom atom = moleculeTool.getAtomById("a138");
+////            LOG.debug("AA "+atom.getId());
+//// these atoms should be the top level of atoms, not subgroups
+//            LOG.debug("Expanding atoms with subgroups... ");
+//            CMLAtom[] topAtoms = moleculeTool.getAtoms();
+//            for (int i = 0; i < topAtoms.length; i++) {
+//                NodeList subMoleculeList = topAtoms[i].getElementsByTagName("molecule");
+//                if (subMoleculeList.getLength() == 1) {
+//                    LOG.debug("TA "+topAtoms[i].getId());
+//                    expandAtom(topAtoms[i], (CMLMolecule) subMoleculeList.item(0), moleculeTool);
+//                }
+//            }
+//        }
+//    }
+
+//	private static void expandAtom(CMLAtom replacedAtom, CMLMolecule subMolecule, CMLMolecule parentMolecule) {
+//        List<CMLAtom> replacedAtomLigands = replacedAtom.getLigandAtoms();
+//        List<CMLAtom> subAtoms = subMolecule.getAtoms();
+//        CMLAtomSet subAtomSet = new CMLAtomSet(subAtoms.toArray(new CMLAtom[0]));
+//
+//// get bonds before we start deleting atoms
+//// transfer bonds to current (top) molecule
+//// don't transfer first one as it is a dummy
+//        List<CMLBond> subBonds = subMolecule.getBonds();
+//
+//// implicit semantics in chemdraw are horrible.
+//// we guess the last atom is a dummy
+//// it has only one ligand which is the replacing atom
+//// Dummy atom; I think this just defines a vector? it may not even have coords
+//        CMLAtom dummyAtom = subAtoms.get(subAtoms.size()-1);
+//// assume exactly one ligand and that it is the replacing atom
+//        List<CMLAtom> dummyAtomLigands = dummyAtom.getLigandAtoms();
+//        if (dummyAtomLigands.size() != 1) {
+//            throw new RuntimeException("Expected only one ligand of dummy atom");
+//        }
+//// I think this atom replaces the old replaced atom
+//        CMLAtom replacingAtom = dummyAtomLigands.get(0);
+//
+//// set atoms to belong to grandparent molecule
+//// don't transfer the final one; it is a dummy and delete later
+//        for (int i = 0; i < subAtoms.size()-1; i++) {
+//            LOG.debug("Transferring: "+subAtoms.get(i).getId());
+////            subAtomTool.transferToMolecule(parentMolecule);
+//        }
+////        LOG.debug("sub bonds1 "+subBonds.length);
+//        subMolecule.deleteAtom(subAtoms.get(subAtoms.size()-1));
+//
+//
+//// preserve coordinates of replaced atom
+//        double oldx2 = replacedAtom.getX2();
+//        double oldy2 = replacedAtom.getY2();
+//        Real2 oldxy2 = new Real2(oldx2, oldy2);
+//        double newx2 = replacingAtom.getX2();
+//        double newy2 = replacingAtom.getY2();
+//
+//        double dx2 = oldx2 - newx2;
+//        double dy2 = oldy2 - newy2;
+//        Real2 delta = new Real2(dx2, dy2);
+//        subAtomSet.translate2D(delta);
+//        newx2 = replacingAtom.getX2();
+//        newy2 = replacingAtom.getY2();
+//
+//// align vectors of overlapping atoms
+//        CMLAtom oldLigand = replacedAtom.getLigandAtoms().get(0);
+//        logger.log(Level.INFO,
+//        "replaced atom: "+replacedAtom.getElementType()+"/"+replacedAtom.getId()+"/"+replacedAtom.getX2()+"/"+replacedAtom.getY2()+
+//        "; old ligand: "+oldLigand.getElementType()+"/"+oldLigand.getId()+"/"+oldLigand.getX2()+"/"+oldLigand.getY2());
+//        Vector2 oldLigandVector =
+//            new Vector2(oldx2 - oldLigand.getX2(),
+//                        oldy2 - oldLigand.getY2());
+//        CMLAtom newLigand = replacingAtom.getLigandAtoms().get(0);
+//        logger.log(Level.INFO,
+//        "replacing atom: "+replacingAtom.getElementType()+"/"+replacingAtom.getId()+"/"+replacingAtom.getX2()+"/"+replacingAtom.getY2()+
+//        "; new ligand: "+newLigand.getElementType()+"/"+newLigand.getId()+"/"+newLigand.getX2()+"/"+newLigand.getY2());
+//        Vector2 newLigandVector =
+//            new Vector2(newLigand.getX2() - newx2,
+//                        newLigand.getY2() - newy2);
+//        //newLigandVector.negative();
+//        Transform2 t2 = null;
+//        try {
+////            Transform2 rotMatrix = new Transform2(oldLigandVector, newLigandVector);
+//            Transform2 rotMatrix = new Transform2(newLigandVector, oldLigandVector);
+//            t2 = new Transform2(rotMatrix, oldxy2);
+//            LOG.debug("T2 "+t2);
+//        } catch (Exception e) {
+//            LOG.error("Transform error: "+e);
+//        }
+//        subAtomSet.transform(t2);
+//
+//// get first ligand of old and new atoms.
+//
+////        replacingAtom.setX2(oldx2);
+////        replacingAtom.setY2(oldy2);
+//
+//// join replacingAtom to ligands of replacedAtom and remove the latter
+//// set new bond orders to old (deleted) bond order
+////        CMLAtom[] replacedAtomLigands = replacedAtomTool.getLigandList();
+//        for (int i = 0; i < replacedAtomLigands.size(); i++) {
+//            CMLBond oldBond = parentMolecule.getBond(replacedAtom, replacedAtomLigands.get(i));
+//// keep track of old bond stereo
+//            CMLBondStereo oldBondStereo = oldBond.getBondStereoElements().get(0);
+//            parentMolecule.deleteBond(replacedAtom, replacedAtomLigands.get(i));
+//// order of atoms matters to keep stereo correct
+//            CMLBond newBond = new CMLBond(replacedAtomLigands.get(i), replacingAtom);
+//            parentMolecule.addBond(newBond);
+//// transfer order and stereo
+//            newBond.setOrder(oldBond.getOrder());
+//            if (oldBondStereo != null) {
+//                newBond.appendChild(oldBondStereo);
+//            }
+//        }
+//// transfer label ...
+//        CMLLabel topLabel = replacedAtom.getLabelElements().get(0);
+//        if (topLabel != null) {
+//        	topLabel.detach();
+////            replacedAtom.removeLabel(topLabel);
+//// remove any existing label from replacing atom
+//            CMLLabel subLabel = replacingAtom.getLabelElements().get(0);
+//            if (subLabel != null) {
+//                subLabel.detach();