Commits

petermr committed fc703b1

added parsepatent

Comments (0)

Files changed (3)

 #Autogenerated by Oscar
-#Sun Aug 15 11:45:42 BST 2010
+#Thu Aug 19 16:28:06 BST 2010
 InChI=C\:\\Users\\pm286\\workspace\\patentAnalysis\\cInChI-1.exe
 cacheExtensionNameResolver=yes
 chemNameDict=chemnamedict.xml
+<visitors>
+  <log>log.xml</log>
+  
+  <visitor>
+    <name>default</name>
+    <addFileId/>
+    <class>org.xmlcml.cml.crystaleye.DefaultVisitor</class>
+    <debug>true</debug>
+    <clean>0</clean>
+    <!-- must be the name of the directory this file is in. Tacky. Sorry -->
+    <outputDirectory>../patentData4</outputDirectory>
+    <maxEntries>10000</maxEntries>
+    <iterate>true</iterate>
+
+
+   </visitor>
+
+<!--  DOWNLOAD PATENT -->
+  <visitor>
+    <skip/>
+    <!-- do not delete the skip element -->
+    <name>indexFile</name>
+    <class>org.xmlcml.cml.crystaleye.NullVisitor</class>
+     <fileFilter method=".">*.xml</fileFilter>
+  </visitor>
+  
+  <visitor>
+    <!-- finds all EPO-xml index files and downloads the zips, 1 per patent -->
+   
+    <name>zip</name>
+    <requires>indexFile</requires>
+    <class>uk.ac.cam.ch.wwmm.patents.crawler.EpoDownloadVisitor</class>
+    <fileFilter method=".">*.zip</fileFilter>
+  </visitor>
+  
+  <!--  UNZIP -->
+  <visitor>
+    <!--  unzips each patent -->
+    <name>unzip</name>
+    
+    <requires>zip</requires>
+    <class>org.xmlcml.cml.crystaleye.UnzipVisitor</class>
+    <parameter name="TIF">tif TIF tiff TIFF</parameter>
+     <fileFilter method=".">*.unzip</fileFilter>
+  </visitor>
+
+ <visitor>
+    <name>dirs</name>
+    <skip/>
+    <class>org.xmlcml.cml.crystaleye.NullVisitor</class>
+    <fileFilter method="."></fileFilter>
+  </visitor>
+ 
+
+  <visitor>
+    <!--  renames each patent -->
+    <name>rename1</name>
+    
+    <requires>dirs</requires>
+    <class>org.xmlcml.cml.crystaleye.FileUtilVisitor</class>
+    <parameter name="RENAME">EP.*NWA.* DOC00001.xml</parameter>
+    <parameter name="DELETE">disclaimer\.txt</parameter>
+    <parameter name="DELETE">SGML\.NRM</parameter>
+    <parameter name="DELETE">PAGE\d\d\d\d</parameter>
+    <fileFilter method=".">cleaned.cleaned</fileFilter>
+  </visitor>
+
+
+ 
+  <!-- MAIN DOCUMENT -->
+  <visitor>
+    <name>doc</name>
+    <skip/>
+   <class>org.xmlcml.cml.crystaleye.NullVisitor</class>
+    <fileFilter method=".">DOC00001.xml</fileFilter>
+  </visitor>
+ 
+  <visitor>
+    <name>structured</name>
+    
+    <requires>doc</requires>
+    <class>uk.ac.cam.ch.wwmm.patents.doc.EPOProcessorVisitor</class>
+     <fileFilter method=".">structured.xml</fileFilter>
+  </visitor>
+ 
+ <visitor>
+    <name>dataxml</name>
+    
+    <requires>structured</requires>
+    <class>uk.ac.cam.ch.wwmm.patents.chemtag.DataAnnotatorVisitor</class>
+    <fileFilter method=".">data.xml</fileFilter>
+  </visitor>
+
+ 
+    <visitor>
+      <name>examples</name>
+      <requires>dataxml</requires>
+      
+      <class>org.xmlcml.cml.crystaleye.XPathVisitor</class>
+      <!--  this may need tuning -->
+      <parameter name="XPATH">//exampleList/example|//example/heading/p|//example/p</parameter>
+      <parameter name="XPATH_DIRECTORY">examples</parameter>
+      <parameter name="XPATH_PREFIX">{@id}/example.xml</parameter>
+      <fileFilter method=".">example.xml</fileFilter>
+    </visitor>
+ 
+  <visitor>
+     
+    <name>chemicalTagger</name>
+    <requires>examples</requires>
+    <class>org.xmlcml.cml.crystaleye.supptext.ChemicalTaggerVisitor</class>
+    <fileFilter method=".">chemicalTagger.xml</fileFilter>
+    
+  </visitor>
+  
+ 
+  <visitor>
+    <name>chemicalTreeBank</name>
+    
+    <requires>chemicalTagger</requires>
+    <class>org.xmlcml.cml.crystaleye.supptext.ChemicalTreeBankVisitor</class>
+    <fileFilter method=".">chemicalTreeBank.xml</fileFilter>
+  </visitor>
+  
+  
+  <visitor>
+    <name>resolved</name>
+    <requires>chemicalTreeBank</requires>
+    <class>org.xmlcml.cml.crystaleye.supptext.ResolveCTFromName</class>
+    <fileFilter method=".">resolved.xml</fileFilter>
+  </visitor>
+
+  <visitor>
+    <name>dissolve</name>
+    <requires>resolved</requires>
+    <auxiliaryFile>org/xmlcml/cml/crystaleye/dissolve.xsl</auxiliaryFile>
+    <class>org.xmlcml.cml.crystaleye.XSLTVisitor</class>
+    <fileFilter method=".">dissolve.html</fileFilter>
+  </visitor>
+  
+ <visitor>
+    <name>wash</name>
+    <requires>resolved</requires>
+    <auxiliaryFile>org/xmlcml/cml/crystaleye/wash.xsl</auxiliaryFile>
+    <class>org.xmlcml.cml.crystaleye.XSLTVisitor</class>
+    <fileFilter method=".">wash.html</fileFilter>
+  </visitor>
+ 
+</visitors>

src/main/java/uk/ac/cam/ch/wwmm/patents/crawler/EpoDownloadVisitor.java

 		try {
 			if (xmlFile != null) {
 				String name = xmlFile.getName();
-				if (name.startsWith(EPO_PREFIX)) {
+				if (name.startsWith(EPO_PREFIX) && name.endsWith(XML_SUFFIX)) {
 					download(xmlFile);
 				}
 			}