Commits

Anonymous committed cd55812

Added computation and validation of the ARC record offset when processing uncompressed ARC files

Comments (0)

Files changed (3)

src/main/java/org/jhove2/module/format/arc/ArcModule.java

 	        	boolean isCompressedArc = 
 	        		(source instanceof GzipMemberSource) ? true : false;
 	        	//initialize arc record parser
-	        	ArcParser arc = new ArcParser();
 	        	in = source.getInputStream();
+	        	ArcParser arc = new ArcParser(in);
 	        	//get the version block
-	        	VersionBlock versionBlock = arc.getVersionBlock(in);
+	        	VersionBlock versionBlock = arc.getVersionBlock();
 	          	processArcRecord(jhove2,source,versionBlock,isCompressedArc,null);
 	        	Version version = versionBlock.version;
 	        	this.arcFileName = versionBlock.path;
 	        		this.arcFileSize = input.getSize();
 	        		// Regular ARC file => Read the whole file content
 		        	ArcRecord arcRecord;
-		        	while ((arcRecord = arc.getNextArcRecord(version,fieldDesc,in)) != null) {
+		        	while ((arcRecord = arc.getNextArcRecord(version,fieldDesc)) != null) {
 		        		processArcRecord(jhove2, source,arcRecord,isCompressedArc,threadPool);
 		        	}
 	        	}
     //------------------------------------------------------------------------
     // Specific implementation
     //------------------------------------------------------------------------
+
     /**
      * Gets the presumptive format for the specified MIME type.
      * @param mimeType the MIME type.
      */
     private final class GzipRecordParser implements Parser
     {
-        /**
-         * ARC record version.
-         */
-        private final Version version;
-        /**
-         * Version block field description.
-         */
-        private final String[] fieldDesc;
+    	/** ARC record version. */
+    	private final Version version;
+    	/** Version block field description. */
+    	private final String[] fieldDesc;
 
-        /**
+    	/**
          * Instantiate a new <code>GzipRecordParser</code> instance.
          * @param version   ARC record version
          * @param fieldDesc version block field description
          */
-        public GzipRecordParser(Version version, String[] fieldDesc) {
-            super();
-            this.version = version;
-            this.fieldDesc = fieldDesc;
-        }
+    	public GzipRecordParser(Version version, String[] fieldDesc) {
+    		super();
+    		this.version = version;
+    		this.fieldDesc = fieldDesc;
+    	}
 
-        @Override
-        public long parse(JHOVE2 jhove2, Source source, Input input) 
-                                        throws JHOVE2Exception, IOException {
-            InputStream in = null; 
-            try {
-                in = source.getInputStream();
-                //Get the next ARC record
-                ArcRecord arcRecord = 
-                        new ArcParser().getNextArcRecord(version, fieldDesc, in);
-                //characterize and validate the ARC record
-                processArcRecord(jhove2, source, arcRecord, true,null);
-            }
+		@Override
+		public long parse(JHOVE2 jhove2, Source source, Input input) 
+		                                   throws JHOVE2Exception, IOException {
+	        InputStream in = null; 
+	        try {
+	        	in = source.getInputStream();
+	        	//Get the next ARC record
+	        	ArcRecord arcRecord = 
+	        		new ArcParser(in).getNextArcRecord(version, fieldDesc);
+	        	//characterize and validate the ARC record
+        		processArcRecord(jhove2, source, arcRecord, true, null);
+	        }
             finally {
             	if(in != null){
             		try { in.close(); } catch (Exception e) { /* Ignore... */ }
                        jhove2.getConfigInfo());
     }
 
-    /**
-     * ARC record counter getter.
-     * @return the ARC record counter
-     */
-    public AtomicInteger getArcRecordCounter() {
-        return this.arcRecordNumber;
+	/**
+	 * ARC record counter getter.
+	 * @return the ARC record counter
+	 */
+	public AtomicInteger getArcRecordCounter() {
+		return this.arcRecordNumber;
     }
-
-    public void setParallelCharacterization(int level) {
-        if (level < 0) {
-            level = 0;
-        }
-        this.nThreads = level;
-    }
-
-    /**
-     * nThreads getter
-     * @return nThreads
-     */
-    public int getParallelCharacterization(){
-        return this.nThreads;
-    }
+	
+	public void setParallelCharacterization(int level) {
+	        if (level < 0) {
+	            level = 0;
+	        }
+	        this.nThreads = level;
+	}
+	/**
+	 * nThreads getter
+	 * @return nThreads
+	 */
+	public int getParallelCharacterization(){
+		return this.nThreads;
+	}
 }

src/main/java/org/jhove2/module/format/arc/ArcParser.java

 
 
 import java.io.ByteArrayOutputStream;
+import java.io.FilterInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
 	final static String RESERVED_FIELD           = "Reserved";
 	final static String ORIGIN_FIELD             = "Origin-code";
 
-	/**
-	 * Version-1-block fields.
-	 */
+	/** Version-1-block fields */
 	private final static String[] VERSION_1_BLOCK_FIELDS = {
 	        URL_FIELD, IP_ADDRESS_FIELD, DATE_FIELD, CONTENT_TYPE_FIELD,
 	        LENGTH_FIELD};
-	/**
-	 * Version-2-block fields.
-	 */
+	/** Version-2-block fields */
 	private final static String[] VERSION_2_BLOCK_FIELDS = {
 	        URL_FIELD, IP_ADDRESS_FIELD, DATE_FIELD, CONTENT_TYPE_FIELD,
 	        RESULT_CODE_FIELD, CHECKSUM_FIELD, LOCATION_FIELD,
 	        OFFSET_FIELD, FILENAME_FIELD,
 	        LENGTH_FIELD };
 
-	/**
-	 * Version description fields
-	 */
-	private final static String[] VERSION_DESC_FIELDS = {VERSION_FIELD,
-		RESERVED_FIELD, ORIGIN_FIELD };
+	/** Version description fields */
+	private final static String[] VERSION_DESC_FIELDS = {
+		VERSION_FIELD, RESERVED_FIELD, ORIGIN_FIELD };
 
 	private final static String VERSION1_BLOCK_DEF =
-										join(' ', VERSION_1_BLOCK_FIELDS);
+					join(' ', VERSION_1_BLOCK_FIELDS);
 	private final static String VERSION2_BLOCK_DEF =
-										join(' ', VERSION_2_BLOCK_FIELDS);
+					join(' ', VERSION_2_BLOCK_FIELDS);
+
+	private final ByteCountingInputStream in;
 
 	/**
 	 * Creates a new ARC parser.
 	 */
-	public ArcParser() {
+	public ArcParser(InputStream in) {
 		super();
+		if (in == null) {
+		    throw new IllegalArgumentException("in");
+		}
+		this.in = new ByteCountingInputStream(in);
 	}
 
 	/**
 	 * @throws IOException
 	 * @throws JHOVE2Exception 
 	 */
-	public VersionBlock getVersionBlock(InputStream in)
+	public VersionBlock getVersionBlock()
                                         throws IOException, JHOVE2Exception {
-		return new VersionBlock(getVersionBlockDescription(in));
+		return new VersionBlock(getVersionBlockDescription());
 	}
 	
 	/**
 	 * @return the next ARC record
 	 * @throws IOException
 	 */
-	public ArcRecord getNextArcRecord(Version version, String [] fields,
-								      InputStream in) throws IOException {
-		RecordDesc recordDesc = this.getArcRecordDescription(in, fields);
+	public ArcRecord getNextArcRecord(Version version, String [] fields)
+	                                                    throws IOException {
+		RecordDesc recordDesc = this.getArcRecordDescription(fields);
 		return (recordDesc == null) ? null : 
 			new ArcRecord(version,recordDesc);
 	}
 	 * @return the version block description
 	 * @throws IOException
 	 */
-	private RecordDesc getVersionBlockDescription(InputStream in) throws IOException {
+	private RecordDesc getVersionBlockDescription() throws IOException {
+		long startOffset = this.in.getBytesRead();
 		HeaderReader r = this.readHeader(in, 3);
-		RecordDesc recordDesc =  this.parseRecordHeader(null, r, in);
+		RecordDesc recordDesc   =  this.parseRecordHeader(null, r, in);
 		recordDesc.headerLength = r.consumedLength;
+		recordDesc.startOffset  = startOffset;
 		return recordDesc;
 	}
 	
 	 * @return the ARC record description
 	 * @throws IOException
 	 */
-	private RecordDesc getArcRecordDescription(InputStream in, String[] formatDesc) 
-												throws IOException{
+	private RecordDesc getArcRecordDescription(String[] formatDesc)
+	                                                    throws IOException {
+		// Capture the stream position before the header is consumed so it
+		// can be recorded as the start offset of this record.
+		long startOffset = this.in.getBytesRead();
 		HeaderReader r = this.readHeader(in);
 		RecordDesc recordDesc = null;
 		if (r != null) {
 			recordDesc = this.parseRecordHeader(formatDesc, r, in);
 		}
+		// readHeader() returns null once EOF is reached; in that case there
+		// is no record to annotate and null must be returned to the caller
+		// instead of failing with a NullPointerException.
+		if (recordDesc != null) {
+			recordDesc.startOffset = startOffset;
+		}
 		return recordDesc;
 	}
 
 	 * @return the next non-empty line or <code>null</code> if EOF has been reached.
 	 * @throws IOException
 	 */
-	
 	public HeaderReader readHeader(InputStream in) throws IOException{
 		String line = readStringUntilNl(in);
 		// Escape lines with white spaces.
 		int i = 0;
 		while ((line != null) && (line.length() == 0) ) { 
-            line = readStringUntilNl(in);
-            ++i;
-        }
+		    line = readStringUntilNl(in);
+		    ++i;
+		}
 		return (line != null) ? new HeaderReader(line, line.length()) : null;
 	}
 	
 	 *
 	 */
 	public final static class RecordDesc
-    {   
-		/**
-		 * URL record values
-		 */
-		public final String url;
-		public final String ipAddress;
-		public final String archiveDate;
-		public final String contentType;
-		public final String length;	
-		public final String versionNumber;
-		public final String reserved;
-		public final String originCode;
+        {
+	    public final String url;
+	    public final String ipAddress;
+	    public final String archiveDate;
+	    public final String contentType;
+	    public final String length;	
+	    public final String versionNumber;
+	    public final String reserved;
+	    public final String originCode;
 	    public final String recordDef;
 	    /**
 	     * Fields description
 	     */
-        public final String[] fieldDesc;
-        public final boolean valid;
-        /**
-         * Header length
-         */
-        public long headerLength = 0L;
+	    public final String[] fieldDesc;
+	    public final boolean valid;
+	    private long startOffset = 0L;
+	    /**
+	     * Header length
+	     */
+	    public long headerLength = 0L;
         /**
          * The input stream to parse
          */
 
         public RecordDesc(InputStream in,String recordDef, String versionDef,
                                             String[] fieldDesc) {
-        	this.in = in;
-        	this.recordDef = recordDef;
+            this.in = in;
+            this.recordDef = recordDef;
             this.fieldDesc = fieldDesc;
             // Parse URL Record Definition
             String[] hdr = this.parse(recordDef, fieldDesc, this.fields);
             }
             this.versionNumber = this.fields.get(VERSION_FIELD);
             this.reserved = this.fields.get(RESERVED_FIELD);
-            this.originCode      = this.fields.get(ORIGIN_FIELD);
+            this.originCode = this.fields.get(ORIGIN_FIELD);
         }
 
         public boolean isValid() {
             return this.getField(FILENAME_FIELD);
         }
 
+        /**
+         * Returns the starting offset of the record in the containing ARC.
+         * @return the starting offset of the record.
+         */
+        public long getStartOffset() {
+            return this.startOffset;
+        }
+
         public String getField(String name) {
             String v = this.fields.get(name);
             if ((v != null) && ((v.length() == 0) || ("-".equals(v)))) {
             this.errors.add(new ValidationError(errorType, desc, value));
         }
     }
+
+    public final static class ByteCountingInputStream extends FilterInputStream // Stream wrapper that counts the bytes read or skipped through it.
+    {
+        private long bytesRead = 0L; // Bytes read or skipped so far (rewound by reset()).
+        private long mark = 0L; // Value of bytesRead captured by the last mark() call.
+
+        public ByteCountingInputStream(InputStream parent) {
+            super(parent);
+        }
+
+        public int read() throws IOException {
+            int b = super.read();
+            if (b != -1) { // -1 is returned instead of a byte, so nothing is counted.
+                this.bytesRead++;
+            }
+            return b;
+        }
+        public int read(byte b[], int off, int len) throws IOException {
+            int n = super.read(b, off, len);
+            if (n > 0) { // Count only when bytes were actually read.
+                this.bytesRead += n;
+            }
+            return n;
+        }
+
+        public long skip(long n) throws IOException {
+            n = super.skip(n);
+            this.bytesRead += n; // Add the count reported by the underlying skip().
+            return n;
+        }
+
+        public synchronized void mark(int readlimit) {
+            super.mark(readlimit);
+            this.mark = this.bytesRead; // Save the current count so reset() can restore it.
+        }
+
+        public synchronized void reset() throws IOException {
+            super.reset();
+            this.bytesRead = this.mark; // Restore the count saved by mark().
+        }
+
+        public long getBytesRead() {
+            return this.bytesRead;
+        }
+    }
 }

src/main/java/org/jhove2/module/format/arc/ArcRecordBase.java

 	 * Specifies whether the network has been already validated or not.
 	 */
 	private boolean isNetworkDocValidated = false;
-	
+
+	private long startOffset = -1L;
 	/**
 	 * ARC record version
 	 */
 			this.parseString(desc.getChecksum(),
 					ArcParser.CHECKSUM_FIELD);
 			this.parseString(desc.getLocation(), 
-					ArcParser.LOCATION_FIELD);
+					ArcParser.LOCATION_FIELD, true);
 			this.offset = this.parseLong(desc.getOffset(),ArcParser.OFFSET_FIELD);
 			this.parseString(desc.getFileName(),ArcParser.FILENAME_FIELD);
 		}
+		this.startOffset = desc.getStartOffset();
+		// Check read and computed offset value only if we're reading
+		// a plain ARC file, not a GZipped ARC.
+		if ((this.offset != null) && (this.startOffset > 0L) &&
+		    (this.offset.longValue() != this.startOffset)) {
+		    this.addValidationError(ErrorType.INVALID,
+		                    ArcParser.OFFSET_FIELD, desc.getOffset());
+		}
 	}
 	
 	/**
 	}
 
 	/**
+         * Returns the starting offset of the record in the containing ARC.
+         * @return the starting offset of the record.
+         */
+	public long getStartOffset() {
+	    return this.startOffset;
+	}
+
+	/**
 	 * Checks if the ARC record is valid.
 	 * @return true/false based on whether the ARC record is valid or not 
 	 */
 	public boolean isValid() {
-       return (this.desc.hasErrors() == false);
-    }
+	    return (this.desc.hasErrors() == false);
+	}
 
 	/**
 	 * Checks if the ARC record has warnings.
 	 * @return true/false based on whether the ARC record has warnings or not 
 	 */
 	public boolean hasWarnings() {
-       return ((networkDoc != null) && (networkDoc.hasWarnings()));
-    }
-	
+	    return ((networkDoc != null) && (networkDoc.hasWarnings()));
+	}
+
 	/**
 	 * Gets Network doc warnings.
 	 * @return the network doc warnings, or null if there are none.
 	 */
 	public Collection<String> getWarnings() {
-        return (this.hasWarnings())? networkDoc.getWarnings(): null;
+	    return (this.hasWarnings())? networkDoc.getWarnings(): null;
 	}
 	
 	/**
 	 * @return the validation errors list, or null if there are none.
 	 */
 	public Collection<ValidationError> getValidationErrors() {
-        return (this.desc.hasErrors())? this.desc.getValidationErrors(): null;
+	    return (this.desc.hasErrors())? this.desc.getValidationErrors(): null;
 	}
-	
+
 	/**
 	 * Parses ARC record content type
 	 * @param contentType ARC record content type.
 	 * @return ARC record content type.
 	 */
 	public String parseContentType(String contentType){
-		return this.parseString(contentType,ArcParser.CONTENT_TYPE_FIELD);
+	    return this.parseString(contentType,ArcParser.CONTENT_TYPE_FIELD);
 	}
 	
 	/**
 	 * @return the IP address.
 	 */
 	private InetAddress parseIpAddress(String ipAddress) {
-		InetAddress address = null;
-        if (isSet(ipAddress)) {
-        	address = IPAddressParser.getAddress(ipAddress);
-        	if(address == null){
-                // Invalid date.
-            this.addValidationError(ErrorType.INVALID, ArcParser.IP_ADDRESS_FIELD, ipAddress);
-        	} 
-        }
-        else {
-            // Missing mandatory value.
-            this.addValidationError(ErrorType.MISSING, ArcParser.IP_ADDRESS_FIELD, ipAddress);
-        }
-        return address;
+	    InetAddress address = null;
+	    if (isSet(ipAddress)) {
+	        address = IPAddressParser.getAddress(ipAddress);
+	        if(address == null){
+	            // Invalid IP address.
+	            this.addValidationError(ErrorType.INVALID, ArcParser.IP_ADDRESS_FIELD, ipAddress);
+	        }
+	    }
+	    else {
+	        // Missing mandatory value.
+	        this.addValidationError(ErrorType.MISSING, ArcParser.IP_ADDRESS_FIELD, ipAddress);
+	    }
+	    return address;
 	}
 
 	/**