Commits

Roy Smith  committed 7fdd8e6

Add first draft of "about"

  • Participants
  • Parent commits e43f598

Comments (0)

Files changed (4)

File around/fio.py

+"File I/O operations"
+
+import os
+import logging
+
+# Module-level logger registered under the name "fio"; the test script
+# looks it up by that same name to raise its level to DEBUG.
+logger = logging.getLogger("fio")
+
+class SeekError(Exception):
+    pass
+
+class TextRecordFile:
+    """A read-only wrapper around a Python file object which treats
+    the file as a random-access sequence of variable-length records
+    (i.e. lines of text).
+
+    Wen you read a line, it is guaranteed to always return a complete
+    line.  You can seek to an arbitrary location in the file (using
+    the standard byte offsets), and the next read will return the line
+    which contains that offset.  Imagine the following file (where $'s
+    indicate newlines):
+
+                 1         2         3         4
+        1234567890123456789012345678901234567890
+        This is the first line$And the second$An
+
+                 5         6         7         8
+        1234567890123456789012345678901234567890
+        d this is the third$
+
+    The file is 60 characters long, and contains 3 records.  Some
+    illustrative operation sequences and what they return are:
+
+    seek(23)
+    readline() => "This is the first line\n"
+
+    seek(24)
+    readline() => "And the second\n"
+
+    seed(25)
+    readline() => "And the second\n"
+
+    NOTE: unlike normal files, a seek() immediately followed by a
+    tell() will *not* necessarially return the offset seeked to.
+    Instead, it will return the position of the beginning of the line
+    which contains the offset.
+
+    This impementation assumes a Unix-like environment, with newlines
+    as terminators and sane seek()/tell() behavior.  I have no idea if
+    this will work on Windows.  Nor do I particularly care :-)
+
+    It is assumed that the file is not being modified while we're
+    reading it.
+
+    """
+    def __init__(self, path):
+        self.file = open(path)
+        self.size = self.get_size()
+
+    def close(self):
+        self.file.close()
+
+    def seek(self, offset):
+        """Move to a location in the file.
+
+        Offset is an integer, as returned by file.tell().  The
+        location moved to is the beginning of the line which contains
+        the requested offset.
+
+        Unlike standard file objects, all seeks are absolute (as per
+        os.SEEK_SET).
+
+        """
+        # Validate offset
+        if offset < 0:
+            raise ValueError("Offset (%r) must be a positive integer")
+        if offset == 0:
+            self.file.seek(0)
+            return
+
+        # Read a chunk of data in front of the current position.
+        chunk_size = 1024
+        start_of_buffer = max(0, offset - chunk_size)
+        self.file.seek(start_of_buffer)
+        buf_size = offset - start_of_buffer
+        assert 0 < buf_size <= chunk_size
+        buffer = self.file.read(buf_size)
+        assert len(buffer) == buf_size
+
+        # If there's no newline in the buffer, either we're in the
+        # first line of the file (which is OK), or we've blown the
+        # assumption that no line is longer than chunk_size.
+        try:
+            index = buffer.rindex('\n')
+        except ValueError:
+            if start_of_buffer == 0:
+                # We're good; it's the first line of the file
+                self.file.seek(0)
+                return
+            else:
+                raise SeekError("no newline found (start=%d, size=%d)" \
+                                % (start_of_buffer, buf_size))
+
+        # There's at least one newline in the buffer; we want to be
+        # right after the last one.
+        self.file.seek(start_of_buffer + index + 1)
+        return
+
+
+    def readline(self):
+        return self.file.readline()
+
+    def get_size(self):
+        """Discover the size of the file.
+
+        Returns an integer suitable for passing to file.seek() as an
+        offset..
+
+        """
+        # This may not be the most efficient way, but it works.
+        current_position = self.file.tell()
+        self.file.seek(0, os.SEEK_END)
+        size = self.file.tell()
+        self.file.seek(current_position, os.SEEK_SET)
+        return size

File around/test_fio.py

+#!/usr/bin/env python
+
+import unittest
+import logging
+from fio import TextRecordFile
+
+class Basic(unittest.TestCase):
+    def setUp(self):
+        self.f = TextRecordFile("testdata/basic.log")
+
+    def tearDown(self):
+        self.f.close()
+        
+    def test_read_first_line(self):
+        line = self.f.readline()
+        self.assertEquals(line, "2012-07-07t00:00:01+00:00 line 1\n")
+
+    def test_seek_back_to_beginning(self):
+        line1 = self.f.readline()
+        self.f.seek(0)
+        line2 = self.f.readline()
+        self.assertEquals(line1, "2012-07-07t00:00:01+00:00 line 1\n")
+        self.assertEquals(line2, "2012-07-07t00:00:01+00:00 line 1\n")
+
+    def test_eof(self):
+        for i in range(20):
+            line = self.f.readline()
+        self.assertEquals(line, "2012-07-07t00:00:20+00:00 line 20\n")
+        line = self.f.readline()
+        self.assertEqual(line, "")
+
+class File60(unittest.TestCase):
+    def setUp(self):
+        test_file = "testdata/file60"
+        self.f = TextRecordFile(test_file)
+
+        # Get the first two lines of the file
+        f = open(test_file)
+        self.line1 = f.readline()
+        self.line2 = f.readline()
+        assert self.line1.endswith('\n')
+        assert self.line2.endswith('\n')
+        self.len1 = len(self.line1.rstrip('\n'))
+        self.len2 = len(self.line2.rstrip('\n'))
+        f.close()
+
+    def tearDown(self):
+        self.f.close()
+
+    def test_seek_end_of_first_line(self):
+        self.f.seek(self.len1 - 1)
+        line = self.f.readline()
+        self.assertEqual(line, self.line1)
+
+    def test_seek_newline_of_first_line(self):
+        self.f.seek(self.len1)
+        line = self.f.readline()
+        self.assertEqual(line, self.line1)
+
+    def test_seek_start_of_second_line(self):
+        self.f.seek(self.len1 + 1)
+        line = self.f.readline()
+        self.assertEqual(line, self.line2)
+
+    def test_seek_second_character_of_second_line(self):
+        self.f.seek(self.len1 + 2)
+        line = self.f.readline()
+        self.assertEqual(line, self.line2)
+        
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger("fio").setLevel(logging.DEBUG)
+    unittest.main()

File around/testdata/basic.log

+2012-07-07t00:00:01+00:00 line 1
+2012-07-07t00:00:02+00:00 line 2
+2012-07-07t00:00:03+00:00 line 3
+2012-07-07t00:00:04+00:00 line 4
+2012-07-07t00:00:05+00:00 line 5
+2012-07-07t00:00:06+00:00 line 6
+2012-07-07t00:00:07+00:00 line 7
+2012-07-07t00:00:08+00:00 line 8
+2012-07-07t00:00:09+00:00 line 9
+2012-07-07t00:00:10+00:00 line 10
+2012-07-07t00:00:11+00:00 line 11
+2012-07-07t00:00:12+00:00 line 12
+2012-07-07t00:00:13+00:00 line 13
+2012-07-07t00:00:14+00:00 line 14
+2012-07-07t00:00:15+00:00 line 15
+2012-07-07t00:00:16+00:00 line 16
+2012-07-07t00:00:16+00:00 line 17
+2012-07-07t00:00:18+00:00 line 18
+2012-07-07t00:00:19+00:00 line 19
+2012-07-07t00:00:20+00:00 line 20

File around/testdata/file60

+This is the first line
+And the second
+And this is the third