Commits

Anonymous committed ac4c34d

Issue 2255: Handle scanning of UTF-8 and UTF-16 files. (Greg Spencer)

Comments (0)

Files changed (28)

     tfp.close()
 
 def soscan(node, env, path):
-    c = node.get_contents()
+    c = node.get_text_contents()
     return re.compile(r"^[\.']so\s+(\S+)", re.M).findall(c)
 
 soelimbuilder = Builder(action = Action(soelim),
 def scanxml(node, env, target):
     includes = []
 
-    contents = node.get_contents()
+    contents = node.get_text_contents()
 
     includes.extend(entity_re.findall(contents))
 
 
 .ES
 def xyz_scan(node, env, path):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     # Scan the contents and return the included files.
 
 XYZScanner = Scanner(xyz_scan)
 include_re = re.compile(r'^include\\s+(\\S+)$', re.M)
 
 def kfile_scan(node, env, path, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include_re.findall(contents)
     return includes
 
 include_re = re.compile(r'^include\\s+(\\S+)$', re.M)
 
 def my_scan(node, env, path, arg):
-   contents = node.get_contents()
+   contents = node.get_text_contents()
    includes = include_re.findall(contents)
    if includes == []:
         return []

doc/user/scanners.in

       include_re = re.compile(r'^include\s+(\S+)$', re.M)
       
       def kfile_scan(node, env, path, arg):
-          contents = node.get_contents()
+          contents = node.get_text_contents()
           return include_re.findall(contents)
     </programlisting>
 
       The path name to the file can be
       used by converting the node to a string
       using the <literal>str()</literal> function,
-      or an internal &SCons; <literal>get_contents()</literal>
+      or an internal &SCons; <literal>get_text_contents()</literal>
       object method can be used to fetch the contents.
 
       </para>
         include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
         def kfile_scan(node, env, path):
-            contents = node.get_contents()
+            contents = node.get_text_contents()
             includes = include_re.findall(contents)
             return includes
 

doc/user/scanners.xml

       include_re = re.compile(r'^include\s+(\S+)$', re.M)
       
       def kfile_scan(node, env, path, arg):
-          contents = node.get_contents()
+          contents = node.get_text_contents()
           return include_re.findall(contents)
     </programlisting>
 
       The path name to the file can be
       used by converting the node to a string
       using the <literal>str()</literal> function,
-      or an internal &SCons; <literal>get_contents()</literal>
+      or an internal &SCons; <literal>get_text_contents()</literal>
       object method can be used to fetch the contents.
 
       </para>
         include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
         def kfile_scan(node, env, path):
-            contents = node.get_contents()
+            contents = node.get_text_contents()
             includes = include_re.findall(contents)
             return includes
 
 
     - Add detection of Microsoft Visual Studio 9.
 
+  From Greg Spencer:
+
+    - Support implicit dependency scanning of files encoded in utf-8
+      and utf-16.
+
   From Matthew Wesley:
 
     - Copy file attributes so we identify, and can link a shared library

src/engine/SCons/Environment.py

             self._dict[key] = value
 
     def get(self, key, default=None):
-        "Emulates the get() method of dictionaries."""
+        """Emulates the get() method of dictionaries."""
         return self._dict.get(key, default)
 
     def has_key(self, key):

src/engine/SCons/Node/FS.py

 
 __revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__"
 
+from itertools import izip
+import cStringIO
 import fnmatch
-from itertools import izip
 import os
 import os.path
 import re
 import string
 import sys
 import time
-import cStringIO
+
+try:
+    import codecs
+except ImportError:
+    pass
 
 import SCons.Action
 from SCons.Debug import logInstanceCreation
         return self.get_suffix()
 
     def get_contents(self):
-        """Fetch the contents of the entry.
-
-        Since this should return the real contents from the file
-        system, we check to see into what sort of subclass we should
-        morph this Entry."""
+        """Fetch the contents of the entry.  Returns the exact binary
+        contents of the file."""
         try:
             self = self.disambiguate(must_exist=1)
         except SCons.Errors.UserError:
         else:
             return self.get_contents()
 
+    def get_text_contents(self):
+        """Fetch the decoded text contents of a Unicode encoded Entry.
+
+        Since this should return the text contents from the file
+        system, we check to see into what sort of subclass we should
+        morph this Entry."""
+        try:
+            self = self.disambiguate(must_exist=1)
+        except SCons.Errors.UserError:
+            # There was nothing on disk with which to disambiguate
+            # this entry.  Leave it as an Entry, but return a null
+            # string so calls to get_text_contents() in emitters and
+            # the like (e.g. in qt.py) don't have to disambiguate by
+            # hand or catch the exception.
+            return ''
+        else:
+            return self.get_text_contents()
+
     def must_be_same(self, klass):
         """Called to make sure a Node is a Dir.  Since we're an
         Entry, we can morph into one."""
         """A directory does not get scanned."""
         return None
 
+    def get_text_contents(self):
+        """We already emit things in text, so just return the binary
+        version."""
+        return self.get_contents()
+
     def get_contents(self):
         """Return content signatures and names of all our children
         separated by new-lines. Ensure that the nodes are sorted."""
         contents = []
         name_cmp = lambda a, b: cmp(a.name, b.name)
         sorted_children = self.children()[:]
-        sorted_children.sort(name_cmp)        
+        sorted_children.sort(name_cmp)
         for node in sorted_children:
             contents.append('%s %s\n' % (node.get_csig(), node.name))
         return string.join(contents, '')
             return ''
         fname = self.rfile().abspath
         try:
-            r = open(fname, "rb").read()
+            contents = open(fname, "rb").read()
         except EnvironmentError, e:
             if not e.filename:
                 e.filename = fname
             raise
-        return r
+        return contents
+
+    try:
+        import codecs
+    except ImportError:
+        get_text_contents = get_contents
+    else:
+        # This attempts to figure out what the encoding of the text is
+        # based upon the BOM bytes, and then decodes the contents so that
+        # it's a valid python string.
+        def get_text_contents(self):
+            contents = self.get_contents()
+            if contents.startswith(codecs.BOM_UTF8):
+                contents = contents.decode('utf-8')
+            elif contents.startswith(codecs.BOM_UTF16):
+                contents = contents.decode('utf-16')
+            return contents
 
     def get_content_hash(self):
         """

src/engine/SCons/Node/FSTests.py

         f1 = fs.File(test.workpath("binary_file"))
         assert f1.get_contents() == "Foo\x1aBar", f1.get_contents()
 
+        try:
+            # TODO(1.5)
+            eval('test_string = u"Foo\x1aBar"')
+        except SyntaxError:
+            pass
+        else:
+            # This tests to make sure we can decode UTF-8 text files.
+            test.write("utf8_file", test_string.encode('utf-8'))
+            f1 = fs.File(test.workpath("utf8_file"))
+            assert eval('f1.get_text_contents() == u"Foo\x1aBar"'), \
+                   f1.get_text_contents()
+
         def nonexistent(method, s):
             try:
                 x = method(s, create = 0)
         finally:
             test.unlink("file")
 
+        # test Entry.get_text_contents()
+        e = fs.Entry('does_not_exist')
+        c = e.get_text_contents()
+        assert c == "", c
+        assert e.__class__ == SCons.Node.FS.Entry
+
+        test.write("file", "file\n")
+        try:
+            e = fs.Entry('file')
+            c = e.get_text_contents()
+            assert c == "file\n", c
+            assert e.__class__ == SCons.Node.FS.File
+        finally:
+            test.unlink("file")
+
         test.subdir("dir")
         e = fs.Entry('dir')
         c = e.get_contents()
         assert c == "", c
         assert e.__class__ == SCons.Node.FS.Dir
 
+        c = e.get_text_contents()
+        try:
+            eval('assert c == u"", c')
+        except SyntaxError:
+            assert c == ""
+
         if hasattr(os, 'symlink'):
             os.symlink('nonexistent', test.workpath('dangling_symlink'))
             e = fs.Entry('dangling_symlink')
             c = e.get_contents()
             assert e.__class__ == SCons.Node.FS.Entry, e.__class__
             assert c == "", c
+            c = e.get_text_contents()
+            try:
+                eval('assert c == u"", c')
+            except SyntaxError:
+                assert c == "", c
 
         test.write("tstamp", "tstamp\n")
         try:
         files = string.split(d.get_contents(), '\n')
 
         assert e.get_contents() == '', e.get_contents()
+        assert e.get_text_contents() == '', e.get_text_contents()
         assert e.get_csig()+" empty" == files[0], files
         assert f.get_csig()+" f" == files[1], files
         assert g.get_csig()+" g" == files[2], files
         finally:
             test.unlink(["rep3", "contents"])
 
+    def test_get_text_contents(self):
+        """Ensure get_text_contents() returns text contents from
+        Repositories"""
+        fs = self.fs
+        test = self.test
+
+        # Use a test string that has a file terminator in it to make
+        # sure we read the entire file, regardless of its contents.
+        try:
+            eval('test_string = u"Con\x1aTents\n"')
+        except SyntaxError:
+            import UserString
+            class FakeUnicodeString(UserString.UserString):
+                def encode(self, encoding):
+                    return str(self)
+            test_string = FakeUnicodeString("Con\x1aTents\n")
+
+
+        # Test with ASCII.
+        test.write(["rep3", "contents"], test_string.encode('ascii'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
+        # Test with utf-8
+        test.write(["rep3", "contents"], test_string.encode('utf-8'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
+        # Test with utf-16
+        test.write(["rep3", "contents"], test_string.encode('utf-16'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
     #def test_is_up_to_date(self):
 
 

src/engine/SCons/SConfTests.py

         self.scons_env[comp] = oldcomp
         self.scons_env['%sFLAGS' % comp] = 'qwertyuiop'
         r = func()
-        assert not r, "%s worked with %sFLAGS = qwertyuiop ?" % name
+        assert not r, "%s worked with %sFLAGS = qwertyuiop ?" % (name, comp)
 
     def test_CheckCC(self):
         """Test SConf.CheckCC()

src/engine/SCons/Scanner/D.py

 
     def find_include_names(self, node):
         includes = []
-        for i in self.cre.findall(node.get_contents()):
+        for i in self.cre.findall(node.get_text_contents()):
             includes = includes + self.cre2.findall(i)
         return includes

src/engine/SCons/Scanner/Fortran.py

             mods_and_includes = node.includes
         else:
             # retrieve all included filenames
-            includes = self.cre_incl.findall(node.get_contents())
+            includes = self.cre_incl.findall(node.get_text_contents())
             # retrieve all USE'd module names
-            modules = self.cre_use.findall(node.get_contents())
+            modules = self.cre_use.findall(node.get_text_contents())
             # retrieve all defined module names
-            defmodules = self.cre_def.findall(node.get_contents())
+            defmodules = self.cre_def.findall(node.get_text_contents())
 
             # Remove all USE'd module names that are defined in the same file
             d = {}

src/engine/SCons/Scanner/LaTeX.py

         if node.includes != None:
             includes = node.includes
         else:
-            includes = self.cre.findall(node.get_contents())
+            includes = self.cre.findall(node.get_text_contents())
             # 1. Split comma-separated lines, e.g.
             #      ('bibliography', 'phys,comp')
             #    should become two entries

src/engine/SCons/Scanner/ScannerTests.py

                 return self._exists
             def get_contents(self):
                 return self._contents
+            def get_text_contents(self):
+                return self._contents
             def get_dir(self):
                 return self._dir
 

src/engine/SCons/Scanner/__init__.py

         return SCons.Node.FS._my_normcase(include)
 
     def find_include_names(self, node):
-        return self.cre.findall(node.get_contents())
+        return self.cre.findall(node.get_text_contents())
 
     def scan(self, node, path=()):
 

src/engine/SCons/Tool/FortranCommon.py

     mod_regex = """(?i)^\s*MODULE\s+(?!PROCEDURE)(\w+)"""
     cre = re.compile(mod_regex,re.M)
     # Retrieve all defined module names
-    modules = cre.findall(node.get_contents())
+    modules = cre.findall(node.get_text_contents())
     # Remove duplicate items from the list
     modules = SCons.Util.unique(modules)
     # Convert module name to a .mod filename

src/engine/SCons/Tool/jar.py

             jarchdir = env.fs.Dir(jarchdir)
     result = []
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] != "Manifest-Version":
             if jarchdir_set:
                 _chdir = jarchdir
 def jarManifest(target, source, env, for_signature):
     """Look in sources for a manifest file, if any."""
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] == "Manifest-Version":
             return src
     return ''
     flag is specified."""
     jarflags = env.subst('$JARFLAGS', target=target, source=source)
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] == "Manifest-Version":
             if not 'm' in jarflags:
                 return jarflags + 'm'

src/engine/SCons/Tool/qt.py

                     print "scons: qt: '%s' is no cxx file. Discarded." % str(cpp) 
                 # c or fortran source
                 continue
-            #cpp_contents = comment.sub('', cpp.get_contents())
-            cpp_contents = cpp.get_contents()
+            #cpp_contents = comment.sub('', cpp.get_text_contents())
+            cpp_contents = cpp.get_text_contents()
             h=None
             for h_ext in header_extensions:
                 # try to find the header file in the corresponding source
                 if h:
                     if debug:
                         print "scons: qt: Scanning '%s' (header of '%s')" % (str(h), str(cpp))
-                    #h_contents = comment.sub('', h.get_contents())
-                    h_contents = h.get_contents()
+                    #h_contents = comment.sub('', h.get_text_contents())
+                    h_contents = h.get_text_contents()
                     break
             if not h and debug:
                 print "scons: qt: no header for '%s'." % (str(cpp))
     lookout = []
     lookout.extend(env['CPPPATH'])
     lookout.append(str(node.rfile().dir))
-    includes = re.findall("<include.*?>(.*?)</include>", node.get_contents())
+    includes = re.findall("<include.*?>(.*?)</include>", node.get_text_contents())
     result = []
     for incFile in includes:
         dep = env.FindFile(incFile,lookout)

src/engine/SCons/Tool/tex.py

     # we have to run makeindex at least once to keep the build
     # happy even if there is no index.
     # Same for glossaries and nomenclature
-    src_content = source[0].get_contents()
+    src_content = source[0].get_text_contents()
     run_makeindex = makeindex_re.search(src_content) and not os.path.exists(targetbase + '.idx')
     run_nomenclature = makenomenclature_re.search(src_content) and not os.path.exists(targetbase + '.nlo')
     run_glossary = makeglossary_re.search(src_content) and not os.path.exists(targetbase + '.glo')
 def is_LaTeX(flist):
     # Scan a file list to decide if it's TeX- or LaTeX-flavored.
     for f in flist:
-        content = f.get_contents()
+        content = f.get_text_contents()
         if LaTeX_re.search(content):
             return 1
     return 0
 def ScanFiles(theFile, target, paths, file_tests, file_tests_search, env, graphics_extensions, targetdir):
     # for theFile (a Node) update any file_tests and search for graphics files
     # then find all included files and call ScanFiles for each of them
-    content = theFile.get_contents()
+    content = theFile.get_text_contents()
     if Verbose:
         print " scanning ",str(theFile)
 
     env.Clean(target[0],auxfilename)
     env.Clean(target[0],logfilename)
 
-    content = source[0].get_contents()
+    content = source[0].get_text_contents()
 
     idx_exists = os.path.exists(targetbase + '.idx')
     nlo_exists = os.path.exists(targetbase + '.nlo')

test/Scanner/FindPathDirs.py

 include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
 def kfile_scan(node, env, path, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include_re.findall(contents)
     if includes == []:
          return []

test/Scanner/Scanner.py

 include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
 def kfile_scan(node, env, scanpaths, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include_re.findall(contents)
     return includes
 
 
 def blork(env, target, source):
     open(str(target[0]), 'wb').write(
-        string.replace(source[0].get_contents(), 'getfile', 'MISSEDME'))
+        string.replace(source[0].get_text_contents(), 'getfile', 'MISSEDME'))
 
 kbld = Builder(action=r'%(_python_)s build.py $SOURCES $TARGET',
                src_suffix='.lork',

test/Scanner/dictionary.py

 include3_re = re.compile(r'^include3\s+(\S+)$', re.M)
 
 def kfile_scan1(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include1_re.findall(contents)
     return includes
 
 def kfile_scan2(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include2_re.findall(contents)
     return includes
 
 def kfile_scan3(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include3_re.findall(contents)
     return includes
 

test/Scanner/exception.py

 exception_re = re.compile(r'^exception\s+(.+)$', re.M)
 
 def kfile_scan(node, env, target, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     exceptions = exception_re.findall(contents)
     if exceptions:
         raise Exception, "kfile_scan error:  %s" % exceptions[0]

test/Scanner/multi-env.py

 input_re = re.compile(r'^input\s+(\S+)$', re.M)
 
 scan1 = Scanner(name = 'Include',
-                function = lambda N,E,P,A: A.findall(N.get_contents()),
+                function = lambda N,E,P,A: A.findall(N.get_text_contents()),
                 argument = include_re,
                 skeys = ['.inp'])
 
 scan2 = Scanner(name = 'Input',
-                function = lambda N,E,P,A: A.findall(N.get_contents()),
+                function = lambda N,E,P,A: A.findall(N.get_text_contents()),
                 argument = input_re,
                 skeys = ['.inp'])
 

test/Scanner/source_scanner-dict.py

 include3_re = re.compile(r'^include3\s+(\S+)$', re.M)
 
 def k1_scan(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include1_re.findall(contents)
     return includes
 
 def k2_scan(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include2_re.findall(contents)
     return includes
 
 def k3_scan(node, env, scanpaths, arg=None):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include3_re.findall(contents)
     return includes
 

test/explain/basic.py

 include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
 def kfile_scan(node, env, target, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include_re.findall(contents)
     return includes
 

test/explain/save-info.py

 include_re = re.compile(r'^include\s+(\S+)$', re.M)
 
 def kfile_scan(node, env, target, arg):
-    contents = node.get_contents()
+    contents = node.get_text_contents()
     includes = include_re.findall(contents)
     return includes
 

test/srcchange.py

 import string
 
 def subrevision(target, source ,env):
-    orig = target[0].get_contents()
+    orig = target[0].get_text_contents()
     new = re.sub('\$REV.*?\$',
-                 '$REV: %%s$'%%string.strip(source[0].get_contents()),
-                 target[0].get_contents())
+                 '$REV: %%s$'%%string.strip(source[0].get_text_contents()),
+                 target[0].get_text_contents())
     outf = open(str(target[0]),'wb')
     outf.write(new)
     outf.close()