Anonymous avatar Anonymous committed b40e3b4

[format checker] check for anomalous backslash escape (new W1401, W1402). Closes #104571

Comments (0)

Files changed (6)

 ====================
 
 --
+    * #104571: check for anomalous backslash escape, introducing new
+      W1401 and W1402 messages (patch by Martin Pool)
+
     * #100707: check for boolop being used as exception class, introducing
       new W0711 message (patch by Tim Hatch)
 
 * Wolfgang Grafen, Axel Muller, Fabio Zadrozny, Pierre Rouleau,
   Maarten ter Huurne, Mirko Friedenhagen (among others):
   bug reports, feedback, feature requests...
+* Martin Pool (Google): warnings for anomalous backslashes
 * All the Logilab's team: daily use, bug reports, feature requests
 * Other people have contributed by their feedback, if I've forgotten
   you, send me a note !

checkers/__init__.py

 11: typecheck
 12: logging
 13: string_format
+14: string_constant
 15-50: not yet used: reserved for future internal checkers.
 51-99: perhaps used: reserved for external checkers
 

checkers/format.py

 # Copyright (c) 2003-2010 Sylvain Thenault (thenault@gmail.com).
 # Copyright (c) 2003-2012 LOGILAB S.A. (Paris, FRANCE).
+# Copyright 2012 Google Inc.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; either version 2 of the License, or (at your option) any later
               'Used when more than on statement are found on the same line.'),
     'C0322': ('Operator not preceded by a space\n%s',
               'Used when one of the following operator (!= | <= | == | >= | < '
-              '| > | = | \+= | -= | \*= | /= | %) is not preceded by a space.'),
+              '| > | = | \\+= | -= | \\*= | /= | %) is not preceded by a space.'),
     'C0323': ('Operator not followed by a space\n%s',
               'Used when one of the following operator (!= | <= | == | >= | < '
-              '| > | = | \+= | -= | \*= | /= | %) is not followed by a space.'),
+              '| > | = | \\+= | -= | \\*= | /= | %) is not followed by a space.'),
     'C0324': ('Comma not followed by a space\n%s',
               'Used when a comma (",") is not followed by a space.'),
     }
 SASTRING_RGX = r"'([^'\\]|\\.)*?'"
 # triple quoted string rgx
 TQSTRING_RGX = r'"""([^"]|("(?!"")))*?(""")'
-# triple apostrophed string rgx # FIXME english please
+# triple single-quoted string rgx
 TASTRING_RGX = r"'''([^']|('(?!'')))*?(''')"
 
 # finally, the string regular expression
      re.compile(OP_RGX_SEARCH_2, re.M),
      'C0323'),
 
-    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M), 
+    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M),
      re.compile(r',[^\s)]', re.M),
      'C0324'),
     )
 
+_PY3K = sys.version_info >= (3, 0)
 
 def get_string_coords(line):
     """return a list of string positions (tuple (start, end)) in the line
                                    expected * unit_size))
 
 
+class StringConstantChecker(BaseRawChecker):
+    """Check string literals for backslash escapes that have no effect.
+
+    The checker works on the token stream: each STRING token is split into
+    its prefix (u, b, r...) and its body, and the body of every non-raw
+    string is scanned for backslashes that are not followed by a recognised
+    escape character.
+    """
+
+    msgs = {
+        'W1401': ('Anomalous backslash in string: \'%s\'. '
+                  'String constant might be missing an r prefix.',
+                  'Used when a backslash is in a literal string but not as an '
+                  'escape.'),
+        'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
+                  'String constant might be missing an r or u prefix.',
+                  'Used when an escape like \\u is encountered in a byte '
+                  'string where it has no effect.'),
+        }
+    name = 'string_constant'
+    __implements__ = (IRawChecker, IASTNGChecker)
+
+    # Characters that have a special meaning after a backslash in either
+    # Unicode or byte strings.  The digits 0-7 introduce octal escapes such
+    # as '\0' or '\177'; without them every octal escape would be reported
+    # as W1401.
+    #
+    # NOTE(review): 'o' does not introduce an escape sequence in Python's
+    # lexical grammar.  It is kept here only because the functional test
+    # input for this checker lists '\o123' among the valid escapes; consider
+    # dropping it together with that test case.
+    ESCAPE_CHARACTERS = 'abfnrtvox01234567\n\r\t\\\'\"'
+
+    # Characters that have a special meaning after a backslash but only in
+    # Unicode strings.
+    UNICODE_ESCAPE_CHARACTERS = 'uUN'
+
+    def process_tokens(self, tokens):
+        """check every string token found in the token stream.
+
+        tokens: iterable of 5-tuples as produced by the tokenize module;
+        only entries whose type is tokenize.STRING are examined.
+        """
+        for (tok_type, token, (start_row, start_col), _, _) in tokens:
+            if tok_type == tokenize.STRING:
+                # 'token' is the whole un-parsed token; we can look at the start
+                # of it to see whether it's a raw or unicode string etc.
+                self.process_string_token(token, start_row, start_col)
+
+    def process_string_token(self, token, start_row, start_col):
+        """split a string token into prefix and body, then check the body.
+
+        token: the un-parsed source text of the literal, including any
+        prefix letters and the surrounding quotes.
+        start_row: integer line number of the token in the source.
+        start_col: integer column number of the token in the source.
+        """
+        # Everything before the first quote character is the prefix.
+        for i, c in enumerate(token):
+            if c in '\'\"':
+                quote_char = c
+                break
+        prefix = token[:i].lower()  #  markers like u, b, r.
+        after_prefix = token[i:]
+        # Triple-quoted literals start and end with three quote characters;
+        # anything else is delimited by a single quote on each side.
+        if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
+            string_body = after_prefix[3:-3]
+        else:
+            string_body = after_prefix[1:-1]  # Chop off quotes
+        # No special checks on raw strings at the moment.
+        if 'r' not in prefix:
+            self.process_non_raw_string_token(prefix, string_body,
+                start_row, start_col)
+
+    def process_non_raw_string_token(self, prefix, string_body, start_row,
+        start_col):
+        """check for bad escapes in a non-raw string.
+
+        prefix: lowercase string of eg 'ur' string prefix markers.
+        string_body: the un-parsed body of the string, not including the quote
+        marks.
+        start_row: integer line number in the source.
+        start_col: integer column number in the source.
+        """
+        # Walk through the string; if we see a backslash then escape the next
+        # character, and skip over it.  If we see a non-escaped character,
+        # alert, and continue.
+        #
+        # Accept a backslash when it escapes a backslash, or a quote, or
+        # end-of-line, or one of the characters that introduce a special
+        # escape sequence
+        # <http://docs.python.org/reference/lexical_analysis.html>
+        #
+        # TODO(mbp): Maybe give a separate warning about the rarely-used
+        # \a \b \v \f?
+        #
+        # TODO(mbp): We could give the column of the problem character, but
+        # add_message doesn't seem to have a way to pass it through at present.
+        i = 0
+        while True:
+            i = string_body.find('\\', i)
+            if i == -1:
+                break
+            # There must be a next character; having a backslash at the end
+            # of the string would be a SyntaxError.
+            next_char = string_body[i+1]
+            match = string_body[i:i+2]
+            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
+                if 'u' in prefix:
+                    pass
+                elif _PY3K and 'b' not in prefix:
+                    pass  # unicode by default
+                else:
+                    self.add_message('W1402', line=start_row, args=(match, ))
+            elif next_char not in self.ESCAPE_CHARACTERS:
+                self.add_message('W1401', line=start_row, args=(match, ))
+            # Whether it was a valid escape or not, backslash followed by
+            # another character can always be consumed whole: the second
+            # character can never be the start of a new backslash escape.
+            i += 2
+
+
 def register(linter):
     """required method to auto register this checker """
     linter.register_checker(FormatChecker(linter))
+    linter.register_checker(StringConstantChecker(linter))

test/input/func_excess_escapes.py

+# pylint:disable=W0105, W0511
+"""Stray backslash escapes may be missing a raw-string prefix."""
+
+__revision__ = '$Id$'
+
+# Bad escape sequences, which probably don't do what you expect.
+A = "\[\]\\"
+assert '\/' == '\\/'
+ESCAPE_BACKSLASH = '\`'
+
+# Valid escape sequences.
+NEWLINE = "\n"
+OLD_ESCAPES = '\a\b\f\n\t\r\v'
+HEX = '\xad\x0a\x0d'
+OCTAL = '\o123\o000'
+UNICODE = u'\u1234'
+HIGH_UNICODE = u'\U0000abcd'
+QUOTES = '\'\"'
+LITERAL_NEWLINE = '\
+'
+ESCAPE_UNICODE = "\\\\n"
+
+# Bad docstring
+"""Even in a docstring
+
+You shouldn't have ambiguous text like: C:\Program Files\alpha
+"""
+
+# Would be valid in Unicode, but probably not what you want otherwise
+BAD_UNICODE = '\u0042'
+BAD_LONG_UNICODE = '\U00000042'
+BAD_NAMED_UNICODE = '\N{GREEK SMALL LETTER ALPHA}'
+
+GOOD_UNICODE = u'\u0042'
+GOOD_LONG_UNICODE = u'\U00000042'
+GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}'
+
+
+# Valid raw strings
+RAW_BACKSLASHES = r'raw'
+RAW_UNICODE = ur"\u0062\n"
+
+# In a comment you can have whatever you want: \ \\ \n \m
+# even things that look like bad strings: "C:\Program Files"

test/messages/func_excess_escapes.txt

+W:  7: Anomalous backslash in string: '\['. String constant might be missing an r prefix.
+W:  7: Anomalous backslash in string: '\]'. String constant might be missing an r prefix.
+W:  8: Anomalous backslash in string: '\/'. String constant might be missing an r prefix.
+W:  9: Anomalous backslash in string: '\`'. String constant might be missing an r prefix.
+W: 24: Anomalous backslash in string: '\P'. String constant might be missing an r prefix.
+W: 30: Anomalous Unicode escape in byte string: '\u'. String constant might be missing an r or u prefix.
+W: 31: Anomalous Unicode escape in byte string: '\U'. String constant might be missing an r or u prefix.
+W: 32: Anomalous Unicode escape in byte string: '\N'. String constant might be missing an r or u prefix.
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.