Commits

Ry4an Brase committed 19502af

Add --ignore-imports option to similarity checking. Closes #106534.

Additionally:
- add access to existing --ignore-docstrings option to symilar command line
- add access to new --ignore-imports option to symilar command line
- add test for existing --ignore-docstrings feature
- add test for new --ignore-imports feature

  • Participants
  • Parent commits 7a1e32a

Comments (0)

Files changed (5)

 ====================
 
 --
+    * #106534: add --ignore-imports option to code similarity checking
+      and 'symilar' command line tool (patch by Ry4an Brase)
+
     * #104571: check for anomalous backslash escape, introducing new
       W1401 and W1402 messages (patch by Martin Pool)
 

checkers/similar.py

     """finds copy-pasted lines of code in a project"""
 
     def __init__(self, min_lines=4, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.min_lines = min_lines
         self.ignore_comments = ignore_comments
         self.ignore_docstrings = ignore_docstrings
+        self.ignore_imports = ignore_imports
         self.linesets = []
 
     def append_stream(self, streamid, stream):
         self.linesets.append(LineSet(streamid,
                                      stream.readlines(),
                                      self.ignore_comments,
-                                     self.ignore_docstrings))
+                                     self.ignore_docstrings,
+                                     self.ignore_imports))
 
     def run(self):
         """start looking for similarities and display results on stdout"""
                 for sim in self._find_common(lineset, lineset2):
                     yield sim
 
-def stripped_lines(lines, ignore_comments, ignore_docstrings):
+def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
+    """return lines with leading/trailing whitespace and any ignored code
+    features removed
+    """
+
     strippedlines = []
     docstring = None
     for line in lines:
                 if line.endswith(docstring):
                     docstring = None
                 line = ''
+        if ignore_imports:
+            if line.startswith("import ") or line.startswith("from "):
+                line = ''
         if ignore_comments:
             # XXX should use regex in checkers/format to avoid cutting
             # at a "#" in a string
 class LineSet:
     """Holds and indexes all the lines of a single source file"""
     def __init__(self, name, lines, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.name = name
         self._real_lines = lines
         self._stripped_lines = stripped_lines(lines, ignore_comments,
-                                              ignore_docstrings)
+                                              ignore_docstrings,
+                                              ignore_imports)
         self._index = self._mk_index()
 
     def __str__(self):
                 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                  'help': 'Ignore docstrings when computing similarities.'}
                 ),
+               ('ignore-imports',
+                {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore imports when computing similarities.'}
+                ),
                )
     # reports
     reports = ( ('RP0801', 'Duplication', report_similarities), )
             self.ignore_comments = self.config.ignore_comments
         elif optname == 'ignore-docstrings':
             self.ignore_docstrings = self.config.ignore_docstrings
+        elif optname == 'ignore-imports':
+            self.ignore_imports = self.config.ignore_imports
 
     def open(self):
         """init the checkers: reset linesets and statistics information"""
     print "finds copy pasted blocks in a set of files"
     print
     print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
-[-i|--ignore-comments] file1...'
+[-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...'
     sys.exit(status)
 
 def Run(argv=None):
         argv = sys.argv[1:]
     from getopt import getopt
     s_opts = 'hdi'
-    l_opts = ('help', 'duplicates=', 'ignore-comments')
+    l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
+              'ignore-docstrings')
     min_lines = 4
     ignore_comments = False
+    ignore_docstrings = False
+    ignore_imports = False
     opts, args = getopt(argv, s_opts, l_opts)
     for opt, val in opts:
         if opt in ('-d', '--duplicates'):
             usage()
         elif opt in ('-i', '--ignore-comments'):
             ignore_comments = True
+        elif opt in ('--ignore-docstrings'):
+            ignore_docstrings = True
+        elif opt in ('--ignore-imports'):
+            ignore_imports = True
     if not args:
         usage(1)
-    sim = Similar(min_lines, ignore_comments)
+    sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
     for filename in args:
         sim.append_stream(filename, open(filename))
     sim.run()

test/input/similar1

-this file is used
-to check the similar 
-command line tool
+import one
+from two import two
+three
+four
+five
+six # comments optionally ignored
+seven
+eight
+nine
+''' ten
+eleven
+twelve '''
+thirteen
+fourteen
+fifteen
 
-see the similar2 file which is almost the
-same file as this one. 
-more than 4
-identical lines should
-be # ignore comments !
-detected
 
 
-héhéhéh
 
-
-
-
-
-Yo !
+sixteen
+seventeen
+eighteen

test/input/similar2

-this file is used
-to check the similar 
-command line tool
+import one
+from two import two
+three
+four
+five
+six
+seven
+eight
+nine
+''' ten
+ELEVEN
+twelve '''
+thirteen
+fourteen
+FIFTEEN
 
-see the similar1 file which is almost the
-same file as this one. 
-more than 4
-identical lines should
-be
-detected
 
 
-hohohoh
 
-
-
-
-
-Yo !
+sixteen
+seventeen
+eighteen

test/test_similar.py

         finally:
             sys.stdout = sys.__stdout__
         self.assertMultiLineEqual(output.strip(), ("""
-7 similar lines in 2 files
-==%s:5
-==%s:5
-   same file as this one. 
-   more than 4
-   identical lines should
-   be
-   detected
-   
-   
-TOTAL lines=38 duplicates=7 percent=18.42
+10 similar lines in 2 files
+==%s:0
+==%s:0
+   import one
+   from two import two
+   three
+   four
+   five
+   six
+   seven
+   eight
+   nine
+   ''' ten
+TOTAL lines=44 duplicates=10 percent=22.73
 """ % (SIMILAR1, SIMILAR2)).strip())
 
 
-    def test_dont_ignore_comments(self):
+    def test_ignore_docsrings(self):
+        sys.stdout = StringIO()
+        try:
+            similar.Run(['--ignore-docstrings', SIMILAR1, SIMILAR2])
+        except SystemExit, ex:
+            self.assertEqual(ex.code, 0)
+            output = sys.stdout.getvalue()
+        else:
+            self.fail('not system exit')
+        finally:
+            sys.stdout = sys.__stdout__
+        self.assertMultiLineEqual(output.strip(), ("""
+8 similar lines in 2 files
+==%s:6
+==%s:6
+   seven
+   eight
+   nine
+   ''' ten
+   ELEVEN
+   twelve '''
+   thirteen
+   fourteen
+
+5 similar lines in 2 files
+==%s:0
+==%s:0
+   import one
+   from two import two
+   three
+   four
+   five
+TOTAL lines=44 duplicates=13 percent=29.55
+""" % ((SIMILAR1, SIMILAR2) * 2)).strip())
+
+
+    def test_ignore_imports(self):
+        sys.stdout = StringIO()
+        try:
+            similar.Run(['--ignore-imports', SIMILAR1, SIMILAR2])
+        except SystemExit, ex:
+            self.assertEqual(ex.code, 0)
+            output = sys.stdout.getvalue()
+        else:
+            self.fail('not system exit')
+        finally:
+            sys.stdout = sys.__stdout__
+        self.assertMultiLineEqual(output.strip(), """
+TOTAL lines=44 duplicates=0 percent=0.00
+""".strip())
+
+
+    def test_ignore_nothing(self):
         sys.stdout = StringIO()
         try:
             similar.Run([SIMILAR1, SIMILAR2])
             self.fail('not system exit')
         finally:
             sys.stdout = sys.__stdout__
-        self.assertMultiLineEqual(output.strip(), """
-TOTAL lines=38 duplicates=0 percent=0.00
-        """.strip())
+        self.assertMultiLineEqual(output.strip(), ("""
+5 similar lines in 2 files
+==%s:0
+==%s:0
+   import one
+   from two import two
+   three
+   four
+   five
+TOTAL lines=44 duplicates=5 percent=11.36
+""" % (SIMILAR1, SIMILAR2)).strip())
 
     def test_help(self):
         sys.stdout = StringIO()