1. Ian George
  2. pylint


pylint / pylint / checkers / similar.py

# pylint: disable=W0622
# Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
"""a similarities / code duplication command line tool and pylint checker
from __future__ import generators

import sys
from itertools import izip

from logilab.common.ureports import Table

from pylint.interfaces import IRawChecker
from pylint.checkers import BaseChecker, table_lines_from_stats

class Similar:
    """finds copy-pasted lines of code in a project

    Files are registered with `append_stream`, then `run` compares every
    pair of registered files and prints the runs of identical stripped lines
    holding more than `min_lines` non-blank lines.
    """

    def __init__(self, min_lines=4, ignore_comments=False,
                 ignore_docstrings=False):
        """store the detection settings and start with no registered file"""
        self.min_lines = min_lines
        self.ignore_comments = ignore_comments
        self.ignore_docstrings = ignore_docstrings
        self.linesets = []

    def append_stream(self, streamid, stream):
        """append a file to search for similarities

        `streamid` is the name given to the resulting line set and `stream`
        is a seekable object implementing the `readlines` method
        """
        stream.seek(0) # XXX may be removed with astng > 0.23
        self.linesets.append(LineSet(streamid, stream.readlines(),
                                     self.ignore_comments,
                                     self.ignore_docstrings))

    def run(self):
        """start looking for similarities and display results on stdout"""
        self._display_sims(self._compute_sims())

    def _compute_sims(self):
        """compute similarities in appended files

        return a list of (number of similar lines, set of (lineset, start
        index) couples sharing these lines) tuples
        """
        no_duplicates = {}
        for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
            duplicate = no_duplicates.setdefault(num, [])
            for couples in duplicate:
                if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
                    # one of the positions is already known: extend its group
                    couples.add( (lineset1, idx1) )
                    couples.add( (lineset2, idx2) )
                    break
            else:
                # no existing group holds these positions: start a new one
                duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
        sims = []
        for num, ensembles in no_duplicates.items():
            for couples in ensembles:
                sims.append( (num, couples) )
        return sims

    def _display_sims(self, sims):
        """display computed similarities on stdout

        `sims` is the list returned by `_compute_sims`
        """
        nb_duplicated = 0
        for num, couples in sims:
            print("%s similar lines in %s files" % (num, len(couples)))
            couples = sorted(couples)
            for lineset, idx in couples:
                print("==%s:%s" % (lineset.name, idx))
            # the duplicated lines are shown only once, read from the last
            # couple of the group (hence the use of the loop variables)
            # pylint: disable=W0631
            for line in lineset._real_lines[idx:idx+num]:
                sys.stdout.write("   " + line)
                if not line.endswith("\n"):
                    # last line of a file without a trailing newline
                    sys.stdout.write("\n")
            nb_duplicated += num * (len(couples)-1)
        nb_total = sum([len(lineset) for lineset in self.linesets])
        # `nb_total and ...` yields 0 instead of raising ZeroDivisionError
        # when no line has been registered
        percent = nb_total and nb_duplicated*100. / nb_total
        print("TOTAL lines=%s duplicates=%s percent=%.2f"
              % (nb_total, nb_duplicated, percent))

    def _find_common(self, lineset1, lineset2):
        """find similarities in the two given linesets

        yield (number of lines, lineset1, start index in lineset1, lineset2,
        start index in lineset2) for each common run holding more than
        `min_lines` non-blank lines
        """
        lines1 = lineset1.enumerate_stripped
        lines2 = lineset2.enumerate_stripped
        find = lineset2.find
        index1 = 0
        min_lines = self.min_lines
        while index1 < len(lineset1):
            skip = 1
            num = 0
            # try every position of lineset2 holding the current line
            for index2 in find( lineset1[index1] ):
                non_blank = 0
                for num, ((_, line1), (_, line2)) in enumerate(
                    izip(lines1(index1), lines2(index2))):
                    if line1 != line2:
                        # end of the common run: `num` lines were identical
                        if non_blank > min_lines:
                            yield num, lineset1, index1, lineset2, index2
                        skip = max(skip, num)
                        break
                    if line1:
                        non_blank += 1
                else:
                    # we may have reach the end
                    num += 1
                    if non_blank > min_lines:
                        yield num, lineset1, index1, lineset2, index2
                    skip = max(skip, num)
            # jump over the lines already matched from this position
            index1 += skip

    def _iter_sims(self):
        """iterate on similarities among all files, by making a cartesian
        product
        """
        for idx, lineset in enumerate(self.linesets[:-1]):
            for lineset2 in self.linesets[idx+1:]:
                for sim in self._find_common(lineset, lineset2):
                    yield sim

def stripped_lines(lines, ignore_comments, ignore_docstrings):
    """return the given lines stripped of their surrounding whitespace

    One output line is produced per input line, so indexes stay aligned.
    When `ignore_docstrings` is true, the lines of a triple-quoted string
    starting a line are replaced by empty strings. When `ignore_comments`
    is true, everything from the first '#' of a line is dropped.
    """
    strippedlines = []
    # opening delimiter of the docstring being skipped, None outside of one
    docstring = None
    for line in lines:
        line = line.strip()
        if ignore_docstrings:
            if not docstring and \
                   (line.startswith('"""') or line.startswith("'''")):
                docstring = line[:3]
                # drop the opening quotes so they are not seen as the end
                line = line[3:]
            if docstring:
                if line.endswith(docstring):
                    docstring = None
                line = ''
        if ignore_comments:
            # XXX should use regex in checkers/format to avoid cutting
            # at a "#" in a string
            line = line.split('#', 1)[0].strip()
        strippedlines.append(line)
    return strippedlines

class LineSet:
    """Holds and indexes all the lines of a single source file"""
    def __init__(self, name, lines, ignore_comments=False,
                 ignore_docstrings=False):
        """keep the raw lines, their stripped version and an index mapping
        each non-empty stripped line to its positions
        """
        self.name = name
        self._real_lines = lines
        self._stripped_lines = stripped_lines(lines, ignore_comments,
                                              ignore_docstrings)
        self._index = self._mk_index()

    def __str__(self):
        return '<Lineset for %s>' % self.name

    def __len__(self):
        return len(self._real_lines)

    def __getitem__(self, index):
        # indexing gives the stripped line, not the raw one
        return self._stripped_lines[index]

    def __lt__(self, other):
        # line sets are ordered by name
        return self.name < other.name

    def __hash__(self):
        return id(self)

    def enumerate_stripped(self, start_at=0):
        """return an iterator on stripped lines, starting from a given index
        if specified, else 0

        each item is an (index, stripped line) tuple
        """
        idx = start_at
        if start_at:
            lines = self._stripped_lines[start_at:]
        else:
            # no need to copy the list when starting from the beginning
            lines = self._stripped_lines
        for line in lines:
            #if line:
            yield idx, line
            idx += 1

    def find(self, stripped_line):
        """return positions of the given stripped line in this set"""
        return self._index.get(stripped_line, ())

    def _mk_index(self):
        """create the index for this set

        empty lines are not indexed
        """
        index = {}
        for line_no, line in enumerate(self._stripped_lines):
            if line:
                index.setdefault(line, []).append( line_no )
        return index

# messages of the similarities checker: message id -> (message template,
# help text). The template is filled by SimilarChecker.close with the number
# of couples of the group and the report of the duplicated lines.
# NOTE: the backslash continuations below are inside the string literal, so
# the leading spaces of the continued lines are part of the help text.
MSGS = {'R0801': ('Similar lines in %s files\n%s',
                  'Indicates that a set of similar lines has been detected \
                  among multiple file. This usually means that the code should \
                  be refactored to avoid this duplication.')}

def report_similarities(sect, stats, old_stats):
    """make a layout with some stats about duplication

    a four columns table (label, now, previous, difference) built from the
    duplication statistics is appended to the `sect` report section
    """
    lines = ['', 'now', 'previous', 'difference']
    # the statistics reported are the ones filled by SimilarChecker.close
    lines += table_lines_from_stats(stats, old_stats,
                                    ('nb_duplicated_lines',
                                     'percent_duplicated_lines'))
    sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1))

# wrapper to get a pylint checker from the similar class
class SimilarChecker(BaseChecker, Similar):
    """checks for similarities and duplicated code. This computation may be
    memory / CPU intensive, so you should disable it if you experiment some
    problems.
    """

    __implements__ = (IRawChecker,)
    # configuration section name
    name = 'similarities'
    # messages
    msgs = MSGS
    # configuration options
    # for available dict keys/values see the optik parser 'add_option' method
    options = (('min-similarity-lines',
                {'default' : 4, 'type' : "int", 'metavar' : '<int>',
                 'help' : 'Minimum lines number of a similarity.'}),
               ('ignore-comments',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore comments when computing similarities.'}
                ),
               ('ignore-docstrings',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore docstrings when computing similarities.'}
                ),
               )
    # reports
    reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message

    def __init__(self, linter=None):
        BaseChecker.__init__(self, linter)
        Similar.__init__(self, min_lines=4,
                         ignore_comments=True, ignore_docstrings=True)
        self.stats = None

    def set_option(self, optname, value, action=None, optdict=None):
        """method called to set an option (registered in the options list)

        overridden to report options setting to Similar
        """
        BaseChecker.set_option(self, optname, value, action, optdict)
        # copy the configured value to the attribute read by Similar
        if optname == 'min-similarity-lines':
            self.min_lines = self.config.min_similarity_lines
        elif optname == 'ignore-comments':
            self.ignore_comments = self.config.ignore_comments
        elif optname == 'ignore-docstrings':
            self.ignore_docstrings = self.config.ignore_docstrings

    def open(self):
        """init the checkers: reset linesets and statistics information"""
        self.linesets = []
        self.stats = self.linter.add_stats(nb_duplicated_lines=0,
                                           percent_duplicated_lines=0)

    def process_module(self, node):
        """process a module

        the module's content is accessible via the stream object

        stream must implement the readlines method
        """
        self.append_stream(self.linter.current_name, node.file_stream)

    def close(self):
        """compute and display similarities on closing (i.e. end of parsing)"""
        total = sum([len(lineset) for lineset in self.linesets])
        duplicated = 0
        stats = self.stats
        for num, couples in self._compute_sims():
            msg = []
            for lineset, idx in couples:
                msg.append("==%s:%s" % (lineset.name, idx))
            # the duplicated lines are reported once, read from the last
            # couple iterated above (hence the use of the loop variables)
            # pylint: disable=W0631
            for line in lineset._real_lines[idx:idx+num]:
                msg.append(line.rstrip())
            self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
            duplicated += num * (len(couples) - 1)
        stats['nb_duplicated_lines'] = duplicated
        # `total and ...` gives 0 instead of a division by zero when no line
        # has been processed
        stats['percent_duplicated_lines'] = total and duplicated * 100. / total

def register(linter):
    """required method to auto register this checker """
    linter.register_checker(SimilarChecker(linter))

def usage(status=0):
    """display command line usage information, then exit the interpreter
    with `status` as exit code
    """
    print("finds copy pasted blocks in a set of files")
    print('Usage: symilar [-d|--duplicates min_duplicated_lines] \
[-i|--ignore-comments] file1...')
    sys.exit(status)

def run(argv=None):
    """standalone command line access point

    `argv` is the list of command line arguments, defaulting to
    sys.argv[1:]. Options are -h/--help, -d/--duplicates <min lines> and
    -i/--ignore-comments; remaining arguments are the files to compare.
    """
    if argv is None:
        argv = sys.argv[1:]
    from getopt import getopt
    # the colon tells getopt that -d takes a value, like --duplicates=
    s_opts = 'hd:i'
    l_opts = ('help', 'duplicates=', 'ignore-comments')
    min_lines = 4
    ignore_comments = False
    opts, args = getopt(argv, s_opts, l_opts)
    for opt, val in opts:
        if opt in ('-d', '--duplicates'):
            min_lines = int(val)
        elif opt in ('-h', '--help'):
            usage()
            # nothing to analyse when help has been asked, even if usage()
            # did not leave the interpreter
            return
        elif opt in ('-i', '--ignore-comments'):
            ignore_comments = True
    if not args:
        usage(1)
        # no file to compare: stop here even if usage() did not exit
        return
    sim = Similar(min_lines, ignore_comments)
    for filename in args:
        stream = open(filename)
        try:
            sim.append_stream(filename, stream)
        finally:
            # do not keep one descriptor open per analysed file
            stream.close()
    sim.run()

# command line entry point when the module is executed as a script
if __name__ == '__main__':
    run()