1. Henrik Tunedal
  2. fdroidserver

Commits

Henrik Tunedal  committed 32e1429

Improved license recognition.

  • Participants
  • Parent commits 5f8de5d
  • Branches default

Comments (0)

Files changed (1)

File check_license.py

View file
     filename = os.path.join(metadata_root, package_id) + ".txt"
     return os.path.exists(filename)
 
-def has_source(package_id):
-    return (valid_id(package_id) and
-            os.path.isdir(os.path.join(source_root, package_id)))
+# Returns the source directory for a package, or None if there is none
+def sourcedir_for_id(package_id):
+    sourcedir = os.path.join(source_root, package_id)
+    if valid_id(package_id) and os.path.isdir(sourcedir):
+        return sourcedir
+
+def downloaded_packages():
+    return (p for p in os.listdir(source_root) if valid_id(p))
 
 # Duplicate even more functionality, this time from build.py
 def clone_repo(metadata):
         repotype = metadata["repo type"]
     except KeyError:
         raise VCSError, "invalid repository specification"
-    if not has_source(package_id):
+    if not sourcedir_for_id(package_id):
         raise VCSError, "local repo for '%s' not found'" % package_id
     source_dir = os.path.join(source_root, package_id)
     try:
             package_dir = os.path.join(metadata_root, package)
             metadata = parse_package_metadata(package_dir)
             print metadata["package id"]
-            if has_source(metadata["package id"]):
+            if sourcedir_for_id(metadata["package id"]):
                 if kw.get("update", False):
                     try:
                         update_repo(metadata)
 # And now for the actually useful functions... #
 ################################################
 
-# Try to guess if a file is binary, using Mercurial's definition: does
-# it contain NUL bytes?
-def file_is_binary(filename):
-    f = open(filename, "r")
-    data = f.read(1024)
-    while data:
-        for c in data:
-            if c == "\0":
-                f.close()
-                return True
-        data = f.read(1024)
-    f.close()
-    return False
-
-# Is the file so tiny that it hopefully is not a Copyrighted Work?
-def file_is_tiny(filename):
-    info = os.stat(filename)
-    return info.st_size < 512
-
-def grep_snippets(filename, snippets, regex=False):
+def grep_snippets(data, snippets, regex=False):
     max_distance = 800
     if not isinstance(snippets, list):
         raise ValueError, "snippets should be a list"
-    data = open(filename, "r").read(8192)
     if regex:
         remaining = data
         for snippet in snippets:
             pos = match
     return True
 
-def file_is_gplv2(filename):
-    return grep_snippets(
-        filename,
-        ["This program is free software",
-         "under the terms of the GNU General Public License",
-         "version 2"])
+# Remove line-breaks, punctuation and other noise from the license notice.
+# TODO: detect and remove SGML tags.
+def normalize_license_notice(text):
+    result = []
+    collapse = False
+    for character in text:
+        c = ord(character)
+        # Preserve ASCII letters and digits
+        if 48 <= c <= 57 or 97 <= c <= 122: # digit or lowercase letter
+            result.append(character)
+            collapse = False
+        elif 65 <= c <= 90:     # uppercase letter
+            result.append(character.lower())
+            collapse =  False
+        elif not collapse and c in (9, 10, 13, 32): # collapse spaces
+            result.append(" ")
+            collapse = True
+    return "".join(result)
 
-def file_is_gplv2later(filename):
-    return grep_snippets(
-        filename,
-        ["the terms of the GNU General Public License",
-         "Free Software Foundation[;,] either version 2",
-         "\(at your option\) any later version"],
-        True)
+# Try to guess if a file is binary, using Mercurial's definition: does
+# it contain NUL bytes?
+def file_is_binary(f, data, text):
+    return data.find("\0") != -1
 
-def file_is_gplv3(filename, or_later=False):
-    return grep_snippets(
-        filename,
-        ["the terms of the GNU General Public License",
-         "Free Software",
-         "version 3"])
+# Is the file so tiny that it hopefully is not a Copyrighted Work?
+def file_is_tiny(f, data, text):
+    return os.fstat(f.fileno()).st_size < 512
 
-def file_is_gplv3later(filename):
-    return grep_snippets(
-        filename,
-        ["the terms of the GNU General Public License",
-         "Free Software",
-         "either version 3 of the License",
-         "(at your option) any later"])
+def file_is_gpl_plus(f, data, text, version):
+    return text.find(
+        "is free software you can redistribute "
+        "it andor modify it under the terms of the gnu "
+        "general public license as published by the free "
+        "software foundation either version %i of the license "
+        "or at your option any later version" % version) != -1
 
-def file_is_apache2(filename):
-    data = open(filename, "r").read(8192)
-    return (data.find("Licensed under the Apache License, Version 2.") != -1
+def file_is_gpl(f, data, text, version):
+    part1 = text.find(
+        "is free software you can redistribute "
+        "it andor modify it under the terms of the gnu "
+        "general public license")
+    part2 = text.find("version %i" % version, part1)
+    part3 = text.find("as published by the free software foundation", part1)
+    return (-1 not in (part1, part2, part3)        # all parts found
+             and max(part2, part3) - part1 < 800)  # fairly close together
+
+def file_is_apache2(f, data, text):
+    return (text.find("licensed under the apache license version 2") != -1
             or data.find("http://www.apache.org/licenses/LICENSE-2.0") != -1)
 
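+# New BSD license, without the problematic advertising clause.
+# NOTE: only the intro and clauses 1 and 2 are matched below; c3 (the
+# no-endorsement clause) is defined but never checked.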
+def file_is_bsd(f, data, text):
+    intro = ("redistribution and use in source and binary forms with "
+             "or without modification are permitted provided that the "
+             "following conditions are met")
+    c1 = ("1 redistributions of source code must retain the above "
+          "copyright notice this list of conditions and the following "
+          "disclaimer")
+    c2 = ("2 redistributions in binary form must reproduce the above "
+          "copyright notice this list of conditions and the following "
+          "disclaimer in the documentation andor other materials provided "
+          "with the distribution")
+    c3 = ("3 the name of the author may not be used to endorse or promote "
+          "products derived from this software without specific prior "
+          "written permission")
+    start = 0
+    for p in (intro, c1, c2):
+        pos = text.find(p)
+        if pos == -1 or (start > 0 and pos - start > 100):
+            return False
+        start = pos + len(p)
+    else:
+        return True
+
+# MIT license: matches the standard permission notice
+def file_is_mit(f, data, text):
+    mit = ("permission is hereby granted free of charge to any person "
+           "obtaining a copy of this software and associated documentation "
+           "files the software to deal in the software without restriction "
+           "including without limitation the rights to use copy modify "
+           "merge publish distribute sublicense andor sell copies of the "
+           "software and to permit persons to whom the software is "
+           "furnished to do so subject to the following conditions "
+           "the above copyright notice and this permission notice shall be "
+           "included in all copies or substantial portions of the software")
+    return text.find(mit) != -1
+
 # Not really a license, so it's impossible to find all the ways people
 # try to put things in the public domain.
-def file_is_public_domain(filename):
-    data = open(filename, "r").read(16384)
-    for s in ["Not copyrighted -- provided to the public domain",
-              "I am placing this code in the Public Domain.",
-              "This file has been placed in the public domain by the authors.",
-              "This script is public domain, no copyright.",
-              "This code is explicitly placed into the public domain.",
-              "This file is put in the public domain."]:
-        if data.find(s) != -1:
+def file_is_public_domain(f, data, text):
+    for s in ["not copyrighted provided to the public domain",
+              "i am placing this code in the public domain",
+              "this file has been placed in the public domain by the authors",
+              "this script is public domain no copyright",
+              "this code is explicitly placed into the public domain",
+              "this file is put in the public domain"]:
+        if text.find(s) != -1:
             return True
     else:
         return False
 
 # XML files can probably be ignored, assuming that no Android developer
 # is crazy enough to code in XSLT.
-def file_is_xml(filename):
-    return filename.endswith(".xml") and grep_snippets(filename, ["<?xml "])
+def file_is_xml(f, data, text):
+    return f.name.endswith(".xml") and data.find("<?xml ") != -1
 
 # Determine if a directory looks like the main project directory with
 # the customary layout.
         else:
             yield (root, dirs, files)
 
-def check_package(package_id, apps_only=False):
+# TODO:
+# * Prioritize the license found at the earliest point in the file?
+# * Find conflicting licenses.
+def check_file(filename):
     tests = [("binary", file_is_binary),
-             ("GPLv2+", file_is_gplv2later),
-             ("GPLv2", file_is_gplv2),
-             ("GPLv3+", file_is_gplv3later),
-             ("GPLv3", file_is_gplv3),
+             ("GPLv2+", lambda f, d, t: file_is_gpl_plus(f, d, t, 2)),
+             ("GPLv2", lambda f, d, t: file_is_gpl(f, d, t, 2)),
+             ("GPLv3+", lambda f, d, t: file_is_gpl_plus(f, d, t, 3)),
+             ("GPLv3", lambda f, d, t: file_is_gpl(f, d, t, 3)),
+             ("MIT", file_is_mit),
+             ("BSD", file_is_bsd),
              ("Apache2", file_is_apache2),
              ("public domain", file_is_public_domain),
              ("XML", file_is_xml),
              ("tiny", file_is_tiny)]
+    f = open(filename, "r")
+    initial_data = f.read(8192)
+    text = normalize_license_notice(initial_data)
+    for test, testfunc in tests:
+        if testfunc(f, initial_data, text):
+            return test
+
+def check_package(package_id, apps_only=False):
     results = {}
     for root, dirs, files in walk_package_dirs(package_id, apps_only):
         for fil in files:
             filename = os.path.join(root, fil)
-            for test, testfunc in tests:
-                if testfunc(filename):
-                    results.setdefault(test, []).append(filename)
-                    break
-            else:
-                results.setdefault("unknown", []).append(filename)
+            license_type = check_file(filename) or "unknown"
+            results.setdefault(license_type, []).append(filename)
     return results
 
 # Packages can be specified in three equivalent forms:
         clone_everything(packages, update=args.update)
     elif args.list:
         license_type = args.list.lower()
-        for package in (packages or os.listdir(source_root)):
+        for package in (packages or downloaded_packages()):
             meta = metadata_for_id(package)
             results = check_package(meta["package id"], args.apps)
             for k in results.keys():
             for fil in results[key]:
                 print fil
     else:
-        for package in (packages or os.listdir(source_root)):
+        for package in (packages or downloaded_packages()):
             meta = metadata_for_id(package)
             print "%s (%s)" % (meta["package id"], meta["license"])
             res = check_package(meta["package id"], args.apps).items()