mirror of
https://git.yoctoproject.org/poky
synced 2026-04-28 06:32:34 +02:00
Add a function for finding licenses in a directory or upwards but not above a top directory. (From OE-Core rev: c5c3f7397e62e6e4be6b6fe611317a2f5f853a04) Signed-off-by: Christian Lindeberg <christian.lindeberg@axis.com> Signed-off-by: Mathieu Dubois-Briand <mathieu.dubois-briand@bootlin.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
205 lines
7.4 KiB
Python
205 lines
7.4 KiB
Python
#
|
|
# Copyright OpenEmbedded Contributors
|
|
#
|
|
# SPDX-License-Identifier: GPL-2.0-only
|
|
#
|
|
|
|
import fnmatch
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
import bb
|
|
import bb.utils
|
|
|
|
logger = logging.getLogger("BitBake.OE.LicenseFinder")
|
|
|
|
def _load_hash_csv(d):
    """
    Build a {checksum: license name} dictionary from every
    files/license-hashes.csv found below the entries of BBPATH.

    The entries of BBPATH are visited in order, so when the same checksum
    is listed more than once the value read last is the one that is kept.
    """
    import csv

    columns = ['md5sum', 'license']
    known = {}

    for layer in d.getVar('BBPATH').split(':'):
        candidate = os.path.join(layer, 'files', 'license-hashes.csv')
        if not os.path.isfile(candidate):
            # This BBPATH entry does not ship a hash list
            continue
        with open(candidate, newline='') as handle:
            records = csv.DictReader(handle, delimiter=',', fieldnames=columns)
            known.update((record['md5sum'], record['license']) for record in records)

    return known
|
|
|
|
|
|
def _crunch_known_licenses(d):
    """
    Calculate the MD5 checksums for the original and "crunched" versions of all
    known licenses.

    The license directories are COMMON_LICENSE_DIR followed by the
    whitespace-separated entries of LICENSE_PATH. Returns a dictionary
    mapping each checksum to the file name of the license it was computed
    from.
    """
    md5sums = {}

    lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
    for lic_dir in lic_dirs:
        if not lic_dir:
            # An unset variable must not degrade into os.listdir(None),
            # which would silently list the current working directory.
            continue
        for fn in os.listdir(lic_dir):
            path = os.path.join(lic_dir, fn)
            if not os.path.isfile(path):
                # Only regular files can be hashed; skip sub-directories
                continue
            # Hash the exact contents
            md5sums[bb.utils.md5_file(path)] = fn
            # Also hash a "crunched" version. _crunch_license() returns None
            # when the text cannot be hashed; storing that under the key None
            # would make every other unhashable file look like this license.
            crunched = _crunch_license(path)
            if crunched is not None:
                md5sums[crunched] = fn

    return md5sums
|
|
|
|
|
|
def _crunch_license(licfile):
    '''
    Compute the md5sum of a license file after discarding the parts that do
    not materially change the license: copyright/author lines, decorative
    headers, tags, bare URLs, title and "licensed under" statements, comment
    markers, redundant whitespace and quoting/bracket variations.

    This makes licenses that embed a copyright statement, or that have been
    slightly reformatted, hash to the same value as the reference text.

    Returns the hex digest, or None if the remaining text cannot be encoded
    as UTF-8.
    '''

    import oe.utils

    # Note: these are carefully constructed!
    license_title_re = re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
    license_statement_re = re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
    disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
    email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
    header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
    tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
    url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

    # A line matching any of these is an opening statement and is dropped
    droppable = (
        copyright_re,
        disclaimer_re,
        email_re,
        header_re,
        tag_re,
        url_re,
        license_title_re,
        license_statement_re,
    )

    # Smart quotes, double quotes and backticks all become single quotes,
    # and curly brackets become square brackets.
    char_map = str.maketrans({
        u"\u2018": "'",
        u"\u2019": "'",
        u"\u201c": "'",
        u"\u201d": "'",
        '"': "'",
        '`': "'",
        "{": "[",
        "}": "]",
    })

    kept = []
    with open(licfile, 'r', errors='surrogateescape') as f:
        for raw in f:
            if any(pattern.match(raw) for pattern in droppable):
                continue
            # Strip comment symbols
            text = raw.replace('*', '').replace('#', '')
            # Unify spelling
            text = text.replace('sub-license', 'sublicense')
            # Squash spaces
            text = oe.utils.squashspaces(text.strip())
            # Unify quotes and brackets
            text = text.translate(char_map)
            if text:
                kept.append(text)

    try:
        return hashlib.md5(' '.join(kept).encode('utf-8')).hexdigest()
    except UnicodeEncodeError:
        return None
|
|
|
|
|
|
def find_license_files(srctree, first_only=False, bottom=""):
    """
    Search srctree for files that look like they could be licenses.

    If first_only is True, only return the first file found.

    If bottom is not empty, start at bottom and continue upwards to the top
    (srctree), looking only at the files directly inside each directory on
    the way. Nothing is searched when bottom is not srctree itself or a
    directory below it. In this mode both paths are made absolute, so the
    returned paths are absolute too.

    If bottom is empty, the whole tree below srctree is walked top-down.

    Returns a list of paths; each matching file is listed exactly once.
    """
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")

    def _looks_like_license(name):
        # One matching pattern is enough; stopping there also guarantees a
        # file matching several patterns (e.g. LICENSE-GPL) is reported once.
        if name.endswith(skip_extensions):
            return False
        return any(fnmatch.fnmatch(name, spec) for spec in licspecs)

    licfiles = []

    if bottom:
        # Normalise both ends so that trailing slashes cannot make the top
        # directory unreachable or make a directory be scanned twice.
        top = os.path.abspath(srctree)
        srcdir = os.path.abspath(bottom)
        # Compare whole path components: "/src/foobar" is not inside "/src/foo"
        top_prefix = top if top.endswith(os.sep) else top + os.sep
        while srcdir == top or srcdir.startswith(top_prefix):
            with os.scandir(srcdir) as it:
                names = sorted(entry.name for entry in it if entry.is_file())
            for name in names:
                if _looks_like_license(name):
                    licfiles.append(os.path.join(srcdir, name))
                    if first_only:
                        return licfiles
            parent = os.path.dirname(srcdir)
            if parent == srcdir:
                # Reached the filesystem root; dirname() would loop forever
                break
            srcdir = parent
        return licfiles

    for root, dirs, files in os.walk(srctree):
        # Visit sub-directories in a stable order so the result (and the
        # file picked by first_only) does not depend on the filesystem.
        dirs.sort()
        # Sort files so that LICENSE is before LICENSE.subcomponent, which is
        # meaningful if first_only is set.
        for fn in sorted(files):
            if _looks_like_license(fn):
                licfiles.append(os.path.join(root, fn))
                if first_only:
                    return licfiles

    return licfiles
|
|
|
|
|
|
def match_licenses(licfiles, srctree, d, extra_hashes=None):
    """
    Identify the license of each file in licfiles by checksum.

    The table of known checksums is assembled from, in increasing priority,
    the license-hashes.csv files, the known license texts and extra_hashes
    (an optional {checksum: license name} dictionary). Each file is looked
    up by the checksum of its exact contents first and, failing that, by the
    checksum of its "crunched" text. Files that cannot be identified are
    reported as 'Unknown' and a hint on how to register them is logged.

    Returns a list of (license name, path relative to srctree, checksum of
    the exact contents) tuples, ordered by the sorted input paths.
    """
    md5sums = {}
    md5sums.update(_load_hash_csv(d))
    md5sums.update(_crunch_known_licenses(d))
    if extra_hashes:
        md5sums.update(extra_hashes)

    licenses = []
    for licfile in sorted(licfiles):
        resolved_licfile = d.expand(licfile)
        md5value = bb.utils.md5_file(resolved_licfile)
        license_name = md5sums.get(md5value)
        if not license_name:
            crunched_md5 = _crunch_license(resolved_licfile)
            # None means the crunched text could not be hashed; it must not
            # be used as a lookup key or unrelated files would match.
            if crunched_md5 is not None:
                license_name = md5sums.get(crunched_md5)
            if not license_name:
                license_name = 'Unknown'
                logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
                            "and replace `Unknown` with the license:\n"
                            "%s,Unknown",
                            os.path.relpath(licfile, srctree + "/.."), md5value)

        licenses.append((license_name, os.path.relpath(licfile, srctree), md5value))

    return licenses
|
|
|
|
|
|
def find_licenses(srctree, d, first_only=False, extra_hashes={}):
    """
    Locate the license files in srctree and identify them.

    first_only is handed to find_license_files() and extra_hashes to
    match_licenses(), whose result is returned unchanged.
    """
    # FIXME should we grab at least one source file with a license header and add that too?
    return match_licenses(find_license_files(srctree, first_only), srctree, d, extra_hashes)
|
|
|
|
|
|
def find_licenses_up(srcdir, topdir, d, first_only=False, extra_hashes={}):
    """
    Locate license files starting at srcdir and continuing upwards to
    topdir, then identify them.

    The paths in the result of match_licenses() are relative to topdir.
    """
    candidates = find_license_files(topdir, first_only, srcdir)
    matched = match_licenses(candidates, topdir, d, extra_hashes)
    return matched
|