gcc/contrib/update-copyright.py

#!/usr/bin/env python3
#
# Copyright (C) 2013-2022 Free Software Foundation, Inc.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This script adjusts the copyright notices at the top of source files
# so that they have the form:
#
#   Copyright XXXX-YYYY Free Software Foundation, Inc.
#
# It doesn't change code that is known to be maintained elsewhere or
# that carries a non-FSF copyright.
#
# The script also doesn't change testsuite files, except those in
# libstdc++-v3.  This is because libstdc++-v3 has a conformance testsuite,
# while most tests in other directories are just things that failed at some
# point in the past.
#
# Pass --this-year to the script if you want it to add the current year
# to all applicable notices.  Pass --quilt if you are using quilt and
# want files to be added to the quilt before being changed.
#
# By default the script will update all directories for which the
# output has been vetted.  You can instead pass the names of individual
# directories, including those that haven't been approved.  So:
#
#    update-copyright.py --this-year
#
# is the command that would be used at the beginning of a year to update
# all copyright notices (and possibly at other times to check whether
# new files have been added with old years).  On the other hand:
#
#    update-copyright.py --this-year libitm
#
# would run the script on just libitm/.
#
# Note that things like --version output strings must be updated before
# this script is run.  There's already a separate procedure for that.

import os
import re
import sys
import time
import subprocess

class Errors:
    """Running tally of reported problems; each one is echoed to stderr."""

    def __init__ (self):
        # Number of calls made to report() so far.
        self.num_errors = 0

    def report (self, filename, string):
        """Write STRING to stderr as one line and count it as an error.

        When FILENAME is truthy the message is prefixed with
        'FILENAME: '; otherwise STRING is written on its own.
        """
        prefix = filename + ': ' if filename else ''
        sys.stderr.write (prefix + string + '\n')
        self.num_errors += 1

    def ok (self):
        """Return True while nothing has been reported."""
        return not self.num_errors

class GenericFilter:
    """Default policy deciding which files and directories are handled.

    Instances carry five sets that subclasses extend:

      skip_files        file names that are never processed
      skip_dirs         subdirectory names that are never entered
      skip_extensions   file extensions that are never processed
      fossilised_files  file names reported by is_fossilised_file
      own_files         file names reported by by_package_author
    """

    def __init__ (self):
        self.skip_dirs = set()
        self.fossilised_files = set()
        self.own_files = set()
        self.skip_extensions = {'.png', '.pyc'}
        self.skip_files = {
            # Licence files.
            'COPYING',
            'COPYING.LIB',
            'COPYING3',
            'COPYING3.LIB',
            'LICENSE',
            'LICENSE.txt',
            'fdl.texi',
            'gpl_v3.texi',
            'fdl-1.3.xml',
            'gpl-3.0.xml',

            # Auto- and libtool-related files.
            'aclocal.m4',
            'compile',
            'config.guess',
            'config.sub',
            'depcomp',
            'install-sh',
            'libtool.m4',
            'ltmain.sh',
            'ltoptions.m4',
            'ltsugar.m4',
            'ltversion.m4',
            'lt~obsolete.m4',
            'missing',
            'mkdep',
            'mkinstalldirs',
            'move-if-change',
            'shlibpath.m4',
            'symlink-tree',
            'ylwrap',

            # FSF mission statement, etc.
            'gnu.texi',
            'funding.texi',
            'appendix_free.xml',

            # Imported texinfo files.
            'texinfo.tex',
        }

    def get_line_filter (self, dir, filename):
        """Return a compiled regexp for lines to leave alone, or None.

        For files whose name starts with 'ChangeLog' the regexp matches
        lines beginning with a tab, so that references to copyright in
        changelog entries are ignored.
        """
        if not filename.startswith ('ChangeLog'):
            return None
        return re.compile ('\t')

    def skip_file (self, dir, filename):
        """Return True if DIR/FILENAME should not be processed at all."""
        if filename in self.skip_files:
            return True

        stem, suffix = os.path.splitext (os.path.join (dir, filename))
        if suffix in self.skip_extensions:
            return True

        if suffix == '.in':
            # A sibling .am file means automake produced the .in file.
            if os.path.exists (stem + '.am'):
                return True

            # Sibling .def and .tpl files mean autogen produced it.
            if all (os.path.exists (stem + ext) for ext in ('.def', '.tpl')):
                return True

        # A configure script with a sibling configure.ac or configure.in
        # was produced by autoconf.
        if filename == 'configure':
            return any (os.path.exists (stem + ext) for ext in ('.ac', '.in'))

        return False

    def skip_dir (self, dir, subdir):
        """Return True if SUBDIR of DIR is listed in skip_dirs."""
        return subdir in self.skip_dirs

    def is_fossilised_file (self, dir, filename):
        """Return True if FILENAME is listed as fossilised or is an old
        ChangeLog, i.e. contains 'ChangeLog' without being exactly
        'ChangeLog'.
        """
        if filename in self.fossilised_files:
            return True
        # Only the current ChangeLog counts as live.
        return filename != 'ChangeLog' and 'ChangeLog' in filename

    def by_package_author (self, dir, filename):
        """Return True if FILENAME is listed in own_files."""
        return filename in self.own_files

class Copyright:
    """Find copyright notices in files and rewrite them in a standard form.

    A notice is recognised by copyright_re.  If its holder was registered
    with add_package_author, the years are collapsed to a single year or
    a 'MIN-MAX' range, the holder is replaced by its canonical spelling,
    any 'by ' is dropped, and a bare 'Copyright' or 'Copyright (c)' intro
    becomes 'Copyright (C)'.  Notices whose holder was registered with
    add_external_author are left untouched.  Problems are reported through
    the ERRORS object given to the constructor.
    """

    def __init__ (self, errors):
        """ERRORS is the object used to report problems; it must provide
        report (filename, string) and ok ()."""
        self.errors = errors

        # Characters in a range of years.  Include '.' for typos.
        ranges = r'[0-9](?:[-0-9.,\s]|\s+and\s+)*[0-9]'

        # Non-whitespace characters in a copyright holder's name.
        name = r'[\w.,-]'

        # Matches one year.
        self.year_re = re.compile ('[0-9]+')

        # Matches part of a year or copyright holder.
        self.continuation_re = re.compile (ranges + '|' + name)

        # Matches a full copyright notice.  All pieces are raw strings so
        # that the regexp escapes reach the re module unchanged.
        self.copyright_re = re.compile (
            # 1: 'Copyright (C)', etc.
            r'([Cc]opyright'
            r'|[Cc]opyright\s+\([Cc]\)'
            r'|[Cc]opyright\s+%s'
            r'|[Cc]opyright\s+&copy;'
            r'|[Cc]opyright\s+@copyright{}'
            r"|copyright = u'"
            r'|@set\s+copyright[\w-]+)'

            # 2: the years.  Include the whitespace in the year, so that
            # we can remove any excess.
            r'(\s*(?:' + ranges + r',?'
            r'|@value\{[^{}]*\})\s*)'

            # 3: 'by ', if used
            r'(by\s+)?'

            # 4: the copyright holder.  Don't allow multiple consecutive
            # spaces, so that right-margin gloss doesn't get caught
            # (e.g. gnat_ugn.texi).
            r'(' + name + r'(?:\s?' + name + r')*)?')

        # A regexp for notices that might have slipped by.  Just matching
        # 'copyright' is too noisy, and 'copyright.*[0-9]' falls foul of
        # HTML header markers, so check for 'copyright' and two digits.
        self.other_copyright_re = re.compile ('copyright.*[0-9][0-9]',
                                              re.IGNORECASE)

        # Matches the comment leader at the start of a continuation line.
        self.comment_re = re.compile('#+|[*]+|;+|%+|//+|@c |dnl ')

        # Maps each known holder spelling to its canonical form, or to
        # None for holders whose notices must not be changed.
        self.holders = { '@copying': '@copying' }

        # Proper prefixes (cut at a space) of package-author spellings.
        # A holder equal to one of these may continue on the next line.
        self.holder_prefixes = set()

        # True to 'quilt add' files before changing them.
        self.use_quilt = False

        # If set, force all notices to include this year.
        self.max_year = None

        # Goes after the year(s).  Could be ', '.
        self.separator = ' '

    def add_package_author (self, holder, canon_form = None):
        """Register HOLDER as a spelling of the package author.

        Notices naming HOLDER are rewritten to use CANON_FORM, which
        defaults to HOLDER itself.  Every prefix of HOLDER that ends just
        before a space is remembered in holder_prefixes.
        """
        if not canon_form:
            canon_form = holder
        self.holders[holder] = canon_form
        index = holder.find (' ')
        while index >= 0:
            self.holder_prefixes.add (holder[:index])
            index = holder.find (' ', index + 1)

    def add_external_author (self, holder):
        """Register HOLDER as a known holder whose notices are left alone."""
        self.holders[holder] = None

    class BadYear (Exception):
        """Raised by parse_year for a year string it cannot interpret."""

        def __init__ (self, year):
            self.year = year

        def __str__ (self):
            return 'unrecognised year: ' + self.year

    def parse_year (self, string):
        """Convert the digit string STRING to a four-digit year.

        Four-digit strings are returned as integers.  Two-digit strings
        greater than 70 are taken to be in the 1900s.  Anything else
        raises BadYear.
        """
        year = int (string)
        if len (string) == 2:
            if year > 70:
                return year + 1900
        elif len (string) == 4:
            return year
        raise self.BadYear (string)

    def year_range (self, years):
        """Return (earliest, latest) of the years mentioned in YEARS."""
        year_list = [self.parse_year (year)
                     for year in self.year_re.findall (years)]
        assert len (year_list) > 0
        return (min (year_list), max (year_list))

    def set_use_quilt (self, use_quilt):
        """Set whether files are 'quilt add'ed before being replaced."""
        self.use_quilt = use_quilt

    def include_year (self, year):
        """Force notices to extend to YEAR.  May only be called once."""
        assert not self.max_year
        self.max_year = year

    def canonicalise_years (self, dir, filename, filter, years):
        """Return the canonical spelling of the year list YEARS.

        The result is a single year or 'MIN-MAX'.  If include_year was
        called and FILTER does not consider DIR/FILENAME fossilised, the
        upper bound is raised to that year.  Raises BadYear if a year
        cannot be parsed.
        """
        # Leave texinfo variables alone.
        if years.startswith ('@value'):
            return years

        (min_year, max_year) = self.year_range (years)

        # Update the upper bound, if enabled.
        if self.max_year and not filter.is_fossilised_file (dir, filename):
            max_year = max (max_year, self.max_year)

        # Use a range.
        if min_year == max_year:
            return '%d' % min_year
        else:
            return '%d-%d' % (min_year, max_year)

    def strip_continuation (self, line):
        """Return LINE without leading whitespace and comment leader."""
        line = line.lstrip()
        match = self.comment_re.match (line)
        if match:
            line = line[match.end():].lstrip()
        return line

    def is_complete (self, match):
        """Return a true value if MATCH already has its whole holder.

        The holder is incomplete when it is missing, or when it is only
        a prefix of a package-author spelling and not itself a known
        holder.
        """
        holder = match.group (4)
        return (holder
                and (holder not in self.holder_prefixes
                     or holder in self.holders))

    def update_copyright (self, dir, filename, filter, file, line, match):
        """Canonicalise the notice that MATCH found in LINE.

        FILE is the open file LINE came from; further lines are read from
        it with readline() while the notice looks incomplete.  Returns a
        tuple (CHANGED, LINE, NEXT_LINE): CHANGED says whether the text
        differs from what was read, LINE is the text to emit (the
        original text when nothing could be done) and NEXT_LINE is a
        line that was read ahead but is not part of the notice, if any.
        """
        orig_line = line
        next_line = None
        pathname = os.path.join (dir, filename)

        intro = match.group (1)
        if intro.startswith ('@set'):
            # Texinfo year variables should always be on one line
            after_years = line[match.end (2):].strip()
            if after_years != '':
                self.errors.report (pathname,
                                    'trailing characters in @set: '
                                    + after_years)
                return (False, orig_line, next_line)
        else:
            # If it looks like the copyright is incomplete, add the next line.
            while not self.is_complete (match):
                # readline() reports end-of-file by returning '', which
                # can never be a continuation.
                next_line = file.readline()
                if not next_line:
                    break

                # If the next line doesn't look like a proper continuation,
                # assume that what we've got is complete.
                continuation = self.strip_continuation (next_line)
                if not self.continuation_re.match (continuation):
                    break

                # Merge the lines for matching purposes.
                orig_line += next_line
                line = line.rstrip() + ' ' + continuation
                next_line = None

                # Rematch with the longer line, at the original position.
                match = self.copyright_re.match (line, match.start())
                assert match

            holder = match.group (4)

            # Use the filter to test cases where markup is getting in the way.
            if filter.by_package_author (dir, filename):
                assert holder not in self.holders

            elif not holder:
                self.errors.report (pathname, 'missing copyright holder')
                return (False, orig_line, next_line)

            elif holder not in self.holders:
                self.errors.report (pathname,
                                    'unrecognised copyright holder: ' + holder)
                return (False, orig_line, next_line)

            else:
                # See whether the copyright is associated with the package
                # author.
                canon_form = self.holders[holder]
                if not canon_form:
                    return (False, orig_line, next_line)

                # The edits below work from right to left, so the offsets
                # recorded in MATCH stay valid for the text not yet
                # touched.

                # Make sure the author is given in a consistent way.
                line = (line[:match.start (4)]
                        + canon_form
                        + line[match.end (4):])

                # Remove any 'by'
                if match.group (3) is not None:
                    line = line[:match.start (3)] + line[match.end (3):]

        # Update the copyright years.
        years = match.group (2).strip()
        try:
            canon_form = self.canonicalise_years (dir, filename, filter, years)
        except self.BadYear as e:
            self.errors.report (pathname, str (e))
            return (False, orig_line, next_line)

        line = (line[:match.start (2)]
                + ('' if intro.startswith ('copyright = ') else ' ')
                + canon_form + self.separator
                + line[match.end (2):])

        # Use the standard (C) form.
        if intro.endswith ('right'):
            intro += ' (C)'
        elif intro.endswith ('(c)'):
            intro = intro[:-3] + '(C)'
        line = line[:match.start (1)] + intro + line[match.end (1):]

        # Strip trailing whitespace
        line = line.rstrip() + '\n'

        return (line != orig_line, line, next_line)

    def guess_encoding (self, pathname):
        """Return the first of 'utf8' and 'iso8859' that decodes the whole
        of PATHNAME, or None if neither does."""
        for encoding in ('utf8', 'iso8859'):
            try:
                # Close the file promptly rather than leaving it to the
                # garbage collector.
                with open (pathname, 'r', encoding=encoding) as file:
                    file.read()
                return encoding
            except UnicodeDecodeError:
                pass
        return None

    def process_file (self, dir, filename, filter):
        """Update the copyright notices in DIR/FILENAME.

        Files ending in '.tmp' are taken to be leftovers from an earlier
        run and are removed.  Otherwise the file is rewritten, via a
        '.tmp' sibling that is renamed over it, but only if some notice
        changed and no error has been reported so far.
        """
        pathname = os.path.join (dir, filename)
        if filename.endswith ('.tmp'):
            # Looks like something we tried to create before.
            try:
                os.remove (pathname)
            except OSError:
                pass
            return

        lines = []
        changed = False
        line_filter = filter.get_line_filter (dir, filename)
        mode = None
        encoding = self.guess_encoding(pathname)
        with open (pathname, 'r', encoding=encoding) as file:
            mode = os.fstat (file.fileno()).st_mode
            for line in file:
                # update_copyright may hand back a line that it read
                # ahead; process that before fetching another one.
                while line:
                    next_line = None
                    # Leave filtered-out lines alone.
                    if not (line_filter and line_filter.match (line)):
                        match = self.copyright_re.search (line)
                        if match:
                            res = self.update_copyright (dir, filename, filter,
                                                         file, line, match)
                            (this_changed, line, next_line) = res
                            changed = changed or this_changed

                        # Check for copyright lines that might have slipped by.
                        elif self.other_copyright_re.search (line):
                            self.errors.report (pathname,
                                                'unrecognised copyright: %s'
                                                % line.strip())
                    lines.append (line)
                    line = next_line

        # If something changed, write the new file out.
        if changed and self.errors.ok():
            tmp_pathname = pathname + '.tmp'
            with open (tmp_pathname, 'w', encoding=encoding) as file:
                for line in lines:
                    file.write (line)
                # Give the replacement the same permissions as the original.
                os.fchmod (file.fileno(), mode)
            if self.use_quilt:
                subprocess.call (['quilt', 'add', pathname])
            os.rename (tmp_pathname, pathname)

    def process_tree (self, tree, filter):
        """Process every file under TREE that FILTER does not skip."""
        for (dir, subdirs, filenames) in os.walk (tree):
            # Don't recurse through directories that should be skipped.
            # The list must be changed in place for os.walk to notice.
            subdirs[:] = [subdir for subdir in subdirs
                          if not filter.skip_dir (dir, subdir)]

            # Handle the files in this directory.
            for filename in filenames:
                if filter.skip_file (dir, filename):
                    sys.stdout.write ('Skipping %s\n'
                                      % os.path.join (dir, filename))
                else:
                    self.process_file (dir, filename, filter)

class CmdLine:
    """Command-line driver: parses sys.argv and runs the updater.

    Directories are registered with add_dir together with the filter used
    for them.  main() processes the directories named on the command
    line, or default_dirs when none are named.
    """

    def __init__ (self, copyright = Copyright):
        """COPYRIGHT is called with an Errors instance to create the
        object that does the updating."""
        self.errors = Errors()
        self.copyright = copyright (self.errors)

        # (directory, filter) pairs, in registration order.
        self.dirs = []

        # Directories processed when none are given on the command line.
        self.default_dirs = []

        # Directories given on the command line.
        self.chosen_dirs = []

        # Maps an option name to the method that handles it.
        self.option_handlers = dict()

        # (name, help text) pairs, in registration order, for --help.
        self.option_help = []

        self.add_option ('--help', 'Print this help', self.o_help)
        self.add_option ('--quilt', '"quilt add" files before changing them',
                         self.o_quilt)
        self.add_option ('--this-year', 'Add the current year to every notice',
                         self.o_this_year)

    def add_option (self, name, help, handler):
        """Register option NAME with help text HELP.  HANDLER is called
        with the option name when the option is seen."""
        self.option_help.append ((name, help))
        self.option_handlers[name] = handler

    def add_dir (self, dir, filter = None):
        """Register DIR as a known directory, processed using FILTER.

        When FILTER is omitted a fresh GenericFilter is created for this
        call, rather than one instance built at definition time being
        shared by every directory.
        """
        if filter is None:
            filter = GenericFilter()
        self.dirs.append ((dir, filter))

    def o_help (self, option = None):
        """Print usage, options and known directories, then exit with
        status 0."""
        sys.stdout.write ('Usage: %s [options] dir1 dir2...\n\n'
                          'Options:\n' % sys.argv[0])
        format = '%-15s %s\n'
        for (what, help) in self.option_help:
            sys.stdout.write (format % (what, help))
        sys.stdout.write ('\nDirectories:\n')

        # List the directories three to a line.
        format = '%-25s'
        i = 0
        for (dir, filter) in self.dirs:
            i += 1
            if i % 3 == 0 or i == len (self.dirs):
                sys.stdout.write (dir + '\n')
            else:
                sys.stdout.write (format % dir)
        sys.exit (0)

    def o_quilt (self, option):
        """Handle --quilt."""
        self.copyright.set_use_quilt (True)

    def o_this_year (self, option):
        """Handle --this-year, using the current local year."""
        self.copyright.include_year (time.localtime().tm_year)

    def main (self):
        """Parse sys.argv, process the chosen directories and exit.

        The exit status is 0 if no error was reported and 1 otherwise.
        """
        for arg in sys.argv[1:]:
            if arg[:1] != '-':
                self.chosen_dirs.append (arg)
            elif arg in self.option_handlers:
                self.option_handlers[arg] (arg)
            else:
                self.errors.report (None, 'unrecognised option: ' + arg)
        if self.errors.ok():
            if len (self.chosen_dirs) == 0:
                self.chosen_dirs = self.default_dirs
            if len (self.chosen_dirs) == 0:
                self.o_help()
            else:
                for chosen_dir in self.chosen_dirs:
                    # Adding the trailing separator makes 'gcc' select
                    # 'gcc' and 'gcc/testsuite' but not, say, 'gccfoo'.
                    canon_dir = os.path.join (chosen_dir, '')
                    count = 0
                    for (dir, filter) in self.dirs:
                        if (dir + os.sep).startswith (canon_dir):
                            count += 1
                            self.copyright.process_tree (dir, filter)
                    if count == 0:
                        self.errors.report (None, 'unrecognised directory: '
                                            + chosen_dir)
        sys.exit (0 if self.errors.ok() else 1)

#----------------------------------------------------------------------------

class TopLevelFilter (GenericFilter):
    """Filter that reports every subdirectory as one to skip."""

    def skip_dir (self, dir, subdir):
        # Unconditional: no subdirectory is ever entered with this filter.
        return True

class ConfigFilter (GenericFilter):
    """Generic filter that additionally skips .m4 files imported from
    gettext."""

    def skip_file (self, dir, filename):
        """Return True for .m4 files whose first line mentions 'gettext-';
        otherwise defer to GenericFilter."""
        if filename.endswith ('.m4'):
            with open (os.path.join (dir, filename)) as m4_file:
                first_line = m4_file.readline()
            # Files imported from gettext are not ours to change.
            if 'gettext-' in first_line:
                return True
        return GenericFilter.skip_file (self, dir, filename)

class GCCFilter (GenericFilter):
    """Generic filter extended with extra files, directories and
    extensions to skip, plus one fossilised file."""

    def __init__ (self):
        super().__init__ ()

        # Not part of GCC.
        self.skip_files.add ('math-68881.h')

        self.skip_dirs.update ((
            # Better not create a merge nightmare for the GNAT folks.
            'ada',

            # Handled separately.
            'testsuite',
        ))

        self.skip_extensions.update ((
            # Maintained by the translation project.
            '.po',

            # Automatically-generated.
            '.pot',
        ))

        # Old news won't be updated.
        self.fossilised_files.add ('ONEWS')

class TestsuiteFilter (GenericFilter):
    """Generic filter that also skips test sources and a few named
    files."""

    def __init__ (self):
        super().__init__ ()

        # Don't change the tests, which could be owned by anyone.
        self.skip_extensions.update ((
            '.c',
            '.C',
            '.cc',
            '.d',
            '.h',
            '.hs',
            '.f',
            '.f90',
            '.go',
            '.inc',
            '.java',
        ))

    def skip_file (self, dir, filename):
        """Return True for the individually excluded files; otherwise
        defer to GenericFilter."""
        excluded = (
            # g++.niklas/README contains historical copyright information
            # and isn't updated.
            ('g++.niklas', 'README'),
            # Similarly params/README.
            ('params', 'README'),
            ('gfortran.dg', 'pdt_5.f03'),
        )
        if (os.path.basename (dir), filename) in excluded:
            return True
        return GenericFilter.skip_file (self, dir, filename)

class LibCppFilter (GenericFilter):
    """Generic filter that also skips translation files."""

    def __init__ (self):
        super().__init__ ()

        self.skip_extensions.update ((
            # Maintained by the translation project.
            '.po',

            # Automatically-generated.
            '.pot',
        ))

class LibGCCFilter (GenericFilter):
    """Generic filter that also skips the soft-fp directory."""

    def __init__ (self):
        super().__init__ ()

        # Imported from GLIBC.
        self.skip_dirs.add ('soft-fp')

class LibPhobosFilter (GenericFilter):
    """Generic filter that also skips sources imported from upstream."""

    def __init__ (self):
        super().__init__ ()

        # Source module imported from upstream.
        self.skip_files.add ('object.d')

        # Contains sources imported from upstream.
        self.skip_dirs.update ((
            'core',
            'etc',
            'gc',
            'gcstub',
            'rt',
            'std',
        ))

class LibStdCxxFilter (GenericFilter):
    """Generic filter with extra skipped files and directories, one file
    marked as the package author's own, and a special line filter for
    boost_concept_check.h."""

    def __init__ (self):
        GenericFilter.__init__ (self)

        self.skip_files |= set ([
                # Contains no copyright of its own, but quotes the GPL.
                'intro.xml',
                ])

        self.skip_dirs |= set ([
                # Contains automatically-generated sources.
                'html',

                # The testsuite data files shouldn't be changed.
                'data',

                # Contains imported images
                'images',
                ])

        self.own_files |= set ([
                # Contains markup around the copyright owner.
                'spine.xml',
                ])

    def get_line_filter (self, dir, filename):
        """Return a regexp for lines to leave alone in FILENAME, or None.

        In boost_concept_check.h, lines starting with the Jeremy Siek
        copyright comment are left alone; other files get the
        GenericFilter behaviour.
        """
        if filename == 'boost_concept_check.h':
            # Raw string, so that '\(' and '\)' reach the re module as
            # written instead of being invalid string escapes.
            return re.compile (r'// \(C\) Copyright Jeremy Siek')
        return GenericFilter.get_line_filter (self, dir, filename)

class GCCCopyright (Copyright):
    """Copyright updater that knows the holders found in the GCC tree."""

    def __init__ (self, errors):
        super().__init__ (errors)

        # Every spelling of the FSF is rewritten to this form.
        canon_fsf = 'Free Software Foundation, Inc.'
        fsf_spellings = (
            'Free Software Foundation',
            'Free Software Foundation.',
            'Free Software Foundation Inc.',
            'Free Software Foundation, Inc',
            'Free Software Foundation, Inc.',
            'The Free Software Foundation',
            'The Free Software Foundation, Inc.',
            'Software Foundation, Inc.',
        )
        for spelling in fsf_spellings:
            self.add_package_author (spelling, canon_fsf)

        # Holders that are recognised but whose notices are not changed.
        external_holders = (
            'ARM',
            'AdaCore',
            'Advanced Micro Devices Inc.',
            'Ami Tavory and Vladimir Dreizin, IBM-HRL.',
            'Cavium Networks.',
            'Faraday Technology Corp.',
            'Florida State University',
            'Gerard Jungman',
            'Greg Colvin and Beman Dawes.',
            'Hewlett-Packard Company',
            'Intel Corporation',
            'Information Technology Industry Council.',
            'James Theiler, Brian Gough',
            'Makoto Matsumoto and Takuji Nishimura,',
            'Mentor Graphics Corporation',
            'National Research Council of Canada.',
            'NVIDIA Corporation',
            'Peter Dimov and Multi Media Ltd.',
            'Peter Dimov',
            'Pipeline Associates, Inc.',
            'Regents of the University of California.',
            'Silicon Graphics Computer Systems, Inc.',
            'Silicon Graphics',
            'Stephen L. Moshier',
            'Sun Microsystems, Inc. All rights reserved.',
            'The D Language Foundation, All Rights Reserved',
            'The Go Authors.  All rights reserved.',
            'The Go Authors. All rights reserved.',
            'The Go Authors.',
            'The Regents of the University of California.',
            'Ulf Adams',
            'Unicode, Inc.',
            'University of Illinois at Urbana-Champaign.',
            'University of Toronto.',
            'Yoshinori Sato',
        )
        for holder in external_holders:
            self.add_external_author (holder)

class GCCCmdLine (CmdLine):
    """Command line for the GCC tree: registers the known directories
    and the subset that is processed by default."""

    def __init__ (self):
        super().__init__ (GCCCopyright)

        # Known directories in registration order, each with its filter;
        # None means add_dir's default filter is used.
        #
        # Not registered: boehm-gc, intl, libffi, libgo, liboffloadmic,
        # libsanitizer and zlib (imported from upstream), contrib (not
        # really part of GCC) and maintainer-scripts.
        known_dirs = (
            ('.', TopLevelFilter()),
            ('c++tools', None),
            ('config', ConfigFilter()),
            ('fixincludes', None),
            ('gcc', GCCFilter()),
            (os.path.join ('gcc', 'testsuite'), TestsuiteFilter()),
            ('gnattools', None),
            ('gotools', None),
            ('include', None),
            ('libada', None),
            ('libatomic', None),
            ('libbacktrace', None),
            ('libcc1', None),
            ('libcpp', LibCppFilter()),
            ('libdecnumber', None),
            ('libgcc', LibGCCFilter()),
            ('libgfortran', None),
            ('libgomp', None),
            ('libiberty', None),
            ('libitm', None),
            ('libobjc', None),
            ('libphobos', LibPhobosFilter()),
            ('libquadmath', None),
            ('libssp', None),
            ('libstdc++-v3', LibStdCxxFilter()),
            ('libvtv', None),
            ('lto-plugin', None),
        )
        for (known_dir, dir_filter) in known_dirs:
            if dir_filter is None:
                self.add_dir (known_dir)
            else:
                self.add_dir (known_dir, dir_filter)

        # Directories processed when none are named on the command line.
        self.default_dirs = [
            'c++tools',
            'gcc',
            'include',
            'libada',
            'libatomic',
            'libbacktrace',
            'libcc1',
            'libcpp',
            'libdecnumber',
            'libgcc',
            'libgfortran',
            'libgomp',
            'libiberty',
            'libitm',
            'libobjc',
            'libphobos',
            'libssp',
            'libstdc++-v3',
            'libvtv',
            'lto-plugin',
            ]

# Only run the updater when this file is executed as a script, so that
# loading it as a module does not parse sys.argv, touch files or exit.
if __name__ == '__main__':
    GCCCmdLine().main()