|
| 1 | +# Copyright 2025 Center for Digital Humanities, Princeton University |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +import argparse |
| 5 | +import sys |
| 6 | +import re |
| 7 | +from pathlib import Path |
| 8 | + |
# Values used when adding a copyright notice programmatically.
COPYRIGHT_YEAR = "2025"
COPYRIGHT_HOLDER = "Center for Digital Humanities, Princeton University"
COPYRIGHT_STATEMENT = f"Copyright {COPYRIGHT_YEAR} {COPYRIGHT_HOLDER}"
SPDX_ID = "SPDX-License-Identifier: Apache-2.0"
# File suffixes that use "#" comments, so a notice can be inserted as-is.
SUPPORTED_TYPES = [".py", ".yaml", ".yml", ".toml"]

# Regex to check if ANY "#"-style copyright comment is present.
# Only horizontal whitespace ([ \t]) is allowed around the "#": "\s" also
# matches newlines, which let a bare "#" line followed by a code line such as
# ``copyright = "..."`` be mistaken for a copyright comment.
COPYRIGHT_RE = re.compile(r"^[ \t]*#[ \t]*[Cc]opyright", re.MULTILINE)

# Regex for an existing Emory copyright line. Group 1 captures the comment
# prefix (indentation, "#", and spacing) so a new line can reuse it.
# Horizontal whitespace only: with "\s" the group could swallow the blank
# lines preceding the notice, and reusing it as a prefix duplicated them.
EUL_RE = re.compile(
    r"^([ \t]*#[ \t]*)Copyright [\d,]+ Emory University", re.MULTILINE
)
| 21 | + |
| 22 | + |
def _should_skip(path):
    """Return True if *path* is exempt from the copyright requirement."""
    return (
        not path.is_file()  # skip non-files
        or path.name in ("lextab.py", "parsetab.py")  # skip generated PLY files
        or "schema_data" in path.parts  # skip schema_data/* (xml from elsewhere)
        # skip test fixtures: any "*fixtures*" path part beneath a "test" dir
        or ("test" in path.parts and any("fixtures" in part for part in path.parts))
    )


def _prepend_notice(contents):
    """Return *contents* with the copyright and SPDX comment header added.

    The header goes at the very top of the file, except that a leading
    shebang line is kept first: "#!" is only honored on line one.
    """
    header = f"# {COPYRIGHT_STATEMENT}\n# {SPDX_ID}\n"
    if contents.startswith("#!"):
        shebang, _, rest = contents.partition("\n")
        return f"{shebang}\n{header}\n{rest}"
    return f"{header}\n{contents}"


def main(arg_list=None):
    """Add or check for copyright statements.

    Without ``--check``, files that lack any copyright comment and have a
    suffix in ``SUPPORTED_TYPES`` get the copyright and SPDX header added;
    files that carry an Emory University copyright but no notice for
    ``COPYRIGHT_HOLDER`` get the new statement inserted directly above the
    Emory line. With ``--check``, no file is modified.

    :param arg_list: optional list of command-line arguments; when None,
        argparse reads ``sys.argv``.
    :raises SystemExit: in ``--check`` mode only; exit code 1 if any file is
        missing a copyright notice, otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--check",
        action="store_true",
        help="Check files for copyright notice",
    )
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Filenames (may use glob pattern) to check or modify",
    )
    args = parser.parse_args(arg_list)

    check = args.check
    needs_copyright = []

    for filename in args.filenames:
        path = Path(filename)
        if _should_skip(path):
            continue

        try:
            contents = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # skip unreadable/binary files
            continue

        if not COPYRIGHT_RE.search(contents):
            # missing copyright: add to pre-commit check failures
            needs_copyright.append(filename)
            if not check and path.suffix in SUPPORTED_TYPES:
                # automatically add CDH copyright with SPDX identifier;
                # write back as UTF-8 to match how the file was read
                path.write_text(_prepend_notice(contents), encoding="utf-8")
        elif (
            not check
            # already credited: inserting again would add a duplicate line
            # on every run
            and COPYRIGHT_HOLDER not in contents
            and EUL_RE.search(contents)
        ):
            # has EUL copyright: add CDH copyright directly above, reusing
            # the comment prefix captured from the existing line
            new_contents = EUL_RE.sub(
                lambda m: f"{m.group(1)}{COPYRIGHT_STATEMENT}\n{m.group(0)}",
                contents,
            )
            path.write_text(new_contents, encoding="utf-8")

    if check:
        # use exit code to tell pre-commit success or failure
        if needs_copyright:
            print("The following files are missing a copyright notice:")
            for f in needs_copyright:
                print(f"  - {f}")
            sys.exit(1)
        print("All modified files have copyright notices.")
        sys.exit(0)
| 82 | + |
| 83 | + |
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments