Skip to content

Commit afeb801

Browse files
committed
Add script and pre-commit hook for copyright (#13)
1 parent 2784104 commit afeb801

2 files changed

Lines changed: 94 additions & 0 deletions

File tree

.pre-commit-config.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Copyright 2025 Center for Digital Humanities, Princeton University
2+
# SPDX-License-Identifier: Apache-2.0
3+
14
repos:
25
- repo: https://github.com/astral-sh/ruff-pre-commit
36
# Ruff version.
@@ -8,3 +11,9 @@ repos:
811
args: [ --fix ]
912
# Run the formatter.
1013
- id: ruff-format
14+
- repo: local
15+
hooks:
16+
- id: check-copyright
17+
name: check-copyright
18+
entry: python scripts/add_copyright.py --check
19+
language: python

scripts/add_copyright.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Copyright 2025 Center for Digital Humanities, Princeton University
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import argparse
5+
import sys
6+
import re
7+
from pathlib import Path
8+
9+
# variables for adding copyright notice programmatically
COPYRIGHT_YEAR = "2025"
COPYRIGHT_HOLDER = "Center for Digital Humanities, Princeton University"
COPYRIGHT_STATEMENT = f"Copyright {COPYRIGHT_YEAR} {COPYRIGHT_HOLDER}"
SPDX_ID = "SPDX-License-Identifier: Apache-2.0"
SUPPORTED_TYPES = [".py", ".yaml", ".yml", ".toml"]

# comment header prepended to supported files that have no copyright at all
_COPYRIGHT_HEADER = f"# {COPYRIGHT_STATEMENT}\n# {SPDX_ID}\n\n"

# regex to check if ANY copyright is present
# NOTE: [ \t]* rather than \s* so that a match can never span a line break
# (e.g. a bare "#" line followed by a `copyright = ...` assignment)
COPYRIGHT_RE = re.compile(r"^[ \t]*#[ \t]*[Cc]opyright", re.MULTILINE)

# regex for existing Emory copyright, keeping track of indentation
# NOTE: [ \t]* rather than \s* so the captured prefix holds only the comment
# marker and its indentation, never the blank lines preceding it
EUL_RE = re.compile(
    r"^([ \t]*#[ \t]*)Copyright [\d,]+ Emory University", re.MULTILINE
)


def _should_skip(path):
    """Return True if ``path`` is excluded from copyright handling."""
    parts = path.parts
    return (
        not path.is_file()  # skip non-files
        or path.name in ("lextab.py", "parsetab.py")  # skip generated PLY files
        or "schema_data" in parts  # skip schema_data/* (xml from elsewhere)
        # skip test fixtures: some path component contains "fixtures" and
        # the path has a component named exactly "test"
        or ("test" in parts and any("fixtures" in part for part in parts))
    )


def _with_header(contents):
    """Return ``contents`` with the CDH copyright header added at the top.

    A leading shebang line is kept as the first line, since it only works
    there; the header is inserted directly below it.
    """
    if contents.startswith("#!"):
        shebang, _, rest = contents.partition("\n")
        return f"{shebang}\n{_COPYRIGHT_HEADER}{rest}"
    return f"{_COPYRIGHT_HEADER}{contents}"


def main(arg_list=None):
    """Add or check for copyright statements.

    Without ``--check``, each given file is updated in place:

    - a file with no copyright comment and a suffix in ``SUPPORTED_TYPES``
      gets the CDH copyright and SPDX identifier prepended;
    - a file with an Emory University copyright comment gets the CDH
      copyright inserted directly above it, unless the CDH statement is
      already present (so repeated runs do not stack duplicates).

    With ``--check``, no file is modified; files lacking any copyright
    comment are listed and the process exits with status 1, otherwise 0.

    :param arg_list: argument strings to parse; ``None`` means ``sys.argv``
    :raises SystemExit: always in ``--check`` mode (code 0 or 1)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--check",
        action="store_true",
        help="Check files for copyright notice",
    )
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Filenames (may use glob pattern) to check or modify",
    )
    args = parser.parse_args(arg_list)

    check = args.check
    needs_copyright = []

    for filename in args.filenames:
        path = Path(filename)
        if _should_skip(path):
            continue

        try:
            contents = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # skip unreadable/binary files
            continue

        if not COPYRIGHT_RE.search(contents):
            # missing copyright: add to pre-commit check failures
            needs_copyright.append(filename)
            if not check and path.suffix in SUPPORTED_TYPES:
                # automatically add CDH copyright with SPDX identifier;
                # write back with the same encoding the file was read with
                path.write_text(_with_header(contents), encoding="utf-8")
        elif (
            not check
            # already carries the CDH statement: nothing to add
            and COPYRIGHT_STATEMENT not in contents
            and EUL_RE.search(contents)
        ):
            # has EUL copyright: add CDH copyright directly above,
            # reusing the comment prefix/indentation of the EUL line
            new_contents = EUL_RE.sub(
                lambda m: f"{m.group(1)}{COPYRIGHT_STATEMENT}\n{m.group(0)}",
                contents,
            )
            path.write_text(new_contents, encoding="utf-8")

    if check:
        # use exit code to tell pre-commit success or failure
        if needs_copyright:
            print("The following files are missing a copyright notice:")
            for f in needs_copyright:
                print(f"  - {f}")
            sys.exit(1)
        print("All modified files have copyright notices.")
        sys.exit(0)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)