-
-
Notifications
You must be signed in to change notification settings - Fork 32
Expand file tree
/
Copy pathdelete_privates.py
More file actions
117 lines (92 loc) · 3.72 KB
/
delete_privates.py
File metadata and controls
117 lines (92 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
"""
Delete private dataset DATA files from the public owid-catalog bucket.
This script identifies private datasets and removes their data files (.feather, .parquet, .csv)
from the public bucket, while keeping metadata files (.meta.json, index.json) intact.
Usage:
.venv/bin/python scripts/delete_private_from_public_bucket.py --dry-run
.venv/bin/python scripts/delete_private_from_public_bucket.py
"""
from collections.abc import Iterator
from pathlib import Path
from typing import Any
import click
from owid.catalog.api.legacy import CHANNEL, LocalCatalog
from owid.catalog.s3_utils import connect_r2
from etl import config
from etl.paths import DATA_DIR
def is_metadata_file(filename: str) -> bool:
    """Return True when ``filename`` names a metadata file.

    A name counts as metadata when it ends with ``.meta.json`` or
    ``index.json``; such files are meant to stay in the public bucket.
    """
    # str.endswith accepts a tuple and matches if any suffix matches.
    metadata_suffixes = (".meta.json", "index.json")
    return filename.endswith(metadata_suffixes)
def walk_s3(s3: Any, bucket: str, path: str) -> Iterator[dict[str, Any]]:
    """Yield every object entry listed under prefix ``path`` in ``bucket``.

    The first listing request is capped with ``MaxKeys=100``; follow-up
    requests pass a ``Marker`` and leave the page size to the service.
    Each yielded item is one element of a response's ``Contents`` list.
    """
    page = s3.list_objects(Bucket=bucket, Prefix=path, MaxKeys=100)
    while True:
        entries = page.get("Contents", [])
        yield from entries

        # Stop once the listing is complete, or when a page came back empty
        # (there would be no last key to continue from).
        if not (page["IsTruncated"] and entries):
            return

        # Prefer the marker supplied by the service; otherwise resume after
        # the last key of the current page.
        resume_from = page.get("NextMarker", entries[-1]["Key"])
        page = s3.list_objects(Bucket=bucket, Prefix=path, Marker=resume_from)
def delete_files(s3: Any, bucket: str, keys: list[str], dry_run: bool) -> None:
    """Delete ``keys`` from ``bucket`` in batches of at most 1000 keys.

    Args:
        s3: Client exposing a boto3-style ``delete_objects`` method.
        bucket: Name of the bucket to delete from.
        keys: Object keys to delete. The list is not modified.
        dry_run: When true, no request is sent.

    Raises:
        RuntimeError: If any ``delete_objects`` response reports per-key
            failures. All batches are attempted before raising, so one bad
            key does not prevent the remaining keys from being deleted.
    """
    if dry_run or not keys:
        return

    batch_size = 1000
    failures: list[dict[str, Any]] = []
    for start in range(0, len(keys), batch_size):
        batch = keys[start : start + batch_size]
        response = s3.delete_objects(
            Bucket=bucket,
            Delete={"Objects": [{"Key": k} for k in batch], "Quiet": True},
        )
        # In quiet mode the response only lists keys that could NOT be
        # deleted; ignoring it would report failed deletions as successes.
        failures.extend((response or {}).get("Errors") or [])

    if failures:
        shown = ", ".join(
            f"{err.get('Key')} ({err.get('Code')})" for err in failures[:10]
        )
        more = "" if len(failures) <= 10 else f" and {len(failures) - 10} more"
        raise RuntimeError(
            f"Failed to delete {len(failures)} object(s) from {bucket}: {shown}{more}"
        )
@click.command()
@click.option(
    "--dry-run",
    is_flag=True,
    default=False,
    help="Preview files to delete without actually deleting them.",
)
@click.option(
    "--channel",
    "-c",
    multiple=True,
    type=click.Choice(CHANNEL.__args__),
    default=CHANNEL.__args__,
    help="Process only selected channel(s).",
)
def main(dry_run: bool, channel: tuple) -> None:
    """Delete private dataset data files from the public R2 bucket."""
    # NOTE: the docstring above doubles as the click --help text, so it is
    # kept short; the details live in these comments.
    #
    # For every non-public dataset in the selected channels, list the objects
    # stored under the dataset's prefix in the bucket, keep metadata files,
    # and delete (or, with --dry-run, only report) everything else.
    bytes_per_mb = 1024 * 1024

    catalog_root = Path(DATA_DIR)
    local_catalog = LocalCatalog(catalog_root)
    client = connect_r2()
    bucket_name = config.R2_BUCKET

    file_count = 0
    byte_count = 0

    for channel_name in channel:
        print(f"\n=== Channel: {channel_name} ===")

        for dataset in local_catalog.iter_datasets(channel_name):
            if dataset.metadata.is_public:
                continue

            # Private dataset: its bucket prefix is the dataset path relative
            # to the catalog root, with a trailing slash so that sibling
            # datasets sharing a name prefix are not matched.
            prefix = Path(dataset.path).relative_to(catalog_root).as_posix()
            if not prefix.endswith("/"):
                prefix += "/"

            doomed_keys = []
            for entry in walk_s3(client, bucket_name, prefix):
                object_key = entry["Key"]
                if is_metadata_file(object_key):
                    # Metadata stays in the public bucket.
                    continue

                size_bytes = entry.get("Size", 0)
                doomed_keys.append(object_key)
                print(f" DEL {object_key} ({size_bytes / bytes_per_mb:.2f} MB)")
                file_count += 1
                byte_count += size_bytes

            if doomed_keys:
                print(f" -> {prefix} ({len(doomed_keys)} files)")
                delete_files(client, bucket_name, doomed_keys, dry_run)

    banner = "[DRY RUN] " if dry_run else ""
    action = "to delete" if dry_run else "deleted"
    print(f"\n{banner}Summary:")
    print(f" Files {action}: {file_count}")
    print(f" Total size: {byte_count / bytes_per_mb:.2f} MB")

    if dry_run and file_count > 0:
        print("\nRun without --dry-run to actually delete these files.")


if __name__ == "__main__":
    main()