Skip to content

Commit 5bf398d

Browse files
teh-hippo and Copilot committed
feat: add SQLite asset manifest for identity-based sync
Replace fragile size-based dedup with definitive identity matching using iCloud's recordName (asset_id).

The manifest DB at {download_dir}/.icloudpd.db tracks every downloaded file with:
- asset_id: iCloud master recordName (stable unique ID)
- zone_id: library zone (personal vs shared, prevents collisions)
- local_path: relative path (portable)
- version_size: iCloud-reported size (pre-EXIF-injection)
- version_checksum: iCloud fileChecksum
- change_tag: recordChangeTag (detects metadata updates)

This eliminates the EXIF re-serialisation false dedup problem entirely: the manifest stores version_size from the API, which is always compared against the API's version.size (both pre-injection). Local file size is never compared.

Both primary files and live photo companions get manifest rows. Existing files are adopted into the manifest on first run. Legacy size-based dedup is preserved as fallback when manifest is unavailable.

14 unit tests for ManifestDB, updated integration tests for manifest behaviour.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 3b9b511 commit 5bf398d

File tree

8 files changed

+1044
-117
lines changed

8 files changed

+1044
-117
lines changed

src/icloudpd/autodelete.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from tzlocal import get_localzone
1111

12+
from icloudpd.manifest import ManifestDB
1213
from icloudpd.paths import local_download_path
1314
from pyicloud_ipd.asset_version import calculate_version_filename
1415
from pyicloud_ipd.raw_policy import RawTreatmentPolicy
@@ -39,6 +40,7 @@ def autodelete_photos(
3940
_sizes: Sequence[AssetVersionSize],
4041
lp_filename_generator: Callable[[str], str],
4142
raw_policy: RawTreatmentPolicy,
43+
manifest: ManifestDB | None = None,
4244
) -> None:
4345
"""
4446
Scans the "Recently Deleted" folder and deletes any matching files
@@ -113,4 +115,6 @@ def autodelete_photos(
113115
if os.path.exists(path):
114116
logger.debug("Deleting %s...", path)
115117
delete_local = delete_file_dry_run if dry_run else delete_file
116-
delete_local(logger, path)
118+
if delete_local(logger, path) and manifest is not None and not path.endswith(".xmp"):
119+
rel_path = os.path.relpath(path, directory)
120+
manifest.remove_by_path(rel_path)

src/icloudpd/base.py

Lines changed: 254 additions & 30 deletions
Large diffs are not rendered by default.

src/icloudpd/manifest.py

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
"""SQLite asset manifest for identity-based sync tracking.
2+
3+
The manifest DB is the single source of truth for everything icloudpd knows
4+
about your library. It stores identity (which iCloud asset maps to which local
5+
file), sync state (has this asset changed?), and all metadata the API provides.
6+
7+
XMP sidecars are an export format generated from the same API data. The DB and
8+
XMP are independent — XMP generation does not read from the DB.
9+
10+
The manifest lives at {download_dir}/.icloudpd.db and travels with the library.
11+
"""
12+
13+
import logging
14+
import os
15+
import sqlite3
16+
from dataclasses import dataclass
17+
from datetime import datetime, timezone
18+
19+
logger = logging.getLogger(__name__)
20+
21+
SCHEMA_VERSION = 1
22+
23+
_SCHEMA_V1 = """\
24+
CREATE TABLE IF NOT EXISTS manifest (
25+
asset_id TEXT NOT NULL,
26+
zone_id TEXT NOT NULL DEFAULT '',
27+
local_path TEXT NOT NULL,
28+
version_size INTEGER NOT NULL,
29+
version_checksum TEXT,
30+
change_tag TEXT,
31+
downloaded_at TEXT NOT NULL,
32+
last_updated_at TEXT NOT NULL,
33+
item_type TEXT,
34+
filename TEXT,
35+
asset_date TEXT,
36+
added_date TEXT,
37+
is_favorite INTEGER DEFAULT 0,
38+
is_hidden INTEGER DEFAULT 0,
39+
is_deleted INTEGER DEFAULT 0,
40+
original_width INTEGER,
41+
original_height INTEGER,
42+
duration INTEGER,
43+
orientation INTEGER,
44+
title TEXT,
45+
description TEXT,
46+
keywords TEXT,
47+
gps_latitude REAL,
48+
gps_longitude REAL,
49+
gps_altitude REAL,
50+
PRIMARY KEY (asset_id, zone_id, local_path)
51+
);
52+
CREATE INDEX IF NOT EXISTS idx_manifest_path ON manifest(local_path);
53+
"""
54+
55+
# Columns added between schema versions, for migration from older DBs.
56+
# Each entry: (version_introduced, ALTER TABLE statement)
57+
_MIGRATIONS: list[tuple[int, str]] = [
58+
# Future migrations go here, e.g.:
59+
# (2, "ALTER TABLE manifest ADD COLUMN new_field TEXT DEFAULT NULL"),
60+
]
61+
62+
63+
@dataclass(frozen=True)
64+
class ManifestRow:
65+
"""A single manifest entry."""
66+
67+
asset_id: str
68+
zone_id: str
69+
local_path: str
70+
version_size: int
71+
version_checksum: str | None
72+
change_tag: str | None
73+
downloaded_at: str
74+
last_updated_at: str
75+
item_type: str | None
76+
filename: str | None
77+
asset_date: str | None
78+
added_date: str | None
79+
is_favorite: int
80+
is_hidden: int
81+
is_deleted: int
82+
original_width: int | None
83+
original_height: int | None
84+
duration: int | None
85+
orientation: int | None
86+
title: str | None
87+
description: str | None
88+
keywords: str | None
89+
gps_latitude: float | None
90+
gps_longitude: float | None
91+
gps_altitude: float | None
92+
93+
94+
_ALL_COLUMNS = (
95+
"asset_id, zone_id, local_path, version_size, version_checksum, "
96+
"change_tag, downloaded_at, last_updated_at, item_type, filename, "
97+
"asset_date, added_date, is_favorite, is_hidden, is_deleted, "
98+
"original_width, original_height, duration, orientation, "
99+
"title, description, keywords, gps_latitude, gps_longitude, gps_altitude"
100+
)
101+
102+
103+
class ManifestDB:
104+
"""SQLite-backed asset manifest for tracking downloaded files."""
105+
106+
def __init__(self, download_dir: str) -> None:
107+
self._db_path = os.path.join(download_dir, ".icloudpd.db")
108+
self._conn: sqlite3.Connection | None = None
109+
self._dirty = False
110+
self._pending_count = 0
111+
self._flush_interval = 500
112+
self.zone_id: str = ""
113+
114+
@property
115+
def _db(self) -> sqlite3.Connection:
116+
if self._conn is None:
117+
raise RuntimeError("ManifestDB is not open")
118+
return self._conn
119+
120+
def open(self) -> None:
121+
"""Open the manifest DB, creating schema or migrating if needed."""
122+
self._conn = sqlite3.connect(self._db_path, timeout=10)
123+
self._conn.execute("PRAGMA journal_mode=WAL")
124+
self._conn.execute("PRAGMA synchronous=NORMAL")
125+
self._dirty = False
126+
self._pending_count = 0
127+
128+
current_version = self._conn.execute("PRAGMA user_version").fetchone()[0]
129+
if current_version == 0:
130+
# Fresh DB or pre-versioned DB — check if table exists
131+
tables = self._conn.execute(
132+
"SELECT name FROM sqlite_master WHERE type='table' AND name='manifest'"
133+
).fetchone()
134+
if tables is None:
135+
# Brand new DB
136+
self._conn.executescript(_SCHEMA_V1)
137+
else:
138+
# Pre-versioned DB (has table but no user_version) — migrate
139+
self._migrate_from_v0()
140+
self._conn.execute(f"PRAGMA user_version={SCHEMA_VERSION}")
141+
self._conn.commit()
142+
elif current_version < SCHEMA_VERSION:
143+
self._run_migrations(current_version)
144+
145+
def _migrate_from_v0(self) -> None:
146+
"""Migrate from the original 7-column schema to the full schema."""
147+
existing = {
148+
row[1]
149+
for row in self._conn.execute("PRAGMA table_info(manifest)").fetchall() # type: ignore[union-attr]
150+
}
151+
new_columns = [
152+
("last_updated_at", "TEXT NOT NULL DEFAULT ''"),
153+
("item_type", "TEXT"),
154+
("filename", "TEXT"),
155+
("asset_date", "TEXT"),
156+
("added_date", "TEXT"),
157+
("is_favorite", "INTEGER DEFAULT 0"),
158+
("is_hidden", "INTEGER DEFAULT 0"),
159+
("is_deleted", "INTEGER DEFAULT 0"),
160+
("original_width", "INTEGER"),
161+
("original_height", "INTEGER"),
162+
("duration", "INTEGER"),
163+
("orientation", "INTEGER"),
164+
("title", "TEXT"),
165+
("description", "TEXT"),
166+
("keywords", "TEXT"),
167+
("gps_latitude", "REAL"),
168+
("gps_longitude", "REAL"),
169+
("gps_altitude", "REAL"),
170+
]
171+
for col_name, col_def in new_columns:
172+
if col_name not in existing:
173+
self._conn.execute(f"ALTER TABLE manifest ADD COLUMN {col_name} {col_def}") # type: ignore[union-attr]
174+
logger.info("Migrated manifest DB from v0 to v%d (%d columns added)",
175+
SCHEMA_VERSION, sum(1 for c, _ in new_columns if c not in existing))
176+
self._conn.execute( # type: ignore[union-attr]
177+
"CREATE INDEX IF NOT EXISTS idx_manifest_path ON manifest(local_path)"
178+
)
179+
180+
def _run_migrations(self, from_version: int) -> None:
181+
"""Run incremental migrations from from_version to SCHEMA_VERSION."""
182+
for version, sql in _MIGRATIONS:
183+
if version > from_version:
184+
self._conn.execute(sql) # type: ignore[union-attr]
185+
self._conn.execute(f"PRAGMA user_version={SCHEMA_VERSION}") # type: ignore[union-attr]
186+
self._conn.commit() # type: ignore[union-attr]
187+
188+
def close(self) -> None:
189+
"""Close the manifest DB, committing any pending writes."""
190+
if self._conn:
191+
if self._dirty:
192+
self._conn.commit()
193+
self._dirty = False
194+
self._pending_count = 0
195+
self._conn.close()
196+
self._conn = None
197+
198+
def flush(self) -> None:
199+
"""Commit pending writes without closing."""
200+
if self._conn and self._dirty:
201+
self._conn.commit()
202+
self._dirty = False
203+
self._pending_count = 0
204+
205+
def __enter__(self) -> "ManifestDB":
206+
self.open()
207+
return self
208+
209+
def __exit__(self, *_: object) -> None:
210+
self.close()
211+
212+
def lookup(self, asset_id: str, zone_id: str, local_path: str) -> ManifestRow | None:
213+
"""Look up a manifest entry by identity."""
214+
row = self._db.execute(
215+
f"SELECT {_ALL_COLUMNS} FROM manifest "
216+
"WHERE asset_id = ? AND zone_id = ? AND local_path = ?",
217+
(asset_id, zone_id, local_path),
218+
).fetchone()
219+
if row is None:
220+
return None
221+
return ManifestRow(*row)
222+
223+
def lookup_by_path(self, local_path: str) -> ManifestRow | None:
224+
"""Look up a manifest entry by local path."""
225+
row = self._db.execute(
226+
f"SELECT {_ALL_COLUMNS} FROM manifest "
227+
"WHERE local_path = ? LIMIT 1",
228+
(local_path,),
229+
).fetchone()
230+
if row is None:
231+
return None
232+
return ManifestRow(*row)
233+
234+
def upsert(
235+
self,
236+
asset_id: str,
237+
zone_id: str,
238+
local_path: str,
239+
version_size: int,
240+
version_checksum: str | None = None,
241+
change_tag: str | None = None,
242+
item_type: str | None = None,
243+
filename: str | None = None,
244+
asset_date: str | None = None,
245+
added_date: str | None = None,
246+
is_favorite: int = 0,
247+
is_hidden: int = 0,
248+
is_deleted: int = 0,
249+
original_width: int | None = None,
250+
original_height: int | None = None,
251+
duration: int | None = None,
252+
orientation: int | None = None,
253+
title: str | None = None,
254+
description: str | None = None,
255+
keywords: str | None = None,
256+
gps_latitude: float | None = None,
257+
gps_longitude: float | None = None,
258+
gps_altitude: float | None = None,
259+
) -> None:
260+
"""Insert or update a manifest entry. Auto-flushes every 500 writes."""
261+
try:
262+
now = datetime.now(tz=timezone.utc).isoformat()
263+
self._db.execute(
264+
f"INSERT INTO manifest ({_ALL_COLUMNS}) "
265+
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) "
266+
"ON CONFLICT(asset_id, zone_id, local_path) DO UPDATE SET "
267+
"version_size=excluded.version_size, "
268+
"version_checksum=excluded.version_checksum, "
269+
"change_tag=excluded.change_tag, "
270+
"last_updated_at=excluded.last_updated_at, "
271+
"item_type=excluded.item_type, "
272+
"filename=excluded.filename, "
273+
"asset_date=excluded.asset_date, "
274+
"added_date=excluded.added_date, "
275+
"is_favorite=excluded.is_favorite, "
276+
"is_hidden=excluded.is_hidden, "
277+
"is_deleted=excluded.is_deleted, "
278+
"original_width=excluded.original_width, "
279+
"original_height=excluded.original_height, "
280+
"duration=excluded.duration, "
281+
"orientation=excluded.orientation, "
282+
"title=excluded.title, "
283+
"description=excluded.description, "
284+
"keywords=excluded.keywords, "
285+
"gps_latitude=excluded.gps_latitude, "
286+
"gps_longitude=excluded.gps_longitude, "
287+
"gps_altitude=excluded.gps_altitude",
288+
(
289+
asset_id, zone_id, local_path, version_size, version_checksum,
290+
change_tag, now, now, item_type, filename,
291+
asset_date, added_date, is_favorite, is_hidden, is_deleted,
292+
original_width, original_height, duration, orientation,
293+
title, description, keywords, gps_latitude, gps_longitude, gps_altitude,
294+
),
295+
)
296+
self._dirty = True
297+
self._pending_count += 1
298+
if self._pending_count >= self._flush_interval:
299+
self.flush()
300+
except sqlite3.Error as e:
301+
logger.warning("Manifest write failed for %s: %s", local_path, e)
302+
303+
def remove(self, asset_id: str, zone_id: str, local_path: str) -> None:
304+
"""Remove a manifest entry."""
305+
self._db.execute(
306+
"DELETE FROM manifest WHERE asset_id = ? AND zone_id = ? AND local_path = ?",
307+
(asset_id, zone_id, local_path),
308+
)
309+
self._dirty = True
310+
311+
def remove_by_path(self, local_path: str) -> None:
312+
"""Remove all manifest entries for a local path (used by autodelete)."""
313+
self._db.execute(
314+
"DELETE FROM manifest WHERE local_path = ?",
315+
(local_path,),
316+
)
317+
self._dirty = True
318+
319+
def count(self) -> int:
320+
"""Return the total number of manifest entries."""
321+
row = self._db.execute("SELECT COUNT(*) FROM manifest").fetchone()
322+
return row[0] if row else 0

tests/helpers/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,10 @@ def __call__(self, __first: _T, __second: _T, __msg: str) -> None: ...
101101
def assert_files(
102102
assert_equal: AssertEquality, data_dir: str, files_to_assert: Sequence[Tuple[str, str]]
103103
) -> None:
104-
files_in_result = glob.glob(os.path.join(data_dir, "**/*.*"), recursive=True)
104+
files_in_result = [
105+
f for f in glob.glob(os.path.join(data_dir, "**/*.*"), recursive=True)
106+
if not os.path.basename(f).startswith(".icloudpd")
107+
]
105108

106109
assert_equal(sum(1 for _ in files_in_result), len(files_to_assert), "File count does not match")
107110

0 commit comments

Comments
 (0)