Skip to content

Commit ef23b7d

Browse files
committed
During sync, skip package download for packages with unchanged NEVRAs
Currently, we start synchronizing packages even if the package is unchanged from a previous sync. This means that we end up re-signing packages each time we sync when signing during sync. This commit fixes that problem by re-using the already synced version of the package for packages that already exist in the latest version of the repository with the same NEVRA. Note that this means that a package that has been changed without changing the NEVRA will *not* be resynced anymore. This may be seen as a security feature where packages can't be silently changed. To sync a package with the same NEVRA, the original package must first be removed from the repository and then the repository must be synced again. Signed-off-by: Jonathan Dieter <jdieter@ciq.com>
1 parent 0df0bd1 commit ef23b7d

3 files changed

Lines changed: 164 additions & 30 deletions

File tree

pulp_rpm/app/tasks/synchronizing.py

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
is_previous_version,
9292
get_sha256,
9393
urlpath_sanitize,
94+
format_nevra,
9495
)
9596
from pulp_rpm.app.rpm_version import RpmVersion
9697

@@ -1239,6 +1240,27 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml, modulemd_l
12391240
# we might want to pick the latest based on the build time.
12401241
latest_build_time_by_nevra = {}
12411242

1243+
# Cache packages already in the repository to later check for duplicates
1244+
# This needs to be @sync_to_async because self.repository.latest_version() is synchronous
1245+
@sync_to_async
1246+
def get_existing_package_nevras():
1247+
contents = self.repository.latest_version().content.filter(
1248+
pulp_type=Package.get_pulp_type()
1249+
)
1250+
nevra_to_declarative = {}
1251+
1252+
for content in contents:
1253+
# Get package object from content
1254+
pkg = content.rpm_package
1255+
nevra = format_nevra(pkg.name, pkg.epoch, pkg.version, pkg.release, pkg.arch)
1256+
# Create a DeclarativeContent object for the existing package
1257+
dc = DeclarativeContent(content=pkg)
1258+
nevra_to_declarative[nevra] = dc
1259+
1260+
return nevra_to_declarative
1261+
1262+
existing_package_nevras = await get_existing_package_nevras()
1263+
12421264
# Perform various checks and potentially filter out unwanted packages
12431265
# We parse all of primary.xml first and fail fast if something is wrong.
12441266
# Collect a list of any package nevras() we don't want to include.
@@ -1357,33 +1379,48 @@ def verification_and_skip_callback(pkg):
13571379
# entries with the same NEVRA, pick the one with the larger build time
13581380
elif pkg.time_build != latest_build_time_by_nevra[pkg_nevra]:
13591381
continue
1360-
# Implicit: There can be multiple package entries that are completely identical
1361-
# (same NEVRA, same build time, same checksum / pkgid) and the same or different
1362-
# location_href. We're not explicitly handling this, the pipeline will deduplicate.
1363-
package = Package(**Package.createrepo_to_dict(pkg))
1364-
base_url = pkg.location_base or self.remote_url
1365-
url = urlpath_sanitize(base_url, package.location_href)
1366-
del pkg # delete it as soon as we're done with it
1367-
1368-
# Location_href is not a property of the Package in isolation [0], and Pulp has
1369-
# a well defined way of generating the layout/locations on publication time.
1370-
# We only need to use the original location_href for metadata mirroring
1371-
# [0] https://github.com/pulp/pulp_rpm/issues/2580
1372-
original_location_href = package.location_href
1373-
package.location_href = package.filename
1374-
store_package_for_mirroring(self.repository, package.pkgId, original_location_href)
1375-
1376-
artifact = Artifact(size=package.size_package)
1377-
checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
1378-
setattr(artifact, checksum_type, package.pkgId)
1379-
da = DeclarativeArtifact(
1380-
artifact=artifact,
1381-
url=url,
1382-
relative_path=package.location_href,
1383-
remote=self.remote,
1384-
deferred_download=self.deferred_download,
1385-
)
1386-
dc = DeclarativeContent(content=package, d_artifacts=[da])
1382+
1383+
# Check for identical NEVRA to what's in the repo right now. For these, we don't
1384+
# want to "skip" them, but we also don't want to re-download them, so let's just
1385+
# make the declarative content point to the existing package with no artifact.
1386+
if pkg_nevra in existing_package_nevras:
1387+
log.debug(
1388+
f"Substituting already existing package {pkg_nevra} into repo to avoid "
1389+
f"re-download"
1390+
)
1391+
dc = existing_package_nevras[pkg_nevra]
1392+
else:
1393+
# Implicit: There can be multiple package entries that are completely identical
1394+
# (same NEVRA, same build time, same checksum / pkgid) and the same or different
1395+
# location_href. We're not explicitly handling this, the pipeline will
1396+
# deduplicate.
1397+
package = Package(**Package.createrepo_to_dict(pkg))
1398+
base_url = pkg.location_base or self.remote_url
1399+
url = urlpath_sanitize(base_url, package.location_href)
1400+
del pkg # delete it as soon as we're done with it
1401+
1402+
# Location_href is not a property of the Package in isolation [0], and Pulp has
1403+
# a well defined way of generating the layout/locations on publication time.
1404+
# We only need to use the original location_href for metadata mirroring
1405+
# [0] https://github.com/pulp/pulp_rpm/issues/2580
1406+
original_location_href = package.location_href
1407+
package.location_href = package.filename
1408+
store_package_for_mirroring(
1409+
self.repository, package.pkgId, original_location_href
1410+
)
1411+
1412+
artifact = Artifact(size=package.size_package)
1413+
checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
1414+
setattr(artifact, checksum_type, package.pkgId)
1415+
da = DeclarativeArtifact(
1416+
artifact=artifact,
1417+
url=url,
1418+
relative_path=package.location_href,
1419+
remote=self.remote,
1420+
deferred_download=self.deferred_download,
1421+
)
1422+
dc = DeclarativeContent(content=package, d_artifacts=[da])
1423+
13871424
dc.extra_data = defaultdict(list)
13881425

13891426
# find if a package relates to a modulemd
@@ -1644,7 +1681,7 @@ async def run(self):
16441681

16451682
async for batch in self.batches():
16461683
for d_content in batch:
1647-
if isinstance(d_content.content, Package):
1684+
if isinstance(d_content.content, Package) and len(d_content.d_artifacts) > 0:
16481685
await self._sign_rpm_content(d_content)
16491686

16501687
await self.put(d_content)

pulp_rpm/tests/functional/api/test_package_signing.py

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
from pulpcore.exceptions.validation import InvalidSignatureError
1111

1212
from pulp_rpm.app.shared_utils import RpmTool
13-
from pulp_rpm.tests.functional.constants import RPM_PACKAGE_FILENAME, RPM_UNSIGNED_URL
13+
from pulp_rpm.tests.functional.constants import (
14+
RPM_PACKAGE_FILENAME,
15+
RPM_UNSIGNED_URL,
16+
RPM_UNSIGNED_MODIFIED_FIXTURE_URL,
17+
RPM_PACKAGE_CONTENT_NAME,
18+
)
1419
from pulp_rpm.tests.functional.utils import get_package_repo_path
1520

1621

@@ -260,7 +265,6 @@ def test_no_sign_packages_on_sync_if_no_signing_service(
260265

261266
# Setup GPG and RPM tool
262267
gpg_a, _ = signing_gpg_extra
263-
fingerprint = gpg_a.fingerprint
264268

265269
rpm_tool = RpmTool(tmp_path)
266270
rpm_tool.import_pubkey_string(gpg_a.pubkey)
@@ -414,3 +418,94 @@ def test_error_signing_with_mirror_complete_sync(
414418
assert json.loads(e.value.body) == [
415419
"Cannot use 'mirror_complete' sync policy when repository has a package signing service."
416420
]
421+
422+
423+
def test_no_resync_of_packages_on_second_sync(
424+
init_and_sync,
425+
tmp_path,
426+
gen_object_with_cleanup,
427+
download_content_unit,
428+
signing_gpg_extra,
429+
rpm_package_signing_service,
430+
rpm_package_api,
431+
rpm_repository_api,
432+
rpm_repository_factory,
433+
rpm_publication_factory,
434+
rpm_distribution_factory,
435+
delete_orphans_pre,
436+
get_content,
437+
):
438+
"""
439+
Ensure that a second sync doesn't re-download and re-sign packages that haven't changed
440+
"""
441+
from pulp_rpm.app.shared_utils import RpmTool
442+
443+
# Setup GPG and RPM tool
444+
gpg_a, _ = signing_gpg_extra
445+
446+
rpm_tool = RpmTool(tmp_path)
447+
rpm_tool.import_pubkey_string(gpg_a.pubkey)
448+
449+
# Create repository with package signing service configured
450+
repository = rpm_repository_factory(
451+
package_signing_service=rpm_package_signing_service.pulp_href,
452+
package_signing_fingerprint=gpg_a.fingerprint,
453+
)
454+
init_and_sync(
455+
repository=repository,
456+
url=RPM_UNSIGNED_MODIFIED_FIXTURE_URL,
457+
sync_policy="mirror_content_only",
458+
)
459+
460+
# Get synced packages - refresh repository to get latest version
461+
updated_repository = rpm_repository_api.read(repository.pulp_href)
462+
packages = rpm_package_api.list(repository_version=updated_repository.latest_version_href)
463+
assert packages.count > 0, "No packages were synced"
464+
465+
# Test the first package to verify it was signed during sync
466+
test_package = packages.results[0]
467+
468+
# Verify that the final served package is signed
469+
publication = rpm_publication_factory(repository=repository.pulp_href)
470+
distribution = rpm_distribution_factory(publication=publication.pulp_href)
471+
downloaded_package = tmp_path / "package.rpm"
472+
downloaded_package.write_bytes(
473+
download_content_unit(
474+
distribution.base_path, get_package_repo_path(test_package.location_href)
475+
)
476+
)
477+
assert rpm_tool.verify_signature(downloaded_package)
478+
479+
# Save the content information of the packages from the original sync
480+
original_packages = {
481+
content["name"]: content
482+
for content in get_content(updated_repository)["present"][RPM_PACKAGE_CONTENT_NAME]
483+
}
484+
485+
init_and_sync(repository=repository, sync_policy="mirror_content_only")
486+
# Get synced packages - refresh repository to get latest version
487+
updated_repository = rpm_repository_api.read(repository.pulp_href)
488+
packages = rpm_package_api.list(repository_version=updated_repository.latest_version_href)
489+
assert packages.count > 0, "No packages were synced"
490+
491+
# Test the first package to verify it was signed during sync
492+
test_package = packages.results[0]
493+
494+
# Verify that the final served package is signed
495+
publication = rpm_publication_factory(repository=repository.pulp_href)
496+
distribution = rpm_distribution_factory(publication=publication.pulp_href)
497+
downloaded_package = tmp_path / "package.rpm"
498+
downloaded_package.write_bytes(
499+
download_content_unit(
500+
distribution.base_path, get_package_repo_path(test_package.location_href)
501+
)
502+
)
503+
assert rpm_tool.verify_signature(downloaded_package)
504+
505+
# Test that the zebra package wasn't re-synced
506+
mutated_packages = {
507+
content["name"]: content
508+
for content in get_content(updated_repository)["present"][RPM_PACKAGE_CONTENT_NAME]
509+
}
510+
511+
assert original_packages["zebra"] == mutated_packages["zebra"]

pulp_rpm/tests/functional/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696

9797
RPM_UNSIGNED_FIXTURE_URL = urljoin(PULP_FIXTURES_BASE_URL, "rpm-unsigned/")
9898

99+
RPM_UNSIGNED_MODIFIED_FIXTURE_URL = urljoin(PULP_FIXTURES_BASE_URL, "rpm-unsigned-modified/")
100+
99101
RPM_ZSTD_METADATA_FIXTURE_URL = urljoin(PULP_FIXTURES_BASE_URL, "rpm-zstd-metadata/")
100102

101103
RPM_INVALID_FIXTURE_URL = urljoin(PULP_FIXTURES_BASE_URL, "rpm-missing-filelists/")

0 commit comments

Comments
 (0)