Skip to content

Commit b73fb15

Browse files
Runless events - refactor job_versions_io_mapping (#2654)
* get lineage from job_versions_io_mapping table only Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * add made_current_at field to job_versions Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> --------- Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent a5a0e55 commit b73fb15

10 files changed

Lines changed: 617 additions & 71 deletions

api/src/main/java/marquez/db/JobVersionDao.java

Lines changed: 71 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -192,40 +192,73 @@ ExtendedJobVersionRow upsertJobVersion(
192192
String namespaceName);
193193

194194
/**
195-
* Used to link an input dataset to a given job version.
195+
* Used to upsert an input or output dataset to a given job version.
196196
*
197197
* @param jobVersionUuid The unique ID of the job version.
198-
* @param inputDatasetUuid The unique ID of the input dataset.
198+
* @param datasetUuid The unique ID of the input or output dataset.
199+
* @param ioType The {@link IoType} of the dataset.
200+
* @param jobUuid The unique ID of the job.
199201
*/
200-
default void upsertInputDatasetFor(UUID jobVersionUuid, UUID inputDatasetUuid) {
201-
upsertInputOrOutputDatasetFor(jobVersionUuid, inputDatasetUuid, IoType.INPUT);
202-
}
202+
@SqlUpdate( // on conflict the existing mapping row is re-flagged as current rather than left stale
203+
"""
204+
INSERT INTO job_versions_io_mapping (
205+
job_version_uuid, dataset_uuid, io_type, job_uuid, job_symlink_target_uuid, is_current_job_version, made_current_at)
206+
VALUES (:jobVersionUuid, :datasetUuid, :ioType, :jobUuid, :symlinkTargetJobUuid, TRUE, NOW())
207+
ON CONFLICT (job_version_uuid, dataset_uuid, io_type, job_uuid) DO UPDATE SET is_current_job_version = TRUE
208+
""")
209+
void upsertCurrentInputOrOutputDatasetFor(
210+
UUID jobVersionUuid,
211+
UUID datasetUuid,
212+
UUID jobUuid,
213+
UUID symlinkTargetJobUuid,
214+
IoType ioType);
215+
216+
@SqlUpdate(
217+
"""
218+
UPDATE job_versions_io_mapping
219+
SET is_current_job_version = FALSE
220+
WHERE (job_uuid = :jobUuid OR job_symlink_target_uuid = :jobUuid)
221+
AND job_version_uuid != :jobVersionUuid
222+
AND io_type = :ioType
223+
AND is_current_job_version = TRUE;
224+
""")
225+
void markInputOrOutputDatasetAsPreviousFor(UUID jobVersionUuid, UUID jobUuid, IoType ioType);
226+
227+
@SqlUpdate(
228+
"""
229+
UPDATE job_versions_io_mapping
230+
SET is_current_job_version = FALSE
231+
WHERE (job_uuid = :jobUuid OR job_symlink_target_uuid = :jobUuid)
232+
AND io_type = :ioType
233+
AND is_current_job_version = TRUE;
234+
""")
235+
void markInputOrOutputDatasetAsPreviousFor(UUID jobUuid, IoType ioType);
203236

204237
/**
205-
* Used to link an output dataset to a given job version.
238+
* Used to link an input dataset to a given job version.
206239
*
207-
* @param jobVersionUuid The unique ID of the job version.
208-
* @param outputDatasetUuid The unique ID of the output dataset.
240+
* @param inputDatasetUuid The unique ID of the input dataset.
241+
* @param jobUuid The unique ID of the job.
209242
*/
210-
default void upsertOutputDatasetFor(UUID jobVersionUuid, UUID outputDatasetUuid) {
211-
upsertInputOrOutputDatasetFor(jobVersionUuid, outputDatasetUuid, IoType.OUTPUT);
243+
default void upsertInputDatasetFor(
244+
UUID jobVersionUuid, UUID inputDatasetUuid, UUID jobUuid, UUID symlinkTargetJobUuid) {
245+
markInputOrOutputDatasetAsPreviousFor(jobVersionUuid, jobUuid, IoType.INPUT);
246+
upsertCurrentInputOrOutputDatasetFor(
247+
jobVersionUuid, inputDatasetUuid, jobUuid, symlinkTargetJobUuid, IoType.INPUT);
212248
}
213249

214250
/**
215-
* Used to upsert an input or output dataset to a given job version.
251+
* Used to link an output dataset to a given job version.
216252
*
217-
* @param jobVersionUuid The unique ID of the job version.
218-
* @param datasetUuid The unique ID of the output dataset
219-
* @param ioType The {@link IoType} of the dataset.
253+
* @param outputDatasetUuid The unique ID of the output dataset.
254+
* @param jobUuid The unique ID of the job.
220255
*/
221-
@SqlUpdate(
222-
"""
223-
INSERT INTO job_versions_io_mapping (
224-
job_version_uuid, dataset_uuid, io_type)
225-
VALUES (:jobVersionUuid, :datasetUuid, :ioType)
226-
ON CONFLICT DO NOTHING
227-
""")
228-
void upsertInputOrOutputDatasetFor(UUID jobVersionUuid, UUID datasetUuid, IoType ioType);
256+
default void upsertOutputDatasetFor(
257+
UUID jobVersionUuid, UUID outputDatasetUuid, UUID jobUuid, UUID symlinkTargetJobUuid) {
258+
markInputOrOutputDatasetAsPreviousFor(jobVersionUuid, jobUuid, IoType.OUTPUT);
259+
upsertCurrentInputOrOutputDatasetFor(
260+
jobVersionUuid, outputDatasetUuid, jobUuid, symlinkTargetJobUuid, IoType.OUTPUT);
261+
}
229262

230263
/**
231264
* Returns the input datasets to a given job version.
@@ -366,14 +399,20 @@ default BagOfJobVersionInfo upsertRunlessJobVersion(
366399
inputs.forEach(
367400
i -> {
368401
jobVersionDao.upsertInputDatasetFor(
369-
jobVersionRow.getUuid(), i.getDatasetVersionRow().getDatasetUuid());
402+
jobVersionRow.getUuid(),
403+
i.getDatasetVersionRow().getDatasetUuid(),
404+
jobVersionRow.getJobUuid(),
405+
jobRow.getSymlinkTargetId());
370406
});
371407

372408
// Link the output datasets to the job version.
373409
outputs.forEach(
374410
o -> {
375411
jobVersionDao.upsertOutputDatasetFor(
376-
jobVersionRow.getUuid(), o.getDatasetVersionRow().getDatasetUuid());
412+
jobVersionRow.getUuid(),
413+
o.getDatasetVersionRow().getDatasetUuid(),
414+
jobVersionRow.getJobUuid(),
415+
jobRow.getSymlinkTargetId());
377416
});
378417

379418
jobDao.updateVersionFor(jobRow.getUuid(), jobRow.getCreatedAt(), jobVersionRow.getUuid());
@@ -468,14 +507,20 @@ default BagOfJobVersionInfo upsertJobVersionOnRunTransition(
468507
jobVersionInputs.forEach(
469508
jobVersionInput -> {
470509
jobVersionDao.upsertInputDatasetFor(
471-
jobVersionRow.getUuid(), jobVersionInput.getDatasetUuid());
510+
jobVersionRow.getUuid(),
511+
jobVersionInput.getDatasetUuid(),
512+
jobVersionRow.getJobUuid(),
513+
jobRow.getSymlinkTargetId());
472514
});
473515

474516
// Link the output datasets to the job version.
475517
jobVersionOutputs.forEach(
476518
jobVersionOutput -> {
477519
jobVersionDao.upsertOutputDatasetFor(
478-
jobVersionRow.getUuid(), jobVersionOutput.getDatasetUuid());
520+
jobVersionRow.getUuid(),
521+
jobVersionOutput.getDatasetUuid(),
522+
jobVersionRow.getJobUuid(),
523+
jobRow.getSymlinkTargetId());
479524
});
480525

481526
// Link the job version to the run.

api/src/main/java/marquez/db/LineageDao.java

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -56,43 +56,45 @@ public record UpstreamRunRow(JobSummary job, RunSummary run, DatasetSummary inpu
5656
@SqlQuery(
5757
"""
5858
WITH RECURSIVE
59-
-- Find the current version of a job or its symlink target if the target has no
60-
-- current_version_uuid. This ensures that we don't lose lineage for a job after it is
61-
-- symlinked to another job but before that target job has run successfully.
62-
job_current_version AS (
63-
SELECT COALESCE(j.symlink_target_uuid, j.uuid) AS job_uuid,
64-
COALESCE(s.current_version_uuid, j.current_version_uuid) AS job_version_uuid
65-
FROM jobs j
66-
LEFT JOIN jobs s ON s.uuid=j.symlink_target_uuid
67-
WHERE s.current_version_uuid IS NULL
68-
),
69-
job_io AS (
70-
SELECT j.job_uuid,
71-
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='INPUT') AS inputs,
72-
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='OUTPUT') AS outputs
73-
FROM job_versions_io_mapping io
74-
INNER JOIN job_current_version j ON io.job_version_uuid=j.job_version_uuid
75-
GROUP BY j.job_uuid
76-
),
77-
lineage(job_uuid, inputs, outputs) AS (
78-
SELECT v.job_uuid AS job_uuid,
79-
COALESCE(inputs, Array[]::uuid[]) AS inputs,
80-
COALESCE(outputs, Array[]::uuid[]) AS outputs,
81-
0 AS depth
82-
FROM jobs j
83-
INNER JOIN job_current_version v ON (j.symlink_target_uuid IS NULL AND j.uuid=v.job_uuid) OR v.job_uuid=j.symlink_target_uuid
84-
LEFT JOIN job_io io ON io.job_uuid=v.job_uuid
85-
WHERE j.uuid IN (<jobIds>) OR j.symlink_target_uuid IN (<jobIds>)
86-
UNION
87-
SELECT io.job_uuid, io.inputs, io.outputs, l.depth + 1
88-
FROM job_io io,
89-
lineage l
90-
WHERE io.job_uuid != l.job_uuid AND
91-
array_cat(io.inputs, io.outputs) && array_cat(l.inputs, l.outputs)
92-
AND depth < :depth)
93-
SELECT DISTINCT ON (j.uuid) j.*, inputs AS input_uuids, outputs AS output_uuids
94-
FROM lineage l2
95-
INNER JOIN jobs_view j ON j.uuid=l2.job_uuid;
59+
job_io AS (
60+
SELECT
61+
io.job_uuid AS job_uuid,
62+
io.job_symlink_target_uuid AS job_symlink_target_uuid,
63+
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io.io_type='INPUT') AS inputs,
64+
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io.io_type='OUTPUT') AS outputs
65+
FROM job_versions_io_mapping io
66+
WHERE io.is_current_job_version = TRUE
67+
GROUP BY io.job_symlink_target_uuid, io.job_uuid
68+
),
69+
lineage(job_uuid, job_symlink_target_uuid, inputs, outputs) AS (
70+
SELECT job_uuid,
71+
job_symlink_target_uuid,
72+
COALESCE(inputs, Array[]::uuid[]) AS inputs,
73+
COALESCE(outputs, Array[]::uuid[]) AS outputs,
74+
0 AS depth
75+
FROM job_io
76+
WHERE job_uuid IN (<jobIds>) OR job_symlink_target_uuid IN (<jobIds>)
77+
UNION
78+
SELECT io.job_uuid, io.job_symlink_target_uuid, io.inputs, io.outputs, l.depth + 1
79+
FROM job_io io, lineage l
80+
WHERE (io.job_uuid != l.job_uuid) AND
81+
array_cat(io.inputs, io.outputs) && array_cat(l.inputs, l.outputs)
82+
AND depth < :depth),
83+
lineage_outside_job_io(job_uuid) AS (
84+
SELECT
85+
param_jobs.param_job_uuid as job_uuid,
86+
j.symlink_target_uuid,
87+
Array[]::uuid[] AS inputs,
88+
Array[]::uuid[] AS outputs,
89+
0 AS depth
90+
FROM (SELECT unnest(ARRAY[<jobIds>]::UUID[]) AS param_job_uuid) param_jobs
91+
LEFT JOIN lineage l on param_jobs.param_job_uuid = l.job_uuid
92+
INNER JOIN jobs j ON j.uuid = param_jobs.param_job_uuid
93+
WHERE l.job_uuid IS NULL
94+
)
95+
SELECT DISTINCT ON (j.uuid) j.*, inputs AS input_uuids, outputs AS output_uuids
96+
FROM (SELECT * FROM lineage UNION SELECT * FROM lineage_outside_job_io) l2
97+
INNER JOIN jobs_view j ON (j.uuid=l2.job_uuid OR j.uuid=l2.job_symlink_target_uuid)
9698
""")
9799
Set<JobData> getLineage(@BindList Set<UUID> jobIds, int depth);
98100

api/src/main/java/marquez/db/OpenLineageDao.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import marquez.common.models.SourceType;
3434
import marquez.db.DatasetFieldDao.DatasetFieldMapping;
3535
import marquez.db.JobVersionDao.BagOfJobVersionInfo;
36+
import marquez.db.JobVersionDao.IoType;
3637
import marquez.db.RunDao.RunUpsert;
3738
import marquez.db.RunDao.RunUpsert.RunUpsertBuilder;
3839
import marquez.db.mappers.LineageEventMapper;
@@ -362,27 +363,33 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
362363

363364
// RunInput list uses null as a sentinel value
364365
List<DatasetRecord> datasetInputs = null;
365-
if (event.getInputs() != null) {
366+
if (event.getInputs() != null && !event.getInputs().isEmpty()) {
366367
datasetInputs = new ArrayList<>();
367368
for (Dataset dataset : event.getInputs()) {
368369
DatasetRecord record = upsertLineageDataset(daos, dataset, now, runUuid, true);
369370
datasetInputs.add(record);
370371
insertDatasetFacets(daos, dataset, record, runUuid, event.getEventType(), now);
371372
insertInputDatasetFacets(daos, dataset, record, runUuid, event.getEventType(), now);
372373
}
374+
} else {
375+
// no inputs in this event: flag the job's current INPUT rows in job_versions_io_mapping as no longer current
376+
daos.getJobVersionDao().markInputOrOutputDatasetAsPreviousFor(job.getUuid(), IoType.INPUT);
373377
}
374378
bag.setInputs(Optional.ofNullable(datasetInputs));
375379

376380
// RunInput list uses null as a sentinel value
377381
List<DatasetRecord> datasetOutputs = null;
378-
if (event.getOutputs() != null) {
382+
if (event.getOutputs() != null && !event.getOutputs().isEmpty()) {
379383
datasetOutputs = new ArrayList<>();
380384
for (Dataset dataset : event.getOutputs()) {
381385
DatasetRecord record = upsertLineageDataset(daos, dataset, now, runUuid, false);
382386
datasetOutputs.add(record);
383387
insertDatasetFacets(daos, dataset, record, runUuid, event.getEventType(), now);
384388
insertOutputDatasetFacets(daos, dataset, record, runUuid, event.getEventType(), now);
385389
}
390+
} else {
391+
// no outputs in this event: flag the job's current OUTPUT rows in job_versions_io_mapping as no longer current
392+
daos.getJobVersionDao().markInputOrOutputDatasetAsPreviousFor(job.getUuid(), IoType.OUTPUT);
386393
}
387394

388395
bag.setOutputs(Optional.ofNullable(datasetOutputs));
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Copyright 2018-2023 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.db.migrations;
7+
8+
import lombok.extern.slf4j.Slf4j;
9+
import org.flywaydb.core.api.MigrationVersion;
10+
import org.flywaydb.core.api.migration.Context;
11+
import org.flywaydb.core.api.migration.JavaMigration;
12+
import org.jdbi.v3.core.Jdbi;
13+
14+
@Slf4j
15+
public class V67_2_JobVersionsIOMappingBackfillJob implements JavaMigration {
16+
17+
public static final String UPDATE_QUERY =
18+
"""
19+
UPDATE job_versions_io_mapping
20+
SET
21+
job_uuid = j.uuid,
22+
job_symlink_target_uuid = j.symlink_target_uuid,
23+
is_current_job_version = (jv.uuid = j.current_version_uuid)::BOOLEAN,
24+
made_current_at = NOW()
25+
FROM job_versions jv
26+
INNER JOIN jobs_view j ON j.uuid = jv.job_uuid
27+
WHERE jv.uuid = job_versions_io_mapping.job_version_uuid
28+
""";
29+
30+
@Override
31+
public MigrationVersion getVersion() {
32+
return MigrationVersion.fromVersion("67.2");
33+
}
34+
35+
@Override
36+
public void migrate(Context context) throws Exception {
37+
Jdbi jdbi = Jdbi.create(context.getConnection());
38+
jdbi.withHandle(h -> h.createUpdate(UPDATE_QUERY).execute());
39+
}
40+
41+
@Override
42+
public String getDescription() {
43+
return "Back fill job_uuid and is_current_job_version in job_versions_io_mapping table";
44+
}
45+
46+
@Override
47+
public Integer getChecksum() {
48+
return null;
49+
}
50+
51+
@Override
52+
public boolean isUndo() {
53+
return false;
54+
}
55+
56+
@Override
57+
public boolean canExecuteInTransaction() {
58+
return false;
59+
}
60+
61+
@Override
62+
public boolean isBaselineMigration() {
63+
return false;
64+
}
65+
}

api/src/main/resources/marquez/db/migration/R__1_Jobs_view_and_rewrite_function.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ BEGIN
111111
LEFT JOIN aliases a ON a.link_target_uuid = j.uuid
112112
) j
113113
WHERE jobs.uuid=j.uuid;
114+
UPDATE job_versions_io_mapping
115+
SET job_symlink_target_uuid=j.symlink_target_uuid
116+
FROM jobs j
117+
WHERE job_versions_io_mapping.job_uuid=j.uuid AND j.uuid = NEW.uuid;
114118
END IF;
115119
SELECT * INTO inserted_job FROM jobs_view
116120
WHERE uuid=job_uuid OR (new_symlink_target_uuid IS NOT NULL AND uuid=new_symlink_target_uuid);
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
ALTER TABLE job_versions_io_mapping ADD COLUMN job_uuid uuid REFERENCES jobs(uuid) ON DELETE CASCADE;
2+
ALTER TABLE job_versions_io_mapping ADD COLUMN job_symlink_target_uuid uuid REFERENCES jobs(uuid) ON DELETE CASCADE;
3+
ALTER TABLE job_versions_io_mapping ADD COLUMN is_current_job_version boolean DEFAULT FALSE;
4+
ALTER TABLE job_versions_io_mapping ADD COLUMN made_current_at TIMESTAMP;
5+
6+
-- To add job_uuid to the unique constraint, we first drop the primary key, then recreate it as a unique constraint. Note: because job_version_uuid can now be NULL, we need to check that job_version_uuid IS NOT NULL before inserting (duplicate rows otherwise).
7+
ALTER TABLE job_versions_io_mapping DROP CONSTRAINT job_versions_io_mapping_pkey;
8+
ALTER TABLE job_versions_io_mapping ALTER COLUMN job_version_uuid DROP NOT NULL;
9+
10+
CREATE INDEX job_versions_io_mapping_job_uuid_job_symlink_target_uuid ON job_versions_io_mapping (job_uuid, job_symlink_target_uuid);
11+
12+
ALTER TABLE job_versions_io_mapping ADD CONSTRAINT job_versions_io_mapping_mapping_pkey UNIQUE (job_version_uuid, dataset_uuid, io_type, job_uuid);

0 commit comments

Comments
 (0)