Skip to content

Commit 40d51f1

Browse files
fix broken lineage for repeated runs (#2710)
Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent 83608bb commit 40d51f1

4 files changed

Lines changed: 114 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## [Unreleased](https://github.com/MarquezProject/marquez/compare/0.43.0...HEAD)
44

5+
### Fixed
6+
* API: fix broken lineage graph for multiple runs of the same job [`#2710`](https://github.com/MarquezProject/marquez/pull/2710) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
7+
*Problem: the lineage graph was not available for jobs that were run multiple times, as a result of a bug introduced in the recent release. To fix the inconsistent data, [this query](https://github.com/MarquezProject/marquez/blob/83608bb13bd4dc235c065f95bebf8a88dcb53c61/api/src/main/java/marquez/db/migrations/V67_2_JobVersionsIOMappingBackfillJob.java#L19) should be run. This is not required when upgrading directly to this version.*
8+
59
## [0.43.0](https://github.com/MarquezProject/marquez/compare/0.42.0...0.43.0) - 2023-12-15
610
### Added
711
* API: refactor the `RunDao` SQL query [`#2685`](https://github.com/MarquezProject/marquez/pull/2685) [@sophiely](https://github.com/sophiely)

api/src/main/java/marquez/db/JobVersionDao.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ ExtendedJobVersionRow upsertJobVersion(
204204
INSERT INTO job_versions_io_mapping (
205205
job_version_uuid, dataset_uuid, io_type, job_uuid, job_symlink_target_uuid, is_current_job_version, made_current_at)
206206
VALUES (:jobVersionUuid, :datasetUuid, :ioType, :jobUuid, :symlinkTargetJobUuid, TRUE, NOW())
207-
ON CONFLICT (job_version_uuid, dataset_uuid, io_type, job_uuid) DO NOTHING
207+
ON CONFLICT (job_version_uuid, dataset_uuid, io_type, job_uuid) DO UPDATE SET is_current_job_version = TRUE
208208
""")
209209
void upsertCurrentInputOrOutputDatasetFor(
210210
UUID jobVersionUuid,

api/src/test/java/marquez/db/LineageTestUtils.java

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,13 +116,48 @@ public static UpdateLineageRow createLineageRow(
116116
List<Dataset> outputs,
117117
@Valid LineageEvent.ParentRunFacet parentRunFacet,
118118
ImmutableMap<String, Object> runFacets) {
119+
return createLineageRow(
120+
dao,
121+
jobName,
122+
UUID.randomUUID(),
123+
status,
124+
jobFacet,
125+
inputs,
126+
outputs,
127+
parentRunFacet,
128+
runFacets);
129+
}
130+
131+
/**
132+
* Create an {@link UpdateLineageRow} from the input job details and datasets.
133+
*
134+
* @param dao
135+
* @param jobName
136+
* @param runId
137+
* @param status
138+
* @param jobFacet
139+
* @param inputs
140+
* @param outputs
141+
* @param parentRunFacet
142+
* @param runFacets
143+
* @return
144+
*/
145+
public static UpdateLineageRow createLineageRow(
146+
OpenLineageDao dao,
147+
String jobName,
148+
UUID runId,
149+
String status,
150+
JobFacet jobFacet,
151+
List<Dataset> inputs,
152+
List<Dataset> outputs,
153+
@Valid LineageEvent.ParentRunFacet parentRunFacet,
154+
ImmutableMap<String, Object> runFacets) {
119155
NominalTimeRunFacet nominalTimeRunFacet = new NominalTimeRunFacet();
120156
nominalTimeRunFacet.setNominalStartTime(
121157
Instant.now().atZone(LOCAL_ZONE).truncatedTo(ChronoUnit.HOURS));
122158
nominalTimeRunFacet.setNominalEndTime(
123159
nominalTimeRunFacet.getNominalStartTime().plus(1, ChronoUnit.HOURS));
124160

125-
UUID runId = UUID.randomUUID();
126161
LineageEvent event =
127162
LineageEvent.builder()
128163
.eventType(status)

api/src/test/java/marquez/service/LineageServiceTest.java

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import java.util.LinkedList;
1717
import java.util.List;
1818
import java.util.Optional;
19+
import java.util.UUID;
1920
import java.util.stream.Collectors;
2021
import marquez.api.JdbiUtils;
2122
import marquez.common.models.DatasetId;
@@ -56,6 +57,7 @@
5657
import org.junit.jupiter.api.BeforeAll;
5758
import org.junit.jupiter.api.Test;
5859
import org.junit.jupiter.api.extension.ExtendWith;
60+
import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
5961

6062
@ExtendWith(MarquezJdbiExternalPostgresExtension.class)
6163
public class LineageServiceTest {
@@ -427,6 +429,77 @@ public void testLineageWithWithCycle() {
427429
.matches(n -> n.isJobType() && n.asJobId().getName().getValue().equals("writeJob"));
428430
}
429431

432+
@Test
433+
public void testGetLineageJobRunTwice() {
434+
Dataset input = Dataset.builder().name("input-dataset").namespace(NAMESPACE).build();
435+
Dataset output = Dataset.builder().name("output-dataset").namespace(NAMESPACE).build();
436+
UUID runId = UUID.randomUUID();
437+
438+
// (1) Run batch job which reads input-dataset (START) and writes output-dataset (COMPLETE)
439+
LineageTestUtils.createLineageRow(
440+
openLineageDao,
441+
"someJob",
442+
runId,
443+
"START",
444+
jobFacet,
445+
Arrays.asList(input),
446+
Collections.emptyList(),
447+
null,
448+
ImmutableMap.of());
449+
450+
LineageTestUtils.createLineageRow(
451+
openLineageDao,
452+
"someJob",
453+
runId,
454+
"COMPLETE",
455+
jobFacet,
456+
Collections.emptyList(),
457+
Arrays.asList(output),
458+
null,
459+
ImmutableMap.of());
460+
461+
// (2) Rerun it
462+
LineageTestUtils.createLineageRow(
463+
openLineageDao,
464+
"someJob",
465+
runId,
466+
"START",
467+
jobFacet,
468+
Arrays.asList(input),
469+
Collections.emptyList(),
470+
null,
471+
ImmutableMap.of());
472+
473+
LineageTestUtils.createLineageRow(
474+
openLineageDao,
475+
"someJob",
476+
runId,
477+
"COMPLETE",
478+
jobFacet,
479+
Collections.emptyList(),
480+
Arrays.asList(output),
481+
null,
482+
ImmutableMap.of());
483+
484+
// (3) lineage on output dataset shall be the same as lineage on input dataset
485+
Lineage lineageFromInput =
486+
lineageService.lineage(
487+
NodeId.of(
488+
new DatasetId(new NamespaceName(NAMESPACE), new DatasetName("input-dataset"))),
489+
5,
490+
true);
491+
492+
Lineage lineageFromOutput =
493+
lineageService.lineage(
494+
NodeId.of(
495+
new DatasetId(new NamespaceName(NAMESPACE), new DatasetName("output-dataset"))),
496+
5,
497+
true);
498+
499+
assertThat(lineageFromInput.getGraph()).hasSize(3); // 2 datasets + 1 job
500+
assertThat(lineageFromInput.getGraph()).isEqualTo(lineageFromOutput.getGraph());
501+
}
502+
430503
@Test
431504
public void testGetLineageForRunningStreamingJob() {
432505
Dataset input = Dataset.builder().name("input-dataset").namespace(NAMESPACE).build();

0 commit comments

Comments
 (0)