Skip to content

Commit b9abb19

Browse files
authored
Fix bug that caused a single run event to create multiple jobs (#2162)
Signed-off-by: Michael Collado <collado.mike@gmail.com> Signed-off-by: Michael Collado <collado.mike@gmail.com>
1 parent 67e9249 commit b9abb19

2 files changed

Lines changed: 161 additions & 78 deletions

File tree

api/src/main/java/marquez/db/OpenLineageDao.java

Lines changed: 105 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@
5858
import marquez.service.models.LineageEvent.LifecycleStateChangeFacet;
5959
import marquez.service.models.LineageEvent.NominalTimeRunFacet;
6060
import marquez.service.models.LineageEvent.ParentRunFacet;
61+
import marquez.service.models.LineageEvent.Run;
6162
import marquez.service.models.LineageEvent.RunFacet;
6263
import marquez.service.models.LineageEvent.SchemaDatasetFacet;
6364
import marquez.service.models.LineageEvent.SchemaField;
@@ -150,24 +151,12 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
150151
DEFAULT_NAMESPACE_OWNER);
151152
bag.setNamespace(namespace);
152153

153-
String description =
154-
Optional.ofNullable(event.getJob().getFacets())
155-
.map(JobFacet::getDocumentation)
156-
.map(DocumentationJobFacet::getDescription)
157-
.orElse(null);
158-
159154
Map<String, String> context = buildJobContext(event);
160155
JobContextRow jobContext =
161156
jobContextDao.upsert(
162157
UUID.randomUUID(), now, Utils.toJson(context), Utils.checksumFor(context));
163158
bag.setJobContext(jobContext);
164159

165-
String location =
166-
Optional.ofNullable(event.getJob().getFacets())
167-
.flatMap(f -> Optional.ofNullable(f.getSourceCodeLocation()))
168-
.flatMap(s -> Optional.ofNullable(s.getUrl()))
169-
.orElse(null);
170-
171160
Instant nominalStartTime =
172161
Optional.ofNullable(event.getRun().getFacets())
173162
.flatMap(f -> Optional.ofNullable(f.getNominalTime()))
@@ -181,75 +170,26 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
181170
.map(t -> t.withZoneSameInstant(ZoneId.of("UTC")).toInstant())
182171
.orElse(null);
183172

184-
Logger log = LoggerFactory.getLogger(OpenLineageDao.class);
185173
Optional<ParentRunFacet> parentRun =
186-
Optional.ofNullable(event.getRun())
187-
.map(LineageEvent.Run::getFacets)
188-
.map(RunFacet::getParent);
189-
174+
Optional.ofNullable(event.getRun()).map(Run::getFacets).map(RunFacet::getParent);
190175
Optional<UUID> parentUuid = parentRun.map(Utils::findParentRunUuid);
191-
Optional<JobRow> parentJob =
192-
parentUuid.map(
193-
uuid ->
194-
findParentJobRow(
195-
event,
196-
namespace,
197-
jobContext,
198-
location,
199-
nominalStartTime,
200-
nominalEndTime,
201-
log,
202-
parentRun.get(),
203-
uuid));
204176

205-
// construct the simple name of the job by removing the parent prefix plus the dot '.' separator
206-
String jobName =
207-
parentJob
208-
.map(
209-
p -> {
210-
if (event.getJob().getName().startsWith(p.getName() + '.')) {
211-
return event.getJob().getName().substring(p.getName().length() + 1);
212-
} else {
213-
return event.getJob().getName();
214-
}
215-
})
216-
.orElse(event.getJob().getName());
217-
log.debug(
218-
"Calculated job name {} from job {} with parent {}",
219-
jobName,
220-
event.getJob().getName(),
221-
parentJob.map(JobRow::getName));
222177
JobRow job =
223-
parentJob
224-
.map(
225-
parent ->
226-
jobDao.upsertJob(
227-
UUID.randomUUID(),
228-
parent.getUuid(),
229-
getJobType(event.getJob()),
230-
now,
231-
namespace.getUuid(),
232-
namespace.getName(),
233-
jobName,
234-
description,
235-
jobContext.getUuid(),
236-
location,
237-
null,
238-
jobDao.toJson(toDatasetId(event.getInputs()), mapper)))
178+
runDao
179+
.findJobRowByRunUuid(runToUuid(event.getRun().getRunId()))
239180
.orElseGet(
240181
() ->
241-
jobDao.upsertJob(
242-
UUID.randomUUID(),
243-
getJobType(event.getJob()),
182+
buildJobFromEvent(
183+
event,
184+
mapper,
185+
jobDao,
244186
now,
245-
namespace.getUuid(),
246-
namespace.getName(),
247-
jobName,
248-
description,
249-
jobContext.getUuid(),
250-
location,
251-
null,
252-
jobDao.toJson(toDatasetId(event.getInputs()), mapper)));
187+
namespace,
188+
jobContext,
189+
nominalStartTime,
190+
nominalEndTime,
191+
parentRun));
192+
253193
bag.setJob(job);
254194

255195
Map<String, String> runArgsMap = createRunArgs(event);
@@ -277,8 +217,8 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
277217
runStateType,
278218
now,
279219
namespace.getName(),
280-
jobName,
281-
location,
220+
job.getName(),
221+
job.getLocation(),
282222
jobContext.getUuid());
283223
} else {
284224
run =
@@ -294,8 +234,8 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
294234
nominalEndTime,
295235
namespace.getUuid(),
296236
namespace.getName(),
297-
jobName,
298-
location,
237+
job.getName(),
238+
job.getLocation(),
299239
jobContext.getUuid());
300240
}
301241
bag.setRun(run);
@@ -363,6 +303,93 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
363303
return bag;
364304
}
365305

306+
private JobRow buildJobFromEvent(
307+
LineageEvent event,
308+
ObjectMapper mapper,
309+
JobDao jobDao,
310+
Instant now,
311+
NamespaceRow namespace,
312+
JobContextRow jobContext,
313+
Instant nominalStartTime,
314+
Instant nominalEndTime,
315+
Optional<ParentRunFacet> parentRun) {
316+
Logger log = LoggerFactory.getLogger(OpenLineageDao.class);
317+
String description =
318+
Optional.ofNullable(event.getJob().getFacets())
319+
.map(JobFacet::getDocumentation)
320+
.map(DocumentationJobFacet::getDescription)
321+
.orElse(null);
322+
323+
String location =
324+
Optional.ofNullable(event.getJob().getFacets())
325+
.flatMap(f -> Optional.ofNullable(f.getSourceCodeLocation()))
326+
.flatMap(s -> Optional.ofNullable(s.getUrl()))
327+
.orElse(null);
328+
329+
Optional<UUID> parentUuid = parentRun.map(Utils::findParentRunUuid);
330+
Optional<JobRow> parentJob =
331+
parentUuid.map(
332+
uuid ->
333+
findParentJobRow(
334+
event,
335+
namespace,
336+
jobContext,
337+
location,
338+
nominalStartTime,
339+
nominalEndTime,
340+
log,
341+
parentRun.get(),
342+
uuid));
343+
344+
// construct the simple name of the job by removing the parent prefix plus the dot '.' separator
345+
String jobName =
346+
parentJob
347+
.map(
348+
p -> {
349+
if (event.getJob().getName().startsWith(p.getName() + '.')) {
350+
return event.getJob().getName().substring(p.getName().length() + 1);
351+
} else {
352+
return event.getJob().getName();
353+
}
354+
})
355+
.orElse(event.getJob().getName());
356+
log.debug(
357+
"Calculated job name {} from job {} with parent {}",
358+
jobName,
359+
event.getJob().getName(),
360+
parentJob.map(JobRow::getName));
361+
return parentJob
362+
.map(
363+
parent ->
364+
jobDao.upsertJob(
365+
UUID.randomUUID(),
366+
parent.getUuid(),
367+
getJobType(event.getJob()),
368+
now,
369+
namespace.getUuid(),
370+
namespace.getName(),
371+
jobName,
372+
description,
373+
jobContext.getUuid(),
374+
location,
375+
null,
376+
jobDao.toJson(toDatasetId(event.getInputs()), mapper)))
377+
.orElseGet(
378+
() ->
379+
jobDao.upsertJob(
380+
UUID.randomUUID(),
381+
getJobType(event.getJob()),
382+
now,
383+
namespace.getUuid(),
384+
namespace.getName(),
385+
jobName,
386+
description,
387+
jobContext.getUuid(),
388+
location,
389+
null,
390+
jobDao.toJson(toDatasetId(event.getInputs()), mapper)));
391+
}
392+
366393
private JobRow findParentJobRow(
367394
LineageEvent event,
368395
NamespaceRow namespace,

api/src/test/java/marquez/OpenLineageIntegrationTest.java

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
import com.fasterxml.jackson.core.type.TypeReference;
1313
import com.fasterxml.jackson.databind.JsonNode;
1414
import com.fasterxml.jackson.databind.ObjectMapper;
15+
import com.fasterxml.jackson.databind.node.ObjectNode;
16+
import com.fasterxml.jackson.databind.node.TextNode;
1517
import com.google.common.collect.ImmutableMap;
1618
import io.dropwizard.util.Resources;
1719
import io.openlineage.client.OpenLineage;
@@ -369,6 +371,60 @@ public void testOpenLineageJobHierarchyAirflowIntegrationWithParentAndParentRunF
369371
assertThat(runsList).isNotEmpty().hasSize(1);
370372
}
371373

374+
@Test
375+
public void testOpenLineageJobHierarchyAirflowIntegrationWithParentOnStartEventOnly()
376+
throws ExecutionException, InterruptedException, TimeoutException, JsonProcessingException {
377+
OpenLineage ol = new OpenLineage(URI.create("http://openlineage.test.com/"));
378+
ZonedDateTime startOfHour =
379+
Instant.now()
380+
.atZone(LineageTestUtils.LOCAL_ZONE)
381+
.with(ChronoField.MINUTE_OF_HOUR, 0)
382+
.with(ChronoField.SECOND_OF_MINUTE, 0);
383+
ZonedDateTime endOfHour = startOfHour.plusHours(1);
384+
String airflowParentRunId = UUID.randomUUID().toString();
385+
String task1Name = "task1";
386+
String dagName = "the_dag";
387+
RunEvent event1 =
388+
createAirflowRunEvent(
389+
ol,
390+
startOfHour,
391+
endOfHour,
392+
airflowParentRunId,
393+
dagName,
394+
dagName + "." + task1Name,
395+
NAMESPACE_NAME);
396+
ObjectMapper mapper = Utils.newObjectMapper();
397+
JsonNode eventOneJson = mapper.valueToTree(event1);
398+
((ObjectNode) eventOneJson).set("eventType", new TextNode("START"));
399+
400+
event1.getRun().getFacets().getAdditionalProperties().remove("parent");
401+
CompletableFuture.allOf(
402+
sendLineage(mapper.writeValueAsString(eventOneJson))
403+
.thenCompose(
404+
r -> {
405+
try {
406+
return sendLineage(mapper.writeValueAsString(event1));
407+
} catch (JsonProcessingException e) {
408+
throw new RuntimeException(e);
409+
}
410+
}))
411+
.get(5, TimeUnit.SECONDS);
412+
413+
Job job = client.getJob(NAMESPACE_NAME, dagName + "." + task1Name);
414+
assertThat(job)
415+
.isNotNull()
416+
.hasFieldOrPropertyWithValue("id", new JobId(NAMESPACE_NAME, dagName + "." + task1Name))
417+
.hasFieldOrPropertyWithValue("parentJobName", dagName);
418+
419+
Job parentJob = client.getJob(NAMESPACE_NAME, dagName);
420+
assertThat(parentJob)
421+
.isNotNull()
422+
.hasFieldOrPropertyWithValue("id", new JobId(NAMESPACE_NAME, dagName))
423+
.hasFieldOrPropertyWithValue("parentJobName", null);
424+
List<Run> runsList = client.listRuns(NAMESPACE_NAME, dagName);
425+
assertThat(runsList).isNotEmpty().hasSize(1);
426+
}
427+
372428
@Test
373429
public void testOpenLineageJobHierarchyAirflowIntegrationWithDagNameWithDot()
374430
throws ExecutionException, InterruptedException, TimeoutException {

0 commit comments

Comments (0)