Skip to content

Commit ef1c494

Browse files
get column lineage by job
Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent 3c26f6f commit ef1c494

10 files changed

Lines changed: 189 additions & 30 deletions

File tree

api/src/main/java/marquez/db/DatasetFieldDao.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,22 @@ WHERE CAST((:namespaceName, :datasetName) AS DATASET_NAME) = ANY(d.dataset_symli
105105
""")
106106
List<UUID> findDatasetFieldsUuids(String namespaceName, String datasetName);
107107

108+
@SqlQuery(
109+
"""
110+
WITH latest_run AS (
111+
SELECT DISTINCT r.uuid as uuid, r.created_at
112+
FROM runs_view r
113+
WHERE r.namespace_name = :namespaceName AND r.job_name = :jobName
114+
ORDER BY r.created_at DESC
115+
LIMIT 1
116+
)
117+
SELECT dataset_fields.uuid
118+
FROM dataset_fields
119+
JOIN dataset_versions ON dataset_versions.dataset_uuid = dataset_fields.dataset_uuid
120+
JOIN latest_run ON dataset_versions.run_uuid = latest_run.uuid
121+
""")
122+
List<UUID> findFieldsUuidsByJob(String namespaceName, String jobName);
123+
108124
@SqlQuery(
109125
"""
110126
SELECT df.uuid

api/src/main/java/marquez/service/ColumnLineageService.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import lombok.extern.slf4j.Slf4j;
2020
import marquez.common.models.DatasetFieldId;
2121
import marquez.common.models.DatasetId;
22+
import marquez.common.models.JobId;
2223
import marquez.db.ColumnLineageDao;
2324
import marquez.db.DatasetFieldDao;
2425
import marquez.db.models.ColumnLineageNodeData;
@@ -124,6 +125,11 @@ List<UUID> getColumnNodeUuids(NodeId nodeId) {
124125
datasetFieldId.getDatasetId().getName().getValue(),
125126
datasetFieldId.getFieldName().getValue())
126127
.ifPresent(uuid -> columnNodeUuids.add(uuid));
128+
} else if (nodeId.isJobType()) {
129+
JobId jobId = nodeId.asJobId();
130+
columnNodeUuids.addAll(
131+
datasetFieldDao.findFieldsUuidsByJob(
132+
jobId.getNamespace().getValue(), jobId.getName().getValue()));
127133
}
128134
return columnNodeUuids;
129135
}

api/src/test/java/marquez/ColumnLineageIntegrationTest.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public void tearDown(Jdbi jdbi) {
6464

6565
@Test
6666
public void testColumnLineageEndpointByDataset() {
67-
MarquezClient.Lineage lineage = client.getColumnLineage("namespace", "dataset_b");
67+
MarquezClient.Lineage lineage = client.getColumnLineageByDataset("namespace", "dataset_b");
6868

6969
assertThat(lineage.getGraph()).hasSize(3);
7070
assertThat(getNodeByFieldName(lineage, "col_a")).isPresent();
@@ -74,7 +74,8 @@ public void testColumnLineageEndpointByDataset() {
7474

7575
@Test
7676
public void testColumnLineageEndpointByDatasetField() {
77-
MarquezClient.Lineage lineage = client.getColumnLineage("namespace", "dataset_b", "col_c");
77+
MarquezClient.Lineage lineage =
78+
client.getColumnLineageByDataset("namespace", "dataset_b", "col_c");
7879

7980
assertThat(lineage.getGraph()).hasSize(3);
8081
assertThat(getNodeByFieldName(lineage, "col_a")).isPresent();
@@ -85,7 +86,7 @@ public void testColumnLineageEndpointByDatasetField() {
8586
@Test
8687
public void testColumnLineageEndpointWithDepthLimit() {
8788
MarquezClient.Lineage lineage =
88-
client.getColumnLineage("namespace", "dataset_c", "col_d", 1, false);
89+
client.getColumnLineageByDatasetField("namespace", "dataset_c", "col_d", 1, false);
8990

9091
assertThat(lineage.getGraph()).hasSize(2);
9192
assertThat(getNodeByFieldName(lineage, "col_c")).isPresent();
@@ -95,12 +96,22 @@ public void testColumnLineageEndpointWithDepthLimit() {
9596
@Test
9697
public void testColumnLineageEndpointWithDownstream() {
9798
MarquezClient.Lineage lineage =
98-
client.getColumnLineage("namespace", "dataset_b", "col_c", 10, true);
99+
client.getColumnLineageByDatasetField("namespace", "dataset_b", "col_c", 10, true);
99100

100101
assertThat(lineage.getGraph()).hasSize(4);
101102
assertThat(getNodeByFieldName(lineage, "col_d")).isPresent();
102103
}
103104

105+
@Test
106+
public void testColumnLineageEndpointByJob() {
107+
MarquezClient.Lineage lineage = client.getColumnLineageByJob("namespace", "job1");
108+
109+
assertThat(lineage.getGraph()).hasSize(3);
110+
assertThat(getNodeByFieldName(lineage, "col_a")).isPresent();
111+
assertThat(getNodeByFieldName(lineage, "col_b")).isPresent();
112+
assertThat(getNodeByFieldName(lineage, "col_c")).isPresent();
113+
}
114+
104115
private Optional<Node> getNodeByFieldName(MarquezClient.Lineage lineage, String field) {
105116
return lineage.getGraph().stream()
106117
.filter(n -> n.getId().asDatasetFieldId().getField().equals(field))

api/src/test/java/marquez/service/ColumnLineageServiceTest.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import marquez.common.models.DatasetFieldId;
2121
import marquez.common.models.DatasetId;
2222
import marquez.common.models.DatasetName;
23+
import marquez.common.models.JobId;
24+
import marquez.common.models.JobName;
2325
import marquez.common.models.NamespaceName;
2426
import marquez.db.ColumnLineageDao;
2527
import marquez.db.ColumnLineageTestUtils;
@@ -355,6 +357,44 @@ public void testEnrichDatasetsHasNoDuplicates() {
355357
assertThat(dataset_b.getColumnLineage()).hasSize(1);
356358
}
357359

360+
@Test
361+
public void testGetLineageByJob() {
362+
LineageEvent.Dataset dataset_A = getDatasetA();
363+
LineageEvent.Dataset dataset_B = getDatasetB();
364+
LineageEvent.Dataset dataset_C = getDatasetC();
365+
366+
LineageTestUtils.createLineageRow(
367+
openLineageDao,
368+
"job1",
369+
"COMPLETE",
370+
jobFacet,
371+
Arrays.asList(dataset_A),
372+
Arrays.asList(dataset_B));
373+
374+
LineageTestUtils.createLineageRow(
375+
openLineageDao,
376+
"job2",
377+
"COMPLETE",
378+
jobFacet,
379+
Arrays.asList(dataset_B),
380+
Arrays.asList(dataset_C));
381+
382+
// getting lineage by job1 should be the same as getting it by dataset_B
383+
assertThat(
384+
lineageService.lineage(
385+
NodeId.of(JobId.of(NamespaceName.of("namespace"), JobName.of("job1"))),
386+
20,
387+
true,
388+
Instant.now()))
389+
.isEqualTo(
390+
lineageService.lineage(
391+
NodeId.of(
392+
new DatasetId(NamespaceName.of("namespace"), DatasetName.of("dataset_b"))),
393+
20,
394+
true,
395+
Instant.now()));
396+
}
397+
358398
private Optional<Node> getNode(Lineage lineage, String datasetName, String fieldName) {
359399
return lineage.getGraph().stream()
360400
.filter(n -> n.getId().asDatasetFieldId().getFieldName().getValue().equals(fieldName))

clients/java/src/main/java/marquez/client/MarquezClient.java

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,33 +115,50 @@ public enum SortDirection {
115115
@Getter public final String value;
116116
}
117117

118-
public Lineage getColumnLineage(@NonNull String namespaceName, @NonNull String datasetName) {
119-
return getColumnLineage(namespaceName, datasetName, DEFAULT_LINEAGE_GRAPH_DEPTH, false);
118+
public Lineage getColumnLineageByDataset(
119+
@NonNull String namespaceName, @NonNull String datasetName) {
120+
return getColumnLineageByDataset(
121+
namespaceName, datasetName, DEFAULT_LINEAGE_GRAPH_DEPTH, false);
120122
}
121123

122-
public Lineage getColumnLineage(
124+
public Lineage getColumnLineageByDataset(
123125
@NonNull String namespaceName, @NonNull String datasetName, @NonNull String field) {
124-
return getColumnLineage(namespaceName, datasetName, field, DEFAULT_LINEAGE_GRAPH_DEPTH, false);
126+
return getColumnLineageByDatasetField(
127+
namespaceName, datasetName, field, DEFAULT_LINEAGE_GRAPH_DEPTH, false);
125128
}
126129

127-
public Lineage getColumnLineage(
130+
public Lineage getColumnLineageByDataset(
128131
@NonNull String namespaceName,
129132
@NonNull String datasetName,
130133
int depth,
131134
boolean withDownstream) {
132135
final String bodyAsJson =
133-
http.get(url.toColumnLineageUrl(namespaceName, datasetName, depth, withDownstream));
136+
http.get(
137+
url.toColumnLineageUrlByDataset(namespaceName, datasetName, depth, withDownstream));
134138
return Lineage.fromJson(bodyAsJson);
135139
}
136140

137-
public Lineage getColumnLineage(
141+
public Lineage getColumnLineageByDatasetField(
138142
@NonNull String namespaceName,
139143
@NonNull String datasetName,
140144
@NonNull String field,
141145
int depth,
142146
boolean withDownstream) {
143147
final String bodyAsJson =
144-
http.get(url.toColumnLineageUrl(namespaceName, datasetName, field, depth, withDownstream));
148+
http.get(
149+
url.toColumnLineageUrlByDatasetField(
150+
namespaceName, datasetName, field, depth, withDownstream));
151+
return Lineage.fromJson(bodyAsJson);
152+
}
153+
154+
public Lineage getColumnLineageByJob(@NonNull String namespaceName, @NonNull String jobName) {
155+
return getColumnLineageByJob(namespaceName, jobName, DEFAULT_LINEAGE_GRAPH_DEPTH, false);
156+
}
157+
158+
public Lineage getColumnLineageByJob(
159+
@NonNull String namespaceName, @NonNull String jobName, int depth, boolean withDownstream) {
160+
final String bodyAsJson =
161+
http.get(url.toColumnLineageUrlByJob(namespaceName, jobName, depth, withDownstream));
145162
return Lineage.fromJson(bodyAsJson);
146163
}
147164

clients/java/src/main/java/marquez/client/MarquezUrl.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import lombok.NonNull;
4545
import marquez.client.models.DatasetFieldId;
4646
import marquez.client.models.DatasetId;
47+
import marquez.client.models.JobId;
4748
import marquez.client.models.NodeId;
4849
import marquez.client.models.RunState;
4950
import marquez.client.models.SearchFilter;
@@ -210,7 +211,7 @@ URL toSearchUrl(
210211
return from(searchPath(), queryParams.build());
211212
}
212213

213-
URL toColumnLineageUrl(
214+
URL toColumnLineageUrlByDatasetField(
214215
String namespace, String dataset, String field, int depth, boolean withDownstream) {
215216
final ImmutableMap.Builder queryParams = new ImmutableMap.Builder();
216217
queryParams.put("nodeId", NodeId.of(new DatasetFieldId(namespace, dataset, field)).getValue());
@@ -219,11 +220,20 @@ URL toColumnLineageUrl(
219220
return from(columnLineagePath(), queryParams.build());
220221
}
221222

222-
URL toColumnLineageUrl(String namespace, String dataset, int depth, boolean withDownstream) {
223+
URL toColumnLineageUrlByDataset(
224+
String namespace, String dataset, int depth, boolean withDownstream) {
223225
final ImmutableMap.Builder queryParams = new ImmutableMap.Builder();
224226
queryParams.put("nodeId", NodeId.of(new DatasetId(namespace, dataset)).getValue());
225227
queryParams.put("depth", String.valueOf(depth));
226228
queryParams.put("withDownstream", String.valueOf(withDownstream));
227229
return from(columnLineagePath(), queryParams.build());
228230
}
231+
232+
URL toColumnLineageUrlByJob(String namespace, String job, int depth, boolean withDownstream) {
233+
final ImmutableMap.Builder queryParams = new ImmutableMap.Builder();
234+
queryParams.put("nodeId", NodeId.of(new JobId(namespace, job)).getValue());
235+
queryParams.put("depth", String.valueOf(depth));
236+
queryParams.put("withDownstream", String.valueOf(withDownstream));
237+
return from(columnLineagePath(), queryParams.build());
238+
}
229239
}

clients/java/src/main/java/marquez/client/models/NodeId.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ public static NodeId of(@NonNull DatasetFieldId datasetFieldId) {
7070
datasetFieldId.getField()));
7171
}
7272

73+
public static NodeId of(@NonNull JobId jobId) {
74+
return of(ID_JOINER.join(ID_PREFX_JOB, jobId.getNamespace(), jobId.getName()));
75+
}
76+
7377
@JsonIgnore
7478
public boolean isDatasetFieldType() {
7579
return value.startsWith(ID_PREFX_DATASET_FIELD);
@@ -80,6 +84,11 @@ public boolean isDatasetType() {
8084
return value.startsWith(ID_PREFX_DATASET + ID_DELIM);
8185
}
8286

87+
@JsonIgnore
88+
public boolean isJobType() {
89+
return value.startsWith(ID_PREFX_JOB);
90+
}
91+
8392
@JsonIgnore
8493
private String[] parts(int expectedParts, String expectedType) {
8594

@@ -124,6 +133,12 @@ public DatasetFieldId asDatasetFieldId() {
124133
return new DatasetFieldId(parts[1], parts[2], parts[3]);
125134
}
126135

136+
@JsonIgnore
137+
public JobId asJobId() {
138+
String[] parts = parts(3, ID_PREFX_JOB);
139+
return new JobId(parts[1], parts[2]);
140+
}
141+
127142
public static class FromValue extends StdConverter<String, NodeId> {
128143
@Override
129144
public NodeId convert(@NonNull String value) {

clients/java/src/test/java/marquez/client/MarquezClientTest.java

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -982,13 +982,10 @@ public void testGetColumnLineage() throws Exception {
982982
.thenReturn(lineageJson);
983983

984984
Node retrievedNode =
985-
client.getColumnLineage("namespace", "dataset").getGraph().stream().findAny().get();
986-
assertThat(retrievedNode.getId()).isEqualTo(node.getId());
987-
assertThat(retrievedNode.getData()).isEqualTo(node.getData());
988-
assertThat(retrievedNode.getInEdges().stream().findFirst())
989-
.isEqualTo(node.getInEdges().stream().findFirst());
990-
assertThat(retrievedNode.getOutEdges().stream().findFirst())
991-
.isEqualTo(node.getOutEdges().stream().findFirst());
985+
client.getColumnLineageByDataset("namespace", "dataset").getGraph().stream()
986+
.findAny()
987+
.get();
988+
assertThat(retrievedNode).isEqualTo(node);
992989
}
993990

994991
@Test
@@ -1022,15 +1019,45 @@ public void testGetColumnLineageByField() throws Exception {
10221019
.thenReturn(lineageJson);
10231020

10241021
Node retrievedNode =
1025-
client.getColumnLineage("namespace", "dataset", "some-col1").getGraph().stream()
1022+
client.getColumnLineageByDataset("namespace", "dataset", "some-col1").getGraph().stream()
10261023
.findAny()
10271024
.get();
1028-
assertThat(retrievedNode.getId()).isEqualTo(node.getId());
1029-
assertThat(retrievedNode.getData()).isEqualTo(node.getData());
1030-
assertThat(retrievedNode.getInEdges().stream().findFirst())
1031-
.isEqualTo(node.getInEdges().stream().findFirst());
1032-
assertThat(retrievedNode.getOutEdges().stream().findFirst())
1033-
.isEqualTo(node.getOutEdges().stream().findFirst());
1025+
assertThat(retrievedNode).isEqualTo(node);
1026+
}
1027+
1028+
@Test
1029+
public void testGetColumnLineageByJob() throws Exception {
1030+
Node node =
1031+
new Node(
1032+
NodeId.of(DATASET_FIELD_ID),
1033+
NodeType.DATASET_FIELD,
1034+
new ColumnLineageNodeData(
1035+
NAMESPACE_NAME,
1036+
DB_TABLE_NAME,
1037+
FIELD_NAME,
1038+
"String",
1039+
"transformationDescription",
1040+
"transformationType",
1041+
Collections.singletonList(
1042+
new DatasetFieldId("namespace", "inDataset", "some-col1"))),
1043+
ImmutableSet.of(
1044+
Edge.of(
1045+
NodeId.of(DATASET_FIELD_ID),
1046+
NodeId.of(new DatasetFieldId("namespace", "inDataset", "some-col1")))),
1047+
ImmutableSet.of(
1048+
Edge.of(
1049+
NodeId.of(new DatasetFieldId("namespace", "outDataset", "some-col2")),
1050+
NodeId.of(DATASET_FIELD_ID))));
1051+
MarquezClient.Lineage lineage = new MarquezClient.Lineage(ImmutableSet.of(node));
1052+
String lineageJson = lineage.toJson();
1053+
when(http.get(
1054+
buildUrlFor(
1055+
"/column-lineage?nodeId=job%3Anamespace%3Ajob&depth=20&withDownstream=false")))
1056+
.thenReturn(lineageJson);
1057+
1058+
Node retrievedNode =
1059+
client.getColumnLineageByJob("namespace", "job").getGraph().stream().findAny().get();
1060+
assertThat(retrievedNode).isEqualTo(node);
10341061
}
10351062

10361063
private URL buildUrlFor(String pathTemplate) throws Exception {

clients/java/src/test/java/marquez/client/MarquezUrlTest.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,12 @@ void testEncodedMarquezUrl() {
3838
void testToColumnLineageUrl() {
3939
Assertions.assertEquals(
4040
"http://marquez:5000/api/v1/column-lineage?nodeId=dataset%3Anamespace%3Adataset&depth=20&withDownstream=true",
41-
marquezUrl.toColumnLineageUrl("namespace", "dataset", 20, true).toString());
41+
marquezUrl.toColumnLineageUrlByDataset("namespace", "dataset", 20, true).toString());
4242

4343
Assertions.assertEquals(
4444
"http://marquez:5000/api/v1/column-lineage?nodeId=datasetField%3Anamespace%3Adataset%3Afield&depth=20&withDownstream=true",
45-
marquezUrl.toColumnLineageUrl("namespace", "dataset", "field", 20, true).toString());
45+
marquezUrl
46+
.toColumnLineageUrlByDatasetField("namespace", "dataset", "field", 20, true)
47+
.toString());
4648
}
4749
}

clients/java/src/test/java/marquez/client/models/NodeIdTest.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,19 @@ public void testDatasetField(String namespace, String dataset, String field) {
4747
assertEquals(dataset, nodeId.asDatasetFieldId().getDataset());
4848
assertEquals(field, nodeId.asDatasetFieldId().getField());
4949
}
50+
51+
@ParameterizedTest(name = "testJob-{index} {argumentsWithNames}")
52+
@CsvSource(
53+
value = {"my-namespace$my-job", "org://team$my-job"},
54+
delimiter = '$')
55+
public void testJob(String namespace, String job) {
56+
JobId jobId = new JobId(namespace, job);
57+
NodeId nodeId = NodeId.of(jobId);
58+
assertTrue(nodeId.isJobType());
59+
assertFalse(nodeId.isDatasetType());
60+
assertEquals(jobId, nodeId.asJobId());
61+
assertEquals(nodeId, NodeId.of(nodeId.getValue()));
62+
assertEquals(namespace, nodeId.asJobId().getNamespace());
63+
assertEquals(job, nodeId.asJobId().getName());
64+
}
5065
}

0 commit comments

Comments
 (0)