Skip to content

Commit b79d712

Browse files
add column lineage graph endpoint
Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent b6544ec commit b79d712

31 files changed

Lines changed: 1405 additions & 81 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Added
66
* Implemented dataset symlink feature which allows providing multiple names for a dataset and adds edges to lineage graph based on symlinks [`#2066`](https://github.com/MarquezProject/marquez/pull/2066) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
77
* Store column lineage facets in separate table [`#2096`](https://github.com/MarquezProject/marquez/pull/2096) [@mzareba382](https://github.com/mzareba382) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
8+
* Lineage graph endpoint for column lineage [`#2124`](https://github.com/MarquezProject/marquez/pull/2124) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
89

910
### Fixed
1011
* Add support for `parentRun` facet as reported by older Airflow OpenLineage versions [@collado-mike](https://github.com/collado-mike)

api/src/main/java/marquez/MarquezContext.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import java.util.List;
1414
import lombok.Getter;
1515
import lombok.NonNull;
16+
import marquez.api.ColumnLineageResource;
1617
import marquez.api.DatasetResource;
1718
import marquez.api.JobResource;
1819
import marquez.api.NamespaceResource;
@@ -22,6 +23,7 @@
2223
import marquez.api.TagResource;
2324
import marquez.api.exceptions.JdbiExceptionExceptionMapper;
2425
import marquez.db.BaseDao;
26+
import marquez.db.ColumnLineageDao;
2527
import marquez.db.DatasetDao;
2628
import marquez.db.DatasetFieldDao;
2729
import marquez.db.DatasetVersionDao;
@@ -39,6 +41,7 @@
3941
import marquez.db.TagDao;
4042
import marquez.graphql.GraphqlSchemaBuilder;
4143
import marquez.graphql.MarquezGraphqlServletBuilder;
44+
import marquez.service.ColumnLineageService;
4245
import marquez.service.DatasetFieldService;
4346
import marquez.service.DatasetService;
4447
import marquez.service.DatasetVersionService;
@@ -70,6 +73,7 @@ public final class MarquezContext {
7073
@Getter private final TagDao tagDao;
7174
@Getter private final OpenLineageDao openLineageDao;
7275
@Getter private final LineageDao lineageDao;
76+
@Getter private final ColumnLineageDao columnLineageDao;
7377
@Getter private final SearchDao searchDao;
7478
@Getter private final List<RunTransitionListener> runTransitionListeners;
7579

@@ -81,9 +85,11 @@ public final class MarquezContext {
8185
@Getter private final RunService runService;
8286
@Getter private final OpenLineageService openLineageService;
8387
@Getter private final LineageService lineageService;
88+
@Getter private final ColumnLineageService columnLineageService;
8489
@Getter private final NamespaceResource namespaceResource;
8590
@Getter private final SourceResource sourceResource;
8691
@Getter private final DatasetResource datasetResource;
92+
@Getter private final ColumnLineageResource columnLineageResource;
8793
@Getter private final JobResource jobResource;
8894
@Getter private final TagResource tagResource;
8995
@Getter private final OpenLineageResource openLineageResource;
@@ -115,6 +121,7 @@ private MarquezContext(
115121
this.tagDao = jdbi.onDemand(TagDao.class);
116122
this.openLineageDao = jdbi.onDemand(OpenLineageDao.class);
117123
this.lineageDao = jdbi.onDemand(LineageDao.class);
124+
this.columnLineageDao = jdbi.onDemand(ColumnLineageDao.class);
118125
this.searchDao = jdbi.onDemand(SearchDao.class);
119126
this.runTransitionListeners = runTransitionListeners;
120127

@@ -128,6 +135,7 @@ private MarquezContext(
128135
this.tagService.init(tags);
129136
this.openLineageService = new OpenLineageService(baseDao, runService);
130137
this.lineageService = new LineageService(lineageDao, jobDao);
138+
this.columnLineageService = new ColumnLineageService(columnLineageDao, datasetFieldDao);
131139
this.jdbiException = new JdbiExceptionExceptionMapper();
132140
final ServiceFactory serviceFactory =
133141
ServiceFactory.builder()
@@ -139,12 +147,14 @@ private MarquezContext(
139147
.openLineageService(openLineageService)
140148
.sourceService(sourceService)
141149
.lineageService(lineageService)
150+
.columnLineageService(columnLineageService)
142151
.datasetFieldService(new DatasetFieldService(baseDao))
143152
.datasetVersionService(new DatasetVersionService(baseDao))
144153
.build();
145154
this.namespaceResource = new NamespaceResource(serviceFactory);
146155
this.sourceResource = new SourceResource(serviceFactory);
147156
this.datasetResource = new DatasetResource(serviceFactory);
157+
this.columnLineageResource = new ColumnLineageResource(serviceFactory);
148158
this.jobResource = new JobResource(serviceFactory, jobVersionDao);
149159
this.tagResource = new TagResource(serviceFactory);
150160
this.openLineageResource = new OpenLineageResource(serviceFactory, openLineageDao);
@@ -155,6 +165,7 @@ private MarquezContext(
155165
namespaceResource,
156166
sourceResource,
157167
datasetResource,
168+
columnLineageResource,
158169
jobResource,
159170
tagResource,
160171
jdbiException,

api/src/main/java/marquez/api/BaseResource.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import marquez.common.models.NamespaceName;
2626
import marquez.common.models.RunId;
2727
import marquez.common.models.SourceName;
28+
import marquez.service.ColumnLineageService;
2829
import marquez.service.DatasetFieldService;
2930
import marquez.service.DatasetService;
3031
import marquez.service.DatasetVersionService;
@@ -50,6 +51,7 @@ public class BaseResource {
5051
protected DatasetVersionService datasetVersionService;
5152
protected DatasetFieldService datasetFieldService;
5253
protected LineageService lineageService;
54+
protected ColumnLineageService columnLineageService;
5355

5456
public BaseResource(ServiceFactory serviceFactory) {
5557
this.serviceFactory = serviceFactory;
@@ -63,6 +65,7 @@ public BaseResource(ServiceFactory serviceFactory) {
6365
this.datasetVersionService = serviceFactory.getDatasetVersionService();
6466
this.datasetFieldService = serviceFactory.getDatasetFieldService();
6567
this.lineageService = serviceFactory.getLineageService();
68+
this.columnLineageService = serviceFactory.getColumnLineageService();
6669
}
6770

6871
void throwIfNotExists(@NonNull NamespaceName namespaceName) {
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright 2018-2022 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.api;
7+
8+
import static javax.ws.rs.core.MediaType.APPLICATION_JSON;
9+
10+
import com.codahale.metrics.annotation.ExceptionMetered;
11+
import com.codahale.metrics.annotation.ResponseMetered;
12+
import com.codahale.metrics.annotation.Timed;
13+
import java.time.Instant;
14+
import java.util.concurrent.ExecutionException;
15+
import javax.validation.constraints.NotNull;
16+
import javax.ws.rs.DefaultValue;
17+
import javax.ws.rs.GET;
18+
import javax.ws.rs.Path;
19+
import javax.ws.rs.Produces;
20+
import javax.ws.rs.QueryParam;
21+
import javax.ws.rs.core.Response;
22+
import lombok.NonNull;
23+
import lombok.extern.slf4j.Slf4j;
24+
import marquez.service.ServiceFactory;
25+
import marquez.service.models.NodeId;
26+
27+
@Slf4j
28+
@Path("/api/v1/column-lineage")
29+
public class ColumnLineageResource extends BaseResource {
30+
31+
private static final String DEFAULT_DEPTH = "20";
32+
33+
public ColumnLineageResource(@NonNull final ServiceFactory serviceFactory) {
34+
super(serviceFactory);
35+
}
36+
37+
@Timed
38+
@ResponseMetered
39+
@ExceptionMetered
40+
@GET
41+
@Produces(APPLICATION_JSON)
42+
public Response getLineage(
43+
@QueryParam("nodeId") @NotNull NodeId nodeId,
44+
@QueryParam("depth") @DefaultValue(DEFAULT_DEPTH) int depth)
45+
throws ExecutionException, InterruptedException {
46+
return Response.ok(columnLineageService.lineage(nodeId, depth, Instant.now())).build();
47+
}
48+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Copyright 2018-2022 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.common.models;
7+
8+
import lombok.AllArgsConstructor;
9+
import lombok.EqualsAndHashCode;
10+
import lombok.Getter;
11+
import lombok.ToString;
12+
13+
/** ID for {@code DatasetField}. */
14+
@EqualsAndHashCode
15+
@AllArgsConstructor
16+
@ToString
17+
public class DatasetFieldId {
18+
19+
@Getter private final DatasetId datasetId;
20+
@Getter private final FieldName fieldName;
21+
22+
public static DatasetFieldId of(String namespace, String datasetName, String field) {
23+
return new DatasetFieldId(
24+
new DatasetId(NamespaceName.of(namespace), DatasetName.of(datasetName)),
25+
FieldName.of(field));
26+
}
27+
}

api/src/main/java/marquez/db/ColumnLineageDao.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,27 @@
55

66
package marquez.db;
77

8+
import static org.jdbi.v3.sqlobject.customizer.BindList.EmptyHandling.NULL_STRING;
9+
810
import java.time.Instant;
911
import java.util.Collections;
1012
import java.util.List;
13+
import java.util.Set;
1114
import java.util.UUID;
1215
import java.util.stream.Collectors;
16+
import marquez.db.mappers.ColumnLineageNodeDataMapper;
1317
import marquez.db.mappers.ColumnLineageRowMapper;
18+
import marquez.db.models.ColumnLineageNodeData;
1419
import marquez.db.models.ColumnLineageRow;
1520
import org.apache.commons.lang3.tuple.Pair;
1621
import org.jdbi.v3.sqlobject.config.RegisterRowMapper;
1722
import org.jdbi.v3.sqlobject.customizer.BindBeanList;
23+
import org.jdbi.v3.sqlobject.customizer.BindList;
1824
import org.jdbi.v3.sqlobject.statement.SqlQuery;
1925
import org.jdbi.v3.sqlobject.statement.SqlUpdate;
2026

2127
@RegisterRowMapper(ColumnLineageRowMapper.class)
28+
@RegisterRowMapper(ColumnLineageNodeDataMapper.class)
2229
public interface ColumnLineageDao extends BaseDao {
2330

2431
default List<ColumnLineageRow> upsertColumnLineageRow(
@@ -88,4 +95,59 @@ void doUpsertColumnLineageRow(
8895
},
8996
value = "values")
9097
List<ColumnLineageRow> rows);
98+
99+
/**
 * Walks column lineage upstream from the given dataset fields and returns one aggregated row per
 * output field.
 *
 * <p>Query outline:
 *
 * <ul>
 *   <li>{@code dataset_fields_view} joins {@code dataset_fields} to {@code datasets_view} so each
 *       field uuid resolves to its namespace, dataset name, field name and type.
 *   <li>{@code column_lineage_recursive} starts (depth 0) from {@code column_lineage} rows whose
 *       output field is in {@code datasetFieldUuids} and whose {@code created_at} is not later
 *       than {@code createdAtUntil}. It then repeatedly adds rows whose output field equals the
 *       input field of a row already collected, while that row's depth is below {@code depth}.
 *   <li>The final select inner-joins the output field (per the inline SQL comment, this drops
 *       hidden datasets), left-joins the input field, and aggregates the input fields of each
 *       group into an array of {@code [namespace, dataset, field]} triples.
 * </ul>
 *
 * <p>NOTE(review): the {@code created_at <= :createdAtUntil} bound appears only in the starting
 * term, not in the recursive term -- confirm that upstream rows are meant to be unbounded.
 *
 * @param depth upper bound used in the {@code node.depth < :depth} condition of the recursive term
 * @param datasetFieldUuids uuids of the output dataset fields to start from; an empty list is
 *     rendered as {@code IN (null)} because of {@code onEmpty = NULL_STRING}
 * @param createdAtUntil upper bound (inclusive) on {@code created_at} of the starting rows
 * @return the aggregated rows as {@code ColumnLineageNodeData}; presumably mapped by the {@code
 *     ColumnLineageNodeDataMapper} registered on this interface
 */
@SqlQuery(
100+
"""
101+
WITH RECURSIVE
102+
dataset_fields_view AS (
103+
SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid
104+
FROM dataset_fields df
105+
INNER JOIN datasets_view d ON d.uuid = df.dataset_uuid
106+
),
107+
column_lineage_recursive AS (
108+
SELECT *, 0 as depth
109+
FROM column_lineage
110+
WHERE output_dataset_field_uuid IN (<datasetFieldUuids>) AND created_at <= :createdAtUntil
111+
UNION
112+
SELECT
113+
upstream_node.output_dataset_version_uuid,
114+
upstream_node.output_dataset_field_uuid,
115+
upstream_node.input_dataset_version_uuid,
116+
upstream_node.input_dataset_field_uuid,
117+
upstream_node.transformation_description,
118+
upstream_node.transformation_type,
119+
upstream_node.created_at,
120+
upstream_node.updated_at,
121+
node.depth + 1 as depth
122+
FROM column_lineage upstream_node, column_lineage_recursive node
123+
WHERE node.input_dataset_field_uuid = upstream_node.output_dataset_field_uuid
124+
AND node.depth < :depth
125+
)
126+
SELECT
127+
output_fields.namespace_name,
128+
output_fields.dataset_name,
129+
output_fields.field_name,
130+
output_fields.type,
131+
ARRAY_AGG(ARRAY[input_fields.namespace_name, input_fields.dataset_name, input_fields.field_name]) AS inputFields,
132+
clr.transformation_description,
133+
clr.transformation_type,
134+
clr.created_at,
135+
clr.updated_at
136+
FROM column_lineage_recursive clr
137+
INNER JOIN dataset_fields_view output_fields ON clr.output_dataset_field_uuid = output_fields.uuid -- hidden datasets will be filtered
138+
LEFT JOIN dataset_fields_view input_fields ON clr.input_dataset_field_uuid = input_fields.uuid
139+
GROUP BY
140+
output_fields.namespace_name,
141+
output_fields.dataset_name,
142+
output_fields.field_name,
143+
output_fields.type,
144+
clr.transformation_description,
145+
clr.transformation_type,
146+
clr.created_at,
147+
clr.updated_at
148+
""")
149+
Set<ColumnLineageNodeData> getLineage(
150+
int depth,
151+
@BindList(onEmpty = NULL_STRING) List<UUID> datasetFieldUuids,
152+
Instant createdAtUntil);
91153
}

api/src/main/java/marquez/db/DatasetFieldDao.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,25 @@ default Dataset updateTags(
9393
+ "WHERE dataset_uuid = :datasetUuid AND name = :name")
9494
Optional<UUID> findUuid(UUID datasetUuid, String name);
9595

96+
/**
 * Lists the uuids of all fields of one dataset, looked up by namespace and dataset name.
 *
 * <p>The dataset is resolved through {@code datasets_view}, matching on {@code namespace_name}
 * and {@code name}.
 *
 * @param namespace namespace name the dataset must have
 * @param datasetName dataset name to match
 * @return uuids of the matching {@code dataset_fields} rows
 */
@SqlQuery(
97+
"""
98+
SELECT df.uuid
99+
FROM dataset_fields df
100+
INNER JOIN datasets_view AS d
101+
ON d.uuid = df.dataset_uuid AND d.name = :datasetName AND d.namespace_name = :namespace
102+
""")
103+
List<UUID> findDatasetFieldsUuids(String namespace, String datasetName);
104+
105+
/**
 * Looks up the uuid of a single dataset field by namespace, dataset name and field name.
 *
 * <p>The dataset is resolved through {@code datasets_view}, matching on {@code namespace_name}
 * and {@code name}; the field is then matched on {@code dataset_fields.name}.
 *
 * @param namespace namespace name the dataset must have
 * @param datasetName dataset name to match
 * @param name field name to match
 * @return the field uuid, or empty when no row matches
 */
@SqlQuery(
106+
"""
107+
SELECT df.uuid
108+
FROM dataset_fields df
109+
INNER JOIN datasets_view AS d
110+
ON d.uuid = df.dataset_uuid AND d.name = :datasetName AND d.namespace_name = :namespace
111+
WHERE df.name = :name
112+
""")
113+
Optional<UUID> findUuid(String namespace, String datasetName, String name);
114+
96115
@SqlQuery(
97116
"SELECT f.*, "
98117
+ "ARRAY(SELECT t.name "

api/src/main/java/marquez/db/OpenLineageDao.java

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -731,36 +731,40 @@ private List<ColumnLineageRow> upsertColumnLineage(
731731

732732
return Optional.ofNullable(ds.getFacets())
733733
.map(DatasetFacets::getColumnLineage)
734-
.map(LineageEvent.ColumnLineageFacet::getOutputColumnsList)
734+
.map(LineageEvent.ColumnLineageDatasetFacet::getFields)
735+
.map(LineageEvent.ColumnLineageDatasetFacetFields::getAdditional)
735736
.stream()
736-
.flatMap(list -> list.stream())
737+
.flatMap(map -> map.keySet().stream())
738+
.filter(
739+
columnName ->
740+
ds.getFacets().getColumnLineage().getFields().getAdditional().get(columnName)
741+
instanceof LineageEvent.ColumnLineageOutputColumn)
737742
.flatMap(
738-
outputColumn -> {
743+
columnName -> {
744+
LineageEvent.ColumnLineageOutputColumn columnLineage =
745+
ds.getFacets().getColumnLineage().getFields().getAdditional().get(columnName);
739746
Optional<DatasetFieldRow> outputField =
740-
datasetFields.stream()
741-
.filter(dfr -> dfr.getName().equals(outputColumn.getName()))
742-
.findAny();
747+
datasetFields.stream().filter(dfr -> dfr.getName().equals(columnName)).findAny();
743748

744749
if (outputField.isEmpty()) {
745750
Logger log = LoggerFactory.getLogger(OpenLineageDao.class);
746751
log.error(
747752
"Cannot produce column lineage for missing output field in output dataset: {}",
748-
outputColumn.getName());
749-
return Stream.empty();
753+
columnName);
754+
return Stream.<ColumnLineageRow>empty();
750755
}
751756

752757
// get field uuids of input columns related to this run
753758
List<Pair<UUID, UUID>> inputFields =
754759
runFields.stream()
755760
.filter(
756761
fieldData ->
757-
outputColumn.getInputFields().stream()
762+
columnLineage.getInputFields().stream()
758763
.filter(
759764
of ->
760-
of.getDatasetNamespace().equals(fieldData.getNamespace())
761-
&& of.getDatasetName()
762-
.equals(fieldData.getDatasetName())
763-
&& of.getFieldName().equals(fieldData.getField()))
765+
of.getNamespace().equals(fieldData.getNamespace())
766+
&& of.getName().equals(fieldData.getDatasetName())
767+
&& of.getField().equals(fieldData.getField()))
764768
.findAny()
765769
.isPresent())
766770
.map(
@@ -775,8 +779,8 @@ private List<ColumnLineageRow> upsertColumnLineage(
775779
datasetVersionRow.getUuid(),
776780
outputField.get().getUuid(),
777781
inputFields,
778-
outputColumn.getTransformationDescription(),
779-
outputColumn.getTransformationType(),
782+
columnLineage.getTransformationDescription(),
783+
columnLineage.getTransformationType(),
780784
now)
781785
.stream();
782786
})

0 commit comments

Comments
 (0)