Skip to content

Commit b6544ec

Browse files
mzareba382, Mariusz Zaręba, and pawel-big-lebowski
authored
Model and store column lineage in Marquez DB (#2096)
* Create database representation, model classes Signed-off-by: mzareba <mzareba382@gmail.com> * Implement ColumnLevelLineageDao Signed-off-by: mzareba <mzareba382@gmail.com> * Instantiate ColumnLevelLineageDao in updateBaseMarquezModel Signed-off-by: mzareba <mzareba382@gmail.com> * Upsert ColumnLevelLineageRow to db, model representation in LineageEvent Signed-off-by: mzareba <mzareba382@gmail.com> * Fix problems in OpenLineageDao, add a list of ColumnLevelLineageRow to DatasetRecord, write test for createLineageRow() invocation Signed-off-by: mzareba <mzareba382@gmail.com> * Change wildcard imports to single class imports Signed-off-by: mzareba <mzareba382@gmail.com> * Change wildcard imports to single class imports Signed-off-by: mzareba <mzareba382@gmail.com> * Change wildcard imports to single class imports Signed-off-by: mzareba <mzareba382@gmail.com> * Apply spotless Signed-off-by: mzareba <mzareba382@gmail.com> * Check for ds.getFacets not null Signed-off-by: mzareba <mzareba382@gmail.com> * Format fix Signed-off-by: mzareba <mzareba382@gmail.com> * Update testUpdateMarquezModelDatasetWithColumnLineageFacet Signed-off-by: mzareba <mzareba382@gmail.com> * Test for column_level_lineage upsert. 
Signed-off-by: mzareba <mzareba382@gmail.com> * Apply spotless Signed-off-by: mzareba <mzareba382@gmail.com> * switch to data field references Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * fix broken tests Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * test when dataset_field is missing Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * add input_dataset_version_uuid field Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * increase db file version Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * increase db file version Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> * rename ColumnLevelLineage -> ColumnLineage Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> Signed-off-by: mzareba <mzareba382@gmail.com> Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com> Co-authored-by: Mariusz Zaręba <mzareba382@getindata.com> Co-authored-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent 2909864 commit b6544ec

15 files changed

Lines changed: 856 additions & 5 deletions

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
# Changelog
22

33
## [Unreleased](https://github.com/MarquezProject/marquez/compare/0.26.0...HEAD)
4+
5+
### Added
6+
* Implemented dataset symlink feature which allows providing multiple names for a dataset and adds edges to lineage graph based on symlinks [`#2066`](https://github.com/MarquezProject/marquez/pull/2066) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
7+
* Store column lineage facets in separate table [`#2096`](https://github.com/MarquezProject/marquez/pull/2096) [@mzareba382](https://github.com/mzareba382) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
8+
49
### Fixed
510
* Add support for `parentRun` facet as reported by older Airflow OpenLineage versions [@collado-mike](https://github.com/collado-mike)
6-
* Implemented dataset symlink feature which allows providing multiple names for a dataset and adds edges to lineage graph based on symlinks [`#2066`](https://github.com/MarquezProject/marquez/pull/2066) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
711

812
## [0.26.0](https://github.com/MarquezProject/marquez/compare/0.25.0...0.26.0) - 2022-09-15
913

api/src/main/java/marquez/db/BaseDao.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,7 @@ public interface BaseDao extends SqlObject {
5353

5454
@CreateSqlObject
5555
OpenLineageDao createOpenLineageDao();
56+
57+
@CreateSqlObject
58+
ColumnLineageDao createColumnLineageDao();
5659
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright 2018-2022 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.db;
7+
8+
import java.time.Instant;
9+
import java.util.Collections;
10+
import java.util.List;
11+
import java.util.UUID;
12+
import java.util.stream.Collectors;
13+
import marquez.db.mappers.ColumnLineageRowMapper;
14+
import marquez.db.models.ColumnLineageRow;
15+
import org.apache.commons.lang3.tuple.Pair;
16+
import org.jdbi.v3.sqlobject.config.RegisterRowMapper;
17+
import org.jdbi.v3.sqlobject.customizer.BindBeanList;
18+
import org.jdbi.v3.sqlobject.statement.SqlQuery;
19+
import org.jdbi.v3.sqlobject.statement.SqlUpdate;
20+
21+
@RegisterRowMapper(ColumnLineageRowMapper.class)
22+
public interface ColumnLineageDao extends BaseDao {
23+
24+
default List<ColumnLineageRow> upsertColumnLineageRow(
25+
UUID outputDatasetVersionUuid,
26+
UUID outputDatasetFieldUuid,
27+
List<Pair<UUID, UUID>> inputs,
28+
String transformationDescription,
29+
String transformationType,
30+
Instant now) {
31+
32+
if (inputs.isEmpty()) {
33+
return Collections.emptyList();
34+
}
35+
36+
doUpsertColumnLineageRow(
37+
inputs.stream()
38+
.map(
39+
input ->
40+
new ColumnLineageRow(
41+
outputDatasetVersionUuid,
42+
outputDatasetFieldUuid,
43+
input.getLeft(), // input_dataset_version_uuid
44+
input.getRight(), // input_dataset_field_uuid
45+
transformationDescription,
46+
transformationType,
47+
now,
48+
now))
49+
.collect(Collectors.toList()));
50+
return findColumnLineageByDatasetVersionColumnAndOutputDatasetField(
51+
outputDatasetVersionUuid, outputDatasetFieldUuid);
52+
}
53+
54+
@SqlQuery(
55+
"SELECT * FROM column_lineage WHERE output_dataset_version_uuid = :datasetVersionUuid AND output_dataset_field_uuid = :outputDatasetFieldUuid")
56+
List<ColumnLineageRow> findColumnLineageByDatasetVersionColumnAndOutputDatasetField(
57+
UUID datasetVersionUuid, UUID outputDatasetFieldUuid);
58+
59+
@SqlUpdate(
60+
"""
61+
INSERT INTO column_lineage (
62+
output_dataset_version_uuid,
63+
output_dataset_field_uuid,
64+
input_dataset_version_uuid,
65+
input_dataset_field_uuid,
66+
transformation_description,
67+
transformation_type,
68+
created_at,
69+
updated_at
70+
) VALUES <values>
71+
ON CONFLICT (output_dataset_version_uuid, output_dataset_field_uuid, input_dataset_version_uuid, input_dataset_field_uuid)
72+
DO UPDATE SET
73+
transformation_description = EXCLUDED.transformation_description,
74+
transformation_type = EXCLUDED.transformation_type,
75+
updated_at = EXCLUDED.updated_at
76+
""")
77+
void doUpsertColumnLineageRow(
78+
@BindBeanList(
79+
propertyNames = {
80+
"outputDatasetVersionUuid",
81+
"outputDatasetFieldUuid",
82+
"inputDatasetVersionUuid",
83+
"inputDatasetFieldUuid",
84+
"transformationDescription",
85+
"transformationType",
86+
"createdAt",
87+
"updatedAt"
88+
},
89+
value = "values")
90+
List<ColumnLineageRow> rows);
91+
}

api/src/main/java/marquez/db/Columns.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,15 @@ private Columns() {}
129129
public static final String RUN_UUID = "run_uuid";
130130
public static final String STATE = "state";
131131

132+
/* COLUMN LINEAGE ROW COLUMNS */
133+
public static final String FIELD_NAME = "field_name";
134+
public static final String OUTPUT_DATASET_VERSION_UUID = "output_dataset_version_uuid";
135+
public static final String OUTPUT_DATASET_FIELD_UUID = "output_dataset_field_uuid";
136+
public static final String INPUT_DATASET_FIELD_UUID = "input_dataset_field_uuid";
137+
public static final String INPUT_DATASET_VERSION_UUID = "input_dataset_version_uuid";
138+
public static final String TRANSFORMATION_DESCRIPTION = "transformation_description";
139+
public static final String TRANSFORMATION_TYPE = "transformation_type";
140+
132141
/* LINEAGE EVENT ROW COLUMNS */
133142
public static final String EVENT = "event";
134143

api/src/main/java/marquez/db/DatasetFieldDao.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
import marquez.common.models.TagName;
1717
import marquez.db.mappers.DatasetFieldMapper;
1818
import marquez.db.mappers.DatasetFieldRowMapper;
19+
import marquez.db.mappers.FieldDataMapper;
1920
import marquez.db.models.DatasetFieldRow;
2021
import marquez.db.models.DatasetRow;
22+
import marquez.db.models.InputFieldData;
2123
import marquez.db.models.TagRow;
2224
import marquez.service.models.Dataset;
2325
import marquez.service.models.DatasetVersion;
@@ -29,6 +31,7 @@
2931

3032
@RegisterRowMapper(DatasetFieldRowMapper.class)
3133
@RegisterRowMapper(DatasetFieldMapper.class)
34+
@RegisterRowMapper(FieldDataMapper.class)
3235
public interface DatasetFieldDao extends BaseDao {
3336
@SqlQuery(
3437
"SELECT EXISTS ("
@@ -101,6 +104,24 @@ default Dataset updateTags(
101104
+ "WHERE fm.dataset_version_uuid = :datasetVersionUuid")
102105
List<Field> find(UUID datasetVersionUuid);
103106

107+
/**
 * Lists the fields of every dataset version recorded in {@code runs_input_mapping} for the given
 * run. Each result carries the namespace name, dataset name and field name together with the
 * dataset, dataset version and dataset field uuids.
 *
 * @param runUuid uuid of the run whose input dataset versions are inspected
 * @return one entry per (input dataset version, field) pair joined for the run
 */
@SqlQuery(
108+
"""
109+
SELECT
110+
datasets_view.namespace_name as namespace_name,
111+
datasets_view.name as dataset_name,
112+
dataset_fields.name as field_name,
113+
datasets_view.uuid as dataset_uuid,
114+
dataset_versions.uuid as dataset_version_uuid,
115+
dataset_fields.uuid as dataset_field_uuid
116+
FROM dataset_fields
117+
JOIN dataset_versions_field_mapping fm ON fm.dataset_field_uuid = dataset_fields.uuid
118+
JOIN dataset_versions ON dataset_versions.uuid = fm.dataset_version_uuid
119+
JOIN datasets_view ON datasets_view.uuid = dataset_versions.dataset_uuid
120+
JOIN runs_input_mapping ON runs_input_mapping.dataset_version_uuid = dataset_versions.uuid
121+
WHERE runs_input_mapping.run_uuid = :runUuid
122+
""")
123+
List<InputFieldData> findInputFieldsDataAssociatedWithRun(UUID runUuid);
124+
104125
@SqlQuery(
105126
"INSERT INTO dataset_fields ("
106127
+ "uuid, "

api/src/main/java/marquez/db/OpenLineageDao.java

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
import java.time.ZoneId;
1414
import java.time.ZonedDateTime;
1515
import java.util.ArrayList;
16+
import java.util.Collections;
1617
import java.util.HashSet;
1718
import java.util.LinkedHashMap;
1819
import java.util.List;
1920
import java.util.Map;
2021
import java.util.Optional;
2122
import java.util.Set;
2223
import java.util.UUID;
24+
import java.util.stream.Collectors;
25+
import java.util.stream.Stream;
2326
import marquez.common.Utils;
2427
import marquez.common.models.DatasetId;
2528
import marquez.common.models.DatasetName;
@@ -31,10 +34,12 @@
3134
import marquez.db.DatasetFieldDao.DatasetFieldMapping;
3235
import marquez.db.JobVersionDao.BagOfJobVersionInfo;
3336
import marquez.db.mappers.LineageEventMapper;
37+
import marquez.db.models.ColumnLineageRow;
3438
import marquez.db.models.DatasetFieldRow;
3539
import marquez.db.models.DatasetRow;
3640
import marquez.db.models.DatasetSymlinkRow;
3741
import marquez.db.models.DatasetVersionRow;
42+
import marquez.db.models.InputFieldData;
3843
import marquez.db.models.JobContextRow;
3944
import marquez.db.models.JobRow;
4045
import marquez.db.models.NamespaceRow;
@@ -56,6 +61,7 @@
5661
import marquez.service.models.LineageEvent.RunFacet;
5762
import marquez.service.models.LineageEvent.SchemaDatasetFacet;
5863
import marquez.service.models.LineageEvent.SchemaField;
64+
import org.apache.commons.lang3.tuple.Pair;
5965
import org.jdbi.v3.sqlobject.config.RegisterRowMapper;
6066
import org.jdbi.v3.sqlobject.statement.SqlQuery;
6167
import org.jdbi.v3.sqlobject.statement.SqlUpdate;
@@ -131,6 +137,7 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
131137
RunDao runDao = createRunDao();
132138
RunArgsDao runArgsDao = createRunArgsDao();
133139
RunStateDao runStateDao = createRunStateDao();
140+
ColumnLineageDao columnLineageDao = createColumnLineageDao();
134141

135142
Instant now = event.getEventTime().withZoneSameInstant(ZoneId.of("UTC")).toInstant();
136143

@@ -323,7 +330,8 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
323330
datasetDao,
324331
datasetVersionDao,
325332
datasetFieldDao,
326-
runDao);
333+
runDao,
334+
columnLineageDao);
327335
datasetInputs.add(record);
328336
}
329337
}
@@ -345,7 +353,8 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
345353
datasetDao,
346354
datasetVersionDao,
347355
datasetFieldDao,
348-
runDao);
356+
runDao,
357+
columnLineageDao);
349358
datasetOutputs.add(record);
350359
}
351360
}
@@ -541,7 +550,8 @@ default DatasetRecord upsertLineageDataset(
541550
DatasetDao datasetDao,
542551
DatasetVersionDao datasetVersionDao,
543552
DatasetFieldDao datasetFieldDao,
544-
RunDao runDao) {
553+
RunDao runDao,
554+
ColumnLineageDao columnLineageDao) {
545555
NamespaceRow dsNamespace =
546556
namespaceDao.upsertNamespaceRow(
547557
UUID.randomUUID(), now, ds.getNamespace(), DEFAULT_NAMESPACE_OWNER);
@@ -662,6 +672,7 @@ default DatasetRecord upsertLineageDataset(
662672
return row;
663673
});
664674
List<DatasetFieldMapping> datasetFieldMappings = new ArrayList<>();
675+
List<DatasetFieldRow> datasetFields = new ArrayList<>();
665676
if (fields != null) {
666677
for (SchemaField field : fields) {
667678
DatasetFieldRow datasetFieldRow =
@@ -672,6 +683,7 @@ default DatasetRecord upsertLineageDataset(
672683
field.getType(),
673684
field.getDescription(),
674685
datasetRow.getUuid());
686+
datasetFields.add(datasetFieldRow);
675687
datasetFieldMappings.add(
676688
new DatasetFieldMapping(datasetVersionRow.getUuid(), datasetFieldRow.getUuid()));
677689
}
@@ -690,7 +702,85 @@ default DatasetRecord upsertLineageDataset(
690702
}
691703
}
692704

693-
return new DatasetRecord(datasetRow, datasetVersionRow, datasetNamespace);
705+
List<ColumnLineageRow> columnLineageRows = Collections.emptyList();
706+
if (!isInput) {
707+
columnLineageRows =
708+
upsertColumnLineage(
709+
runUuid,
710+
ds,
711+
now,
712+
datasetFields,
713+
columnLineageDao,
714+
datasetFieldDao,
715+
datasetVersionRow);
716+
}
717+
718+
return new DatasetRecord(datasetRow, datasetVersionRow, datasetNamespace, columnLineageRows);
719+
}
720+
721+
private List<ColumnLineageRow> upsertColumnLineage(
722+
UUID runUuid,
723+
Dataset ds,
724+
Instant now,
725+
List<DatasetFieldRow> datasetFields,
726+
ColumnLineageDao columnLineageDao,
727+
DatasetFieldDao datasetFieldDao,
728+
DatasetVersionRow datasetVersionRow) {
729+
// get all the fields related to this particular run
730+
List<InputFieldData> runFields = datasetFieldDao.findInputFieldsDataAssociatedWithRun(runUuid);
731+
732+
return Optional.ofNullable(ds.getFacets())
733+
.map(DatasetFacets::getColumnLineage)
734+
.map(LineageEvent.ColumnLineageFacet::getOutputColumnsList)
735+
.stream()
736+
.flatMap(list -> list.stream())
737+
.flatMap(
738+
outputColumn -> {
739+
Optional<DatasetFieldRow> outputField =
740+
datasetFields.stream()
741+
.filter(dfr -> dfr.getName().equals(outputColumn.getName()))
742+
.findAny();
743+
744+
if (outputField.isEmpty()) {
745+
Logger log = LoggerFactory.getLogger(OpenLineageDao.class);
746+
log.error(
747+
"Cannot produce column lineage for missing output field in output dataset: {}",
748+
outputColumn.getName());
749+
return Stream.empty();
750+
}
751+
752+
// get field uuids of input columns related to this run
753+
List<Pair<UUID, UUID>> inputFields =
754+
runFields.stream()
755+
.filter(
756+
fieldData ->
757+
outputColumn.getInputFields().stream()
758+
.filter(
759+
of ->
760+
of.getDatasetNamespace().equals(fieldData.getNamespace())
761+
&& of.getDatasetName()
762+
.equals(fieldData.getDatasetName())
763+
&& of.getFieldName().equals(fieldData.getField()))
764+
.findAny()
765+
.isPresent())
766+
.map(
767+
fieldData ->
768+
Pair.of(
769+
fieldData.getDatasetVersionUuid(),
770+
fieldData.getDatasetFieldUuid()))
771+
.collect(Collectors.toList());
772+
773+
return columnLineageDao
774+
.upsertColumnLineageRow(
775+
datasetVersionRow.getUuid(),
776+
outputField.get().getUuid(),
777+
inputFields,
778+
outputColumn.getTransformationDescription(),
779+
outputColumn.getTransformationType(),
780+
now)
781+
.stream();
782+
})
783+
.collect(Collectors.toList());
694784
}
695785

696786
default String formatDatasetName(String name) {
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright 2018-2022 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.db.mappers;
7+
8+
import static marquez.db.Columns.TRANSFORMATION_DESCRIPTION;
9+
import static marquez.db.Columns.TRANSFORMATION_TYPE;
10+
import static marquez.db.Columns.stringOrThrow;
11+
import static marquez.db.Columns.timestampOrThrow;
12+
import static marquez.db.Columns.uuidOrThrow;
13+
14+
import java.sql.ResultSet;
15+
import java.sql.SQLException;
16+
import lombok.NonNull;
17+
import marquez.db.Columns;
18+
import marquez.db.models.ColumnLineageRow;
19+
import org.jdbi.v3.core.mapper.RowMapper;
20+
import org.jdbi.v3.core.statement.StatementContext;
21+
22+
public class ColumnLineageRowMapper implements RowMapper<ColumnLineageRow> {
23+
24+
@Override
25+
public ColumnLineageRow map(@NonNull ResultSet results, @NonNull StatementContext context)
26+
throws SQLException {
27+
return new ColumnLineageRow(
28+
uuidOrThrow(results, Columns.OUTPUT_DATASET_VERSION_UUID),
29+
uuidOrThrow(results, Columns.OUTPUT_DATASET_FIELD_UUID),
30+
uuidOrThrow(results, Columns.INPUT_DATASET_VERSION_UUID),
31+
uuidOrThrow(results, Columns.INPUT_DATASET_FIELD_UUID),
32+
stringOrThrow(results, TRANSFORMATION_DESCRIPTION),
33+
stringOrThrow(results, TRANSFORMATION_TYPE),
34+
timestampOrThrow(results, Columns.CREATED_AT),
35+
timestampOrThrow(results, Columns.UPDATED_AT));
36+
}
37+
}

0 commit comments

Comments
 (0)